{-# LANGUAGE DeriveDataTypeable #-}
module Text.HTML.TagSoup.Options where
import Data.Typeable
import Text.HTML.TagSoup.Type
import Text.HTML.TagSoup.Entity
import Text.StringLike
-- | These options control how 'parseTags' works. The 'ParseOptions' type is usually generated by one of
-- 'parseOptions', 'parseOptionsFast' or 'parseOptionsEntities', then selected fields may be overriden.
--
-- The options 'optTagPosition' and 'optTagWarning' specify whether to generate
-- 'TagPosition' or 'TagWarning' elements respectively. Usually these options should be set to @False@
-- to simplify future stages, unless you rely on position information or want to give malformed HTML
-- messages to the end user.
--
-- The options 'optEntityData' and 'optEntityAttrib' control how entities, for example @ @ are handled.
-- Both take a string, and a boolean, where @True@ indicates that the entity ended with a semi-colon @;@.
-- Inside normal text 'optEntityData' will be called, and the results will be inserted in the tag stream.
-- Inside a tag attribute 'optEntityAttrib' will be called, and the first component of the result will be used
-- in the attribute, and the second component will be appended after the 'TagOpen' value (usually the second
-- component is @[]@). As an example, to not decode any entities, pass:
--
-- > parseOptions
-- > {optEntityData=\(str,b) -> [TagText $ "&" ++ str ++ [';' | b]]
-- > ,optEntityAttrib\(str,b) -> ("&" ++ str ++ [';' | b], [])
-- The 'optTagTextMerge' value specifies if you always want adjacent 'TagText' values to be merged.
-- Merging adjacent pieces of text has a small performance penalty, but will usually make subsequent analysis
-- simpler. Contiguous runs of characters without entities or tags will also be generated as single 'TagText'
-- values.
data ParseOptions str = ParseOptions
{optTagPosition :: Bool -- ^ Should 'TagPosition' values be given before some items (default=False,fast=False).
,optTagWarning :: Bool -- ^ Should 'TagWarning' values be given (default=False,fast=False)
,optEntityData :: (str,Bool) -> [Tag str] -- ^ How to lookup an entity (Bool = has ending @';'@)
,optEntityAttrib :: (str,Bool) -> (str,[Tag str]) -- ^ How to lookup an entity in an attribute (Bool = has ending @';'@?)
,optTagTextMerge :: Bool -- ^ Require no adjacent 'TagText' values (default=True,fast=False)
}
deriving Typeable
-- | A 'ParseOptions' structure using a custom function to lookup attributes. Any attribute
-- that is not found will be left intact, and a 'TagWarning' given (if 'optTagWarning' is set).
--
-- If you do not want to resolve any entities, simpliy pass @const Nothing@ for the lookup function.
parseOptionsEntities :: StringLike str => (str -> Maybe str) -> ParseOptions str
parseOptionsEntities lookupEntity = ParseOptions False False entityData entityAttrib True
where
entityData x = TagText a : b
where (a,b) = entityAttrib x
entityAttrib ~(x,b) =
let x' = x `append` fromString [';'|b]
in case lookupEntity x' of
Just y -> (y, [])
Nothing -> (fromChar '&' `append` x'
,[TagWarning $ fromString "Unknown entity: " `append` x])
-- | The default parse options value, described in 'ParseOptions'. Equivalent to
-- @'parseOptionsEntities' 'lookupEntity'@.
parseOptions :: StringLike str => ParseOptions str
parseOptions = parseOptionsEntities $ fmap fromString . lookupEntity . toString
-- | A 'ParseOptions' structure optimised for speed, following the fast options.
parseOptionsFast :: StringLike str => ParseOptions str
parseOptionsFast = parseOptions{optTagTextMerge=False}
-- | Change the underlying string type of a 'ParseOptions' value.
fmapParseOptions :: (StringLike from, StringLike to) => ParseOptions from -> ParseOptions to
fmapParseOptions (ParseOptions a b c d e) = ParseOptions a b c2 d2 e
where
c2 ~(x,y) = map (fmap castString) $ c (castString x, y)
d2 ~(x,y) = (castString r, map (fmap castString) s)
where (r,s) = d (castString x, y)