From 247f4e5b4ad3dbf1e95ec12a15fa0d7377e36be1 Mon Sep 17 00:00:00 2001 From: Packit Date: Sep 25 2020 16:06:22 +0000 Subject: ghc-tagsoup-0.14.2 base --- diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 0000000..cb3156d --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,86 @@ +Changelog for TagSoup + +0.14.2 + #66, make sure positions are correct for lone & characters +0.14.1 + #63, add maybeAttrib +0.14 + #14, eliminate Text.HTML.Download +0.13.10 + #51, improve the Haddock documentation + #52, fix some > 16bit HTML entities +0.13.9 + #50, fix a space leak + #36, fix the demo examples + #35, make IsString a superclass of StringLike + #33, make flattenTree O(n) instead of O(n^2) +0.13.8 + #30, add parse/render functions directly to the Tree module +0.13.7 + #32, make sure upper case &#X works in lookupEntity +0.13.6 + #28, some named entities require a trailing semicolon (e.g. mid) +0.13.5 + #26, rename the test program to test-tagsoup +0.13.4 + #24, add isTagComment function + Update the copyright year +0.13.3 + Work on GHC 7.9 +0.13.2 + Remove all package upper bounds + Allow QuickCheck-2.6 +0.13.1 + #562, treat we output to the text +-- Deviation: is a closing tag, not a bogus comment +closeTagOpen S{..} = case hd of + _ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName TypeNormal tl + '>' -> errSeen "" & '<' & '/' & '>' & dat tl + _ | eof -> '<' & '/' & dat s + _ -> errWant "tag name" & bogusComment s + + +-- 8.2.4.5 Tag name state +tagName typ S{..} = pos $ case hd of + _ | white hd -> beforeAttName typ tl + '/' -> selfClosingStartTag typ tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | isAlpha hd -> hd & tagName typ tl + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> hd & tagName typ tl + + +-- 8.2.4.6 Before attribute name state +beforeAttName typ S{..} = pos $ case hd of + _ | white hd -> beforeAttName typ tl + '/' -> selfClosingStartTag typ tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | typ /= TypeNormal && hd `elem` "\'\"" -> beforeAttValue typ s -- NEIL + _ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName typ tl + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> AttName & hd & attName typ tl + + +-- 8.2.4.7 Attribute name state +attName typ S{..} = pos $ case hd of + _ | white hd -> afterAttName typ tl + '/' -> selfClosingStartTag typ tl + '=' -> beforeAttValue typ tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | hd `elem` "\"'<" -> errSeen [hd] & def + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> def + where def = hd & attName typ tl + + +-- 8.2.4.8 After attribute name state +afterAttName typ S{..} = pos $ case hd of + _ | white hd -> afterAttName typ tl + '/' -> selfClosingStartTag typ tl + '=' -> beforeAttValue typ tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | typ /= TypeNormal && hd `elem` "\"'" -> AttVal & beforeAttValue typ s -- NEIL + _ | hd `elem` "\"'<" -> errSeen [hd] & def + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> def + where def = AttName & hd & attName typ tl + +-- 8.2.4.9 Before attribute value state +beforeAttValue typ S{..} = pos $ case hd of + _ | white hd -> beforeAttValue typ tl + '\"' -> AttVal & attValueDQuoted typ tl + '&' -> AttVal & attValueUnquoted typ s + '\'' -> AttVal & attValueSQuoted typ tl + '>' -> errSeen "=" & neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | hd `elem` "<=" -> errSeen [hd] & def + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> def + where def = AttVal & hd & attValueUnquoted typ tl + + +-- 8.2.4.10 Attribute value (double-quoted) state +attValueDQuoted typ S{..} = pos $ case hd of + '\"' -> afterAttValueQuoted typ tl + '&' -> charRefAttValue (attValueDQuoted typ) (Just '\"') tl + _ | eof -> errWant "\"" & dat s + _ -> hd & attValueDQuoted typ tl + + +-- 8.2.4.11 Attribute value (single-quoted) state +attValueSQuoted typ S{..} = pos $ case hd of + '\'' -> afterAttValueQuoted typ tl + '&' -> charRefAttValue (attValueSQuoted typ) (Just '\'') tl + _ | eof -> errWant "\'" & dat s + _ -> hd & attValueSQuoted typ tl + + +-- 8.2.4.12 Attribute value (unquoted) state +attValueUnquoted typ S{..} = pos $ case hd of + _ | white hd -> beforeAttName typ tl + '&' -> charRefAttValue (attValueUnquoted typ) Nothing tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | hd `elem` "\"'<=" -> errSeen [hd] & def + _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s + _ -> def + where def = hd & attValueUnquoted typ tl + + +-- 8.2.4.13 Character reference in attribute value state +charRefAttValue :: Parser -> Maybe Char -> Parser +charRefAttValue resume c s = charRef resume True c s + + +-- 8.2.4.14 After attribute value (quoted) state +afterAttValueQuoted typ S{..} = pos $ case hd of + _ | white hd -> beforeAttName typ tl + '/' -> selfClosingStartTag typ tl + '>' -> neilTagEnd typ tl + '?' | typ == TypeXml -> neilXmlTagClose tl + _ | eof -> dat s + _ -> errSeen [hd] & beforeAttName typ s + + +-- 8.2.4.15 Self-closing start tag state +selfClosingStartTag typ S{..} = pos $ case hd of + _ | typ == TypeXml -> errSeen "/" & beforeAttName typ s + '>' -> TagEndClose & dat tl + _ | eof -> errWant ">" & dat s + _ -> errSeen "/" & beforeAttName typ s + + +-- 8.2.4.16 Bogus comment state +bogusComment S{..} = Comment & bogusComment1 s +bogusComment1 S{..} = pos $ case hd of + '>' -> CommentEnd & dat tl + _ | eof -> CommentEnd & dat s + _ -> hd & bogusComment1 tl + + +-- 8.2.4.17 Markup declaration open state +markupDeclOpen S{..} = pos $ case hd of + _ | Just s <- next "--" -> Comment & commentStart s + _ | isAlpha hd -> Tag & '!' & hd & tagName TypeDecl tl -- NEIL + _ | Just s <- next "[CDATA[" -> cdataSection s + _ -> errWant "tag name" & bogusComment s + + +-- 8.2.4.18 Comment start state +commentStart S{..} = pos $ case hd of + '-' -> commentStartDash tl + '>' -> errSeen "" & CommentEnd & dat tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> hd & comment tl + + +-- 8.2.4.19 Comment start dash state +commentStartDash S{..} = pos $ case hd of + '-' -> commentEnd tl + '>' -> errSeen "" & CommentEnd & dat tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> '-' & hd & comment tl + + +-- 8.2.4.20 Comment state +comment S{..} = pos $ case hd of + '-' -> commentEndDash tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> hd & comment tl + + +-- 8.2.4.21 Comment end dash state +commentEndDash S{..} = pos $ case hd of + '-' -> commentEnd tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> '-' & hd & comment tl + + +-- 8.2.4.22 Comment end state +commentEnd S{..} = pos $ case hd of + '>' -> CommentEnd & dat tl + '-' -> errWant "-->" & '-' & commentEnd tl + _ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl + '!' -> errSeen "!" & commentEndBang tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> errSeen "--" & '-' & '-' & hd & comment tl + + +-- 8.2.4.23 Comment end bang state +commentEndBang S{..} = pos $ case hd of + '>' -> CommentEnd & dat tl + '-' -> '-' & '-' & '!' & commentEndDash tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> '-' & '-' & '!' & hd & comment tl + + +-- 8.2.4.24 Comment end space state +commentEndSpace S{..} = pos $ case hd of + '>' -> CommentEnd & dat tl + '-' -> commentEndDash tl + _ | white hd -> hd & commentEndSpace tl + _ | eof -> errWant "-->" & CommentEnd & dat s + _ -> hd & comment tl + + +-- 8.2.4.38 CDATA section state +cdataSection S{..} = pos $ case hd of + _ | Just s <- next "]]>" -> dat s + _ | eof -> dat s + _ | otherwise -> hd & cdataSection tl + + +-- 8.2.4.39 Tokenizing character references +-- Change from spec: this is reponsible for writing '&' if nothing is to be written +charRef :: Parser -> Bool -> Maybe Char -> S -> [Out] +charRef resume att end S{..} = case hd of + _ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s + '#' -> charRefNum resume s tl + _ -> charRefAlpha resume att s + +charRefNum resume o S{..} = case hd of + _ | hd `elem` "xX" -> charRefNum2 resume o True tl + _ -> charRefNum2 resume o False s + +charRefNum2 resume o hex S{..} = case hd of + _ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl + _ -> errSeen "&" & '&' & resume o + +charRefNum3 resume hex S{..} = case hd of + _ | hexChar hex hd -> hd & charRefNum3 resume hex tl + ';' -> EntityEnd True & resume tl + _ -> EntityEnd False & errWant ";" & resume s + +charRefAlpha resume att S{..} = case hd of + _ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl + _ -> errSeen "&" & '&' & resume s + +charRefAlpha2 resume att S{..} = case hd of + _ | alphaChar hd -> hd & charRefAlpha2 resume att tl + ';' -> EntityEnd True & resume tl + _ | att -> EntityEnd False & resume s + _ -> EntityEnd False & errWant ";" & resume s + + +alphaChar x = isAlphaNum x || x `elem` ":-_" + +hexChar False x = isDigit x +hexChar True x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F') diff --git a/src/Text/HTML/TagSoup/Tree.hs b/src/Text/HTML/TagSoup/Tree.hs new file mode 100644 index 0000000..864b6a1 --- /dev/null +++ b/src/Text/HTML/TagSoup/Tree.hs @@ -0,0 +1,118 @@ +{-| + /NOTE/: This module is preliminary and may change at a future date. + + This module is intended to help converting a list of tags into a + tree of tags. +-} + +module Text.HTML.TagSoup.Tree + ( + TagTree(..), tagTree, parseTree, parseTreeOptions, ParseOptions(..), + flattenTree, renderTree, renderTreeOptions, RenderOptions(..), transformTree, universeTree + ) where + +import Text.HTML.TagSoup (parseTags, parseTagsOptions, renderTags, renderTagsOptions, ParseOptions(..), RenderOptions(..)) +import Text.HTML.TagSoup.Type +import Control.Arrow +import GHC.Exts (build) + + +-- | A tree of 'Tag' values. +data TagTree str + = -- | A 'TagOpen'/'TagClose' pair with the 'Tag' values in between. + TagBranch str [Attribute str] [TagTree str] + | -- | Any leaf node + TagLeaf (Tag str) + deriving (Eq,Ord,Show) + +instance Functor TagTree where + fmap f (TagBranch x y z) = TagBranch (f x) (map (f***f) y) (map (fmap f) z) + fmap f (TagLeaf x) = TagLeaf (fmap f x) + + +-- | Convert a list of tags into a tree. This version is not lazy at +-- all, that is saved for version 2. +tagTree :: Eq str => [Tag str] -> [TagTree str] +tagTree = g + where + g :: Eq str => [Tag str] -> [TagTree str] + g [] = [] + g xs = a ++ map TagLeaf (take 1 b) ++ g (drop 1 b) + where (a,b) = f xs + + -- the second tuple is either null or starts with a close + f :: Eq str => [Tag str] -> ([TagTree str],[Tag str]) + f (TagOpen name atts:rest) = + case f rest of + (inner,[]) -> (TagLeaf (TagOpen name atts):inner, []) + (inner,TagClose x:xs) + | x == name -> let (a,b) = f xs in (TagBranch name atts inner:a, b) + | otherwise -> (TagLeaf (TagOpen name atts):inner, TagClose x:xs) + _ -> error "TagSoup.Tree.tagTree: safe as - forall x . isTagClose (snd (f x))" + + f (TagClose x:xs) = ([], TagClose x:xs) + f (x:xs) = (TagLeaf x:a,b) + where (a,b) = f xs + f [] = ([], []) + +-- | Build a 'TagTree' from a string. +parseTree :: StringLike str => str -> [TagTree str] +parseTree = tagTree . parseTags + +-- | Build a 'TagTree' from a string, specifying the 'ParseOptions'. +parseTreeOptions :: StringLike str => ParseOptions str -> str -> [TagTree str] +parseTreeOptions opts str = tagTree $ parseTagsOptions opts str + +-- | Flatten a 'TagTree' back to a list of 'Tag'. +flattenTree :: [TagTree str] -> [Tag str] +flattenTree xs = build $ flattenTreeFB xs + +flattenTreeFB :: [TagTree str] -> (Tag str -> lst -> lst) -> lst -> lst +flattenTreeFB xs cons nil = flattenTreeOnto xs nil + where + flattenTreeOnto [] tags = tags + flattenTreeOnto (TagBranch name atts inner:trs) tags = + TagOpen name atts `cons` flattenTreeOnto inner (TagClose name `cons` flattenTreeOnto trs tags) + flattenTreeOnto (TagLeaf x:trs) tags = x `cons` flattenTreeOnto trs tags + +-- | Render a 'TagTree'. +renderTree :: StringLike str => [TagTree str] -> str +renderTree = renderTags . flattenTree + +-- | Render a 'TagTree' with some 'RenderOptions'. +renderTreeOptions :: StringLike str => RenderOptions str -> [TagTree str] -> str +renderTreeOptions opts trees = renderTagsOptions opts $ flattenTree trees + +-- | This operation is based on the Uniplate @universe@ function. Given a +-- list of trees, it returns those trees, and all the children trees at +-- any level. For example: +-- +-- > universeTree +-- > [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]] +-- > == [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]] +-- > ,TagBranch "b" [] [TagLeaf (TagText "text")]] +-- +-- This operation is particularly useful for queries. To collect all @\"a\"@ +-- tags in a tree, simply do: +-- +-- > [x | x@(TagBranch "a" _ _) <- universeTree tree] +universeTree :: [TagTree str] -> [TagTree str] +universeTree = concatMap f + where + f t@(TagBranch _ _ inner) = t : universeTree inner + f x = [x] + + +-- | This operation is based on the Uniplate @transform@ function. Given a +-- list of trees, it applies the function to every tree in a bottom-up +-- manner. This operation is useful for manipulating a tree - for example +-- to make all tag names upper case: +-- +-- > upperCase = transformTree f +-- > where f (TagBranch name atts inner) = [TagBranch (map toUpper name) atts inner] +-- > f x = [x] +transformTree :: (TagTree str -> [TagTree str]) -> [TagTree str] -> [TagTree str] +transformTree act = concatMap f + where + f (TagBranch a b inner) = act $ TagBranch a b (transformTree act inner) + f x = act x diff --git a/src/Text/HTML/TagSoup/Type.hs b/src/Text/HTML/TagSoup/Type.hs new file mode 100644 index 0000000..8b3988b --- /dev/null +++ b/src/Text/HTML/TagSoup/Type.hs @@ -0,0 +1,142 @@ +{-# LANGUAGE DeriveDataTypeable #-} +-- | The central type in TagSoup + +module Text.HTML.TagSoup.Type( + -- * Data structures and parsing + StringLike, Tag(..), Attribute, Row, Column, + + -- * Position manipulation + Position(..), tagPosition, nullPosition, positionChar, positionString, + + -- * Tag identification + isTagOpen, isTagClose, isTagText, isTagWarning, isTagPosition, + isTagOpenName, isTagCloseName, isTagComment, + + -- * Extraction + fromTagText, fromAttrib, + maybeTagText, maybeTagWarning, + innerText, + ) where + + +import Data.List (foldl') +import Data.Maybe (fromMaybe, mapMaybe) +import Text.StringLike +import Data.Data(Data, Typeable) + +-- | An HTML attribute @id=\"name\"@ generates @(\"id\",\"name\")@ +type Attribute str = (str,str) + +-- | The row/line of a position, starting at 1 +type Row = Int + +-- | The column of a position, starting at 1 +type Column = Int + + +--- All positions are stored as a row and a column, with (1,1) being the +--- top-left position +data Position = Position !Row !Column deriving (Show,Eq,Ord) + +nullPosition :: Position +nullPosition = Position 1 1 + +positionString :: Position -> String -> Position +positionString = foldl' positionChar + +positionChar :: Position -> Char -> Position +positionChar (Position r c) x = case x of + '\n' -> Position (r+1) 1 + '\t' -> Position r (c + 8 - mod (c-1) 8) + _ -> Position r (c+1) + +tagPosition :: Position -> Tag str +tagPosition (Position r c) = TagPosition r c + + +-- | A single HTML element. A whole document is represented by a list of @Tag@. +-- There is no requirement for 'TagOpen' and 'TagClose' to match. +data Tag str = + TagOpen str [Attribute str] -- ^ An open tag with 'Attribute's in their original order + | TagClose str -- ^ A closing tag + | TagText str -- ^ A text node, guaranteed not to be the empty string + | TagComment str -- ^ A comment + | TagWarning str -- ^ Meta: A syntax error in the input file + | TagPosition !Row !Column -- ^ Meta: The position of a parsed element + deriving (Show, Eq, Ord, Data, Typeable) + +instance Functor Tag where + fmap f (TagOpen x y) = TagOpen (f x) [(f a, f b) | (a,b) <- y] + fmap f (TagClose x) = TagClose (f x) + fmap f (TagText x) = TagText (f x) + fmap f (TagComment x) = TagComment (f x) + fmap f (TagWarning x) = TagWarning (f x) + fmap f (TagPosition x y) = TagPosition x y + + +-- | Test if a 'Tag' is a 'TagOpen' +isTagOpen :: Tag str -> Bool +isTagOpen (TagOpen {}) = True; isTagOpen _ = False + +-- | Test if a 'Tag' is a 'TagClose' +isTagClose :: Tag str -> Bool +isTagClose (TagClose {}) = True; isTagClose _ = False + +-- | Test if a 'Tag' is a 'TagText' +isTagText :: Tag str -> Bool +isTagText (TagText {}) = True; isTagText _ = False + +-- | Extract the string from within 'TagText', otherwise 'Nothing' +maybeTagText :: Tag str -> Maybe str +maybeTagText (TagText x) = Just x +maybeTagText _ = Nothing + +-- | Extract the string from within 'TagText', crashes if not a 'TagText' +fromTagText :: Show str => Tag str -> str +fromTagText (TagText x) = x +fromTagText x = error $ "(" ++ show x ++ ") is not a TagText" + +-- | Extract all text content from tags (similar to Verbatim found in HaXml) +innerText :: StringLike str => [Tag str] -> str +innerText = strConcat . mapMaybe maybeTagText + +-- | Test if a 'Tag' is a 'TagWarning' +isTagWarning :: Tag str -> Bool +isTagWarning (TagWarning {}) = True; isTagWarning _ = False + +-- | Extract the string from within 'TagWarning', otherwise 'Nothing' +maybeTagWarning :: Tag str -> Maybe str +maybeTagWarning (TagWarning x) = Just x +maybeTagWarning _ = Nothing + +-- | Test if a 'Tag' is a 'TagPosition' +isTagPosition :: Tag str -> Bool +isTagPosition TagPosition{} = True; isTagPosition _ = False + +-- | Extract an attribute, crashes if not a 'TagOpen'. +-- Returns @\"\"@ if no attribute present. +-- +-- Warning: does not distinquish between missing attribute +-- and present attribute with value @\"\"@. +fromAttrib :: (Show str, Eq str, StringLike str) => str -> Tag str -> str +fromAttrib att tag = fromMaybe empty $ maybeAttrib att tag + +-- | Extract an attribute, crashes if not a 'TagOpen'. +-- Returns @Nothing@ if no attribute present. +maybeAttrib :: (Show str, Eq str) => str -> Tag str -> Maybe str +maybeAttrib att (TagOpen _ atts) = lookup att atts +maybeAttrib _ x = error ("(" ++ show x ++ ") is not a TagOpen") + +-- | Returns True if the 'Tag' is 'TagOpen' and matches the given name +isTagOpenName :: Eq str => str -> Tag str -> Bool +isTagOpenName name (TagOpen n _) = n == name +isTagOpenName _ _ = False + +-- | Returns True if the 'Tag' is 'TagClose' and matches the given name +isTagCloseName :: Eq str => str -> Tag str -> Bool +isTagCloseName name (TagClose n) = n == name +isTagCloseName _ _ = False + +-- | Test if a 'Tag' is a 'TagComment' +isTagComment :: Tag str -> Bool +isTagComment TagComment {} = True; isTagComment _ = False diff --git a/src/Text/StringLike.hs b/src/Text/StringLike.hs new file mode 100644 index 0000000..fed6f4e --- /dev/null +++ b/src/Text/StringLike.hs @@ -0,0 +1,96 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-} + +-- | /WARNING/: This module is /not/ intended for use outside the TagSoup library. +-- +-- This module provides an abstraction for String's as used inside TagSoup. It allows +-- TagSoup to work with String (list of Char), ByteString.Char8, ByteString.Lazy.Char8, +-- Data.Text and Data.Text.Lazy. +module Text.StringLike (StringLike(..), fromString, castString) where + +import Data.String +import Data.Typeable + +import qualified Data.ByteString.Char8 as BS +import qualified Data.ByteString.Lazy.Char8 as LBS +import qualified Data.Text as T +import qualified Data.Text.Lazy as LT + + +-- | A class to generalise TagSoup parsing over many types of string-like types. +-- Examples are given for the String type. +class (Typeable a, Eq a, IsString a) => StringLike a where + -- | > empty = "" + empty :: a + -- | > cons = (:) + cons :: Char -> a -> a + -- | > uncons [] = Nothing + -- > uncons (x:xs) = Just (x, xs) + uncons :: a -> Maybe (Char, a) + + -- | > toString = id + toString :: a -> String + -- | > fromChar = return + fromChar :: Char -> a + -- | > strConcat = concat + strConcat :: [a] -> a + -- | > strNull = null + strNull :: a -> Bool + -- | > append = (++) + append :: a -> a -> a + + +-- | Convert a String from one type to another. +castString :: (StringLike a, StringLike b) => a -> b +castString = fromString . toString + + +instance StringLike String where + uncons [] = Nothing + uncons (x:xs) = Just (x, xs) + toString = id + fromChar = (:[]) + strConcat = concat + empty = [] + strNull = null + cons c = (c:) + append = (++) + +instance StringLike BS.ByteString where + uncons = BS.uncons + toString = BS.unpack + fromChar = BS.singleton + strConcat = BS.concat + empty = BS.empty + strNull = BS.null + cons = BS.cons + append = BS.append + +instance StringLike LBS.ByteString where + uncons = LBS.uncons + toString = LBS.unpack + fromChar = LBS.singleton + strConcat = LBS.concat + empty = LBS.empty + strNull = LBS.null + cons = LBS.cons + append = LBS.append + +instance StringLike T.Text where + uncons = T.uncons + toString = T.unpack + fromChar = T.singleton + strConcat = T.concat + empty = T.empty + strNull = T.null + cons = T.cons + append = T.append + +instance StringLike LT.Text where + uncons = LT.uncons + toString = LT.unpack + fromChar = LT.singleton + strConcat = LT.concat + empty = LT.empty + strNull = LT.null + cons = LT.cons + append = LT.append diff --git a/tagsoup.cabal b/tagsoup.cabal new file mode 100644 index 0000000..dd4c079 --- /dev/null +++ b/tagsoup.cabal @@ -0,0 +1,66 @@ +cabal-version: >= 1.18 +name: tagsoup +version: 0.14.2 +copyright: Neil Mitchell 2006-2017 +author: Neil Mitchell +maintainer: Neil Mitchell +homepage: https://github.com/ndmitchell/tagsoup#readme +bug-reports: https://github.com/ndmitchell/tagsoup/issues +license: BSD3 +category: XML +license-file: LICENSE +build-type: Simple +synopsis: Parsing and extracting information from (possibly malformed) HTML/XML documents +tested-with: GHC==8.2.1, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4, GHC==7.6.3, GHC==7.4.2 +description: + TagSoup is a library for parsing HTML/XML. It supports the HTML 5 specification, + and can be used to parse either well-formed XML, or unstructured and malformed HTML + from the web. The library also provides useful functions to extract information + from an HTML document, making it ideal for screen-scraping. + . + Users should start from the "Text.HTML.TagSoup" module. +extra-doc-files: + CHANGES.txt + README.md + +source-repository head + type: git + location: https://github.com/ndmitchell/tagsoup.git + +library + default-language: Haskell2010 + build-depends: base == 4.*, containers, bytestring, text + hs-source-dirs: src + + exposed-modules: + Text.HTML.TagSoup + Text.HTML.TagSoup.Entity + Text.HTML.TagSoup.Match + Text.HTML.TagSoup.Tree + Text.StringLike + other-modules: + Text.HTML.TagSoup.Generated + Text.HTML.TagSoup.Implementation + Text.HTML.TagSoup.Manual + Text.HTML.TagSoup.Options + Text.HTML.TagSoup.Parser + Text.HTML.TagSoup.Render + Text.HTML.TagSoup.Specification + Text.HTML.TagSoup.Type + +test-suite test-tagsoup + type: exitcode-stdio-1.0 + default-language: Haskell2010 + build-depends: + base == 4.*, containers, bytestring, text, + QuickCheck >= 2.4, + deepseq >= 1.1, + tagsoup, + time, directory, process + + main-is: Main.hs + hs-source-dirs: test + other-modules: + TagSoup.Benchmark + TagSoup.Sample + TagSoup.Test diff --git a/test/Main.hs b/test/Main.hs new file mode 100644 index 0000000..dd26281 --- /dev/null +++ b/test/Main.hs @@ -0,0 +1,63 @@ + +module Main(main) where + +import System.Environment +import TagSoup.Sample +import TagSoup.Test +import TagSoup.Benchmark +import Data.Char(toLower) + + +helpMsg :: IO () +helpMsg = putStr $ unlines $ + ["TagSoup, (C) Neil Mitchell 2006-2009" + ,"" + ," tagsoup arguments" + ,"" + ," may either be a local file, or a http[s]:// page" + ,"" + ] ++ map f res + where + width = maximum $ map (length . fst) res + res = map g actions + + g (nam,msg,Left _) = (nam,msg) + g (nam,msg,Right _) = (nam ++ " ",msg) + + f (lhs,rhs) = " " ++ lhs ++ replicate (4 + width - length lhs) ' ' ++ rhs + + +actions :: [(String, String, Either (IO ()) (String -> IO ()))] +actions = [("test","Run the test suite",Left test) + ,("grab","Grab a web page",Right grab) + ,("parse","Parse a web page",Right parse) + ,("bench","Benchmark the parsing",Left time) + ,("benchfile","Benchmark the parsing of a file",Right timefile) + ,("validate","Validate a page",Right validate) + ,("lastmodifieddate","Get the wiki.haskell.org last modified date",Left haskellLastModifiedDateTime) + ,("spj","Simon Peyton Jones' papers",Left spjPapers) + ,("ndm","Neil Mitchell's papers",Left ndmPapers) + ,("time","Current time",Left currentTime) + ,("google","Google Tech News",Left googleTechNews) + ,("sequence","Creators on sequence.complete.org",Left rssCreators) + ,("help","This help message",Left helpMsg) + ] + +main :: IO () +main = do + args <- getArgs + case (args, lookup (map toLower $ head args) $ map (\(a,_,c) -> (a,c)) actions) of + ([],_) -> do + putStrLn "No arguments specifying, defaulting to test" + helpMsg + putStrLn $ replicate 70 '-' + test + (x:_,Nothing) -> putStrLn ("Error: unknown command " ++ x) >> helpMsg + ([_],Just (Left a)) -> a + (x:xs,Just (Left a)) -> do + putStrLn $ "Warning: expected no arguments to " ++ x ++ " but got: " ++ unwords xs + a + ([_,y],Just (Right a)) -> a y + (x:xs,Just (Right _)) -> do + putStrLn $ "Error: expected exactly one argument to " ++ x ++ " but got: " ++ unwords xs + helpMsg diff --git a/test/TagSoup/Benchmark.hs b/test/TagSoup/Benchmark.hs new file mode 100644 index 0000000..2f7407e --- /dev/null +++ b/test/TagSoup/Benchmark.hs @@ -0,0 +1,202 @@ +{-# LANGUAGE CPP #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} -- test file, so OK + +module TagSoup.Benchmark where + +import Text.HTML.TagSoup + +import Control.DeepSeq +import Control.Monad +import Data.List +import Data.Maybe +import System.IO.Unsafe(unsafeInterleaveIO) +import qualified Data.ByteString.Char8 as BS +import qualified Data.ByteString.Lazy.Char8 as LBS +import Data.Time.Clock.POSIX(getPOSIXTime) + +conf = 0.95 + + +timefile :: FilePath -> IO () +timefile file = do + -- use LBS to be most representative of real life + lbs <- LBS.readFile file + let str = LBS.unpack lbs + bs = BS.concat $ LBS.toChunks lbs + () <- LBS.length lbs `seq` length str `seq` BS.length bs `seq` return () + benchWith (const str, const bs, const lbs) $ benchStatic (toInteger $ LBS.length lbs) + + +sample :: String +sample = " is " ++ + " and some just random & test ><" + +nsample = genericLength sample :: Integer + +time :: IO () +time = benchWith (str,bs,lbs) benchVariable + where + str = \i -> concat $ genericReplicate i sample + bs = let s = BS.pack sample in \i -> BS.concat (genericReplicate i s) + lbs = let s = LBS.pack sample in \i -> LBS.concat (genericReplicate i s) + + + +benchWith :: (Integer -> String, Integer -> BS.ByteString, Integer -> LBS.ByteString) + -> ((Integer -> ()) -> IO [String]) -> IO () +benchWith (str,bs,lbs) bench = do + putStrLn "Timing parseTags in characters/second" + let header = map (:[]) ["(" ++ show (round $ conf * 100) ++ "% confidence)","String","BS","LBS"] + rows <- mapM row $ replicateM 3 [False,True] + mapM_ (putStrLn . strict . grid) $ delay2 $ header : rows + where + row [a,b,c] = do + let header = intercalate "," [g a "pos", g b "warn", g c "merge"] + g b x = (if b then ' ' else '!') : x + f x = bench $ \i -> rnf $ parseTagsOptions parseOptions{optTagPosition=a,optTagWarning=b,optTagTextMerge=c} $ x i + c1 <- f str + c2 <- f bs + c3 <- f lbs + return [[header],c1,c2,c3] + + strict = reverse . reverse + + +--------------------------------------------------------------------- +-- BENCHMARK ON THE SAMPLE INPUT + +disp xs = showUnit (floor xbar) ++ " (~" ++ rng ++ "%)" + where xbar = mean xs + rng = if length xs <= 1 then "?" else show (ceiling $ (range conf xs) * 100 / xbar) + +cons x = fmap (x:) + + +aimTime = 0.3 :: Double -- seconds to aim for +minTime = 0.2 :: Double -- below this a test is considered invalid + + +-- given a number of times to repeat sample, return a list of what +-- to display +benchVariable :: (Integer -> ()) -> IO [String] +benchVariable op = cons "?" $ f 10 [] + where + f i seen | length seen > 9 = cons (" " ++ disp seen) $ return [] + | otherwise = unsafeInterleaveIO $ do + now <- timer $ op i + let cps = if now == 0 then 0 else fromInteger (i * nsample) / now + if now < minTime || (null seen && now < aimTime) then do + let factor = min 7 $ max 2 $ floor $ aimTime / now + cons ("? " ++ disp [cps]) $ f (i * factor) [] + else + cons (show (9 - length seen) ++ " " ++ disp (cps:seen)) $ f i (cps:seen) + + + +benchStatic :: Integer -> (Integer -> ()) -> IO [String] +benchStatic nsample op = cons "?" $ f [] + where + f seen | length seen > 9 = cons (" " ++ disp seen) $ return [] + | otherwise = unsafeInterleaveIO $ do + now <- timer $ op $ genericLength seen + let cps = if now == 0 then 0 else fromInteger nsample / now + cons (show (9 - length seen) ++ " " ++ disp (cps:seen)) $ f (cps:seen) + + +--------------------------------------------------------------------- +-- UTILITY FUNCTIONS + +-- | Given a number, show it using a unit and decimal place +showUnit :: Integer -> String +showUnit x = num ++ unit + where + units = " KMGTPEZY" + (use,skip) = splitAt 3 $ show x + + unit = [units !! ((length skip + 2) `div` 3)] + + dot = ((length skip - 1) `mod` 3) + 1 + num = a ++ ['.' | b /= ""] ++ b + where (a,b) = splitAt dot use + + +-- copied from the criterion package +getTime :: IO Double +getTime = (fromRational . toRational) `fmap` getPOSIXTime + +timer :: () -> IO Double +timer x = do + start <- getTime + () <- return x + end <- getTime + return $ end - start + + +-- display a grid +grid :: [[String]] -> String +grid xs = unlines $ map (concat . zipWith f cols) xs + where cols = map (maximum . map length) $ transpose xs + f n x = x ++ replicate (n+1 - length x) ' ' + + +-- display a series of grids over time +-- when a grid gets to [] keep its value at that +-- when all grids get to [] return [] +delay2 :: [[[String]]] -> [[[String]]] +delay2 xs = map (map head) xs : (if all (null . tail) (concat xs) then [] else delay2 $ map (map tl) xs) + where tl (x:xs) = if null xs then x:xs else xs + + +--------------------------------------------------------------------- +-- INSTANCES + +instance NFData a => NFData (Tag a) where + rnf (TagOpen x y) = rnf x `seq` rnf y + rnf (TagClose x) = rnf x + rnf (TagText x) = rnf x + rnf (TagComment x) = rnf x + rnf (TagWarning x) = rnf x + rnf (TagPosition x y) = () -- both are already ! bound + + +#ifndef BYTESTRING_HAS_NFDATA +# ifdef MIN_VERSION_bytestring +# define BYTESTRING_HAS_NFDATA (MIN_VERSION_bytestring(0,10,0)) +# else +# define BYTESTRING_HAS_NFDATA (__GLASGOW_HASKELL__ >= 706) +# endif +#endif + +#if !BYTESTRING_HAS_NFDATA +instance NFData LBS.ByteString where + rnf x = LBS.length x `seq` () + +instance NFData BS.ByteString where + rnf x = BS.length x `seq` () +#endif + + +--------------------------------------------------------------------- +-- STATISTICS +-- Provided by Emily Mitchell + +confNs = let (*) = (,) in + [0.95 * 1.96 + ,0.90 * 1.644] + +size :: [Double] -> Double +size = genericLength + +mean :: [Double] -> Double +mean xs = sum xs / size xs + +stddev :: [Double] -> Double +stddev xs = sqrt $ sum [sqr (x - xbar) | x <- xs] / size xs + where xbar = mean xs + sqr x = x * x + +-- given a sample, and a required confidence +-- of the mean (i.e. 2.5% = 0.025) +range ::Double -> [Double] -> Double +range conf xs = conf2 * stddev xs / sqrt (size xs) + where conf2 = fromMaybe (error $ "Unknown confidence interval: " ++ show conf) $ lookup conf confNs diff --git a/test/TagSoup/Sample.hs b/test/TagSoup/Sample.hs new file mode 100644 index 0000000..043a19f --- /dev/null +++ b/test/TagSoup/Sample.hs @@ -0,0 +1,158 @@ + +module TagSoup.Sample where + +import Text.HTML.TagSoup + +import Control.Exception +import Control.Monad +import Data.List +import System.Process +import System.Directory +import System.Exit +import System.IO +import Data.Functor +import Prelude + + +openItem :: String -> IO String +openItem url + | not $ "http://" `isPrefixOf` url || "https://" `isPrefixOf` url = + readFile url +openItem url = bracket + (openTempFile "." "tagsoup.tmp") + (\(file,hndl) -> removeFile file) + $ \(file,hndl) -> do + hClose hndl + putStrLn $ "Downloading: " ++ url + res <- system $ "wget " ++ url ++ " -O " ++ file + when (res /= ExitSuccess) $ error $ "Failed to download using wget: " ++ url + src <- readFile file + length src `seq` return src + + +grab :: String -> IO () +grab x = openItem x >>= putStr + +parse :: String -> IO () +parse x = openItem x >>= putStr . show2 . parseTags + where + show2 [] = "[]" + show2 xs = "[" ++ concat (intersperseNotBroken "\n," $ map show xs) ++ "\n]\n" + + +-- the standard intersperse has a strictness bug which sucks! +intersperseNotBroken :: a -> [a] -> [a] +intersperseNotBroken _ [] = [] +intersperseNotBroken sep (x:xs) = x : is xs + where + is [] = [] + is (y:ys) = sep : y : is ys + + +{- +
  • This page was last modified on 9 September 2013, at 22:38.
  • +-} +haskellLastModifiedDateTime :: IO () +haskellLastModifiedDateTime = do + src <- openItem "http://wiki.haskell.org/Haskell" + let lastModifiedDateTime = fromFooter $ parseTags src + putStrLn $ "wiki.haskell.org was last modified on " ++ lastModifiedDateTime + where fromFooter = unwords . drop 6 . words . innerText . take 2 . dropWhile (~/= "
  • ") + + +googleTechNews :: IO () +googleTechNews = do + tags <- fmap parseTags $ openItem "http://news.google.com/?ned=us&topic=t" + let links = [ ascii name ++ " <" ++ maybe "unknown" shortUrl (lookup "href" atts) ++ ">" + | TagOpen "h2" [("class","title")]:TagText spaces:TagOpen "a" atts:TagText name:_ <- tails tags] + putStr $ unlines links + where + shortUrl x | "http://" `isPrefixOf` x = shortUrl $ drop 7 x + | "www." `isPrefixOf` x = shortUrl $ drop 4 x + | otherwise = takeWhile (/= '/') x + + ascii ('\226':'\128':'\147':xs) = '-' : ascii xs + ascii ('\194':'\163':xs) = "#GBP " ++ ascii xs + ascii (x:xs) = x : ascii xs + ascii [] = [] + + +spjPapers :: IO () +spjPapers = do + tags <- parseTags <$> openItem "http://research.microsoft.com/en-us/people/simonpj/" + let links = map f $ sections (~== "") $ + takeWhile (~/= "") $ + drop 5 $ dropWhile (~/= "") tags + putStr $ unlines links + where + f :: [Tag String] -> String + f = dequote . unwords . words . fromTagText . head . filter isTagText + + dequote ('\"':xs) | last xs == '\"' = init xs + dequote x = x + + +ndmPapers :: IO () +ndmPapers = do + tags <- parseTags <$> openItem "http://community.haskell.org/~ndm/downloads/" + let papers = map f $ sections (~== "
  • ") tags + putStr $ unlines papers + where + f :: [Tag String] -> String + f xs = fromTagText (xs !! 2) + + +currentTime :: IO () +currentTime = do + tags <- parseTags <$> openItem "http://www.timeanddate.com/worldclock/uk/london" + let time = fromTagText (dropWhile (~/= "") tags !! 1) + putStrLn time + + + +type Section = String +data Package = Package {name :: String, desc :: String, href :: String} + deriving Show + +hackage :: IO [(Section,[Package])] +hackage = do + tags <- fmap parseTags $ openItem "http://hackage.haskell.org/packages/archive/pkg-list.html" + return $ map parseSect $ partitions (~== "

    ") tags + where + parseSect xs = (nam, packs) + where + nam = fromTagText $ xs !! 2 + packs = map parsePackage $ partitions (~== "
  • ") xs + + parsePackage xs = + Package + (fromTagText $ xs !! 2) + (drop 2 $ dropWhile (/= ':') $ fromTagText $ xs !! 4) + (fromAttrib "href" $ xs !! 1) + +-- rssCreators Example: prints names of story contributors on +-- sequence.complete.org. This content is RSS (not HTML), and the selected +-- tag uses a different XML namespace "dc:creator". +rssCreators :: IO () +rssCreators = do + tags <- fmap parseTags $ openItem "http://sequence.complete.org/node/feed" + putStrLn $ unlines $ map names $ partitions (~== "") tags + where names xs = fromTagText $ xs !! 1 + + +validate :: String -> IO () +validate x = putStr . unlines . g . f . parseTagsOptions opts =<< openItem x + where + opts = parseOptions{optTagPosition=True, optTagWarning=True} + + f :: [Tag String] -> [String] + f (TagPosition row col:TagWarning warn:rest) = + ("Warning (" ++ show row ++ "," ++ show col ++ "): " ++ warn) : f rest + f (TagWarning warn:rest) = + ("Warning (?,?): " ++ warn) : f rest + f (_:rest) = f rest + f [] = [] + + g xs = xs ++ [if n == 0 then "Success, no warnings" + else "Failed, " ++ show n ++ " warning" ++ ['s'|n>1]] + where n = length xs diff --git a/test/TagSoup/Test.hs b/test/TagSoup/Test.hs new file mode 100644 index 0000000..9f5ba87 --- /dev/null +++ b/test/TagSoup/Test.hs @@ -0,0 +1,257 @@ +{-# OPTIONS_GHC -fno-warn-deprecations #-} + +module TagSoup.Test(test) where + +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Entity +import Text.HTML.TagSoup.Match + +import Control.Monad +import Data.List +import Test.QuickCheck(Arbitrary(..), Testable(..), quickCheckWithResult, stdArgs, + Args(..), listOf, elements, Result(..)) + +-- * The Test Monad + +type Test a = IO a + +pass :: Test () +pass = return () + +runTest :: Test () -> IO () +runTest x = x >> putStrLn "All tests passed" + +(===) :: (Show a, Eq a) => a -> a -> IO () +a === b = if a == b then pass else fail $ "Does not equal: " ++ show a ++ " =/= " ++ show b + +check :: Testable prop => prop -> IO () +check prop = do + res <- quickCheckWithResult stdArgs{maxSuccess=1000} prop + case res of + Success{} -> pass + _ -> fail "Property failed" + +newtype HTML = HTML String deriving Show +instance Arbitrary HTML where + arbitrary = fmap (HTML . concat) $ listOf $ elements frags + where frags = map (:[]) " \n!-#&;xy01[]?'\"" ++ ["CDATA","amp","gt","lt"] + shrink (HTML x) = map HTML $ zipWith (++) (inits x) (tail $ tails x) + + +-- * The Main section + +test :: IO () +test = runTest $ do + warnTests + parseTests + optionsTests + renderTests + combiTests + positionTests + entityTests + lazyTags == lazyTags `seq` pass + matchCombinators + + +{- | +This routine tests the laziness of the TagSoup parser. +For each critical part of the parser we provide a test input +with a token of infinite size. +Then the output must be infinite too. +If the laziness is broken, then the output will stop early. +We collect the thousandth character of the output of each test case. +If computation of the list stops somewhere, +you have found a laziness stopper. +-} + + +lazyTags :: [Char] +lazyTags = map ((!!1000) . show . parseTags) + [cycle "Rhabarber" + ,repeat '&' + ,"<"++cycle "html" + ,"" === [TagOpen "!DOCTYPE" [("TEST","")]] + parseTags "" === [TagOpen "test" [("\"foo",""),("bar\"","")]] + parseTags "" === [TagOpen "test" [("baz",""),("\"foo\"","")]] + parseTags "" === [TagOpen "test" [("'foo",""),("bar'","")]] + parseTags "" === [TagOpen "test" [("bar",""),("'","")], TagClose "test"] + parseTags "" === [TagOpen "test2" [("a",""),("b","")]] + parseTags "" === [TagOpen "test2" [("''","")]] + parseTags "" === [TagClose "test"] + parseTags "" === [TagOpen "test" [], TagClose "test"] + parseTags "" === [TagOpen "test1" [("a","b")]] + parseTags "hello & world" === [TagText "hello & world"] + parseTags "hello @ world" === [TagText "hello @ world"] + parseTags "hello @ world" === [TagText "hello @ world"] + parseTags "hello @ world" === [TagText "hello @ world"] + parseTags "hello &haskell; world" === [TagText "hello &haskell; world"] + parseTags "hello \n\t world" === [TagText "hello \n\t world"] + parseTags "" === [TagOpen "a" [("href","http://www.google.com")]] + parseTags "" === [TagOpen "foo" [("bar","bar6baz")]] + parseTags "" === [TagOpen "foo" [("bar","bar&baz")]] + parseTags "hey &how are you" === [TagText "hey &how are you"] + parseTags "hey &how; are you" === [TagText "hey &how; are you"] + parseTags "hey & are you" === [TagText "hey & are you"] + parseTags "hey & are you" === [TagText "hey & are you"] + + -- real cases reported by users + parseTags "↖x≧̸" === [TagText ['\x2196','x','\x2267','\x0338']] + parseTags "test � test" === [TagText "test ? test"] + + parseTags "" === [TagOpen "a" [("href","series.php?view=single&ID=72710")]] + + parseTags "" === + [TagOpen "!DOCTYPE" [("HTML",""),("PUBLIC",""),("","-//W3C//DTD HTML 4.01//EN"),("","http://www.w3.org/TR/html4/strict.dtd")]] + + parseTags "" === [TagOpen "script" [], TagText " if (x if (x if (x" === [TagOpen "SCRIPT" [("language","foo")], TagText " if (x" === [TagOpen "script" [], TagClose "script", TagOpen "test" []] + + -- some escapes require trailing semicolons, see #28 and #27. + parseTags "one ∣ two" === [TagText "one \8739 two"] + parseTags "one &mid two" === [TagText "one &mid two"] + parseTags "one µ two" === [TagText "one \181 two"] + parseTags "one µ two" === [TagText "one \181 two"] + +optionsTests :: Test () +optionsTests = check $ \(HTML x) -> all (f x) $ replicateM 3 [False,True] + where + f str [pos,warn,merge] = + bool "merge" (not merge || adjacentTagText tags) && + bool "warn" (warn || all (not . isTagWarning) tags) && + bool "pos" (if pos then alternatePos tags else all (not . isTagPosition) tags) + where tags = parseTagsOptions parseOptions{optTagPosition=pos,optTagWarning=warn,optTagTextMerge=merge} str + bool x b = b || error ("optionsTests failed with " ++ x ++ " on " ++ show (pos,warn,merge,str,tags)) + + -- optTagTextMerge implies no adjacent TagText cells + -- and none separated by only warnings or positions + adjacentTagText = g True -- can the next be a tag text + where g i (x:xs) | isTagText x = i && g False xs + | isTagPosition x || isTagWarning x = g i xs + | otherwise = g True xs + g i [] = True + + -- optTagPosition implies every element must be followed + -- by a position node, no two position nodes must be adjacent + -- and all positions must be increasing + alternatePos (TagPosition l1 c1 : x : TagPosition l2 c2 : xs) + | (l1,c1) <= (l2,c2) && not (isTagPosition x) = alternatePos $ TagPosition l2 c2 : xs + alternatePos [TagPosition l1 c1, x] | not $ isTagPosition x = True + alternatePos [] = True + alternatePos _ = False + + +renderTests :: Test () +renderTests = do + let rp = renderTags . parseTags + rp "" === "" + rp "

    " === "
    " + rp "" === "" + rp "hello & world" === "hello & world" + rp "
    " === "" + rp "" === "" + rp "" === "" + rp "" === "" + rp "" === "" + rp "" === "" + rp "" === "" + escapeHTML "this is a &\" '" === "this is a &" <test> '" + check $ \(HTML x) -> let y = rp x in rp y == (y :: String) + + +entityTests :: Test () +entityTests = do + lookupNumericEntity "65" === Just "A" + lookupNumericEntity "x41" === Just "A" + lookupNumericEntity "x4E" === Just "N" + lookupNumericEntity "x4e" === Just "N" + lookupNumericEntity "X4e" === Just "N" + lookupNumericEntity "Haskell" === Nothing + lookupNumericEntity "" === Nothing + lookupNumericEntity "89439085908539082" === Nothing + lookupNamedEntity "amp" === Just "&" + lookupNamedEntity "haskell" === Nothing + escapeXML "hello world" === "hello world" + escapeXML "hello & world" === "hello & world" + + +combiTests :: Test () +combiTests = do + (TagText "test" ~== TagText "" ) === True + (TagText "test" ~== TagText "test") === True + (TagText "test" ~== TagText "soup") === False + (TagText "test" ~== "test") === True + (TagOpen "test" [] ~== "") === True + (TagOpen "test" [] ~== "") === False + (TagOpen "test" [] ~/= "") === True + (TagComment "foo" ~== "") === True + (TagComment "bar" ~== "") === True + + +positionTests :: Test () +positionTests = do + let p = parseTagsOptions parseOptions{optTagPosition=True,optTagWarning=False} + p "&" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&",TagPosition 1 5,TagClose "a"] + p "&#z" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&#z"] + p "&xz" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&xz"] + p "&" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&"] + p "&1" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&1"] + p "&" === [TagPosition 1 1,TagOpen "a" [],TagPosition 1 4,TagText "&"] + + +warnTests :: Test () +warnTests = do + let p = parseTagsOptions parseOptions{optTagPosition=True,optTagWarning=True} + wt x = [(msg,c) | TagWarning msg:TagPosition _ c:_ <- tails $ p x] + wt "neil &foo bar" === [("Unknown entity: foo",6)]