|
Packit |
247f4e |
{-# LANGUAGE RecordWildCards, PatternGuards, ScopedTypeVariables #-}
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
module Text.HTML.TagSoup.Implementation where
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
import Text.HTML.TagSoup.Type
|
|
Packit |
247f4e |
import Text.HTML.TagSoup.Options
|
|
Packit |
247f4e |
import Text.StringLike as Str
|
|
Packit |
247f4e |
import Numeric (readHex)
|
|
Packit |
247f4e |
import Data.Char (chr, ord)
|
|
Packit |
247f4e |
import Data.Ix
|
|
Packit |
247f4e |
import Control.Exception(assert)
|
|
Packit |
247f4e |
import Control.Arrow
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
---------------------------------------------------------------------
|
|
Packit |
247f4e |
-- BOTTOM LAYER
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
data Out
|
|
Packit |
247f4e |
= Char Char
|
|
Packit |
247f4e |
| Tag -- <
|
|
Packit |
247f4e |
| TagShut -- </
|
|
Packit |
247f4e |
| AttName
|
|
Packit |
247f4e |
| AttVal
|
|
Packit |
247f4e |
| TagEnd -- >
|
|
Packit |
247f4e |
| TagEndClose -- />
|
|
Packit |
247f4e |
| Comment --
|
|
Packit |
247f4e |
| CommentEnd -- -->
|
|
Packit |
247f4e |
| EntityName -- &
|
|
Packit |
247f4e |
| EntityNum -- &#
|
|
Packit |
247f4e |
| EntityHex -- &#x
|
|
Packit |
247f4e |
| EntityEnd Bool -- Attributed followed by ; for True, missing ; for False
|
|
Packit |
247f4e |
| Warn String
|
|
Packit |
247f4e |
| Pos Position
|
|
Packit |
247f4e |
deriving (Show,Eq)
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
errSeen x = Warn $ "Unexpected " ++ show x
|
|
Packit |
247f4e |
errWant x = Warn $ "Expected " ++ show x
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
data S = S
|
|
Packit |
247f4e |
{s :: S
|
|
Packit |
247f4e |
,tl :: S
|
|
Packit |
247f4e |
,hd :: Char
|
|
Packit |
247f4e |
,eof :: Bool
|
|
Packit |
247f4e |
,next :: String -> Maybe S
|
|
Packit |
247f4e |
,pos :: [Out] -> [Out]
|
|
Packit |
247f4e |
}
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
expand :: Position -> String -> S
|
|
Packit |
247f4e |
expand p text = p `seq` res
|
|
Packit |
247f4e |
where res = S{s = res
|
|
Packit |
247f4e |
,tl = expand (positionChar p (head text)) (tail text)
|
|
Packit |
247f4e |
,hd = if null text then '\0' else head text
|
|
Packit |
247f4e |
,eof = null text
|
|
Packit |
247f4e |
,next = next p text
|
|
Packit |
247f4e |
,pos = (Pos p:)
|
|
Packit |
247f4e |
}
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
next p (t:ext) (s:tr) | t == s = next (positionChar p t) ext tr
|
|
Packit |
247f4e |
next p text [] = Just $ expand p text
|
|
Packit |
247f4e |
next _ _ _ = Nothing
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
infixr &
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
class Outable a where (&) :: a -> [Out] -> [Out]
|
|
Packit |
247f4e |
instance Outable Char where (&) = ampChar
|
|
Packit |
247f4e |
instance Outable Out where (&) = ampOut
|
|
Packit |
247f4e |
ampChar x y = Char x : y
|
|
Packit |
247f4e |
ampOut x y = x : y
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
state :: String -> S
|
|
Packit |
247f4e |
state s = expand nullPosition s
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
---------------------------------------------------------------------
|
|
Packit |
247f4e |
-- TOP LAYER
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
output :: forall str . StringLike str => ParseOptions str -> [Out] -> [Tag str]
|
|
Packit |
247f4e |
output ParseOptions{..} x = (if optTagTextMerge then tagTextMerge else id) $ go ((nullPosition,[]),x)
|
|
Packit |
247f4e |
where
|
|
Packit |
247f4e |
-- main choice loop
|
|
Packit |
247f4e |
go :: ((Position,[Tag str]),[Out]) -> [Tag str]
|
|
Packit |
247f4e |
go ((p,ws),xs) | p `seq` False = [] -- otherwise p is a space leak when optTagPosition == False
|
|
Packit |
247f4e |
go ((p,ws),xs) | not $ null ws = (if optTagWarning then (reverse ws++) else id) $ go ((p,[]),xs)
|
|
Packit |
247f4e |
go ((p,ws),Pos p2:xs) = go ((p2,ws),xs)
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
go x | isChar x = pos x $ TagText a : go y
|
|
Packit |
247f4e |
where (y,a) = charsStr x
|
|
Packit |
247f4e |
go x | isTag x = pos x $ TagOpen a b : (if isTagEndClose z then pos x $ TagClose a : go (next z) else go (skip isTagEnd z))
|
|
Packit |
247f4e |
where (y,a) = charsStr $ next x
|
|
Packit |
247f4e |
(z,b) = atts y
|
|
Packit |
247f4e |
go x | isTagShut x = pos x $ (TagClose a:) $
|
|
Packit |
247f4e |
(if not (null b) then warn x "Unexpected attributes in close tag" else id) $
|
|
Packit |
247f4e |
if isTagEndClose z then warn x "Unexpected self-closing in close tag" $ go (next z) else go (skip isTagEnd z)
|
|
Packit |
247f4e |
where (y,a) = charsStr $ next x
|
|
Packit |
247f4e |
(z,b) = atts y
|
|
Packit |
247f4e |
go x | isComment x = pos x $ TagComment a : go (skip isCommentEnd y)
|
|
Packit |
247f4e |
where (y,a) = charsStr $ next x
|
|
Packit |
247f4e |
go x | isEntityName x = poss x ((if optTagWarning then id else filter (not . isTagWarning)) $ optEntityData (a, getEntityEnd y)) ++ go (skip isEntityEnd y)
|
|
Packit |
247f4e |
where (y,a) = charsStr $ next x
|
|
Packit |
247f4e |
go x | isEntityNumHex x = pos x $ TagText (fromChar $ entityChr x a) : go (skip isEntityEnd y)
|
|
Packit |
247f4e |
where (y,a) = chars $ next x
|
|
Packit |
247f4e |
go x | Just a <- fromWarn x = if optTagWarning then pos x $ TagWarning (fromString a) : go (next x) else go (next x)
|
|
Packit |
247f4e |
go x | isEof x = []
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
atts :: ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , [(str,str)] )
|
|
Packit |
247f4e |
atts x | isAttName x = second ((a,b):) $ atts z
|
|
Packit |
247f4e |
where (y,a) = charsStr (next x)
|
|
Packit |
247f4e |
(z,b) = if isAttVal y then charsEntsStr (next y) else (y, empty)
|
|
Packit |
247f4e |
atts x | isAttVal x = second ((empty,a):) $ atts y
|
|
Packit |
247f4e |
where (y,a) = charsEntsStr (next x)
|
|
Packit |
247f4e |
atts x = (x, [])
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
-- chars
|
|
Packit |
247f4e |
chars x = charss False x
|
|
Packit |
247f4e |
charsStr x = (id *** fromString) $ chars x
|
|
Packit |
247f4e |
charsEntsStr x = (id *** fromString) $ charss True x
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
-- loop round collecting characters, if the b is set including entity
|
|
Packit |
247f4e |
charss :: Bool -> ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , String)
|
|
Packit |
247f4e |
charss t x | Just a <- fromChr x = (y, a:b)
|
|
Packit |
247f4e |
where (y,b) = charss t (next x)
|
|
Packit |
247f4e |
charss t x | t, isEntityName x = second (toString n ++) $ charss t $ addWarns m z
|
|
Packit |
247f4e |
where (y,a) = charsStr $ next x
|
|
Packit |
247f4e |
b = getEntityEnd y
|
|
Packit |
247f4e |
z = skip isEntityEnd y
|
|
Packit |
247f4e |
(n,m) = optEntityAttrib (a,b)
|
|
Packit |
247f4e |
charss t x | t, isEntityNumHex x = second (entityChr x a:) $ charss t z
|
|
Packit |
247f4e |
where (y,a) = chars $ next x
|
|
Packit |
247f4e |
z = skip isEntityEnd y
|
|
Packit |
247f4e |
charss t ((_,w),Pos p:xs) = charss t ((p,w),xs)
|
|
Packit |
247f4e |
charss t x | Just a <- fromWarn x = charss t $ (if optTagWarning then addWarns [TagWarning $ fromString a] else id) $ next x
|
|
Packit |
247f4e |
charss t x = (x, [])
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
-- utility functions
|
|
Packit |
247f4e |
next x = second (drop 1) x
|
|
Packit |
247f4e |
skip f x = assert (isEof x || f x) (next x)
|
|
Packit |
247f4e |
addWarns ws x@((p,w),y) = ((p, reverse (poss x ws) ++ w), y)
|
|
Packit |
247f4e |
pos ((p,_),_) rest = if optTagPosition then tagPosition p : rest else rest
|
|
Packit |
247f4e |
warn x s rest = if optTagWarning then pos x $ TagWarning (fromString s) : rest else rest
|
|
Packit |
247f4e |
poss x = concatMap (\w -> pos x [w])
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
entityChr x s | isEntityNum x = chr_ $ read s
|
|
Packit |
247f4e |
| isEntityHex x = chr_ $ fst $ head $ readHex s
|
|
Packit |
247f4e |
where chr_ x | inRange (toInteger $ ord minBound, toInteger $ ord maxBound) x = chr $ fromInteger x
|
|
Packit |
247f4e |
| otherwise = '?'
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
isEof (_,[]) = True; isEof _ = False
|
|
Packit |
247f4e |
isChar (_,Char{}:_) = True; isChar _ = False
|
|
Packit |
247f4e |
isTag (_,Tag{}:_) = True; isTag _ = False
|
|
Packit |
247f4e |
isTagShut (_,TagShut{}:_) = True; isTagShut _ = False
|
|
Packit |
247f4e |
isAttName (_,AttName{}:_) = True; isAttName _ = False
|
|
Packit |
247f4e |
isAttVal (_,AttVal{}:_) = True; isAttVal _ = False
|
|
Packit |
247f4e |
isTagEnd (_,TagEnd{}:_) = True; isTagEnd _ = False
|
|
Packit |
247f4e |
isTagEndClose (_,TagEndClose{}:_) = True; isTagEndClose _ = False
|
|
Packit |
247f4e |
isComment (_,Comment{}:_) = True; isComment _ = False
|
|
Packit |
247f4e |
isCommentEnd (_,CommentEnd{}:_) = True; isCommentEnd _ = False
|
|
Packit |
247f4e |
isEntityName (_,EntityName{}:_) = True; isEntityName _ = False
|
|
Packit |
247f4e |
isEntityNumHex (_,EntityNum{}:_) = True; isEntityNumHex (_,EntityHex{}:_) = True; isEntityNumHex _ = False
|
|
Packit |
247f4e |
isEntityNum (_,EntityNum{}:_) = True; isEntityNum _ = False
|
|
Packit |
247f4e |
isEntityHex (_,EntityHex{}:_) = True; isEntityHex _ = False
|
|
Packit |
247f4e |
isEntityEnd (_,EntityEnd{}:_) = True; isEntityEnd _ = False
|
|
Packit |
247f4e |
isWarn (_,Warn{}:_) = True; isWarn _ = False
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
fromChr (_,Char x:_) = Just x ; fromChr _ = Nothing
|
|
Packit |
247f4e |
fromWarn (_,Warn x:_) = Just x ; fromWarn _ = Nothing
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
getEntityEnd (_,EntityEnd b:_) = b
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
-- Merge all adjacent TagText bits
|
|
Packit |
247f4e |
tagTextMerge :: StringLike str => [Tag str] -> [Tag str]
|
|
Packit |
247f4e |
tagTextMerge (TagText x:xs) = TagText (strConcat (x:a)) : tagTextMerge b
|
|
Packit |
247f4e |
where
|
|
Packit |
247f4e |
(a,b) = f xs
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
-- additional brackets on 3 lines to work around HSE 1.3.2 bugs with pattern fixities
|
|
Packit |
247f4e |
f (TagText x:xs) = (x:a,b)
|
|
Packit |
247f4e |
where (a,b) = f xs
|
|
Packit |
247f4e |
f (TagPosition{}:(x@TagText{}:xs)) = f $ x : xs
|
|
Packit |
247f4e |
f x = g x id x
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
g o op (p@TagPosition{}:(w@TagWarning{}:xs)) = g o (op . (p:) . (w:)) xs
|
|
Packit |
247f4e |
g o op (w@TagWarning{}:xs) = g o (op . (w:)) xs
|
|
Packit |
247f4e |
g o op (p@TagPosition{}:(x@TagText{}:xs)) = f $ p : x : op xs
|
|
Packit |
247f4e |
g o op (x@TagText{}:xs) = f $ x : op xs
|
|
Packit |
247f4e |
g o op _ = ([], o)
|
|
Packit |
247f4e |
|
|
Packit |
247f4e |
tagTextMerge (x:xs) = x : tagTextMerge xs
|
|
Packit |
247f4e |
tagTextMerge [] = []
|