Blame src/Text/HTML/TagSoup/Implementation.hs

Packit 247f4e
{-# LANGUAGE RecordWildCards, PatternGuards, ScopedTypeVariables #-}
Packit 247f4e
Packit 247f4e
module Text.HTML.TagSoup.Implementation where
Packit 247f4e
Packit 247f4e
import Text.HTML.TagSoup.Type
Packit 247f4e
import Text.HTML.TagSoup.Options
Packit 247f4e
import Text.StringLike as Str
Packit 247f4e
import Numeric (readHex)
Packit 247f4e
import Data.Char (chr, ord)
Packit 247f4e
import Data.Ix
Packit 247f4e
import Control.Exception(assert)
Packit 247f4e
import Control.Arrow
Packit 247f4e
Packit 247f4e
---------------------------------------------------------------------
Packit 247f4e
-- BOTTOM LAYER
Packit 247f4e
Packit 247f4e
data Out
Packit 247f4e
    = Char Char
Packit 247f4e
    | Tag             -- <
Packit 247f4e
    | TagShut         -- </
Packit 247f4e
    | AttName
Packit 247f4e
    | AttVal
Packit 247f4e
    | TagEnd          -- >
Packit 247f4e
    | TagEndClose     -- />
Packit 247f4e
    | Comment         -- 
Packit 247f4e
    | CommentEnd      -- -->
Packit 247f4e
    | EntityName      -- &
Packit 247f4e
    | EntityNum       -- &#
Packit 247f4e
    | EntityHex       -- &#x
Packit 247f4e
    | EntityEnd Bool  -- Attributed followed by ; for True, missing ; for False
Packit 247f4e
    | Warn String
Packit 247f4e
    | Pos Position
Packit 247f4e
      deriving (Show,Eq)
Packit 247f4e
Packit 247f4e
errSeen x = Warn $ "Unexpected " ++ show x
Packit 247f4e
errWant x = Warn $ "Expected " ++ show x
Packit 247f4e
Packit 247f4e
data S = S
Packit 247f4e
    {s :: S
Packit 247f4e
    ,tl :: S
Packit 247f4e
    ,hd :: Char
Packit 247f4e
    ,eof :: Bool
Packit 247f4e
    ,next :: String -> Maybe S
Packit 247f4e
    ,pos :: [Out] -> [Out]
Packit 247f4e
    }
Packit 247f4e
Packit 247f4e
Packit 247f4e
expand :: Position -> String -> S
Packit 247f4e
expand p text = p `seq` res
Packit 247f4e
    where res = S{s = res
Packit 247f4e
                 ,tl = expand (positionChar p (head text)) (tail text)
Packit 247f4e
                 ,hd = if null text then '\0' else head text
Packit 247f4e
                 ,eof = null text
Packit 247f4e
                 ,next = next p text
Packit 247f4e
                 ,pos = (Pos p:)
Packit 247f4e
                 }
Packit 247f4e
Packit 247f4e
          next p (t:ext) (s:tr) | t == s = next (positionChar p t) ext tr
Packit 247f4e
          next p text [] = Just $ expand p text
Packit 247f4e
          next _ _ _ = Nothing
Packit 247f4e
Packit 247f4e
Packit 247f4e
infixr &
Packit 247f4e
Packit 247f4e
class Outable a where (&) :: a -> [Out] -> [Out]
Packit 247f4e
instance Outable Char where (&) = ampChar
Packit 247f4e
instance Outable Out where (&) = ampOut
Packit 247f4e
ampChar x y = Char x : y
Packit 247f4e
ampOut x y = x : y
Packit 247f4e
Packit 247f4e
Packit 247f4e
state :: String -> S
Packit 247f4e
state s = expand nullPosition s
Packit 247f4e
Packit 247f4e
---------------------------------------------------------------------
Packit 247f4e
-- TOP LAYER
Packit 247f4e
Packit 247f4e
Packit 247f4e
output :: forall str . StringLike str => ParseOptions str -> [Out] -> [Tag str]
Packit 247f4e
output ParseOptions{..} x = (if optTagTextMerge then tagTextMerge else id) $ go ((nullPosition,[]),x)
Packit 247f4e
    where
Packit 247f4e
        -- main choice loop
Packit 247f4e
        go :: ((Position,[Tag str]),[Out]) -> [Tag str]
Packit 247f4e
        go ((p,ws),xs) | p `seq` False = [] -- otherwise p is a space leak when optTagPosition == False
Packit 247f4e
        go ((p,ws),xs) | not $ null ws = (if optTagWarning then (reverse ws++) else id) $ go ((p,[]),xs)
Packit 247f4e
        go ((p,ws),Pos p2:xs) = go ((p2,ws),xs)
Packit 247f4e
Packit 247f4e
        go x | isChar x = pos x $ TagText a : go y
Packit 247f4e
            where (y,a) = charsStr x
Packit 247f4e
        go x | isTag x = pos x $ TagOpen a b : (if isTagEndClose z then pos x $ TagClose a : go (next z) else go (skip isTagEnd z))
Packit 247f4e
            where (y,a) = charsStr $ next x
Packit 247f4e
                  (z,b) = atts y
Packit 247f4e
        go x | isTagShut x = pos x $ (TagClose a:) $
Packit 247f4e
                (if not (null b) then warn x "Unexpected attributes in close tag" else id) $
Packit 247f4e
                if isTagEndClose z then warn x "Unexpected self-closing in close tag" $ go (next z) else go (skip isTagEnd z)
Packit 247f4e
            where (y,a) = charsStr $ next x
Packit 247f4e
                  (z,b) = atts y
Packit 247f4e
        go x | isComment x = pos x $ TagComment a : go (skip isCommentEnd y)
Packit 247f4e
            where (y,a) = charsStr $ next x
Packit 247f4e
        go x | isEntityName x = poss x ((if optTagWarning then id else filter (not . isTagWarning)) $ optEntityData (a, getEntityEnd y)) ++ go (skip isEntityEnd y) 
Packit 247f4e
            where (y,a) = charsStr $ next x
Packit 247f4e
        go x | isEntityNumHex x = pos x $ TagText (fromChar $ entityChr x a) : go (skip isEntityEnd y)
Packit 247f4e
            where (y,a) = chars $ next x
Packit 247f4e
        go x | Just a <- fromWarn x = if optTagWarning then pos x $ TagWarning (fromString a) : go (next x) else go (next x)
Packit 247f4e
        go x | isEof x = []
Packit 247f4e
Packit 247f4e
        atts :: ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , [(str,str)] )
Packit 247f4e
        atts x | isAttName x = second ((a,b):) $ atts z
Packit 247f4e
            where (y,a) = charsStr (next x)
Packit 247f4e
                  (z,b) = if isAttVal y then charsEntsStr (next y) else (y, empty)
Packit 247f4e
        atts x | isAttVal x = second ((empty,a):) $ atts y
Packit 247f4e
            where (y,a) = charsEntsStr (next x)
Packit 247f4e
        atts x = (x, [])
Packit 247f4e
Packit 247f4e
        -- chars
Packit 247f4e
        chars x = charss False x
Packit 247f4e
        charsStr x = (id *** fromString) $ chars x
Packit 247f4e
        charsEntsStr x = (id *** fromString) $ charss True x
Packit 247f4e
Packit 247f4e
        -- loop round collecting characters, if the b is set including entity
Packit 247f4e
        charss :: Bool -> ((Position,[Tag str]),[Out]) -> ( ((Position,[Tag str]),[Out]) , String)
Packit 247f4e
        charss t x | Just a <- fromChr x = (y, a:b)
Packit 247f4e
            where (y,b) = charss t (next x)
Packit 247f4e
        charss t x | t, isEntityName x = second (toString n ++) $ charss t $ addWarns m z
Packit 247f4e
            where (y,a) = charsStr $ next x
Packit 247f4e
                  b = getEntityEnd y
Packit 247f4e
                  z = skip isEntityEnd y
Packit 247f4e
                  (n,m) = optEntityAttrib (a,b)
Packit 247f4e
        charss t x | t, isEntityNumHex x = second (entityChr x a:) $ charss t z
Packit 247f4e
            where (y,a) = chars $ next x
Packit 247f4e
                  z = skip isEntityEnd y
Packit 247f4e
        charss t ((_,w),Pos p:xs) = charss t ((p,w),xs)
Packit 247f4e
        charss t x | Just a <- fromWarn x = charss t $ (if optTagWarning then addWarns [TagWarning $ fromString a] else id) $ next x
Packit 247f4e
        charss t x = (x, [])
Packit 247f4e
Packit 247f4e
        -- utility functions
Packit 247f4e
        next x = second (drop 1) x
Packit 247f4e
        skip f x = assert (isEof x || f x) (next x)
Packit 247f4e
        addWarns ws x@((p,w),y) = ((p, reverse (poss x ws) ++ w), y)
Packit 247f4e
        pos ((p,_),_) rest = if optTagPosition then tagPosition p : rest else rest
Packit 247f4e
        warn x s rest = if optTagWarning then pos x $ TagWarning (fromString s) : rest else rest
Packit 247f4e
        poss x = concatMap (\w -> pos x [w]) 
Packit 247f4e
Packit 247f4e
Packit 247f4e
entityChr x s | isEntityNum x = chr_ $ read s
Packit 247f4e
              | isEntityHex x = chr_ $ fst $ head $ readHex s
Packit 247f4e
    where chr_ x | inRange (toInteger $ ord minBound, toInteger $ ord maxBound) x = chr $ fromInteger x
Packit 247f4e
                 | otherwise = '?'
Packit 247f4e
Packit 247f4e
Packit 247f4e
isEof (_,[]) = True; isEof _ = False
Packit 247f4e
isChar (_,Char{}:_) = True; isChar _ = False
Packit 247f4e
isTag (_,Tag{}:_) = True; isTag _ = False
Packit 247f4e
isTagShut (_,TagShut{}:_) = True; isTagShut _ = False
Packit 247f4e
isAttName (_,AttName{}:_) = True; isAttName _ = False
Packit 247f4e
isAttVal (_,AttVal{}:_) = True; isAttVal _ = False
Packit 247f4e
isTagEnd (_,TagEnd{}:_) = True; isTagEnd _ = False
Packit 247f4e
isTagEndClose (_,TagEndClose{}:_) = True; isTagEndClose _ = False
Packit 247f4e
isComment (_,Comment{}:_) = True; isComment _ = False
Packit 247f4e
isCommentEnd (_,CommentEnd{}:_) = True; isCommentEnd _ = False
Packit 247f4e
isEntityName (_,EntityName{}:_) = True; isEntityName _ = False
Packit 247f4e
isEntityNumHex (_,EntityNum{}:_) = True; isEntityNumHex (_,EntityHex{}:_) = True; isEntityNumHex _ = False
Packit 247f4e
isEntityNum (_,EntityNum{}:_) = True; isEntityNum _ = False
Packit 247f4e
isEntityHex (_,EntityHex{}:_) = True; isEntityHex _ = False
Packit 247f4e
isEntityEnd (_,EntityEnd{}:_) = True; isEntityEnd _ = False
Packit 247f4e
isWarn (_,Warn{}:_) = True; isWarn _ = False
Packit 247f4e
Packit 247f4e
fromChr (_,Char x:_) = Just x ; fromChr _ = Nothing
Packit 247f4e
fromWarn (_,Warn x:_) = Just x ; fromWarn _ = Nothing
Packit 247f4e
Packit 247f4e
getEntityEnd (_,EntityEnd b:_) = b
Packit 247f4e
Packit 247f4e
Packit 247f4e
-- Merge all adjacent TagText bits
Packit 247f4e
tagTextMerge :: StringLike str => [Tag str] -> [Tag str]
Packit 247f4e
tagTextMerge (TagText x:xs) = TagText (strConcat (x:a)) : tagTextMerge b
Packit 247f4e
    where
Packit 247f4e
        (a,b) = f xs
Packit 247f4e
Packit 247f4e
        -- additional brackets on 3 lines to work around HSE 1.3.2 bugs with pattern fixities
Packit 247f4e
        f (TagText x:xs) = (x:a,b)
Packit 247f4e
            where (a,b) = f xs
Packit 247f4e
        f (TagPosition{}:(x@TagText{}:xs)) = f $ x : xs
Packit 247f4e
        f x = g x id x
Packit 247f4e
Packit 247f4e
        g o op (p@TagPosition{}:(w@TagWarning{}:xs)) = g o (op . (p:) . (w:)) xs
Packit 247f4e
        g o op (w@TagWarning{}:xs) = g o (op . (w:)) xs
Packit 247f4e
        g o op (p@TagPosition{}:(x@TagText{}:xs)) = f $ p : x : op xs
Packit 247f4e
        g o op (x@TagText{}:xs) = f $ x : op xs
Packit 247f4e
        g o op _ = ([], o)
Packit 247f4e
Packit 247f4e
tagTextMerge (x:xs) = x : tagTextMerge xs
Packit 247f4e
tagTextMerge [] = []