Blame Text/XML.hs

Packit 899799
{-# LANGUAGE CPP                #-}
Packit 899799
{-# LANGUAGE DeriveDataTypeable #-}
Packit 899799
{-# LANGUAGE FlexibleContexts   #-}
Packit 899799
{-# LANGUAGE OverloadedStrings  #-}
Packit 899799
{-# LANGUAGE PatternGuards      #-}
Packit 899799
{-# LANGUAGE RankNTypes         #-}
Packit 899799
-- | DOM-based parsing and rendering.
Packit 899799
--
Packit 899799
-- This module requires that all entities be resolved at parsing. If you need
Packit 899799
-- to interact with unresolved entities, please use "Text.XML.Unresolved". This
Packit 899799
-- is the recommended module for most uses cases.
Packit 899799
--
Packit 899799
-- While many of the datatypes in this module are simply re-exported from
Packit 899799
-- @Data.XML.Types@, 'Document', 'Node' and 'Element' are all redefined here to
Packit 899799
-- disallow the possibility of unresolved entities. Conversion functions are
Packit 899799
-- provided to switch between the two sets of datatypes.
Packit 899799
--
Packit 899799
-- For simpler, bidirectional traversal of the DOM tree, see the
Packit 899799
-- "Text.XML.Cursor" module.
Packit 899799
module Text.XML
Packit 899799
    ( -- * Data types
Packit 899799
      Document (..)
Packit 899799
    , Prologue (..)
Packit 899799
    , Instruction (..)
Packit 899799
    , Miscellaneous (..)
Packit 899799
    , Node (..)
Packit 899799
    , Element (..)
Packit 899799
    , Name (..)
Packit 899799
    , Doctype (..)
Packit 899799
    , ExternalID (..)
Packit 899799
      -- * Parsing
Packit 899799
      -- ** Files
Packit 899799
    , readFile
Packit 899799
      -- ** Bytes
Packit 899799
    , parseLBS
Packit 899799
    , parseLBS_
Packit 899799
    , sinkDoc
Packit 899799
      -- ** Text
Packit 899799
    , parseText
Packit 899799
    , parseText_
Packit 899799
    , sinkTextDoc
Packit 899799
      -- ** Other
Packit 899799
    , fromEvents
Packit 899799
    , UnresolvedEntityException (..)
Packit 899799
    , XMLException (..)
Packit 899799
      -- * Rendering
Packit 899799
    , writeFile
Packit 899799
    , renderLBS
Packit 899799
    , renderText
Packit 899799
    , renderBytes
Packit 899799
      -- * Settings
Packit 899799
    , def
Packit 899799
      -- ** Parsing
Packit 899799
    , ParseSettings
Packit 899799
    , psDecodeEntities
Packit 899799
    , P.psRetainNamespaces
Packit 899799
      -- *** Entity decoding
Packit 899799
    , P.decodeXmlEntities
Packit 899799
    , P.decodeHtmlEntities
Packit 899799
      -- ** Rendering
Packit 899799
    , R.RenderSettings
Packit 899799
    , R.rsPretty
Packit 899799
    , R.rsNamespaces
Packit 899799
    , R.rsAttrOrder
Packit 899799
    , R.rsUseCDATA
Packit 899799
    , R.rsXMLDeclaration
Packit 899799
    , R.orderAttrs
Packit 899799
      -- * Conversion
Packit 899799
    , toXMLDocument
Packit 899799
    , fromXMLDocument
Packit 899799
    , toXMLNode
Packit 899799
    , fromXMLNode
Packit 899799
    , toXMLElement
Packit 899799
    , fromXMLElement
Packit 899799
    ) where
Packit 899799
Packit 899799
import           Control.Applicative          ((<$>))
Packit 899799
import           Control.DeepSeq              (NFData (rnf))
Packit 899799
import           Control.Exception            (Exception, SomeException, handle,
Packit 899799
                                               throw, throwIO)
Packit 899799
import           Control.Monad.ST             (runST)
Packit 899799
import           Control.Monad.Trans.Resource (MonadThrow, monadThrow,
Packit 899799
                                               runExceptionT, runResourceT)
Packit 899799
import           Data.ByteString              (ByteString)
Packit 899799
import qualified Data.ByteString.Lazy         as L
Packit 899799
import           Data.Data                    (Data)
Packit 899799
import           Data.Either                  (partitionEithers)
Packit 899799
import qualified Data.Map                     as Map
Packit 899799
import           Data.Set                     (Set)
Packit 899799
import qualified Data.Set                     as Set
Packit 899799
import           Data.Text                    (Text)
Packit 899799
import qualified Data.Text                    as T
Packit 899799
import           Data.Typeable                (Typeable)
Packit 899799
import           Data.XML.Types               (Doctype (..), ExternalID (..),
Packit 899799
                                               Instruction (..),
Packit 899799
                                               Miscellaneous (..), Name (..),
Packit 899799
                                               Prologue (..))
Packit 899799
import qualified Data.XML.Types               as X
Packit 899799
import           Prelude                      hiding (readFile, writeFile)
Packit 899799
import           Text.XML.Stream.Parse        (ParseSettings, def,
Packit 899799
                                               psDecodeEntities)
Packit 899799
import qualified Text.XML.Stream.Parse        as P
Packit 899799
import qualified Text.XML.Stream.Render       as R
Packit 899799
import qualified Text.XML.Unresolved          as D
Packit 899799
Packit 899799
import           Control.Monad.Trans.Class    (lift)
Packit 899799
import           Data.Conduit
Packit 899799
import qualified Data.Conduit.Binary          as CB
Packit 899799
import           Data.Conduit.Lazy            (lazyConsume)
Packit 899799
import qualified Data.Conduit.List            as CL
Packit 899799
import qualified Data.Text.Lazy               as TL
Packit 899799
import qualified Data.Text.Lazy.Encoding      as TLE
Packit 899799
import           System.IO.Unsafe             (unsafePerformIO)
Packit 899799
Packit 899799
import           Control.Arrow                (first)
Packit 899799
import           Data.List                    (foldl')
Packit 899799
import           Data.Monoid                  (mappend, mempty)
Packit 899799
import           Data.String                  (fromString)
Packit 899799
import qualified Text.Blaze                   as B
Packit 899799
import qualified Text.Blaze.Html              as B
Packit 899799
import qualified Text.Blaze.Html5             as B5
Packit 899799
import qualified Text.Blaze.Internal          as BI
Packit 899799
Packit 899799
data Document = Document
Packit 899799
    { documentPrologue :: Prologue
Packit 899799
    , documentRoot     :: Element
Packit 899799
    , documentEpilogue :: [Miscellaneous]
Packit 899799
    }
Packit 899799
  deriving (Show, Eq, Typeable, Data)
Packit 899799
Packit 899799
#if MIN_VERSION_containers(0, 4, 2)
Packit 899799
instance NFData Document where
Packit 899799
  rnf (Document a b c) = rnf a `seq` rnf b `seq` rnf c `seq` ()
Packit 899799
#endif
Packit 899799
Packit 899799
data Node
Packit 899799
    = NodeElement Element
Packit 899799
    | NodeInstruction Instruction
Packit 899799
    | NodeContent Text
Packit 899799
    | NodeComment Text
Packit 899799
  deriving (Show, Eq, Ord, Typeable, Data)
Packit 899799
Packit 899799
#if MIN_VERSION_containers(0, 4, 2)
Packit 899799
instance NFData Node where
Packit 899799
  rnf (NodeElement e)     = rnf e `seq` ()
Packit 899799
  rnf (NodeInstruction i) = rnf i `seq` ()
Packit 899799
  rnf (NodeContent t)     = rnf t `seq` ()
Packit 899799
  rnf (NodeComment t)     = rnf t `seq` ()
Packit 899799
#endif
Packit 899799
Packit 899799
data Element = Element
Packit 899799
    { elementName       :: Name
Packit 899799
    , elementAttributes :: Map.Map Name Text
Packit 899799
    , elementNodes      :: [Node]
Packit 899799
    }
Packit 899799
  deriving (Show, Eq, Ord, Typeable, Data)
Packit 899799
Packit 899799
#if MIN_VERSION_containers(0, 4, 2)
Packit 899799
instance NFData Element where
Packit 899799
  rnf (Element a b c) = rnf a `seq` rnf b `seq` rnf c `seq` ()
Packit 899799
#endif
Packit 899799
Packit 899799
{-
Packit 899799
readFile :: FilePath -> ParseSettings -> IO (Either SomeException Document)
Packit 899799
readFile_ :: FIlePath -> ParseSettings -> IO Document
Packit 899799
-}
Packit 899799
Packit 899799
toXMLDocument :: Document -> X.Document
Packit 899799
toXMLDocument = toXMLDocument' def
Packit 899799
Packit 899799
toXMLDocument' :: R.RenderSettings -> Document -> X.Document
Packit 899799
toXMLDocument' rs (Document a b c) = X.Document a (toXMLElement' rs b) c
Packit 899799
Packit 899799
toXMLElement :: Element -> X.Element
Packit 899799
toXMLElement = toXMLElement' def
Packit 899799
Packit 899799
toXMLElement' :: R.RenderSettings -> Element -> X.Element
Packit 899799
toXMLElement' rs (Element name as nodes) =
Packit 899799
    X.Element name as' nodes'
Packit 899799
  where
Packit 899799
    as' = map (\(x, y) -> (x, [X.ContentText y])) $ R.rsAttrOrder rs name as
Packit 899799
    nodes' = map (toXMLNode' rs) nodes
Packit 899799
Packit 899799
toXMLNode :: Node -> X.Node
Packit 899799
toXMLNode = toXMLNode' def
Packit 899799
Packit 899799
toXMLNode' :: R.RenderSettings -> Node -> X.Node
Packit 899799
toXMLNode' rs (NodeElement e)    = X.NodeElement $ toXMLElement' rs e
Packit 899799
toXMLNode' _ (NodeContent t)     = X.NodeContent $ X.ContentText t
Packit 899799
toXMLNode' _ (NodeComment c)     = X.NodeComment c
Packit 899799
toXMLNode' _ (NodeInstruction i) = X.NodeInstruction i
Packit 899799
Packit 899799
fromXMLDocument :: X.Document -> Either (Set Text) Document
Packit 899799
fromXMLDocument (X.Document a b c) =
Packit 899799
    case fromXMLElement b of
Packit 899799
        Left es  -> Left es
Packit 899799
        Right b' -> Right $ Document a b' c
Packit 899799
Packit 899799
fromXMLElement :: X.Element -> Either (Set Text) Element
Packit 899799
fromXMLElement (X.Element name as nodes) =
Packit 899799
    case (lnodes, las) of
Packit 899799
        ([], []) -> Right $ Element name ras rnodes
Packit 899799
        (x, [])  -> Left $ Set.unions x
Packit 899799
        ([], y)  -> Left $ Set.unions y
Packit 899799
        (x, y)   -> Left $ Set.unions x `Set.union` Set.unions y
Packit 899799
  where
Packit 899799
    enodes = map fromXMLNode nodes
Packit 899799
    (lnodes, rnodes) = partitionEithers enodes
Packit 899799
    eas = map go as
Packit 899799
    (las, ras') = partitionEithers eas
Packit 899799
    ras = Map.fromList ras'
Packit 899799
    go (x, y) =
Packit 899799
        case go' [] id y of
Packit 899799
            Left es  -> Left es
Packit 899799
            Right y' -> Right (x, y')
Packit 899799
    go' [] front []                       = Right $ T.concat $ front []
Packit 899799
    go' errs _ []                         = Left $ Set.fromList errs
Packit 899799
    go' errs front (X.ContentText t:ys)   = go' errs (front . (:) t) ys
Packit 899799
    go' errs front (X.ContentEntity t:ys) = go' (t : errs) front ys
Packit 899799
Packit 899799
fromXMLNode :: X.Node -> Either (Set Text) Node
Packit 899799
fromXMLNode (X.NodeElement e) = NodeElement <$> fromXMLElement e
Packit 899799
fromXMLNode (X.NodeContent (X.ContentText t)) = Right $ NodeContent t
Packit 899799
fromXMLNode (X.NodeContent (X.ContentEntity t)) = Left $ Set.singleton t
Packit 899799
fromXMLNode (X.NodeComment c) = Right $ NodeComment c
Packit 899799
fromXMLNode (X.NodeInstruction i) = Right $ NodeInstruction i
Packit 899799
Packit 899799
readFile :: ParseSettings -> FilePath -> IO Document
Packit 899799
readFile ps fp = handle
Packit 899799
    (throwIO . InvalidXMLFile fp)
Packit 899799
    (runResourceT $ CB.sourceFile fp $$ sinkDoc ps)
Packit 899799
Packit 899799
data XMLException = InvalidXMLFile FilePath SomeException
Packit 899799
    deriving Typeable
Packit 899799
Packit 899799
instance Show XMLException where
Packit 899799
    show (InvalidXMLFile fp e) = concat
Packit 899799
        [ "Error parsing XML file "
Packit 899799
        , fp
Packit 899799
        , ": "
Packit 899799
        , show e
Packit 899799
        ]
Packit 899799
instance Exception XMLException
Packit 899799
Packit 899799
parseLBS :: ParseSettings -> L.ByteString -> Either SomeException Document
Packit 899799
parseLBS ps lbs = runST
Packit 899799
                $ runExceptionT
Packit 899799
                $ CL.sourceList (L.toChunks lbs)
Packit 899799
           $$ sinkDoc ps
Packit 899799
Packit 899799
parseLBS_ :: ParseSettings -> L.ByteString -> Document
Packit 899799
parseLBS_ ps = either throw id . parseLBS ps
Packit 899799
Packit 899799
sinkDoc :: MonadThrow m
Packit 899799
        => ParseSettings
Packit 899799
        -> Consumer ByteString m Document
Packit 899799
sinkDoc ps = P.parseBytesPos ps =$= fromEvents
Packit 899799
Packit 899799
parseText :: ParseSettings -> TL.Text -> Either SomeException Document
Packit 899799
parseText ps tl = runST
Packit 899799
                $ runExceptionT
Packit 899799
                $ CL.sourceList (TL.toChunks tl)
Packit 899799
           $$ sinkTextDoc ps
Packit 899799
Packit 899799
parseText_ :: ParseSettings -> TL.Text -> Document
Packit 899799
parseText_ ps = either throw id . parseText ps
Packit 899799
Packit 899799
sinkTextDoc :: MonadThrow m
Packit 899799
            => ParseSettings
Packit 899799
            -> Consumer Text m Document
Packit 899799
sinkTextDoc ps = P.parseTextPos ps =$= fromEvents
Packit 899799
Packit 899799
fromEvents :: MonadThrow m => Consumer P.EventPos m Document
Packit 899799
fromEvents = do
Packit 899799
    d <- D.fromEvents
Packit 899799
    either (lift . monadThrow . UnresolvedEntityException) return $ fromXMLDocument d
Packit 899799
Packit 899799
data UnresolvedEntityException = UnresolvedEntityException (Set Text)
Packit 899799
    deriving (Show, Typeable)
Packit 899799
instance Exception UnresolvedEntityException
Packit 899799
Packit 899799
--renderBytes :: MonadUnsafeIO m => R.RenderSettings -> Document -> Producer m ByteString
Packit 899799
renderBytes rs doc = D.renderBytes rs $ toXMLDocument' rs doc
Packit 899799
Packit 899799
writeFile :: R.RenderSettings -> FilePath -> Document -> IO ()
Packit 899799
writeFile rs fp doc =
Packit 899799
    runResourceT $ renderBytes rs doc $$ CB.sinkFile fp
Packit 899799
Packit 899799
renderLBS :: R.RenderSettings -> Document -> L.ByteString
Packit 899799
renderLBS rs doc =
Packit 899799
    L.fromChunks $ unsafePerformIO
Packit 899799
                 -- not generally safe, but we know that runResourceT
Packit 899799
                 -- will not deallocate any of the resources being used
Packit 899799
                 -- by the process
Packit 899799
                 $ lazyConsume
Packit 899799
                 $ renderBytes rs doc
Packit 899799
Packit 899799
renderText :: R.RenderSettings -> Document -> TL.Text
Packit 899799
renderText rs = TLE.decodeUtf8 . renderLBS rs
Packit 899799
Packit 899799
instance B.ToMarkup Document where
Packit 899799
    toMarkup (Document _ root _) = B5.docType >> B.toMarkup root
Packit 899799
Packit 899799
-- | Note that the special element name
Packit 899799
-- @{http://www.snoyman.com/xml2html}ie-cond@ with the single attribute @cond@
Packit 899799
-- is used to indicate an IE conditional comment.
Packit 899799
instance B.ToMarkup Element where
Packit 899799
    toMarkup (Element "{http://www.snoyman.com/xml2html}ie-cond" attrs children)
Packit 899799
      | [("cond", cond)] <- Map.toList attrs =
Packit 899799
        B.preEscapedToMarkup ("
Packit 899799
        `mappend` B.preEscapedToMarkup cond
Packit 899799
        `mappend` B.preEscapedToMarkup ("]>" :: T.Text)
Packit 899799
        `mappend` mapM_ B.toMarkup children
Packit 899799
        `mappend` B.preEscapedToMarkup ("" :: T.Text)
Packit 899799
Packit 899799
    toMarkup (Element name' attrs children) =
Packit 899799
        if isVoid
Packit 899799
            then foldl' (B.!) leaf attrs'
Packit 899799
            else foldl' (B.!) parent attrs' childrenHtml
Packit 899799
      where
Packit 899799
        childrenHtml :: B.Html
Packit 899799
        childrenHtml =
Packit 899799
            case (name `elem` ["style", "script"], children) of
Packit 899799
                (True, [NodeContent t]) -> B.preEscapedToMarkup t
Packit 899799
                _                       -> mapM_ B.toMarkup children
Packit 899799
Packit 899799
        isVoid = nameLocalName name' `Set.member` voidElems
Packit 899799
Packit 899799
        parent :: B.Html -> B.Html
Packit 899799
        parent = BI.Parent tag open close
Packit 899799
        leaf :: B.Html
Packit 899799
#if MIN_VERSION_blaze_markup(0,8,0)
Packit 899799
        leaf = BI.Leaf tag open (fromString " />") ()
Packit 899799
#else
Packit 899799
        leaf = BI.Leaf tag open (fromString " />")
Packit 899799
#endif
Packit 899799
Packit 899799
        name = T.unpack $ nameLocalName name'
Packit 899799
        tag = fromString name
Packit 899799
        open = fromString $ '<' : name
Packit 899799
        close = fromString $ concat ["</", name, ">"]
Packit 899799
Packit 899799
        attrs' :: [B.Attribute]
Packit 899799
        attrs' = map (goAttr . first nameLocalName) $ Map.toList attrs
Packit 899799
        goAttr (key, value) = B.customAttribute (B.textTag key) $ B.toValue value
Packit 899799
Packit 899799
instance B.ToMarkup Node where
Packit 899799
    toMarkup (NodeElement e) = B.toMarkup e
Packit 899799
    toMarkup (NodeContent t) = B.toMarkup t
Packit 899799
    toMarkup _               = mempty
Packit 899799
Packit 899799
voidElems :: Set.Set T.Text
Packit 899799
voidElems = Set.fromAscList $ T.words $ T.pack "area base br col command embed hr img input keygen link meta param source track wbr"