{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}
-- | DOM-based XML parsing and rendering.
--
-- In this module, attribute values and content nodes can contain either raw
-- text or entities. In most cases, these can be fully resolved at parsing. If
-- that is the case for your documents, the "Text.XML" module provides
-- simplified datatypes that only contain raw text.
module Text.XML.Unresolved
( -- * Non-streaming functions
writeFile
, readFile
-- * Lazy bytestrings
, renderLBS
, parseLBS
, parseLBS_
-- * Text
, parseText
, parseText_
, sinkTextDoc
-- * Byte streams
, sinkDoc
-- * Streaming functions
, toEvents
, elementToEvents
, fromEvents
, elementFromEvents
, renderBuilder
, renderBytes
, renderText
-- * Exceptions
, InvalidEventStream (..)
-- * Settings
, P.def
-- ** Parse
, P.ParseSettings
, P.psDecodeEntities
, P.psRetainNamespaces
-- ** Render
, R.RenderSettings
, R.rsPretty
, R.rsNamespaces
) where
import Blaze.ByteString.Builder (Builder)
import Control.Applicative ((<$>), (<*>))
import Control.Exception (Exception, SomeException, throw)
import Control.Monad (when)
import Control.Monad.ST (runST)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.Resource (MonadThrow, monadThrow,
runExceptionT, runResourceT)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as L
import Data.Char (isSpace)
import Data.Conduit
import qualified Data.Conduit.Binary as CB
import Data.Conduit.Lazy (lazyConsume)
import qualified Data.Conduit.List as CL
import Data.Maybe (isJust, mapMaybe)
import Data.Monoid (mconcat)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Typeable (Typeable)
import Data.XML.Types
import Prelude hiding (readFile, writeFile)
import System.IO.Unsafe (unsafePerformIO)
import Text.XML.Stream.Parse (ParseSettings)
import qualified Text.XML.Stream.Parse as P
import qualified Text.XML.Stream.Render as R
readFile :: P.ParseSettings -> FilePath -> IO Document
readFile ps fp = runResourceT $ CB.sourceFile fp $$ sinkDoc ps
sinkDoc :: MonadThrow m
=> P.ParseSettings
-> Consumer ByteString m Document
sinkDoc ps = P.parseBytesPos ps =$= fromEvents
writeFile :: R.RenderSettings -> FilePath -> Document -> IO ()
writeFile rs fp doc =
runResourceT $ renderBytes rs doc $$ CB.sinkFile fp
renderLBS :: R.RenderSettings -> Document -> L.ByteString
renderLBS rs doc =
L.fromChunks $ unsafePerformIO
-- not generally safe, but we know that runResourceT
-- will not deallocate any of the resources being used
-- by the process
$ lazyConsume
$ renderBytes rs doc
parseLBS :: P.ParseSettings -> L.ByteString -> Either SomeException Document
parseLBS ps lbs =
runST $ runExceptionT
$ CL.sourceList (L.toChunks lbs) $$ sinkDoc ps
parseLBS_ :: P.ParseSettings -> L.ByteString -> Document
parseLBS_ ps lbs = either throw id $ parseLBS ps lbs
data InvalidEventStream = ContentAfterRoot P.EventPos
| MissingRootElement
| InvalidInlineDoctype P.EventPos
| MissingEndElement Name (Maybe P.EventPos)
| UnterminatedInlineDoctype
deriving Typeable
instance Exception InvalidEventStream
instance Show InvalidEventStream where
show (ContentAfterRoot (pos, e)) = mShowPos pos ++ "Found content after root element: " ++ prettyShowE e
show MissingRootElement = "Missing root element"
show (InvalidInlineDoctype (pos, e)) = mShowPos pos ++ "Invalid content inside doctype: " ++ prettyShowE e
show (MissingEndElement name Nothing) = "Documented ended while expected end element for: " ++ prettyShowName name
show (MissingEndElement name (Just (pos, e))) = mShowPos pos ++ "Expected end element for: " ++ prettyShowName name ++ ", but received: " ++ prettyShowE e
show UnterminatedInlineDoctype = "Unterminated doctype declaration"
mShowPos :: Maybe P.PositionRange -> String
mShowPos Nothing = ""
mShowPos (Just pos) = show pos ++ ": "
prettyShowE :: Event -> String
prettyShowE = show -- FIXME
prettyShowName :: Name -> String
prettyShowName = show -- FIXME
renderBuilder :: Monad m => R.RenderSettings -> Document -> Producer m Builder
renderBuilder rs doc = CL.sourceList (toEvents doc) =$= R.renderBuilder rs
--renderBytes :: MonadUnsafeIO m => R.RenderSettings -> Document -> Producer m ByteString
renderBytes rs doc = CL.sourceList (toEvents doc) =$= R.renderBytes rs
--renderText :: (MonadThrow m, MonadUnsafeIO m) => R.RenderSettings -> Document -> Producer m Text
renderText rs doc = CL.sourceList (toEvents doc) =$= R.renderText rs
manyTries :: Monad m => m (Maybe a) -> m [a]
manyTries f =
go id
where
go front = do
x <- f
case x of
Nothing -> return $ front []
Just y -> go (front . (:) y)
dropReturn :: Monad m => a -> ConduitM i o m a
dropReturn x = CL.drop 1 >> return x
-- | Parse a document from a stream of events.
fromEvents :: MonadThrow m => Consumer P.EventPos m Document
fromEvents = do
skip EventBeginDocument
d <- Document <$> goP <*> require elementFromEvents <*> goM
skip EventEndDocument
y <- CL.head
case y of
Nothing -> return d
Just (_, EventEndDocument) -> lift $ monadThrow MissingRootElement
Just z ->
lift $ monadThrow $ ContentAfterRoot z
where
skip e = do
x <- CL.peek
when (fmap snd x == Just e) (CL.drop 1)
require f = do
x <- f
case x of
Just y -> return y
Nothing -> do
my <- CL.head
case my of
Nothing -> error "Text.XML.Unresolved:impossible"
Just (_, EventEndDocument) -> lift $ monadThrow MissingRootElement
Just y -> lift $ monadThrow $ ContentAfterRoot y
goP = Prologue <$> goM <*> goD <*> goM
goM = manyTries goM'
goM' = do
x <- CL.peek
case x of
Just (_, EventInstruction i) -> dropReturn $ Just $ MiscInstruction i
Just (_, EventComment t) -> dropReturn $ Just $ MiscComment t
Just (_, EventContent (ContentText t))
| T.all isSpace t -> CL.drop 1 >> goM'
_ -> return Nothing
goD = do
x <- CL.peek
case x of
Just (_, EventBeginDoctype name meid) -> do
CL.drop 1
dropTillDoctype
return (Just $ Doctype name meid)
_ -> return Nothing
dropTillDoctype = do
x <- CL.head
case x of
-- Leaving the following line commented so that the intention of
-- this function stays clear. I figure in the future xml-types will
-- be expanded again to support some form of EventDeclaration
--
-- Just (EventDeclaration _) -> dropTillDoctype
Just (_, EventEndDoctype) -> return ()
Just epos -> lift $ monadThrow $ InvalidInlineDoctype epos
Nothing -> lift $ monadThrow UnterminatedInlineDoctype
-- | Try to parse a document element (as defined in XML) from a stream of events.
--
-- @since 1.3.5
elementFromEvents :: MonadThrow m => Consumer P.EventPos m (Maybe Element)
elementFromEvents = goE
where
goE = do
x <- CL.peek
case x of
Just (_, EventBeginElement n as) -> Just <$> goE' n as
_ -> return Nothing
goE' n as = do
CL.drop 1
ns <- manyTries goN
y <- CL.head
if fmap snd y == Just (EventEndElement n)
then return $ Element n as $ compressNodes ns
else lift $ monadThrow $ MissingEndElement n y
goN = do
x <- CL.peek
case x of
Just (_, EventBeginElement n as) -> (Just . NodeElement) <$> goE' n as
Just (_, EventInstruction i) -> dropReturn $ Just $ NodeInstruction i
Just (_, EventContent c) -> dropReturn $ Just $ NodeContent c
Just (_, EventComment t) -> dropReturn $ Just $ NodeComment t
Just (_, EventCDATA t) -> dropReturn $ Just $ NodeContent $ ContentText t
_ -> return Nothing
-- | Render a document into events.
toEvents :: Document -> [Event]
toEvents (Document prol root epi) =
(EventBeginDocument :)
. goP prol . elementToEvents' root . goM epi $ [EventEndDocument]
where
goP (Prologue before doctype after) =
goM before . maybe id goD doctype . goM after
goM [] = id
goM [x] = (goM' x :)
goM (x:xs) = (goM' x :) . goM xs
goM' (MiscInstruction i) = EventInstruction i
goM' (MiscComment t) = EventComment t
goD (Doctype name meid) =
(:) (EventBeginDoctype name meid)
. (:) EventEndDoctype
-- | Render a document element into events.
--
-- @since 1.3.5
elementToEvents :: Element -> [Event]
elementToEvents e = elementToEvents' e []
elementToEvents' :: Element -> [Event] -> [Event]
elementToEvents' = goE
where
goE (Element name as ns) =
(EventBeginElement name as :)
. goN ns
. (EventEndElement name :)
goN [] = id
goN [x] = goN' x
goN (x:xs) = goN' x . goN xs
goN' (NodeElement e) = goE e
goN' (NodeInstruction i) = (EventInstruction i :)
goN' (NodeContent c) = (EventContent c :)
goN' (NodeComment t) = (EventComment t :)
compressNodes :: [Node] -> [Node]
compressNodes [] = []
compressNodes [x] = [x]
compressNodes (x@(NodeContent (ContentText _)) : y@(NodeContent (ContentText _)) : z) =
let (textNodes, remainder) = span (isJust . unContent) (x:y:z)
texts = mapMaybe unContent textNodes
in
compressNodes $ NodeContent (ContentText $ mconcat texts) : remainder
where
unContent (NodeContent (ContentText text)) = Just text
unContent _ = Nothing
compressNodes (x:xs) = x : compressNodes xs
parseText :: ParseSettings -> TL.Text -> Either SomeException Document
parseText ps tl = runST
$ runExceptionT
$ CL.sourceList (TL.toChunks tl)
$$ sinkTextDoc ps
parseText_ :: ParseSettings -> TL.Text -> Document
parseText_ ps = either throw id . parseText ps
sinkTextDoc :: MonadThrow m
=> ParseSettings
-> Consumer Text m Document
sinkTextDoc ps = P.parseTextPos ps =$= fromEvents