|
Packit |
899799 |
{-# LANGUAGE BangPatterns #-}
|
|
Packit |
899799 |
{-# LANGUAGE CPP #-}
|
|
Packit |
899799 |
{-# LANGUAGE DeriveDataTypeable #-}
|
|
Packit |
899799 |
{-# LANGUAGE DeriveFunctor #-}
|
|
Packit |
899799 |
{-# LANGUAGE FlexibleContexts #-}
|
|
Packit |
899799 |
{-# LANGUAGE FlexibleInstances #-}
|
|
Packit |
899799 |
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
|
Packit |
899799 |
{-# LANGUAGE OverloadedStrings #-}
|
|
Packit |
899799 |
{-# LANGUAGE PatternGuards #-}
|
|
Packit |
899799 |
{-# LANGUAGE RankNTypes #-}
|
|
Packit |
899799 |
{-# LANGUAGE StandaloneDeriving #-}
|
|
Packit |
899799 |
{-# LANGUAGE TupleSections #-}
|
|
Packit |
899799 |
{-# LANGUAGE TypeFamilies #-}
|
|
Packit |
899799 |
-- | This module provides both a native Haskell solution for parsing XML
|
|
Packit |
899799 |
-- documents into a stream of events, and a set of parser combinators for
|
|
Packit |
899799 |
-- dealing with a stream of events.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- As a simple example, if you have the following XML file:
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > <people>
|
|
Packit |
899799 |
-- > <person age="25">Michael</person>
|
|
Packit |
899799 |
-- > <person age="2">Eliezer</person>
|
|
Packit |
899799 |
-- > </people>
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Then this code:
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- > {-# LANGUAGE OverloadedStrings #-}
|
|
Packit |
899799 |
-- > import Control.Monad.Trans.Resource
|
|
Packit |
899799 |
-- > import Data.Conduit (Consumer, Sink, ($$))
|
|
Packit |
899799 |
-- > import Data.Text (Text, unpack)
|
|
Packit |
899799 |
-- > import Text.XML.Stream.Parse
|
|
Packit |
899799 |
-- > import Data.XML.Types (Event)
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > data Person = Person Int Text
|
|
Packit |
899799 |
-- > deriving Show
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > parsePerson :: MonadThrow m => Consumer Event m (Maybe Person)
|
|
Packit |
899799 |
-- > parsePerson = tag' "person" (requireAttr "age") $ \age -> do
|
|
Packit |
899799 |
-- > name <- content
|
|
Packit |
899799 |
-- > return $ Person (read $ unpack age) name
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > parsePeople :: MonadThrow m => Sink Event m (Maybe [Person])
|
|
Packit |
899799 |
-- > parsePeople = tagNoAttr "people" $ many parsePerson
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > main = do
|
|
Packit |
899799 |
-- > people <- runResourceT $
|
|
Packit |
899799 |
-- > parseFile def "people.xml" $$ force "people required" parsePeople
|
|
Packit |
899799 |
-- > print people
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- will produce:
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- > [Person 25 "Michael",Person 2 "Eliezer"]
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- This module also supports streaming results using 'yield'.
|
|
Packit |
899799 |
-- This allows parser results to be processed using conduits
|
|
Packit |
899799 |
-- while a particular parser (e.g. 'many') is still running.
|
|
Packit |
899799 |
-- Without using streaming results, you have to wait until the parser finished
|
|
Packit |
899799 |
-- before you can process the result list. Large XML files might be easier
|
|
Packit |
899799 |
-- to process by using streaming results.
|
|
Packit |
899799 |
-- See http://stackoverflow.com/q/21367423/2597135 for a related discussion.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- > {-# LANGUAGE OverloadedStrings #-}
|
|
Packit |
899799 |
-- > import Control.Monad (void)
|
|
Packit |
899799 |
-- > import Control.Monad.Trans.Class (lift)
|
|
Packit |
899799 |
-- > import Control.Monad.Trans.Resource
|
|
Packit |
899799 |
-- > import Data.Conduit
|
|
Packit |
899799 |
-- > import qualified Data.Conduit.List as CL
|
|
Packit |
899799 |
-- > import Data.Text (Text, unpack)
|
|
Packit |
899799 |
-- > import Data.XML.Types (Event)
|
|
Packit |
899799 |
-- > import Text.XML.Stream.Parse
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > data Person = Person Int Text deriving Show
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > parsePerson :: MonadThrow m => Consumer Event m (Maybe Person)
|
|
Packit |
899799 |
-- > parsePerson = tag' "person" (requireAttr "age") $ \age -> do
|
|
Packit |
899799 |
-- > name <- content
|
|
Packit |
899799 |
-- > return $ Person (read $ unpack age) name
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > parsePeople :: MonadThrow m => Conduit Event m Person
|
|
Packit |
899799 |
-- > parsePeople = void $ tagNoAttr "people" $ manyYield parsePerson
|
|
Packit |
899799 |
-- >
|
|
Packit |
899799 |
-- > main = runResourceT $
|
|
Packit |
899799 |
-- > parseFile def "people.xml" $$ parsePeople =$ CL.mapM_ (lift . print)
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Previous versions of this module contained a number of more sophisticated
|
|
Packit |
899799 |
-- functions written by Aristid Breitkreuz and Dmitry Olshansky. To keep this
|
|
Packit |
899799 |
-- package simpler, those functions are being moved to a separate package. This
|
|
Packit |
899799 |
-- note will be updated with the name of the package(s) when available.
|
|
Packit |
899799 |
module Text.XML.Stream.Parse
|
|
Packit |
899799 |
( -- * Parsing XML files
|
|
Packit |
899799 |
parseBytes
|
|
Packit |
899799 |
, parseBytesPos
|
|
Packit |
899799 |
, parseText'
|
|
Packit |
899799 |
, parseText
|
|
Packit |
899799 |
, parseTextPos
|
|
Packit |
899799 |
, detectUtf
|
|
Packit |
899799 |
, parseFile
|
|
Packit |
899799 |
, parseLBS
|
|
Packit |
899799 |
-- ** Parser settings
|
|
Packit |
899799 |
, ParseSettings
|
|
Packit |
899799 |
, def
|
|
Packit |
899799 |
, DecodeEntities
|
|
Packit |
899799 |
, psDecodeEntities
|
|
Packit |
899799 |
, psRetainNamespaces
|
|
Packit |
899799 |
-- *** Entity decoding
|
|
Packit |
899799 |
, decodeXmlEntities
|
|
Packit |
899799 |
, decodeHtmlEntities
|
|
Packit |
899799 |
-- * Event parsing
|
|
Packit |
899799 |
, tag
|
|
Packit |
899799 |
, tag'
|
|
Packit |
899799 |
, tagNoAttr
|
|
Packit |
899799 |
, tagIgnoreAttrs
|
|
Packit |
899799 |
, content
|
|
Packit |
899799 |
, contentMaybe
|
|
Packit |
899799 |
-- * Ignoring tags/trees
|
|
Packit |
899799 |
, ignoreTag
|
|
Packit |
899799 |
, ignoreEmptyTag
|
|
Packit |
899799 |
, ignoreTree
|
|
Packit |
899799 |
, ignoreTreeContent
|
|
Packit |
899799 |
, ignoreAnyTreeContent
|
|
Packit |
899799 |
, ignoreAllTreesContent
|
|
Packit |
899799 |
-- * Streaming events
|
|
Packit |
899799 |
, takeContent
|
|
Packit |
899799 |
, takeTree
|
|
Packit |
899799 |
, takeTreeContent
|
|
Packit |
899799 |
, takeAnyTreeContent
|
|
Packit |
899799 |
, takeAllTreesContent
|
|
Packit |
899799 |
-- * Tag name matching
|
|
Packit |
899799 |
, NameMatcher(..)
|
|
Packit |
899799 |
, matching
|
|
Packit |
899799 |
, anyOf
|
|
Packit |
899799 |
, anyName
|
|
Packit |
899799 |
-- * Attribute parsing
|
|
Packit |
899799 |
, AttrParser
|
|
Packit |
899799 |
, attr
|
|
Packit |
899799 |
, requireAttr
|
|
Packit |
899799 |
, optionalAttr
|
|
Packit |
899799 |
, requireAttrRaw
|
|
Packit |
899799 |
, optionalAttrRaw
|
|
Packit |
899799 |
, ignoreAttrs
|
|
Packit |
899799 |
-- * Combinators
|
|
Packit |
899799 |
, orE
|
|
Packit |
899799 |
, choose
|
|
Packit |
899799 |
, many
|
|
Packit |
899799 |
, many_
|
|
Packit |
899799 |
, manyIgnore
|
|
Packit |
899799 |
, many'
|
|
Packit |
899799 |
, force
|
|
Packit |
899799 |
-- * Streaming combinators
|
|
Packit |
899799 |
, manyYield
|
|
Packit |
899799 |
, manyYield'
|
|
Packit |
899799 |
, manyIgnoreYield
|
|
Packit |
899799 |
-- * Exceptions
|
|
Packit |
899799 |
, XmlException (..)
|
|
Packit |
899799 |
-- * Other types
|
|
Packit |
899799 |
, PositionRange
|
|
Packit |
899799 |
, EventPos
|
|
Packit |
899799 |
) where
|
|
Packit |
899799 |
import Control.Applicative (Alternative (empty, (<|>)),
|
|
Packit |
899799 |
Applicative (..), (<$>))
|
|
Packit |
899799 |
import qualified Control.Applicative as A
|
|
Packit |
899799 |
import Control.Arrow ((***))
|
|
Packit |
899799 |
import Control.Exception (Exception (..), SomeException)
|
|
Packit |
899799 |
import Control.Monad (ap, liftM, void)
|
|
Packit |
899799 |
import Control.Monad.Fix (fix)
|
|
Packit |
899799 |
import Control.Monad.Trans.Class (lift)
|
|
Packit |
899799 |
import Control.Monad.Trans.Maybe (MaybeT (..))
|
|
Packit |
899799 |
import Control.Monad.Trans.Resource (MonadResource, MonadThrow (..),
|
|
Packit |
899799 |
monadThrow)
|
|
Packit |
899799 |
import Data.Attoparsec.Text (Parser, anyChar, char, manyTill,
|
|
Packit |
899799 |
skipWhile, string, takeWhile,
|
|
Packit |
899799 |
takeWhile1, try)
|
|
Packit |
899799 |
import qualified Data.Attoparsec.Text as AT
|
|
Packit |
899799 |
import qualified Data.ByteString as S
|
|
Packit |
899799 |
import qualified Data.ByteString.Lazy as L
|
|
Packit |
899799 |
import Data.Char (isSpace)
|
|
Packit |
899799 |
import Data.Conduit
|
|
Packit |
899799 |
import Data.Conduit.Attoparsec (PositionRange, conduitParser)
|
|
Packit |
899799 |
import Data.Conduit.Binary (sourceFile)
|
|
Packit |
899799 |
import qualified Data.Conduit.List as CL
|
|
Packit |
899799 |
import qualified Data.Conduit.Text as CT
|
|
Packit |
899799 |
import Data.Default.Class (Default (..))
|
|
Packit |
899799 |
import Data.List (intercalate)
|
|
Packit |
899799 |
import Data.List (foldl')
|
|
Packit |
899799 |
import qualified Data.Map as Map
|
|
Packit |
899799 |
import Data.Maybe (fromMaybe, isNothing)
|
|
Packit |
899799 |
import Data.String (IsString (..))
|
|
Packit |
899799 |
import Data.Text (Text, pack)
|
|
Packit |
899799 |
import qualified Data.Text as T
|
|
Packit |
899799 |
import Data.Text.Encoding (decodeUtf8With)
|
|
Packit |
899799 |
import Data.Text.Encoding.Error (lenientDecode)
|
|
Packit |
899799 |
import Data.Typeable (Typeable)
|
|
Packit |
899799 |
import Data.XML.Types (Content (..), Event (..),
|
|
Packit |
899799 |
ExternalID (..),
|
|
Packit |
899799 |
Instruction (..), Name (..))
|
|
Packit |
899799 |
import Prelude hiding (takeWhile)
|
|
Packit |
899799 |
import Text.XML.Stream.Token
|
|
Packit |
899799 |
|
|
Packit |
899799 |
type Ents = [(Text, Text)]
|
|
Packit |
899799 |
|
|
Packit |
899799 |
tokenToEvent :: ParseSettings -> Ents -> [NSLevel] -> Token -> (Ents, [NSLevel], [Event])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenXMLDeclaration _) = (es, n, [])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenInstruction i) = (es, n, [EventInstruction i])
|
|
Packit |
899799 |
tokenToEvent ps es n (TokenBeginElement name as isClosed _) =
|
|
Packit |
899799 |
(es, n', if isClosed then [begin, end] else [begin])
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
l0 = case n of
|
|
Packit |
899799 |
[] -> NSLevel Nothing Map.empty
|
|
Packit |
899799 |
x:_ -> x
|
|
Packit |
899799 |
(as', l') = foldl' go (id, l0) as
|
|
Packit |
899799 |
go (front, l) (TName kpref kname, val) =
|
|
Packit |
899799 |
(addNS front, l'')
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
isPrefixed = kpref == Just "xmlns"
|
|
Packit |
899799 |
isUnprefixed = isNothing kpref && kname == "xmlns"
|
|
Packit |
899799 |
|
|
Packit |
899799 |
addNS
|
|
Packit |
899799 |
| not (psRetainNamespaces ps) && (isPrefixed || isUnprefixed) = id
|
|
Packit |
899799 |
| otherwise = (((tname, map resolve val):) .)
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
tname
|
|
Packit |
899799 |
| isPrefixed = TName Nothing ("xmlns:" `T.append` kname)
|
|
Packit |
899799 |
| otherwise = TName kpref kname
|
|
Packit |
899799 |
|
|
Packit |
899799 |
l''
|
|
Packit |
899799 |
| isPrefixed =
|
|
Packit |
899799 |
l { prefixes = Map.insert kname (contentsToText val)
|
|
Packit |
899799 |
$ prefixes l }
|
|
Packit |
899799 |
| isUnprefixed =
|
|
Packit |
899799 |
l { defaultNS = if T.null $ contentsToText val
|
|
Packit |
899799 |
then Nothing
|
|
Packit |
899799 |
else Just $ contentsToText val }
|
|
Packit |
899799 |
| otherwise = l
|
|
Packit |
899799 |
|
|
Packit |
899799 |
resolve (ContentEntity e)
|
|
Packit |
899799 |
| Just t <- lookup e es = ContentText t
|
|
Packit |
899799 |
resolve c = c
|
|
Packit |
899799 |
n' = if isClosed then n else l' : n
|
|
Packit |
899799 |
fixAttName (name', val) = (tnameToName True l' name', val)
|
|
Packit |
899799 |
elementName = tnameToName False l' name
|
|
Packit |
899799 |
begin = EventBeginElement elementName $ map fixAttName $ as' []
|
|
Packit |
899799 |
end = EventEndElement elementName
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenEndElement name) =
|
|
Packit |
899799 |
(es, n', [EventEndElement $ tnameToName False l name])
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
(l, n') =
|
|
Packit |
899799 |
case n of
|
|
Packit |
899799 |
[] -> (NSLevel Nothing Map.empty, [])
|
|
Packit |
899799 |
x:xs -> (x, xs)
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenContent (ContentEntity e))
|
|
Packit |
899799 |
| Just t <- lookup e es = (es, n, [EventContent $ ContentText t])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenContent c) = (es, n, [EventContent c])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenComment c) = (es, n, [EventComment c])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenDoctype t eid es') = (es ++ es', n, [EventBeginDoctype t eid, EventEndDoctype])
|
|
Packit |
899799 |
tokenToEvent _ es n (TokenCDATA t) = (es, n, [EventCDATA t])
|
|
Packit |
899799 |
|
|
Packit |
899799 |
tnameToName :: Bool -> NSLevel -> TName -> Name
|
|
Packit |
899799 |
tnameToName _ _ (TName (Just "xml") name) =
|
|
Packit |
899799 |
Name name (Just "http://www.w3.org/XML/1998/namespace") (Just "xml")
|
|
Packit |
899799 |
tnameToName isAttr (NSLevel def' _) (TName Nothing name) =
|
|
Packit |
899799 |
Name name (if isAttr then Nothing else def') Nothing
|
|
Packit |
899799 |
tnameToName _ (NSLevel _ m) (TName (Just pref) name) =
|
|
Packit |
899799 |
case Map.lookup pref m of
|
|
Packit |
899799 |
Just ns -> Name name (Just ns) (Just pref)
|
|
Packit |
899799 |
Nothing -> Name name Nothing (Just pref) -- FIXME is this correct?
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Automatically determine which UTF variant is being used. This function
|
|
Packit |
899799 |
-- first checks for BOMs, removing them as necessary, and then check for the
|
|
Packit |
899799 |
-- equivalent of
|
|
Packit |
899799 |
-- defaults to assuming UTF-8.
|
|
Packit |
899799 |
detectUtf :: MonadThrow m => Conduit S.ByteString m T.Text
|
|
Packit |
899799 |
detectUtf =
|
|
Packit |
899799 |
conduit id
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
conduit front = await >>= maybe (return ()) (push front)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
push front bss =
|
|
Packit |
899799 |
either conduit
|
|
Packit |
899799 |
(uncurry checkXMLDecl)
|
|
Packit |
899799 |
(getEncoding front bss)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
getEncoding front bs'
|
|
Packit |
899799 |
| S.length bs < 4 =
|
|
Packit |
899799 |
Left (bs `S.append`)
|
|
Packit |
899799 |
| otherwise =
|
|
Packit |
899799 |
Right (bsOut, mcodec)
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
bs = front bs'
|
|
Packit |
899799 |
bsOut = S.append (S.drop toDrop x) y
|
|
Packit |
899799 |
(x, y) = S.splitAt 4 bs
|
|
Packit |
899799 |
(toDrop, mcodec) =
|
|
Packit |
899799 |
case S.unpack x of
|
|
Packit |
899799 |
[0x00, 0x00, 0xFE, 0xFF] -> (4, Just CT.utf32_be)
|
|
Packit |
899799 |
[0xFF, 0xFE, 0x00, 0x00] -> (4, Just CT.utf32_le)
|
|
Packit |
899799 |
0xFE : 0xFF: _ -> (2, Just CT.utf16_be)
|
|
Packit |
899799 |
0xFF : 0xFE: _ -> (2, Just CT.utf16_le)
|
|
Packit |
899799 |
0xEF : 0xBB: 0xBF : _ -> (3, Just CT.utf8)
|
|
Packit |
899799 |
[0x00, 0x00, 0x00, 0x3C] -> (0, Just CT.utf32_be)
|
|
Packit |
899799 |
[0x3C, 0x00, 0x00, 0x00] -> (0, Just CT.utf32_le)
|
|
Packit |
899799 |
[0x00, 0x3C, 0x00, 0x3F] -> (0, Just CT.utf16_be)
|
|
Packit |
899799 |
[0x3C, 0x00, 0x3F, 0x00] -> (0, Just CT.utf16_le)
|
|
Packit |
899799 |
_ -> (0, Nothing) -- Assuming UTF-8
|
|
Packit |
899799 |
|
|
Packit |
899799 |
checkXMLDecl :: MonadThrow m
|
|
Packit |
899799 |
=> S.ByteString
|
|
Packit |
899799 |
-> Maybe CT.Codec
|
|
Packit |
899799 |
-> Conduit S.ByteString m T.Text
|
|
Packit |
899799 |
checkXMLDecl bs (Just codec) = leftover bs >> CT.decode codec
|
|
Packit |
899799 |
checkXMLDecl bs0 Nothing =
|
|
Packit |
899799 |
loop [] (AT.parse (parseToken decodeXmlEntities)) bs0
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
loop chunks0 parser nextChunk =
|
|
Packit |
899799 |
case parser $ decodeUtf8With lenientDecode nextChunk of
|
|
Packit |
899799 |
AT.Fail{} -> fallback
|
|
Packit |
899799 |
AT.Partial f -> await >>= maybe fallback (loop chunks f)
|
|
Packit |
899799 |
AT.Done _ (TokenXMLDeclaration attrs) -> findEncoding attrs
|
|
Packit |
899799 |
AT.Done{} -> fallback
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
chunks = nextChunk : chunks0
|
|
Packit |
899799 |
fallback = complete CT.utf8
|
|
Packit |
899799 |
complete codec = mapM_ leftover chunks >> CT.decode codec
|
|
Packit |
899799 |
|
|
Packit |
899799 |
findEncoding [] = fallback
|
|
Packit |
899799 |
findEncoding ((TName _ "encoding", [ContentText enc]):_) =
|
|
Packit |
899799 |
case T.toLower enc of
|
|
Packit |
899799 |
"iso-8859-1" -> complete CT.iso8859_1
|
|
Packit |
899799 |
"utf-8" -> complete CT.utf8
|
|
Packit |
899799 |
_ -> complete CT.utf8
|
|
Packit |
899799 |
findEncoding (_:xs) = findEncoding xs
|
|
Packit |
899799 |
|
|
Packit |
899799 |
type EventPos = (Maybe PositionRange, Event)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Parses a byte stream into 'Event's. This function is implemented fully in
|
|
Packit |
899799 |
-- Haskell using attoparsec-text for parsing. The produced error messages do
|
|
Packit |
899799 |
-- not give line/column information, so you may prefer to stick with the parser
|
|
Packit |
899799 |
-- provided by libxml-enumerator. However, this has the advantage of not
|
|
Packit |
899799 |
-- relying on any C libraries.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- This relies on 'detectUtf' to determine character encoding, and 'parseText''
|
|
Packit |
899799 |
-- to do the actual parsing.
|
|
Packit |
899799 |
parseBytes :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> Conduit S.ByteString m Event
|
|
Packit |
899799 |
parseBytes = mapOutput snd . parseBytesPos
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseBytesPos :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> Conduit S.ByteString m EventPos
|
|
Packit |
899799 |
parseBytesPos ps = detectUtf =$= parseTextPos ps
|
|
Packit |
899799 |
|
|
Packit |
899799 |
dropBOM :: Monad m => Conduit T.Text m T.Text
|
|
Packit |
899799 |
dropBOM =
|
|
Packit |
899799 |
await >>= maybe (return ()) push
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
push t =
|
|
Packit |
899799 |
case T.uncons t of
|
|
Packit |
899799 |
Nothing -> dropBOM
|
|
Packit |
899799 |
Just (c, cs) ->
|
|
Packit |
899799 |
let output
|
|
Packit |
899799 |
| c == '\xfeef' = cs
|
|
Packit |
899799 |
| otherwise = t
|
|
Packit |
899799 |
in yield output >> idConduit
|
|
Packit |
899799 |
idConduit = await >>= maybe (return ()) (\x -> yield x >> idConduit)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Parses a character stream into 'Event's. This function is implemented
|
|
Packit |
899799 |
-- fully in Haskell using attoparsec-text for parsing. The produced error
|
|
Packit |
899799 |
-- messages do not give line/column information, so you may prefer to stick
|
|
Packit |
899799 |
-- with the parser provided by libxml-enumerator. However, this has the
|
|
Packit |
899799 |
-- advantage of not relying on any C libraries.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.2.4
|
|
Packit |
899799 |
parseText' :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> Conduit T.Text m Event
|
|
Packit |
899799 |
parseText' = mapOutput snd . parseTextPos
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED parseText "Please use 'parseText'' or 'parseTextPos'." #-}
|
|
Packit |
899799 |
parseText :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> Conduit T.Text m EventPos
|
|
Packit |
899799 |
parseText = parseTextPos
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Same as 'parseText'', but includes the position of each event.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.2.4
|
|
Packit |
899799 |
parseTextPos :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> Conduit T.Text m EventPos
|
|
Packit |
899799 |
parseTextPos de =
|
|
Packit |
899799 |
dropBOM
|
|
Packit |
899799 |
=$= tokenize
|
|
Packit |
899799 |
=$= toEventC de
|
|
Packit |
899799 |
=$= addBeginEnd
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
tokenize = conduitToken de
|
|
Packit |
899799 |
addBeginEnd = yield (Nothing, EventBeginDocument) >> addEnd
|
|
Packit |
899799 |
addEnd = await >>= maybe
|
|
Packit |
899799 |
(yield (Nothing, EventEndDocument))
|
|
Packit |
899799 |
(\e -> yield e >> addEnd)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
toEventC :: Monad m => ParseSettings -> Conduit (PositionRange, Token) m EventPos
|
|
Packit |
899799 |
toEventC ps =
|
|
Packit |
899799 |
go [] []
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
go !es !levels =
|
|
Packit |
899799 |
await >>= maybe (return ()) push
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
push (position, token) =
|
|
Packit |
899799 |
mapM_ (yield . (,) (Just position)) events >> go es' levels'
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
(es', levels', events) = tokenToEvent ps es levels token
|
|
Packit |
899799 |
|
|
Packit |
899799 |
data ParseSettings = ParseSettings
|
|
Packit |
899799 |
{ psDecodeEntities :: DecodeEntities
|
|
Packit |
899799 |
, psRetainNamespaces :: Bool
|
|
Packit |
899799 |
-- ^ Whether the original xmlns attributes should be retained in the parsed
|
|
Packit |
899799 |
-- values. For more information on motivation, see:
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- <https://github.com/snoyberg/xml/issues/38>
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Default: False
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.2.1
|
|
Packit |
899799 |
}
|
|
Packit |
899799 |
|
|
Packit |
899799 |
instance Default ParseSettings where
|
|
Packit |
899799 |
def = ParseSettings
|
|
Packit |
899799 |
{ psDecodeEntities = decodeXmlEntities
|
|
Packit |
899799 |
, psRetainNamespaces = False
|
|
Packit |
899799 |
}
|
|
Packit |
899799 |
|
|
Packit |
899799 |
conduitToken :: MonadThrow m => ParseSettings -> Conduit T.Text m (PositionRange, Token)
|
|
Packit |
899799 |
conduitToken = conduitParser . parseToken . psDecodeEntities
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseToken :: DecodeEntities -> Parser Token
|
|
Packit |
899799 |
parseToken de = (char '<' >> parseLt) <|> TokenContent <$> parseContent de False False
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
parseLt =
|
|
Packit |
899799 |
(char '?' >> parseInstr) <|>
|
|
Packit |
899799 |
(char '!' >> (parseComment <|> parseCdata <|> parseDoctype)) <|>
|
|
Packit |
899799 |
parseBegin <|>
|
|
Packit |
899799 |
(char '/' >> parseEnd)
|
|
Packit |
899799 |
parseInstr = do
|
|
Packit |
899799 |
name <- parseIdent
|
|
Packit |
899799 |
if name == "xml"
|
|
Packit |
899799 |
then do
|
|
Packit |
899799 |
as <- A.many $ parseAttribute de
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
char' '?'
|
|
Packit |
899799 |
char' '>'
|
|
Packit |
899799 |
newline <|> return ()
|
|
Packit |
899799 |
return $ TokenXMLDeclaration as
|
|
Packit |
899799 |
else do
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
x <- T.pack <$> manyTill anyChar (try $ string "?>")
|
|
Packit |
899799 |
return $ TokenInstruction $ Instruction name x
|
|
Packit |
899799 |
parseComment = do
|
|
Packit |
899799 |
char' '-'
|
|
Packit |
899799 |
char' '-'
|
|
Packit |
899799 |
c <- T.pack <$> manyTill anyChar (string "-->") -- FIXME use takeWhile instead
|
|
Packit |
899799 |
return $ TokenComment c
|
|
Packit |
899799 |
parseCdata = do
|
|
Packit |
899799 |
_ <- string "[CDATA["
|
|
Packit |
899799 |
t <- T.pack <$> manyTill anyChar (string "]]>") -- FIXME use takeWhile instead
|
|
Packit |
899799 |
return $ TokenCDATA t
|
|
Packit |
899799 |
parseDoctype = do
|
|
Packit |
899799 |
_ <- string "DOCTYPE"
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
name <- parseName
|
|
Packit |
899799 |
let i =
|
|
Packit |
899799 |
case name of
|
|
Packit |
899799 |
TName Nothing x -> x
|
|
Packit |
899799 |
TName (Just x) y -> T.concat [x, ":", y]
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
eid <- fmap Just parsePublicID <|>
|
|
Packit |
899799 |
fmap Just parseSystemID <|>
|
|
Packit |
899799 |
return Nothing
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
ents <- (do
|
|
Packit |
899799 |
char' '['
|
|
Packit |
899799 |
ents <- parseEntities id
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
return ents) <|> return []
|
|
Packit |
899799 |
char' '>'
|
|
Packit |
899799 |
newline <|> return ()
|
|
Packit |
899799 |
return $ TokenDoctype i eid ents
|
|
Packit |
899799 |
parseEntities front =
|
|
Packit |
899799 |
(char ']' >> return (front [])) <|>
|
|
Packit |
899799 |
(parseEntity >>= \e -> parseEntities (front . (e:))) <|>
|
|
Packit |
899799 |
(char '<' >> parseEntities front) <|>
|
|
Packit |
899799 |
(skipWhile (\t -> t /= ']' && t /= '<') >> parseEntities front)
|
|
Packit |
899799 |
parseEntity = try $ do
|
|
Packit |
899799 |
_ <- string "
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
i <- parseIdent
|
|
Packit |
899799 |
t <- quotedText
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
char' '>'
|
|
Packit |
899799 |
return (i, t)
|
|
Packit |
899799 |
parsePublicID = do
|
|
Packit |
899799 |
_ <- string "PUBLIC"
|
|
Packit |
899799 |
x <- quotedText
|
|
Packit |
899799 |
y <- quotedText
|
|
Packit |
899799 |
return $ PublicID x y
|
|
Packit |
899799 |
parseSystemID = do
|
|
Packit |
899799 |
_ <- string "SYSTEM"
|
|
Packit |
899799 |
x <- quotedText
|
|
Packit |
899799 |
return $ SystemID x
|
|
Packit |
899799 |
quotedText = do
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
between '"' <|> between '\''
|
|
Packit |
899799 |
between c = do
|
|
Packit |
899799 |
char' c
|
|
Packit |
899799 |
x <- takeWhile (/=c)
|
|
Packit |
899799 |
char' c
|
|
Packit |
899799 |
return x
|
|
Packit |
899799 |
parseEnd = do
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
n <- parseName
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
char' '>'
|
|
Packit |
899799 |
return $ TokenEndElement n
|
|
Packit |
899799 |
parseBegin = do
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
n <- parseName
|
|
Packit |
899799 |
as <- A.many $ parseAttribute de
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
isClose <- (char '/' >> skipSpace >> return True) <|> return False
|
|
Packit |
899799 |
char' '>'
|
|
Packit |
899799 |
return $ TokenBeginElement n as isClose 0
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseAttribute :: DecodeEntities -> Parser TAttribute
|
|
Packit |
899799 |
parseAttribute de = do
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
key <- parseName
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
char' '='
|
|
Packit |
899799 |
skipSpace
|
|
Packit |
899799 |
val <- squoted <|> dquoted
|
|
Packit |
899799 |
return (key, val)
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
squoted = char '\'' *> manyTill (parseContent de False True) (char '\'')
|
|
Packit |
899799 |
dquoted = char '"' *> manyTill (parseContent de True False) (char '"')
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseName :: Parser TName
|
|
Packit |
899799 |
parseName =
|
|
Packit |
899799 |
name <$> parseIdent <*> A.optional (char ':' >> parseIdent)
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
name i1 Nothing = TName Nothing i1
|
|
Packit |
899799 |
name i1 (Just i2) = TName (Just i1) i2
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseIdent :: Parser Text
|
|
Packit |
899799 |
parseIdent =
|
|
Packit |
899799 |
takeWhile1 valid
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
valid '&' = False
|
|
Packit |
899799 |
valid '<' = False
|
|
Packit |
899799 |
valid '>' = False
|
|
Packit |
899799 |
valid ':' = False
|
|
Packit |
899799 |
valid '?' = False
|
|
Packit |
899799 |
valid '=' = False
|
|
Packit |
899799 |
valid '"' = False
|
|
Packit |
899799 |
valid '\'' = False
|
|
Packit |
899799 |
valid '/' = False
|
|
Packit |
899799 |
valid ';' = False
|
|
Packit |
899799 |
valid '#' = False
|
|
Packit |
899799 |
valid c = not $ isXMLSpace c
|
|
Packit |
899799 |
|
|
Packit |
899799 |
parseContent :: DecodeEntities
|
|
Packit |
899799 |
-> Bool -- break on double quote
|
|
Packit |
899799 |
-> Bool -- break on single quote
|
|
Packit |
899799 |
-> Parser Content
|
|
Packit |
899799 |
parseContent de breakDouble breakSingle = parseReference <|> parseTextContent where
|
|
Packit |
899799 |
parseReference = do
|
|
Packit |
899799 |
char' '&'
|
|
Packit |
899799 |
t <- parseEntityRef <|> parseHexCharRef <|> parseDecCharRef
|
|
Packit |
899799 |
char' ';'
|
|
Packit |
899799 |
return t
|
|
Packit |
899799 |
parseEntityRef = do
|
|
Packit |
899799 |
TName ma b <- parseName
|
|
Packit |
899799 |
let name = maybe "" (`T.append` ":") ma `T.append` b
|
|
Packit |
899799 |
return $ case name of
|
|
Packit |
899799 |
"lt" -> ContentText "<"
|
|
Packit |
899799 |
"gt" -> ContentText ">"
|
|
Packit |
899799 |
"amp" -> ContentText "&"
|
|
Packit |
899799 |
"quot" -> ContentText "\""
|
|
Packit |
899799 |
"apos" -> ContentText "'"
|
|
Packit |
899799 |
_ -> de name
|
|
Packit |
899799 |
parseHexCharRef = do
|
|
Packit |
899799 |
void $ string "#x"
|
|
Packit |
899799 |
n <- AT.hexadecimal
|
|
Packit |
899799 |
case toValidXmlChar n of
|
|
Packit |
899799 |
Nothing -> fail "Invalid character from hexadecimal character reference."
|
|
Packit |
899799 |
Just c -> return $ ContentText $ T.singleton c
|
|
Packit |
899799 |
parseDecCharRef = do
|
|
Packit |
899799 |
void $ string "#"
|
|
Packit |
899799 |
n <- AT.decimal
|
|
Packit |
899799 |
case toValidXmlChar n of
|
|
Packit |
899799 |
Nothing -> fail "Invalid character from decimal character reference."
|
|
Packit |
899799 |
Just c -> return $ ContentText $ T.singleton c
|
|
Packit |
899799 |
parseTextContent = ContentText <$> takeWhile1 valid
|
|
Packit |
899799 |
valid '"' = not breakDouble
|
|
Packit |
899799 |
valid '\'' = not breakSingle
|
|
Packit |
899799 |
valid '&' = False -- amp
|
|
Packit |
899799 |
valid '<' = False -- lt
|
|
Packit |
899799 |
valid _ = True
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Is this codepoint a valid XML character? See
|
|
Packit |
899799 |
-- <https://www.w3.org/TR/xml/#charsets>. This is proudly XML 1.0 only.
|
|
Packit |
899799 |
toValidXmlChar :: Int -> Maybe Char
|
|
Packit |
899799 |
toValidXmlChar n
|
|
Packit |
899799 |
| any checkRange ranges = Just (toEnum n)
|
|
Packit |
899799 |
| otherwise = Nothing
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
--Inclusive lower bound, inclusive upper bound.
|
|
Packit |
899799 |
ranges :: [(Int, Int)]
|
|
Packit |
899799 |
ranges =
|
|
Packit |
899799 |
[ (0x9, 0xA)
|
|
Packit |
899799 |
, (0xD, 0xD)
|
|
Packit |
899799 |
, (0x20, 0xD7FF)
|
|
Packit |
899799 |
, (0xE000, 0xFFFD)
|
|
Packit |
899799 |
, (0x10000, 0x10FFFF)
|
|
Packit |
899799 |
]
|
|
Packit |
899799 |
checkRange (lb, ub) = lb <= n && n <= ub
|
|
Packit |
899799 |
|
|
Packit |
899799 |
skipSpace :: Parser ()
|
|
Packit |
899799 |
skipSpace = skipWhile isXMLSpace
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Determines whether a character is an XML white space. The list of
|
|
Packit |
899799 |
-- white spaces is given by
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- > S ::= (#x20 | #x9 | #xD | #xA)+
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- in <http://www.w3.org/TR/2008/REC-xml-20081126/#sec-common-syn>.
|
|
Packit |
899799 |
isXMLSpace :: Char -> Bool
|
|
Packit |
899799 |
isXMLSpace ' ' = True
|
|
Packit |
899799 |
isXMLSpace '\t' = True
|
|
Packit |
899799 |
isXMLSpace '\r' = True
|
|
Packit |
899799 |
isXMLSpace '\n' = True
|
|
Packit |
899799 |
isXMLSpace _ = False
|
|
Packit |
899799 |
|
|
Packit |
899799 |
newline :: Parser ()
|
|
Packit |
899799 |
newline = void $ (char '\r' >> char '\n') <|> char '\n'
|
|
Packit |
899799 |
|
|
Packit |
899799 |
char' :: Char -> Parser ()
|
|
Packit |
899799 |
char' = void . char
|
|
Packit |
899799 |
|
|
Packit |
899799 |
data ContentType = Ignore | IsContent Text | IsError String | NotContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Grabs the next piece of content if available. This function skips over any
|
|
Packit |
899799 |
-- comments and instructions and concatenates all content until the next start
|
|
Packit |
899799 |
-- or end tag.
|
|
Packit |
899799 |
contentMaybe :: MonadThrow m => Consumer Event m (Maybe Text)
|
|
Packit |
899799 |
contentMaybe = do
|
|
Packit |
899799 |
x <- CL.peek
|
|
Packit |
899799 |
case pc' x of
|
|
Packit |
899799 |
Ignore -> CL.drop 1 >> contentMaybe
|
|
Packit |
899799 |
IsContent t -> CL.drop 1 >> fmap Just (takeContents (t:))
|
|
Packit |
899799 |
IsError e -> lift $ monadThrow $ InvalidEntity e x
|
|
Packit |
899799 |
NotContent -> return Nothing
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
pc' Nothing = NotContent
|
|
Packit |
899799 |
pc' (Just x) = pc x
|
|
Packit |
899799 |
pc (EventContent (ContentText t)) = IsContent t
|
|
Packit |
899799 |
pc (EventContent (ContentEntity e)) = IsError $ "Unknown entity: " ++ show e
|
|
Packit |
899799 |
pc (EventCDATA t) = IsContent t
|
|
Packit |
899799 |
pc EventBeginElement{} = NotContent
|
|
Packit |
899799 |
pc EventEndElement{} = NotContent
|
|
Packit |
899799 |
pc EventBeginDocument{} = Ignore
|
|
Packit |
899799 |
pc EventEndDocument = Ignore
|
|
Packit |
899799 |
pc EventBeginDoctype{} = Ignore
|
|
Packit |
899799 |
pc EventEndDoctype = Ignore
|
|
Packit |
899799 |
pc EventInstruction{} = Ignore
|
|
Packit |
899799 |
pc EventComment{} = Ignore
|
|
Packit |
899799 |
takeContents front = do
|
|
Packit |
899799 |
x <- CL.peek
|
|
Packit |
899799 |
case pc' x of
|
|
Packit |
899799 |
Ignore -> CL.drop 1 >> takeContents front
|
|
Packit |
899799 |
IsContent t -> CL.drop 1 >> takeContents (front . (:) t)
|
|
Packit |
899799 |
IsError e -> lift $ monadThrow $ InvalidEntity e x
|
|
Packit |
899799 |
NotContent -> return $ T.concat $ front []
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Grabs the next piece of content. If none if available, returns 'T.empty'.
|
|
Packit |
899799 |
-- This is simply a wrapper around 'contentMaybe'.
|
|
Packit |
899799 |
content :: MonadThrow m => Consumer Event m Text
|
|
Packit |
899799 |
content = fromMaybe T.empty <$> contentMaybe
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
isWhitespace :: Event -> Bool
|
|
Packit |
899799 |
isWhitespace EventBeginDocument = True
|
|
Packit |
899799 |
isWhitespace EventEndDocument = True
|
|
Packit |
899799 |
isWhitespace EventBeginDoctype{} = True
|
|
Packit |
899799 |
isWhitespace EventEndDoctype = True
|
|
Packit |
899799 |
isWhitespace EventInstruction{} = True
|
|
Packit |
899799 |
isWhitespace (EventContent (ContentText t)) = T.all isSpace t
|
|
Packit |
899799 |
isWhitespace EventComment{} = True
|
|
Packit |
899799 |
isWhitespace (EventCDATA t) = T.all isSpace t
|
|
Packit |
899799 |
isWhitespace _ = False
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | The most generic way to parse a tag. It takes a 'NameMatcher' to check whether
|
|
Packit |
899799 |
-- this is a correct tag name, an 'AttrParser' to handle attributes, and
|
|
Packit |
899799 |
-- then a parser to deal with content.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- 'Events' are consumed if and only if the tag name and its attributes match.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- This function automatically absorbs its balancing closing tag, and will
|
|
Packit |
899799 |
-- throw an exception if not all of the attributes or child elements are
|
|
Packit |
899799 |
-- consumed. If you want to allow extra attributes, see 'ignoreAttrs'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- This function automatically ignores comments, instructions and whitespace.
|
|
Packit |
899799 |
tag :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-- and return a value that can be used to get an @AttrParser@.
|
|
Packit |
899799 |
-- If this fails, the function will return @Nothing@
|
|
Packit |
899799 |
-> (a -> AttrParser b) -- ^ Given the value returned by the name checker, this function will
|
|
Packit |
899799 |
-- be used to get an @AttrParser@ appropriate for the specific tag.
|
|
Packit |
899799 |
-- If the @AttrParser@ fails, the function will also return @Nothing@
|
|
Packit |
899799 |
-> (b -> ConduitM Event o m c) -- ^ Handler function to handle the attributes and children
|
|
Packit |
899799 |
-- of a tag, given the value return from the @AttrParser@
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe c)
|
|
Packit |
899799 |
tag nameMatcher attrParser f = do
|
|
Packit |
899799 |
(x, leftovers) <- dropWS []
|
|
Packit |
899799 |
res <- case x of
|
|
Packit |
899799 |
Just (EventBeginElement name as) -> case runNameMatcher nameMatcher name of
|
|
Packit |
899799 |
Just y -> case runAttrParser' (attrParser y) as of
|
|
Packit |
899799 |
Left _ -> return Nothing
|
|
Packit |
899799 |
Right z -> do
|
|
Packit |
899799 |
z' <- f z
|
|
Packit |
899799 |
(a, _leftovers') <- dropWS []
|
|
Packit |
899799 |
case a of
|
|
Packit |
899799 |
Just (EventEndElement name')
|
|
Packit |
899799 |
| name == name' -> return (Just z')
|
|
Packit |
899799 |
_ -> lift $ monadThrow $ InvalidEndElement name a
|
|
Packit |
899799 |
Nothing -> return Nothing
|
|
Packit |
899799 |
_ -> return Nothing
|
|
Packit |
899799 |
|
|
Packit |
899799 |
case res of
|
|
Packit |
899799 |
-- Did not parse, put back all of the leading whitespace events and the
|
|
Packit |
899799 |
-- final observed event generated by dropWS
|
|
Packit |
899799 |
Nothing -> mapM_ leftover leftovers
|
|
Packit |
899799 |
-- Parse succeeded, discard all of those whitespace events and the
|
|
Packit |
899799 |
-- first parsed event
|
|
Packit |
899799 |
Just _ -> return ()
|
|
Packit |
899799 |
|
|
Packit |
899799 |
return res
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
-- Drop Events until we encounter a non-whitespace element. Return all of
|
|
Packit |
899799 |
-- the events consumed here (including the first non-whitespace event) so
|
|
Packit |
899799 |
-- that the calling function can treat them as leftovers if the parse fails
|
|
Packit |
899799 |
dropWS leftovers = do
|
|
Packit |
899799 |
x <- await
|
|
Packit |
899799 |
let leftovers' = maybe id (:) x leftovers
|
|
Packit |
899799 |
|
|
Packit |
899799 |
case isWhitespace <$> x of
|
|
Packit |
899799 |
Just True -> dropWS leftovers'
|
|
Packit |
899799 |
_ -> return (x, leftovers')
|
|
Packit |
899799 |
runAttrParser' p as =
|
|
Packit |
899799 |
case runAttrParser p as of
|
|
Packit |
899799 |
Left e -> Left e
|
|
Packit |
899799 |
Right ([], x) -> Right x
|
|
Packit |
899799 |
Right (attr, _) -> Left $ toException $ UnparsedAttributes attr
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A simplified version of 'tag' where the 'NameMatcher' result isn't forwarded to the attributes parser.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
tag' :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -> AttrParser b -> (b -> ConduitM Event o m c)
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe c)
|
|
Packit |
899799 |
tag' a b = tag a (const b)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A further simplified tag parser, which requires that no attributes exist.
|
|
Packit |
899799 |
tagNoAttr :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m b -- ^ Handler function to handle the children of the matched tag
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe b)
|
|
Packit |
899799 |
tagNoAttr name f = tag' name (return ()) $ const f
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A further simplified tag parser, which ignores all attributes, if any exist
|
|
Packit |
899799 |
tagIgnoreAttrs :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m b -- ^ Handler function to handle the children of the matched tag
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe b)
|
|
Packit |
899799 |
tagIgnoreAttrs name f = tag' name ignoreAttrs $ const f
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Ignore an empty tag and all of its attributes.
|
|
Packit |
899799 |
-- This does not ignore the tag recursively
|
|
Packit |
899799 |
-- (i.e. it assumes there are no child elements).
|
|
Packit |
899799 |
-- This function returns @Just ()@ if the tag matched.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
ignoreEmptyTag :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreEmptyTag nameMatcher = tagIgnoreAttrs nameMatcher (return ())
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED ignoreTag "Please use 'ignoreEmptyTag'." #-}
|
|
Packit |
899799 |
ignoreTag :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreTag = ignoreEmptyTag
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Ignore a tag, its attributes and its children subtrees recursively.
|
|
Packit |
899799 |
-- Both content and text events are ignored.
|
|
Packit |
899799 |
-- This function returns @Just ()@ if the tag matched.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
ignoreTreeContent :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreTreeContent namePred = tagIgnoreAttrs namePred (void $ many ignoreAnyTreeContent)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED ignoreTree "Please use 'ignoreTreeContent'." #-}
|
|
Packit |
899799 |
ignoreTree :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a -- ^ Check if this is a correct tag name
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreTree = ignoreTreeContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'ignoreTreeContent', but matches any name and also ignores content events.
|
|
Packit |
899799 |
ignoreAnyTreeContent :: MonadThrow m => ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreAnyTreeContent = (void <$> contentMaybe) `orE` ignoreTreeContent anyName
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED ignoreAllTreesContent "Please use 'ignoreAnyTreeContent'." #-}
|
|
Packit |
899799 |
ignoreAllTreesContent :: MonadThrow m => ConduitM Event o m (Maybe ())
|
|
Packit |
899799 |
ignoreAllTreesContent = ignoreAnyTreeContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Get the value of the first parser which returns 'Just'. If no parsers
|
|
Packit |
899799 |
-- succeed (i.e., return @Just@), this function returns 'Nothing'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- > orE a b = choose [a, b]
|
|
Packit |
899799 |
orE :: Monad m
|
|
Packit |
899799 |
=> Consumer Event m (Maybe a) -- ^ The first (preferred) parser
|
|
Packit |
899799 |
-> Consumer Event m (Maybe a) -- ^ The second parser, only executed if the first parser fails
|
|
Packit |
899799 |
-> Consumer Event m (Maybe a)
|
|
Packit |
899799 |
orE a b = a >>= \x -> maybe b (const $ return x) x
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Get the value of the first parser which returns 'Just'. If no parsers
|
|
Packit |
899799 |
-- succeed (i.e., return 'Just'), this function returns 'Nothing'.
|
|
Packit |
899799 |
choose :: Monad m
|
|
Packit |
899799 |
=> [ConduitM Event o m (Maybe a)] -- ^ List of parsers that will be tried in order.
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe a) -- ^ Result of the first parser to succeed, or @Nothing@
|
|
Packit |
899799 |
-- if no parser succeeded
|
|
Packit |
899799 |
choose [] = return Nothing
|
|
Packit |
899799 |
choose (i:is) = i >>= maybe (choose is) (return . Just)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Force an optional parser into a required parser. All of the 'tag'
|
|
Packit |
899799 |
-- functions, 'attr', 'choose' and 'many' deal with 'Maybe' parsers. Use this when you
|
|
Packit |
899799 |
-- want to finally force something to happen.
|
|
Packit |
899799 |
force :: MonadThrow m
|
|
Packit |
899799 |
=> String -- ^ Error message
|
|
Packit |
899799 |
-> m (Maybe a) -- ^ Optional parser to be forced
|
|
Packit |
899799 |
-> m a
|
|
Packit |
899799 |
force msg i = i >>= maybe (throwM $ XmlException msg Nothing) return
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A helper function which reads a file from disk using 'enumFile', detects
|
|
Packit |
899799 |
-- character encoding using 'detectUtf', parses the XML using 'parseBytes', and
|
|
Packit |
899799 |
-- then hands off control to your supplied parser.
|
|
Packit |
899799 |
parseFile :: MonadResource m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> FilePath
|
|
Packit |
899799 |
-> Producer m Event
|
|
Packit |
899799 |
parseFile ps fp = sourceFile fp =$= parseBytes ps
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Parse an event stream from a lazy 'L.ByteString'.
|
|
Packit |
899799 |
parseLBS :: MonadThrow m
|
|
Packit |
899799 |
=> ParseSettings
|
|
Packit |
899799 |
-> L.ByteString
|
|
Packit |
899799 |
-> Producer m Event
|
|
Packit |
899799 |
parseLBS ps lbs = CL.sourceList (L.toChunks lbs) =$= parseBytes ps
|
|
Packit |
899799 |
|
|
Packit |
899799 |
data XmlException = XmlException
|
|
Packit |
899799 |
{ xmlErrorMessage :: String
|
|
Packit |
899799 |
, xmlBadInput :: Maybe Event
|
|
Packit |
899799 |
}
|
|
Packit |
899799 |
| InvalidEndElement Name (Maybe Event)
|
|
Packit |
899799 |
| InvalidEntity String (Maybe Event)
|
|
Packit |
899799 |
| MissingAttribute String
|
|
Packit |
899799 |
| UnparsedAttributes [(Name, [Content])]
|
|
Packit |
899799 |
deriving (Show, Typeable)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
instance Exception XmlException where
|
|
Packit |
899799 |
#if MIN_VERSION_base(4, 8, 0)
|
|
Packit |
899799 |
displayException (XmlException msg (Just event)) = "Error while parsing XML event " ++ show event ++ ": " ++ msg
|
|
Packit |
899799 |
displayException (XmlException msg _) = "Error while parsing XML: " ++ msg
|
|
Packit |
899799 |
displayException (InvalidEndElement name (Just event)) = "Error while parsing XML event: expected </" ++ T.unpack (nameLocalName name) ++ ">, got " ++ show event
|
|
Packit |
899799 |
displayException (InvalidEndElement name _) = "Error while parsing XML event: expected </" ++ show name ++ ">, got nothing"
|
|
Packit |
899799 |
displayException (InvalidEntity msg (Just event)) = "Error while parsing XML entity " ++ show event ++ ": " ++ msg
|
|
Packit |
899799 |
displayException (InvalidEntity msg _) = "Error while parsing XML entity: " ++ msg
|
|
Packit |
899799 |
displayException (MissingAttribute msg) = "Missing required attribute: " ++ msg
|
|
Packit |
899799 |
displayException (UnparsedAttributes attrs) = show (length attrs) ++ " remaining unparsed attributes: \n" ++ intercalate "\n" (show <$> attrs)
|
|
Packit |
899799 |
#endif
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A @NameMatcher@ describes which names a tag parser is allowed to match.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
newtype NameMatcher a = NameMatcher { runNameMatcher :: Name -> Maybe a }
|
|
Packit |
899799 |
|
|
Packit |
899799 |
deriving instance Functor NameMatcher
|
|
Packit |
899799 |
|
|
Packit |
899799 |
instance Applicative NameMatcher where
|
|
Packit |
899799 |
pure a = NameMatcher $ const $ pure a
|
|
Packit |
899799 |
NameMatcher f <*> NameMatcher a = NameMatcher $ \name -> f name <*> a name
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | 'NameMatcher's can be combined with @\<|\>@
|
|
Packit |
899799 |
instance Alternative NameMatcher where
|
|
Packit |
899799 |
empty = NameMatcher $ const Nothing
|
|
Packit |
899799 |
NameMatcher f <|> NameMatcher g = NameMatcher (\a -> f a <|> g a)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Match a single 'Name' in a concise way.
|
|
Packit |
899799 |
-- Note that 'Name' is namespace sensitive: when using the 'IsString' instance,
|
|
Packit |
899799 |
-- use @"{http:\/\/a\/b}c"@ to match the tag @c@ in the XML namespace @http://a/b@
|
|
Packit |
899799 |
instance (a ~ Name) => IsString (NameMatcher a) where
|
|
Packit |
899799 |
fromString s = matching (== fromString s)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | @matching f@ matches @name@ iff @f name@ is true. Returns the matched 'Name'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
matching :: (Name -> Bool) -> NameMatcher Name
|
|
Packit |
899799 |
matching f = NameMatcher $ \name -> if f name then Just name else Nothing
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Matches any 'Name'. Returns the matched 'Name'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
anyName :: NameMatcher Name
|
|
Packit |
899799 |
anyName = matching (const True)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Matches any 'Name' from the given list. Returns the matched 'Name'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
anyOf :: [Name] -> NameMatcher Name
|
|
Packit |
899799 |
anyOf values = matching (`elem` values)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | A monad for parsing attributes. By default, it requires you to deal with
|
|
Packit |
899799 |
-- all attributes present on an element, and will throw an exception if there
|
|
Packit |
899799 |
-- are unhandled attributes. Use the 'requireAttr', 'attr' et al
|
|
Packit |
899799 |
-- functions for handling an attribute, and 'ignoreAttrs' if you would like to
|
|
Packit |
899799 |
-- skip the rest of the attributes on an element.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- 'Alternative' instance behaves like 'First' monoid: it chooses first
|
|
Packit |
899799 |
-- parser which doesn't fail.
|
|
Packit |
899799 |
newtype AttrParser a = AttrParser { runAttrParser :: [(Name, [Content])] -> Either SomeException ([(Name, [Content])], a) }
|
|
Packit |
899799 |
|
|
Packit |
899799 |
instance Monad AttrParser where
|
|
Packit |
899799 |
return a = AttrParser $ \as -> Right (as, a)
|
|
Packit |
899799 |
(AttrParser f) >>= g = AttrParser $ \as ->
|
|
Packit |
899799 |
either Left (\(as', f') -> runAttrParser (g f') as') (f as)
|
|
Packit |
899799 |
instance Functor AttrParser where
|
|
Packit |
899799 |
fmap = liftM
|
|
Packit |
899799 |
instance Applicative AttrParser where
|
|
Packit |
899799 |
pure = return
|
|
Packit |
899799 |
(<*>) = ap
|
|
Packit |
899799 |
instance Alternative AttrParser where
|
|
Packit |
899799 |
empty = AttrParser $ const $ Left $ toException $ XmlException "AttrParser.empty" Nothing
|
|
Packit |
899799 |
AttrParser f <|> AttrParser g = AttrParser $ \x ->
|
|
Packit |
899799 |
either (const $ g x) Right (f x)
|
|
Packit |
899799 |
instance MonadThrow AttrParser where
|
|
Packit |
899799 |
throwM = AttrParser . const . throwM
|
|
Packit |
899799 |
|
|
Packit |
899799 |
optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b)
|
|
Packit |
899799 |
optionalAttrRaw f =
|
|
Packit |
899799 |
AttrParser $ go id
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
go front [] = Right (front [], Nothing)
|
|
Packit |
899799 |
go front (a:as) =
|
|
Packit |
899799 |
maybe (go (front . (:) a) as)
|
|
Packit |
899799 |
(\b -> Right (front as, Just b))
|
|
Packit |
899799 |
(f a)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b
|
|
Packit |
899799 |
requireAttrRaw msg f = optionalAttrRaw f >>=
|
|
Packit |
899799 |
maybe (AttrParser $ const $ Left $ toException $ MissingAttribute msg)
|
|
Packit |
899799 |
return
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Return the value for an attribute if present.
|
|
Packit |
899799 |
attr :: Name -> AttrParser (Maybe Text)
|
|
Packit |
899799 |
attr n = optionalAttrRaw
|
|
Packit |
899799 |
(\(x, y) -> if x == n then Just (contentsToText y) else Nothing)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Shortcut composition of 'force' and 'attr'.
|
|
Packit |
899799 |
requireAttr :: Name -> AttrParser Text
|
|
Packit |
899799 |
requireAttr n = force ("Missing attribute: " ++ show n) $ attr n
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED optionalAttr "Please use 'attr'." #-}
|
|
Packit |
899799 |
optionalAttr :: Name -> AttrParser (Maybe Text)
|
|
Packit |
899799 |
optionalAttr = attr
|
|
Packit |
899799 |
|
|
Packit |
899799 |
contentsToText :: [Content] -> Text
|
|
Packit |
899799 |
contentsToText = T.concat . map toText where
|
|
Packit |
899799 |
toText (ContentText t) = t
|
|
Packit |
899799 |
toText (ContentEntity e) = T.concat ["&", e, ";"]
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Skip the remaining attributes on an element. Since this will clear the
|
|
Packit |
899799 |
-- list of attributes, you must call this /after/ any calls to 'requireAttr',
|
|
Packit |
899799 |
-- 'optionalAttr', etc.
|
|
Packit |
899799 |
ignoreAttrs :: AttrParser ()
|
|
Packit |
899799 |
ignoreAttrs = AttrParser $ const $ Right ([], ())
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Keep parsing elements as long as the parser returns 'Just'.
|
|
Packit |
899799 |
many :: Monad m
|
|
Packit |
899799 |
=> ConduitM Event o m (Maybe a)
|
|
Packit |
899799 |
-> ConduitM Event o m [a]
|
|
Packit |
899799 |
many i = manyIgnore i $ return Nothing
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'many' but discards the results without building an intermediate list.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
many_ :: MonadThrow m
|
|
Packit |
899799 |
=> ConduitM Event o m (Maybe a)
|
|
Packit |
899799 |
-> ConduitM Event o m ()
|
|
Packit |
899799 |
many_ consumer = manyIgnoreYield (return Nothing) (void <$> consumer)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Keep parsing elements as long as the parser returns 'Just'
|
|
Packit |
899799 |
-- or the ignore parser returns 'Just'.
|
|
Packit |
899799 |
manyIgnore :: Monad m
|
|
Packit |
899799 |
=> ConduitM Event o m (Maybe a)
|
|
Packit |
899799 |
-> ConduitM Event o m (Maybe b)
|
|
Packit |
899799 |
-> ConduitM Event o m [a]
|
|
Packit |
899799 |
manyIgnore i ignored = go id where
|
|
Packit |
899799 |
go front = i >>= maybe (onFail front) (\y -> go $ front . (:) y)
|
|
Packit |
899799 |
-- onFail is called if the main parser fails
|
|
Packit |
899799 |
onFail front = ignored >>= maybe (return $ front []) (const $ go front)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like @many@, but any tags and content the consumer doesn't match on
|
|
Packit |
899799 |
-- are silently ignored.
|
|
Packit |
899799 |
many' :: MonadThrow m
|
|
Packit |
899799 |
=> ConduitM Event o m (Maybe a)
|
|
Packit |
899799 |
-> ConduitM Event o m [a]
|
|
Packit |
899799 |
many' consumer = manyIgnore consumer ignoreAllTreesContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'many', but uses 'yield' so the result list can be streamed
|
|
Packit |
899799 |
-- to downstream conduits without waiting for 'manyYield' to finish
|
|
Packit |
899799 |
manyYield :: Monad m
|
|
Packit |
899799 |
=> ConduitM a b m (Maybe b)
|
|
Packit |
899799 |
-> Conduit a m b
|
|
Packit |
899799 |
manyYield consumer = fix $ \loop ->
|
|
Packit |
899799 |
consumer >>= maybe (return ()) (\x -> yield x >> loop)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'manyIgnore', but uses 'yield' so the result list can be streamed
|
|
Packit |
899799 |
-- to downstream conduits without waiting for 'manyIgnoreYield' to finish
|
|
Packit |
899799 |
manyIgnoreYield :: MonadThrow m
|
|
Packit |
899799 |
=> ConduitM Event b m (Maybe b) -- ^ Consuming parser that generates the result stream
|
|
Packit |
899799 |
-> ConduitM Event b m (Maybe ()) -- ^ Ignore parser that consumes elements to be ignored
|
|
Packit |
899799 |
-> Conduit Event m b
|
|
Packit |
899799 |
manyIgnoreYield consumer ignoreParser = fix $ \loop ->
|
|
Packit |
899799 |
consumer >>= maybe (onFail loop) (\x -> yield x >> loop)
|
|
Packit |
899799 |
where onFail loop = ignoreParser >>= maybe (return ()) (const loop)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'many'', but uses 'yield' so the result list can be streamed
|
|
Packit |
899799 |
-- to downstream conduits without waiting for 'manyYield'' to finish
|
|
Packit |
899799 |
manyYield' :: MonadThrow m
|
|
Packit |
899799 |
=> ConduitM Event b m (Maybe b)
|
|
Packit |
899799 |
-> Conduit Event m b
|
|
Packit |
899799 |
manyYield' consumer = manyIgnoreYield consumer ignoreAllTreesContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Stream a content 'Event'. If next event isn't a content, nothing is consumed.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Returns @Just ()@ if a content 'Event' was consumed, @Nothing@ otherwise.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
takeContent :: MonadThrow m => ConduitM Event Event m (Maybe ())
|
|
Packit |
899799 |
takeContent = do
|
|
Packit |
899799 |
event <- await
|
|
Packit |
899799 |
case event of
|
|
Packit |
899799 |
Just e@(EventContent ContentText{}) -> yield e >> return (Just ())
|
|
Packit |
899799 |
Just e@EventCDATA{} -> yield e >> return (Just ())
|
|
Packit |
899799 |
Just e -> if isWhitespace e then yield e >> takeContent else leftover e >> return Nothing
|
|
Packit |
899799 |
_ -> return Nothing
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Stream 'Event's corresponding to a single element that matches given 'NameMatcher' and 'AttrParser', from the opening- to the closing-tag.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- If next 'Event' isn't an element, nothing is consumed.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- If an opening-tag is consumed but no matching closing-tag is found, an 'XmlException' is thrown.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- This function automatically ignores comments, instructions and whitespace.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Returns @Just ()@ if an element was consumed, 'Nothing' otherwise.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
takeTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitM Event Event m (Maybe ())
|
|
Packit |
899799 |
takeTree nameMatcher attrParser = do
|
|
Packit |
899799 |
event <- await
|
|
Packit |
899799 |
case event of
|
|
Packit |
899799 |
Just e@(EventBeginElement name as) -> case runNameMatcher nameMatcher name of
|
|
Packit |
899799 |
Just _ -> case runAttrParser attrParser as of
|
|
Packit |
899799 |
Right _ -> do
|
|
Packit |
899799 |
yield e
|
|
Packit |
899799 |
whileJust takeAnyTreeContent
|
|
Packit |
899799 |
endEvent <- await
|
|
Packit |
899799 |
case endEvent of
|
|
Packit |
899799 |
Just e'@(EventEndElement name') | name == name' -> yield e' >> return (Just ())
|
|
Packit |
899799 |
_ -> lift $ monadThrow $ InvalidEndElement name endEvent
|
|
Packit |
899799 |
_ -> leftover e >> return Nothing
|
|
Packit |
899799 |
_ -> leftover e >> return Nothing
|
|
Packit |
899799 |
|
|
Packit |
899799 |
Just e -> if isWhitespace e then yield e >> takeTree nameMatcher attrParser else leftover e >> return Nothing
|
|
Packit |
899799 |
_ -> return Nothing
|
|
Packit |
899799 |
where
|
|
Packit |
899799 |
whileJust f = fix $ \loop -> f >>= maybe (return ()) (const loop)
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'takeTree', but can also stream a content 'Event'.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
takeTreeContent :: MonadThrow m
|
|
Packit |
899799 |
=> NameMatcher a
|
|
Packit |
899799 |
-> AttrParser b
|
|
Packit |
899799 |
-> ConduitM Event Event m (Maybe ())
|
|
Packit |
899799 |
takeTreeContent nameMatcher attrParser = runMaybeT $ MaybeT (takeTree nameMatcher attrParser) <|> MaybeT takeContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Like 'takeTreeContent', without checking for tag name or attributes.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- >>> runResourceT $ parseLBS def "text" $$ takeAnyTreeContent =$= consume
|
|
Packit |
899799 |
-- Just [ EventContent (ContentText "text") ]
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- >>> runResourceT $ parseLBS def "" $$ takeAnyTreeContent =$= consume
|
|
Packit |
899799 |
-- Just [ ]
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- >>> runResourceT $ parseLBS def "<c></c>text" $$ takeAnyTreeContent =$= consume
|
|
Packit |
899799 |
-- Just [ EventBeginElement "b" [], EventBeginElement "c" [], EventEndElement "c", EventEndElement "b" ]
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Since 1.5.0
|
|
Packit |
899799 |
takeAnyTreeContent :: MonadThrow m
|
|
Packit |
899799 |
=> ConduitM Event Event m (Maybe ())
|
|
Packit |
899799 |
takeAnyTreeContent = takeTreeContent anyName ignoreAttrs
|
|
Packit |
899799 |
|
|
Packit |
899799 |
{-# DEPRECATED takeAllTreesContent "Please use 'takeAnyTreeContent'." #-}
|
|
Packit |
899799 |
takeAllTreesContent :: MonadThrow m => ConduitM Event Event m (Maybe ())
|
|
Packit |
899799 |
takeAllTreesContent = takeAnyTreeContent
|
|
Packit |
899799 |
|
|
Packit |
899799 |
type DecodeEntities = Text -> Content
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | Default implementation of 'DecodeEntities', which leaves the
|
|
Packit |
899799 |
-- entity as-is. Numeric character references and the five standard
|
|
Packit |
899799 |
-- entities (lt, gt, amp, quot, pos) are handled internally by the
|
|
Packit |
899799 |
-- parser.
|
|
Packit |
899799 |
decodeXmlEntities :: DecodeEntities
|
|
Packit |
899799 |
decodeXmlEntities = ContentEntity
|
|
Packit |
899799 |
|
|
Packit |
899799 |
-- | HTML4-compliant entity decoder. Handles the additional 248
|
|
Packit |
899799 |
-- entities defined by HTML 4 and XHTML 1.
|
|
Packit |
899799 |
--
|
|
Packit |
899799 |
-- Note that HTML 5 introduces a drastically larger number of entities, and
|
|
Packit |
899799 |
-- this code does not recognize most of them.
|
|
Packit |
899799 |
decodeHtmlEntities :: DecodeEntities
|
|
Packit |
899799 |
decodeHtmlEntities t =
|
|
Packit |
899799 |
maybe (ContentEntity t) ContentText $ Map.lookup t htmlEntities
|
|
Packit |
899799 |
|
|
Packit |
899799 |
htmlEntities :: Map.Map T.Text T.Text
|
|
Packit |
899799 |
htmlEntities = Map.fromList
|
|
Packit |
899799 |
$ map (pack *** pack) -- Work around the long-compile-time bug
|
|
Packit |
899799 |
[ ("nbsp", "\160")
|
|
Packit |
899799 |
, ("iexcl", "\161")
|
|
Packit |
899799 |
, ("cent", "\162")
|
|
Packit |
899799 |
, ("pound", "\163")
|
|
Packit |
899799 |
, ("curren", "\164")
|
|
Packit |
899799 |
, ("yen", "\165")
|
|
Packit |
899799 |
, ("brvbar", "\166")
|
|
Packit |
899799 |
, ("sect", "\167")
|
|
Packit |
899799 |
, ("uml", "\168")
|
|
Packit |
899799 |
, ("copy", "\169")
|
|
Packit |
899799 |
, ("ordf", "\170")
|
|
Packit |
899799 |
, ("laquo", "\171")
|
|
Packit |
899799 |
, ("not", "\172")
|
|
Packit |
899799 |
, ("shy", "\173")
|
|
Packit |
899799 |
, ("reg", "\174")
|
|
Packit |
899799 |
, ("macr", "\175")
|
|
Packit |
899799 |
, ("deg", "\176")
|
|
Packit |
899799 |
, ("plusmn", "\177")
|
|
Packit |
899799 |
, ("sup2", "\178")
|
|
Packit |
899799 |
, ("sup3", "\179")
|
|
Packit |
899799 |
, ("acute", "\180")
|
|
Packit |
899799 |
, ("micro", "\181")
|
|
Packit |
899799 |
, ("para", "\182")
|
|
Packit |
899799 |
, ("middot", "\183")
|
|
Packit |
899799 |
, ("cedil", "\184")
|
|
Packit |
899799 |
, ("sup1", "\185")
|
|
Packit |
899799 |
, ("ordm", "\186")
|
|
Packit |
899799 |
, ("raquo", "\187")
|
|
Packit |
899799 |
, ("frac14", "\188")
|
|
Packit |
899799 |
, ("frac12", "\189")
|
|
Packit |
899799 |
, ("frac34", "\190")
|
|
Packit |
899799 |
, ("iquest", "\191")
|
|
Packit |
899799 |
, ("Agrave", "\192")
|
|
Packit |
899799 |
, ("Aacute", "\193")
|
|
Packit |
899799 |
, ("Acirc", "\194")
|
|
Packit |
899799 |
, ("Atilde", "\195")
|
|
Packit |
899799 |
, ("Auml", "\196")
|
|
Packit |
899799 |
, ("Aring", "\197")
|
|
Packit |
899799 |
, ("AElig", "\198")
|
|
Packit |
899799 |
, ("Ccedil", "\199")
|
|
Packit |
899799 |
, ("Egrave", "\200")
|
|
Packit |
899799 |
, ("Eacute", "\201")
|
|
Packit |
899799 |
, ("Ecirc", "\202")
|
|
Packit |
899799 |
, ("Euml", "\203")
|
|
Packit |
899799 |
, ("Igrave", "\204")
|
|
Packit |
899799 |
, ("Iacute", "\205")
|
|
Packit |
899799 |
, ("Icirc", "\206")
|
|
Packit |
899799 |
, ("Iuml", "\207")
|
|
Packit |
899799 |
, ("ETH", "\208")
|
|
Packit |
899799 |
, ("Ntilde", "\209")
|
|
Packit |
899799 |
, ("Ograve", "\210")
|
|
Packit |
899799 |
, ("Oacute", "\211")
|
|
Packit |
899799 |
, ("Ocirc", "\212")
|
|
Packit |
899799 |
, ("Otilde", "\213")
|
|
Packit |
899799 |
, ("Ouml", "\214")
|
|
Packit |
899799 |
, ("times", "\215")
|
|
Packit |
899799 |
, ("Oslash", "\216")
|
|
Packit |
899799 |
, ("Ugrave", "\217")
|
|
Packit |
899799 |
, ("Uacute", "\218")
|
|
Packit |
899799 |
, ("Ucirc", "\219")
|
|
Packit |
899799 |
, ("Uuml", "\220")
|
|
Packit |
899799 |
, ("Yacute", "\221")
|
|
Packit |
899799 |
, ("THORN", "\222")
|
|
Packit |
899799 |
, ("szlig", "\223")
|
|
Packit |
899799 |
, ("agrave", "\224")
|
|
Packit |
899799 |
, ("aacute", "\225")
|
|
Packit |
899799 |
, ("acirc", "\226")
|
|
Packit |
899799 |
, ("atilde", "\227")
|
|
Packit |
899799 |
, ("auml", "\228")
|
|
Packit |
899799 |
, ("aring", "\229")
|
|
Packit |
899799 |
, ("aelig", "\230")
|
|
Packit |
899799 |
, ("ccedil", "\231")
|
|
Packit |
899799 |
, ("egrave", "\232")
|
|
Packit |
899799 |
, ("eacute", "\233")
|
|
Packit |
899799 |
, ("ecirc", "\234")
|
|
Packit |
899799 |
, ("euml", "\235")
|
|
Packit |
899799 |
, ("igrave", "\236")
|
|
Packit |
899799 |
, ("iacute", "\237")
|
|
Packit |
899799 |
, ("icirc", "\238")
|
|
Packit |
899799 |
, ("iuml", "\239")
|
|
Packit |
899799 |
, ("eth", "\240")
|
|
Packit |
899799 |
, ("ntilde", "\241")
|
|
Packit |
899799 |
, ("ograve", "\242")
|
|
Packit |
899799 |
, ("oacute", "\243")
|
|
Packit |
899799 |
, ("ocirc", "\244")
|
|
Packit |
899799 |
, ("otilde", "\245")
|
|
Packit |
899799 |
, ("ouml", "\246")
|
|
Packit |
899799 |
, ("divide", "\247")
|
|
Packit |
899799 |
, ("oslash", "\248")
|
|
Packit |
899799 |
, ("ugrave", "\249")
|
|
Packit |
899799 |
, ("uacute", "\250")
|
|
Packit |
899799 |
, ("ucirc", "\251")
|
|
Packit |
899799 |
, ("uuml", "\252")
|
|
Packit |
899799 |
, ("yacute", "\253")
|
|
Packit |
899799 |
, ("thorn", "\254")
|
|
Packit |
899799 |
, ("yuml", "\255")
|
|
Packit |
899799 |
, ("OElig", "\338")
|
|
Packit |
899799 |
, ("oelig", "\339")
|
|
Packit |
899799 |
, ("Scaron", "\352")
|
|
Packit |
899799 |
, ("scaron", "\353")
|
|
Packit |
899799 |
, ("Yuml", "\376")
|
|
Packit |
899799 |
, ("fnof", "\402")
|
|
Packit |
899799 |
, ("circ", "\710")
|
|
Packit |
899799 |
, ("tilde", "\732")
|
|
Packit |
899799 |
, ("Alpha", "\913")
|
|
Packit |
899799 |
, ("Beta", "\914")
|
|
Packit |
899799 |
, ("Gamma", "\915")
|
|
Packit |
899799 |
, ("Delta", "\916")
|
|
Packit |
899799 |
, ("Epsilon", "\917")
|
|
Packit |
899799 |
, ("Zeta", "\918")
|
|
Packit |
899799 |
, ("Eta", "\919")
|
|
Packit |
899799 |
, ("Theta", "\920")
|
|
Packit |
899799 |
, ("Iota", "\921")
|
|
Packit |
899799 |
, ("Kappa", "\922")
|
|
Packit |
899799 |
, ("Lambda", "\923")
|
|
Packit |
899799 |
, ("Mu", "\924")
|
|
Packit |
899799 |
, ("Nu", "\925")
|
|
Packit |
899799 |
, ("Xi", "\926")
|
|
Packit |
899799 |
, ("Omicron", "\927")
|
|
Packit |
899799 |
, ("Pi", "\928")
|
|
Packit |
899799 |
, ("Rho", "\929")
|
|
Packit |
899799 |
, ("Sigma", "\931")
|
|
Packit |
899799 |
, ("Tau", "\932")
|
|
Packit |
899799 |
, ("Upsilon", "\933")
|
|
Packit |
899799 |
, ("Phi", "\934")
|
|
Packit |
899799 |
, ("Chi", "\935")
|
|
Packit |
899799 |
, ("Psi", "\936")
|
|
Packit |
899799 |
, ("Omega", "\937")
|
|
Packit |
899799 |
, ("alpha", "\945")
|
|
Packit |
899799 |
, ("beta", "\946")
|
|
Packit |
899799 |
, ("gamma", "\947")
|
|
Packit |
899799 |
, ("delta", "\948")
|
|
Packit |
899799 |
, ("epsilon", "\949")
|
|
Packit |
899799 |
, ("zeta", "\950")
|
|
Packit |
899799 |
, ("eta", "\951")
|
|
Packit |
899799 |
, ("theta", "\952")
|
|
Packit |
899799 |
, ("iota", "\953")
|
|
Packit |
899799 |
, ("kappa", "\954")
|
|
Packit |
899799 |
, ("lambda", "\955")
|
|
Packit |
899799 |
, ("mu", "\956")
|
|
Packit |
899799 |
, ("nu", "\957")
|
|
Packit |
899799 |
, ("xi", "\958")
|
|
Packit |
899799 |
, ("omicron", "\959")
|
|
Packit |
899799 |
, ("pi", "\960")
|
|
Packit |
899799 |
, ("rho", "\961")
|
|
Packit |
899799 |
, ("sigmaf", "\962")
|
|
Packit |
899799 |
, ("sigma", "\963")
|
|
Packit |
899799 |
, ("tau", "\964")
|
|
Packit |
899799 |
, ("upsilon", "\965")
|
|
Packit |
899799 |
, ("phi", "\966")
|
|
Packit |
899799 |
, ("chi", "\967")
|
|
Packit |
899799 |
, ("psi", "\968")
|
|
Packit |
899799 |
, ("omega", "\969")
|
|
Packit |
899799 |
, ("thetasym", "\977")
|
|
Packit |
899799 |
, ("upsih", "\978")
|
|
Packit |
899799 |
, ("piv", "\982")
|
|
Packit |
899799 |
, ("ensp", "\8194")
|
|
Packit |
899799 |
, ("emsp", "\8195")
|
|
Packit |
899799 |
, ("thinsp", "\8201")
|
|
Packit |
899799 |
, ("zwnj", "\8204")
|
|
Packit |
899799 |
, ("zwj", "\8205")
|
|
Packit |
899799 |
, ("lrm", "\8206")
|
|
Packit |
899799 |
, ("rlm", "\8207")
|
|
Packit |
899799 |
, ("ndash", "\8211")
|
|
Packit |
899799 |
, ("mdash", "\8212")
|
|
Packit |
899799 |
, ("lsquo", "\8216")
|
|
Packit |
899799 |
, ("rsquo", "\8217")
|
|
Packit |
899799 |
, ("sbquo", "\8218")
|
|
Packit |
899799 |
, ("ldquo", "\8220")
|
|
Packit |
899799 |
, ("rdquo", "\8221")
|
|
Packit |
899799 |
, ("bdquo", "\8222")
|
|
Packit |
899799 |
, ("dagger", "\8224")
|
|
Packit |
899799 |
, ("Dagger", "\8225")
|
|
Packit |
899799 |
, ("bull", "\8226")
|
|
Packit |
899799 |
, ("hellip", "\8230")
|
|
Packit |
899799 |
, ("permil", "\8240")
|
|
Packit |
899799 |
, ("prime", "\8242")
|
|
Packit |
899799 |
, ("Prime", "\8243")
|
|
Packit |
899799 |
, ("lsaquo", "\8249")
|
|
Packit |
899799 |
, ("rsaquo", "\8250")
|
|
Packit |
899799 |
, ("oline", "\8254")
|
|
Packit |
899799 |
, ("frasl", "\8260")
|
|
Packit |
899799 |
, ("euro", "\8364")
|
|
Packit |
899799 |
, ("image", "\8465")
|
|
Packit |
899799 |
, ("weierp", "\8472")
|
|
Packit |
899799 |
, ("real", "\8476")
|
|
Packit |
899799 |
, ("trade", "\8482")
|
|
Packit |
899799 |
, ("alefsym", "\8501")
|
|
Packit |
899799 |
, ("larr", "\8592")
|
|
Packit |
899799 |
, ("uarr", "\8593")
|
|
Packit |
899799 |
, ("rarr", "\8594")
|
|
Packit |
899799 |
, ("darr", "\8595")
|
|
Packit |
899799 |
, ("harr", "\8596")
|
|
Packit |
899799 |
, ("crarr", "\8629")
|
|
Packit |
899799 |
, ("lArr", "\8656")
|
|
Packit |
899799 |
, ("uArr", "\8657")
|
|
Packit |
899799 |
, ("rArr", "\8658")
|
|
Packit |
899799 |
, ("dArr", "\8659")
|
|
Packit |
899799 |
, ("hArr", "\8660")
|
|
Packit |
899799 |
, ("forall", "\8704")
|
|
Packit |
899799 |
, ("part", "\8706")
|
|
Packit |
899799 |
, ("exist", "\8707")
|
|
Packit |
899799 |
, ("empty", "\8709")
|
|
Packit |
899799 |
, ("nabla", "\8711")
|
|
Packit |
899799 |
, ("isin", "\8712")
|
|
Packit |
899799 |
, ("notin", "\8713")
|
|
Packit |
899799 |
, ("ni", "\8715")
|
|
Packit |
899799 |
, ("prod", "\8719")
|
|
Packit |
899799 |
, ("sum", "\8721")
|
|
Packit |
899799 |
, ("minus", "\8722")
|
|
Packit |
899799 |
, ("lowast", "\8727")
|
|
Packit |
899799 |
, ("radic", "\8730")
|
|
Packit |
899799 |
, ("prop", "\8733")
|
|
Packit |
899799 |
, ("infin", "\8734")
|
|
Packit |
899799 |
, ("ang", "\8736")
|
|
Packit |
899799 |
, ("and", "\8743")
|
|
Packit |
899799 |
, ("or", "\8744")
|
|
Packit |
899799 |
, ("cap", "\8745")
|
|
Packit |
899799 |
, ("cup", "\8746")
|
|
Packit |
899799 |
, ("int", "\8747")
|
|
Packit |
899799 |
, ("there4", "\8756")
|
|
Packit |
899799 |
, ("sim", "\8764")
|
|
Packit |
899799 |
, ("cong", "\8773")
|
|
Packit |
899799 |
, ("asymp", "\8776")
|
|
Packit |
899799 |
, ("ne", "\8800")
|
|
Packit |
899799 |
, ("equiv", "\8801")
|
|
Packit |
899799 |
, ("le", "\8804")
|
|
Packit |
899799 |
, ("ge", "\8805")
|
|
Packit |
899799 |
, ("sub", "\8834")
|
|
Packit |
899799 |
, ("sup", "\8835")
|
|
Packit |
899799 |
, ("nsub", "\8836")
|
|
Packit |
899799 |
, ("sube", "\8838")
|
|
Packit |
899799 |
, ("supe", "\8839")
|
|
Packit |
899799 |
, ("oplus", "\8853")
|
|
Packit |
899799 |
, ("otimes", "\8855")
|
|
Packit |
899799 |
, ("perp", "\8869")
|
|
Packit |
899799 |
, ("sdot", "\8901")
|
|
Packit |
899799 |
, ("lceil", "\8968")
|
|
Packit |
899799 |
, ("rceil", "\8969")
|
|
Packit |
899799 |
, ("lfloor", "\8970")
|
|
Packit |
899799 |
, ("rfloor", "\8971")
|
|
Packit |
899799 |
, ("lang", "\9001")
|
|
Packit |
899799 |
, ("rang", "\9002")
|
|
Packit |
899799 |
, ("loz", "\9674")
|
|
Packit |
899799 |
, ("spades", "\9824")
|
|
Packit |
899799 |
, ("clubs", "\9827")
|
|
Packit |
899799 |
, ("hearts", "\9829")
|
|
Packit |
899799 |
, ("diams", "\9830")
|
|
Packit |
899799 |
]
|