|
Packit |
5b08af |
{-# LANGUAGE FlexibleContexts #-}
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
{- |
|
|
Packit |
5b08af |
Module : Text.Regex.XMLSchema.RegexParser
|
|
Packit |
5b08af |
Copyright : Copyright (C) 2014- Uwe Schmidt
|
|
Packit |
5b08af |
License : MIT
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
|
|
Packit |
5b08af |
Stability : stable
|
|
Packit |
5b08af |
Portability: portable
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
W3C XML Schema Regular Expression Parser
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
This parser supports the full W3C standard, the
|
|
Packit |
5b08af |
complete grammar can be found under <http://www.w3.org/TR/xmlschema11-2/#regexs>
|
|
Packit |
5b08af |
and extensions for all missing set operations, intersection,
|
|
Packit |
5b08af |
difference, exclusive or, interleave, complement
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-}
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
module Text.Regex.XMLSchema.Generic.RegexParser
|
|
Packit |
5b08af |
( parseRegex
|
|
Packit |
5b08af |
, parseRegexExt
|
|
Packit |
5b08af |
, parseRegex'
|
|
Packit |
5b08af |
, parseRegexExt'
|
|
Packit |
5b08af |
, parseContextRegex
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
import Data.Char.Properties.UnicodeBlocks
|
|
Packit |
5b08af |
import Data.Char.Properties.UnicodeCharProps
|
|
Packit |
5b08af |
import Data.Char.Properties.XMLCharProps
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
import Data.List (isPrefixOf,
|
|
Packit |
5b08af |
isSuffixOf)
|
|
Packit |
5b08af |
import Data.Maybe
|
|
Packit |
5b08af |
import Data.Set.CharSet
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
import Text.ParserCombinators.Parsec
|
|
Packit |
5b08af |
import Text.Regex.XMLSchema.Generic.Regex
|
|
Packit |
5b08af |
import Text.Regex.XMLSchema.Generic.StringLike
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- | parse a standard W3C XML Schema regular expression
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseRegex :: StringLike s => s -> GenRegex s
|
|
Packit |
5b08af |
parseRegex = parseRegex' . toString
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseRegex' :: StringLike s => String -> GenRegex s
|
|
Packit |
5b08af |
parseRegex' = parseRegex'' regExpStd
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- | parse an extended syntax W3C XML Schema regular expression
|
|
Packit |
5b08af |
--
|
|
Packit |
5b08af |
-- The Syntax of the W3C XML Schema spec is extended by
|
|
Packit |
5b08af |
-- further useful set operations, like intersection, difference, exor.
|
|
Packit |
5b08af |
-- Subexpression match becomes possible with \"named\" pairs of parentheses.
|
|
Packit |
5b08af |
-- The multi char escape sequence \\a represents any Unicode char,
|
|
Packit |
5b08af |
-- The multi char escape sequence \\A represents any Unicode word, (\\A = \\a*).
|
|
Packit |
5b08af |
-- All syntactically wrong inputs are mapped to the Zero expression representing the
|
|
Packit |
5b08af |
-- empty set of words. Zero contains as data field a string for an error message.
|
|
Packit |
5b08af |
-- So error checking after parsing becomes possible by checking against Zero ('isZero' predicate)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseRegexExt :: StringLike s => s -> GenRegex s
|
|
Packit |
5b08af |
parseRegexExt = parseRegexExt' . toString
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseRegexExt' :: StringLike s => String -> GenRegex s
|
|
Packit |
5b08af |
parseRegexExt' = parseRegex'' regExpExt
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseRegex'' :: StringLike s => Parser (GenRegex s) -> String -> GenRegex s
|
|
Packit |
5b08af |
parseRegex'' regExp'
|
|
Packit |
5b08af |
= either (mkZero' . ("syntax error: " ++) . show) id
|
|
Packit |
5b08af |
. parse ( do
|
|
Packit |
5b08af |
r <- regExp'
|
|
Packit |
5b08af |
eof
|
|
Packit |
5b08af |
return r
|
|
Packit |
5b08af |
) ""
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- | parse a regular expression surrounded by contenxt spec
|
|
Packit |
5b08af |
--
|
|
Packit |
5b08af |
-- a leading @^@ denotes start of text,
|
|
Packit |
5b08af |
-- a trailing @$@ denotes end of text,
|
|
Packit |
5b08af |
-- a leading @\\<@ denotes word start,
|
|
Packit |
5b08af |
-- a trailing @\\>@ denotes word end.
|
|
Packit |
5b08af |
--
|
|
Packit |
5b08af |
-- The 1. param ist the regex parser ('parseRegex' or 'parseRegexExt')
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex s
|
|
Packit |
5b08af |
parseContextRegex parseRe re0
|
|
Packit |
5b08af |
= re'
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
parseAW = parseRegexExt' "(\\A\\W)?"
|
|
Packit |
5b08af |
parseWA = parseRegexExt' "(\\W\\A)?"
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
re = toString re0
|
|
Packit |
5b08af |
re' = mkSeqs . concat $ [ startContext
|
|
Packit |
5b08af |
, (:[]) . parseRe $ re2
|
|
Packit |
5b08af |
, endContext
|
|
Packit |
5b08af |
]
|
|
Packit |
5b08af |
(startContext, re1)
|
|
Packit |
5b08af |
| "^" `isPrefixOf` re = ([], tail re)
|
|
Packit |
5b08af |
| "\\<" `isPrefixOf` re = ([parseAW], drop 2 re)
|
|
Packit |
5b08af |
| otherwise = ([mkStar mkDot], re)
|
|
Packit |
5b08af |
(endContext, re2)
|
|
Packit |
5b08af |
| "$" `isSuffixOf` re1 = ([], init re1)
|
|
Packit |
5b08af |
| "\\>" `isSuffixOf` re1 = ([parseWA], init . init $ re1)
|
|
Packit |
5b08af |
| otherwise = ([mkStar mkDot], re1)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
regExpExt :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
regExpExt = branchList orElseList
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
regExpStd :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
regExpStd = branchList seqListStd
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
branchList :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s)
|
|
Packit |
5b08af |
branchList exParser
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- exParser
|
|
Packit |
5b08af |
rs <- many branchList1
|
|
Packit |
5b08af |
return (foldr1 mkAlt $ r1:rs) -- union is associative, so we use right ass.
|
|
Packit |
5b08af |
-- as with seq, alt and exor
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
branchList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char '|'
|
|
Packit |
5b08af |
exParser
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
orElseList :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
orElseList
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- interleaveList
|
|
Packit |
5b08af |
rs <- many orElseList1
|
|
Packit |
5b08af |
return (foldr1 mkElse $ r1:rs) -- orElse is associative, so we choose right ass.
|
|
Packit |
5b08af |
-- as with seq and alt ops
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
orElseList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- try (string "{|}")
|
|
Packit |
5b08af |
interleaveList
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
interleaveList :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
interleaveList
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- exorList
|
|
Packit |
5b08af |
rs <- many interleaveList1
|
|
Packit |
5b08af |
return (foldr1 mkInterleave $ r1:rs) -- interleave is associative, so we choose right ass.
|
|
Packit |
5b08af |
-- as with seq and alt ops
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
interleaveList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- try (string "{:}")
|
|
Packit |
5b08af |
exorList
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
exorList :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
exorList
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- diffList
|
|
Packit |
5b08af |
rs <- many exorList1
|
|
Packit |
5b08af |
return (foldr1 mkExor $ r1:rs) -- exor is associative, so we choose right ass.
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
exorList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- try (string "{^}")
|
|
Packit |
5b08af |
diffList
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
diffList :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
diffList
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- intersectList
|
|
Packit |
5b08af |
rs <- many diffList1
|
|
Packit |
5b08af |
return (foldl1 mkDiff $ r1:rs) -- diff is not associative, so we choose left ass.
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
diffList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- try (string "{\\}")
|
|
Packit |
5b08af |
intersectList
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
intersectList :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
intersectList
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r1 <- seqListExt
|
|
Packit |
5b08af |
rs <- many intersectList1
|
|
Packit |
5b08af |
return (foldr1 mkIsect $ r1:rs)
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
intersectList1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- try (string "{&}")
|
|
Packit |
5b08af |
seqListExt
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
seqListExt :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
seqListExt = seqList' regExpLabel multiCharEscExt
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
seqListStd :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
seqListStd = seqList' regExpStd multiCharEsc
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
seqList' :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) -> Parser (GenRegex s)
|
|
Packit |
5b08af |
seqList' regExp' multiCharEsc'
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
rs <- many piece
|
|
Packit |
5b08af |
return $ mkSeqs rs
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
-- piece :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
piece
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r <- atom
|
|
Packit |
5b08af |
quantifier r
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- atom :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
atom
|
|
Packit |
5b08af |
= char1
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
charClass
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
between (char '(') (char ')') regExp'
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- charClass :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
charClass
|
|
Packit |
5b08af |
= charClassEsc multiCharEsc'
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
charClassExpr multiCharEsc'
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
wildCardEsc
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
quantifier :: StringLike s => GenRegex s -> Parser (GenRegex s)
|
|
Packit |
5b08af |
quantifier r
|
|
Packit |
5b08af |
= ( do
|
|
Packit |
5b08af |
_ <- char '?'
|
|
Packit |
5b08af |
return $ mkOpt r )
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( do
|
|
Packit |
5b08af |
_ <- char '*'
|
|
Packit |
5b08af |
return $ mkStar r )
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( do
|
|
Packit |
5b08af |
_ <- char '+'
|
|
Packit |
5b08af |
return $ mkRep 1 r )
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
try ( do
|
|
Packit |
5b08af |
_ <- char '{'
|
|
Packit |
5b08af |
res <- quantity r
|
|
Packit |
5b08af |
_ <- char '}'
|
|
Packit |
5b08af |
return res
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( return r )
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
quantity :: StringLike s => GenRegex s -> Parser (GenRegex s)
|
|
Packit |
5b08af |
quantity r
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
lb <- many1 digit
|
|
Packit |
5b08af |
quantityRest r (read lb)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
quantityRest :: StringLike s => GenRegex s -> Int -> Parser (GenRegex s)
|
|
Packit |
5b08af |
quantityRest r lb
|
|
Packit |
5b08af |
= ( do
|
|
Packit |
5b08af |
_ <- char ','
|
|
Packit |
5b08af |
ub <- many digit
|
|
Packit |
5b08af |
return ( if null ub
|
|
Packit |
5b08af |
then mkRep lb r
|
|
Packit |
5b08af |
else mkRng lb (read ub) r
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( return $ mkRng lb lb r)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
regExpLabel :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
regExpLabel
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
lab <- option id (between (char '{') (char '}') label')
|
|
Packit |
5b08af |
r <- regExpExt
|
|
Packit |
5b08af |
return $ lab r
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
label'
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
l <- many1 (satisfy isXmlNameChar)
|
|
Packit |
5b08af |
return $ mkBr' l
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
char1 :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
char1
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
c <- satisfy (`notElem` ".\\?*+{}()|[]")
|
|
Packit |
5b08af |
return $ mkSym1 c
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
charClassEsc :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s)
|
|
Packit |
5b08af |
charClassEsc multiCharEsc'
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char '\\'
|
|
Packit |
5b08af |
( singleCharEsc
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
multiCharEsc'
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
catEsc
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
complEsc )
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
singleCharEsc :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
singleCharEsc
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
c <- singleCharEsc'
|
|
Packit |
5b08af |
return $ mkSym1 c
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
singleCharEsc' :: Parser Char
|
|
Packit |
5b08af |
singleCharEsc'
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^")
|
|
Packit |
5b08af |
return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t"
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
multiCharEscExt :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
multiCharEscExt
|
|
Packit |
5b08af |
= multiCharEsc
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( do -- extension: \a represents the whole alphabet inclusive newline chars: \a == .|\n|\r
|
|
Packit |
5b08af |
_ <- char 'a'
|
|
Packit |
5b08af |
return mkDot )
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( do -- extension: \A represents all words: \A == \a* or \A == (.|\n|\r)*
|
|
Packit |
5b08af |
_ <- char 'A'
|
|
Packit |
5b08af |
return mkAll )
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
multiCharEsc :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
multiCharEsc
|
|
Packit |
5b08af |
= ( do
|
|
Packit |
5b08af |
c <- satisfy (`elem` es)
|
|
Packit |
5b08af |
return $ mkSym . fromJust . lookup c $ pm )
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
es = map fst pm
|
|
Packit |
5b08af |
pm = [ ('s', charPropXmlSpaceChar )
|
|
Packit |
5b08af |
, ('S', compCS charPropXmlSpaceChar )
|
|
Packit |
5b08af |
, ('i', charPropXmlNameStartChar )
|
|
Packit |
5b08af |
, ('I', compCS charPropXmlNameStartChar )
|
|
Packit |
5b08af |
, ('c', charPropXmlNameChar )
|
|
Packit |
5b08af |
, ('C', compCS charPropXmlNameChar )
|
|
Packit |
5b08af |
, ('d', charPropDigit )
|
|
Packit |
5b08af |
, ('D', compCS charPropDigit )
|
|
Packit |
5b08af |
, ('w', compCS charPropNotWord )
|
|
Packit |
5b08af |
, ('W', charPropNotWord )
|
|
Packit |
5b08af |
]
|
|
Packit |
5b08af |
charPropDigit = rangeCS '0' '9'
|
|
Packit |
5b08af |
charPropNotWord = charPropUnicodeP
|
|
Packit |
5b08af |
`unionCS`
|
|
Packit |
5b08af |
charPropUnicodeZ
|
|
Packit |
5b08af |
`unionCS`
|
|
Packit |
5b08af |
charPropUnicodeC
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
catEsc :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
catEsc
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char 'p'
|
|
Packit |
5b08af |
s <- between (char '{') (char '}') charProp
|
|
Packit |
5b08af |
return $ mkSym s
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
charProp :: Parser CharSet
|
|
Packit |
5b08af |
charProp
|
|
Packit |
5b08af |
= isCategory
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
isBlock
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
isBlock :: Parser CharSet
|
|
Packit |
5b08af |
isBlock
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- string "Is"
|
|
Packit |
5b08af |
name <- many1 (satisfy legalChar)
|
|
Packit |
5b08af |
case lookup name codeBlocks of
|
|
Packit |
5b08af |
Just b -> return $ uncurry rangeCS b
|
|
Packit |
5b08af |
Nothing -> fail $ "unknown Unicode code block " ++ show name
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
legalChar c = 'A' <= c && c <= 'Z' ||
|
|
Packit |
5b08af |
'a' <= c && c <= 'z' ||
|
|
Packit |
5b08af |
'0' <= c && c <= '9' ||
|
|
Packit |
5b08af |
'-' == c
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
isCategory :: Parser CharSet
|
|
Packit |
5b08af |
isCategory
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
pr <- isCategory'
|
|
Packit |
5b08af |
return $ fromJust (lookup pr categories)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
categories :: [(String, CharSet)]
|
|
Packit |
5b08af |
categories
|
|
Packit |
5b08af |
= [ ("C", charPropUnicodeC )
|
|
Packit |
5b08af |
, ("Cc", charPropUnicodeCc)
|
|
Packit |
5b08af |
, ("Cf", charPropUnicodeCf)
|
|
Packit |
5b08af |
, ("Co", charPropUnicodeCo)
|
|
Packit |
5b08af |
, ("Cs", charPropUnicodeCs)
|
|
Packit |
5b08af |
, ("L", charPropUnicodeL )
|
|
Packit |
5b08af |
, ("Ll", charPropUnicodeLl)
|
|
Packit |
5b08af |
, ("Lm", charPropUnicodeLm)
|
|
Packit |
5b08af |
, ("Lo", charPropUnicodeLo)
|
|
Packit |
5b08af |
, ("Lt", charPropUnicodeLt)
|
|
Packit |
5b08af |
, ("Lu", charPropUnicodeLu)
|
|
Packit |
5b08af |
, ("M", charPropUnicodeM )
|
|
Packit |
5b08af |
, ("Mc", charPropUnicodeMc)
|
|
Packit |
5b08af |
, ("Me", charPropUnicodeMe)
|
|
Packit |
5b08af |
, ("Mn", charPropUnicodeMn)
|
|
Packit |
5b08af |
, ("N", charPropUnicodeN )
|
|
Packit |
5b08af |
, ("Nd", charPropUnicodeNd)
|
|
Packit |
5b08af |
, ("Nl", charPropUnicodeNl)
|
|
Packit |
5b08af |
, ("No", charPropUnicodeNo)
|
|
Packit |
5b08af |
, ("P", charPropUnicodeP )
|
|
Packit |
5b08af |
, ("Pc", charPropUnicodePc)
|
|
Packit |
5b08af |
, ("Pd", charPropUnicodePd)
|
|
Packit |
5b08af |
, ("Pe", charPropUnicodePe)
|
|
Packit |
5b08af |
, ("Pf", charPropUnicodePf)
|
|
Packit |
5b08af |
, ("Pi", charPropUnicodePi)
|
|
Packit |
5b08af |
, ("Po", charPropUnicodePo)
|
|
Packit |
5b08af |
, ("Ps", charPropUnicodePs)
|
|
Packit |
5b08af |
, ("S", charPropUnicodeS )
|
|
Packit |
5b08af |
, ("Sc", charPropUnicodeSc)
|
|
Packit |
5b08af |
, ("Sk", charPropUnicodeSk)
|
|
Packit |
5b08af |
, ("Sm", charPropUnicodeSm)
|
|
Packit |
5b08af |
, ("So", charPropUnicodeSo)
|
|
Packit |
5b08af |
, ("Z", charPropUnicodeZ )
|
|
Packit |
5b08af |
, ("Zl", charPropUnicodeZl)
|
|
Packit |
5b08af |
, ("Zp", charPropUnicodeZp)
|
|
Packit |
5b08af |
, ("Zs", charPropUnicodeZs)
|
|
Packit |
5b08af |
]
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
isCategory' :: Parser String
|
|
Packit |
5b08af |
isCategory'
|
|
Packit |
5b08af |
= ( foldr1 (<|>) . map (uncurry prop) $
|
|
Packit |
5b08af |
[ ('L', "ultmo")
|
|
Packit |
5b08af |
, ('M', "nce")
|
|
Packit |
5b08af |
, ('N', "dlo")
|
|
Packit |
5b08af |
, ('P', "cdseifo")
|
|
Packit |
5b08af |
, ('Z', "slp")
|
|
Packit |
5b08af |
, ('S', "mcko")
|
|
Packit |
5b08af |
, ('C', "cfon")
|
|
Packit |
5b08af |
]
|
|
Packit |
5b08af |
) "illegal Unicode character property"
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
prop c1 cs2
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char c1
|
|
Packit |
5b08af |
s2 <- option ""
|
|
Packit |
5b08af |
( do
|
|
Packit |
5b08af |
c2 <- satisfy (`elem` cs2)
|
|
Packit |
5b08af |
return [c2] )
|
|
Packit |
5b08af |
return $ c1:s2
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
complEsc :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
complEsc
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char 'P'
|
|
Packit |
5b08af |
s <- between (char '{') (char '}') charProp
|
|
Packit |
5b08af |
return $ mkSym $ compCS s
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
charClassExpr :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s)
|
|
Packit |
5b08af |
charClassExpr multiCharEsc'
|
|
Packit |
5b08af |
= between (char '[') (char ']') charGroup
|
|
Packit |
5b08af |
where
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- charGroup :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
charGroup
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
r <- ( negCharGroup -- a ^ at beginning denotes negation, not start of posCharGroup
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
posCharGroup
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
s <- option (mkZero' "") -- charClassSub
|
|
Packit |
5b08af |
( do
|
|
Packit |
5b08af |
_ <- char '-'
|
|
Packit |
5b08af |
charClassExpr multiCharEsc'
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
return $ mkDiff r s
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- posCharGroup :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
posCharGroup
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
rs <- many1 (charRange <|> charClassEsc multiCharEsc')
|
|
Packit |
5b08af |
return $ foldr1 mkAlt rs
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- negCharGroup :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
negCharGroup
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char '^'
|
|
Packit |
5b08af |
r <- posCharGroup
|
|
Packit |
5b08af |
return $ mkDiff mkDot r
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
charRange :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
charRange
|
|
Packit |
5b08af |
= try seRange
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
xmlCharIncDash
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
seRange :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
seRange
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
c1 <- charOrEsc'
|
|
Packit |
5b08af |
_ <- char '-'
|
|
Packit |
5b08af |
c2 <- charOrEsc'
|
|
Packit |
5b08af |
return $ mkSymRng c1 c2
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
charOrEsc' :: Parser Char
|
|
Packit |
5b08af |
charOrEsc'
|
|
Packit |
5b08af |
= ( do
|
|
Packit |
5b08af |
_ <- char '\\'
|
|
Packit |
5b08af |
singleCharEsc'
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
satisfy (`notElem` "\\-[]")
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
xmlCharIncDash :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
xmlCharIncDash
|
|
Packit |
5b08af |
= try ( do -- dash is only allowed if not followed by a [, else charGroup differences do not parse correctly
|
|
Packit |
5b08af |
_ <- char '-'
|
|
Packit |
5b08af |
notFollowedBy (char '[')
|
|
Packit |
5b08af |
return $ mkSym1 '-'
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
<|>
|
|
Packit |
5b08af |
( do
|
|
Packit |
5b08af |
c <- satisfy (`notElem` "-\\[]")
|
|
Packit |
5b08af |
return $ mkSym1 c
|
|
Packit |
5b08af |
)
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
wildCardEsc :: StringLike s => Parser (GenRegex s)
|
|
Packit |
5b08af |
wildCardEsc
|
|
Packit |
5b08af |
= do
|
|
Packit |
5b08af |
_ <- char '.'
|
|
Packit |
5b08af |
return . mkSym . compCS $ stringCS "\n\r"
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
|
|
Packit |
5b08af |
-- ------------------------------------------------------------
|