From 5b08af37c28d81c8d075565c7e7a55e35142cc2a Mon Sep 17 00:00:00 2001 From: Packit Date: Sep 25 2020 14:11:25 +0000 Subject: ghc-hxt-regex-xmlschema-9.2.0.3 base --- diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..11b706d --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +The MIT License + +Copyright (c) 2005 Uwe Schmidt, Martin Schmidt, Torben Kuseler + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..cd7dc32 --- /dev/null +++ b/Setup.hs @@ -0,0 +1,3 @@ +#!/usr/bin/env runhaskell +import Distribution.Simple +main = defaultMain diff --git a/examples/RegexXMLSchema/Makefile b/examples/RegexXMLSchema/Makefile new file mode 100644 index 0000000..0204cbc --- /dev/null +++ b/examples/RegexXMLSchema/Makefile @@ -0,0 +1,128 @@ +# $Id: Makefile,v 1.9 2006/11/11 15:36:03 hxml Exp $ + +HXT_HOME = ../../.. 
+PKGFLAGS = +GHCFLAGS = -Wall -O2 +GHC = ghc $(GHCFLAGS) $(PKGFLAGS) + +DIST = $(HXT_HOME)/dist/examples/arrows +DIST_DIR = $(DIST)/RegexXMLSchema + +CNT = 3 + +ropts = +RTS -s -RTS + +prog = ./REtest +prog2 = ./Lines +prog3 = ./RElines +prog4 = ./Words +prog5 = ./REwords +prog0 = ./Copy + +progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog4) $(prog5) + +all : $(progs) + +$(prog) : $(prog).hs + $(GHC) --make -o $@ $< + +local : + $(GHC) --make -o $(prog) -fglasgow-exts -ignore-package hxt -i../../../src $(prog).hs + +$(prog2) : $(prog) + ln -f $(prog) $(prog2) + +$(prog3) : $(prog) + ln -f $(prog) $(prog3) + +$(prog4) : $(prog) + ln -f $(prog) $(prog4) + +$(prog5) : $(prog) + ln -f $(prog) $(prog5) + +$(prog0) : $(prog) + ln -f $(prog) $(prog0) + +# generate and read documents containing a binary tree +# with 2^i leaf nodes containing the numbers 1 to 2^i +# for i up to at least 22 (8M XML elements) output works fine +# for i up to 19 (1M XML elements) input works without swapping +# with i=20 swapping starts, but the program it still terminates +# the size of the XML file for i=20 is about 36Mb +# these tests have run on a box with 1Gb memory + +tests = 18 + +test : $(prog) + $(MAKE) genfiles tests="$(tests)" + $(MAKE) copy tests="$(tests)" + $(MAKE) lines tests="$(tests)" + $(MAKE) relines tests="$(tests)" + $(MAKE) words tests="$(tests)" + $(MAKE) rewords tests="$(tests)" + +perftest : $(prog) + $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" + +genfiles : + @for i in $(tests) ; \ + do \ + echo time $(prog) $(ropts) $$i ; \ + time $(prog) $(ropts) $$i ; \ + ls -l tree-*$$i.xml ; \ + echo ; \ + done + +copy : + @for i in $(tests) ; \ + do \ + echo time $(prog0) $(ropts) $$i ; \ + time $(prog0) $(ropts) $$i ; \ + ls -l tree-*$$i.xml.copy ; \ + echo ; \ + done + +lines : + @for i in $(tests) ; \ + do \ + echo time $(prog2) $(ropts) $$i ; \ + time $(prog2) $(ropts) $$i ; \ + ls -l tree-*$$i.xml.lines ; \ + echo ; \ + done + +relines : + @for i in $(tests) ; \ + do \ + echo time $(prog3) $(ropts) $$i ; \ + time $(prog3) $(ropts) $$i ; \ + ls -l tree-*$$i.xml.relines ; \ + echo ; \ + done + +words : + @for i in $(tests) ; \ + do \ + echo time $(prog4) $(ropts) $$i ; \ + time $(prog4) $(ropts) $$i ; \ + ls -l tree-*$$i.xml.words ; \ + echo ; \ + done + +rewords : + @for i in $(tests) ; \ + do \ + echo time $(prog5) $$i ; \ + time $(prog5) $(ropts) $$i ; \ + ls -l tree-*$$i.xml.rewords ; \ + echo ; \ + done + +dist : + [ -d $(DIST_DIR) ] || mkdir -p $(DIST_DIR) + cp Makefile REtest.hs $(DIST_DIR) + +clean : + rm -f $(progs) *.o *.hi *.xml *.xml.* + diff --git a/examples/RegexXMLSchema/REtest.hs b/examples/RegexXMLSchema/REtest.hs new file mode 100644 index 0000000..4c054a8 --- /dev/null +++ b/examples/RegexXMLSchema/REtest.hs @@ -0,0 +1,170 @@ +{-# LANGUAGE BangPatterns#-} + +-- ---------------------------------------- + +module Main(main) +where + +import Text.XML.HXT.Core +import Text.Regex.XMLSchema.Generic + +import Data.String.Unicode + ( unicodeToXmlEntity + ) + +import Control.Monad.State.Strict hiding (when) + +import Data.Maybe + +import System.IO -- import the IO and commandline option stuff +import System.Environment + +-- ---------------------------------------- + +main :: IO () +main + = do + p <- getProgName + al <- getArgs + let i = if null al + then 4 + else (read . head $ al)::Int + main' p i + where + main' p' = fromMaybe main1 . 
lookup (pn p') $ mpt + mpt = [ ("REtest", main1) + , ("Copy", main2 "copy" (:[])) + , ("Lines", main2 "lines" lines) + , ("RElines", main2 "relines" relines) + , ("Words", main2 "words" words) + , ("REwords", main2 "rewords" rewords) + ] + +-- ---------------------------------------- + +-- generate a document containing a binary tree of 2^i leafs (= 2^(i-1) XML elements) + +main1 :: Int -> IO () +main1 i + = runX (genDoc i (fn i)) + >> return () + +-- ---------------------------------------- + +-- read a document containing a binary tree of 2^i leafs + +main2 :: String -> (String -> [String]) -> Int -> IO () +main2 ext lines' i + = do + hPutStrLn stderr "start processing" + h <- openBinaryFile (fn i) ReadMode + c <- hGetContents h + let ls = lines' c + o <- openBinaryFile (fn i ++ "." ++ ext) WriteMode + mapM_ (hPutStrLn o) ls + hClose o + hClose h + hPutStrLn stderr "end processing" + +relines :: String -> [String] +relines = tokenize "[^\n\r]*" + +rewords :: String -> [String] +rewords = tokenize "[^ \t\n\r]+" + +-- ---------------------------------------- + +pn :: String -> String +pn = reverse . takeWhile (/= '/') . reverse + +fn :: Int -> String +fn = ("tree-" ++) . (++ ".xml") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . show + +-- ---------------------------------------- + +genDoc :: Int -> String -> IOSArrow b XmlTree +genDoc d out = constA (mkBTree d) + >>> + xpickleVal xpickle + >>> + indentDoc + >>> + putDoc out + +-- ---------------------------------------- + +type Counter a = State Int a + +incr :: Counter Int +incr = do + modify (+1) + get + +-- ---------------------------------------- + +data BTree = Leaf Int + | Fork BTree BTree + deriving (Show) + +instance XmlPickler BTree where + xpickle = xpAlt tag ps + where + tag (Leaf _ ) = 0 + tag (Fork _ _ ) = 1 + ps = [ xpWrap ( Leaf, \ (Leaf i) -> i) + ( xpElem "leaf" $ xpAttr "value" $ xpickle ) + + , xpWrap ( uncurry Fork, \ (Fork l r) -> (l, r)) + ( xpElem "fork" $ xpPair xpickle xpickle ) + ] + +-- ---------------------------------------- + +mkBTree :: Int -> BTree +mkBTree depth = evalState (mkT depth) 0 + +mkT :: Int -> Counter BTree +mkT 0 = do + i <- incr + return (Leaf i) +mkT n = do + l <- mkT (n-1) + r <- mkT (n-1) + return (Fork l r) + +-- ---------------------------------------- + +-- output is done with low level ops to write the +-- document i a lazy manner +-- adding an xml pi and encoding is done "by hand" +-- latin1 decoding is the identity, so please generate the +-- docs with latin1 encoding. 
Here ist done even with ASCCI +-- every none ASCII char is represented by a char ref (&nnn;) + +putDoc :: String -> IOStateArrow s XmlTree XmlTree +putDoc dst + = addXmlPi + >>> + addXmlPiEncoding isoLatin1 + >>> + xshow getChildren + >>> + arr unicodeToXmlEntity + >>> + arrIO (\ s -> hPutDocument (\h -> hPutStrLn h s)) + >>> + none + where + isStdout = null dst || dst == "-" + + hPutDocument :: (Handle -> IO()) -> IO() + hPutDocument action + | isStdout + = action stdout + | otherwise + = do + handle <- openBinaryFile dst WriteMode + action handle + hClose handle + +-- ---------------------------------------- diff --git a/examples/colorizeProgs/ColorizeSourceCode.hs b/examples/colorizeProgs/ColorizeSourceCode.hs new file mode 100644 index 0000000..371a425 --- /dev/null +++ b/examples/colorizeProgs/ColorizeSourceCode.hs @@ -0,0 +1,711 @@ +-- ------------------------------------------------------------ + +{- | + Module : ColorizeSourceCode + Copyright : Copyright (C) 2009 Uwe Schmidt + License : BSD3 + + Maintainer : Uwe Schmidt (uwe@fh-wedel.de) + Portability: portable + + Colorize Source Code + + Supports Java and Haskell + +-} + +-- ------------------------------------------------------------ + +module Main +where + +import Control.Arrow + +import Data.List + +import System.Environment +import System.IO -- import the IO and commandline option stuff +import System.Console.GetOpt +import System.Exit + +import Text.Regex.XMLSchema.Generic +import Text.XML.HXT.Core +import Text.XML.HXT.Parser.XhtmlEntities + + +-- ------------------------------------------------------------ + +data Process = P { inFilter :: String -> String + , tokenRE :: Regex + , markupRE :: Regex -> Regex + , formatToken :: (String, String) -> String + , formatDoc :: [String] -> String + , outFilter :: String -> String + , input :: Handle + , output :: Handle + , inputFile :: String + } + +defaultProcess :: Process +defaultProcess = P { inFilter = id + , tokenRE = plainRE + , markupRE = id + , formatToken = uncurry (++) + , formatDoc = unlines + , outFilter = id + , input = stdin + , output = stdout + , inputFile = " " + } + +-- ------------------------------------------------------------ + +main :: IO () +main = do + argv <- getArgs + p <- evalArgs (getOpt Permute options argv) + s <- hGetContents (input p) + hPutStr (output p) (process p s) + hFlush (output p) + hClose (output p) + exitWith ExitSuccess + +options :: [OptDescr (String, String)] +options = [ Option "h?" 
["help"] (NoArg ("help", "1")) "this message" + , Option "" ["plain"] (NoArg ("plain", "1")) "don't colorize lines" + , Option "" ["haskell"] (NoArg ("haskell", "1")) "colorize haskell" + , Option "" ["java"] (NoArg ("java", "1")) "colorize java" + , Option "" ["cpp"] (NoArg ("cpp", "1")) "colorize C or C++" + , Option "" ["sh"] (NoArg ("sh", "1")) "colorize sh or bash" + , Option "" ["ruby"] (NoArg ("ruby", "1")) "colorize ruby" + , Option "" ["bnf"] (NoArg ("bnf", "1")) "colorize extended BNF grammar rules" + , Option "" ["ppl"] (NoArg ("ppl", "1")) "colorize ppl" + , Option "" ["pplass"] (NoArg ("pplass", "1")) "colorize ppl assembler" + , Option "n" ["number"] (NoArg ("number", "1")) "with line numbers" + , Option "t" ["tabs"] (NoArg ("tabs", "1")) "substitute tabs by blanks" + , Option "m" ["markup"] (NoArg ("markup", "1")) "text contains embedded markup" + , Option "e" ["erefs"] (NoArg ("erefs", "1")) "resolve HTML entity refs before processing" + , Option "o" ["output"] (ReqArg ((,) "output") "FILE") "output file, \"-\" stands for stdout" + , Option "s" ["scan"] (NoArg ("scan", "1")) "just scan input, for testing" + , Option "x" ["html"] (NoArg ("html", "1")) "html output" + , Option "f" ["full"] (NoArg ("full", "1")) "full HTML document with header and css" + ] + +exitErr :: String -> IO a +exitErr msg = do + hPutStrLn stderr msg + usage + exitWith (ExitFailure 1) + +evalArgs :: ([(String, String)], [FilePath], [String]) -> IO Process +evalArgs (opts, files, errs) + | not (null errs) = exitErr ("illegal arguments " ++ show errs) + | null files = evalOpts opts defaultProcess + | not (null fns) = exitErr ("only one input file allowed") + | otherwise = do + inp <- openFile fn ReadMode + evalOpts opts (defaultProcess { input = inp + , inputFile = fn + } + ) + where + (fn:fns) = files + +evalOpts :: [(String, String)] -> Process -> IO Process +evalOpts [] res = return res +evalOpts (o:os) res = do + res' <- evalOpt o res + evalOpts os res' + +evalOpt :: (String, String) -> Process -> IO Process +evalOpt ("help","1") _ = do + usage + exitWith ExitSuccess + +evalOpt ("output", "-") p = return $ p {output = stdout} + +evalOpt ("output", fn) p = do + outp <- openFile fn WriteMode + return $ p {output = outp} + +evalOpt ("haskell","1") p = return $ p { tokenRE = haskellRE } +evalOpt ("java", "1") p = return $ p { tokenRE = javaRE } +evalOpt ("cpp", "1") p = return $ p { tokenRE = cppRE } +evalOpt ("sh", "1") p = return $ p { tokenRE = shRE } +evalOpt ("ruby", "1") p = return $ p { tokenRE = rubyRE } +evalOpt ("bnf", "1") p = return $ p { tokenRE = bnfRE } +evalOpt ("ppl", "1") p = return $ p { tokenRE = pplRE } +evalOpt ("pplass", "1") p = return $ p { tokenRE = pplassRE } +evalOpt ("plain", "1") p = return $ p { tokenRE = plainRE } +evalOpt ("scan", "1") p = return $ p { tokenRE = plainRE + , formatToken = uncurry formatTok + , formatDoc = formatHList } +evalOpt ("number", "1") p = return $ p { formatDoc = numberLines >>> formatDoc p } +evalOpt ("tabs", "1") p = return $ p { inFilter = inFilter p >>> substTabs } +evalOpt ("erefs", "1") p = return $ p { inFilter = resolveHtmlEntities >>> inFilter p } +evalOpt ("markup", "1") p = return $ p { markupRE = addMarkup } +evalOpt ("html", "1") p = return $ p { formatToken = formatHtmlTok + , formatDoc = formatHtmlDoc } +evalOpt ("full", "1") p = return $ p { outFilter = outFilter p >>> fullHtml (inputFile p) } +evalOpt (opt, _v ) p = exitErr ("illegal option " ++ show opt) >> return p + +usage :: IO () +usage = hPutStrLn stderr use + where + use = 
usageInfo header options + header = "colorizeSourceCode - colorize source code with HTML, version 0.1.1" + +-- ------------------------------------------------------------ + +process :: Process -> String -> String +process p = inFilter p + >>> tokenizeSubexRE (markupRE p (tokenRE p)) + >>> map (formatToken p) + >>> concat + >>> lines + >>> formatDoc p + >>> outFilter p + +addMarkup :: Regex -> Regex +addMarkup = mkElse (parseRegexExt . mkLE $ markupT) + +tokenizeLines :: String -> [(String, String)] +tokenizeLines = map (\ l -> ("",l ++ "\n")) . lines + +numberLines :: [String] -> [String] +numberLines = zipWith addNum [(1::Int)..] + where + addNum i l = "" ++ fmt 4 i ++ "" ++ l + fmt l = sed (const " ") " " + . reverse + . take l + . reverse + . (replicate l ' ' ++) + . show + +substTabs :: String -> String +substTabs = subs 0 + +subs :: Int -> String -> String +subs _ "" = "" +subs i (x:xs) + | x == '\t' = replicate (8 - (i `mod` 8)) ' ' ++ subs 0 xs + | x == '\n' = x : subs 0 xs + | otherwise = x : subs (i+1) xs + +-- ------------------------------------------------------------ + +resolveHtmlEntities :: String -> String +resolveHtmlEntities = sed (replaceEntity . drop 1 . init) "&\\i\\c*;" + where + replaceEntity e = maybe ("&" ++ e ++ ";") ((:[]) . toEnum) + . lookup e $ xhtmlEntities + +-- ------------------------------------------------------------ + +formatHList :: [String] -> String +formatHList = ("[" ++) . (++ "\n]") . intercalate "\n, " + +formatTok :: String -> String -> String +formatTok kw tok = " (" ++ show kw ++ ",\t" ++ show tok ++ "\t)\n" + +formatHtmlDoc :: [String] -> String +formatHtmlDoc = map (("
" ++) . (++ "
") . preserveEmptyLines) + >>> ("
" :) + >>> (++ ["
"]) + >>> unlines + where + preserveEmptyLines "" = " " + preserveEmptyLines l = l + +formatHtmlTok :: (String, String) -> String +formatHtmlTok ("markup", t@(x:_)) + | x `elem` "<&" = t +formatHtmlTok (m, t) + | otherwise = colorizeTokens m (escapeText >>> sed (const " ") " " $ t) + +escapeText :: String -> String +escapeText = foldr cquote "" + where + cquote = fst escapeHtmlRefs + +-- escapeText = concat . runLA (xshowEscapeXml mkText) + + +fullHtml :: String -> String -> String +fullHtml fn s = unlines + [ "" + , "" + , "" ++ fn ++ "" + , "" + , "" + , "" + , s + , "" + , "" + ] + +css :: String +css = unlines + [ ".typename { color: #0000dd; }" + , ".varname { color: #000000; }" + , ".opname { color: #770000; }" + , ".operator { color: #770000; /* font-weight:bold; */ }" + , ".keyglyph { color: #3070A0; /* font-weight:bold; */ }" + , ".par { }" + , "" + , ".keyword { color: #3070A0; /* font-weight:bold; */ }" + , ".typekeyword { color: #3070A0; /* font-weight:bold; */ }" + , ".strconst { color: #228B22; }" + , ".charconst { color: #228B22; }" + , ".labelname { color: #FF00FF; font-weight:bold; }" + , ".cppcommand { color: #0000CD; }" + , ".specialword { color: #c80000; }" + , ".classname { color: #8B2323; }" + , ".comment { color: #00008B; }" + , ".bnfnt { color: #0000CD; }" + , ".bnfmeta { color: #ff0000; font-weight:bold; }" + , ".bnfterminal { color: #008800; font-weight:bold; }" + , ".tclproc { color: #FF6000; }" + , ".tclvar { color: #0000CD; }" + , ".tclcomment { color: #c80000; }" + , "" + , ".linenr { color: #909090; padding-right: 2em; }" + , "div.codeline { font-family: monospace; width: 100%; white-space: pre; border-width: 1px; border-style: solid; border-color: transparent; padding-left: 0.3em; }" + , "div.codeline:hover { background-color:#ddddff; color:#c80000; border-width: 1px; border-style: solid; border-color: #c80000; }" + + ] + +-- ------------------------------------------------------------ + +colorizeTokens :: String -> String -> String +colorizeTokens tok + | tok `elem` [ "comment" + , "keyword" + , "keyglyph" + , "typekeyword" + , "varname", "typename", "labelname", "instancename", "globalname" + , "opname" + , "par" + , "operator" + , "strconst", "charconst" + , "bnfnt", "bnfmeta" + , "cppcommand" + , "specialword" + ] + = wrap + | tok == "longcomment" = wrap' "comment" . mlc + | tok == "bnfterminal" = wrap . drop 1 . init + -- | tok == "markupstart" = (("")) . drop 4 . init + -- | tok == "markupend" = const "" + | null tok = const "" + | otherwise = id + where + wrap = wrap' tok + wrap' tok' = (("") ++) . (++ "") + mlc = sed (("" ++) . (++ "")) "(\\n\r?)" + +-- ------------------------------------------------------------ + +buildRegex :: [(String, String)] -> Regex +buildRegex = foldr1 mkElse . map (uncurry mkBr') . 
map (second parseRegexExt) + where + mkBr' "" = id + mkBr' l = mkBr l + + +buildKeywords :: [String] -> String +buildKeywords = intercalate "|" + +untilRE :: String -> String +untilRE re = "(\\A{" ++ "\\}\\A" ++ re ++ "\\A)" ++ re + +mkLE :: (String, String) -> String +mkLE (l, re) = "({" ++ l ++ "}(" ++ re ++ "))" + +ws1RE, ws1RE',ws0RE :: String +ws1RE = "\\s+" +ws1RE' = "[ \t]+" +ws0RE = "[ \t]*" + +ws, ws', javacmt1, javacmt, shcmt1, strconst, + markupT, + charconst, number, + par, xxx :: (String, String) + +-- markupS = ("markupstart", "<[a-zA-Z0-9]+>" ) +-- markupE = ("markupend", "" ) +markupT = ("markup", ( "" + ++ "|" ++ + "&" ++ xname ++ ";" + ) + ) + where + xname = "[A-Za-z][-_:A-Za-z0-9]*" + xattr = ws1RE ++ xname ++ eq ++ "(" ++ dq ++ "|" ++ sq ++ ")" + eq = "\\s*=\\s*" + dq = "\"[^\"]*\"" + sq = "\'[^\']*\'" + +ws = ("ws", ws1RE ) +ws' = ("ws", ws1RE' ) +javacmt1 = ("comment", "//.*" ) +javacmt = ("longcomment", "/\\*" ++ untilRE "\\*/" ) +shcmt1 = ("comment", "#.*" ) +strconst = ("strconst", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) +charconst = ("charconst", "\'([^\'\\\\\n\r]|\\\\.)*\'" ) +number = ("number", "[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?" ) +par = ("par", "[\\(\\)\\[\\]\\{\\}]" ) +xxx = ("xxx", "." ) + +-- ------------------------------------------------------------ + +plainRE :: Regex +plainRE = buildRegex + [ ("xxx", "[^<&\n]+" ) + , ("xxx", "[<&\n]" ) + ] + +-- ------------------------------------------------------------ + +haskellRE :: Regex +haskellRE = buildRegex + [ ws + , ("comment", "(-)- .*" ) + , ("longcomment", "\\{" ++ untilRE "-\\}" ) + , ("keyword", buildKeywords + [ "case", "class" + , "data", "default", "deriving", "do" + , "else" + , "forall" + , "if", "import", "in" + , "infix", "infixl", "infixr" + , "instance" + , "let" + , "module" + , "newtype" + , "of" + , "qualified" + , "then", "type" + , "where" + , "_" + , "as", "ccall", "foreign", "hiding", "proc", "safe", "unsafe" + ] + ) + , ("keyglyph", buildKeywords + ["\\.\\.","::","=","\\\\","\\|","<-","->","-<","@","~","=>","!",",",";"] + ) + , ("varname" , varname ) + , ("typename", "[A-Z_][a-zA-Z0-9_]*[']*" ) + , ("opname", "`" ++ varname ++ "`" ) + , strconst + , charconst + , number + , par + , ("operator", "[-!#$%&\\*\\+./<=>\\?@\\\\^\\|~]+") + , xxx + ] + where + varname = "[a-z_][a-zA-Z0-9_]*[']*" + +-- ------------------------------------------------------------ + +javaRE :: Regex +javaRE = buildRegex + [ ws + , javacmt1 + , javacmt + , ("keyword", buildKeywords + [ "abstract", "assert" + , "break" + , "case", "catch", "class", "continue" + , "default", "do" + , "else", "extends" + , "final", "finally", "for" + , "if", "implements", "import", "instanceof", "interface" + , "native", "new" + , "package", "private", "protected", "public" + , "return" + , "static", "super", "switch", "synchronized" + , "this", "throw", "throws", "transient", "try" + , "volatile" + , "while" + ] ) + , ("typekeyword", + buildKeywords + [ "boolean", "byte" + , "char" + , "double" + , "false", "float" + , "int" + , "long" + , "null" + , "short" + , "true" + , "void" + ] ) + , ("labelname", "(" ++ varname ++ "{\\}default):" ) + , ("", ( mkLE ("keyword", "break|continue") + ++ mkLE ws ++ + mkLE ("labelname", varname) + ) + ) + , ("varname", varname ) + , ("typename", "[A-Z][a-zA-Z0-9_]*" ) + , strconst + , charconst + , number + , par + , ("delimiter", "[.,;]" ) + , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) + , xxx + ] + where + varname = "[a-z][a-zA-Z0-9_]*" + +-- ------------------------------------------------------------ + 
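-- A small, illustrative scanner (an addition for illustration, not part of
-- the original module): it shows how 'buildRegex' combines labelled patterns
-- like the tables above with "orElse" priority, and how 'tokenizeSubexRE'
-- then delivers (label, lexeme) pairs. The names 'miniRE' and 'scanMini'
-- are made up for this example only.

miniRE :: Regex
miniRE = buildRegex
         [ ("comment", "//.*"   )
         , ("number" , "[0-9]+" )
         , ("name"   , "[a-z]+" )
         , ("other"  , "."      )
         ]

scanMini :: String -> [(String, String)]
scanMini = tokenizeSubexRE miniRE

-- e.g. scanMini "x = 42 // done" yields, in this order,
-- ("name","x"), ("other"," "), ("other","="), ("other"," "),
-- ("number","42"), ("other"," "), ("comment","// done")

-- ------------------------------------------------------------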
+bnfRE :: Regex +bnfRE = buildRegex + [ ws + , ("bnfnt" , "[A-Z][a-zA-Z0-9_]*" ) + , ("bnfterminal", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) + , ("bnfmeta", buildKeywords + [ "\\[" + , "\\]" + , "::=" + , "\\|" + , "\\{" + , "\\}" + ] + ) + , xxx + ] + +-- ------------------------------------------------------------ + +cppRE :: Regex +cppRE = buildRegex + [ ws + , javacmt1 + , javacmt + , ("keyword", buildKeywords + [ "asm" , "auto" + , "break" + , "case" , "catch" , "class" , "const" , "continue" + , "default" , "delete" , "do" + , "else" , "extern" + , "for" , "friend" + , "goto" + , "if" , "inline" + , "new" + , "operator" , "overload" + , "private" , "protected" , "public" + , "register" , "return" + , "sizeof" , "static" , "switch" + , "template" , "this" , "typedef" , "throw" , "try" + , "virtual" , "volatile" + , "while" + ] + ) + , ("typekeyword", + buildKeywords + [ "char" + , "double" + , "enum" + , "float" + , "int" + , "long" + , "short" + , "signed" + , "struct" + , "union" + , "unsigned" + , "void" + ] + ) + , ("cppcommand", ( "#" ++ ws0RE ++ "(" + ++ + buildKeywords + [ "define" + , "else" + , "endif" + , "if" + , "ifdef" + , "ifndef" + , "(include[ \t].*)" + , "undef" + ] + ++ ")" + ) + ) + , ("specialword", buildKeywords + [ "assert" + , "exit" + , "free" + , "main" + , "malloc" + ] + ) + , ("varname", varname ) + , ("typename", "[A-Z][a-zA-Z0-9_]*" ) + , strconst + , charconst + , number + , par + , ("delimiter", "[.,;]" ) + , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) + , xxx + ] + where + varname = "[a-z][a-zA-Z0-9_]*" + +-- ------------------------------------------------------------ + +shRE :: Regex +shRE = buildRegex + [ ws + , shcmt1 + , ("keyword", buildKeywords + [ "alias" + , "break" , "bg" + , "case" , "cd" , "continue" + , "declare" , "do" , "done" + , "echo" , "elif" , "else" , "env" , "esac" , "eval" , "exec" , "exit" , "export" + , "false" , "fg" , "fi" , "for" , "function" + , "if" , "in" + , "jobs" + , "kill" + , "local" + , "pwd" + , "return" + , "set" , "shift" + , "test" , "then" , "trap" , "true" + , "unalias" , "unset" + , "while" , "wait" + ] + ) + , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) + , ("operator", "[-+!%&=\\\\\\*\\?~|<>:@$]+" ) + , ("operator", "[\\(\\)\\[\\]\\{\\}]+" ) + , strconst + , charconst + , xxx + ] + +-- ------------------------------------------------------------ + +rubyRE :: Regex +rubyRE = buildRegex + [ ws + , rubycmt + , ("keyword", buildKeywords + [ "begin" , "break" + , "catch" , "case" , "class" + , "def" , "do" + , "else" , "elif" , "end" , "ensure" + , "false" , "for" + , "if" , "in" , "include" , "initialize" + , "loop" + , "module" + , "new" , "nil" + , "raise" , "require" , "rescue" + , "self" + , "then" , "true" , "type" + , "until" + , "when" , "while" + , "yield" + ] + ) + , ("typename", "[A-Z][A-Za-z0-9]*" ) + , ("varname", "[A-Za-z_][a-zA-Z0-9_]*(!|\\?)?" 
) + , ("instancename", "(@{1,2}|$)[A-Za-z_][a-zA-Z0-9_]*" ) + , ("strconst", "%[qQx]\\{.*\\}" ) + , ("strconst", "#\\{.*\\}" ) + , ("strconst", ":[a-z][A-Za-z0-9]*" ) + , strconst + , charconst + , regex + , xxx + ] + where + rubycmt = ("comment", "#(.{\\}\\{)*" ) + regex = ("strconst", "/([^/\\\\\n\r]|\\\\.)*/" ) + +-- ------------------------------------------------------------ + +pplRE :: Regex +pplRE = buildRegex + [ ws + , ("comment", "(-)- .*" ) + , ("keyword", buildKeywords + [ "and" + , "begin" + , "div" , "do" + , "else" , "elseif" , "endif" , "endwhile" , "end" + , "function" + , "if" + , "max" , "min" , "mod" + , "not" + , "of" , "or" + , "procedure" + , "repeat" + , "return" + , "then" + , "until" + , "var" + , "while" + , "xor" + ] + ) + , ("typekeyword", buildKeywords + [ "boolean" + , "false" , "float" + , "int" + , "list" + , "picture" + , "string" + , "true" + ] + ) + , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) + , strconst + , number + , xxx + ] + +-- ------------------------------------------------------------ + +pplassRE :: Regex +pplassRE = buildRegex + [ ws + , ("comment", "(-)- .*" ) + , ("keyword", buildKeywords + [ "loadi" , "loadf" + , "loads" , "emptyl" + , "undef" , "load" + ] + ) + , ("typename", buildKeywords + [ "store" + , "pop" + ] + ) + , ("typekeyword", buildKeywords + [ "jmp" + , "brfalse" + , "brtrue" + , "pushj" + , "popj" + , "svc" + ] + ) + , ("labelname", "(l[0-9]+:?)|([se]?_[A-Za-z0-9]*:?)" ) + , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) + , strconst + , xxx + ] + +-- ------------------------------------------------------------ diff --git a/examples/colorizeProgs/Makefile b/examples/colorizeProgs/Makefile new file mode 100644 index 0000000..7eb5655 --- /dev/null +++ b/examples/colorizeProgs/Makefile @@ -0,0 +1,14 @@ +all : colorize + +install : all + sudo cp colorize /usr/local/bin + +clean : + cabal clean + +colorize: + cabal install + +.PHONY : all install clean colorize + + diff --git a/examples/performance/Makefile b/examples/performance/Makefile new file mode 100644 index 0000000..65c13b3 --- /dev/null +++ b/examples/performance/Makefile @@ -0,0 +1,145 @@ +PKGFLAGS = +GHCFLAGS = -Wall -O2 +GHC = ghc $(GHCFLAGS) $(PKGFLAGS) + +CNT = 3 + +ropts = +RTS -s -RTS + +prog = ./REtest +prog2 = ./Lines +prog3 = ./RElines +prog3a = ./SElines +prog4 = ./Words +prog5 = ./REwords +prog5a = ./SEwords +prog0 = ./Copy + +progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog3a) $(prog4) $(prog5) $(prog5a) + +all : $(progs) + +$(prog) : $(prog).hs + $(GHC) --make -o $@ $< + +local : + $(GHC) --make -o $(prog) -fglasgow-exts $(prog).hs + +$(prog2) : $(prog) + ln -f $(prog) $(prog2) + +$(prog3) : $(prog) + ln -f $(prog) $(prog3) + +$(prog3a) : $(prog) + ln -f $(prog) $(prog3a) + +$(prog4) : $(prog) + ln -f $(prog) $(prog4) + +$(prog5) : $(prog) + ln -f $(prog) $(prog5) + +$(prog5a) : $(prog) + ln -f $(prog) $(prog5a) + +$(prog0) : $(prog) + ln -f $(prog) $(prog0) + +# generate and read documents containing text +# with 2^i characters + +tests = 25 + +test : $(prog) + $(MAKE) genfiles tests="$(tests)" + $(MAKE) copy tests="$(tests)" + $(MAKE) lines tests="$(tests)" + $(MAKE) relines tests="$(tests)" + $(MAKE) selines tests="$(tests)" + $(MAKE) words tests="$(tests)" + $(MAKE) rewords tests="$(tests)" + $(MAKE) sewords tests="$(tests)" + +perftest : $(prog) + $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" + +genfiles : + @for i in $(tests) ; \ + do \ + echo time $(prog) $(ropts) $$i ; \ + time $(prog) $(ropts) $$i ; \ + ls -l lines-*$$i.txt ; \ + echo ; \ + done + 
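# example invocation (illustrative, not part of the original Makefile):
# build all test programs, then run the whole suite for a single input
# size of 2^20 characters
#
#	make all
#	make test tests="20"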
+copy : + @for i in $(tests) ; \ + do \ + echo time $(prog0) $(ropts) $$i ; \ + time $(prog0) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.copy ; \ + echo ; \ + done + +lines : + @for i in $(tests) ; \ + do \ + echo time $(prog2) $(ropts) $$i ; \ + time $(prog2) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.lines ; \ + echo ; \ + done + +relines : + @for i in $(tests) ; \ + do \ + echo time $(prog3) $(ropts) $$i ; \ + time $(prog3) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.relines ; \ + echo ; \ + done + +selines : + @for i in $(tests) ; \ + do \ + echo time $(prog3a) $(ropts) $$i ; \ + time $(prog3a) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.selines ; \ + echo ; \ + done + +words : + @for i in $(tests) ; \ + do \ + echo time $(prog4) $(ropts) $$i ; \ + time $(prog4) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.words ; \ + echo ; \ + done + +rewords : + @for i in $(tests) ; \ + do \ + echo time $(prog5) $$i ; \ + time $(prog5) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.rewords ; \ + echo ; \ + done + +sewords : + @for i in $(tests) ; \ + do \ + echo time $(prog5a) $$i ; \ + time $(prog5a) $(ropts) $$i ; \ + ls -l lines-*$$i.txt.sewords ; \ + echo ; \ + done + +distclean : clean + +clean : + rm -f $(progs) *.o *.hi *.txt *.txt.* + +.PHONY : clean distclean test perftest all local words rewords sewords lines relines selines copy genfiles + diff --git a/examples/performance/REtest.hs b/examples/performance/REtest.hs new file mode 100644 index 0000000..4536442 --- /dev/null +++ b/examples/performance/REtest.hs @@ -0,0 +1,96 @@ +{-# LANGUAGE BangPatterns#-} + +-- ---------------------------------------- + +module Main(main) +where + +import Text.Regex.XMLSchema.Generic + +import Control.Arrow + +import Data.Maybe + +import System.IO -- import the IO and commandline option stuff +import System.Environment + +-- ---------------------------------------- + +main :: IO () +main + = do + p <- getProgName + al <- getArgs + let i = if null al + then 4 + else (read . head $ al)::Int + main' p i + where + main' p' = fromMaybe main1 . lookup (pn p') $ mpt + mpt = [ ("REtest", main1) + , ("Copy", main2 "copy" (:[])) + , ("Lines", main2 "lines" lines) + , ("RElines", main2 "relines" relines) + , ("SElines", main2 "selines'" relines') + , ("Words", main2 "words" words) + , ("REwords", main2 "rewords" rewords) + , ("SEwords", main2 "sewords" rewords') + ] + +-- ---------------------------------------- + +-- generate a document containing a binary tree of 2^i leafs (= 2^(i-1) XML elements) + +main1 :: Int -> IO () +main1 i + = do + genDoc i "REtest.hs" (fn i) + return () + +-- ---------------------------------------- + +-- read a document containing a binary tree of 2^i leafs + +main2 :: String -> (String -> [String]) -> Int -> IO () +main2 ext lines' i + = do + hPutStrLn stderr "start processing" + h <- openFile (fn i) ReadMode + c <- hGetContents h + let ls = lines' c + o <- openFile (fn i ++ "." ++ ext) WriteMode + mapM_ (hPutStrLn o) ls + hClose o + hClose h + hPutStrLn stderr "end processing" + +relines :: String -> [String] +relines = tokenize ".*" + +relines' :: String -> [String] +relines' = tokenizeSubex "({line}.*)" >>> map snd + +rewords :: String -> [String] +rewords = tokenize "\\S+" + +rewords' :: String -> [String] +rewords' = tokenizeSubex "({word}\\S+)" >>> map snd + +-- ---------------------------------------- + +pn :: String -> String +pn = reverse . takeWhile (/= '/') . reverse + +fn :: Int -> String +fn = ("lines-" ++) . (++ ".txt") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . 
show + +-- ---------------------------------------- + +genDoc :: Int -> String -> String -> IO () +genDoc d inp outp + = do + s <- readFile inp + let s' = take (2^d) . concat . repeat $ s + writeFile outp s' + +-- ---------------------------------------- diff --git a/hxt-regex-xmlschema.cabal b/hxt-regex-xmlschema.cabal new file mode 100644 index 0000000..979a520 --- /dev/null +++ b/hxt-regex-xmlschema.cabal @@ -0,0 +1,115 @@ +Name: hxt-regex-xmlschema +Version: 9.2.0.3 +Synopsis: A regular expression library for W3C XML Schema regular expressions +Description: This library supports full W3C XML Schema regular expressions + inclusive all Unicode character sets and blocks. + The complete grammar can be found under . + It is implemented by the technique of derivations of regular expressions. + . + The W3C syntax is extended to support not only union of regular sets, + but also intersection, set difference, exor. + Matching of subexpressions is also supported. + . + The library can be used for constricting lightweight scanners and tokenizers. + It is a standalone library, no external regex libraries are used. + . + Extensions in 9.2: The library does nor only support String's, but also + ByteString's and Text in strict and lazy variants +License: MIT +License-file: LICENSE +Author: Uwe Schmidt +Maintainer: Uwe Schmidt +Copyright: Copyright (c) 2010- Uwe Schmidt +Stability: stable +Category: Text +Homepage: http://www.haskell.org/haskellwiki/Regular_expressions_for_XML_Schema +Build-type: Simple +Cabal-version: >=1.10 + +extra-source-files: + examples/colorizeProgs/ColorizeSourceCode.hs + examples/colorizeProgs/Makefile + examples/performance/REtest.hs + examples/performance/Makefile + examples/RegexXMLSchema/Makefile + examples/RegexXMLSchema/REtest.hs + +flag profile + description: turn profiling on + default: False + +Library + Exposed-modules: + Text.Regex.Glob.String + Text.Regex.Glob.Generic + Text.Regex.Glob.Generic.RegexParser + Text.Regex.XMLSchema.String + Text.Regex.XMLSchema.Generic + Text.Regex.XMLSchema.Generic.Matching + Text.Regex.XMLSchema.Generic.RegexParser + Text.Regex.XMLSchema.Generic.Regex + Text.Regex.XMLSchema.Generic.StringLike + + hs-source-dirs: src + + default-language: Haskell2010 + + ghc-options: -Wall -fwarn-tabs -funbox-strict-fields + + if flag(profile) + ghc-prof-options: -caf-all + + build-depends: base >= 4 && < 5 + , bytestring >= 0.10 + , hxt-charproperties >= 9 && < 10 + , parsec >= 2.1 && < 4 + , text >= 0.10 + +test-suite SimpleMatch + hs-source-dirs: test + main-is: SimpleTest.hs + type: exitcode-stdio-1.0 + default-language: Haskell2010 + ghc-options: -Wall + + build-depends: base + , bytestring + , hxt-regex-xmlschema + , text + , HUnit + +test-suite Date + hs-source-dirs: test + main-is: Date.hs + type: exitcode-stdio-1.0 + default-language: Haskell2010 + ghc-options: -Wall + + build-depends: base + , bytestring + , hxt-regex-xmlschema + , parsec >= 2 + , text + , HUnit + +Benchmark Benchmark + hs-source-dirs: test + main-is: Benchmark.hs + type: exitcode-stdio-1.0 + + default-language: Haskell2010 + + ghc-options: -Wall -O2 + + build-depends: base + , bytestring + , criterion >= 1 + , deepseq >= 1.2 + , hxt-regex-xmlschema + , parsec >= 2 + , text + +Source-Repository head + Type: git + Location: git://github.com/UweSchmidt/hxt.git + diff --git a/src/Text/Regex/Glob/Generic.hs b/src/Text/Regex/Glob/Generic.hs new file mode 100644 index 0000000..d4d483a --- /dev/null +++ b/src/Text/Regex/Glob/Generic.hs @@ -0,0 +1,51 @@ +-- 
------------------------------------------------------------ + +{- | + Module : Text.Regex.Glob.String + Copyright : Copyright (C) 2011- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + csh glob style pattern matcher +-} + +-- ------------------------------------------------------------ + +module Text.Regex.Glob.Generic + ( GenRegex + , Regex + , RegexText + , RegexTextLazy + , RegexByteString + , RegexByteStringLazy + , match + , matchNoCase + , parseRegex + , parseRegexNoCase + ) +where + +import Text.Regex.Glob.Generic.RegexParser (parseRegex, parseRegexNoCase) +import Text.Regex.XMLSchema.Generic.Regex (matchWithRegex) +import Text.Regex.XMLSchema.Generic.StringLike +import Text.Regex.XMLSchema.Generic (GenRegex, + Regex, + RegexText, + RegexTextLazy, + RegexByteString, + RegexByteStringLazy + ) + + +-- ------------------------------------------------------------ + +match :: StringLike s => s -> s -> Bool +match = matchWithRegex . parseRegex + +matchNoCase :: StringLike s => s -> s -> Bool +matchNoCase = matchWithRegex . parseRegexNoCase + +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/Glob/Generic/RegexParser.hs b/src/Text/Regex/Glob/Generic/RegexParser.hs new file mode 100644 index 0000000..8489507 --- /dev/null +++ b/src/Text/Regex/Glob/Generic/RegexParser.hs @@ -0,0 +1,111 @@ +{-# LANGUAGE FlexibleContexts #-} + +-- ------------------------------------------------------------ + +{- | + Copyright : Copyright (C) 2014- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt (uwe@fh-wedel.de) + Stability : stable + Portability: portable + + csh style Glob Pattern Parser for Regular Expressions +-} + +-- ------------------------------------------------------------ + +module Text.Regex.Glob.Generic.RegexParser + ( parseRegex + , parseRegexNoCase + ) +where + +import Data.Char (isLower, isUpper, + toLower, toUpper) + +import Text.ParserCombinators.Parsec +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.StringLike + +-- ------------------------------------------------------------ + +-- | parse a glob pattern + +parseRegex :: StringLike s => s -> GenRegex s +parseRegex + = parseRegex' mkSymRng . toString + +parseRegexNoCase :: StringLike s => s -> GenRegex s +parseRegexNoCase + = parseRegex' mkNoCaseSymRng . toString + +parseRegex' :: StringLike s => (Char -> Char -> GenRegex s) -> String -> GenRegex s +parseRegex' mkS + = either (mkZero' . ("syntax error: " ++) . show) id + . + parse ( do + r <- pattern mkS + eof + return r + ) "" + +-- ------------------------------------------------------------ + +pattern :: StringLike s => (Char -> Char -> GenRegex s) -> Parser (GenRegex s) +pattern mkS + = many part >>= return . mkSeqs + where + -- part :: Parser (GenRegex s) + part + = ( many1 (noneOf "\\?*[{") >>= return . mkWord' ) + <|> + ( char '?' >> return mkDot ) + <|> + ( char '*' >> return mkAll ) + <|> + ( between (char '{') (char '}') wordList ) + <|> + ( between (char '[') (char ']') charSet ) + <|> + ( do c <- char '\\' >> anyChar + return $ mkS c c + ) + mkWord' + = mkSeqs . map (\ c -> mkS c c) + + -- wordList :: Parser (GenRegex s) + wordList + = sepBy (many1 (noneOf ",}")) (char ',') + >>= return . foldr mkAlt (mkZero' "") . 
map mkWord' + + -- charSet :: Parser (GenRegex s) + charSet + = ( do p1 <- charSet' anyChar + ps <- many $ charSet' (noneOf "]") + return $ foldr mkAlt (mkZero' "") (p1 : ps) + ) + where + charSet' cp + = do c1 <- cp + c2 <- rest c1 + return $ mkS c1 c2 + rest c1 + = option c1 (char '-' >> anyChar) + +-- ------------------------------------------------------------ + +mkNoCaseSymRng :: StringLike s => Char -> Char -> GenRegex s +mkNoCaseSymRng c1 c2 + | isLower c1 + && + isLower c2 + = mkAlt (mkSymRng (toUpper c1) (toUpper c2)) (mkSymRng c1 c2) + | isUpper c1 + && + isUpper c2 + = mkAlt (mkSymRng (toLower c1) (toLower c2)) (mkSymRng c1 c2) + | otherwise + = mkSymRng c1 c2 + +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/Glob/String.hs b/src/Text/Regex/Glob/String.hs new file mode 100644 index 0000000..40fcf7b --- /dev/null +++ b/src/Text/Regex/Glob/String.hs @@ -0,0 +1,44 @@ +-- ------------------------------------------------------------ + +{- | + Module : Text.Regex.Glob.String + Copyright : Copyright (C) 2011- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + csh glob style pattern matcher +-} + +-- ------------------------------------------------------------ + +module Text.Regex.Glob.String + {-# DEPRECATED "use the more general 'Text.Regex.Glob.Generic' instead" #-} + ( Regex + , match + , matchNoCase + , parseRegex + , parseRegexNoCase + ) +where + +import Text.Regex.Glob.Generic (Regex) +import qualified Text.Regex.Glob.Generic as G + +-- ------------------------------------------------------------ + +match :: String -> String -> Bool +match = G.match + +matchNoCase :: String -> String -> Bool +matchNoCase = G.matchNoCase + +parseRegex :: String -> Regex +parseRegex = G.parseRegex + +parseRegexNoCase :: String -> Regex +parseRegexNoCase = G.parseRegexNoCase + +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/XMLSchema/Generic.hs b/src/Text/Regex/XMLSchema/Generic.hs new file mode 100644 index 0000000..cc5141d --- /dev/null +++ b/src/Text/Regex/XMLSchema/Generic.hs @@ -0,0 +1,106 @@ +-- ------------------------------------------------------------ + +{- | + Module : Text.Regex.XMLSchema.Generic + Copyright : Copyright (C) 2014- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + Convenient functions for W3C XML Schema Regular Expression Matcher. 
+ For internals see 'Text.Regex.XMLSchema.Generic.Regex' and + 'Text.Regex.XMLSchema.Generic.Matching' + + Grammar can be found under +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.Generic + ( GenRegex + , Regex + , RegexText + , RegexTextLazy + , RegexByteString + , RegexByteStringLazy + + , grep + , grepExt + , grepRE + , grepREwithLineNum + + , match + , matchExt + , matchSubex + + , sed + , sedExt + + , split + , splitExt + , splitSubex + + , tokenize + , tokenizeExt + , tokenize' + , tokenizeExt' + , tokenizeSubex + + , matchRE + , matchSubexRE + , sedRE + , splitRE + , splitSubexRE + , tokenizeRE + , tokenizeRE' + , tokenizeSubexRE + + , mkZero + , mkZero' + , mkUnit + , mkSym1 + , mkSymRng + , mkWord + , mkDot + , mkStar + , mkAll + , mkAlt + , mkElse + , mkSeq + , mkSeqs + , mkRep + , mkRng + , mkOpt + , mkDiff + , mkIsect + , mkExor + , mkCompl + , mkBr + , mkBr' + , isZero + , errRegex + + , parseRegex + , parseRegexExt + , parseContextRegex + ) +where + +import Text.Regex.XMLSchema.Generic.Matching +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.RegexParser + +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy.Char8 as BL + +type Regex = GenRegex String +type RegexText = GenRegex T.Text +type RegexTextLazy = GenRegex TL.Text +type RegexByteString = GenRegex B.ByteString +type RegexByteStringLazy = GenRegex BL.ByteString + +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/XMLSchema/Generic/Matching.hs b/src/Text/Regex/XMLSchema/Generic/Matching.hs new file mode 100644 index 0000000..d8d6931 --- /dev/null +++ b/src/Text/Regex/XMLSchema/Generic/Matching.hs @@ -0,0 +1,454 @@ +{-# LANGUAGE BangPatterns #-} + +-- ------------------------------------------------------------ + +{- | + Module : Text.Regex.XMLSchema.Generic + Copyright : Copyright (C) 2014- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + Convenient functions for W3C XML Schema Regular Expression Matcher. + For internals see 'Text.Regex.XMLSchema.Regex' + + Grammar can be found under + +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.Generic.Matching + ( grep + , grepExt + , grepRE + , grepREwithLineNum + + , match + , matchExt + , matchSubex + + , sed + , sedExt + + , split + , splitExt + , splitSubex + + , tokenize + , tokenizeExt + , tokenize' + , tokenizeExt' + , tokenizeSubex + + , matchRE + , matchSubexRE + , sedRE + , splitRE + , splitSubexRE + , tokenizeRE + , tokenizeRE' + , tokenizeSubexRE + ) +where + +import Control.Arrow + +import Data.Maybe + +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.RegexParser +import Text.Regex.XMLSchema.Generic.StringLike + +{- +import Debug.Trace (traceShow) + +trc :: Show a => String -> a -> a +trc msg x = traceShow (msg, x) x + +-- -} +-- ------------------------------------------------------------ + +-- | split a string by taking the longest prefix matching a regular expression +-- +-- @Nothing@ is returned in case there is no matching prefix, +-- else the pair of prefix and rest is returned + +splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s) +splitRE re input + = do + (sms, rest) <- splitWithRegex re input + return (snd . 
head $ sms, rest) + +-- | convenient function for 'splitRE' +-- +-- examples: +-- +-- > split "a*b" "abc" = ("ab","c") +-- > split "a*" "bc" = ("", "bc") -- "a*" matches "" +-- > split "a+" "bc" = ("", "bc") -- "a+" does not match, no split +-- > split "[" "abc" = ("", "abc") -- "[" syntax error, no split + +split :: StringLike s => s -> s -> (s, s) +split = split' parseRegex + +-- | split with extended syntax + +splitExt :: StringLike s => s -> s -> (s, s) +splitExt = split' parseRegexExt + +split' :: StringLike s => (s -> GenRegex s) -> s -> s -> (s, s) +split' parseRe re input + = fromMaybe (emptyS, input) + . (splitRE . parseRe $ re) $ input + +-- ------------------------------------------------------------ + +-- | split a string by removing the longest prefix matching a regular expression +-- and then return the list of subexpressions found in the matching part +-- +-- @Nothing@ is returned in case of no matching prefix, +-- else the list of pairs of labels and submatches and the +-- rest is returned + +splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s) +splitSubexRE re input + = do + (sms, rest) <- splitWithRegex re input + return (map (first fromJust) . drop 1 $ sms, rest) + +-- | convenient function for 'splitSubex', uses extended syntax +-- +-- examples: +-- +-- > splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") +-- > splitSubex "({2}a*)" "bc" = ([("2","")], "bc") +-- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times +-- > +-- > splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") +-- > -- nondeterminism: 3 matches for a* +-- > +-- > splitSubex "({1}do)|({2}[a-z]+)" "do you know" +-- > = ([("1","do"),("2","do")]," you know") +-- > -- nondeterminism: 2 matches for do +-- > +-- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know" +-- > = ([("1","do")]," you know") +-- > -- no nondeterminism with {|}: 1. match for do +-- > +-- > splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match +-- > splitSubex "[" "abc" = ([], "abc") -- syntax error + + +splitSubex :: StringLike s => s -> s -> ([(s, s)], s) +splitSubex re inp + = fromMaybe ([], inp) . (splitSubexRE . parseRegexExt $ re) $ inp + +-- ------------------------------------------------------------ + +-- | The function, that does the real work for 'tokenize' + +tokenizeRE :: StringLike s => GenRegex s -> s -> [s] +tokenizeRE re + = token'' + where + fcs = firstChars re + re1 = mkDiff re mkUnit + token'' = token' re fcs + token1'' = token' re1 fcs + + -- token' :: StringLike s => GenRegex s -> CharSet -> s -> [s] + token' re' fcs' inp + | nullS inp = [] + | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp + where + evalRes Nothing + = token'' (dropS 1 inp) -- re does not match any prefix + + evalRes (Just (toks, rest)) + | nullS tok = tok : token'' (dropS 1 rest) -- re is nullable and only the empty prefix matches + -- discard one char and try again + | otherwise = tok : token1'' rest -- real token found, next token must not be empty + where + tok = snd . head $ toks + +-- | split a string into tokens (words) by giving a regular expression +-- which all tokens must match. +-- +-- Convenient function for 'tokenizeRE' +-- +-- This can be used for simple tokenizers. +-- It is recommended to use regular expressions where the empty word does not match. +-- Else there will appear a lot of probably useless empty tokens in the output. +-- All none matching chars are discarded. 
If the given regex contains syntax errors, +-- @Nothing@ is returned +-- +-- examples: +-- +-- > tokenize "a" "aabba" = ["a","a","a"] +-- > tokenize "a*" "aaaba" = ["aaa","a"] +-- > tokenize "a*" "bbb" = ["","",""] +-- > tokenize "a+" "bbb" = [] +-- > +-- > tokenize "a*b" "" = [] +-- > tokenize "a*b" "abc" = ["ab"] +-- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"] +-- > +-- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" +-- > = ["ab","123","456.7","abc"] +-- > +-- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" +-- > = ["cab","123","456.7","abc"] +-- > +-- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" +-- > = ["abc","def","xyz"] +-- > +-- > tokenize ".*" "\nabc\n123\n\nxyz\n" +-- > = ["","abc","123","","xyz"] +-- > +-- > tokenize ".*" = lines +-- > +-- > tokenize "[^ \t\n\r]*" = words + +tokenize :: StringLike s => s -> s -> [s] +tokenize = tokenizeRE . parseRegex + +-- | tokenize with extended syntax + +tokenizeExt :: StringLike s => s -> s -> [s] +tokenizeExt = tokenizeRE . parseRegexExt + +-- ------------------------------------------------------------ + +-- | split a string into tokens and delimierter by giving a regular expression +-- which all tokens must match +-- +-- This is a generalisation of the above 'tokenizeRE' functions. +-- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@ +-- +-- If the regular expression contains syntax errors @Nothing@ is returned +-- +-- The following Law holds: +-- +-- > concat . map (either id id) . tokenizeRE' re == id + +tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s] +tokenizeRE' re inp0 + = token'' (inp0, 0) inp0 + where + fcs = firstChars re + re1 = mkDiff re mkUnit + token'' = token' re fcs + token1'' = token' re1 fcs + + -- token' :: StringLike s => GenRegex s -> CharSet -> (s, Int) -> s -> [Either s s] + token' re' fcs' (uns, ! n) inp + | nullS inp = addUnmatched [] + | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp + where + addUnmatched + | n == 0 = id + | otherwise = ((Left $ takeS n uns) :) + + addMatched t + = addUnmatched . ((Right t) :) + + evalRes Nothing + = token'' (uns, n + 1) (dropS 1 inp) -- re does not match any prefix + + evalRes (Just (toks, rest)) + | nullS tok = addMatched tok -- re is nullable and only the empty prefix matches + $ token'' (rest, 1) + (dropS 1 rest) -- discard one char and try again + + | otherwise = addMatched tok + $ token1'' (rest, 0) rest -- real token found, next token must not be empty + where + tok = snd . head $ toks + +-- | convenient function for 'tokenizeRE'' +-- +-- When the regular expression parses as Zero, @[Left input]@ is returned, that means no tokens are found + +tokenize' :: StringLike s => s -> s -> [Either s s] +tokenize' = tokenizeRE' . parseRegex + +tokenizeExt' :: StringLike s => s -> s -> [Either s s] +tokenizeExt' = tokenizeRE' . parseRegexExt + +-- ------------------------------------------------------------ + +-- | split a string into tokens (pair of labels and words) by giving a regular expression +-- containing labeled subexpressions. +-- +-- This function should not be called with regular expressions +-- without any labeled subexpressions. This does not make sense, because the result list +-- will always be empty. +-- +-- Result is the list of matching subexpressions +-- This can be used for simple tokenizers. +-- At least one char is consumed by parsing a token. +-- The pairs in the result list contain the matching substrings. +-- All none matching chars are discarded. 
If the given regex contains syntax errors, +-- @Nothing@ is returned + +tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] +tokenizeSubexRE re + = token'' + where + fcs = firstChars re + re1 = mkDiff re mkUnit + token'' = token' re fcs + token1'' = token' re1 fcs + + -- token' :: StringLike s => GenRegex s -> CharSet -> s -> [(s, s)] + token' re' fcs' inp + | nullS inp = [] + | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp + where + evalRes Nothing + = token'' (dropS 1 inp) -- re does not match any prefix + + evalRes (Just (toks, rest)) + | nullS tok = res ++ token'' (dropS 1 rest) -- re is nullable and only the empty prefix matches + | otherwise = res ++ token1'' rest -- token found, tokenize the rest + where + res = map (first fromJust) . tail $ toks + tok = snd . head $ toks + +-- | convenient function for 'tokenizeSubexRE' a string +-- +-- examples: +-- +-- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" +-- > "cab123 456.7abc" +-- > = [("name","cab") +-- > ,("num","123") +-- > ,("real","456.7") +-- > ,("name","abc")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" +-- > "12.34" = [("real","12.34") +-- > ,("n","12") +-- > ,("f","34")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" +-- > "12 34" = [("real","12"),("n","12") +-- > ,("real","34"),("n","34")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" +-- > "12 34.56" = [("real","12"),("n","12"),("f","") +-- > ,("real","34.56"),("n","34"),("f","56")] + +tokenizeSubex :: StringLike s => s -> s -> [(s, s)] +tokenizeSubex = tokenizeSubexRE . parseRegexExt + +-- ------------------------------------------------------------ + +-- | sed like editing function +-- +-- All matching tokens are edited by the 1. argument, the editing function, +-- all other chars remain as they are + +sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s +sedRE edit re = concatS . map (either id edit) . tokenizeRE' re + +-- | convenient function for 'sedRE' +-- +-- examples: +-- +-- > sed (const "b") "a" "xaxax" = "xbxbx" +-- > sed (\ x -> x ++ x) "a" "xax" = "xaax" +-- > sed undefined "[" "xxx" = "xxx" + +sed :: StringLike s => (s -> s) -> s -> s -> s +sed edit = sedRE edit . parseRegex + +sedExt :: StringLike s => (s -> s) -> s -> s -> s +sedExt edit = sedRE edit . parseRegexExt + +-- ------------------------------------------------------------ + +-- | match a string with a regular expression + +matchRE :: StringLike s => GenRegex s -> s -> Bool +matchRE = matchWithRegex + +-- | convenient function for 'matchRE' +-- +-- Examples: +-- +-- > match "x*" "xxx" = True +-- > match "x" "xxx" = False +-- > match "[" "xxx" = False + +match :: StringLike s => s -> s -> Bool +match = matchWithRegex . parseRegex + +-- | match with extended regular expressions + +matchExt :: StringLike s => s -> s -> Bool +matchExt = matchWithRegex . parseRegexExt + +-- ------------------------------------------------------------ + +-- | match a string with a regular expression +-- and extract subexpression matches + +matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] +matchSubexRE re = map (first fromJust) . fromMaybe [] . 
matchWithRegex' re + +-- | convenient function for 'matchRE' +-- +-- Examples: +-- +-- > matchSubex "({1}x*)" "xxx" = [("1","xxx")] +-- > matchSubex "({1}x*)" "y" = [] +-- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] +-- > matchSubex "[" "xxx" = [] + +matchSubex :: StringLike s => s -> s -> [(s, s)] +matchSubex = matchSubexRE . parseRegexExt + +-- ------------------------------------------------------------ + +-- | grep like filter for lists of strings +-- +-- The regular expression may be prefixed with the usual context spec \"^\" for start of string, +-- and "\\<" for start of word. +-- and suffixed with \"$\" for end of text and "\\>" end of word. +-- Word chars are defined by the multi char escape sequence "\\w" +-- +-- Examples +-- +-- > grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"] +-- > grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"] +-- > grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"] +-- > grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"] +-- > grep "\\ ["x a b", " ax "] +-- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "] + +grep :: StringLike s => s -> [s] -> [s] +grep = grep' parseRegex' + +-- | grep with extended regular expressions + +grepExt :: StringLike s => s -> [s] -> [s] +grepExt = grep' parseRegexExt' + +grep' :: StringLike s => (String -> GenRegex s) -> s -> [s] -> [s] +grep' parseRe = grepRE . parseContextRegex parseRe + +-- | grep with already prepared Regex (ususally with 'parseContextRegex') + +grepRE :: StringLike s => GenRegex s-> [s] -> [s] +grepRE re = filter (matchRE re) + +-- | grep with Regex and line numbers + +grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)] +grepREwithLineNum re = filter (matchRE re . snd) . zip [(1::Int)..] 
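
-- A short usage sketch (an illustrative addition, not part of the original
-- module): every function above is polymorphic in 'StringLike', so the same
-- calls work for String as well as for strict and lazy Text and ByteString.
-- The Text variant below assumes OverloadedStrings (or an explicit pack)
-- and the StringLike instances provided by this package.
--
-- > match "[0-9]{4}-[0-9]{2}" "2020-09" = True
-- > tokenize "[a-z]+|[0-9]+" "ab 12 cd" = ["ab","12","cd"]
-- > sed (const "_") "[0-9]+" "a1b22c" = "a_b_c"
-- >
-- > tokenize ("[a-z]+|[0-9]+" :: Data.Text.Text) "ab 12 cd"
-- >                                              = ["ab","12","cd"]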
+ +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/XMLSchema/Generic/Regex.hs b/src/Text/Regex/XMLSchema/Generic/Regex.hs new file mode 100644 index 0000000..b8f284d --- /dev/null +++ b/src/Text/Regex/XMLSchema/Generic/Regex.hs @@ -0,0 +1,729 @@ +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE FlexibleInstances #-} + + +-- ------------------------------------------------------------ + +{- | + Copyright : Copyright (C) 2014 - Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + W3C XML Schema Regular Expression Matcher + + Grammar can be found under + +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.Generic.Regex + ( GenRegex + + , mkZero + , mkZero' + , mkUnit + , mkSym + , mkSym1 + , mkSymRng + , mkWord + , mkDot + , mkStar + , mkAll + , mkAlt + , mkElse + , mkSeq + , mkSeqs + , mkRep + , mkRng + , mkOpt + , mkDiff + , mkIsect + , mkExor + , mkInterleave + , mkCompl + , mkBr + , mkBr' + + , isZero + , errRegex + + , nullable + , nullable' + + , delta1 + , delta + + , firstChars + + , matchWithRegex + , matchWithRegex' + , splitWithRegex + , splitWithRegex' + , splitWithRegexCS + , splitWithRegexCS' + ) +where + +import Data.List (intercalate) +import Data.Monoid ((<>)) +import Data.Set.CharSet +import Data.String (IsString(..)) + +import Text.Regex.XMLSchema.Generic.StringLike + +{- +import Debug.Trace (traceShow) + +trc :: Show a => String -> a -> a +trc msg x = traceShow (msg, x) x + +-- -} +-- ------------------------------------------------------------ + +data GenRegex s + = Zero s + | Unit + | Sym CharSet + | Dot + | Star (GenRegex s) + | Alt (GenRegex s) (GenRegex s) + | Else (GenRegex s) (GenRegex s) + | Seq (GenRegex s) (GenRegex s) + | Rep Int (GenRegex s) -- 1 or more repetitions + | Rng Int Int (GenRegex s) -- n..m repetitions + | Diff (GenRegex s) (GenRegex s) -- r1 - r2 + | Isec (GenRegex s) (GenRegex s) -- r1 n r2 + | Exor (GenRegex s) (GenRegex s) -- r1 xor r2 + | Intl (GenRegex s) (GenRegex s) -- r1 interleavedWith r2 + | Br (Label s) (GenRegex s) -- (...) not yet parsed + | Obr (Label s) s !Int (GenRegex s) -- currently parsed (...) + | Cbr [(Label s, s)] (GenRegex s) -- already completely parsed (...) + deriving (Eq, Ord {-, Show -}) + +type Label s + = Maybe s -- we need one special label for the whole expression + -- see splitWithRegex +type SubexResults s + = [(Label s, s)] + +type Nullable s + = (Bool, SubexResults s) + +-- ------------------------------------------------------------ + +{- just for documentation + +class Inv a where + inv :: a -> Bool + +instance Inv (GenRegex s) where + inv (Zero _) = True + inv Unit = True + inv (Sym p) = not (nulCS p) && not (fullCS p) + inv Dot = True + inv (Star e) = inv e + inv (Alt e1 e2) = inv e1 && + inv e2 + inv (Seq e1 e2) = inv e1 && + inv e2 + inv (Rep i e) = i > 0 && inv e + inv (Rng i j e) = (i < j || (i == j && i > 1)) && + inv e + inv (Diff e1 e2) = inv e1 && + inv e2 + inv (Isec e1 e2) = inv e1 && + inv e2 + inv (Exor e1 e2) = inv e1 && + inv e2 +-} + +-- ------------------------------------------------------------ +-- +-- smart constructors + +-- | construct the r.e. for the empty set. +-- An (error-) message may be attached + +mkZero :: s -> GenRegex s +mkZero = Zero +{-# INLINE mkZero #-} + +mkZero' :: (StringLike s) => + String -> GenRegex s +mkZero' = Zero . fromString +{-# INLINE mkZero' #-} + +-- | construct the r.e. 
for the set containing the empty word + +mkUnit :: GenRegex s +mkUnit = Unit +{-# INLINE mkUnit #-} + +-- | construct the r.e. for a set of chars + +mkSym :: (StringLike s) => + CharSet -> GenRegex s +mkSym s + | nullCS s = mkZero' "empty char range" + | fullCS s = mkDot + | otherwise = Sym s +{-# INLINE mkSym #-} + +-- | construct an r.e. for a single char set +mkSym1 :: (StringLike s) => + Char -> GenRegex s +mkSym1 = mkSym . singleCS +{-# INLINE mkSym1 #-} + +-- | construct an r.e. for an intervall of chars +mkSymRng :: (StringLike s) => + Char -> Char -> GenRegex s +mkSymRng c1 c2 = mkSym $ rangeCS c1 c2 +{-# INLINE mkSymRng #-} + +-- | mkSym generaized for strings +mkWord :: (StringLike s) => + [Char] -> GenRegex s +mkWord = mkSeqs . map mkSym1 + +-- | construct an r.e. for the set of all Unicode chars +mkDot :: GenRegex s +mkDot = Dot +{-# INLINE mkDot #-} + +-- | construct an r.e. for the set of all Unicode words + +mkAll :: (StringLike s) => + GenRegex s +mkAll = mkStar mkDot +{-# INLINE mkAll #-} + + +-- | construct r.e. for r* +mkStar :: (StringLike s) => + GenRegex s -> GenRegex s +mkStar (Zero _) = mkUnit -- {}* == () +mkStar e@Unit = e -- ()* == () +mkStar e@(Star _e1) = e -- (r*)* == r* +mkStar (Rep 1 e1) = mkStar e1 -- (r+)* == r* +mkStar (Rep i e1) + | i == 1 + || + nullable e1 = mkStar e1 -- (r{i,})* == r* when i == 1 or nullable r +mkStar e@(Rng _ _ e1) + | nullable e = mkStar e1 -- (r{i,j})* == r* when i == 0 or nullable r +mkStar e@(Alt _ _) = Star (rmStar e) -- (a*|b)* == (a|b)* + + {- this is wrong, not generally applicable +mkStar (Br l r s) = mkBr0 l (mkStar r) s -- ({l}r)* == ({l}r*) because we want the longest match as result for the subexpression + -} +mkStar e = Star e + +rmStar :: (StringLike s) => + GenRegex s -> GenRegex s +rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2) +rmStar (Star e1) = rmStar e1 +rmStar (Rep 1 e1) = rmStar e1 +rmStar e1 = e1 + +-- | construct the r.e for r1|r2 + +mkAlt :: (StringLike s) => + GenRegex s -> GenRegex s -> GenRegex s +mkAlt e1 (Zero _) = e1 -- e1 u {} = e1 +mkAlt (Zero _) e2 = e2 -- {} u e2 = e2 +mkAlt (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates +mkAlt e1 e2@(Sym _) = mkAlt e2 e1 -- symmetry: predicates always first +mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3 -- prepare melting of predicates +mkAlt (Sym _) e2@Dot = e2 -- c|. = . for a c's +mkAlt e1@(Star Dot) _e2 = e1 -- A* u e1 = A* +mkAlt _e1 e2@(Star Dot) = e2 -- e1 u A* = A* +mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3) -- associativity +mkAlt e1 e2 + | e1 == e2 = e1 + | otherwise = Alt e1 e2 + +-- | construct the r.e. for r1{|}r2 (r1 orElse r2). +-- +-- This represents the same r.e. as r1|r2, but when +-- collecting the results of subexpressions in (...) and r1 succeeds, the +-- subexpressions of r2 are discarded, so r1 matches are prioritized +-- +-- example +-- +-- > splitSubex "({1}x)|({2}.)" "x" = ([("1","x"),("2","x")], "") +-- > +-- > splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "") + +mkElse :: (StringLike s) => + GenRegex s -> GenRegex s -> GenRegex s +mkElse e1 (Zero _) = e1 -- e1 u {} = e1 +mkElse (Zero _) e2 = e2 -- {} u e2 = e2 +mkElse (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates + -- no symmetry allowed +mkElse e1@(Sym _) (Else e2@(Sym _) e3) = mkElse (mkElse e1 e2) e3 -- prepare melting of predicates +mkElse (Sym _) e2@Dot = e2 -- c|. = . 
for a c's +mkElse e1@(Star Dot) _e2 = e1 -- A* u e1 = A* +mkElse _e1 e2@(Star Dot) = e2 -- e1 u A* = A* +mkElse (Else e1 e2) e3 = mkElse e1 (mkElse e2 e3) -- associativity +mkElse e1 e2 + | e1 == e2 = e1 + | otherwise = Else e1 e2 + +-- | Construct the sequence r.e. r1.r2 + +mkSeq :: GenRegex s -> GenRegex s -> GenRegex s +mkSeq e1@(Zero _) _e2 = e1 +mkSeq _e1 e2@(Zero _) = e2 +mkSeq Unit e2 = e2 +mkSeq (Cbr ss1 e1) e2 = mkCbr ss1 (mkSeq e1 e2) -- move finished submatches upwards +mkSeq e1 Unit = e1 +mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3) +mkSeq e1 e2 = Seq e1 e2 + +-- | mkSeq extened to lists +mkSeqs :: [GenRegex s] -> GenRegex s +mkSeqs = foldr mkSeq mkUnit + +-- | Construct repetition r{i,} +mkRep :: (StringLike s) => + Int -> GenRegex s -> GenRegex s +mkRep 0 e = mkStar e +mkRep _ e@(Zero _) = e +mkRep _ e + | nullable e = mkStar e +mkRep i (Rep j e) = mkRep (i * j) e +mkRep i e = Rep i e + +-- | Construct range r{i,j} +mkRng :: (StringLike s) => + Int -> Int -> GenRegex s -> GenRegex s +mkRng 0 0 _e = mkUnit +mkRng 1 1 e = e +mkRng lb ub _e + | lb > ub = mkZero' $ + "illegal range " ++ + show lb ++ ".." ++ show ub +mkRng _l _u e@(Zero _) = e +mkRng _l _u e@Unit = e +mkRng lb ub e = Rng lb ub e + +-- | Construct option r? +mkOpt :: (StringLike s) => + GenRegex s -> GenRegex s +mkOpt = mkRng 0 1 +{-# INLINE mkOpt #-} + +-- | Construct difference r.e.: r1 {\\} r2 +-- +-- example +-- +-- > match "[a-z]+{\\}bush" "obama" = True +-- > match "[a-z]+{\\}bush" "clinton" = True +-- > match "[a-z]+{\\}bush" "bush" = False -- not important any more + +mkDiff :: (StringLike s) => + GenRegex s -> GenRegex s -> GenRegex s +mkDiff e1@(Zero _) _e2 = e1 -- {} - r2 = {} +mkDiff e1 (Zero _) = e1 -- r1 - {} = r1 +mkDiff _e1 (Star Dot) = mkZero' "empty set in difference expr" -- r1 - .* = {} +mkDiff Dot (Sym p) = mkSym $ compCS p -- . - s = ~s +mkDiff (Sym _) Dot = mkZero' "empty set in difference expr" -- x - . = {} +mkDiff (Sym p1) (Sym p2) = mkSym $ p1 `diffCS` p2 -- set diff +mkDiff e1 e2 + | e1 == e2 = mkZero' "empty set in difference expr" -- r1 - r1 = {} + | otherwise = Diff e1 e2 + +-- | Construct the Complement of an r.e.: whole set of words - r + +mkCompl :: (StringLike s) => + GenRegex s -> GenRegex s +mkCompl (Zero _) = mkAll +mkCompl (Star Dot) = mkZero' "empty set in compl expr" +mkCompl e = mkDiff (mkStar mkDot) e + +-- | Construct r.e. for intersection: r1 {&} r2 +-- +-- example +-- +-- > match ".*a.*{&}.*b.*" "-a-b-" = True +-- > match ".*a.*{&}.*b.*" "-b-a-" = True +-- > match ".*a.*{&}.*b.*" "-a-a-" = False +-- > match ".*a.*{&}.*b.*" "---b-" = False + +mkIsect :: (StringLike s) => + GenRegex s -> GenRegex s -> GenRegex s +mkIsect e1@(Zero _) _e2 = e1 -- {} n r2 = {} +mkIsect _e1 e2@(Zero _) = e2 -- r1 n {} = {} +mkIsect e1@(Unit) e2 -- () n r2 = () if nullable r2 + | nullable e2 = e1 -- () n r2 = {} if not nullable r2 + | otherwise = mkZero' "intersection empty" +mkIsect e1 e2@(Unit) = mkIsect e2 e1 -- symmetric version of las 2 laws + +mkIsect (Sym p1) (Sym p2) = mkSym $ p1 `intersectCS` p2 -- intersect sets +mkIsect e1@(Sym _) Dot = e1 -- x n . = x +mkIsect Dot e2@(Sym _) = e2 -- . n x = x + +mkIsect (Star Dot) e2 = e2 -- .* n r2 = r2 +mkIsect e1 (Star Dot) = e1 -- r1 n .* = r1 +mkIsect e1 e2 + | e1 == e2 = e1 -- r1 n r1 = r1 + | otherwise = Isec e1 e2 + +-- | Construct r.e. 
for exclusive or: r1 {^} r2 +-- +-- example +-- +-- > match "[a-c]+{^}[c-d]+" "abc" = True +-- > match "[a-c]+{^}[c-d]+" "acdc" = False +-- > match "[a-c]+{^}[c-d]+" "ccc" = False +-- > match "[a-c]+{^}[c-d]+" "cdc" = True + +mkExor :: (StringLike s) => + GenRegex s -> GenRegex s -> GenRegex s +mkExor (Zero _) e2 = e2 +mkExor e1 (Zero _) = e1 +mkExor (Star Dot) _e2 = mkZero' "empty set in exor expr" +mkExor _e1 (Star Dot) = mkZero' "empty set in exor expr" +mkExor (Sym p1) (Sym p2) = mkSym $ p1 `exorCS` p2 +mkExor (Sym p1) Dot = mkSym $ compCS p1 +mkExor Dot (Sym p2) = mkSym $ compCS p2 +mkExor e1 e2 + | e1 == e2 = mkZero' "empty set in exor expr" -- r1 xor r1 = {} + | otherwise = Exor e1 e2 + +mkInterleave :: GenRegex s -> GenRegex s -> GenRegex s +mkInterleave e1@(Zero _) _ = e1 +mkInterleave _ e2@(Zero _) = e2 +mkInterleave (Unit) e2 = e2 +mkInterleave e1 (Unit) = e1 +mkInterleave e1 e2 = Intl e1 e2 + +-- | Construct a labeled subexpression: ({label}r) + +mkBr :: s -> GenRegex s -> GenRegex s +mkBr l e = Br (Just l) e + +mkBr' :: StringLike s => + String -> GenRegex s -> GenRegex s +mkBr' l e = Br (Just $ fromString l) e + +mkBrN :: GenRegex s -> GenRegex s +mkBrN e = Br Nothing e + +mkObr :: StringLike s => + Label s -> s -> Int -> GenRegex s -> GenRegex s +mkObr _ _ _ e@(Zero _) = e +mkObr l s n Unit = mkCbr [(l, takeS n s)] mkUnit +mkObr l s n e = Obr l s n e + +mkCbr :: SubexResults s -> GenRegex s -> GenRegex s +mkCbr _ e@(Zero _) = e -- dead end, throw away subexpr matches +mkCbr ss (Cbr ss1 e) = mkCbr (ss <> ss1) e -- join inner and this subexpr match +mkCbr ss e = Cbr ss e + +-- ------------------------------------------------------------ + +instance (StringLike s) => Show (GenRegex s) where + show (Zero e) = "{" ++ toString e ++ "}" + show Unit = "()" + show (Sym p) + | p == compCS (stringCS "\n\r") + = "." + | null (tail cs) && + rng1 (head cs) + = escRng . head $ cs + | otherwise = "[" ++ concat cs' ++ "]" + where + rng1 (x,y) = x == y + cs = p -- charRngs . chars $ p + cs' = map escRng p + escRng (x, y) + | x == y = esc x + | succ x == y + = esc x ++ esc y + | otherwise + = esc x ++ "-" ++ esc y + esc x + | x `elem` "\\-[]{}()*+?.^" + = '\\':x:"" + | x >= ' ' && x <= '~' + = x:"" + | otherwise + = "&#" ++ show (fromEnum x) ++ ";" + show Dot = "\\a" + show (Star Dot) = "\\A" + show (Star e) = "(" ++ show e ++ "*)" + show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")" + show (Else e1 e2) = "(" ++ show e1 ++ "{|}" ++ show e2 ++ ")" + show (Seq e1 e2) = "(" ++ show e1 ++ show e2 ++ ")" + show (Rep 1 e) = "(" ++ show e ++ "+)" + show (Rep i e) = "(" ++ show e ++ "{" ++ show i ++ ",})" + show (Rng 0 1 e) = "(" ++ show e ++ "?)" + show (Rng i j e) = "(" ++ show e ++ "{" ++ show i ++ "," ++ show j ++ "})" + show (Diff e1 e2) = "(" ++ show e1 ++ "{\\}" ++ show e2 ++ ")" + show (Isec e1 e2) = "(" ++ show e1 ++ "{&}" ++ show e2 ++ ")" + show (Exor e1 e2) = "(" ++ show e1 ++ "{^}" ++ show e2 ++ ")" + show (Intl e1 e2) = "(" ++ show e1 ++ "{:}" ++ show e2 ++ ")" + show (Br l e) = "({" ++ showL l ++ "}" ++ show e ++ ")" + show (Obr l s n e) = "({" ++ showL l ++ "=" ++ toString (takeS n s) ++ "}" ++ show e ++ ")" + show (Cbr ss e) = "([" ++ intercalate "," (map (\ (l, s) -> showL l ++ "=" ++ toString s) ss) ++ "]" + ++ show e ++ + ")" + +showL :: Show s => Label s -> String +showL = rmq . 
maybe "" show + where + rmq ('\"':xs) = init xs + rmq xs = xs + +-- ------------------------------------------------------------ + +isZero :: GenRegex s -> Bool +isZero (Zero _) = True +isZero _ = False +{-# INLINE isZero #-} + +errRegex :: (StringLike s) => + GenRegex s -> s +errRegex (Zero e) = e +errRegex _ = emptyS + +-- ------------------------------------------------------------ + +nullable :: (StringLike s) => + GenRegex s -> Bool +nullable = fst . nullable' +{-# INLINE nullable #-} + +nullable' :: (StringLike s) => + GenRegex s -> Nullable s + +nullable' (Zero _) = (False, []) +nullable' Unit = (True, []) +nullable' Dot = (False, []) +nullable' (Sym _x) = (False, []) + +nullable' (Star _e) = (True, []) +nullable' (Rep _i e) = nullable' e +nullable' (Rng i _ e) = (i == 0, []) `unionN` nullable' e +nullable' (Seq e1 e2) = nullable' e1 `isectN` nullable' e2 + +nullable' (Alt e1 e2) = nullable' e1 `unionN` nullable' e2 +nullable' (Else e1 e2) = nullable' e1 `orElseN` nullable' e2 +nullable' (Isec e1 e2) = nullable' e1 `isectN` nullable' e2 +nullable' (Diff e1 e2) = nullable' e1 `diffN` nullable' e2 +nullable' (Exor e1 e2) = nullable' e1 `exorN` nullable' e2 +nullable' (Intl e1 e2) = nullable' e1 `isectN` nullable' e2 + +nullable' (Br l e) = (True, [(l, emptyS )]) `isectN` nullable' e +nullable' (Obr l s n e) = (True, [(l, takeS n s)]) `isectN` nullable' e +nullable' (Cbr ss e) = (True, ss) `isectN` nullable' e + +isectN :: Nullable s -> Nullable s -> Nullable s +isectN (True, ws1) (True, ws2) = (True, ws1 ++ ws2) +isectN _ _ = (False, []) + +unionN :: Nullable s -> Nullable s -> Nullable s +unionN (False, _) (False, _) = (False, []) +unionN (_, ws1) (_, ws2) = (True, ws1 ++ ws2) + +orElseN :: Nullable s -> Nullable s -> Nullable s +orElseN e1@(True, _ws1) _ = e1 +orElseN _ e2 = e2 + +diffN :: Nullable s -> Nullable s -> Nullable s +diffN n1 (False, _) = n1 +diffN _ _ = (False, []) + +exorN :: Nullable s -> Nullable s -> Nullable s +exorN n1@(True, _) (False, _) = n1 +exorN (False, _) n2@(True, _) = n2 +exorN _ _ = (False, []) + +-- ------------------------------------------------------------ + +-- | FIRST for regular expressions +-- +-- this is only an approximation, the real set of char may be smaller, +-- when the expression contains intersection, set difference or exor operators + +firstChars :: (StringLike s) => + GenRegex s -> CharSet + +firstChars (Zero _) = emptyCS +firstChars Unit = emptyCS +firstChars (Sym p) = p +firstChars Dot = allCS + +firstChars (Star e1) = firstChars e1 +firstChars (Alt e1 e2) = firstChars e1 `unionCS` firstChars e2 +firstChars (Else e1 e2) = firstChars e1 `unionCS` firstChars e2 +firstChars (Seq e1 e2) + | nullable e1 = firstChars e1 `unionCS` firstChars e2 + | otherwise = firstChars e1 +firstChars (Rep _i e) = firstChars e +firstChars (Rng _i _j e) = firstChars e +firstChars (Diff e1 _e2) = firstChars e1 -- this is an approximation +firstChars (Isec e1 e2) = firstChars e1 `intersectCS` firstChars e2 -- this is an approximation +firstChars (Exor e1 e2) = firstChars e1 `unionCS` firstChars e2 -- this is an approximation +firstChars (Intl e1 e2) = firstChars e1 `unionCS` firstChars e2 +firstChars (Br _l e) = firstChars e +firstChars (Obr _l _s _n e) = firstChars e +firstChars (Cbr _ss e) = firstChars e + +-- ------------------------------------------------------------ + +delta1 :: (StringLike s) => Char -> s -> GenRegex s -> GenRegex s +delta1 c inp e0 + = d' e0 + where + d' e@(Zero _) = e + d' Unit = mkZero' $ + "unexpected char " ++ show c + d' (Sym p) 
+ | c `elemCS` p = mkUnit + | otherwise = mkZero' $ + "unexpected char " ++ show c + d' Dot = mkUnit + d' e@(Star Dot) = e + d' e@(Star e1) = mkSeq (d' e1) e + d' (Alt e1 e2) = mkAlt (d' e1) (d' e2) + d' (Else e1 e2) = mkElse (d' e1) (d' e2) + d' (Seq e1@(Obr l s n e1') e2) + | nu = mkAlt (mkSeq (d' e1) e2) + (mkCbr ((l, takeS n s) : ws) (d' e2)) + where + (nu, ws) = nullable' e1' + d' (Seq e1 e2) + | nullable e1 = mkAlt (mkSeq (d' e1) e2) + (d' e2) + | otherwise = mkSeq (d' e1) e2 + d' (Rep i e) = mkSeq (d' e) (mkRep (i-1) e) + d' (Rng i j e) = mkSeq (d' e) (mkRng ((i-1) `max` 0) (j-1) e) + d' (Diff e1 e2) = mkDiff (d' e1) (d' e2) + d' (Isec e1 e2) = mkIsect (d' e1) (d' e2) + d' (Exor e1 e2) = mkExor (d' e1) (d' e2) + d' (Intl e1 e2) = mkAlt (mkInterleave (d' e1) e2 ) + (mkInterleave e1 (d' e2)) + + d' (Br l e) = d' (mkObr l inp 0 e) -- a subex parse starts + d' (Obr l s n e) = mkObr l s (n + 1) (d' e) -- a subex parse cont. + d' (Cbr ss e) = mkCbr ss (d' e) -- the results of a subex parse + +-- ------------------------------------------------------------ + +delta :: (StringLike s) => s -> GenRegex s -> GenRegex s +delta inp@(uncons -> Just (c, inp')) e0 + = d' e0 + where + d' e@(Zero _) = e -- don't process whole input, parse has failed + d' e@(Star Dot) = e -- don't process input, derivative does not change + d' e = delta inp' ( -- trc "delta1=" $ + delta1 c inp e) + +delta _empty e + = e + + +matchWithRegex :: (StringLike s) => + GenRegex s -> s -> Bool +matchWithRegex e s + = nullable $ delta s e + +matchWithRegex' :: (StringLike s) => + GenRegex s -> s -> Maybe (SubexResults s) +matchWithRegex' e s + = (\ (r, l) -> if r then Just l else Nothing) . nullable' $ delta s e + +-- ------------------------------------------------------------ + +-- | This function wraps the whole regex in a subexpression before starting +-- the parse. This is done for getting access to +-- the whole parsed string. Therfore we need one special label, this label +-- is the Nothing value, all explicit labels are Just labels. + +splitWithRegex :: (StringLike s) => + GenRegex s -> s -> Maybe (SubexResults s, s) +splitWithRegex re inp + = do + (re', rest) <- splitWithRegex' (mkBrN re) inp + return ( snd . nullable' $ re', rest) + +splitWithRegexCS :: (StringLike s) => + GenRegex s -> CharSet -> s -> Maybe (SubexResults s, s) +splitWithRegexCS re cs inp + = do + (re', rest) <- splitWithRegexCS' (mkBrN re) cs inp + return ( snd . nullable' $ re', rest) + +-- ---------------------------------------- +-- +-- | The main scanner function + +{- linear recursive function, can lead to stack overflow + +splitWithRegex' :: Eq l => GenRegex s -> String -> Maybe (GenRegex s, String) +splitWithRegex' re "" + | nullable re = Just (re, "") + | otherwise = Nothing + +splitWithRegex' re inp@(c : inp') + | isZero re = Nothing + | otherwise = evalRes . 
splitWithRegex' (delta1 re c) $ inp' + where + evalRes Nothing + | nullable re = Just (re, inp) + | otherwise = Nothing + evalRes res = res +-} + +-- tail recursive version of above function + +splitWithRegex' :: (StringLike s) => + GenRegex s -> s -> Maybe (GenRegex s, s) +splitWithRegex' re inp + = splitWithRegex'' + ( if nullable re + then Just (re, inp) -- first possible result: empty prefix + else Nothing -- empty prefix not a result + ) re inp + +splitWithRegex'' :: (StringLike s) => + Maybe (GenRegex s, s) -> GenRegex s -> s -> Maybe (GenRegex s, s) + +splitWithRegex'' lastRes re inp@(uncons -> Just (c, inp')) + | isZero re = lastRes + | otherwise = splitWithRegex'' nextRes re' $ inp' + where + re' = delta1 c inp re + nextRes + | nullable re' = Just (re', inp') + | otherwise = lastRes + +splitWithRegex'' lastRes _re _empty + = lastRes + +-- ---------------------------------------- +-- +-- | speedup version for splitWithRegex' +-- +-- This function checks whether the input starts with a char from FIRST re. +-- If this is not the case, the split fails. The FIRST set can be computed once +-- for a whole tokenizer and reused by every call of split + +splitWithRegexCS' :: (StringLike s) => + GenRegex s -> CharSet -> s -> Maybe (GenRegex s, s) +splitWithRegexCS' re cs inp@(uncons -> Just (c, _inp')) + | c `elemCS` cs = splitWithRegex' re inp + +splitWithRegexCS' re _cs inp + | nullable re = Just (re, inp) + | otherwise = Nothing + +-- ------------------------------------------------------------ + diff --git a/src/Text/Regex/XMLSchema/Generic/RegexParser.hs b/src/Text/Regex/XMLSchema/Generic/RegexParser.hs new file mode 100644 index 0000000..27a6fbe --- /dev/null +++ b/src/Text/Regex/XMLSchema/Generic/RegexParser.hs @@ -0,0 +1,532 @@ +{-# LANGUAGE FlexibleContexts #-} + +-- ------------------------------------------------------------ + +{- | + Module : Text.Regex.XMLSchema.RegexParser + Copyright : Copyright (C) 2014- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt (uwe@fh-wedel.de) + Stability : stable + Portability: portable + + W3C XML Schema Regular Expression Parser + + This parser supports the full W3C standard, the + complete grammar can be found under + and extensions for all missing set operations, intersection, + difference, exclusive or, interleave, complement + +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.Generic.RegexParser + ( parseRegex + , parseRegexExt + , parseRegex' + , parseRegexExt' + , parseContextRegex + ) +where + +import Data.Char.Properties.UnicodeBlocks +import Data.Char.Properties.UnicodeCharProps +import Data.Char.Properties.XMLCharProps + +import Data.List (isPrefixOf, + isSuffixOf) +import Data.Maybe +import Data.Set.CharSet + +import Text.ParserCombinators.Parsec +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.StringLike + +-- ------------------------------------------------------------ + +-- | parse a standard W3C XML Schema regular expression + +parseRegex :: StringLike s => s -> GenRegex s +parseRegex = parseRegex' . toString + +parseRegex' :: StringLike s => String -> GenRegex s +parseRegex' = parseRegex'' regExpStd + +-- | parse an extended syntax W3C XML Schema regular expression +-- +-- The Syntax of the W3C XML Schema spec is extended by +-- further useful set operations, like intersection, difference, exor. +-- Subexpression match becomes possible with \"named\" pairs of parentheses. 
+-- The multi char escape sequence \\a represents any Unicode char, +-- The multi char escape sequence \\A represents any Unicode word, (\\A = \\a*). +-- All syntactically wrong inputs are mapped to the Zero expression representing the +-- empty set of words. Zero contains as data field a string for an error message. +-- So error checking after parsing becomes possible by checking against Zero ('isZero' predicate) + +parseRegexExt :: StringLike s => s -> GenRegex s +parseRegexExt = parseRegexExt' . toString + +parseRegexExt' :: StringLike s => String -> GenRegex s +parseRegexExt' = parseRegex'' regExpExt + +parseRegex'' :: StringLike s => Parser (GenRegex s) -> String -> GenRegex s +parseRegex'' regExp' + = either (mkZero' . ("syntax error: " ++) . show) id + . parse ( do + r <- regExp' + eof + return r + ) "" + +-- ------------------------------------------------------------ + +-- | parse a regular expression surrounded by contenxt spec +-- +-- a leading @^@ denotes start of text, +-- a trailing @$@ denotes end of text, +-- a leading @\\<@ denotes word start, +-- a trailing @\\>@ denotes word end. +-- +-- The 1. param ist the regex parser ('parseRegex' or 'parseRegexExt') + +parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex s +parseContextRegex parseRe re0 + = re' + where + parseAW = parseRegexExt' "(\\A\\W)?" + parseWA = parseRegexExt' "(\\W\\A)?" + + re = toString re0 + re' = mkSeqs . concat $ [ startContext + , (:[]) . parseRe $ re2 + , endContext + ] + (startContext, re1) + | "^" `isPrefixOf` re = ([], tail re) + | "\\<" `isPrefixOf` re = ([parseAW], drop 2 re) + | otherwise = ([mkStar mkDot], re) + (endContext, re2) + | "$" `isSuffixOf` re1 = ([], init re1) + | "\\>" `isSuffixOf` re1 = ([parseWA], init . init $ re1) + | otherwise = ([mkStar mkDot], re1) + +-- ------------------------------------------------------------ + +regExpExt :: StringLike s => Parser (GenRegex s) +regExpExt = branchList orElseList + +regExpStd :: StringLike s => Parser (GenRegex s) +regExpStd = branchList seqListStd + +branchList :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) +branchList exParser + = do + r1 <- exParser + rs <- many branchList1 + return (foldr1 mkAlt $ r1:rs) -- union is associative, so we use right ass. + -- as with seq, alt and exor + where + branchList1 + = do + _ <- char '|' + exParser + +orElseList :: StringLike s => Parser (GenRegex s) +orElseList + = do + r1 <- interleaveList + rs <- many orElseList1 + return (foldr1 mkElse $ r1:rs) -- orElse is associative, so we choose right ass. + -- as with seq and alt ops + where + orElseList1 + = do + _ <- try (string "{|}") + interleaveList + +interleaveList :: StringLike s => Parser (GenRegex s) +interleaveList + = do + r1 <- exorList + rs <- many interleaveList1 + return (foldr1 mkInterleave $ r1:rs) -- interleave is associative, so we choose right ass. + -- as with seq and alt ops + where + interleaveList1 + = do + _ <- try (string "{:}") + exorList + +exorList :: StringLike s => Parser (GenRegex s) +exorList + = do + r1 <- diffList + rs <- many exorList1 + return (foldr1 mkExor $ r1:rs) -- exor is associative, so we choose right ass. + where + exorList1 + = do + _ <- try (string "{^}") + diffList + +diffList :: StringLike s => Parser (GenRegex s) +diffList + = do + r1 <- intersectList + rs <- many diffList1 + return (foldl1 mkDiff $ r1:rs) -- diff is not associative, so we choose left ass. 
+ where + diffList1 + = do + _ <- try (string "{\\}") + intersectList + +intersectList :: StringLike s => Parser (GenRegex s) +intersectList + = do + r1 <- seqListExt + rs <- many intersectList1 + return (foldr1 mkIsect $ r1:rs) + where + intersectList1 + = do + _ <- try (string "{&}") + seqListExt + +seqListExt :: StringLike s => Parser (GenRegex s) +seqListExt = seqList' regExpLabel multiCharEscExt + +seqListStd :: StringLike s => Parser (GenRegex s) +seqListStd = seqList' regExpStd multiCharEsc + +seqList' :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) -> Parser (GenRegex s) +seqList' regExp' multiCharEsc' + = do + rs <- many piece + return $ mkSeqs rs + where + -- piece :: StringLike s => Parser (GenRegex s) + piece + = do + r <- atom + quantifier r + + -- atom :: StringLike s => Parser (GenRegex s) + atom + = char1 + <|> + charClass + <|> + between (char '(') (char ')') regExp' + + -- charClass :: StringLike s => Parser (GenRegex s) + charClass + = charClassEsc multiCharEsc' + <|> + charClassExpr multiCharEsc' + <|> + wildCardEsc + + + +quantifier :: StringLike s => GenRegex s -> Parser (GenRegex s) +quantifier r + = ( do + _ <- char '?' + return $ mkOpt r ) + <|> + ( do + _ <- char '*' + return $ mkStar r ) + <|> + ( do + _ <- char '+' + return $ mkRep 1 r ) + <|> + try ( do + _ <- char '{' + res <- quantity r + _ <- char '}' + return res + ) + <|> + ( return r ) + +quantity :: StringLike s => GenRegex s -> Parser (GenRegex s) +quantity r + = do + lb <- many1 digit + quantityRest r (read lb) + +quantityRest :: StringLike s => GenRegex s -> Int -> Parser (GenRegex s) +quantityRest r lb + = ( do + _ <- char ',' + ub <- many digit + return ( if null ub + then mkRep lb r + else mkRng lb (read ub) r + ) + ) + <|> + ( return $ mkRng lb lb r) + +regExpLabel :: StringLike s => Parser (GenRegex s) +regExpLabel + = do + lab <- option id (between (char '{') (char '}') label') + r <- regExpExt + return $ lab r + where + label' + = do + l <- many1 (satisfy isXmlNameChar) + return $ mkBr' l + +char1 :: StringLike s => Parser (GenRegex s) +char1 + = do + c <- satisfy (`notElem` ".\\?*+{}()|[]") + return $ mkSym1 c + +charClassEsc :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) +charClassEsc multiCharEsc' + = do + _ <- char '\\' + ( singleCharEsc + <|> + multiCharEsc' + <|> + catEsc + <|> + complEsc ) + +singleCharEsc :: StringLike s => Parser (GenRegex s) +singleCharEsc + = do + c <- singleCharEsc' + return $ mkSym1 c + +singleCharEsc' :: Parser Char +singleCharEsc' + = do + c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^") + return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t" + +multiCharEscExt :: StringLike s => Parser (GenRegex s) +multiCharEscExt + = multiCharEsc + <|> + ( do -- extension: \a represents the whole alphabet inclusive newline chars: \a == .|\n|\r + _ <- char 'a' + return mkDot ) + <|> + ( do -- extension: \A represents all words: \A == \a* or \A == (.|\n|\r)* + _ <- char 'A' + return mkAll ) + +multiCharEsc :: StringLike s => Parser (GenRegex s) +multiCharEsc + = ( do + c <- satisfy (`elem` es) + return $ mkSym . fromJust . 
lookup c $ pm ) + where + es = map fst pm + pm = [ ('s', charPropXmlSpaceChar ) + , ('S', compCS charPropXmlSpaceChar ) + , ('i', charPropXmlNameStartChar ) + , ('I', compCS charPropXmlNameStartChar ) + , ('c', charPropXmlNameChar ) + , ('C', compCS charPropXmlNameChar ) + , ('d', charPropDigit ) + , ('D', compCS charPropDigit ) + , ('w', compCS charPropNotWord ) + , ('W', charPropNotWord ) + ] + charPropDigit = rangeCS '0' '9' + charPropNotWord = charPropUnicodeP + `unionCS` + charPropUnicodeZ + `unionCS` + charPropUnicodeC + +catEsc :: StringLike s => Parser (GenRegex s) +catEsc + = do + _ <- char 'p' + s <- between (char '{') (char '}') charProp + return $ mkSym s + +charProp :: Parser CharSet +charProp + = isCategory + <|> + isBlock + +isBlock :: Parser CharSet +isBlock + = do + _ <- string "Is" + name <- many1 (satisfy legalChar) + case lookup name codeBlocks of + Just b -> return $ uncurry rangeCS b + Nothing -> fail $ "unknown Unicode code block " ++ show name + where + legalChar c = 'A' <= c && c <= 'Z' || + 'a' <= c && c <= 'z' || + '0' <= c && c <= '9' || + '-' == c + +isCategory :: Parser CharSet +isCategory + = do + pr <- isCategory' + return $ fromJust (lookup pr categories) + +categories :: [(String, CharSet)] +categories + = [ ("C", charPropUnicodeC ) + , ("Cc", charPropUnicodeCc) + , ("Cf", charPropUnicodeCf) + , ("Co", charPropUnicodeCo) + , ("Cs", charPropUnicodeCs) + , ("L", charPropUnicodeL ) + , ("Ll", charPropUnicodeLl) + , ("Lm", charPropUnicodeLm) + , ("Lo", charPropUnicodeLo) + , ("Lt", charPropUnicodeLt) + , ("Lu", charPropUnicodeLu) + , ("M", charPropUnicodeM ) + , ("Mc", charPropUnicodeMc) + , ("Me", charPropUnicodeMe) + , ("Mn", charPropUnicodeMn) + , ("N", charPropUnicodeN ) + , ("Nd", charPropUnicodeNd) + , ("Nl", charPropUnicodeNl) + , ("No", charPropUnicodeNo) + , ("P", charPropUnicodeP ) + , ("Pc", charPropUnicodePc) + , ("Pd", charPropUnicodePd) + , ("Pe", charPropUnicodePe) + , ("Pf", charPropUnicodePf) + , ("Pi", charPropUnicodePi) + , ("Po", charPropUnicodePo) + , ("Ps", charPropUnicodePs) + , ("S", charPropUnicodeS ) + , ("Sc", charPropUnicodeSc) + , ("Sk", charPropUnicodeSk) + , ("Sm", charPropUnicodeSm) + , ("So", charPropUnicodeSo) + , ("Z", charPropUnicodeZ ) + , ("Zl", charPropUnicodeZl) + , ("Zp", charPropUnicodeZp) + , ("Zs", charPropUnicodeZs) + ] + +isCategory' :: Parser String +isCategory' + = ( foldr1 (<|>) . 
map (uncurry prop) $ + [ ('L', "ultmo") + , ('M', "nce") + , ('N', "dlo") + , ('P', "cdseifo") + , ('Z', "slp") + , ('S', "mcko") + , ('C', "cfon") + ] + ) "illegal Unicode character property" + where + prop c1 cs2 + = do + _ <- char c1 + s2 <- option "" + ( do + c2 <- satisfy (`elem` cs2) + return [c2] ) + return $ c1:s2 + +complEsc :: StringLike s => Parser (GenRegex s) +complEsc + = do + _ <- char 'P' + s <- between (char '{') (char '}') charProp + return $ mkSym $ compCS s + +charClassExpr :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) +charClassExpr multiCharEsc' + = between (char '[') (char ']') charGroup + where + + -- charGroup :: StringLike s => Parser (GenRegex s) + charGroup + = do + r <- ( negCharGroup -- a ^ at beginning denotes negation, not start of posCharGroup + <|> + posCharGroup + ) + s <- option (mkZero' "") -- charClassSub + ( do + _ <- char '-' + charClassExpr multiCharEsc' + ) + return $ mkDiff r s + + -- posCharGroup :: StringLike s => Parser (GenRegex s) + posCharGroup + = do + rs <- many1 (charRange <|> charClassEsc multiCharEsc') + return $ foldr1 mkAlt rs + + -- negCharGroup :: StringLike s => Parser (GenRegex s) + negCharGroup + = do + _ <- char '^' + r <- posCharGroup + return $ mkDiff mkDot r + +charRange :: StringLike s => Parser (GenRegex s) +charRange + = try seRange + <|> + xmlCharIncDash + +seRange :: StringLike s => Parser (GenRegex s) +seRange + = do + c1 <- charOrEsc' + _ <- char '-' + c2 <- charOrEsc' + return $ mkSymRng c1 c2 + +charOrEsc' :: Parser Char +charOrEsc' + = ( do + _ <- char '\\' + singleCharEsc' + ) + <|> + satisfy (`notElem` "\\-[]") + +xmlCharIncDash :: StringLike s => Parser (GenRegex s) +xmlCharIncDash + = try ( do -- dash is only allowed if not followed by a [, else charGroup differences do not parse correctly + _ <- char '-' + notFollowedBy (char '[') + return $ mkSym1 '-' + ) + <|> + ( do + c <- satisfy (`notElem` "-\\[]") + return $ mkSym1 c + ) + +wildCardEsc :: StringLike s => Parser (GenRegex s) +wildCardEsc + = do + _ <- char '.' + return . mkSym . compCS $ stringCS "\n\r" + + +-- ------------------------------------------------------------ diff --git a/src/Text/Regex/XMLSchema/Generic/StringLike.hs b/src/Text/Regex/XMLSchema/Generic/StringLike.hs new file mode 100644 index 0000000..3a37503 --- /dev/null +++ b/src/Text/Regex/XMLSchema/Generic/StringLike.hs @@ -0,0 +1,175 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE TypeSynonymInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +-- ------------------------------------------------------------ + +{- | + Copyright : Copyright (C) 2014- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.Generic.StringLike +where + +import Data.Maybe +import Data.String (IsString(..)) + +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy.Char8 as BL + +-- ------------------------------------------------------------ + +-- | /WARNING/: This StringLike class is /not/ intended for use outside this regex library. +-- It provides an abstraction for String's as used inside this library. +-- It allows the library to work with String (list of Char), +-- ByteString.Char8, ByteString.Lazy.Char8, +-- Data.Text and Data.Text.Lazy. 
+-- +-- The class is similar to the StringLike class in the tagsoup package + +class (Eq a, IsString a, Show a) => StringLike a where + emptyS :: a + uncons :: a -> Maybe (Char, a) + nullS :: a -> Bool + headS :: a -> Char + takeS :: Int -> a -> a + dropS :: Int -> a -> a + appendS :: a -> a -> a + concatS :: [a] -> a + toString :: a -> String + + nullS = isNothing . uncons + headS (uncons -> Just (c, _)) + = c + headS _ = error "headS: empty StringLike" + concatS = foldl appendS emptyS + + {-# INLINE nullS #-} + {-# INLINE headS #-} + {-# INLINE concatS #-} + +-- ------------------------------------------------------------ + +instance StringLike String where + emptyS = [] + uncons (x : xs) = Just (x, xs) + uncons "" = Nothing + nullS = null + headS = head + takeS = take + dropS = drop + appendS = (++) + concatS = concat + toString = id + + {-# INLINE emptyS #-} + {-# INLINE uncons #-} + {-# INLINE nullS #-} + {-# INLINE takeS #-} + {-# INLINE dropS #-} + {-# INLINE appendS #-} + {-# INLINE concatS #-} + {-# INLINE toString #-} + +-- ------------------------------------------------------------ + +instance StringLike T.Text where + emptyS = T.empty + uncons = T.uncons + nullS = T.null + headS = T.head + takeS = T.take + dropS = T.drop + appendS = T.append + concatS = T.concat + toString = T.unpack + + {-# INLINE emptyS #-} + {-# INLINE uncons #-} + {-# INLINE nullS #-} + {-# INLINE takeS #-} + {-# INLINE dropS #-} + {-# INLINE appendS #-} + {-# INLINE concatS #-} + {-# INLINE toString #-} + +-- ------------------------------------------------------------ + +instance StringLike TL.Text where + emptyS = TL.empty + uncons = TL.uncons + nullS = TL.null + headS = TL.head + takeS = TL.take . toEnum + dropS = TL.drop . toEnum + appendS = TL.append + concatS = TL.concat + toString = TL.unpack + + {-# INLINE emptyS #-} + {-# INLINE uncons #-} + {-# INLINE nullS #-} + {-# INLINE takeS #-} + {-# INLINE dropS #-} + {-# INLINE appendS #-} + {-# INLINE concatS #-} + {-# INLINE toString #-} + +-- ------------------------------------------------------------ + +instance StringLike B.ByteString where + emptyS = B.empty + uncons = B.uncons + nullS = B.null + headS = B.head + takeS = B.take + dropS = B.drop + appendS = B.append + concatS = B.concat + toString = B.unpack + + {-# INLINE emptyS #-} + {-# INLINE uncons #-} + {-# INLINE nullS #-} + {-# INLINE takeS #-} + {-# INLINE dropS #-} + {-# INLINE appendS #-} + {-# INLINE concatS #-} + {-# INLINE toString #-} + +-- ------------------------------------------------------------ + +instance StringLike BL.ByteString where + emptyS = BL.empty + uncons = BL.uncons + nullS = BL.null + headS = BL.head + takeS = BL.take . toEnum + dropS = BL.drop . toEnum + appendS = BL.append + concatS = BL.concat + toString = BL.unpack + + {-# INLINE emptyS #-} + {-# INLINE uncons #-} + {-# INLINE nullS #-} + {-# INLINE takeS #-} + {-# INLINE dropS #-} + {-# INLINE appendS #-} + {-# INLINE concatS #-} + {-# INLINE toString #-} + +-- ------------------------------------------------------------ + + diff --git a/src/Text/Regex/XMLSchema/String.hs b/src/Text/Regex/XMLSchema/String.hs new file mode 100644 index 0000000..5fef800 --- /dev/null +++ b/src/Text/Regex/XMLSchema/String.hs @@ -0,0 +1,386 @@ +-- ------------------------------------------------------------ + +{- | + Copyright : Copyright (C) 2010- Uwe Schmidt + License : MIT + + Maintainer : Uwe Schmidt + Stability : stable + Portability: portable + + Convenient functions for W3C XML Schema Regular Expression Matcher for Strings. 
+ A specialisation of Text.Regex.XMLSchema.Generic as + compatibility module to old non generic version + + Grammar can be found under +-} + +-- ------------------------------------------------------------ + +module Text.Regex.XMLSchema.String + {-# DEPRECATED "use the more general 'Text.Regex.XMLSchema.Generic' instead" #-} + ( Regex + + , grep + , grepExt + , grepRE + , grepREwithLineNum + + , match + , matchExt + , matchSubex + + , sed + , sedExt + + , split + , splitExt + , splitSubex + + , tokenize + , tokenizeExt + , tokenize' + , tokenizeExt' + , tokenizeSubex + + , matchRE + , matchSubexRE + , sedRE + , splitRE + , splitSubexRE + , tokenizeRE + , tokenizeRE' + , tokenizeSubexRE + + -- Text.Regex.XMLSchema.Generic.Regex + , mkZero + , mkUnit + , mkSym1 + , mkSymRng + , mkWord + , mkDot + , mkStar + , mkAll + , mkAlt + , mkElse + , mkSeq + , mkSeqs + , mkRep + , mkRng + , mkOpt + , mkDiff + , mkIsect + , mkExor + , mkCompl + , mkBr + , isZero + , errRegex + + -- Text.Regex.XMLSchema.Generic.RegexParser + , parseRegex + , parseRegexExt + , parseContextRegex + ) +where + +import Text.Regex.XMLSchema.Generic (Regex) +import qualified Text.Regex.XMLSchema.Generic as G +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.RegexParser + +-- ------------------------------------------------------------ + +-- | split a string by taking the longest prefix matching a regular expression +-- +-- @Nothing@ is returned in case there is no matching prefix, +-- else the pair of prefix and rest is returned + +splitRE :: Regex -> String -> Maybe (String, String) +splitRE = G.splitRE + +-- | convenient function for 'splitRE' +-- +-- examples: +-- +-- > split "a*b" "abc" = ("ab","c") +-- > split "a*" "bc" = ("", "bc") -- "a*" matches "" +-- > split "a+" "bc" = ("", "bc") -- "a+" does not match, no split +-- > split "[" "abc" = ("", "abc") -- "[" syntax error, no split + +split :: String -> String -> (String, String) +split = G.split + +-- | split with extended syntax + +splitExt :: String -> String -> (String, String) +splitExt = G.splitExt + +-- ------------------------------------------------------------ + +-- | split a string by removing the longest prefix matching a regular expression +-- and then return the list of subexpressions found in the matching part +-- +-- @Nothing@ is returned in case of no matching prefix, +-- else the list of pairs of labels and submatches and the +-- rest is returned + +splitSubexRE :: Regex -> String -> Maybe ([(String, String)], String) +splitSubexRE = G.splitSubexRE + +-- | convenient function for 'splitSubex', uses extended syntax +-- +-- examples: +-- +-- > splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") +-- > splitSubex "({2}a*)" "bc" = ([("2","")], "bc") +-- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times +-- > +-- > splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") +-- > -- nondeterminism: 3 matches for a* +-- > +-- > splitSubex "({1}do)|({2}[a-z]+)" "do you know" +-- > = ([("1","do"),("2","do")]," you know") +-- > -- nondeterminism: 2 matches for do +-- > +-- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know" +-- > = ([("1","do")]," you know") +-- > -- no nondeterminism with {|}: 1. 
match for do +-- > +-- > splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match +-- > splitSubex "[" "abc" = ([], "abc") -- syntax error + + +splitSubex :: String -> String -> ([(String,String)], String) +splitSubex = G.splitSubex + +-- ------------------------------------------------------------ + +-- | The function, that does the real work for 'tokenize' + +tokenizeRE :: Regex -> String -> [String] +tokenizeRE = G.tokenizeRE + +-- | split a string into tokens (words) by giving a regular expression +-- which all tokens must match. +-- +-- Convenient function for 'tokenizeRE' +-- +-- This can be used for simple tokenizers. +-- It is recommended to use regular expressions where the empty word does not match. +-- Else there will appear a lot of probably useless empty tokens in the output. +-- All none matching chars are discarded. If the given regex contains syntax errors, +-- @Nothing@ is returned +-- +-- examples: +-- +-- > tokenize "a" "aabba" = ["a","a","a"] +-- > tokenize "a*" "aaaba" = ["aaa","a"] +-- > tokenize "a*" "bbb" = ["","",""] +-- > tokenize "a+" "bbb" = [] +-- > +-- > tokenize "a*b" "" = [] +-- > tokenize "a*b" "abc" = ["ab"] +-- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"] +-- > +-- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" +-- > = ["ab","123","456.7","abc"] +-- > +-- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" +-- > = ["cab","123","456.7","abc"] +-- > +-- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" +-- > = ["abc","def","xyz"] +-- > +-- > tokenize ".*" "\nabc\n123\n\nxyz\n" +-- > = ["","abc","123","","xyz"] +-- > +-- > tokenize ".*" = lines +-- > +-- > tokenize "[^ \t\n\r]*" = words + +tokenize :: String -> String -> [String] +tokenize = G.tokenize + +-- | tokenize with extended syntax + +tokenizeExt :: String -> String -> [String] +tokenizeExt = G.tokenizeExt + +-- ------------------------------------------------------------ + +-- | split a string into tokens and delimierter by giving a regular expression +-- wich all tokens must match +-- +-- This is a generalisation of the above 'tokenizeRE' functions. +-- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@ +-- +-- If the regular expression contains syntax errors @Nothing@ is returned +-- +-- The following Law holds: +-- +-- > concat . map (either id id) . tokenizeRE' re == id + +tokenizeRE' :: Regex -> String -> [Either String String] +tokenizeRE' = G.tokenizeRE' + +-- | convenient function for 'tokenizeRE'' +-- +-- When the regular expression parses as Zero, +-- @[Left input]@ is returned, that means no tokens are found + +tokenize' :: String -> String -> [Either String String] +tokenize' = G.tokenize' + +tokenizeExt' :: String -> String -> [Either String String] +tokenizeExt' = G.tokenizeExt' + +-- ------------------------------------------------------------ + +-- | split a string into tokens (pair of labels and words) by giving a regular expression +-- containing labeled subexpressions. +-- +-- This function should not be called with regular expressions +-- without any labeled subexpressions. This does not make sense, because the result list +-- will always be empty. +-- +-- Result is the list of matching subexpressions +-- This can be used for simple tokenizers. +-- At least one char is consumed by parsing a token. +-- The pairs in the result list contain the matching substrings. +-- All none matching chars are discarded. 
If the given regex contains syntax errors, +-- @Nothing@ is returned + +tokenizeSubexRE :: Regex -> String -> [(String, String)] +tokenizeSubexRE = G.tokenizeSubexRE + +-- | convenient function for 'tokenizeSubexRE' a string +-- +-- examples: +-- +-- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" +-- > "cab123 456.7abc" +-- > = [("name","cab") +-- > ,("num","123") +-- > ,("real","456.7") +-- > ,("name","abc")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" +-- > "12.34" = [("real","12.34") +-- > ,("n","12") +-- > ,("f","34")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" +-- > "12 34" = [("real","12"),("n","12") +-- > ,("real","34"),("n","34")] +-- > +-- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" +-- > "12 34.56" = [("real","12"),("n","12"),("f","") +-- > ,("real","34.56"),("n","34"),("f","56")] + +tokenizeSubex :: String -> String -> [(String,String)] +tokenizeSubex = G.tokenizeSubex + +-- ------------------------------------------------------------ + +-- | sed like editing function +-- +-- All matching tokens are edited by the 1. argument, the editing function, +-- all other chars remain as they are + +sedRE :: (String -> String) -> Regex -> String -> String +sedRE = G.sedRE + +-- | convenient function for 'sedRE' +-- +-- examples: +-- +-- > sed (const "b") "a" "xaxax" = "xbxbx" +-- > sed (\ x -> x ++ x) "a" "xax" = "xaax" +-- > sed undefined "[" "xxx" = "xxx" + +sed :: (String -> String) -> String -> String -> String +sed = G.sed + +sedExt :: (String -> String) -> String -> String -> String +sedExt = G.sedExt + +-- ------------------------------------------------------------ + +-- | match a string with a regular expression + +matchRE :: Regex -> String -> Bool +matchRE = G.matchRE + +-- | convenient function for 'matchRE' +-- +-- Examples: +-- +-- > match "x*" "xxx" = True +-- > match "x" "xxx" = False +-- > match "[" "xxx" = False + +match :: String -> String -> Bool +match = G.match + +-- | match with extended regular expressions + +matchExt :: String -> String -> Bool +matchExt = G.matchExt + +-- ------------------------------------------------------------ + +-- | match a string with a regular expression +-- and extract subexpression matches + +matchSubexRE :: Regex -> String -> [(String, String)] +matchSubexRE = G.matchSubexRE + +-- | convenient function for 'matchRE' +-- +-- Examples: +-- +-- > matchSubex "({1}x*)" "xxx" = [("1","xxx")] +-- > matchSubex "({1}x*)" "y" = [] +-- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] +-- > matchSubex "[" "xxx" = [] + +matchSubex :: String -> String -> [(String, String)] +matchSubex = G.matchSubex + +-- ------------------------------------------------------------ + +-- | grep like filter for lists of strings +-- +-- The regular expression may be prefixed with the usual context spec \"^\" for start of string, +-- and "\\<" for start of word. +-- and suffixed with \"$\" for end of text and "\\>" end of word. 
+-- Word chars are defined by the multi char escape sequence "\\w" +-- +-- Examples +-- +-- > grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"] +-- > grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"] +-- > grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"] +-- > grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"] +-- > grep "\\ ["x a b", " ax "] +-- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "] + +grep :: String -> [String] -> [String] +grep = G.grep + +-- | grep with extended regular expressions + +grepExt :: String -> [String] -> [String] +grepExt = G.grepExt + +-- | grep with already prepared Regex (ususally with 'parseContextRegex') + +grepRE :: Regex -> [String] -> [String] +grepRE = G.grepRE + +-- | grep with Regex and line numbers + +grepREwithLineNum :: Regex -> [String] -> [(Int,String)] +grepREwithLineNum = G.grepREwithLineNum + +-- ------------------------------------------------------------ diff --git a/test/Benchmark.hs b/test/Benchmark.hs new file mode 100644 index 0000000..dc62809 --- /dev/null +++ b/test/Benchmark.hs @@ -0,0 +1,74 @@ +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- ------------------------------------------------------------ + +module Main +where + +import Control.DeepSeq + +import Criterion.Main + +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy.Char8 as BL +import Data.String (IsString (..)) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL + +import Text.Regex.XMLSchema.Generic +import Text.Regex.XMLSchema.Generic.StringLike + +-- import Debug.Trace + +-- ------------------------------------------------------------ + +type BS = B.ByteString +type BL = BL.ByteString +type Text = T.Text +type TextL = TL.Text + +-- ------------------------------------------------------------ + +benchSTB :: String -> + (forall s . (NFData s, StringLike s) => s -> [s]) -> + (String, String -> [String]) -> + String -> + Benchmark +benchSTB name fct ref inp + = benchSTB' name fct ref $! mkInput inp + +benchSTB' :: String -> + (forall s . 
(NFData s, StringLike s) => s -> [s]) -> + (String, String -> [String]) -> + (String, Text, TextL, BS, BL) -> + Benchmark +benchSTB' name fct (refName, ref) (s, t, tl, bs, bl) + = bgroup name + [ bench refName $ nf ref s + , bench "String" $ nf fct s + , bench "Text" $ nf fct t + , bench "Text.Lazy" $ nf fct tl + , bench "ByteString" $ nf fct bs + , bench "ByteString.Lazy" $ nf fct bl + ] + +mkInput :: String -> (String, Text, TextL, BS, BL) +mkInput s + = rnf t5 `seq` t5 + where + t5 = (s, fromString s, fromString s, fromString s, fromString s) + +words' :: StringLike s => s -> [s] +words' inp + = tokenize (fromString "\\w+") inp + + +main :: IO () +main + = do + defaultMain [ benchSTB "100,000-words" words' ("words", words) + $ unwords (replicate 100000 "1234567890") + ] + +-- ------------------------------------------------------------ diff --git a/test/Date.hs b/test/Date.hs new file mode 100644 index 0000000..febd90c --- /dev/null +++ b/test/Date.hs @@ -0,0 +1,633 @@ +{-# LANGUAGE ExplicitForAll #-} +{-# LANGUAGE Rank2Types #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} + +module Main +where + +import Control.Arrow ( (***), second ) + +import Data.Char ( toLower, toUpper ) +import Data.List ( isPrefixOf ) +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy.Char8 as BL +import Data.String (IsString (..)) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL + +import System.Exit (ExitCode (..), exitWith) + +import Test.HUnit + +import Text.Parsec +import Text.Regex.XMLSchema.Generic +import Text.Regex.XMLSchema.Generic.StringLike + +-- ------------------------------------------------------------ + +newtype Test' a = Test' {unTest' :: Test} + +type BS = B.ByteString +type BL = BL.ByteString +type Text = T.Text +type TextL = TL.Text + +-- ------------------------------------------------------------ +-- some little helpers for building r.e.s + +star :: String -> String +star = (++ "*") . pars + +plus :: String -> String +plus = (++ "+") . pars + +opt :: String -> String +opt = (++ "?") . pars + +dot :: String -> String +dot = (++ "\\.") + +pars :: String -> String +pars = ("(" ++) . (++ ")") + +orr :: String -> String -> String +orr x y = pars $ pars x ++ "|" ++ pars y + +xor :: String -> String -> String +xor x y = pars $ pars x ++ "{|}" ++ pars y + +nocase :: String -> String +nocase (x:xs) = '[' : toUpper x : toLower x : ']' : xs +nocase [] = error "nocase with empty list" + +alt :: [String] -> String +alt = pars . foldr1 orr + +altNC :: [String] -> String +altNC = pars . alt . map nocase + +subex :: String -> String -> String +subex n e = pars $ "{" ++ n ++ "}" ++ pars e + +ws :: String +ws = "\\s" + +ws0 :: String +ws0 = star ws + +ws1 :: String +ws1 = plus ws + +s0 :: String -> String -> String +s0 x y = x ++ ws0 ++ y + +-- the date and time r.e.s + +day :: String +day = "(0?[1-9]|[12][0-9]|3[01])" + +month :: String +month = "(0?[1-9]|1[0-2])" + +year2 :: String +year2 = "[0-5][0-9]" + +year4 :: String +year4 = "20" ++ year2 + +year :: String +year = year4 `orr` year2 + +year' :: String +year' = "'" ++ year2 + +dayD :: String +dayD = dot day +monthD :: String +monthD = dot month + +dayMonthYear :: String +dayMonthYear = dayD `s0` monthD `s0` year +dayMonth :: String +dayMonth = dayD `s0` monthD + +dayOfWeekL :: String +dayOfWeekL = altNC + [ "montag" + , "dienstag" + , "mittwoch" + , "donnerstag" + , "freitag" + , "samstag" + , "sonnabend" + , "sonntag" + ] + +dayOfWeekA :: String +dayOfWeekA = alt . 
map dot $ + [ "Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] + +dayOfWeek :: String +dayOfWeek = dayOfWeekL `orr` dayOfWeekA + +monthL :: String +monthL = altNC + [ "januar" + , "februar" + , "märz" + , "april" + , "mai" + , "juni" + , "juli" + , "august" + , "september" + , "oktober" + , "november" + , "dezember" + ] + +monthA :: String +monthA = altNC . map dot $ map snd monthAbr + +monthAbr :: [(Integer, String)] +monthAbr = (9, "sept") : + zip [1..12] + [ "jan", "feb", "mär", "apr", "mai", "jun", "jul", "aug", "sep", "okt", "nov", "dez"] + +monthN :: String +monthN = pars $ monthL `orr` monthA + +hour :: String +hour = pars "([0-1]?[0-9])|(2[0-4])" + +minute :: String +minute = pars "(0?[0-9])|([1-5][0-9])" + +uhr :: String +uhr = ws0 ++ nocase "uhr" + +hourMin :: String +hourMin = hour ++ ":" ++ minute ++ opt uhr + +wsyear :: String +wsyear = year ++ "/[0-9]{2}" + +wsem :: String +wsem = ("Wi?Se?" `orr` nocase "Wintersemester") ++ ws0 ++ wsyear + +ssem :: String +ssem = ("So?Se?" `orr` nocase "Sommersemester") ++ ws0 ++ year + +sem :: String +sem = wsem `orr` ssem + +num :: String +num = "\\d+" + +-- the token types + +tokenRE :: String +tokenRE = foldr1 xor $ + map (uncurry subex) $ + [ ( "ddmmyyyy", dayMonthYear ) + , ( "ddMonthyyyy", dayD `s0` monthN `s0` (year `orr` year') ) + , ( "ddmm", dayMonth) + , ( "ddMonth", dayD `s0` monthN ) + , ( "yyyymmdd", year ++ "[-/]" ++ month ++ "[-/]" ++ day ) + , ( "yyyy", year4 `orr` ("'" ++ year2) ) + , ( "month", monthN ) + , ( "weekday", dayOfWeek ) + , ( "HHMM", hourMin ++ opt uhr ) + , ( "HH", hour ++ uhr ) + , ( "wsem", wsem) + , ( "ssem", ssem) + , ( "word", "[\\w\\d]+") + , ( "del", "[^\\w\\d]+") + ] + + +-- ------------------------------------------------------------ + +type Token = (String, String) +type TokenStream = [Token] + +type DateParser a = Parsec [(String, String)] () a + +type StringFct = String -> String -- for fast concatenation + +-- must be extended for weekday or semester, if neccessay + +data DateVal = DT { _year :: ! Int + , _month :: ! Int + , _day :: ! Int + , _hour :: ! Int + , _min :: ! Int + } + deriving (Eq, Show) + +data DateParse = DP { _pre :: StringFct + , _rep :: StringFct + , _dat :: ! DateVal + } + +-- just a helper for result output +data DateRep = DR { _p :: String + , _r :: String + , _d :: ! DateVal + } + deriving (Eq, Show) + +-- ------------------------------------------------------------ + +emptyText :: StringFct +emptyText = id + +mkText :: String -> StringFct +mkText = (++) + +concText :: StringFct -> StringFct -> StringFct +concText = (.) 
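+
+-- Note, added for illustration (not part of the original test code):
+-- StringFct is the usual "difference list" trick, i.e. a partially applied
+-- (++). concText composes these functions, so each append is O(1), and
+-- textToString (defined just below) materialises the result once by
+-- applying the composed function to [].
+--
+-- > textToString (mkText "foo" `concText` mkText "bar")  == "foobar"
+-- > textToString emptyText                               == ""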
+ +textToString :: StringFct -> String +textToString = ($ []) + +emptyDateVal :: DateVal +emptyDateVal = DT { _year = -1 + , _month = -1 + , _day = -1 + , _hour = -1 + , _min = -1 + } + +emptyDateParse :: DateParse +emptyDateParse = DP { _pre = emptyText + , _rep = emptyText + , _dat = emptyDateVal + } + +appPre :: String -> DateParse -> DateParse +appPre s d = d { _pre = _pre d `concText` mkText s } + +appRep :: String -> DateParse -> DateParse +appRep s d = d { _rep = _rep d `concText` mkText s } + +setDay :: Int -> Int -> Int -> DateParse -> DateParse +setDay j m t d = d { _dat = setDateVal j m t (-1) (-1) (_dat d) } + +setHour :: Int -> Int -> DateParse -> DateParse +setHour h m d = d { _dat = setDateVal (-1) (-1) (-1) h m (_dat d) } + +setDateVal :: Int -> Int -> Int -> Int -> Int -> DateVal -> DateVal +setDateVal j m t s i (DT j' m' t' s' i' ) + = DT j'' m'' t'' s'' i'' + where + j'' | j < 0 = j' -- year not there + | j < 100 = j + 2000 -- 2 digit year + | otherwise = j -- 4 digit year + m'' = m `max` m' + t'' = t `max` t' + s'' = s `max` s' + i'' = i `max` i' + +datePToDateRep :: DateParse -> DateRep +datePToDateRep dp + = DR { _p = textToString $ _pre dp + , _r = textToString $ _rep dp + , _d = _dat dp + } + +-- ------------------------------------------------------------ + +-- a simple helper for showing the results + +dateSearch' :: TokenStream -> [DateRep] +dateSearch' = map datePToDateRep . + dateSearch + +-- look for a sequence of date specs, the last entry in the list +-- does not contain a valid date, but just the context behind the last real date + +dateSearch :: TokenStream -> [DateParse] +dateSearch = either (const []) id . + parse (many (dateParser emptyDateParse)) "" + +-- all date parsers thread a state the subparsers to accumulate +-- the parts of a date, the context, the external representation and +-- the pure data, year, month, day, ... + +dateParser :: DateParse -> DateParser DateParse +dateParser d = ( do + s <- fillTok + dateParser0 (appPre s d) + ) + <|> + parseDate d -- here is the hook for the real date parser + <|> + ( do + s <- textTok -- the default case: if parseDate fails + dateParser0 (appPre s d) -- the token is handled like a normal word + ) + +dateParser0 :: DateParse -> DateParser DateParse +dateParser0 d = dateParser d <|> return d + + +parseDate :: DateParse -> DateParser DateParse +parseDate d = parseDate0 d + <|> + try + ( do + d1 <- parseWeekDay d + lookAheadN 3 parseDate0 d1 -- Freitag, den 13. + ) + +-- parse a date optionally followed by a time +parseDate0 :: DateParse -> DateParser DateParse +parseDate0 d = ( do + d1 <- parseDay d + option d1 (parseFollowingHour d1) + ) + +-- parse a simple token for a day +parseDay :: DateParse -> DateParser DateParse +parseDay d = ( do + (s, d') <- parseDateTok "ddmmyyyy" d + let [t, m, j] = tokenize num s + return $ setDay (read j) (read m) (read t) d' + ) + <|> + ( do + (s, d') <- parseDateTok "ddMonthyyyy" d + let s' = sed ((++ ".") . monthToM) monthN s + let [t, m, j] = tokenize num s' + return $ setDay (read j) (read m) (read t) d' + ) + <|> + ( do + (s, d') <- parseDateTok "ddmm" d + let [t, m] = tokenize num s + return $ setDay (-1) (read m) (read t) d' + ) + <|> + ( do + (s, d') <- parseDateTok "ddMonth" d + let s' = sed ((++ ".") . 
monthToM) monthN s + let [t, m] = tokenize num s' + return $ setDay (-1) (read m) (read t) d' + ) + <|> + ( do + (s, d') <- parseDateTok "yyyymmdd" d + let [j, m, t] = tokenize num s + return $ setDay (read j) (read m) (read t) d' + ) + +parseYear :: DateParse -> DateParser DateParse +parseYear d = ( do + (s, d') <- parseDateTok "yyyy" d + let [j] = tokenize num s + return $ setDay (read j) (-1) (-1) d' + ) + +-- parse a weekday and add it to the external rep. + +parseWeekDay :: DateParse -> DateParser DateParse +parseWeekDay d = ( do + (_s, d') <- parseDateTok "weekday" d + return d' + ) + +-- parse a following hour spec, 5 fill tokens, words or delimiters are possible + +parseFollowingHour :: DateParse -> DateParser DateParse +parseFollowingHour + = try . -- backtracking becomes neccessary + lookAheadN 5 parseHour -- max 2 words and 3 delimiters + +-- parse the simple time formats +parseHour :: DateParse -> DateParser DateParse +parseHour d = ( do + (s, d') <- parseDateTok "HHMM" d + let [h, m] = tokenize num s + return $ setHour (read h) (read m) d' + ) + <|> + ( do + (s, d') <- parseDateTok "HH" d + let [h] = tokenize num s + return $ setHour (read h) 0 d' + ) + +-- ------------------------------------------------------------ +-- +-- auxiliary parser combinators + +-- parse a token of a given type and add the text to the external rep. + +parseDateTok :: String -> DateParse -> DateParser (String, DateParse) +parseDateTok tty d + = dateTok (isTokType (== tty)) d + +dateTok :: DateParser String -> DateParse -> DateParser (String, DateParse) +dateTok t d = ( do + s <- t + return (s, appRep s d) + ) + +-- try to apply a parser, but first skip a given # of fill tokens + +lookAheadN :: Int -> (DateParse -> DateParser DateParse) -> DateParse -> DateParser DateParse +lookAheadN n p d + | n <= 0 = p d + | otherwise = do + (_, d1) <- dateTok fillTok d + ( lookAheadN (n - 1) p d1 <|> p d1 ) + +-- ------------------------------------------------------------ +-- +-- basic token parsers + +-- the interface to the primitive parsec token parser +tok :: (Token -> Bool) -> DateParser Token +tok prd = tokenPrim showTok nextPos testTok + where + showTok = show . fst + nextPos pos _tok _ts = incSourceColumn pos 1 + testTok tk = if prd tk then Just tk else Nothing + +-- check for specific token type and in case of success return the text value +isTokType :: (String -> Bool) -> DateParser String +isTokType isT = tok (isT . fst) >>= return . snd + +-- parse an arbitrary token and return the text value +textTok :: DateParser String +textTok = isTokType (const True) + +-- a word +wordTok :: DateParser String +wordTok = isTokType (== "word") + +-- a delimiter, whitespace is normalized, sequences are reduced to a single space char +delTok :: DateParser String +delTok = isTokType (== "del") + >>= + return . sed (const " ") ws1 + +-- tokens that don't contain date info + +fillTok :: DateParser String +fillTok = delTok <|> wordTok + +-- semester tokens, not yet interpreted +semTok' :: String -> DateParser (String, Int, Bool) +semTok' sem' = do v <- isTokType (== sem') + return (v, read . head . tokenizeExt year $ v, sem' == "ssem") + +semTok :: DateParser (String, Int, Bool) +semTok = semTok' "ssem" <|> semTok' "wsem" + +-- ------------------------------------------------------------ + +-- conversion from month names to 1..12 +monthToM :: String -> String +monthToM m + = show . + (\ l -> if null l then 99 else head l) . + map fst . + filter ((== True) . snd) . 
+ map (second (`isPrefixOf` map toLower m)) $ + monthAbr + +-- ------------------------------------------------------------ + +ts :: String +ts = "Am Sonntag, dem 17. Februar '03 findet um 9 Uhr ein wichtiger Termin für das Sommersemester 2000 statt. " + ++ "Dieser wird allerdings auf Montag verschoben. Und zwar auf den ersten Montag im Wintersemester 11/12, 12:30. " + ++ "Ein wichtiger Termin findet im SoSe 2011 statt. Im Jahr '12 gibt es Termine, aber auch in WS 2010/11. " + ++ "Ein weiterer Termin ist am 2.4.11 um 12 Uhr. Oder war es doch Di. der 3.4.? Egal. " + ++ "Ein weiterer wichtiger Termin findet am 2001-3-4 statt bzw. generell zwischen 01/3/4 - 01/6/4 um 13 Uhr. " + ++ "Am kommenden Mittwoch findet Changemanagement in HS5 statt. Dies gilt dann auch für den 7. Juni " + ++ "des Jahres 2011. Noch ein wichtiger Termin findet um 16:15 Uhr am Do., 1.2.03 statt. " + ++ "Freitag, der 13. Juli ist kein Glückstag" + ++ "und Freitag, der 13. Juli um 11:55 Uhr ist es zu spät." + +rrr :: [String] +rrr = map _r . dateSearch' . tokenizeSubex tokenRE $ ts + +ddd :: [DateVal] +ddd = map _d . dateSearch' . tokenizeSubex tokenRE $ ts + +aaa :: [DateRep] +aaa = dateSearch' . tokenizeSubex tokenRE $ ts + +tt :: String -> [(String, String)] +tt = tokenizeSubex tokenRE + +dd :: String -> [DateVal] +dd = map _d . dateSearch' . tt + +rr :: String -> [String] +rr = map _r . dateSearch' . tt + +pp :: String -> [String] +pp = map _p . dateSearch' . tt + +-- ------------------------------------------------------------ + +testDate :: forall a . StringLike a => Test' a +testDate + = Test' $ + TestLabel "date and time extraction from free text" $ + TestList $ + zipWith parseT toks exx + where + parseT res ok + = TestCase $ + assertEqual (show res ++ " == " ++ show ok) res ok + + toks :: [(a, a)] + toks = tokenizeSubex (fromString tokenRE) (fromString ts) + + exx :: [(a, a)] + exx = map (fromString *** fromString) $ + [("word","Am"),("del"," "),("weekday","Sonntag"),("del",", "),("word","dem"),("del"," ") + ,("ddMonthyyyy","17. Februar '03"),("del"," "),("word","findet"),("del"," "),("word","um") + ,("del"," "),("HH","9 Uhr"),("del"," "),("word","ein"),("del"," "),("word","wichtiger") + ,("del"," "),("word","Termin"),("del"," "),("word","f\252r"),("del"," "),("word","das") + ,("del"," "),("ssem","Sommersemester 2000"),("del"," "),("word","statt"),("del",". ") + ,("word","Dieser"),("del"," "),("word","wird"),("del"," "),("word","allerdings") + ,("del"," "),("word","auf"),("del"," "),("weekday","Montag"),("del"," ") + ,("word","verschoben"),("del",". "),("word","Und"),("del"," "),("word","zwar") + ,("del"," "),("word","auf"),("del"," "),("word","den"),("del"," "),("word","ersten") + ,("del"," "),("weekday","Montag"),("del"," "),("word","im"),("del"," ") + ,("wsem","Wintersemester 11/12"),("del",", "),("HHMM","12:30"),("del",". ") + ,("word","Ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin") + ,("del"," "),("word","findet"),("del"," "),("word","im"),("del"," "),("ssem","SoSe 2011") + ,("del"," "),("word","statt"),("del",". "),("word","Im"),("del"," "),("word","Jahr") + ,("del"," '"),("word","12"),("del"," "),("word","gibt"),("del"," "),("word","es") + ,("del"," "),("word","Termine"),("del",", "),("word","aber"),("del"," "),("word","auch") + ,("del"," "),("word","in"),("del"," "),("wsem","WS 2010/11"),("del",". 
"),("word","Ein") + ,("del"," "),("word","weiterer"),("del"," "),("word","Termin"),("del"," "),("word","ist") + ,("del"," "),("word","am"),("del"," "),("ddmmyyyy","2.4.11"),("del"," "),("word","um") + ,("del"," "),("HH","12 Uhr"),("del",". "),("word","Oder"),("del"," "),("word","war") + ,("del"," "),("word","es"),("del"," "),("word","doch"),("del"," "),("weekday","Di.") + ,("del"," "),("word","der"),("del"," "),("ddmm","3.4."),("del","? "),("word","Egal") + ,("del",". "),("word","Ein"),("del"," "),("word","weiterer"),("del"," ") + ,("word","wichtiger"),("del"," "),("word","Termin"),("del"," "),("word","findet") + ,("del"," "),("word","am"),("del"," "),("yyyymmdd","2001-3-4"),("del"," ") + ,("word","statt"),("del"," "),("word","bzw"),("del",". "),("word","generell") + ,("del"," "),("word","zwischen"),("del"," "),("yyyymmdd","01/3/4"),("del"," - ") + ,("yyyymmdd","01/6/4"),("del"," "),("word","um"),("del"," "),("HH","13 Uhr") + ,("del",". "),("word","Am"),("del"," "),("word","kommenden"),("del"," ") + ,("weekday","Mittwoch"),("del"," "),("word","findet"),("del"," ") + ,("word","Changemanagement"),("del"," "),("word","in"),("del"," "),("word","HS5") + ,("del"," "),("word","statt"),("del",". "),("word","Dies"),("del"," "),("word","gilt") + ,("del"," "),("word","dann"),("del"," "),("word","auch"),("del"," "),("word","f\252r") + ,("del"," "),("word","den"),("del"," "),("ddMonth","7. Juni"),("del"," "),("word","des") + ,("del"," "),("word","Jahres"),("del"," "),("yyyy","2011"),("del",". "),("word","Noch") + ,("del"," "),("word","ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin") + ,("del"," "),("word","findet"),("del"," "),("word","um"),("del"," "),("HHMM","16:15 Uhr") + ,("del"," "),("word","am"),("del"," "),("weekday","Do."),("del",", "),("ddmmyyyy","1.2.03") + ,("del"," "),("word","statt"),("del",". "),("weekday","Freitag"),("del",", ") + ,("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","ist"),("del"," ") + ,("word","kein"),("del"," "),("word","Gl\252ckstagund"),("del"," "),("weekday","Freitag") + ,("del",", "),("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","um") + ,("del"," "),("HHMM","11:55 Uhr"),("del"," "),("word","ist"),("del"," "),("word","es") + ,("del"," "),("word","zu"),("del"," "),("word","sp\228t"),("del",".") + ] + + +-- ------------------------------------------------------------ + +genericTest :: (forall a . 
StringLike a => Test' a) -> Test +genericTest t + = TestList $ + [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) + , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) + , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) + , TestLabel "Test with 'ByteString'" $ unTest' (t :: Test' BS) + , TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL) + ] + +allTests :: Test +allTests + = TestList + [ genericTest testDate ] + +main :: IO () +main + = do + c <- runTestTT allTests + putStrLn $ show c + let errs = errors c + fails = failures c + exitWith (codeGet errs fails) + +codeGet :: Int -> Int -> ExitCode +codeGet errs fails + | fails > 0 = ExitFailure 2 + | errs > 0 = ExitFailure 1 + | otherwise = ExitSuccess + +-- ------------------------------------------------------------ diff --git a/test/SimpleTest.hs b/test/SimpleTest.hs new file mode 100644 index 0000000..aceda45 --- /dev/null +++ b/test/SimpleTest.hs @@ -0,0 +1,470 @@ +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} + +-- ------------------------------------------------------------ + +module Main +where + +import Control.Arrow + +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy.Char8 as BL +import Data.String (IsString (..)) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL + +import System.Exit (ExitCode (..), + exitWith) + +import Text.Regex.XMLSchema.Generic +import Text.Regex.XMLSchema.Generic.Regex +import Text.Regex.XMLSchema.Generic.StringLike + +import Test.HUnit + +-- ------------------------------------------------------------ + +newtype Test' a = Test' {unTest' :: Test} + +type BS = B.ByteString +type BL = BL.ByteString +type Text = T.Text +type TextL = TL.Text + +-- ------------------------------------------------------------ + +parseTestsStdLatin1 :: forall s . StringLike s => Test' s +parseTestsStdLatin1 = parseTestsStd' testsLatin1 + +parseTestsStdUnicode :: forall s . StringLike s => Test' s +parseTestsStdUnicode = parseTestsStd' testsUnicode + +parseTestsStd' :: forall s . StringLike s => [(String, String)] -> Test' s +parseTestsStd' tests + = Test' $ + TestLabel "standard XML parse tests" $ + TestList $ + map parseTest $ tests + where + parseTest (re0, rep) + = TestCase $ + assertEqual (show re ++ " must be parsed as " ++ show rep) + rep + (show . parseRegexExt $ re) + where + re :: s + re = fromString re0 + +testsLatin1 :: [(String, String)] +testsLatin1 + = [ ("", "()") + , (".", ".") + , (".*", "(.*)") + , ("(())", "()") + , ("(a*)*", "(a*)") + , ("(a*)+", "(a*)") + , ("(a+)*", "(a*)") + , ("(a+)+", "(a+)") + , ("(a?){2,}", "(a*)") + , ("((a?){2,}){0,}", "(a*)") + , ("((a?){2,}){3,}", "(a*)") + , ("(a{0,}){2,}", "(a*)") + , ("(a{2,}){3,}", "(a{6,})") + , ("[9-0]", "{empty char range}") + , ("[0-9]", "[0-9]") + , ("[0-99-0]", "[0-9]") + , ("[abc]", "[a-c]") + , ("[abc-[b]]", "[ac]" ) + , ("a|b|c|d", "[a-d]" ) + , ("(a|b)|c", "[a-c]" ) + , ("a|(b|c)", "[a-c]" ) + , ("abc", "(a(bc))" ) -- seq is right ass + , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions + , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) + ] + +testsUnicode :: [(String, String)] +testsUnicode + = [ ("[\0-\1114111]", "\\a") + , ("[\0-\1114111]|[0-9]", "\\a") + , ("[\0-\1114110]", "[�-􏿾]" ) + ] + +parseTestsExtLatin1 :: forall s . StringLike s => Test' s +parseTestsExtLatin1 = parseTestsExt' testsExtLatin1 + +parseTestsExtUnicode :: forall s . 
StringLike s => Test' s +parseTestsExtUnicode = parseTestsExt' testsExtUnicode + +parseTestsExt' :: forall s . StringLike s => [(String, String)] -> Test' s +parseTestsExt' tests + = Test' $ + TestLabel "extended parse tests" $ + TestList $ + map parseTest $ tests + where + parseTest (re0, rep) + = TestCase $ + assertEqual (show re ++ " must be parsed as " ++ show rep) + rep + (show . parseRegexExt $ re) + where + re :: s + re = fromString re0 + +testsExtLatin1 :: [(String, String)] +testsExtLatin1 + = [ ("", "()") + , (".", ".") + , (".*", "(.*)") + , ("\\a", "\\a") + , ("\\A", "\\A") + , ("(())", "()") + , ("(a*)*", "(a*)") + , ("(a*)+", "(a*)") + , ("(a+)*", "(a*)") + , ("(a+)+", "(a+)") + , ("(a?){2,}", "(a*)") + , ("((a?){2,}){0,}", "(a*)") + , ("((a?){2,}){3,}", "(a*)") + , ("(a{0,}){2,}", "(a*)") + , ("(a{2,}){3,}", "(a{6,})") + , ("[9-0]", "{empty char range}") + , ("[0-9]", "[0-9]") + , ("[0-99-0]", "[0-9]") + , ("[abc]", "[a-c]") + , ("[abc-[b]]", "[ac]" ) + , ("a|b|c|d", "[a-d]" ) + , ("(a|b)|c", "[a-c]" ) + , ("a|(b|c)", "[a-c]" ) + , ("abc", "(a(bc))" ) -- seq is right ass + , ("a*{^}b*", "((a*){^}(b*))" ) -- extension: exor + , ("a*{^}b*{^}c*", "((a*){^}((b*){^}(c*)))") + , ("a*{&}b*", "((a*){&}(b*))" ) -- extension: intersection + , ("a*{&}b*{&}c*", "((a*){&}((b*){&}(c*)))") + , ("a*{\\}b*", "((a*){\\}(b*))" ) -- extension: set difference + , ("a*{\\}b*{\\}c*", "(((a*){\\}(b*)){\\}(c*))" ) + , ("(a|b)*{\\}(.*aa.*)", "(([ab]*){\\}((.*)(a(a(.*)))))" ) + , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions + , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) + , ("({1}do){|}({2}[a-z]+)", "(({1}(do)){|}({2}([a-z]+)))" ) -- deterministic choice of submatches + , ("a{:}b{:}c", "(a{:}(b{:}c))" ) -- interleave + ] + +testsExtUnicode :: [(String, String)] +testsExtUnicode + = [ ("[\0-\1114111]", "\\a") + , ("[\0-\1114111]|[0-9]", "\\a") + , ("[\0-\1114110]", "[�-􏿾]" ) + , ("[abc-[b]]", "[ac]" ) + , ("a|b|c|d", "[a-d]" ) + , ("(a|b)|c", "[a-c]" ) + , ("a|(b|c)", "[a-c]" ) + , ("abc", "(a(bc))" ) -- seq is right ass + , ("a*{^}b*", "((a*){^}(b*))" ) -- extension: exor + , ("a*{^}b*{^}c*", "((a*){^}((b*){^}(c*)))") + , ("a*{&}b*", "((a*){&}(b*))" ) -- extension: intersection + , ("a*{&}b*{&}c*", "((a*){&}((b*){&}(c*)))") + , ("a*{\\}b*", "((a*){\\}(b*))" ) -- extension: set difference + , ("a*{\\}b*{\\}c*", "(((a*){\\}(b*)){\\}(c*))" ) + , ("(a|b)*{\\}(.*aa.*)", "(([ab]*){\\}((.*)(a(a(.*)))))" ) + , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions + , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) + , ("({1}do){|}({2}[a-z]+)", "(({1}(do)){|}({2}([a-z]+)))" ) -- deterministic choice of submatches + , ("a{:}b{:}c", "(a{:}(b{:}c))" ) -- interleave + ] + +simpleMatchTests :: forall a . StringLike a => Test' a +simpleMatchTests + = Test' $ + TestLabel "simple match tests" $ + TestList $ + concatMap matchTest $ testsMatch + where + matchTest :: (String, [String], [String]) -> [Test] + matchTest (re0, ok, er) + = map (matchOK re . fromString) ok + ++ + map (matchErr re . 
fromString) er + where + re :: a + re = fromString re0 + + matchOK :: a -> a -> Test + matchOK re xs + = TestCase $ assertBool (show xs ++ " must match " ++ show re) (matchExt re xs) + matchErr re xs + = TestCase $ assertBool (show xs ++ " must not match " ++ show re) (not (matchExt re xs)) + +testsMatch :: [(String, [String], [String])] +testsMatch + = [ ( "" + , [""] + , ["a"] + ) + , ( "a" + , ["a"] + , ["", "b", "ab"] + ) + , ( "()" + , [""] + , ["a"] + ) + , ( "ab" + , ["ab"] + , ["", "b", "abc"] + ) + , ( "." + , [".","a","\0","\1114111"] + , ["\n","\r","",".."] + ) + , ( "\\a" + , [".","a","\n","\r","\0","\1114111"] + , ["",".."] + ) + , ( "\\A" + , ["",".","a","\n","\r","\0","\1114111",".."] + , [] + ) + , ( "a*" + , ["", "a", "aa"] + , ["b", "ab", "aab"] + ) + , ( "a+" + , ["a", "aa", "aaa"] + , ["", "b", "ab"] + ) + , ( "a?" + , ["", "a"] + , ["b", "ab"] + ) + , ( "a{2}" + , ["aa"] + , ["", "a", "aaa"] + ) + , ( "a{2,}" + , ["aa","aaa"] + , ["", "a", "aaab"] + ) + , ( "a{2,4}" + , ["aa", "aaa", "aaaa"] + , ["", "a", "aaaaa", "ab"] + ) + , ( "a|b" + , ["a", "b"] + , ["", "c", "ab", "abc"] + ) + , ( "[0-9]" + , ["0", "5", "9"] + , ["", "a", "00"] + ) + , ( "[^0-9]" + , ["a"] + , ["", "0", "9", "00"]) + , ( "\32" + , [" "] + , [] + ) + , ( "[\0-\1114111]" + , ["\0","\1114111","a"] + , ["","aaa"] + ) + , ( "[^\0-\1114111]" + , [] + , ["","aaa","\0","\1114111","a"] + ) + , ( ".*a.*|.*b.*|.*c.*" + , ["a", "abc", "acdc"] + , ["", "dddd"] + ) + , ( ".*a.*{&}.*b.*{&}.*c.*" + , ["abc", "abcd", "abcabcd"] + , ["", "a", "bc", "acdc", "dddd"] + ) + , ( ".*a.*{&}.*b.*{&}.*c.*{&}.{3}" -- all permutations of "abc" + , ["abc", "acb", "bac", "bca", "cab", "cba"] + , ["", "a", "bc", "acd", "aaaa", "aba"] + ) + , ( ".*a.*{&}.*b.*{&}.*c.*" -- all words containing at least 1 a, 1 b and 1 c + , ["abc", "acb", "bac", "bca", "cab", "cba", "abcd", "abcabc"] + , ["", "a", "bc", "acd", "aaaa"] + ) + , ( ".*a.*{^}.*b.*" -- all words containing at least 1 a or 1 b but not both a's and b's + , ["a", "b", "ac", "bc", "aaaa", "bbb", "aacc", "ccbb", "acdc"] + , ["", "ab", "abc", "dddd"] + ) + , ( "/[*](.*{\\}(.*[*]/.*))[*]/" -- single line C comment of form /*...*/, but without any */ in the comment body + -- this is the way to specify none greedy expessions + -- if multi-line comment are required, substitute .* by \A, so newlines are allowed + , ["/**/","/***/","/*x*/","/*///*/"] + , ["", "/", "/*", "/*/", "/**/*/", "/*xxx*/xxx*/"] + ) + , ( "a{:}b{:}c" + , ["abc", "acb", "bac", "bca", "cab", "cba"] + , ["", "a", "ab", "abcc", "abca", "aba"] + ) + ] + +-- ------------------------------------------------------------ + +simpleSplitTests :: forall a . 
StringLike a => Test' a +simpleSplitTests + = Test' $ + TestLabel "simple split tests" $ + TestList $ + map splitTest $ testsSplit + where + splitTest (re0, inp0, tok0, rest0) + = TestCase $ + assertEqual + ("split " ++ show re ++ " " ++ show inp0 ++ " = " ++ show (tok, rest)) + (tok, rest) + (split re (fromString inp0)) + where + re, tok, rest :: a + re = fromString re0 + tok = fromString tok0 + rest = fromString rest0 + +testsSplit :: [(String, String, String, String)] +testsSplit + = [ ("", "a", "", "a" ) + , ("a*b", "abc", "ab", "c" ) + , ("a*", "bc", "", "bc" ) + , ("a+", "bc", "", "bc" ) + , ("[", "bc", "", "bc" ) + , ("a{2}", "aaa", "aa", "a" ) + , ("a{2,}", "aaa", "aaa", "" ) + , ("a|b", "ab", "a", "b" ) + , ("a|b*", "bbba", "bbb", "a" ) + , ("abc", "abcd", "abc", "d" ) + ] + +-- ------------------------------------------------------------ + +simpleTokenTests :: forall a . StringLike a => Test' a +simpleTokenTests + = Test' $ + TestLabel "simple token tests" $ + TestList $ + map tokenTest $ testsToken + where + tokenTest (re0, inp0, toks0) + = TestCase $ + assertEqual + ("tokenize " ++ show re ++ " " ++ show inp ++ " = " ++ show toks) + toks + (tokenize re inp) + where + re, inp :: a + re = fromString re0 + inp = fromString inp0 + toks :: [a] + toks = map fromString toks0 + +testsToken :: [(String, String, [String])] +testsToken + = [ ("", "", [] ) + , ("a", "aba", ["a", "a"] ) + , ("a", "b", [] ) + , ("a", "ba", ["a"] ) + , ("a*", "a", ["a"] ) + , ("a*", "ba", ["","a"] ) + , ("a*", "aba", ["a", "a"] ) + , ("a*", "abba", ["a", "", "a"] ) + , ("a+", "abba", ["a", "a"] ) + , ("a*b", "abba", ["ab", "b"] ) + , (".*", "a\n\nb", ["a", "", "b"] ) + , (".*", "a\n\nb\n", ["a", "", "b"] ) + , ("\\w+", "a\n\nb\n", ["a", "b"] ) + , ("\\w|ab", "aaa\n\nabc\n", ["a", "a", "a", "ab", "c"] ) + , ("\\w|ab", "aaa abc", ["a", "a", "a", "ab", "c"] ) + ] + +-- ------------------------------------------------------------ + +genericTest :: (forall a . StringLike a => Test' a) -> Test +genericTest t + = TestList $ + [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) + , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) + , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) + , TestLabel "Test with 'ByteString'" $ unTest' (t :: Test' BS) + , TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL) + ] + +unicodeTest :: (forall a . 
StringLike a => Test' a) -> Test +unicodeTest t + = TestList $ + [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) + , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) + , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) + ] + +allTests :: Test +allTests + = TestList + [ genericTest parseTestsStdLatin1 + , unicodeTest parseTestsStdUnicode + , genericTest parseTestsExtLatin1 + , unicodeTest parseTestsExtUnicode + , genericTest simpleMatchTests + , genericTest simpleSplitTests + , genericTest simpleTokenTests + ] + +main :: IO () +main + = do + c <- runTestTT allTests + putStrLn $ show c + let errs = errors c + fails = failures c + exitWith (codeGet errs fails) + +codeGet :: Int -> Int -> ExitCode +codeGet errs fails + | fails > 0 = ExitFailure 2 + | errs > 0 = ExitFailure 1 + | otherwise = ExitSuccess + +-- ------------------------------------------------------------ + +deltaTrc :: StringLike s => s -> GenRegex s -> [(s, GenRegex s)] +deltaTrc s@(uncons -> Just (c, cs)) re + = (s, re) + : + ( if isZero re' + then [(emptyS,re')] + else deltaTrc cs re' + ) + where + re' = delta1 c s re +deltaTrc _ re = [(emptyS, re)] + +matchTrc :: StringLike s => s -> s -> (Bool, [(s, GenRegex s)]) +matchTrc re s = (nullable . snd . last $ res, res) + where + res = deltaTrc s (parseRegex re) + + +trcMatch :: StringLike s => s -> s -> IO() +trcMatch re = putStrLn . showTrc . matchTrc re + where + showTrc = + ( (show >>> (++ "\n")) + *** + (concatMap ( ( (toString >>> (++ "\t")) + *** + (show >>> (++"\n")) + ) + >>> uncurry (++) + ) + ) + ) + >>> + uncurry (flip (++)) + +-- ------------------------------------------------------------
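+
+-- The test tables above double as usage examples; a few entries written out
+-- as plain calls (the results are the ones recorded in the tables, the
+-- GHCi-style rendering is only illustrative):
+--
+--   matchExt "a{2,4}" ("aaa"  :: String)   --> True
+--   split    "a|b*"   ("bbba" :: String)   --> ("bbb", "a")
+--   tokenize "a*b"    ("abba" :: String)   --> ["ab", "b"]
+--
+-- trcMatch is a debugging aid that is not run by allTests: it prints, step by
+-- step, the remaining input together with the regex obtained from delta1 after
+-- each character, followed by the overall result of matchTrc.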