Blame src/Text/Regex/XMLSchema/String.hs

Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
{- |
Packit 5b08af
   Copyright  : Copyright (C) 2010- Uwe Schmidt
Packit 5b08af
   License    : MIT
Packit 5b08af
Packit 5b08af
   Maintainer : Uwe Schmidt <uwe@fh-wedel.de>
Packit 5b08af
   Stability  : stable
Packit 5b08af
   Portability: portable
Packit 5b08af
Packit 5b08af
   Convenient functions for W3C XML Schema Regular Expression Matcher for Strings.
Packit 5b08af
   A specialisation of Text.Regex.XMLSchema.Generic as
Packit 5b08af
   compatibility module to old non generic version
Packit 5b08af
Packit 5b08af
   Grammar can be found under <http://www.w3.org/TR/xmlschema11-2/#regexs>
Packit 5b08af
-}
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
module Text.Regex.XMLSchema.String
Packit 5b08af
    {-# DEPRECATED "use the more general 'Text.Regex.XMLSchema.Generic' instead" #-}
Packit 5b08af
    ( Regex
Packit 5b08af
Packit 5b08af
    , grep
Packit 5b08af
    , grepExt
Packit 5b08af
    , grepRE
Packit 5b08af
    , grepREwithLineNum
Packit 5b08af
Packit 5b08af
    , match
Packit 5b08af
    , matchExt
Packit 5b08af
    , matchSubex
Packit 5b08af
Packit 5b08af
    , sed
Packit 5b08af
    , sedExt
Packit 5b08af
Packit 5b08af
    , split
Packit 5b08af
    , splitExt
Packit 5b08af
    , splitSubex
Packit 5b08af
Packit 5b08af
    , tokenize
Packit 5b08af
    , tokenizeExt
Packit 5b08af
    , tokenize'
Packit 5b08af
    , tokenizeExt'
Packit 5b08af
    , tokenizeSubex
Packit 5b08af
Packit 5b08af
    , matchRE
Packit 5b08af
    , matchSubexRE
Packit 5b08af
    , sedRE
Packit 5b08af
    , splitRE
Packit 5b08af
    , splitSubexRE
Packit 5b08af
    , tokenizeRE
Packit 5b08af
    , tokenizeRE'
Packit 5b08af
    , tokenizeSubexRE
Packit 5b08af
Packit 5b08af
      -- Text.Regex.XMLSchema.Generic.Regex
Packit 5b08af
    , mkZero
Packit 5b08af
    , mkUnit
Packit 5b08af
    , mkSym1
Packit 5b08af
    , mkSymRng
Packit 5b08af
    , mkWord
Packit 5b08af
    , mkDot
Packit 5b08af
    , mkStar
Packit 5b08af
    , mkAll
Packit 5b08af
    , mkAlt
Packit 5b08af
    , mkElse
Packit 5b08af
    , mkSeq
Packit 5b08af
    , mkSeqs
Packit 5b08af
    , mkRep
Packit 5b08af
    , mkRng
Packit 5b08af
    , mkOpt
Packit 5b08af
    , mkDiff
Packit 5b08af
    , mkIsect
Packit 5b08af
    , mkExor
Packit 5b08af
    , mkCompl
Packit 5b08af
    , mkBr
Packit 5b08af
    , isZero
Packit 5b08af
    , errRegex
Packit 5b08af
Packit 5b08af
    -- Text.Regex.XMLSchema.Generic.RegexParser
Packit 5b08af
    , parseRegex
Packit 5b08af
    , parseRegexExt
Packit 5b08af
    , parseContextRegex
Packit 5b08af
    )
Packit 5b08af
where
Packit 5b08af
Packit 5b08af
import           Text.Regex.XMLSchema.Generic             (Regex)
Packit 5b08af
import qualified Text.Regex.XMLSchema.Generic             as G
Packit 5b08af
import           Text.Regex.XMLSchema.Generic.Regex
Packit 5b08af
import           Text.Regex.XMLSchema.Generic.RegexParser
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | split a string by taking the longest prefix matching a regular expression
Packit 5b08af
--
Packit 5b08af
-- @Nothing@ is returned in case there is no matching prefix,
Packit 5b08af
-- else the pair of prefix and rest is returned
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.splitRE'.
splitRE :: Regex -> String -> Maybe (String, String)
splitRE re = G.splitRE re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'splitRE'
Packit 5b08af
--
Packit 5b08af
-- examples:
Packit 5b08af
--
Packit 5b08af
-- > split "a*b" "abc" = ("ab","c")
Packit 5b08af
-- > split "a*"  "bc"  = ("", "bc")    -- "a*" matches ""
Packit 5b08af
-- > split "a+"  "bc"  = ("", "bc")    -- "a+" does not match, no split
Packit 5b08af
-- > split "["   "abc" = ("", "abc")   -- "["  syntax error, no split
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.split'.
split :: String -> String -> (String, String)
split regexStr = G.split regexStr
Packit 5b08af
Packit 5b08af
-- | split with extended syntax
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.splitExt'.
splitExt :: String -> String -> (String, String)
splitExt regexStr = G.splitExt regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | split a string by removing the longest prefix matching a regular expression
Packit 5b08af
-- and then return the list of subexpressions found in the matching part
Packit 5b08af
--
Packit 5b08af
-- @Nothing@ is returned in case of no matching prefix,
Packit 5b08af
-- else the list of pairs of labels and submatches and the
Packit 5b08af
-- rest is returned
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.splitSubexRE'.
splitSubexRE :: Regex -> String -> Maybe ([(String, String)], String)
splitSubexRE re = G.splitSubexRE re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'splitSubexRE', uses extended syntax
Packit 5b08af
--
Packit 5b08af
-- examples:
Packit 5b08af
--
Packit 5b08af
-- > splitSubex "({1}a*)b"  "abc" = ([("1","a")],"c")
Packit 5b08af
-- > splitSubex "({2}a*)"   "bc"  = ([("2","")], "bc")
Packit 5b08af
-- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c")        -- subex 1 matches 2 times
Packit 5b08af
-- >
Packit 5b08af
-- > splitSubex ".*({x}a*)" "aa"  = ([("x",""),("x","a"),("x","aa")],"")
Packit 5b08af
-- >                                                                   -- nondeterminism: 3 matches for a*
Packit 5b08af
-- >
Packit 5b08af
-- > splitSubex "({1}do)|({2}[a-z]+)" "do you know"
Packit 5b08af
-- >                                = ([("1","do"),("2","do")]," you know")
Packit 5b08af
-- >                                                                   -- nondeterminism: 2 matches for do
Packit 5b08af
-- >
Packit 5b08af
-- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know"
Packit 5b08af
-- >                                = ([("1","do")]," you know")
Packit 5b08af
-- >                                                                   -- no nondeterminism with {|}: 1. match for do
Packit 5b08af
-- >
Packit 5b08af
-- > splitSubex "({1}a+)"   "bcd" = ([], "bcd")                        -- no match
Packit 5b08af
-- > splitSubex "["         "abc" = ([], "abc")                        -- syntax error
Packit 5b08af
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.splitSubex'.
splitSubex :: String -> String -> ([(String, String)], String)
splitSubex regexStr = G.splitSubex regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | The function, that does the real work for 'tokenize'
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeRE'.
-- Only the regex is bound here, so @tokenizeRE re@ is exactly @G.tokenizeRE re@.
tokenizeRE :: Regex -> String -> [String]
tokenizeRE re = G.tokenizeRE re
Packit 5b08af
Packit 5b08af
-- | split a string into tokens (words) by giving a regular expression
Packit 5b08af
-- which all tokens must match.
Packit 5b08af
--
Packit 5b08af
-- Convenient function for 'tokenizeRE'
Packit 5b08af
--
Packit 5b08af
-- This can be used for simple tokenizers.
Packit 5b08af
-- It is recommended to use regular expressions where the empty word does not match.
Packit 5b08af
-- Else there will appear a lot of probably useless empty tokens in the output.
Packit 5b08af
-- All non-matching chars are discarded. If the given regex contains syntax errors,
Packit 5b08af
-- no token will match, so the result is presumably @[]@ (the result type has no @Nothing@)
Packit 5b08af
--
Packit 5b08af
-- examples:
Packit 5b08af
--
Packit 5b08af
-- > tokenize "a" "aabba"      = ["a","a","a"]
Packit 5b08af
-- > tokenize "a*" "aaaba"     = ["aaa","a"]
Packit 5b08af
-- > tokenize "a*" "bbb"       = ["","",""]
Packit 5b08af
-- > tokenize "a+" "bbb"       = []
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize "a*b" ""         = []
Packit 5b08af
-- > tokenize "a*b" "abc"      = ["ab"]
Packit 5b08af
-- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc"
Packit 5b08af
-- >                           = ["ab","123","456.7","abc"]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc"
Packit 5b08af
-- >                           = ["cab","123","456.7","abc"]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz"
Packit 5b08af
-- >                           = ["abc","def","xyz"]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize ".*"   "\nabc\n123\n\nxyz\n"
Packit 5b08af
-- >                           = ["","abc","123","","xyz"]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize ".*"             = lines
Packit 5b08af
-- >
Packit 5b08af
-- > tokenize "[^ \t\n\r]*"    = words
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenize'.
tokenize :: String -> String -> [String]
tokenize regexStr = G.tokenize regexStr
Packit 5b08af
Packit 5b08af
-- | tokenize with extended syntax
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeExt'.
tokenizeExt :: String -> String -> [String]
tokenizeExt regexStr = G.tokenizeExt regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | split a string into tokens and delimiters by giving a regular expression
Packit 5b08af
-- which all tokens must match
Packit 5b08af
--
Packit 5b08af
-- This is a generalisation of the above 'tokenizeRE' functions.
Packit 5b08af
-- The non-matching char sequences are marked with @Left@, the matching ones are marked with @Right@
Packit 5b08af
--
Packit 5b08af
-- If the regular expression is an error (Zero) regex, e.g. after a syntax error, @[Left input]@ is returned
Packit 5b08af
--
Packit 5b08af
-- The following Law holds:
Packit 5b08af
--
Packit 5b08af
-- > concat . map (either id id) . tokenizeRE' re == id
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeRE''.
tokenizeRE' :: Regex -> String -> [Either String String]
tokenizeRE' re = G.tokenizeRE' re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'tokenizeRE''
Packit 5b08af
--
Packit 5b08af
-- When the regular expression parses as Zero,
Packit 5b08af
-- @[Left input]@ is returned, that means no tokens are found
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenize''.
tokenize' :: String -> String -> [Either String String]
tokenize' regexStr = G.tokenize' regexStr
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeExt''.
tokenizeExt' :: String -> String -> [Either String String]
tokenizeExt' regexStr = G.tokenizeExt' regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | split a string into tokens (pair of labels and words) by giving a regular expression
Packit 5b08af
-- containing labeled subexpressions.
Packit 5b08af
--
Packit 5b08af
-- This function should not be called with regular expressions
Packit 5b08af
-- without any labeled subexpressions. This does not make sense, because the result list
Packit 5b08af
-- will always be empty.
Packit 5b08af
--
Packit 5b08af
-- Result is the list of matching subexpressions
Packit 5b08af
-- This can be used for simple tokenizers.
Packit 5b08af
-- At least one char is consumed by parsing a token.
Packit 5b08af
-- The pairs in the result list contain the matching substrings.
Packit 5b08af
-- All non-matching chars are discarded. If the given regex contains syntax errors,
Packit 5b08af
-- no token will match, so the result is presumably @[]@ (the result type has no @Nothing@)
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeSubexRE'.
tokenizeSubexRE :: Regex -> String -> [(String, String)]
tokenizeSubexRE re = G.tokenizeSubexRE re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'tokenizeSubexRE', taking the regular expression as a string
Packit 5b08af
--
Packit 5b08af
-- examples:
Packit 5b08af
--
Packit 5b08af
-- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)"
Packit 5b08af
-- >                 "cab123 456.7abc"
Packit 5b08af
-- >                                  = [("name","cab")
Packit 5b08af
-- >                                    ,("num","123")
Packit 5b08af
-- >                                    ,("real","456.7")
Packit 5b08af
-- >                                    ,("name","abc")]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
Packit 5b08af
-- >                 "12.34"          = [("real","12.34")
Packit 5b08af
-- >                                    ,("n","12")
Packit 5b08af
-- >                                    ,("f","34")]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
Packit 5b08af
-- >                  "12 34"         = [("real","12"),("n","12")
Packit 5b08af
-- >                                    ,("real","34"),("n","34")]
Packit 5b08af
-- >
Packit 5b08af
-- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))"
Packit 5b08af
-- >                  "12 34.56"      = [("real","12"),("n","12"),("f","")
Packit 5b08af
-- >                                    ,("real","34.56"),("n","34"),("f","56")]
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.tokenizeSubex'.
tokenizeSubex :: String -> String -> [(String, String)]
tokenizeSubex regexStr = G.tokenizeSubex regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | sed like editing function
Packit 5b08af
--
Packit 5b08af
-- All matching tokens are edited by the 1. argument, the editing function,
Packit 5b08af
-- all other chars remain as they are
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.sedRE'.
sedRE :: (String -> String) -> Regex -> String -> String
sedRE edit = G.sedRE edit
Packit 5b08af
Packit 5b08af
-- | convenient function for 'sedRE'
Packit 5b08af
--
Packit 5b08af
-- examples:
Packit 5b08af
--
Packit 5b08af
-- > sed (const "b") "a" "xaxax"       = "xbxbx"
Packit 5b08af
-- > sed (\ x -> x ++ x) "a" "xax"     = "xaax"
Packit 5b08af
-- > sed undefined       "[" "xxx"     = "xxx"
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.sed'.
sed :: (String -> String) -> String -> String -> String
sed edit = G.sed edit
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.sedExt'.
sedExt :: (String -> String) -> String -> String -> String
sedExt edit = G.sedExt edit
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | match a string with a regular expression
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.matchRE'.
matchRE :: Regex -> String -> Bool
matchRE re = G.matchRE re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'matchRE'
Packit 5b08af
--
Packit 5b08af
-- Examples:
Packit 5b08af
--
Packit 5b08af
-- > match "x*" "xxx" = True
Packit 5b08af
-- > match "x" "xxx"  = False
Packit 5b08af
-- > match "[" "xxx"  = False
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.match'.
match :: String -> String -> Bool
match regexStr = G.match regexStr
Packit 5b08af
Packit 5b08af
-- | match with extended regular expressions
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.matchExt'.
matchExt :: String -> String -> Bool
matchExt regexStr = G.matchExt regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | match a string with a regular expression
Packit 5b08af
-- and extract subexpression matches
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.matchSubexRE'.
matchSubexRE :: Regex -> String -> [(String, String)]
matchSubexRE re = G.matchSubexRE re
Packit 5b08af
Packit 5b08af
-- | convenient function for 'matchSubexRE'
Packit 5b08af
--
Packit 5b08af
-- Examples:
Packit 5b08af
--
Packit 5b08af
-- > matchSubex "({1}x*)"                 "xxx"      = [("1","xxx")]
Packit 5b08af
-- > matchSubex "({1}x*)"                 "y"        = []
Packit 5b08af
-- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600"  = [("w","800"),("h","600")]
Packit 5b08af
-- > matchSubex "[" "xxx"                            = []
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.matchSubex'.
matchSubex :: String -> String -> [(String, String)]
matchSubex regexStr = G.matchSubex regexStr
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------
Packit 5b08af
Packit 5b08af
-- | grep like filter for lists of strings
Packit 5b08af
--
Packit 5b08af
-- The regular expression may be prefixed with the usual context spec \"^\" for start of string,
Packit 5b08af
-- and "\\<" for start of word.
Packit 5b08af
-- and suffixed with \"$\" for end of text and "\\>" end of word.
Packit 5b08af
-- Word chars are defined by the multi char escape sequence "\\w"
Packit 5b08af
--
Packit 5b08af
-- Examples
Packit 5b08af
--
Packit 5b08af
-- > grep "a"    ["_a_", "_a", "a_", "a", "_"]      => ["_a_", "_a", "a_", "a"]
Packit 5b08af
-- > grep "^a"   ["_a_", "_a", "a_", "a", "_"]      => ["a_", "a"]
Packit 5b08af
-- > grep "a$"   ["_a_", "_a", "a_", "a", "_"]      => ["_a", "a"]
Packit 5b08af
-- > grep "^a$"  ["_a_", "_a", "a_", "a", "_"]      => ["a"]
Packit 5b08af
-- > grep "\\<a" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " ax "]
Packit 5b08af
-- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " xa "]
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.grep'.
grep :: String -> [String] -> [String]
grep regexStr = G.grep regexStr
Packit 5b08af
Packit 5b08af
-- | grep with extended regular expressions
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.grepExt'.
grepExt :: String -> [String] -> [String]
grepExt regexStr = G.grepExt regexStr
Packit 5b08af
Packit 5b08af
-- | grep with already prepared Regex (usually with 'parseContextRegex')
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.grepRE'.
grepRE :: Regex -> [String] -> [String]
grepRE re = G.grepRE re
Packit 5b08af
Packit 5b08af
-- | grep with Regex and line numbers
Packit 5b08af
Packit 5b08af
-- 'String' version of the generic function: forwards to 'G.grepREwithLineNum'.
grepREwithLineNum :: Regex -> [String] -> [(Int, String)]
grepREwithLineNum re = G.grepREwithLineNum re
Packit 5b08af
Packit 5b08af
-- ------------------------------------------------------------