{-# LANGUAGE ExplicitForAll #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module Main
where
import Control.Arrow ( (***), second )
import Data.Char ( toLower, toUpper )
import Data.List ( isPrefixOf )
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.String (IsString (..))
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import System.Exit (ExitCode (..), exitWith)
import Test.HUnit
import Text.Parsec
import Text.Regex.XMLSchema.Generic
import Text.Regex.XMLSchema.Generic.StringLike
-- ------------------------------------------------------------
newtype Test' a = Test' {unTest' :: Test}
type BS = B.ByteString
type BL = BL.ByteString
type Text = T.Text
type TextL = TL.Text
-- ------------------------------------------------------------
-- some little helpers for building r.e.s
star :: String -> String
star = (++ "*") . pars
plus :: String -> String
plus = (++ "+") . pars
opt :: String -> String
opt = (++ "?") . pars
dot :: String -> String
dot = (++ "\\.")
pars :: String -> String
pars = ("(" ++) . (++ ")")
orr :: String -> String -> String
orr x y = pars $ pars x ++ "|" ++ pars y
xor :: String -> String -> String
xor x y = pars $ pars x ++ "{|}" ++ pars y
nocase :: String -> String
nocase (x:xs) = '[' : toUpper x : toLower x : ']' : xs
nocase [] = error "nocase with empty list"
alt :: [String] -> String
alt = pars . foldr1 orr
altNC :: [String] -> String
altNC = pars . alt . map nocase
subex :: String -> String -> String
subex n e = pars $ "{" ++ n ++ "}" ++ pars e
ws :: String
ws = "\\s"
ws0 :: String
ws0 = star ws
ws1 :: String
ws1 = plus ws
s0 :: String -> String -> String
s0 x y = x ++ ws0 ++ y
-- the date and time r.e.s
day :: String
day = "(0?[1-9]|[12][0-9]|3[01])"
month :: String
month = "(0?[1-9]|1[0-2])"
year2 :: String
year2 = "[0-5][0-9]"
year4 :: String
year4 = "20" ++ year2
year :: String
year = year4 `orr` year2
year' :: String
year' = "'" ++ year2
dayD :: String
dayD = dot day
monthD :: String
monthD = dot month
dayMonthYear :: String
dayMonthYear = dayD `s0` monthD `s0` year
dayMonth :: String
dayMonth = dayD `s0` monthD
dayOfWeekL :: String
dayOfWeekL = altNC
[ "montag"
, "dienstag"
, "mittwoch"
, "donnerstag"
, "freitag"
, "samstag"
, "sonnabend"
, "sonntag"
]
dayOfWeekA :: String
dayOfWeekA = alt . map dot $
[ "Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"]
dayOfWeek :: String
dayOfWeek = dayOfWeekL `orr` dayOfWeekA
monthL :: String
monthL = altNC
[ "januar"
, "februar"
, "märz"
, "april"
, "mai"
, "juni"
, "juli"
, "august"
, "september"
, "oktober"
, "november"
, "dezember"
]
monthA :: String
monthA = altNC . map dot $ map snd monthAbr
monthAbr :: [(Integer, String)]
monthAbr = (9, "sept") :
zip [1..12]
[ "jan", "feb", "mär", "apr", "mai", "jun", "jul", "aug", "sep", "okt", "nov", "dez"]
monthN :: String
monthN = pars $ monthL `orr` monthA
hour :: String
hour = pars "([0-1]?[0-9])|(2[0-4])"
minute :: String
minute = pars "(0?[0-9])|([1-5][0-9])"
uhr :: String
uhr = ws0 ++ nocase "uhr"
hourMin :: String
hourMin = hour ++ ":" ++ minute ++ opt uhr
wsyear :: String
wsyear = year ++ "/[0-9]{2}"
wsem :: String
wsem = ("Wi?Se?" `orr` nocase "Wintersemester") ++ ws0 ++ wsyear
ssem :: String
ssem = ("So?Se?" `orr` nocase "Sommersemester") ++ ws0 ++ year
sem :: String
sem = wsem `orr` ssem
num :: String
num = "\\d+"
-- the token types
tokenRE :: String
tokenRE = foldr1 xor $
map (uncurry subex) $
[ ( "ddmmyyyy", dayMonthYear )
, ( "ddMonthyyyy", dayD `s0` monthN `s0` (year `orr` year') )
, ( "ddmm", dayMonth)
, ( "ddMonth", dayD `s0` monthN )
, ( "yyyymmdd", year ++ "[-/]" ++ month ++ "[-/]" ++ day )
, ( "yyyy", year4 `orr` ("'" ++ year2) )
, ( "month", monthN )
, ( "weekday", dayOfWeek )
, ( "HHMM", hourMin ++ opt uhr )
, ( "HH", hour ++ uhr )
, ( "wsem", wsem)
, ( "ssem", ssem)
, ( "word", "[\\w\\d]+")
, ( "del", "[^\\w\\d]+")
]
-- ------------------------------------------------------------
type Token = (String, String)
type TokenStream = [Token]
type DateParser a = Parsec [(String, String)] () a
type StringFct = String -> String -- for fast concatenation
-- must be extended for weekday or semester, if neccessay
data DateVal = DT { _year :: ! Int
, _month :: ! Int
, _day :: ! Int
, _hour :: ! Int
, _min :: ! Int
}
deriving (Eq, Show)
data DateParse = DP { _pre :: StringFct
, _rep :: StringFct
, _dat :: ! DateVal
}
-- just a helper for result output
data DateRep = DR { _p :: String
, _r :: String
, _d :: ! DateVal
}
deriving (Eq, Show)
-- ------------------------------------------------------------
emptyText :: StringFct
emptyText = id
mkText :: String -> StringFct
mkText = (++)
concText :: StringFct -> StringFct -> StringFct
concText = (.)
textToString :: StringFct -> String
textToString = ($ [])
emptyDateVal :: DateVal
emptyDateVal = DT { _year = -1
, _month = -1
, _day = -1
, _hour = -1
, _min = -1
}
emptyDateParse :: DateParse
emptyDateParse = DP { _pre = emptyText
, _rep = emptyText
, _dat = emptyDateVal
}
appPre :: String -> DateParse -> DateParse
appPre s d = d { _pre = _pre d `concText` mkText s }
appRep :: String -> DateParse -> DateParse
appRep s d = d { _rep = _rep d `concText` mkText s }
setDay :: Int -> Int -> Int -> DateParse -> DateParse
setDay j m t d = d { _dat = setDateVal j m t (-1) (-1) (_dat d) }
setHour :: Int -> Int -> DateParse -> DateParse
setHour h m d = d { _dat = setDateVal (-1) (-1) (-1) h m (_dat d) }
setDateVal :: Int -> Int -> Int -> Int -> Int -> DateVal -> DateVal
setDateVal j m t s i (DT j' m' t' s' i' )
= DT j'' m'' t'' s'' i''
where
j'' | j < 0 = j' -- year not there
| j < 100 = j + 2000 -- 2 digit year
| otherwise = j -- 4 digit year
m'' = m `max` m'
t'' = t `max` t'
s'' = s `max` s'
i'' = i `max` i'
datePToDateRep :: DateParse -> DateRep
datePToDateRep dp
= DR { _p = textToString $ _pre dp
, _r = textToString $ _rep dp
, _d = _dat dp
}
-- ------------------------------------------------------------
-- a simple helper for showing the results
dateSearch' :: TokenStream -> [DateRep]
dateSearch' = map datePToDateRep .
dateSearch
-- look for a sequence of date specs, the last entry in the list
-- does not contain a valid date, but just the context behind the last real date
dateSearch :: TokenStream -> [DateParse]
dateSearch = either (const []) id .
parse (many (dateParser emptyDateParse)) ""
-- all date parsers thread a state the subparsers to accumulate
-- the parts of a date, the context, the external representation and
-- the pure data, year, month, day, ...
dateParser :: DateParse -> DateParser DateParse
dateParser d = ( do
s <- fillTok
dateParser0 (appPre s d)
)
<|>
parseDate d -- here is the hook for the real date parser
<|>
( do
s <- textTok -- the default case: if parseDate fails
dateParser0 (appPre s d) -- the token is handled like a normal word
)
dateParser0 :: DateParse -> DateParser DateParse
dateParser0 d = dateParser d <|> return d
parseDate :: DateParse -> DateParser DateParse
parseDate d = parseDate0 d
<|>
try
( do
d1 <- parseWeekDay d
lookAheadN 3 parseDate0 d1 -- Freitag, den 13.
)
-- parse a date optionally followed by a time
parseDate0 :: DateParse -> DateParser DateParse
parseDate0 d = ( do
d1 <- parseDay d
option d1 (parseFollowingHour d1)
)
-- parse a simple token for a day
parseDay :: DateParse -> DateParser DateParse
parseDay d = ( do
(s, d') <- parseDateTok "ddmmyyyy" d
let [t, m, j] = tokenize num s
return $ setDay (read j) (read m) (read t) d'
)
<|>
( do
(s, d') <- parseDateTok "ddMonthyyyy" d
let s' = sed ((++ ".") . monthToM) monthN s
let [t, m, j] = tokenize num s'
return $ setDay (read j) (read m) (read t) d'
)
<|>
( do
(s, d') <- parseDateTok "ddmm" d
let [t, m] = tokenize num s
return $ setDay (-1) (read m) (read t) d'
)
<|>
( do
(s, d') <- parseDateTok "ddMonth" d
let s' = sed ((++ ".") . monthToM) monthN s
let [t, m] = tokenize num s'
return $ setDay (-1) (read m) (read t) d'
)
<|>
( do
(s, d') <- parseDateTok "yyyymmdd" d
let [j, m, t] = tokenize num s
return $ setDay (read j) (read m) (read t) d'
)
parseYear :: DateParse -> DateParser DateParse
parseYear d = ( do
(s, d') <- parseDateTok "yyyy" d
let [j] = tokenize num s
return $ setDay (read j) (-1) (-1) d'
)
-- parse a weekday and add it to the external rep.
parseWeekDay :: DateParse -> DateParser DateParse
parseWeekDay d = ( do
(_s, d') <- parseDateTok "weekday" d
return d'
)
-- parse a following hour spec, 5 fill tokens, words or delimiters are possible
parseFollowingHour :: DateParse -> DateParser DateParse
parseFollowingHour
= try . -- backtracking becomes neccessary
lookAheadN 5 parseHour -- max 2 words and 3 delimiters
-- parse the simple time formats
parseHour :: DateParse -> DateParser DateParse
parseHour d = ( do
(s, d') <- parseDateTok "HHMM" d
let [h, m] = tokenize num s
return $ setHour (read h) (read m) d'
)
<|>
( do
(s, d') <- parseDateTok "HH" d
let [h] = tokenize num s
return $ setHour (read h) 0 d'
)
-- ------------------------------------------------------------
--
-- auxiliary parser combinators
-- parse a token of a given type and add the text to the external rep.
parseDateTok :: String -> DateParse -> DateParser (String, DateParse)
parseDateTok tty d
= dateTok (isTokType (== tty)) d
dateTok :: DateParser String -> DateParse -> DateParser (String, DateParse)
dateTok t d = ( do
s <- t
return (s, appRep s d)
)
-- try to apply a parser, but first skip a given # of fill tokens
lookAheadN :: Int -> (DateParse -> DateParser DateParse) -> DateParse -> DateParser DateParse
lookAheadN n p d
| n <= 0 = p d
| otherwise = do
(_, d1) <- dateTok fillTok d
( lookAheadN (n - 1) p d1 <|> p d1 )
-- ------------------------------------------------------------
--
-- basic token parsers
-- the interface to the primitive parsec token parser
tok :: (Token -> Bool) -> DateParser Token
tok prd = tokenPrim showTok nextPos testTok
where
showTok = show . fst
nextPos pos _tok _ts = incSourceColumn pos 1
testTok tk = if prd tk then Just tk else Nothing
-- check for specific token type and in case of success return the text value
isTokType :: (String -> Bool) -> DateParser String
isTokType isT = tok (isT . fst) >>= return . snd
-- parse an arbitrary token and return the text value
textTok :: DateParser String
textTok = isTokType (const True)
-- a word
wordTok :: DateParser String
wordTok = isTokType (== "word")
-- a delimiter, whitespace is normalized, sequences are reduced to a single space char
delTok :: DateParser String
delTok = isTokType (== "del")
>>=
return . sed (const " ") ws1
-- tokens that don't contain date info
fillTok :: DateParser String
fillTok = delTok <|> wordTok
-- semester tokens, not yet interpreted
semTok' :: String -> DateParser (String, Int, Bool)
semTok' sem' = do v <- isTokType (== sem')
return (v, read . head . tokenizeExt year $ v, sem' == "ssem")
semTok :: DateParser (String, Int, Bool)
semTok = semTok' "ssem" <|> semTok' "wsem"
-- ------------------------------------------------------------
-- conversion from month names to 1..12
monthToM :: String -> String
monthToM m
= show .
(\ l -> if null l then 99 else head l) .
map fst .
filter ((== True) . snd) .
map (second (`isPrefixOf` map toLower m)) $
monthAbr
-- ------------------------------------------------------------
ts :: String
ts = "Am Sonntag, dem 17. Februar '03 findet um 9 Uhr ein wichtiger Termin für das Sommersemester 2000 statt. "
++ "Dieser wird allerdings auf Montag verschoben. Und zwar auf den ersten Montag im Wintersemester 11/12, 12:30. "
++ "Ein wichtiger Termin findet im SoSe 2011 statt. Im Jahr '12 gibt es Termine, aber auch in WS 2010/11. "
++ "Ein weiterer Termin ist am 2.4.11 um 12 Uhr. Oder war es doch Di. der 3.4.? Egal. "
++ "Ein weiterer wichtiger Termin findet am 2001-3-4 statt bzw. generell zwischen 01/3/4 - 01/6/4 um 13 Uhr. "
++ "Am kommenden Mittwoch findet Changemanagement in HS5 statt. Dies gilt dann auch für den 7. Juni "
++ "des Jahres 2011. Noch ein wichtiger Termin findet um 16:15 Uhr am Do., 1.2.03 statt. "
++ "Freitag, der 13. Juli ist kein Glückstag"
++ "und Freitag, der 13. Juli um 11:55 Uhr ist es zu spät."
rrr :: [String]
rrr = map _r . dateSearch' . tokenizeSubex tokenRE $ ts
ddd :: [DateVal]
ddd = map _d . dateSearch' . tokenizeSubex tokenRE $ ts
aaa :: [DateRep]
aaa = dateSearch' . tokenizeSubex tokenRE $ ts
tt :: String -> [(String, String)]
tt = tokenizeSubex tokenRE
dd :: String -> [DateVal]
dd = map _d . dateSearch' . tt
rr :: String -> [String]
rr = map _r . dateSearch' . tt
pp :: String -> [String]
pp = map _p . dateSearch' . tt
-- ------------------------------------------------------------
testDate :: forall a . StringLike a => Test' a
testDate
= Test' $
TestLabel "date and time extraction from free text" $
TestList $
zipWith parseT toks exx
where
parseT res ok
= TestCase $
assertEqual (show res ++ " == " ++ show ok) res ok
toks :: [(a, a)]
toks = tokenizeSubex (fromString tokenRE) (fromString ts)
exx :: [(a, a)]
exx = map (fromString *** fromString) $
[("word","Am"),("del"," "),("weekday","Sonntag"),("del",", "),("word","dem"),("del"," ")
,("ddMonthyyyy","17. Februar '03"),("del"," "),("word","findet"),("del"," "),("word","um")
,("del"," "),("HH","9 Uhr"),("del"," "),("word","ein"),("del"," "),("word","wichtiger")
,("del"," "),("word","Termin"),("del"," "),("word","f\252r"),("del"," "),("word","das")
,("del"," "),("ssem","Sommersemester 2000"),("del"," "),("word","statt"),("del",". ")
,("word","Dieser"),("del"," "),("word","wird"),("del"," "),("word","allerdings")
,("del"," "),("word","auf"),("del"," "),("weekday","Montag"),("del"," ")
,("word","verschoben"),("del",". "),("word","Und"),("del"," "),("word","zwar")
,("del"," "),("word","auf"),("del"," "),("word","den"),("del"," "),("word","ersten")
,("del"," "),("weekday","Montag"),("del"," "),("word","im"),("del"," ")
,("wsem","Wintersemester 11/12"),("del",", "),("HHMM","12:30"),("del",". ")
,("word","Ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin")
,("del"," "),("word","findet"),("del"," "),("word","im"),("del"," "),("ssem","SoSe 2011")
,("del"," "),("word","statt"),("del",". "),("word","Im"),("del"," "),("word","Jahr")
,("del"," '"),("word","12"),("del"," "),("word","gibt"),("del"," "),("word","es")
,("del"," "),("word","Termine"),("del",", "),("word","aber"),("del"," "),("word","auch")
,("del"," "),("word","in"),("del"," "),("wsem","WS 2010/11"),("del",". "),("word","Ein")
,("del"," "),("word","weiterer"),("del"," "),("word","Termin"),("del"," "),("word","ist")
,("del"," "),("word","am"),("del"," "),("ddmmyyyy","2.4.11"),("del"," "),("word","um")
,("del"," "),("HH","12 Uhr"),("del",". "),("word","Oder"),("del"," "),("word","war")
,("del"," "),("word","es"),("del"," "),("word","doch"),("del"," "),("weekday","Di.")
,("del"," "),("word","der"),("del"," "),("ddmm","3.4."),("del","? "),("word","Egal")
,("del",". "),("word","Ein"),("del"," "),("word","weiterer"),("del"," ")
,("word","wichtiger"),("del"," "),("word","Termin"),("del"," "),("word","findet")
,("del"," "),("word","am"),("del"," "),("yyyymmdd","2001-3-4"),("del"," ")
,("word","statt"),("del"," "),("word","bzw"),("del",". "),("word","generell")
,("del"," "),("word","zwischen"),("del"," "),("yyyymmdd","01/3/4"),("del"," - ")
,("yyyymmdd","01/6/4"),("del"," "),("word","um"),("del"," "),("HH","13 Uhr")
,("del",". "),("word","Am"),("del"," "),("word","kommenden"),("del"," ")
,("weekday","Mittwoch"),("del"," "),("word","findet"),("del"," ")
,("word","Changemanagement"),("del"," "),("word","in"),("del"," "),("word","HS5")
,("del"," "),("word","statt"),("del",". "),("word","Dies"),("del"," "),("word","gilt")
,("del"," "),("word","dann"),("del"," "),("word","auch"),("del"," "),("word","f\252r")
,("del"," "),("word","den"),("del"," "),("ddMonth","7. Juni"),("del"," "),("word","des")
,("del"," "),("word","Jahres"),("del"," "),("yyyy","2011"),("del",". "),("word","Noch")
,("del"," "),("word","ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin")
,("del"," "),("word","findet"),("del"," "),("word","um"),("del"," "),("HHMM","16:15 Uhr")
,("del"," "),("word","am"),("del"," "),("weekday","Do."),("del",", "),("ddmmyyyy","1.2.03")
,("del"," "),("word","statt"),("del",". "),("weekday","Freitag"),("del",", ")
,("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","ist"),("del"," ")
,("word","kein"),("del"," "),("word","Gl\252ckstagund"),("del"," "),("weekday","Freitag")
,("del",", "),("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","um")
,("del"," "),("HHMM","11:55 Uhr"),("del"," "),("word","ist"),("del"," "),("word","es")
,("del"," "),("word","zu"),("del"," "),("word","sp\228t"),("del",".")
]
-- ------------------------------------------------------------
genericTest :: (forall a . StringLike a => Test' a) -> Test
genericTest t
= TestList $
[ TestLabel "Test with 'String'" $ unTest' (t :: Test' String)
, TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text)
, TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL)
, TestLabel "Test with 'ByteString'" $ unTest' (t :: Test' BS)
, TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL)
]
allTests :: Test
allTests
= TestList
[ genericTest testDate ]
main :: IO ()
main
= do
c <- runTestTT allTests
putStrLn $ show c
let errs = errors c
fails = failures c
exitWith (codeGet errs fails)
codeGet :: Int -> Int -> ExitCode
codeGet errs fails
| fails > 0 = ExitFailure 2
| errs > 0 = ExitFailure 1
| otherwise = ExitSuccess
-- ------------------------------------------------------------