Blob Blame History Raw
{-# LANGUAGE ExplicitForAll      #-}
{-# LANGUAGE Rank2Types          #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns        #-}

module Main
where

import           Control.Arrow ( (***), second )

import           Data.Char ( toLower, toUpper )
import           Data.List ( isPrefixOf )
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL
import           Data.String (IsString (..))
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL

import           System.Exit (ExitCode (..), exitWith)

import           Test.HUnit

import           Text.Parsec
import           Text.Regex.XMLSchema.Generic
import           Text.Regex.XMLSchema.Generic.StringLike

-- ------------------------------------------------------------

newtype Test' a = Test' {unTest' :: Test}

type BS    = B.ByteString
type BL    = BL.ByteString
type Text  = T.Text
type TextL = TL.Text

-- ------------------------------------------------------------
-- some little helpers for building r.e.s

star                  :: String -> String
star                  = (++ "*") . pars

plus                  :: String -> String
plus                  = (++ "+") . pars

opt                   :: String -> String
opt                   = (++ "?") . pars

dot                   :: String -> String
dot                   = (++ "\\.")

pars                  :: String -> String
pars                  = ("(" ++) . (++ ")")

orr                   :: String -> String -> String
orr x y               = pars $ pars x ++ "|" ++ pars y

xor                   :: String -> String -> String
xor x y               = pars $ pars x ++ "{|}" ++ pars y

nocase                :: String -> String
nocase                (x:xs)  = '[' : toUpper x : toLower x : ']' : xs
nocase []             = error "nocase with empty list"

alt                   :: [String] -> String
alt                   = pars . foldr1 orr

altNC                 :: [String] -> String
altNC                 = pars . alt . map nocase

subex                 :: String -> String -> String
subex n e             = pars $ "{" ++ n ++ "}" ++ pars e

ws                    :: String
ws                    = "\\s"

ws0                   :: String
ws0                   = star ws

ws1                   :: String
ws1                   = plus ws

s0                    :: String -> String -> String
s0 x y                = x ++ ws0 ++ y

-- the date and time r.e.s

day                   :: String
day                   = "(0?[1-9]|[12][0-9]|3[01])"

month                 :: String
month                 = "(0?[1-9]|1[0-2])"

year2                 :: String
year2                 = "[0-5][0-9]"

year4                 :: String
year4                 = "20" ++ year2

year                  :: String
year                  = year4 `orr` year2

year'                 :: String
year'                 = "'" ++ year2

dayD                  :: String
dayD                  = dot day
monthD                :: String
monthD                = dot month

dayMonthYear          :: String
dayMonthYear          = dayD `s0` monthD `s0` year
dayMonth              :: String
dayMonth              = dayD `s0` monthD

dayOfWeekL            :: String
dayOfWeekL            = altNC
                        [ "montag"
                        , "dienstag"
                        , "mittwoch"
                        , "donnerstag"
                        , "freitag"
                        , "samstag"
                        , "sonnabend"
                        , "sonntag"
                        ]

dayOfWeekA            :: String
dayOfWeekA            = alt . map dot $
                        [ "Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"]

dayOfWeek             :: String
dayOfWeek             = dayOfWeekL `orr` dayOfWeekA

monthL                :: String
monthL                = altNC
                        [ "januar"
                        , "februar"
                        , "märz"
                        , "april"
                        , "mai"
                        , "juni"
                        , "juli"
                        , "august"
                        , "september"
                        , "oktober"
                        , "november"
                        , "dezember"
                        ]

monthA                :: String
monthA                = altNC . map dot $ map snd monthAbr

monthAbr              :: [(Integer, String)]
monthAbr              = (9, "sept") :
                        zip [1..12]
                        [ "jan", "feb", "mär", "apr", "mai", "jun", "jul", "aug", "sep", "okt", "nov", "dez"]

monthN                :: String
monthN                = pars $ monthL `orr` monthA

hour                  :: String
hour                  = pars "([0-1]?[0-9])|(2[0-4])"

minute                :: String
minute                = pars "(0?[0-9])|([1-5][0-9])"

uhr                   :: String
uhr                   = ws0 ++ nocase "uhr"

hourMin               :: String
hourMin               = hour ++ ":" ++ minute ++ opt uhr

wsyear                :: String
wsyear                = year ++ "/[0-9]{2}"

wsem                  :: String
wsem                  = ("Wi?Se?" `orr` nocase "Wintersemester") ++ ws0 ++ wsyear

ssem                  :: String
ssem                  = ("So?Se?" `orr` nocase "Sommersemester") ++ ws0 ++ year

sem                   :: String
sem                   = wsem `orr` ssem

num                   :: String
num                   = "\\d+"

-- the token types

tokenRE               :: String
tokenRE               = foldr1 xor $
                        map  (uncurry subex) $
                        [ ( "ddmmyyyy",    dayMonthYear )
                        , ( "ddMonthyyyy", dayD `s0` monthN `s0` (year `orr` year') )
                        , ( "ddmm",        dayMonth)
                        , ( "ddMonth",     dayD `s0` monthN )
                        , ( "yyyymmdd",    year ++ "[-/]" ++ month ++ "[-/]" ++ day )
                        , ( "yyyy",        year4 `orr` ("'" ++ year2) )
                        , ( "month",       monthN )
                        , ( "weekday",     dayOfWeek )
                        , ( "HHMM",        hourMin ++ opt uhr )
                        , ( "HH",          hour    ++ uhr )
                        , ( "wsem",        wsem)
                        , ( "ssem",        ssem)
                        , ( "word",        "[\\w\\d]+")
                        , ( "del",         "[^\\w\\d]+")
                        ]


-- ------------------------------------------------------------

type Token         = (String, String)
type TokenStream   = [Token]

type DateParser a  = Parsec [(String, String)] () a

type StringFct          = String -> String           -- for fast concatenation

-- must be extended for weekday or semester, if neccessay

data DateVal       = DT  { _year   :: ! Int
                         , _month  :: ! Int
                         , _day    :: ! Int
                         , _hour   :: ! Int
                         , _min    :: ! Int
                         }
                     deriving (Eq, Show)

data DateParse     = DP { _pre    ::   StringFct
                        , _rep    ::   StringFct
                        , _dat    :: ! DateVal
                        }

-- just a helper for result output
data DateRep       = DR { _p ::   String
                        , _r ::   String
                        , _d :: ! DateVal
                        }
                     deriving (Eq, Show)

-- ------------------------------------------------------------

emptyText       :: StringFct
emptyText       = id

mkText          :: String -> StringFct
mkText          = (++)

concText        :: StringFct -> StringFct -> StringFct
concText        = (.)

textToString    :: StringFct -> String
textToString    = ($ [])

emptyDateVal    :: DateVal
emptyDateVal    = DT { _year   = -1
                     , _month  = -1
                     , _day    = -1
                     , _hour   = -1
                     , _min    = -1
                     }

emptyDateParse  :: DateParse
emptyDateParse  = DP { _pre = emptyText
                     , _rep = emptyText
                     , _dat = emptyDateVal
                     }

appPre          :: String -> DateParse -> DateParse
appPre s d      = d { _pre = _pre d `concText` mkText s }

appRep          :: String -> DateParse -> DateParse
appRep s d      = d { _rep = _rep d `concText` mkText s }

setDay          :: Int -> Int -> Int -> DateParse -> DateParse
setDay j m t d  = d { _dat = setDateVal j m t (-1) (-1) (_dat d) }

setHour         :: Int -> Int -> DateParse -> DateParse
setHour h m d   = d { _dat = setDateVal (-1) (-1) (-1) h m (_dat d) }

setDateVal      :: Int -> Int -> Int -> Int -> Int -> DateVal -> DateVal
setDateVal j m t s i (DT j' m' t' s' i' )
                = DT j'' m'' t'' s'' i''
    where
      j'' | j < 0     = j'              -- year not there
          | j < 100   = j + 2000        -- 2 digit year
          | otherwise = j               -- 4 digit year
      m''             = m `max` m'
      t''             = t `max` t'
      s''             = s `max` s'
      i''             = i `max` i'

datePToDateRep  :: DateParse -> DateRep
datePToDateRep dp
                = DR { _p = textToString $ _pre dp
                     , _r = textToString $ _rep dp
                     , _d =                _dat dp
                     }

-- ------------------------------------------------------------

-- a simple helper for showing the results

dateSearch'     :: TokenStream -> [DateRep]
dateSearch'     = map datePToDateRep .
                  dateSearch

-- look for a sequence of date specs, the last entry in the list
-- does not contain a valid date, but just the context behind the last real date

dateSearch      :: TokenStream -> [DateParse]
dateSearch      = either (const []) id .
                  parse (many (dateParser emptyDateParse)) ""

-- all date parsers thread a state the subparsers to accumulate
-- the parts of a date, the context, the external representation and
-- the pure data, year, month, day, ...

dateParser      :: DateParse -> DateParser DateParse
dateParser d    = ( do
                    s <- fillTok
                    dateParser0 (appPre s d)
                  )
                  <|>
                  parseDate d                   -- here is the hook for the real date parser
                  <|>
                  ( do
                    s <- textTok                -- the default case: if parseDate fails
                    dateParser0 (appPre s d)    -- the token is handled like a normal word
                  )

dateParser0     :: DateParse -> DateParser DateParse
dateParser0 d   = dateParser d <|> return d


parseDate       :: DateParse -> DateParser DateParse
parseDate d     = parseDate0 d
                  <|>
                  try
                  ( do
                    d1 <- parseWeekDay d
                    lookAheadN 3 parseDate0 d1  -- Freitag, den 13.
                  )

-- parse a date optionally followed by a time
parseDate0      :: DateParse -> DateParser DateParse
parseDate0 d    = ( do
                    d1 <- parseDay d
                    option d1 (parseFollowingHour d1)
                  )

-- parse a simple token for a day
parseDay        :: DateParse -> DateParser DateParse
parseDay d      = ( do
                    (s, d') <- parseDateTok "ddmmyyyy" d
                    let [t, m, j] = tokenize num s
                    return $ setDay (read j) (read m) (read t) d'
                  )
                  <|>
                  ( do
                    (s, d') <- parseDateTok "ddMonthyyyy" d
                    let s' = sed ((++ ".") . monthToM) monthN s
                    let [t, m, j] = tokenize num s'
                    return $ setDay (read j) (read m) (read t) d'
                  )
                  <|>
                  ( do
                    (s, d') <- parseDateTok "ddmm" d
                    let [t, m] = tokenize num s
                    return $ setDay (-1) (read m) (read t) d'
                  )
                  <|>
                  ( do
                    (s, d') <- parseDateTok "ddMonth" d
                    let s'     = sed ((++ ".") . monthToM) monthN s
                    let [t, m] = tokenize num s'
                    return $ setDay (-1) (read m) (read t) d'
                  )
                  <|>
                  ( do
                    (s, d') <- parseDateTok "yyyymmdd" d
                    let [j, m, t] = tokenize num s
                    return $ setDay (read j) (read m) (read t) d'
                  )

parseYear       :: DateParse -> DateParser DateParse
parseYear d     = ( do
                    (s, d') <- parseDateTok "yyyy" d
                    let [j] = tokenize num s
                    return $ setDay (read j) (-1) (-1) d'
                  )

-- parse a weekday and add it to the external rep.

parseWeekDay    :: DateParse -> DateParser DateParse
parseWeekDay d  = ( do
                    (_s, d') <- parseDateTok "weekday" d
                    return d'
                  )

-- parse a following hour spec, 5 fill tokens, words or delimiters are possible

parseFollowingHour      :: DateParse -> DateParser DateParse
parseFollowingHour
                = try .                         -- backtracking becomes neccessary
                  lookAheadN 5 parseHour        -- max 2 words and 3 delimiters

-- parse the simple time formats
parseHour       :: DateParse -> DateParser DateParse
parseHour d     = ( do
                    (s, d') <- parseDateTok "HHMM" d
                    let [h, m] = tokenize num s
                    return $ setHour (read h) (read m) d'
                  )
                  <|>
                  ( do
                    (s, d') <- parseDateTok "HH" d
                    let [h] = tokenize num s
                    return $ setHour (read h) 0 d'
                  )

-- ------------------------------------------------------------
--
-- auxiliary parser combinators

-- parse a token of a given type and add the text to the external rep.

parseDateTok    :: String -> DateParse -> DateParser (String, DateParse)
parseDateTok tty d
                = dateTok (isTokType (== tty)) d

dateTok         :: DateParser String -> DateParse -> DateParser (String, DateParse)
dateTok t d     = ( do
                    s <- t
                    return (s, appRep s d)
                  )

-- try to apply a parser, but first skip a given # of fill tokens

lookAheadN      :: Int -> (DateParse -> DateParser DateParse) -> DateParse -> DateParser DateParse
lookAheadN n p d
    | n <= 0    = p d
    | otherwise = do
                  (_, d1) <- dateTok fillTok d
                  ( lookAheadN (n - 1) p d1 <|> p d1 )

-- ------------------------------------------------------------
--
-- basic token parsers

-- the interface to the primitive parsec token parser
tok             :: (Token -> Bool) -> DateParser Token
tok prd         = tokenPrim showTok nextPos testTok
    where
      showTok               = show . fst
      nextPos pos _tok _ts  = incSourceColumn pos 1
      testTok tk            = if prd tk then Just tk else Nothing

-- check for specific token type and in case of success return the text value
isTokType       :: (String -> Bool) -> DateParser String
isTokType isT   = tok (isT . fst) >>= return . snd

-- parse an arbitrary token and return the text value
textTok         :: DateParser String
textTok         = isTokType (const True)

-- a word
wordTok         :: DateParser String
wordTok         = isTokType (== "word")

-- a delimiter, whitespace is normalized, sequences are reduced to a single space char
delTok          :: DateParser String
delTok          = isTokType (== "del")
                  >>=
                  return . sed (const " ") ws1

-- tokens that don't contain date info

fillTok         :: DateParser String
fillTok         = delTok <|> wordTok

-- semester tokens, not yet interpreted
semTok'         :: String -> DateParser (String, Int, Bool)
semTok' sem'     = do v <- isTokType (== sem')
                      return (v, read . head . tokenizeExt year $ v, sem' == "ssem")

semTok          :: DateParser (String, Int, Bool)
semTok          = semTok' "ssem" <|> semTok' "wsem"

-- ------------------------------------------------------------

-- conversion from month names to 1..12
monthToM        :: String -> String
monthToM m
    = show .
      (\ l -> if null l then 99 else head l) .
      map fst .
      filter ((== True) . snd) .
      map (second (`isPrefixOf` map toLower m)) $
      monthAbr

-- ------------------------------------------------------------

ts :: String
ts = "Am Sonntag, dem 17. Februar '03 findet um 9 Uhr ein wichtiger Termin für das Sommersemester 2000 statt. "
    ++ "Dieser wird allerdings auf Montag verschoben. Und zwar auf den ersten Montag im Wintersemester 11/12, 12:30. "
    ++ "Ein wichtiger Termin findet im SoSe 2011 statt. Im Jahr '12 gibt es Termine, aber auch in WS 2010/11. "
    ++ "Ein weiterer Termin ist  am 2.4.11 um 12 Uhr. Oder war es doch Di. der 3.4.? Egal. "
    ++ "Ein weiterer wichtiger Termin findet am 2001-3-4 statt bzw. generell zwischen 01/3/4 - 01/6/4 um 13 Uhr. "
    ++ "Am kommenden Mittwoch findet Changemanagement in HS5 statt. Dies gilt dann auch für den 7. Juni "
    ++ "des Jahres 2011. Noch ein wichtiger Termin findet um 16:15 Uhr am Do., 1.2.03 statt. "
    ++ "Freitag, der 13. Juli ist kein Glückstag"
    ++ "und Freitag, der 13. Juli um 11:55 Uhr ist es zu spät."

rrr :: [String]
rrr = map _r . dateSearch' . tokenizeSubex tokenRE $ ts

ddd :: [DateVal]
ddd = map _d . dateSearch' . tokenizeSubex tokenRE $ ts

aaa :: [DateRep]
aaa =          dateSearch' . tokenizeSubex tokenRE $ ts

tt :: String -> [(String, String)]
tt = tokenizeSubex tokenRE

dd :: String -> [DateVal]
dd = map _d . dateSearch' . tt

rr :: String -> [String]
rr = map _r . dateSearch' . tt

pp :: String -> [String]
pp = map _p . dateSearch' . tt

-- ------------------------------------------------------------

testDate :: forall a . StringLike a => Test' a
testDate
  = Test' $
    TestLabel "date and time extraction from free text" $
    TestList $
    zipWith parseT toks exx
  where
    parseT res ok
      = TestCase $
        assertEqual (show res ++ " == " ++ show ok) res ok
        
    toks :: [(a, a)]
    toks = tokenizeSubex (fromString tokenRE) (fromString ts)

    exx :: [(a, a)]
    exx = map (fromString *** fromString) $
          [("word","Am"),("del"," "),("weekday","Sonntag"),("del",", "),("word","dem"),("del"," ")
          ,("ddMonthyyyy","17. Februar '03"),("del"," "),("word","findet"),("del"," "),("word","um")
          ,("del"," "),("HH","9 Uhr"),("del"," "),("word","ein"),("del"," "),("word","wichtiger")
          ,("del"," "),("word","Termin"),("del"," "),("word","f\252r"),("del"," "),("word","das")
          ,("del"," "),("ssem","Sommersemester 2000"),("del"," "),("word","statt"),("del",". ")
          ,("word","Dieser"),("del"," "),("word","wird"),("del"," "),("word","allerdings")
          ,("del"," "),("word","auf"),("del"," "),("weekday","Montag"),("del"," ")
          ,("word","verschoben"),("del",". "),("word","Und"),("del"," "),("word","zwar")
          ,("del"," "),("word","auf"),("del"," "),("word","den"),("del"," "),("word","ersten")
          ,("del"," "),("weekday","Montag"),("del"," "),("word","im"),("del"," ")
          ,("wsem","Wintersemester 11/12"),("del",", "),("HHMM","12:30"),("del",". ")
          ,("word","Ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin")
          ,("del"," "),("word","findet"),("del"," "),("word","im"),("del"," "),("ssem","SoSe 2011")
          ,("del"," "),("word","statt"),("del",". "),("word","Im"),("del"," "),("word","Jahr")
          ,("del"," '"),("word","12"),("del"," "),("word","gibt"),("del"," "),("word","es")
          ,("del"," "),("word","Termine"),("del",", "),("word","aber"),("del"," "),("word","auch")
          ,("del"," "),("word","in"),("del"," "),("wsem","WS 2010/11"),("del",". "),("word","Ein")
          ,("del"," "),("word","weiterer"),("del"," "),("word","Termin"),("del"," "),("word","ist")
          ,("del","  "),("word","am"),("del"," "),("ddmmyyyy","2.4.11"),("del"," "),("word","um")
          ,("del"," "),("HH","12 Uhr"),("del",". "),("word","Oder"),("del"," "),("word","war")
          ,("del"," "),("word","es"),("del"," "),("word","doch"),("del"," "),("weekday","Di.")
          ,("del"," "),("word","der"),("del"," "),("ddmm","3.4."),("del","? "),("word","Egal")
          ,("del",". "),("word","Ein"),("del"," "),("word","weiterer"),("del"," ")
          ,("word","wichtiger"),("del"," "),("word","Termin"),("del"," "),("word","findet")
          ,("del"," "),("word","am"),("del"," "),("yyyymmdd","2001-3-4"),("del"," ")
          ,("word","statt"),("del"," "),("word","bzw"),("del",". "),("word","generell")
          ,("del"," "),("word","zwischen"),("del"," "),("yyyymmdd","01/3/4"),("del"," - ")
          ,("yyyymmdd","01/6/4"),("del"," "),("word","um"),("del"," "),("HH","13 Uhr")
          ,("del",". "),("word","Am"),("del"," "),("word","kommenden"),("del"," ")
          ,("weekday","Mittwoch"),("del"," "),("word","findet"),("del"," ")
          ,("word","Changemanagement"),("del"," "),("word","in"),("del"," "),("word","HS5")
          ,("del"," "),("word","statt"),("del",". "),("word","Dies"),("del"," "),("word","gilt")
          ,("del"," "),("word","dann"),("del"," "),("word","auch"),("del"," "),("word","f\252r")
          ,("del"," "),("word","den"),("del"," "),("ddMonth","7. Juni"),("del"," "),("word","des")
          ,("del"," "),("word","Jahres"),("del"," "),("yyyy","2011"),("del",". "),("word","Noch")
          ,("del"," "),("word","ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin")
          ,("del"," "),("word","findet"),("del"," "),("word","um"),("del"," "),("HHMM","16:15 Uhr")
          ,("del"," "),("word","am"),("del"," "),("weekday","Do."),("del",", "),("ddmmyyyy","1.2.03")
          ,("del"," "),("word","statt"),("del",". "),("weekday","Freitag"),("del",", ")
          ,("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","ist"),("del"," ")
          ,("word","kein"),("del"," "),("word","Gl\252ckstagund"),("del"," "),("weekday","Freitag")
          ,("del",", "),("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","um")
          ,("del"," "),("HHMM","11:55 Uhr"),("del"," "),("word","ist"),("del"," "),("word","es")
          ,("del"," "),("word","zu"),("del"," "),("word","sp\228t"),("del",".")
          ]

    
-- ------------------------------------------------------------

genericTest :: (forall a . StringLike a => Test' a) -> Test
genericTest t
    = TestList $
      [ TestLabel "Test with 'String'"          $ unTest' (t :: Test' String)
      , TestLabel "Test with 'Text'"            $ unTest' (t :: Test' Text)
      , TestLabel "Test with 'Text.Lazy'"       $ unTest' (t :: Test' TextL)
      , TestLabel "Test with 'ByteString'"      $ unTest' (t :: Test' BS)
      , TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL)
      ]

allTests        :: Test
allTests
    = TestList
      [ genericTest testDate ]

main    :: IO ()
main
    = do
      c <- runTestTT allTests
      putStrLn $ show c
      let errs = errors c
          fails = failures c
      exitWith (codeGet errs fails)

codeGet :: Int -> Int -> ExitCode
codeGet errs fails
    | fails > 0       = ExitFailure 2
    | errs > 0        = ExitFailure 1
    | otherwise       = ExitSuccess

-- ------------------------------------------------------------