{
-- -----------------------------------------------------------------------------
--
-- Parser.y, part of Alex
--
-- (c) Simon Marlow 2003
--
-- -----------------------------------------------------------------------------
{-# OPTIONS_GHC -w #-}
module Parser ( parse, P ) where
import AbsSyn
import Scan
import CharSet
import ParseMonad hiding ( StartCode )
import Data.Char
--import Debug.Trace
}
%tokentype { Token }
%name parse
%monad { P } { (>>=) } { return }
%lexer { lexer } { T _ EOFT }
%token
'.' { T _ (SpecialT '.') }
';' { T _ (SpecialT ';') }
'<' { T _ (SpecialT '<') }
'>' { T _ (SpecialT '>') }
',' { T _ (SpecialT ',') }
'$' { T _ (SpecialT '$') }
'|' { T _ (SpecialT '|') }
'*' { T _ (SpecialT '*') }
'+' { T _ (SpecialT '+') }
'?' { T _ (SpecialT '?') }
'{' { T _ (SpecialT '{') }
'}' { T _ (SpecialT '}') }
'(' { T _ (SpecialT '(') }
')' { T _ (SpecialT ')') }
'#' { T _ (SpecialT '#') }
'~' { T _ (SpecialT '~') }
'-' { T _ (SpecialT '-') }
'[' { T _ (SpecialT '[') }
']' { T _ (SpecialT ']') }
'^' { T _ (SpecialT '^') }
'/' { T _ (SpecialT '/') }
ZERO { T _ ZeroT }
STRING { T _ (StringT $$) }
BIND { T _ (BindT $$) }
ID { T _ (IdT $$) }
CODE { T _ (CodeT _) }
CHAR { T _ (CharT $$) }
SMAC { T _ (SMacT _) }
RMAC { T _ (RMacT $$) }
SMAC_DEF { T _ (SMacDefT $$) }
RMAC_DEF { T _ (RMacDefT $$) }
WRAPPER { T _ WrapperT }
ENCODING { T _ EncodingT }
ACTIONTYPE { T _ ActionTypeT }
TOKENTYPE { T _ TokenTypeT }
TYPECLASS { T _ TypeClassT }
%%
alex :: { (Maybe (AlexPosn,Code), [Directive], Scanner, Maybe (AlexPosn,Code)) }
: maybe_code directives macdefs scanner maybe_code { ($1,$2,$4,$5) }
maybe_code :: { Maybe (AlexPosn,Code) }
: CODE { case $1 of T pos (CodeT code) ->
Just (pos,code) }
| {- empty -} { Nothing }
directives :: { [Directive] }
: directive directives { $1 : $2 }
| {- empty -} { [] }
directive :: { Directive }
: WRAPPER STRING { WrapperDirective $2 }
| ENCODING encoding { EncodingDirective $2 }
| ACTIONTYPE STRING { ActionType $2 }
| TOKENTYPE STRING { TokenType $2 }
| TYPECLASS STRING { TypeClass $2 }
encoding :: { Encoding }
: STRING {% lookupEncoding $1 }
macdefs :: { () }
: macdef macdefs { () }
| {- empty -} { () }
-- hack: the lexer looks for the '=' in a macro definition, because there
-- doesn't seem to be a way to formulate the grammar here to avoid a
-- conflict (it needs LR(2) rather than LR(1) to find the '=' and distinguish
-- an SMAC/RMAC at the beginning of a definition from an SMAC/RMAC that is
-- part of a regexp in the previous definition).
macdef :: { () }
: SMAC_DEF set {% newSMac $1 $2 }
| RMAC_DEF rexp {% newRMac $1 $2 }
scanner :: { Scanner }
: BIND tokendefs { Scanner $1 $2 }
tokendefs :: { [RECtx] }
: tokendef tokendefs { $1 ++ $2 }
| {- empty -} { [] }
tokendef :: { [RECtx] }
: startcodes rule { [ replaceCodes $1 $2 ] }
| startcodes '{' rules '}' { map (replaceCodes $1) $3 }
| rule { [ $1 ] }
rule :: { RECtx }
: context rhs { let (l,e,r) = $1 in
RECtx [] l e r $2 }
rules :: { [RECtx] }
: rule rules { $1 : $2 }
| {- empty -} { [] }
startcodes :: { [(String,StartCode)] }
: '<' startcodes0 '>' { $2 }
startcodes0 :: { [(String,StartCode)] }
: startcode ',' startcodes0 { ($1,0) : $3 }
| startcode { [($1,0)] }
startcode :: { String }
: ZERO { "0" }
| ID { $1 }
rhs :: { Maybe Code }
: CODE { case $1 of T _ (CodeT code) -> Just code }
| ';' { Nothing }
context :: { Maybe CharSet, RExp, RightContext RExp }
: left_ctx rexp right_ctx { (Just $1,$2,$3) }
| rexp right_ctx { (Nothing,$1,$2) }
left_ctx :: { CharSet }
: '^' { charSetSingleton '\n' }
| set '^' { $1 }
right_ctx :: { RightContext RExp }
: '$' { RightContextRExp (Ch (charSetSingleton '\n')) }
| '/' rexp { RightContextRExp $2 }
| '/' CODE { RightContextCode (case $2 of
T _ (CodeT code) -> code) }
| {- empty -} { NoRightContext }
rexp :: { RExp }
: alt '|' rexp { $1 :| $3 }
| alt { $1 }
alt :: { RExp }
: alt term { $1 :%% $2 }
| term { $1 }
term :: { RExp }
: rexp0 rep { $2 $1 }
| rexp0 { $1 }
rep :: { RExp -> RExp }
: '*' { Star }
| '+' { Plus }
| '?' { Ques }
-- TODO: these don't check for digits
-- properly.
| '{' CHAR '}' { repeat_rng (digit $2) Nothing }
| '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) }
| '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) }
rexp0 :: { RExp }
: '(' ')' { Eps }
| STRING { foldr (:%%) Eps
(map (Ch . charSetSingleton) $1) }
| RMAC {% lookupRMac $1 }
| set { Ch $1 }
| '(' rexp ')' { $2 }
set :: { CharSet }
: set '#' set0 { $1 `charSetMinus` $3 }
| set0 { $1 }
set0 :: { CharSet }
: CHAR { charSetSingleton $1 }
| CHAR '-' CHAR { charSetRange $1 $3 }
| smac {% lookupSMac $1 }
| '[' sets ']' { foldr charSetUnion emptyCharSet $2 }
-- [^sets] is the same as '. # [sets]'
-- The upshot is that [^set] does *not* match a newline character,
-- which seems much more useful than just taking the complement.
| '[' '^' sets ']'
{% do { dot <- lookupSMac (tokPosn $1, ".");
return (dot `charSetMinus`
foldr charSetUnion emptyCharSet $3) }}
-- ~set is the same as '. # set'
| '~' set0 {% do { dot <- lookupSMac (tokPosn $1, ".");
return (dot `charSetMinus` $2) } }
sets :: { [CharSet] }
: set sets { $1 : $2 }
| {- empty -} { [] }
smac :: { (AlexPosn,String) }
: '.' { (tokPosn $1, ".") }
| SMAC { case $1 of T p (SMacT s) -> (p, s) }
{
happyError :: P a
happyError = failP "parse error"
-- -----------------------------------------------------------------------------
-- Utils
digit c = ord c - ord '0'
repeat_rng :: Int -> Maybe (Maybe Int) -> (RExp->RExp)
repeat_rng n (Nothing) re = foldr (:%%) Eps (replicate n re)
repeat_rng n (Just Nothing) re = foldr (:%%) (Star re) (replicate n re)
repeat_rng n (Just (Just m)) re = intl :%% rst
where
intl = repeat_rng n Nothing re
rst = foldr (\re re'->Ques(re :%% re')) Eps (replicate (m-n) re)
replaceCodes codes rectx = rectx{ reCtxStartCodes = codes }
lookupEncoding :: String -> P Encoding
lookupEncoding s = case map toLower s of
"iso-8859-1" -> return Latin1
"latin1" -> return Latin1
"utf-8" -> return UTF8
"utf8" -> return UTF8
_ -> failP ("encoding " ++ show s ++ " not supported")
}