Blob Blame History Raw
-------------------------------------------------------------------------------
--                  ALEX SCANNER AND LITERATE PREPROCESSOR
--
-- This Script defines the grammar used to generate the Alex scanner and a
-- preprocessing scanner for dealing with literate scripts.  The actions for
-- the Alex scanner are given separately in the Alex module.
--
-- See the Alex manual for a discussion of the scanners defined here.
--
-- Chris Dornan, Aug-95, 4-Jun-96, 10-Jul-96, 29-Sep-97
-------------------------------------------------------------------------------

{
module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where

import Data.Char
import ParseMonad
--import Debug.Trace
}

$digit    = 0-9
$hexdig   = [0-9 A-F a-f]
$octal    = 0-7
$lower    = a-z
$upper    = A-Z
$alpha    = [$upper $lower]
$alphanum = [$alpha $digit]
$idchar   = [$alphanum \_ \']

$special    = [\.\;\,\$\|\*\+\?\#\~\-\{\}\(\)\[\]\^\/]
$graphic    = $printable # $white
$nonspecial = $graphic # [$special \%]

@id     = $alpha $idchar*
@smac   = \$ @id | \$ \{ @id \}
@rmac   = \@ @id | \@ \{ @id \}

@comment = "--".*
@ws      = $white+ | @comment

alex :-

@ws                             { skip }     -- white space; ignore

<0> \" [^\"]* \"                { string }
<0> (@id @ws?)? \:\-            { bind }
<0> \{ / (\n | [^$digit])       { code }
<0> $special                    { special }  -- note: matches {
<0> \% "wrapper"                { wrapper }
<0> \% "encoding"               { encoding }
<0> \% "action"                 { actionty }
<0> \% "token"                  { tokenty }
<0> \% "typeclass"              { typeclass }

<0> \\ $digit+                  { decch }
<0> \\ x $hexdig+               { hexch }
<0> \\ o $octal+                { octch }
<0> \\ $printable               { escape }
<0> $nonspecial # [\<]          { char }
<0> @smac                       { smac }
<0> @rmac                       { rmac }

<0> @smac @ws? \=               { smacdef }
<0> @rmac @ws? \=               { rmacdef }

-- identifiers are allowed to be unquoted in startcode lists
<0>             \<              { special `andBegin` startcodes }
<startcodes>    0               { zero }
<startcodes>    @id             { startcode }
<startcodes>    \,              { special }
<startcodes>    \>              { special `andBegin` afterstartcodes }

-- After a <..> startcode sequence, we can have a {...} grouping of rules,
-- so don't try to interpret the opening { as a code block.
<afterstartcodes> \{ (\n | [^$digit ])  { special `andBegin` 0 }
<afterstartcodes> ()            { skip `andBegin` 0 }  -- note: empty pattern
{

-- -----------------------------------------------------------------------------
-- Token type

data Token = T AlexPosn Tkn
  deriving Show

tokPosn (T p _) = p

data Tkn
  = SpecialT Char
  | CodeT String
  | ZeroT
  | IdT String
  | StringT String
  | BindT String
  | CharT Char
  | SMacT String
  | RMacT String
  | SMacDefT String
  | RMacDefT String
  | NumT Int
  | WrapperT
  | EncodingT
  | ActionTypeT
  | TokenTypeT
  | TypeClassT
  | EOFT
  deriving Show

-- -----------------------------------------------------------------------------
-- Token functions

special, zero, string, bind, escape, decch, hexch, octch, char :: Action
smac, rmac, smacdef, rmacdef, startcode, wrapper, encoding :: Action
actionty, tokenty, typeclass :: Action
special   (p,_,str) _  = return $ T p (SpecialT  (head str))
zero      (p,_,_)   _  = return $ T p ZeroT
string    (p,_,str) ln = return $ T p (StringT (extract ln str))
bind      (p,_,str) _  = return $ T p (BindT (takeWhile isIdChar str))
escape    (p,_,str) _  = return $ T p (CharT (esc str))
decch     (p,_,str) ln = return $ T p (CharT (do_ech 10 ln (take (ln-1) (tail str))))
hexch     (p,_,str) ln = return $ T p (CharT (do_ech 16 ln (take (ln-2) (drop 2 str))))
octch     (p,_,str) ln = return $ T p (CharT (do_ech 8  ln (take (ln-2) (drop 2 str))))
char      (p,_,str) _  = return $ T p (CharT (head str))
smac      (p,_,str) ln = return $ T p (SMacT (mac ln str))
rmac      (p,_,str) ln = return $ T p (RMacT (mac ln str))
smacdef   (p,_,str) ln = return $ T p (SMacDefT (macdef ln str))
rmacdef   (p,_,str) ln = return $ T p (RMacDefT (macdef ln str))
startcode (p,_,str) ln = return $ T p (IdT (take ln str))
wrapper   (p,_,_)   _  = return $ T p WrapperT
encoding  (p,_,_)   _  = return $ T p EncodingT
actionty  (p,_,_)   _  = return $ T p ActionTypeT
tokenty   (p,_,_)   _  = return $ T p TokenTypeT
typeclass (p,_,_)   _  = return $ T p TypeClassT

isIdChar :: Char -> Bool
isIdChar c = isAlphaNum c || c `elem` "_'"

extract :: Int -> String -> String
extract ln str = take (ln-2) (tail str)

do_ech :: Int -> Int -> String -> Char
do_ech radix _ln str = chr (parseInt radix str)

mac :: Int -> String -> String
mac ln str = take (ln-1) $ tail str

-- TODO : replace not . isSpace with (\c -> not (isSpace c) && c /= '=')
macdef :: Int -> String -> String
macdef _ln str = takeWhile (\c -> not (isSpace c) && c /= '=') $ tail str

esc :: String -> Char
esc str =
  case head $ tail str of
    'a' -> '\a'
    'b' -> '\b'
    'f' -> '\f'
    'n' -> '\n'
    'r' -> '\r'
    't' -> '\t'
    'v' -> '\v'
    c   ->  c

parseInt :: Int -> String -> Int
parseInt radix ds = foldl1 (\n d -> n * radix + d) (map digitToInt ds)

-- In brace-delimited code, we have to be careful to match braces
-- within the code, but ignore braces inside strings and character
-- literals.  We do an approximate job (doing it properly requires
-- implementing a large chunk of the Haskell lexical syntax).

code :: Action
code (p,_,_inp) _ = do
  currentInput <- getInput
  go currentInput 1 ""
  where
    go :: AlexInput -> Int -> String -> P Token
    go inp 0 cs = do
      setInput inp
      return (T p (CodeT (reverse (tail cs))))
    go inp n cs = do
      case alexGetChar inp of
        Nothing       -> err inp
        Just (c,inp2) ->
          case c of
            '{'  -> go inp2 (n+1) (c:cs)
            '}'  -> go inp2 (n-1) (c:cs)
            '\'' -> go_char inp2 n (c:cs)
            '\"' -> go_str inp2 n (c:cs) '\"'
            c2   -> go inp2 n (c2:cs)

    go_char :: AlexInput -> Int -> String -> P Token
    -- try to catch multiple occurrences of ' at identifier end
    go_char inp n cs@('\'':'\'':_) = go inp n cs
    -- try to catch occurrences of ' within an identifier
    go_char inp n cs@('\'':c2:_)
      | isAlphaNum c2              = go inp n cs
    go_char inp n cs               = go_str inp n cs '\''

    go_str :: AlexInput -> Int -> String -> Char -> P Token
    go_str inp n cs end = do
      case alexGetChar inp of
          Nothing -> err inp
          Just (c,inp2)
            | c == end  -> go inp2 n (c:cs)
            | otherwise ->
              case c of
                '\\' -> case alexGetChar inp2 of
                          Nothing       -> err inp2
                          Just (d,inp3) -> go_str inp3 n (d:c:cs) end
                c2   -> go_str inp2 n (c2:cs) end

    err inp = do setInput inp; lexError "lexical error in code fragment"

lexError :: String -> P a
lexError s = do
  (_,_,_,input) <- getInput
  failP (s ++ (if (not (null input))
                  then " at " ++ show (head input)
                  else " at end of file"))

lexer :: (Token -> P a) -> P a
lexer cont = lexToken >>= cont

lexToken :: P Token
lexToken = do
  inp@(p,c,_,s) <- getInput
  sc <- getStartCode
  case alexScan inp sc of
    AlexEOF -> return (T p EOFT)
    AlexError _ -> lexError "lexical error"
    AlexSkip inp1 _ -> do
      setInput inp1
      lexToken
    AlexToken inp1 len t -> do
      setInput inp1
      t (p,c,s) len

type Action = (AlexPosn,Char,String) -> Int -> P Token

skip :: Action
skip _ _ = lexToken

andBegin :: Action -> StartCode -> Action
andBegin act sc inp len = setStartCode sc >> act inp len
}