Blob Blame History Raw
-- -----------------------------------------------------------------------------
-- 
-- NFA.hs, part of Alex
--
-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
--
-- The `scanner2nfa' takes a `Scanner' (see the `RExp' module) and
-- generates its equivelent nondeterministic finite automaton.  NFAs
-- are turned into DFAs in the DFA module.
-- 
-- See the chapter on `Finite Automata and Lexical Analysis' in the
-- dragon book for an excellent overview of the algorithms in this
-- module.
--
-- ----------------------------------------------------------------------------}

module NFA where

import AbsSyn
import CharSet
import DFS ( t_close, out )
import Map ( Map )
import qualified Map hiding ( Map )
import Util ( str, space )

#if __GLASGOW_HASKELL__ < 710
import Control.Applicative ( Applicative(..) )
#endif
import Control.Monad ( forM_, zipWithM, zipWithM_, when, liftM, ap )
import Data.Array ( Array, (!), array, listArray, assocs, bounds )

-- Each state of a nondeterministic automaton contains a list of `Accept'
-- values, a list of epsilon transitions (an epsilon transition represents a
-- transition to another state that can be made without reading a character)
-- and a list of transitions qualified with a character predicate (the
-- transition can only be made to the given state on input of a character
-- permitted by the predicate).  Although a list of `Accept' values is provided
-- for, in actual fact each state will have zero or one of them (the `Maybe'
-- type is not used because the flexibility offered by the list representation
-- is useful).

type NFA = Array SNum NState

data NState = NSt {
 nst_accs :: [Accept Code],
 nst_cl   :: [SNum],
 nst_outs :: [(ByteSet,SNum)]
 }

-- Debug stuff
instance Show NState where
  showsPrec _ (NSt accs cl outs) =
    str "NSt " . shows accs . space . shows cl . space .
        shows [ (c, s) | (c,s) <- outs ]

{-                           From the Scan Module

-- The `Accept' structure contains the priority of the token being accepted
-- (lower numbers => higher priorities), the name of the token, a place holder
-- that can be used for storing the `action' function, a list of start codes
-- (listing the start codes that the scanner must be in for the token to be
-- accepted; empty => no restriction), the leading and trailing context (both
-- `Nothing' if there is none).
--  
-- The leading context consists simply of a character predicate that will
-- return true if the last character read is acceptable.  The trailing context
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
-- turns up any accepting state when applied to the residual input then the
-- trailing context is acceptable.
-}


-- `scanner2nfa' takes a scanner (see the AbsSyn module) and converts it to an
-- NFA, using the NFA creation monad (see below).
--
-- We generate a start state for each startcode, with the same number
-- as that startcode, and epsilon transitions from this state to each
-- of the sub-NFAs for each of the tokens acceptable in that startcode.

scanner2nfa:: Encoding -> Scanner -> [StartCode] -> NFA
scanner2nfa enc Scanner{scannerTokens = toks} startcodes
   = runNFA enc $
        do
          -- make a start state for each start code (these will be
          -- numbered from zero).
          start_states <- sequence (replicate (length startcodes) newState)
          
          -- construct the NFA for each token
          tok_states <- zipWithM do_token toks [0..]

          -- make an epsilon edge from each state state to each
          -- token that is acceptable in that state
          zipWithM_ (tok_transitions (zip toks tok_states)) 
                startcodes start_states

        where
          do_token (RECtx _scs lctx re rctx code) prio = do
                b <- newState
                e <- newState
                rexp2nfa b e re

                rctx_e <- case rctx of
                                  NoRightContext ->
                                        return NoRightContext
                                  RightContextCode code' ->
                                        return (RightContextCode code')
                                  RightContextRExp re' -> do 
                                        r_b <- newState
                                        r_e <- newState
                                        rexp2nfa r_b r_e re'
                                        accept r_e rctxt_accept
                                        return (RightContextRExp r_b)

                let lctx' = case lctx of
                                  Nothing -> Nothing
                                  Just st -> Just st

                accept e (Acc prio code lctx' rctx_e)
                return b

          tok_transitions toks_with_states start_code start_state = do
                let states = [ s | (RECtx scs _ _ _ _, s) <- toks_with_states,
                                   null scs || start_code `elem` map snd scs ]
                mapM_ (epsilonEdge start_state) states

-- -----------------------------------------------------------------------------
-- NFA creation from a regular expression

-- rexp2nfa B E R generates an NFA that begins in state B, recognises
-- R, and ends in state E only if R has been recognised. 

rexp2nfa :: SNum -> SNum -> RExp -> NFAM ()
rexp2nfa b e Eps    = epsilonEdge b e
rexp2nfa b e (Ch p) = charEdge b p e
rexp2nfa b e (re1 :%% re2) = do
  s <- newState
  rexp2nfa b s re1
  rexp2nfa s e re2
rexp2nfa b e (re1 :| re2) = do
  rexp2nfa b e re1
  rexp2nfa b e re2
rexp2nfa b e (Star re) = do
  s <- newState
  epsilonEdge b s
  rexp2nfa s s re
  epsilonEdge s e
rexp2nfa b e (Plus re) = do
  s1 <- newState
  s2 <- newState
  rexp2nfa s1 s2 re
  epsilonEdge b s1
  epsilonEdge s2 s1
  epsilonEdge s2 e
rexp2nfa b e (Ques re) = do
  rexp2nfa b e re
  epsilonEdge b e

-- -----------------------------------------------------------------------------
-- NFA creation monad.

-- Partial credit to Thomas Hallgren for this code, as I adapted it from
-- his "Lexing Haskell in Haskell" lexer generator.

type MapNFA = Map SNum NState

newtype NFAM a = N {unN :: SNum -> MapNFA -> Encoding -> (SNum, MapNFA, a)}

instance Functor NFAM where
  fmap = liftM

instance Applicative NFAM where
  pure a = N $ \s n _ -> (s,n,a)
  (<*>) = ap

instance Monad NFAM where
  return = pure

  m >>= k  = N $ \s n e -> case unN m s n e of
                                 (s', n', a) -> unN (k a) s' n' e

runNFA :: Encoding -> NFAM () -> NFA
runNFA e m = case unN m 0 Map.empty e of
                (s, nfa_map, ()) -> -- trace ("runNfa.." ++ show (Map.toAscList nfa_map)) $ 
                                    e_close (array (0,s-1) (Map.toAscList nfa_map))

e_close:: Array Int NState -> NFA
e_close ar = listArray bds
                [NSt accs (out gr v) outs|(v,NSt accs _ outs)<-assocs ar]
        where
        gr = t_close (hi+1,\v->nst_cl (ar!v))
        bds@(_,hi) = bounds ar

newState :: NFAM SNum
newState = N $ \s n _ -> (s+1,n,s)

getEncoding :: NFAM Encoding
getEncoding = N $ \s n e -> (s,n,e)

anyBytes :: SNum -> Int -> SNum -> NFAM ()
anyBytes from 0 to = epsilonEdge from to
anyBytes from n to = do
        s <- newState
        byteEdge from (byteSetRange 0 0xff) s
        anyBytes s (n-1) to

bytesEdge :: SNum -> [Byte] -> [Byte] -> SNum -> NFAM ()
bytesEdge from [] [] to = epsilonEdge from to
bytesEdge from [x] [y] to = byteEdge from (byteSetRange x y) to -- (OPTIMISATION)
bytesEdge from (x:xs) (y:ys) to 
    | x == y = do 
        s <- newState
        byteEdge from (byteSetSingleton x) s
        bytesEdge s xs ys to
    | x < y = do
        do s <- newState
           byteEdge from (byteSetSingleton x) s
           bytesEdge s xs (fmap (const 0xff) ys) to

        do t <- newState
           byteEdge from (byteSetSingleton y) t
           bytesEdge t (fmap (const 0x00) xs) ys to

        when ((x+1) <= (y-1)) $ do 
           u <- newState
           byteEdge from (byteSetRange (x+1) (y-1)) u
           anyBytes u (length xs) to
bytesEdge _ _ _ _ = undefined -- hide compiler warning

charEdge :: SNum -> CharSet -> SNum -> NFAM ()
charEdge from charset to = do
  -- trace ("charEdge: " ++ (show $ charset) ++ " => " ++ show (byteRanges charset)) $ 
  e <- getEncoding
  forM_ (byteRanges e charset) $ \(xs,ys) -> do
    bytesEdge from xs ys to
    


byteEdge :: SNum -> ByteSet -> SNum -> NFAM ()
byteEdge from charset to = N $ \s n _ -> (s, addEdge n, ())
 where
   addEdge n =
     case Map.lookup from n of
       Nothing -> 
           Map.insert from (NSt [] [] [(charset,to)]) n
       Just (NSt acc eps trans) ->
           Map.insert from (NSt acc eps ((charset,to):trans)) n

epsilonEdge :: SNum -> SNum -> NFAM ()
epsilonEdge from to 
 | from == to = return ()
 | otherwise  = N $ \s n _ -> let n' = addEdge n in n' `seq` (s, n', ())
 where
   addEdge n =
     case Map.lookup from n of
       Nothing                  -> Map.insert from (NSt [] [to] []) n
       Just (NSt acc eps trans) -> Map.insert from (NSt acc (to:eps) trans) n

accept :: SNum -> Accept Code -> NFAM ()
accept state new_acc = N $ \s n _ -> (s, addAccept n, ())
 where
   addAccept n = 
     case Map.lookup state n of
       Nothing ->
           Map.insert state (NSt [new_acc] [] []) n
       Just (NSt acc eps trans) ->
           Map.insert state (NSt (new_acc:acc) eps trans) n


rctxt_accept :: Accept Code
rctxt_accept = Acc 0 Nothing Nothing NoRightContext