Blame src/DFA.hs

Packit 2cbdf3
-- -----------------------------------------------------------------------------
Packit 2cbdf3
-- 
Packit 2cbdf3
-- DFA.hs, part of Alex
Packit 2cbdf3
--
Packit 2cbdf3
-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
Packit 2cbdf3
--
Packit 2cbdf3
-- This module generates a DFA from a scanner by first converting it
Packit 2cbdf3
-- to an NFA and then converting the NFA with the subset construction.
Packit 2cbdf3
-- 
Packit 2cbdf3
-- See the chapter on `Finite Automata and Lexical Analysis' in the
Packit 2cbdf3
-- dragon book for an excellent overview of the algorithms in this
Packit 2cbdf3
-- module.
Packit 2cbdf3
--
Packit 2cbdf3
-- ----------------------------------------------------------------------------}
Packit 2cbdf3
Packit 2cbdf3
module DFA(scanner2dfa) where
Packit 2cbdf3
Packit 2cbdf3
import AbsSyn
Packit 2cbdf3
import qualified Map
Packit 2cbdf3
import qualified Data.IntMap as IntMap
Packit 2cbdf3
import NFA
Packit 2cbdf3
import Sort ( msort, nub' )
Packit 2cbdf3
import CharSet
Packit 2cbdf3
Packit 2cbdf3
import Data.Array ( (!) )
Packit 2cbdf3
import Data.Maybe ( fromJust )
Packit 2cbdf3
Packit 2cbdf3
{-                        Defined in the Scan Module
Packit 2cbdf3
Packit 2cbdf3
-- (This section should logically belong to the DFA module but it has been
Packit 2cbdf3
-- placed here to make this module self-contained.)
Packit 2cbdf3
--  
Packit 2cbdf3
-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
Packit 2cbdf3
-- it can be used directly to scan text efficiently.  Additionally it has an
Packit 2cbdf3
-- extra place holder for holding action functions for generating
Packit 2cbdf3
-- application-specific tokens.  When this place holder is not being used, the
Packit 2cbdf3
-- unit type will be used.
Packit 2cbdf3
--  
Packit 2cbdf3
-- Each state in the automaton consist of a list of `Accept' values, descending
Packit 2cbdf3
-- in priority, and an array mapping characters to new states.  As the array
Packit 2cbdf3
-- may only cover a sub-range of the characters, a default state number is
Packit 2cbdf3
-- given in the third field.  By convention, all transitions to the -1 state
Packit 2cbdf3
-- represent invalid transitions.
Packit 2cbdf3
--  
Packit 2cbdf3
-- A list of accept states is provided for as the original specification may
Packit 2cbdf3
-- have been ambiguous, in which case the highest priority token should be
Packit 2cbdf3
-- taken (the one appearing earliest in the specification); this can not be
Packit 2cbdf3
-- calculated when the DFA is generated in all cases as some of the tokens may
Packit 2cbdf3
-- be associated with leading or trailing context or start codes.
Packit 2cbdf3
--  
Packit 2cbdf3
-- `scan_token' (see above) can deal with unconditional accept states more
Packit 2cbdf3
-- efficiently than those associated with context; to save it testing each time
Packit 2cbdf3
-- whether the list of accept states contains an unconditional state, the flag
Packit 2cbdf3
-- in the first field of `St' is set to true whenever the list contains an
Packit 2cbdf3
-- unconditional state.
Packit 2cbdf3
--  
Packit 2cbdf3
-- The `Accept' structure contains the priority of the token being accepted
Packit 2cbdf3
-- (lower numbers => higher priorities), the name of the token, a place holder
Packit 2cbdf3
-- that can be used for storing the `action' function for constructing the
Packit 2cbdf3
-- token from the input text and thge scanner's state, a list of start codes
Packit 2cbdf3
-- (listing the start codes that the scanner must be in for the token to be
Packit 2cbdf3
-- accepted; empty => no restriction), the leading and trailing context (both
Packit 2cbdf3
-- `Nothing' if there is none).
Packit 2cbdf3
--  
Packit 2cbdf3
-- The leading context consists simply of a character predicate that will
Packit 2cbdf3
-- return true if the last character read is acceptable.  The trailing context
Packit 2cbdf3
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
Packit 2cbdf3
-- turns up any accepting state when applied to the residual input then the
Packit 2cbdf3
-- trailing context is acceptable (see `scan_token' above).
Packit 2cbdf3
Packit 2cbdf3
type DFA a = Array SNum (State a)
Packit 2cbdf3
Packit 2cbdf3
type SNum = Int
Packit 2cbdf3
Packit 2cbdf3
data State a = St Bool [Accept a] SNum (Array Char SNum)
Packit 2cbdf3
Packit 2cbdf3
data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum)
Packit 2cbdf3
Packit 2cbdf3
type StartCode = Int
Packit 2cbdf3
-}
Packit 2cbdf3
Packit 2cbdf3
Packit 2cbdf3
-- Scanners are converted to DFAs by converting them to NFAs first.  Converting
Packit 2cbdf3
-- an NFA to a DFA works by identifying the states of the DFA with subsets of
Packit 2cbdf3
-- the NFA.  The PartDFA is used to construct the DFA; it is essentially a DFA
Packit 2cbdf3
-- in which the states are represented directly by state sets of the NFA.
Packit 2cbdf3
-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the
Packit 2cbdf3
-- transitions from a given list of state sets, initially containing the start
Packit 2cbdf3
-- state of the partial DFA, until all possible state sets have been considered
Packit 2cbdf3
-- The final DFA is then constructed with a `mk_dfa'.
Packit 2cbdf3
Packit 2cbdf3
scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
Packit 2cbdf3
scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
Packit 2cbdf3
Packit 2cbdf3
nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
Packit 2cbdf3
nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
Packit 2cbdf3
        where
Packit 2cbdf3
        pdfa = new_pdfa n_starts nfa
Packit 2cbdf3
        n_starts = length scs  -- number of start states
Packit 2cbdf3
Packit 2cbdf3
-- `nfa2pdfa' works by taking the next outstanding state set to be considered
Packit 2cbdf3
-- and and ignoring it if the state is already in the partial DFA, otherwise
Packit 2cbdf3
-- generating all possible transitions from it, adding the new state to the
Packit 2cbdf3
-- partial DFA and continuing the closure with the extra states.  Note the way
Packit 2cbdf3
-- it incorporates the trailing context references into the search (by
Packit 2cbdf3
-- including `rctx_ss' in the search).
Packit 2cbdf3
Packit 2cbdf3
nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code
Packit 2cbdf3
nfa2pdfa _   pdfa [] = pdfa
Packit 2cbdf3
nfa2pdfa nfa pdfa (ss:umkd)
Packit 2cbdf3
  |  ss `in_pdfa` pdfa =  nfa2pdfa nfa pdfa  umkd
Packit 2cbdf3
  |  otherwise         =  nfa2pdfa nfa pdfa' umkd'
Packit 2cbdf3
  where
Packit 2cbdf3
        pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
Packit 2cbdf3
Packit 2cbdf3
        umkd' = rctx_sss ++ map snd ss_outs ++ umkd
Packit 2cbdf3
Packit 2cbdf3
        -- for each character, the set of states that character would take
Packit 2cbdf3
        -- us to from the current set of states in the NFA.
Packit 2cbdf3
        ss_outs :: [(Int, StateSet)]
Packit 2cbdf3
        ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
Packit 2cbdf3
                  | ch  <- byteSetElems $ setUnions [p | (p,_) <- outs],
Packit 2cbdf3
                    let ss'  = [ s' | (p,s') <- outs, byteSetElem p ch ],
Packit 2cbdf3
                    not (null ss')
Packit 2cbdf3
                  ]
Packit 2cbdf3
Packit 2cbdf3
        rctx_sss = [ mk_ss nfa [s]
Packit 2cbdf3
                   | Acc _ _ _ (RightContextRExp s) <- accs ]
Packit 2cbdf3
Packit 2cbdf3
        outs :: [(ByteSet,SNum)]
Packit 2cbdf3
        outs =  [ out | s <- ss, out <- nst_outs (nfa!s) ]
Packit 2cbdf3
Packit 2cbdf3
        accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
Packit 2cbdf3
Packit 2cbdf3
-- `sort_accs' sorts a list of accept values into decending order of priority,
Packit 2cbdf3
-- eliminating any elements that follow an unconditional accept value.
Packit 2cbdf3
Packit 2cbdf3
sort_accs:: [Accept a] -> [Accept a]
Packit 2cbdf3
sort_accs accs = foldr chk [] (msort le accs)
Packit 2cbdf3
        where
Packit 2cbdf3
        chk acc@(Acc _ _ Nothing NoRightContext) _   = [acc]
Packit 2cbdf3
        chk acc                                  rst = acc:rst
Packit 2cbdf3
Packit 2cbdf3
        le (Acc{accPrio = n}) (Acc{accPrio=n'}) = n<=n'
Packit 2cbdf3
Packit 2cbdf3
Packit 2cbdf3
Packit 2cbdf3
{------------------------------------------------------------------------------
Packit 2cbdf3
                          State Sets and Partial DFAs
Packit 2cbdf3
------------------------------------------------------------------------------}
Packit 2cbdf3
Packit 2cbdf3
Packit 2cbdf3
Packit 2cbdf3
-- A `PartDFA' is a partially constructed DFA in which the states are
Packit 2cbdf3
-- represented by sets of states of the original NFA.  It is represented by a
Packit 2cbdf3
-- triple consisting of the start state of the partial DFA, the NFA from which
Packit 2cbdf3
-- it is derived and a map from state sets to states of the partial DFA.  The
Packit 2cbdf3
-- state set for a given list of NFA states is calculated by taking the epsilon
Packit 2cbdf3
-- closure of all the states, sorting the result with duplicates eliminated.
Packit 2cbdf3
Packit 2cbdf3
type StateSet = [SNum]
Packit 2cbdf3
Packit 2cbdf3
new_pdfa:: Int -> NFA -> DFA StateSet a
Packit 2cbdf3
new_pdfa starts nfa
Packit 2cbdf3
 = DFA { dfa_start_states = start_ss,
Packit 2cbdf3
         dfa_states = Map.empty
Packit 2cbdf3
       }
Packit 2cbdf3
 where
Packit 2cbdf3
        start_ss = [ msort (<=) (nst_cl(nfa!n)) | n <- [0..(starts-1)]]
Packit 2cbdf3
Packit 2cbdf3
 -- starts is the number of start states
Packit 2cbdf3
Packit 2cbdf3
-- constructs the epsilon-closure of a set of NFA states
Packit 2cbdf3
mk_ss:: NFA -> [SNum] -> StateSet
Packit 2cbdf3
mk_ss nfa l = nub' (<=) [s'| s<-l, s'<-nst_cl(nfa!s)]
Packit 2cbdf3
Packit 2cbdf3
add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a
Packit 2cbdf3
add_pdfa ss pst (DFA st mp) = DFA st (Map.insert ss pst mp)
Packit 2cbdf3
Packit 2cbdf3
in_pdfa:: StateSet -> DFA StateSet a -> Bool
Packit 2cbdf3
in_pdfa ss (DFA _ mp) = ss `Map.member` mp
Packit 2cbdf3
Packit 2cbdf3
-- Construct a DFA with numbered states, from a DFA whose states are
Packit 2cbdf3
-- sets of states from the original NFA.
Packit 2cbdf3
Packit 2cbdf3
mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a
Packit 2cbdf3
mk_int_dfa nfa (DFA start_states mp)
Packit 2cbdf3
  = DFA [0 .. length start_states-1] 
Packit 2cbdf3
        (Map.fromList [ (lookup' st, cnv pds) | (st, pds) <- Map.toAscList mp ])
Packit 2cbdf3
  where
Packit 2cbdf3
        mp' = Map.fromList (zip (start_states ++ 
Packit 2cbdf3
                                 (map fst . Map.toAscList) (foldr Map.delete mp start_states)) [0..])
Packit 2cbdf3
Packit 2cbdf3
        lookup' = fromJust . flip Map.lookup mp'
Packit 2cbdf3
Packit 2cbdf3
        cnv :: State StateSet a -> State SNum a
Packit 2cbdf3
        cnv (State accs as) = State accs' as'
Packit 2cbdf3
                where
Packit 2cbdf3
                as'   = IntMap.mapWithKey (\_ch s -> lookup' s) as
Packit 2cbdf3
Packit 2cbdf3
                accs' = map cnv_acc accs
Packit 2cbdf3
                cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
Packit 2cbdf3
                  where rctx' = 
Packit 2cbdf3
                          case rctx of
Packit 2cbdf3
                                RightContextRExp s -> 
Packit 2cbdf3
                                  RightContextRExp (lookup' (mk_ss nfa [s]))
Packit 2cbdf3
                                other -> other
Packit 2cbdf3
Packit 2cbdf3
{-
Packit 2cbdf3
Packit 2cbdf3
-- `mk_st' constructs a state node from the list of accept values and a list of
Packit 2cbdf3
-- transitions.  The transitions list all the valid transitions out of the
Packit 2cbdf3
-- node; all invalid transitions should be represented in the array by state
Packit 2cbdf3
-- -1.  `mk_st' has to work out whether the accept states contain an
Packit 2cbdf3
-- unconditional entry, in which case the first field of `St' should be true,
Packit 2cbdf3
-- and which default state to use in constructing the array (the array may span
Packit 2cbdf3
-- a sub-range of the character set, the state number given the third argument
Packit 2cbdf3
-- of `St' being taken as the default if an input character lies outside the
Packit 2cbdf3
-- range).  The default values is chosen to minimise the bounds of the array
Packit 2cbdf3
-- and so there are two candidates: the value that 0 maps to (in which case
Packit 2cbdf3
-- some initial segment of the array may be omitted) or the value that 255 maps
Packit 2cbdf3
-- to (in which case a final segment of the array may be omitted), hence the
Packit 2cbdf3
-- calculation of `(df,bds)'.
Packit 2cbdf3
--  
Packit 2cbdf3
-- Note that empty arrays are avoided as they can cause severe problems for
Packit 2cbdf3
-- some popular Haskell compilers.
Packit 2cbdf3
Packit 2cbdf3
mk_st:: [Accept Code] -> [(Char,Int)] -> State Code
Packit 2cbdf3
mk_st accs as =
Packit 2cbdf3
        if null as
Packit 2cbdf3
           then St accs (-1) (listArray ('0','0') [-1])
Packit 2cbdf3
           else St accs df (listArray bds [arr!c| c<-range bds])
Packit 2cbdf3
        where
Packit 2cbdf3
        bds = if sz==0 then ('0','0') else bds0
Packit 2cbdf3
Packit 2cbdf3
        (sz,df,bds0) | sz1 < sz2 = (sz1,df1,bds1)
Packit 2cbdf3
                     | otherwise = (sz2,df2,bds2)
Packit 2cbdf3
Packit 2cbdf3
        (sz1,df1,bds1) = mk_bds(arr!chr 0)
Packit 2cbdf3
        (sz2,df2,bds2) = mk_bds(arr!chr 255)
Packit 2cbdf3
Packit 2cbdf3
        mk_bds df = (t-b, df, (chr b, chr (255-t)))
Packit 2cbdf3
                where
Packit 2cbdf3
                b = length (takeWhile id [arr!c==df| c<-['\0'..'\xff']])
Packit 2cbdf3
                t = length (takeWhile id [arr!c==df| c<-['\xff','\xfe'..'\0']])
Packit 2cbdf3
Packit 2cbdf3
        arr = listArray ('\0','\xff') (take 256 (repeat (-1))) // as
Packit 2cbdf3
-}