Tree - source-git/alex - CentOS Git server

source-git / alex

Blame src/DFA.hs

Blob History Raw

Packit	2cbdf3	`-- -----------------------------------------------------------------------------`
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- DFA.hs, part of Alex`
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- (c) Chris Dornan 1995-2000, Simon Marlow 2003`
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- This module generates a DFA from a scanner by first converting it`
Packit	2cbdf3	`-- to an NFA and then converting the NFA with the subset construction.`
Packit	2cbdf3	`--`
Packit	2cbdf3	-- See the chapter on `Finite Automata and Lexical Analysis' in the
Packit	2cbdf3	`-- dragon book for an excellent overview of the algorithms in this`
Packit	2cbdf3	`-- module.`
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- ----------------------------------------------------------------------------}`
Packit	2cbdf3
Packit	2cbdf3	`module DFA(scanner2dfa) where`
Packit	2cbdf3
Packit	2cbdf3	`import AbsSyn`
Packit	2cbdf3	`import qualified Map`
Packit	2cbdf3	`import qualified Data.IntMap as IntMap`
Packit	2cbdf3	`import NFA`
Packit	2cbdf3	`import Sort ( msort, nub' )`
Packit	2cbdf3	`import CharSet`
Packit	2cbdf3
Packit	2cbdf3	`import Data.Array ( (!) )`
Packit	2cbdf3	`import Data.Maybe ( fromJust )`
Packit	2cbdf3
Packit	2cbdf3	`{- Defined in the Scan Module`
Packit	2cbdf3
Packit	2cbdf3	`-- (This section should logically belong to the DFA module but it has been`
Packit	2cbdf3	`-- placed here to make this module self-contained.)`
Packit	2cbdf3	`--`
Packit	2cbdf3	-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
Packit	2cbdf3	`-- it can be used directly to scan text efficiently. Additionally it has an`
Packit	2cbdf3	`-- extra place holder for holding action functions for generating`
Packit	2cbdf3	`-- application-specific tokens. When this place holder is not being used, the`
Packit	2cbdf3	`-- unit type will be used.`
Packit	2cbdf3	`--`
Packit	2cbdf3	-- Each state in the automaton consist of a list of `Accept' values, descending
Packit	2cbdf3	`-- in priority, and an array mapping characters to new states. As the array`
Packit	2cbdf3	`-- may only cover a sub-range of the characters, a default state number is`
Packit	2cbdf3	`-- given in the third field. By convention, all transitions to the -1 state`
Packit	2cbdf3	`-- represent invalid transitions.`
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- A list of accept states is provided for as the original specification may`
Packit	2cbdf3	`-- have been ambiguous, in which case the highest priority token should be`
Packit	2cbdf3	`-- taken (the one appearing earliest in the specification); this can not be`
Packit	2cbdf3	`-- calculated when the DFA is generated in all cases as some of the tokens may`
Packit	2cbdf3	`-- be associated with leading or trailing context or start codes.`
Packit	2cbdf3	`--`
Packit	2cbdf3	-- `scan_token' (see above) can deal with unconditional accept states more
Packit	2cbdf3	`-- efficiently than those associated with context; to save it testing each time`
Packit	2cbdf3	`-- whether the list of accept states contains an unconditional state, the flag`
Packit	2cbdf3	-- in the first field of `St' is set to true whenever the list contains an
Packit	2cbdf3	`-- unconditional state.`
Packit	2cbdf3	`--`
Packit	2cbdf3	-- The `Accept' structure contains the priority of the token being accepted
Packit	2cbdf3	`-- (lower numbers => higher priorities), the name of the token, a place holder`
Packit	2cbdf3	-- that can be used for storing the `action' function for constructing the
Packit	2cbdf3	`-- token from the input text and thge scanner's state, a list of start codes`
Packit	2cbdf3	`-- (listing the start codes that the scanner must be in for the token to be`
Packit	2cbdf3	`-- accepted; empty => no restriction), the leading and trailing context (both`
Packit	2cbdf3	-- `Nothing' if there is none).
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- The leading context consists simply of a character predicate that will`
Packit	2cbdf3	`-- return true if the last character read is acceptable. The trailing context`
Packit	2cbdf3	-- consists of an alternative starting state within the DFA; if this `sub-dfa'
Packit	2cbdf3	`-- turns up any accepting state when applied to the residual input then the`
Packit	2cbdf3	-- trailing context is acceptable (see `scan_token' above).
Packit	2cbdf3
Packit	2cbdf3	`type DFA a = Array SNum (State a)`
Packit	2cbdf3
Packit	2cbdf3	`type SNum = Int`
Packit	2cbdf3
Packit	2cbdf3	`data State a = St Bool [Accept a] SNum (Array Char SNum)`
Packit	2cbdf3
Packit	2cbdf3	`data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum)`
Packit	2cbdf3
Packit	2cbdf3	`type StartCode = Int`
Packit	2cbdf3	`-}`
Packit	2cbdf3
Packit	2cbdf3
Packit	2cbdf3	`-- Scanners are converted to DFAs by converting them to NFAs first. Converting`
Packit	2cbdf3	`-- an NFA to a DFA works by identifying the states of the DFA with subsets of`
Packit	2cbdf3	`-- the NFA. The PartDFA is used to construct the DFA; it is essentially a DFA`
Packit	2cbdf3	`-- in which the states are represented directly by state sets of the NFA.`
Packit	2cbdf3	-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the
Packit	2cbdf3	`-- transitions from a given list of state sets, initially containing the start`
Packit	2cbdf3	`-- state of the partial DFA, until all possible state sets have been considered`
Packit	2cbdf3	-- The final DFA is then constructed with a `mk_dfa'.
Packit	2cbdf3
Packit	2cbdf3	`scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code`
Packit	2cbdf3	`scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)`
Packit	2cbdf3
Packit	2cbdf3	`nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code`
Packit	2cbdf3	`nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))`
Packit	2cbdf3	`where`
Packit	2cbdf3	`pdfa = new_pdfa n_starts nfa`
Packit	2cbdf3	`n_starts = length scs -- number of start states`
Packit	2cbdf3
Packit	2cbdf3	-- `nfa2pdfa' works by taking the next outstanding state set to be considered
Packit	2cbdf3	`-- and and ignoring it if the state is already in the partial DFA, otherwise`
Packit	2cbdf3	`-- generating all possible transitions from it, adding the new state to the`
Packit	2cbdf3	`-- partial DFA and continuing the closure with the extra states. Note the way`
Packit	2cbdf3	`-- it incorporates the trailing context references into the search (by`
Packit	2cbdf3	-- including `rctx_ss' in the search).
Packit	2cbdf3
Packit	2cbdf3	`nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code`
Packit	2cbdf3	`nfa2pdfa _ pdfa [] = pdfa`
Packit	2cbdf3	`nfa2pdfa nfa pdfa (ss:umkd)`
Packit	2cbdf3	\| ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd
Packit	2cbdf3	`\| otherwise = nfa2pdfa nfa pdfa' umkd'`
Packit	2cbdf3	`where`
Packit	2cbdf3	`pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa`
Packit	2cbdf3
Packit	2cbdf3	`umkd' = rctx_sss ++ map snd ss_outs ++ umkd`
Packit	2cbdf3
Packit	2cbdf3	`-- for each character, the set of states that character would take`
Packit	2cbdf3	`-- us to from the current set of states in the NFA.`
Packit	2cbdf3	`ss_outs :: [(Int, StateSet)]`
Packit	2cbdf3	`ss_outs = [ (fromIntegral ch, mk_ss nfa ss')`
Packit	2cbdf3	`\| ch <- byteSetElems $ setUnions [p \| (p,_) <- outs],`
Packit	2cbdf3	`let ss' = [ s' \| (p,s') <- outs, byteSetElem p ch ],`
Packit	2cbdf3	`not (null ss')`
Packit	2cbdf3	`]`
Packit	2cbdf3
Packit	2cbdf3	`rctx_sss = [ mk_ss nfa [s]`
Packit	2cbdf3	`\| Acc _ _ _ (RightContextRExp s) <- accs ]`
Packit	2cbdf3
Packit	2cbdf3	`outs :: [(ByteSet,SNum)]`
Packit	2cbdf3	`outs = [ out \| s <- ss, out <- nst_outs (nfa!s) ]`
Packit	2cbdf3
Packit	2cbdf3	`accs = sort_accs [acc\| s<-ss, acc<-nst_accs (nfa!s)]`
Packit	2cbdf3
Packit	2cbdf3	-- `sort_accs' sorts a list of accept values into decending order of priority,
Packit	2cbdf3	`-- eliminating any elements that follow an unconditional accept value.`
Packit	2cbdf3
Packit	2cbdf3	`sort_accs:: [Accept a] -> [Accept a]`
Packit	2cbdf3	`sort_accs accs = foldr chk [] (msort le accs)`
Packit	2cbdf3	`where`
Packit	2cbdf3	`chk acc@(Acc _ _ Nothing NoRightContext) _ = [acc]`
Packit	2cbdf3	`chk acc rst = acc:rst`
Packit	2cbdf3
Packit	2cbdf3	`le (Acc{accPrio = n}) (Acc{accPrio=n'}) = n<=n'`
Packit	2cbdf3
Packit	2cbdf3
Packit	2cbdf3
Packit	2cbdf3	`{------------------------------------------------------------------------------`
Packit	2cbdf3	`State Sets and Partial DFAs`
Packit	2cbdf3	`------------------------------------------------------------------------------}`
Packit	2cbdf3
Packit	2cbdf3
Packit	2cbdf3
Packit	2cbdf3	-- A `PartDFA' is a partially constructed DFA in which the states are
Packit	2cbdf3	`-- represented by sets of states of the original NFA. It is represented by a`
Packit	2cbdf3	`-- triple consisting of the start state of the partial DFA, the NFA from which`
Packit	2cbdf3	`-- it is derived and a map from state sets to states of the partial DFA. The`
Packit	2cbdf3	`-- state set for a given list of NFA states is calculated by taking the epsilon`
Packit	2cbdf3	`-- closure of all the states, sorting the result with duplicates eliminated.`
Packit	2cbdf3
Packit	2cbdf3	`type StateSet = [SNum]`
Packit	2cbdf3
Packit	2cbdf3	`new_pdfa:: Int -> NFA -> DFA StateSet a`
Packit	2cbdf3	`new_pdfa starts nfa`
Packit	2cbdf3	`= DFA { dfa_start_states = start_ss,`
Packit	2cbdf3	`dfa_states = Map.empty`
Packit	2cbdf3	`}`
Packit	2cbdf3	`where`
Packit	2cbdf3	`start_ss = [ msort (<=) (nst_cl(nfa!n)) \| n <- [0..(starts-1)]]`
Packit	2cbdf3
Packit	2cbdf3	`-- starts is the number of start states`
Packit	2cbdf3
Packit	2cbdf3	`-- constructs the epsilon-closure of a set of NFA states`
Packit	2cbdf3	`mk_ss:: NFA -> [SNum] -> StateSet`
Packit	2cbdf3	`mk_ss nfa l = nub' (<=) [s'\| s<-l, s'<-nst_cl(nfa!s)]`
Packit	2cbdf3
Packit	2cbdf3	`add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a`
Packit	2cbdf3	`add_pdfa ss pst (DFA st mp) = DFA st (Map.insert ss pst mp)`
Packit	2cbdf3
Packit	2cbdf3	`in_pdfa:: StateSet -> DFA StateSet a -> Bool`
Packit	2cbdf3	in_pdfa ss (DFA _ mp) = ss `Map.member` mp
Packit	2cbdf3
Packit	2cbdf3	`-- Construct a DFA with numbered states, from a DFA whose states are`
Packit	2cbdf3	`-- sets of states from the original NFA.`
Packit	2cbdf3
Packit	2cbdf3	`mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a`
Packit	2cbdf3	`mk_int_dfa nfa (DFA start_states mp)`
Packit	2cbdf3	`= DFA [0 .. length start_states-1]`
Packit	2cbdf3	`(Map.fromList [ (lookup' st, cnv pds) \| (st, pds) <- Map.toAscList mp ])`
Packit	2cbdf3	`where`
Packit	2cbdf3	`mp' = Map.fromList (zip (start_states ++`
Packit	2cbdf3	`(map fst . Map.toAscList) (foldr Map.delete mp start_states)) [0..])`
Packit	2cbdf3
Packit	2cbdf3	`lookup' = fromJust . flip Map.lookup mp'`
Packit	2cbdf3
Packit	2cbdf3	`cnv :: State StateSet a -> State SNum a`
Packit	2cbdf3	`cnv (State accs as) = State accs' as'`
Packit	2cbdf3	`where`
Packit	2cbdf3	`as' = IntMap.mapWithKey (\_ch s -> lookup' s) as`
Packit	2cbdf3
Packit	2cbdf3	`accs' = map cnv_acc accs`
Packit	2cbdf3	`cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'`
Packit	2cbdf3	`where rctx' =`
Packit	2cbdf3	`case rctx of`
Packit	2cbdf3	`RightContextRExp s ->`
Packit	2cbdf3	`RightContextRExp (lookup' (mk_ss nfa [s]))`
Packit	2cbdf3	`other -> other`
Packit	2cbdf3
Packit	2cbdf3	`{-`
Packit	2cbdf3
Packit	2cbdf3	-- `mk_st' constructs a state node from the list of accept values and a list of
Packit	2cbdf3	`-- transitions. The transitions list all the valid transitions out of the`
Packit	2cbdf3	`-- node; all invalid transitions should be represented in the array by state`
Packit	2cbdf3	-- -1. `mk_st' has to work out whether the accept states contain an
Packit	2cbdf3	-- unconditional entry, in which case the first field of `St' should be true,
Packit	2cbdf3	`-- and which default state to use in constructing the array (the array may span`
Packit	2cbdf3	`-- a sub-range of the character set, the state number given the third argument`
Packit	2cbdf3	-- of `St' being taken as the default if an input character lies outside the
Packit	2cbdf3	`-- range). The default values is chosen to minimise the bounds of the array`
Packit	2cbdf3	`-- and so there are two candidates: the value that 0 maps to (in which case`
Packit	2cbdf3	`-- some initial segment of the array may be omitted) or the value that 255 maps`
Packit	2cbdf3	`-- to (in which case a final segment of the array may be omitted), hence the`
Packit	2cbdf3	-- calculation of `(df,bds)'.
Packit	2cbdf3	`--`
Packit	2cbdf3	`-- Note that empty arrays are avoided as they can cause severe problems for`
Packit	2cbdf3	`-- some popular Haskell compilers.`
Packit	2cbdf3
Packit	2cbdf3	`mk_st:: [Accept Code] -> [(Char,Int)] -> State Code`
Packit	2cbdf3	`mk_st accs as =`
Packit	2cbdf3	`if null as`
Packit	2cbdf3	`then St accs (-1) (listArray ('0','0') [-1])`
Packit	2cbdf3	`else St accs df (listArray bds [arr!c\| c<-range bds])`
Packit	2cbdf3	`where`
Packit	2cbdf3	`bds = if sz==0 then ('0','0') else bds0`
Packit	2cbdf3
Packit	2cbdf3	`(sz,df,bds0) \| sz1 < sz2 = (sz1,df1,bds1)`
Packit	2cbdf3	`\| otherwise = (sz2,df2,bds2)`
Packit	2cbdf3
Packit	2cbdf3	`(sz1,df1,bds1) = mk_bds(arr!chr 0)`
Packit	2cbdf3	`(sz2,df2,bds2) = mk_bds(arr!chr 255)`
Packit	2cbdf3
Packit	2cbdf3	`mk_bds df = (t-b, df, (chr b, chr (255-t)))`
Packit	2cbdf3	`where`
Packit	2cbdf3	`b = length (takeWhile id [arr!c==df\| c<-['\0'..'\xff']])`
Packit	2cbdf3	`t = length (takeWhile id [arr!c==df\| c<-['\xff','\xfe'..'\0']])`
Packit	2cbdf3
Packit	2cbdf3	`arr = listArray ('\0','\xff') (take 256 (repeat (-1))) // as`
Packit	2cbdf3	`-}`

source-git / alex

Source Code

Blame src/DFA.hs