Blob Blame History Raw
/* ambig.c -
   Content model ambiguity checking.

     Written by James Clark (jjc@jclark.com).
*/
/*
This uses the construction in pp8-9 of [1], extended to deal with AND
groups.

Note that it is not correct for the purposes of ambiguity analysis to
handle AND groups by turning them into an OR group of SEQ groups
(consider (a&b?)).

We build an automaton for the entire content model by adding the
following case for AND:

nullable(v) := nullable(left child) and nullable(right child)
if nullable(right child) then
    for each x in last(left child) do
       follow(v,x) = follow(left child,x) U first(right child);
if nullable(left child) then
    for each x in last(right child) do
        follow(v,x) = follow(right child,x) U first(left child);
first(v) := first(left child) U first(right child);
last(v) := first(left child) U first(right child);

We also build an automaton for each AND group by building automata for
each of the members of the AND group using the above procedure and
then combine the members using:

for each x in last(left child) do
   follow(v,x) = follow(left child,x) U first(right child);
for each x in last(right child) do
   follow(v,x) = follow(right child,x) U first(left child);
first(v) := first(left child) U first(right child);

The content model is ambiguous just in case one of these automata is
non-deterministic.  (Note that when checking determinism we need to
check the `first' set as well as all the `follow' sets.)

Why is this correct?  Consider a primitive token in a member of an AND
group.  There are two worst cases for ambiguity: firstly, when none of
the other members of AND group have been matched; secondly, when just
the nullable members remain to be matched.  The first case is not
affected by context of the AND group (unless the first case is
identical to the second case.)

Note that inclusions are not relevant for the purposes of determining
the ambiguity of content models. Otherwise the case in clause
11.2.5.1:

   An element that can satisfy an element in the content model is
   considered to do so, even if the element is also an inclusion.

could never arise.

[1] Anne Brueggemann-Klein, Regular Expressions into Finite Automata,
Universitaet Freiburg, Institut fur Informatik, 33 July 1991.
*/

#include "sgmlincl.h"

/* Sets of states are represented by 0-terminated, ordered lists of
indexes in gbuf. */

#define MAXSTATES (GRPGTCNT+2)
#define listcat(x, y) strcat((char *)(x), (char *)(y))
#define listcpy(x, y) strcpy((char *)(x), (char *)(y))

/* Information about a content token. */

struct contoken {
     UNCH size;
     UNCH nullable;
     UNCH *first;
     UNCH *last;
};

static VOID contoken P((int, int, struct contoken *));
static VOID andgroup P((int, int, struct contoken *));
static VOID orgroup P((int, int, struct contoken *));
static VOID seqgroup P((int, int, struct contoken *));
static VOID andambig P((int));
static int listambig P((UNCH *));
static VOID listmerge P((UNCH *, UNCH *));
static struct contoken *newcontoken P((void));
static VOID freecontoken P((struct contoken *));


/* Dynamically allocated vector of follow sets. */

static UNCH **follow;
static UNCH *mergebuf;		/* for use by listmerge */

/* Set to non-zero if the content model is ambiguous. */

static int ambigsw;

/* Check the current content model (in gbuf) for ambiguity. */

VOID ambig()
{
     struct contoken *s;
     int i;
     
     if (!follow) {
	  /* We can't allocate everything in one chunk, because that would
	     overflow a 16-bit unsigned if GRPGTCNT was 253. */
	  UNCH *ptr;
	  follow = (UNCH **)rmalloc(MAXSTATES*sizeof(UNCH *));
	  follow[0] = 0;
	  ptr = (UNCH *)rmalloc((MAXSTATES - 1)*MAXSTATES);
	  for (i = 1; i < MAXSTATES; i++) {
	       follow[i] = ptr;
	       ptr += MAXSTATES;
	  }
	  mergebuf = (UNCH *)rmalloc(MAXSTATES);
     }

     for (i = 1; i < MAXSTATES; i++)
	  follow[i][0] = 0;

     ambigsw = 0;

     s = newcontoken();
     contoken(1, 1, s);

     ambigsw = ambigsw || listambig(s->first);

     freecontoken(s);

     for (i = 1; !ambigsw && i < MAXSTATES; i++)
	  if (listambig(follow[i]))
	       ambigsw = 1;

     if (ambigsw)
	  mderr(137, (UNCH *)0, (UNCH *)0);
}

/* Free memory used for ambiguity checking. */

VOID ambigfree()
{
     if (follow) {
	  frem((UNIV)follow[1]);
	  frem((UNIV)follow);
	  frem((UNIV)mergebuf);
	  follow = 0;
     }
}

/* Determine whether a list of primitive content tokens (each
represented by its index in gbuf) is ambiguous. */

static
int listambig(list)
UNCH *list;
{
     UNCH *p;
     int chars = 0;
     int rc = 0;

     for (p = list; *p; p++) {
	  if ((gbuf[*p].ttype & TTMASK) == TTETD) {
	       struct etd *e = gbuf[*p].tu.thetd;
	       if (e->mark) {
		    rc = 1;
		    break;
	       }
	       e->mark = 1;
	  }
	  else {
	       assert((gbuf[*p].ttype & TTMASK) == TTCHARS);
	       if (chars) {
		    rc = 1;
		    break;
	       }
	       chars = 1;
	  }
     }

     for (p = list; *p; p++)
	  if ((gbuf[*p].ttype & TTMASK) == TTETD)
	       gbuf[*p].tu.thetd->mark = 0;

     return rc;
}


/* Analyze a content token.  The `checkand' argument is needed to ensure
that the algorithm is not exponential in the AND-group nesting depth.
*/

static
VOID contoken(m, checkand, res)
int m;				/* Index of content token in gbuf */
int checkand;			/* Non-zero if AND groups should be checked */
struct contoken *res;		/* Result */
{
     UNCH flags = gbuf[m].ttype;
     switch (flags & TTMASK) {
     case TTCHARS:
     case TTETD:
	  res->first[0] = m;
	  res->first[1] = 0;
	  res->last[0] = m;
	  res->last[1] = 0;
	  res->size = 1;
	  res->nullable = 0;
	  break;
     case TTAND:
	  if (checkand)
	       andambig(m);
	  andgroup(m, checkand, res);
	  break;
     case TTOR:
	  orgroup(m, checkand, res);
	  break;
     case TTSEQ:
	  seqgroup(m, checkand, res);
	  break;
     default:
	  abort();
     }
     if (flags & TREP) {
	  UNCH *p;
	  for (p = res->last; *p; p++)
	       listmerge(follow[*p], res->first);
     }
     if (flags & TOPT)
	  res->nullable = 1;
}

/* Check an AND group for ambiguity. */

static
VOID andambig(m)
int m;
{
     int i, tnum;
     int lim;
     struct contoken *curr;
     struct contoken *next;

     tnum = gbuf[m].tu.tnum;
     assert(tnum > 0);
     curr = newcontoken();
     next = newcontoken();
     contoken(m + 1, 0, curr);
     i = m + 1 + curr->size;
     curr->size += 1;
     for (--tnum; tnum > 0; --tnum) {
	  UNCH *p;
	  contoken(i, 0, next);
	  curr->size += next->size;
	  i += next->size;
	  for (p = curr->last; *p; p++)
	       listcat(follow[*p], next->first);
	  for (p = next->last; *p; p++)
	       listmerge(follow[*p], curr->first);
	  listcat(curr->first, next->first);
	  listcat(curr->last, next->last);
     }
     lim = m + curr->size;
     for (i = m + 1; i < lim; i++) {
	  if (listambig(follow[i]))
	       ambigsw = 1;
	  follow[i][0] = 0;
     }
     freecontoken(curr);
     freecontoken(next);
}

/* Handle an AND group. */

static
VOID andgroup(m, checkand, res)
int m;
int checkand;
struct contoken *res;
{
     int i, tnum;
     /* union of the first sets of nullable members of the group */
     UNCH *nullablefirst;
     struct contoken *next;

     tnum = gbuf[m].tu.tnum;
     assert(tnum > 0);
     contoken(m + 1, checkand, res);
     nullablefirst = (UNCH *)rmalloc(MAXSTATES);
     if (res->nullable)
	  listcpy(nullablefirst, res->first);
     else
	  nullablefirst[0] = 0;
     i = m + 1 + res->size;
     res->size += 1;
     next = newcontoken();
     for (--tnum; tnum > 0; --tnum) {
	  UNCH *p;
	  contoken(i, checkand, next);
	  res->size += next->size;
	  i += next->size;
	  if (next->nullable)
	       for (p = res->last; *p; p++)
		    listcat(follow[*p], next->first);
	  for (p = next->last; *p; p++)
	       listmerge(follow[*p], nullablefirst);
	  listcat(res->first, next->first);
	  if (next->nullable)
	       listcat(nullablefirst, next->first);
	  listcat(res->last, next->last);
	  res->nullable &= next->nullable;
     }
     frem((UNIV)nullablefirst);
     freecontoken(next);
}

/* Handle a SEQ group. */

static
VOID seqgroup(m, checkand, res)
int m;
int checkand;
struct contoken *res;
{
     int i, tnum;
     struct contoken *next;

     tnum = gbuf[m].tu.tnum;
     assert(tnum > 0);
     contoken(m + 1, checkand, res);
     i = m + 1 + res->size;
     res->size += 1;
     next = newcontoken();
     for (--tnum; tnum > 0; --tnum) {
	  UNCH *p;
	  contoken(i, checkand, next);
	  res->size += next->size;
	  i += next->size;
	  for (p = res->last; *p; p++)
	       listcat(follow[*p], next->first);
	  if (res->nullable)
	       listcat(res->first, next->first);
	  if (next->nullable)
	       listcat(res->last, next->last);
	  else
	       listcpy(res->last, next->last);
	  res->nullable &= next->nullable;
     }
     freecontoken(next);
}

/* Handle an OR group. */

static
VOID orgroup(m, checkand, res)
int m;
int checkand;
struct contoken *res;
{
     int i, tnum;
     struct contoken *next;

     tnum = gbuf[m].tu.tnum;
     assert(tnum > 0);
     contoken(m + 1, checkand, res);
     i = m + 1 + res->size;
     res->size += 1;
     next = newcontoken();
     for (--tnum; tnum > 0; --tnum) {
	  contoken(i, checkand, next);
	  res->size += next->size;
	  i += next->size;
	  listcat(res->first, next->first);
	  listcat(res->last, next->last);
	  res->nullable |= next->nullable;
     }
     freecontoken(next);
}


/* Merge the second ordered list into the first. */

static
VOID listmerge(p, b)
UNCH *p, *b;
{
     UNCH *a = mergebuf;

     strcpy((char *)a, (char *)p);

     for (;;) {
	  if (*a) {
	       if (*b) {
		    if (*a < *b)
			 *p++ = *a++;
		    else if (*a > *b)
			 *p++ = *b++;
		    else
			 a++;
	       }
	       else
		    *p++ = *a++;
	  }
	  else if (*b)
	       *p++ = *b++;
	  else
	       break;
     }
     *p = '\0';
}

static
struct contoken *newcontoken()
{
     struct contoken *p = (struct contoken *)rmalloc(sizeof(struct contoken)
						     + MAXSTATES*2);
     p->first = (UNCH *)(p + 1);
     p->last = p->first + MAXSTATES;
     return p;
}

static
VOID freecontoken(p)
struct contoken *p;
{
     frem((UNIV)p);
}

/*
Local Variables:
c-indent-level: 5
c-continued-statement-offset: 5
c-brace-offset: -5
c-argdecl-indent: 0
c-label-offset: -5
End:
*/