Blame src/kwset.c

Packit 709fb3
/* kwset.c - search for any of a set of keywords.
Packit 709fb3
   Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2017 Free Software
Packit 709fb3
   Foundation, Inc.
Packit 709fb3
Packit 709fb3
   This program is free software; you can redistribute it and/or modify
Packit 709fb3
   it under the terms of the GNU General Public License as published by
Packit 709fb3
   the Free Software Foundation; either version 3, or (at your option)
Packit 709fb3
   any later version.
Packit 709fb3
Packit 709fb3
   This program is distributed in the hope that it will be useful,
Packit 709fb3
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 709fb3
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 709fb3
   GNU General Public License for more details.
Packit 709fb3
Packit 709fb3
   You should have received a copy of the GNU General Public License
Packit 709fb3
   along with this program; if not, write to the Free Software
Packit 709fb3
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
Packit 709fb3
   02110-1301, USA.  */
Packit 709fb3
Packit 709fb3
/* Written August 1989 by Mike Haertel.  */
Packit 709fb3
Packit 709fb3
/* For the Aho-Corasick algorithm, see:
Packit 709fb3
   Aho AV, Corasick MJ. Efficient string matching: an aid to
Packit 709fb3
   bibliographic search. CACM 18, 6 (1975), 333-40
Packit 709fb3
   <http://dx.doi.org/10.1145/360825.360855>, which describes the
Packit 709fb3
   failure function used below.
Packit 709fb3
Packit 709fb3
   For the Boyer-Moore algorithm, see: Boyer RS, Moore JS.
Packit 709fb3
   A fast string searching algorithm. CACM 20, 10 (1977), 762-72
Packit 709fb3
   <http://dx.doi.org/10.1145/359842.359859>.
Packit 709fb3
Packit 709fb3
   For a survey of more-recent string matching algorithms that might
Packit 709fb3
   help improve performance, see: Faro S, Lecroq T. The exact online
Packit 709fb3
   string matching problem: a review of the most recent results.
Packit 709fb3
   ACM Computing Surveys 45, 2 (2013), 13
Packit 709fb3
   <http://dx.doi.org/10.1145/2431211.2431212>.  */
Packit 709fb3
Packit 709fb3
#include <config.h>
Packit 709fb3
Packit 709fb3
#include "kwset.h"
Packit 709fb3
Packit 709fb3
#include <stdint.h>
Packit 709fb3
#include <sys/types.h>
Packit 709fb3
#include "system.h"
Packit 709fb3
#include "intprops.h"
Packit 709fb3
#include "memchr2.h"
Packit 709fb3
#include "obstack.h"
Packit 709fb3
#include "xalloc.h"
Packit 709fb3
#include "verify.h"
Packit 709fb3
Packit 709fb3
#define obstack_chunk_alloc xmalloc
Packit 709fb3
#define obstack_chunk_free free
Packit 709fb3
Packit 709fb3
static unsigned char
Packit 709fb3
U (char ch)
Packit 709fb3
{
Packit 709fb3
  return to_uchar (ch);
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Balanced tree of edges and labels leaving a given trie node.  */
Packit 709fb3
struct tree
Packit 709fb3
{
Packit 709fb3
  struct tree *llink;		/* Left link; MUST be first field.  */
Packit 709fb3
  struct tree *rlink;		/* Right link (to larger labels).  */
Packit 709fb3
  struct trie *trie;		/* Trie node pointed to by this edge.  */
Packit 709fb3
  unsigned char label;		/* Label on this edge.  */
Packit 709fb3
  char balance;			/* Difference in depths of subtrees.  */
Packit 709fb3
};
Packit 709fb3
Packit 709fb3
/* Node of a trie representing a set of keywords.  */
Packit 709fb3
struct trie
Packit 709fb3
{
Packit 709fb3
  /* If an accepting node, this is either 2*W + 1 where W is the word
Packit 709fb3
     index, or is SIZE_MAX if Aho-Corasick is in use and FAIL
Packit 709fb3
     specifies where to look for more info.  If not an accepting node,
Packit 709fb3
     this is zero.  */
Packit 709fb3
  size_t accepting;
Packit 709fb3
Packit 709fb3
  struct tree *links;		/* Tree of edges leaving this node.  */
Packit 709fb3
  struct trie *parent;		/* Parent of this node.  */
Packit 709fb3
  struct trie *next;		/* List of all trie nodes in level order.  */
Packit 709fb3
  struct trie *fail;		/* Aho-Corasick failure function.  */
Packit 709fb3
  ptrdiff_t depth;		/* Depth of this node from the root.  */
Packit 709fb3
  ptrdiff_t shift;		/* Shift function for search failures.  */
Packit 709fb3
  ptrdiff_t maxshift;		/* Max shift of self and descendants.  */
Packit 709fb3
};
Packit 709fb3
Packit 709fb3
/* Structure returned opaquely to the caller, containing everything.  */
Packit 709fb3
struct kwset
Packit 709fb3
{
Packit 709fb3
  struct obstack obstack;	/* Obstack for node allocation.  */
Packit 709fb3
  ptrdiff_t words;		/* Number of words in the trie.  */
Packit 709fb3
  struct trie *trie;		/* The trie itself.  */
Packit 709fb3
  ptrdiff_t mind;		/* Minimum depth of an accepting node.  */
Packit 709fb3
  ptrdiff_t maxd;		/* Maximum depth of any node.  */
Packit 709fb3
  unsigned char delta[NCHAR];	/* Delta table for rapid search.  */
Packit 709fb3
  struct trie *next[NCHAR];	/* Table of children of the root.  */
Packit 709fb3
  char *target;			/* Target string if there's only one.  */
Packit 709fb3
  ptrdiff_t *shift;		/* Used in Boyer-Moore search for one
Packit 709fb3
                                   string.  */
Packit 709fb3
  char const *trans;		/* Character translation table.  */
Packit 709fb3
Packit 709fb3
  /* This helps to match a terminal byte, which is the first byte
Packit 709fb3
     for Aho-Corasick, and the last byte for Boyer-More.  If all the
Packit 709fb3
     patterns have the same terminal byte (after translation via TRANS
Packit 709fb3
     if TRANS is nonnull), then this is that byte as an unsigned char.
Packit 709fb3
     Otherwise this is -1 if there is disagreement among the strings
Packit 709fb3
     about terminal bytes, and -2 if there are no terminal bytes and
Packit 709fb3
     no disagreement because all the patterns are empty.  */
Packit 709fb3
  int gc1;
Packit 709fb3
Packit 709fb3
  /* This helps to match a terminal byte.  If 0 <= GC1HELP, B is
Packit 709fb3
     terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP
Packit 709fb3
     is common here).  This is typically faster than evaluating
Packit 709fb3
     to_uchar (TRANS[B]) == GC1.  */
Packit 709fb3
  int gc1help;
Packit 709fb3
Packit 709fb3
  /* If the string has two or more bytes, this is the penultimate byte,
Packit 709fb3
     after translation via TRANS if TRANS is nonnull.  This variable
Packit 709fb3
     is used only by Boyer-Moore.  */
Packit 709fb3
  char gc2;
Packit 709fb3
Packit 709fb3
  /* kwsexec implementation.  */
Packit 709fb3
  ptrdiff_t (*kwsexec) (kwset_t, char const *, ptrdiff_t,
Packit 709fb3
                        struct kwsmatch *, bool);
Packit 709fb3
};
Packit 709fb3
Packit 709fb3
/* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
Packit 709fb3
static inline char
Packit 709fb3
tr (char const *trans, char c)
Packit 709fb3
{
Packit 709fb3
  return trans ? trans[U(c)] : c;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
static ptrdiff_t acexec (kwset_t, char const *, ptrdiff_t,
Packit 709fb3
                         struct kwsmatch *, bool);
Packit 709fb3
static ptrdiff_t bmexec (kwset_t, char const *, ptrdiff_t,
Packit 709fb3
                         struct kwsmatch *, bool);
Packit 709fb3
Packit 709fb3
/* Return a newly allocated keyword set.  A nonnull TRANS specifies a
Packit 709fb3
   table of character translations to be applied to all pattern and
Packit 709fb3
   search text.  */
Packit 709fb3
kwset_t
Packit 709fb3
kwsalloc (char const *trans)
Packit 709fb3
{
Packit 709fb3
  struct kwset *kwset = xmalloc (sizeof *kwset);
Packit 709fb3
Packit 709fb3
  obstack_init (&kwset->obstack);
Packit 709fb3
  kwset->words = 0;
Packit 709fb3
  kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
Packit 709fb3
  kwset->trie->accepting = 0;
Packit 709fb3
  kwset->trie->links = NULL;
Packit 709fb3
  kwset->trie->parent = NULL;
Packit 709fb3
  kwset->trie->next = NULL;
Packit 709fb3
  kwset->trie->fail = NULL;
Packit 709fb3
  kwset->trie->depth = 0;
Packit 709fb3
  kwset->trie->shift = 0;
Packit 709fb3
  kwset->mind = PTRDIFF_MAX;
Packit 709fb3
  kwset->maxd = -1;
Packit 709fb3
  kwset->target = NULL;
Packit 709fb3
  kwset->trans = trans;
Packit 709fb3
  kwset->kwsexec = acexec;
Packit 709fb3
Packit 709fb3
  return kwset;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* This upper bound is valid for CHAR_BIT >= 4 and
Packit 709fb3
   exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }.  */
Packit 709fb3
enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
Packit 709fb3
Packit 709fb3
/* Add the given string to the contents of the keyword set.  */
Packit 709fb3
void
Packit 709fb3
kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
Packit 709fb3
{
Packit 709fb3
  assume (0 <= len);
Packit 709fb3
  struct trie *trie = kwset->trie;
Packit 709fb3
  char const *trans = kwset->trans;
Packit 709fb3
  bool reverse = kwset->kwsexec == bmexec;
Packit 709fb3
Packit 709fb3
  if (reverse)
Packit 709fb3
    text += len;
Packit 709fb3
Packit 709fb3
  /* Descend the trie (built of keywords) character-by-character,
Packit 709fb3
     installing new nodes when necessary.  */
Packit 709fb3
  while (len--)
Packit 709fb3
    {
Packit 709fb3
      unsigned char uc = reverse ? *--text : *text++;
Packit 709fb3
      unsigned char label = trans ? trans[uc] : uc;
Packit 709fb3
Packit 709fb3
      /* Descend the tree of outgoing links for this trie node,
Packit 709fb3
         looking for the current character and keeping track
Packit 709fb3
         of the path followed.  */
Packit 709fb3
      struct tree *cur = trie->links;
Packit 709fb3
      struct tree *links[DEPTH_SIZE];
Packit 709fb3
      enum { L, R } dirs[DEPTH_SIZE];
Packit 709fb3
      links[0] = (struct tree *) &trie->links;
Packit 709fb3
      dirs[0] = L;
Packit 709fb3
      ptrdiff_t depth = 1;
Packit 709fb3
Packit 709fb3
      while (cur && label != cur->label)
Packit 709fb3
        {
Packit 709fb3
          links[depth] = cur;
Packit 709fb3
          if (label < cur->label)
Packit 709fb3
            dirs[depth++] = L, cur = cur->llink;
Packit 709fb3
          else
Packit 709fb3
            dirs[depth++] = R, cur = cur->rlink;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      /* The current character doesn't have an outgoing link at
Packit 709fb3
         this trie node, so build a new trie node and install
Packit 709fb3
         a link in the current trie node's tree.  */
Packit 709fb3
      if (!cur)
Packit 709fb3
        {
Packit 709fb3
          cur = obstack_alloc (&kwset->obstack, sizeof *cur);
Packit 709fb3
          cur->llink = NULL;
Packit 709fb3
          cur->rlink = NULL;
Packit 709fb3
          cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
Packit 709fb3
          cur->trie->accepting = 0;
Packit 709fb3
          cur->trie->links = NULL;
Packit 709fb3
          cur->trie->parent = trie;
Packit 709fb3
          cur->trie->next = NULL;
Packit 709fb3
          cur->trie->fail = NULL;
Packit 709fb3
          cur->trie->depth = trie->depth + 1;
Packit 709fb3
          cur->trie->shift = 0;
Packit 709fb3
          cur->label = label;
Packit 709fb3
          cur->balance = 0;
Packit 709fb3
Packit 709fb3
          /* Install the new tree node in its parent.  */
Packit 709fb3
          if (dirs[--depth] == L)
Packit 709fb3
            links[depth]->llink = cur;
Packit 709fb3
          else
Packit 709fb3
            links[depth]->rlink = cur;
Packit 709fb3
Packit 709fb3
          /* Back up the tree fixing the balance flags.  */
Packit 709fb3
          while (depth && !links[depth]->balance)
Packit 709fb3
            {
Packit 709fb3
              if (dirs[depth] == L)
Packit 709fb3
                --links[depth]->balance;
Packit 709fb3
              else
Packit 709fb3
                ++links[depth]->balance;
Packit 709fb3
              --depth;
Packit 709fb3
            }
Packit 709fb3
Packit 709fb3
          /* Rebalance the tree by pointer rotations if necessary.  */
Packit 709fb3
          if (depth && ((dirs[depth] == L && --links[depth]->balance)
Packit 709fb3
                        || (dirs[depth] == R && ++links[depth]->balance)))
Packit 709fb3
            {
Packit 709fb3
              struct tree *t, *r, *l, *rl, *lr;
Packit 709fb3
Packit 709fb3
              switch (links[depth]->balance)
Packit 709fb3
                {
Packit 709fb3
                case (char) -2:
Packit 709fb3
                  switch (dirs[depth + 1])
Packit 709fb3
                    {
Packit 709fb3
                    case L:
Packit 709fb3
                      r = links[depth], t = r->llink, rl = t->rlink;
Packit 709fb3
                      t->rlink = r, r->llink = rl;
Packit 709fb3
                      t->balance = r->balance = 0;
Packit 709fb3
                      break;
Packit 709fb3
                    case R:
Packit 709fb3
                      r = links[depth], l = r->llink, t = l->rlink;
Packit 709fb3
                      rl = t->rlink, lr = t->llink;
Packit 709fb3
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
Packit 709fb3
                      l->balance = t->balance != 1 ? 0 : -1;
Packit 709fb3
                      r->balance = t->balance != (char) -1 ? 0 : 1;
Packit 709fb3
                      t->balance = 0;
Packit 709fb3
                      break;
Packit 709fb3
                    default:
Packit 709fb3
                      abort ();
Packit 709fb3
                    }
Packit 709fb3
                  break;
Packit 709fb3
                case 2:
Packit 709fb3
                  switch (dirs[depth + 1])
Packit 709fb3
                    {
Packit 709fb3
                    case R:
Packit 709fb3
                      l = links[depth], t = l->rlink, lr = t->llink;
Packit 709fb3
                      t->llink = l, l->rlink = lr;
Packit 709fb3
                      t->balance = l->balance = 0;
Packit 709fb3
                      break;
Packit 709fb3
                    case L:
Packit 709fb3
                      l = links[depth], r = l->rlink, t = r->llink;
Packit 709fb3
                      lr = t->llink, rl = t->rlink;
Packit 709fb3
                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
Packit 709fb3
                      l->balance = t->balance != 1 ? 0 : -1;
Packit 709fb3
                      r->balance = t->balance != (char) -1 ? 0 : 1;
Packit 709fb3
                      t->balance = 0;
Packit 709fb3
                      break;
Packit 709fb3
                    default:
Packit 709fb3
                      abort ();
Packit 709fb3
                    }
Packit 709fb3
                  break;
Packit 709fb3
                default:
Packit 709fb3
                  abort ();
Packit 709fb3
                }
Packit 709fb3
Packit 709fb3
              if (dirs[depth - 1] == L)
Packit 709fb3
                links[depth - 1]->llink = t;
Packit 709fb3
              else
Packit 709fb3
                links[depth - 1]->rlink = t;
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      trie = cur->trie;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  /* Mark the node finally reached as accepting, encoding the
Packit 709fb3
     index number of this word in the keyword set so far.  */
Packit 709fb3
  if (!trie->accepting)
Packit 709fb3
    {
Packit 709fb3
      size_t words = kwset->words;
Packit 709fb3
      trie->accepting = 2 * words + 1;
Packit 709fb3
    }
Packit 709fb3
  ++kwset->words;
Packit 709fb3
Packit 709fb3
  /* Keep track of the longest and shortest string of the keyword set.  */
Packit 709fb3
  if (trie->depth < kwset->mind)
Packit 709fb3
    kwset->mind = trie->depth;
Packit 709fb3
  if (trie->depth > kwset->maxd)
Packit 709fb3
    kwset->maxd = trie->depth;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
ptrdiff_t
Packit 709fb3
kwswords (kwset_t kwset)
Packit 709fb3
{
Packit 709fb3
  return kwset->words;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Enqueue the trie nodes referenced from the given tree in the
Packit 709fb3
   given queue.  */
Packit 709fb3
static void
Packit 709fb3
enqueue (struct tree *tree, struct trie **last)
Packit 709fb3
{
Packit 709fb3
  if (!tree)
Packit 709fb3
    return;
Packit 709fb3
  enqueue (tree->llink, last);
Packit 709fb3
  enqueue (tree->rlink, last);
Packit 709fb3
  (*last) = (*last)->next = tree->trie;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Compute the Aho-Corasick failure function for the trie nodes referenced
Packit 709fb3
   from the given tree, given the failure function for their parent as
Packit 709fb3
   well as a last resort failure node.  */
Packit 709fb3
static void
Packit 709fb3
treefails (struct tree const *tree, struct trie const *fail,
Packit 709fb3
           struct trie *recourse, bool reverse)
Packit 709fb3
{
Packit 709fb3
  struct tree *cur;
Packit 709fb3
Packit 709fb3
  if (!tree)
Packit 709fb3
    return;
Packit 709fb3
Packit 709fb3
  treefails (tree->llink, fail, recourse, reverse);
Packit 709fb3
  treefails (tree->rlink, fail, recourse, reverse);
Packit 709fb3
Packit 709fb3
  /* Find, in the chain of fails going back to the root, the first
Packit 709fb3
     node that has a descendant on the current label.  */
Packit 709fb3
  while (fail)
Packit 709fb3
    {
Packit 709fb3
      cur = fail->links;
Packit 709fb3
      while (cur && tree->label != cur->label)
Packit 709fb3
        if (tree->label < cur->label)
Packit 709fb3
          cur = cur->llink;
Packit 709fb3
        else
Packit 709fb3
          cur = cur->rlink;
Packit 709fb3
      if (cur)
Packit 709fb3
        {
Packit 709fb3
          tree->trie->fail = cur->trie;
Packit 709fb3
          if (!reverse && cur->trie->accepting && !tree->trie->accepting)
Packit 709fb3
            tree->trie->accepting = SIZE_MAX;
Packit 709fb3
          return;
Packit 709fb3
        }
Packit 709fb3
      fail = fail->fail;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  tree->trie->fail = recourse;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Set delta entries for the links of the given tree such that
Packit 709fb3
   the preexisting delta value is larger than the current depth.  */
Packit 709fb3
static void
Packit 709fb3
treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[])
Packit 709fb3
{
Packit 709fb3
  if (!tree)
Packit 709fb3
    return;
Packit 709fb3
  treedelta (tree->llink, depth, delta);
Packit 709fb3
  treedelta (tree->rlink, depth, delta);
Packit 709fb3
  if (depth < delta[tree->label])
Packit 709fb3
    delta[tree->label] = depth;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Return true if A has every label in B.  */
Packit 709fb3
static bool _GL_ATTRIBUTE_PURE
Packit 709fb3
hasevery (struct tree const *a, struct tree const *b)
Packit 709fb3
{
Packit 709fb3
  if (!b)
Packit 709fb3
    return true;
Packit 709fb3
  if (!hasevery (a, b->llink))
Packit 709fb3
    return false;
Packit 709fb3
  if (!hasevery (a, b->rlink))
Packit 709fb3
    return false;
Packit 709fb3
  while (a && b->label != a->label)
Packit 709fb3
    if (b->label < a->label)
Packit 709fb3
      a = a->llink;
Packit 709fb3
    else
Packit 709fb3
      a = a->rlink;
Packit 709fb3
  return !!a;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Compute a vector, indexed by character code, of the trie nodes
Packit 709fb3
   referenced from the given tree.  */
Packit 709fb3
static void
Packit 709fb3
treenext (struct tree const *tree, struct trie *next[])
Packit 709fb3
{
Packit 709fb3
  if (!tree)
Packit 709fb3
    return;
Packit 709fb3
  treenext (tree->llink, next);
Packit 709fb3
  treenext (tree->rlink, next);
Packit 709fb3
  next[tree->label] = tree->trie;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Prepare a built keyword set for use.  */
Packit 709fb3
void
Packit 709fb3
kwsprep (kwset_t kwset)
Packit 709fb3
{
Packit 709fb3
  char const *trans = kwset->trans;
Packit 709fb3
  ptrdiff_t i;
Packit 709fb3
  unsigned char deltabuf[NCHAR];
Packit 709fb3
  unsigned char *delta = trans ? deltabuf : kwset->delta;
Packit 709fb3
  struct trie *curr, *last;
Packit 709fb3
Packit 709fb3
  /* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise.  */
Packit 709fb3
  bool reverse = kwset->words == 1;
Packit 709fb3
Packit 709fb3
  if (reverse)
Packit 709fb3
    {
Packit 709fb3
      kwset_t new_kwset;
Packit 709fb3
Packit 709fb3
      /* Enqueue the immediate descendants in the level order queue.  */
Packit 709fb3
      for (curr = last = kwset->trie; curr; curr = curr->next)
Packit 709fb3
        enqueue (curr->links, &last);
Packit 709fb3
Packit 709fb3
      /* Looking for just one string.  Extract it from the trie.  */
Packit 709fb3
      kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
Packit 709fb3
      for (i = 0, curr = kwset->trie; i < kwset->mind; ++i)
Packit 709fb3
        {
Packit 709fb3
          kwset->target[i] = curr->links->label;
Packit 709fb3
          curr = curr->next;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      new_kwset = kwsalloc (kwset->trans);
Packit 709fb3
      new_kwset->kwsexec = bmexec;
Packit 709fb3
      kwsincr (new_kwset, kwset->target, kwset->mind);
Packit 709fb3
      obstack_free (&kwset->obstack, NULL);
Packit 709fb3
      *kwset = *new_kwset;
Packit 709fb3
      free (new_kwset);
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  /* Initial values for the delta table; will be changed later.  The
Packit 709fb3
     delta entry for a given character is the smallest depth of any
Packit 709fb3
     node at which an outgoing edge is labeled by that character.  */
Packit 709fb3
  memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
Packit 709fb3
Packit 709fb3
  /* Traverse the nodes of the trie in level order, simultaneously
Packit 709fb3
     computing the delta table, failure function, and shift function.  */
Packit 709fb3
  for (curr = last = kwset->trie; curr; curr = curr->next)
Packit 709fb3
    {
Packit 709fb3
      /* Enqueue the immediate descendants in the level order queue.  */
Packit 709fb3
      enqueue (curr->links, &last);
Packit 709fb3
Packit 709fb3
      /* Update the delta table for the descendants of this node.  */
Packit 709fb3
      treedelta (curr->links, curr->depth, delta);
Packit 709fb3
Packit 709fb3
      /* Compute the failure function for the descendants of this node.  */
Packit 709fb3
      treefails (curr->links, curr->fail, kwset->trie, reverse);
Packit 709fb3
Packit 709fb3
      if (reverse)
Packit 709fb3
        {
Packit 709fb3
          curr->shift = kwset->mind;
Packit 709fb3
          curr->maxshift = kwset->mind;
Packit 709fb3
Packit 709fb3
          /* Update the shifts at each node in the current node's chain
Packit 709fb3
             of fails back to the root.  */
Packit 709fb3
          struct trie *fail;
Packit 709fb3
          for (fail = curr->fail; fail; fail = fail->fail)
Packit 709fb3
            {
Packit 709fb3
              /* If the current node has some outgoing edge that the fail
Packit 709fb3
                 doesn't, then the shift at the fail should be no larger
Packit 709fb3
                 than the difference of their depths.  */
Packit 709fb3
              if (!hasevery (fail->links, curr->links))
Packit 709fb3
                if (curr->depth - fail->depth < fail->shift)
Packit 709fb3
                  fail->shift = curr->depth - fail->depth;
Packit 709fb3
Packit 709fb3
              /* If the current node is accepting then the shift at the
Packit 709fb3
                 fail and its descendants should be no larger than the
Packit 709fb3
                 difference of their depths.  */
Packit 709fb3
              if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
Packit 709fb3
                fail->maxshift = curr->depth - fail->depth;
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  if (reverse)
Packit 709fb3
    {
Packit 709fb3
      /* Traverse the trie in level order again, fixing up all nodes whose
Packit 709fb3
         shift exceeds their inherited maxshift.  */
Packit 709fb3
      for (curr = kwset->trie->next; curr; curr = curr->next)
Packit 709fb3
        {
Packit 709fb3
          if (curr->maxshift > curr->parent->maxshift)
Packit 709fb3
            curr->maxshift = curr->parent->maxshift;
Packit 709fb3
          if (curr->shift > curr->maxshift)
Packit 709fb3
            curr->shift = curr->maxshift;
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  /* Create a vector, indexed by character code, of the outgoing links
Packit 709fb3
     from the root node.  Accumulate GC1 and GC1HELP.  */
Packit 709fb3
  struct trie *nextbuf[NCHAR];
Packit 709fb3
  struct trie **next = trans ? nextbuf : kwset->next;
Packit 709fb3
  memset (next, 0, sizeof nextbuf);
Packit 709fb3
  treenext (kwset->trie->links, next);
Packit 709fb3
  int gc1 = -2;
Packit 709fb3
  int gc1help = -1;
Packit 709fb3
  for (i = 0; i < NCHAR; i++)
Packit 709fb3
    {
Packit 709fb3
      int ti = i;
Packit 709fb3
      if (trans)
Packit 709fb3
        {
Packit 709fb3
          ti = U(trans[i]);
Packit 709fb3
          kwset->next[i] = next[ti];
Packit 709fb3
        }
Packit 709fb3
      if (kwset->next[i])
Packit 709fb3
        {
Packit 709fb3
          if (gc1 < -1)
Packit 709fb3
            {
Packit 709fb3
              gc1 = ti;
Packit 709fb3
              gc1help = i;
Packit 709fb3
            }
Packit 709fb3
          else if (gc1 == ti)
Packit 709fb3
            gc1help = gc1help == ti ? i : -1;
Packit 709fb3
          else if (i == ti && gc1 == gc1help)
Packit 709fb3
            gc1help = i;
Packit 709fb3
          else
Packit 709fb3
            gc1 = -1;
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
  kwset->gc1 = gc1;
Packit 709fb3
  kwset->gc1help = gc1help;
Packit 709fb3
Packit 709fb3
  if (reverse)
Packit 709fb3
    {
Packit 709fb3
      /* Looking for just one string.  Extract it from the trie.  */
Packit 709fb3
      kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
Packit 709fb3
      for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
Packit 709fb3
        {
Packit 709fb3
          kwset->target[i] = curr->links->label;
Packit 709fb3
          curr = curr->next;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      if (kwset->mind > 1)
Packit 709fb3
        {
Packit 709fb3
          /* Looking for the delta2 shift that might be made after a
Packit 709fb3
             backwards match has failed.  Extract it from the trie.  */
Packit 709fb3
          kwset->shift
Packit 709fb3
            = obstack_alloc (&kwset->obstack,
Packit 709fb3
                             sizeof *kwset->shift * (kwset->mind - 1));
Packit 709fb3
          for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i)
Packit 709fb3
            {
Packit 709fb3
              kwset->shift[i] = curr->shift;
Packit 709fb3
              curr = curr->next;
Packit 709fb3
            }
Packit 709fb3
Packit 709fb3
          /* The penultimate byte.  */
Packit 709fb3
          kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  /* Fix things up for any translation table.  */
Packit 709fb3
  if (trans)
Packit 709fb3
    for (i = 0; i < NCHAR; ++i)
Packit 709fb3
      kwset->delta[i] = delta[U(trans[i])];
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Delta2 portion of a Boyer-Moore search.  *TP is the string text
Packit 709fb3
   pointer; it is updated in place.  EP is the end of the string text,
Packit 709fb3
   and SP the end of the pattern.  LEN is the pattern length; it must
Packit 709fb3
   be at least 2.  TRANS, if nonnull, is the input translation table.
Packit 709fb3
   GC1 and GC2 are the last and second-from last bytes of the pattern,
Packit 709fb3
   transliterated by TRANS; the caller precomputes them for
Packit 709fb3
   efficiency.  If D1 is nonnull, it is a delta1 table for shifting *TP
Packit 709fb3
   when failing.  KWSET->shift says how much to shift.  */
Packit 709fb3
static inline bool
Packit 709fb3
bm_delta2_search (char const **tpp, char const *ep, char const *sp,
Packit 709fb3
                  ptrdiff_t len,
Packit 709fb3
                  char const *trans, char gc1, char gc2,
Packit 709fb3
                  unsigned char const *d1, kwset_t kwset)
Packit 709fb3
{
Packit 709fb3
  char const *tp = *tpp;
Packit 709fb3
  ptrdiff_t d = len, skip = 0;
Packit 709fb3
Packit 709fb3
  while (true)
Packit 709fb3
    {
Packit 709fb3
      ptrdiff_t i = 2;
Packit 709fb3
      if (tr (trans, tp[-2]) == gc2)
Packit 709fb3
        {
Packit 709fb3
          while (++i <= d)
Packit 709fb3
            if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
Packit 709fb3
              break;
Packit 709fb3
          if (i > d)
Packit 709fb3
            {
Packit 709fb3
              for (i = d + skip + 1; i <= len; ++i)
Packit 709fb3
                if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
Packit 709fb3
                  break;
Packit 709fb3
              if (i > len)
Packit 709fb3
                {
Packit 709fb3
                  *tpp = tp - len;
Packit 709fb3
                  return true;
Packit 709fb3
                }
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      tp += d = kwset->shift[i - 2];
Packit 709fb3
      if (tp > ep)
Packit 709fb3
        break;
Packit 709fb3
      if (tr (trans, tp[-1]) != gc1)
Packit 709fb3
        {
Packit 709fb3
          if (d1)
Packit 709fb3
            tp += d1[U(tp[-1])];
Packit 709fb3
          break;
Packit 709fb3
        }
Packit 709fb3
      skip = i - 1;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  *tpp = tp;
Packit 709fb3
  return false;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Return the address of the first byte in the buffer S (of size N)
Packit 709fb3
   that matches the terminal byte specified by KWSET, or NULL if there
Packit 709fb3
   is no match.  KWSET->gc1 should be nonnegative.  */
Packit 709fb3
static char const *
Packit 709fb3
memchr_kwset (char const *s, ptrdiff_t n, kwset_t kwset)
Packit 709fb3
{
Packit 709fb3
  char const *slim = s + n;
Packit 709fb3
  if (kwset->gc1help < 0)
Packit 709fb3
    {
Packit 709fb3
      for (; s < slim; s++)
Packit 709fb3
        if (kwset->next[U(*s)])
Packit 709fb3
          return s;
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    {
Packit 709fb3
      int small_heuristic = 2;
Packit 709fb3
      size_t small_bytes = small_heuristic * sizeof (unsigned long int);
Packit 709fb3
      while (s < slim)
Packit 709fb3
        {
Packit 709fb3
          if (kwset->next[U(*s)])
Packit 709fb3
            return s;
Packit 709fb3
          s++;
Packit 709fb3
          if ((uintptr_t) s % small_bytes == 0)
Packit 709fb3
            return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s);
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
  return NULL;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Fast Boyer-Moore search (inlinable version).  */
Packit 709fb3
static inline ptrdiff_t
Packit 709fb3
bmexec_trans (kwset_t kwset, char const *text, ptrdiff_t size)
Packit 709fb3
{
Packit 709fb3
  assume (0 <= size);
Packit 709fb3
  unsigned char const *d1;
Packit 709fb3
  char const *ep, *sp, *tp;
Packit 709fb3
  int d;
Packit 709fb3
  ptrdiff_t len = kwset->mind;
Packit 709fb3
  char const *trans = kwset->trans;
Packit 709fb3
Packit 709fb3
  if (len == 0)
Packit 709fb3
    return 0;
Packit 709fb3
  if (len > size)
Packit 709fb3
    return -1;
Packit 709fb3
  if (len == 1)
Packit 709fb3
    {
Packit 709fb3
      tp = memchr_kwset (text, size, kwset);
Packit 709fb3
      return tp ? tp - text : -1;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  d1 = kwset->delta;
Packit 709fb3
  sp = kwset->target + len;
Packit 709fb3
  tp = text + len;
Packit 709fb3
  char gc1 = kwset->gc1;
Packit 709fb3
  char gc2 = kwset->gc2;
Packit 709fb3
Packit 709fb3
  /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2).  */
Packit 709fb3
  ptrdiff_t len12;
Packit 709fb3
  if (!INT_MULTIPLY_WRAPV (len, 12, &len12) && len12 < size)
Packit 709fb3
    /* 11 is not a bug, the initial offset happens only once.  */
Packit 709fb3
    for (ep = text + size - 11 * len; tp <= ep; )
Packit 709fb3
      {
Packit 709fb3
        char const *tp0 = tp;
Packit 709fb3
        d = d1[U(tp[-1])], tp += d;
Packit 709fb3
        d = d1[U(tp[-1])], tp += d;
Packit 709fb3
        if (d != 0)
Packit 709fb3
          {
Packit 709fb3
            d = d1[U(tp[-1])], tp += d;
Packit 709fb3
            d = d1[U(tp[-1])], tp += d;
Packit 709fb3
            d = d1[U(tp[-1])], tp += d;
Packit 709fb3
            if (d != 0)
Packit 709fb3
              {
Packit 709fb3
                d = d1[U(tp[-1])], tp += d;
Packit 709fb3
                d = d1[U(tp[-1])], tp += d;
Packit 709fb3
                d = d1[U(tp[-1])], tp += d;
Packit 709fb3
                if (d != 0)
Packit 709fb3
                  {
Packit 709fb3
                    d = d1[U(tp[-1])], tp += d;
Packit 709fb3
                    d = d1[U(tp[-1])], tp += d;
Packit 709fb3
Packit 709fb3
                    /* As a heuristic, prefer memchr to seeking by
Packit 709fb3
                       delta1 when the latter doesn't advance much.  */
Packit 709fb3
                    int advance_heuristic = 16 * sizeof (long);
Packit 709fb3
                    if (advance_heuristic <= tp - tp0)
Packit 709fb3
                      continue;
Packit 709fb3
                    tp--;
Packit 709fb3
                    tp = memchr_kwset (tp, text + size - tp, kwset);
Packit 709fb3
                    if (! tp)
Packit 709fb3
                      return -1;
Packit 709fb3
                    tp++;
Packit 709fb3
                    if (ep <= tp)
Packit 709fb3
                      break;
Packit 709fb3
                  }
Packit 709fb3
              }
Packit 709fb3
          }
Packit 709fb3
        if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
Packit 709fb3
          return tp - text;
Packit 709fb3
      }
Packit 709fb3
Packit 709fb3
  /* Now only a few characters are left to search.  Carefully avoid
Packit 709fb3
     ever producing an out-of-bounds pointer.  */
Packit 709fb3
  ep = text + size;
Packit 709fb3
  d = d1[U(tp[-1])];
Packit 709fb3
  while (d <= ep - tp)
Packit 709fb3
    {
Packit 709fb3
      d = d1[U((tp += d)[-1])];
Packit 709fb3
      if (d != 0)
Packit 709fb3
        continue;
Packit 709fb3
      if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
Packit 709fb3
        return tp - text;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  return -1;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Fast Boyer-Moore search.  */
Packit 709fb3
static ptrdiff_t
Packit 709fb3
bmexec (kwset_t kwset, char const *text, ptrdiff_t size,
Packit 709fb3
        struct kwsmatch *kwsmatch, bool longest)
Packit 709fb3
{
Packit 709fb3
  /* Help the compiler inline in two ways, depending on whether
Packit 709fb3
     kwset->trans is null.  */
Packit 709fb3
  ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING
Packit 709fb3
                   (kwset->trans
Packit 709fb3
                    ? bmexec_trans (kwset, text, size)
Packit 709fb3
                    : bmexec_trans (kwset, text, size)));
Packit 709fb3
  if (0 <= ret)
Packit 709fb3
    {
Packit 709fb3
       kwsmatch->index = 0;
Packit 709fb3
       kwsmatch->offset[0] = ret;
Packit 709fb3
       kwsmatch->size[0] = kwset->mind;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  return ret;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Hairy multiple string search with the Aho-Corasick algorithm.
Packit 709fb3
   (inlinable version)  */
Packit 709fb3
static inline ptrdiff_t
Packit 709fb3
acexec_trans (kwset_t kwset, char const *text, ptrdiff_t len,
Packit 709fb3
              struct kwsmatch *kwsmatch, bool longest)
Packit 709fb3
{
Packit 709fb3
  struct trie const *trie, *accept;
Packit 709fb3
  char const *tp, *left, *lim;
Packit 709fb3
  struct tree const *tree;
Packit 709fb3
  char const *trans;
Packit 709fb3
Packit 709fb3
  /* Initialize register copies and look for easy ways out.  */
Packit 709fb3
  if (len < kwset->mind)
Packit 709fb3
    return -1;
Packit 709fb3
  trans = kwset->trans;
Packit 709fb3
  trie = kwset->trie;
Packit 709fb3
  lim = text + len;
Packit 709fb3
  tp = text;
Packit 709fb3
Packit 709fb3
  if (!trie->accepting)
Packit 709fb3
    {
Packit 709fb3
      unsigned char c;
Packit 709fb3
      int gc1 = kwset->gc1;
Packit 709fb3
Packit 709fb3
      while (true)
Packit 709fb3
        {
Packit 709fb3
          if (gc1 < 0)
Packit 709fb3
            {
Packit 709fb3
              while (! (trie = kwset->next[c = tr (trans, *tp++)]))
Packit 709fb3
                if (tp >= lim)
Packit 709fb3
                  return -1;
Packit 709fb3
            }
Packit 709fb3
          else
Packit 709fb3
            {
Packit 709fb3
              tp = memchr_kwset (tp, lim - tp, kwset);
Packit 709fb3
              if (!tp)
Packit 709fb3
                return -1;
Packit 709fb3
              c = tr (trans, *tp++);
Packit 709fb3
              trie = kwset->next[c];
Packit 709fb3
            }
Packit 709fb3
Packit 709fb3
          while (true)
Packit 709fb3
            {
Packit 709fb3
              if (trie->accepting)
Packit 709fb3
                goto match;
Packit 709fb3
              if (tp >= lim)
Packit 709fb3
                return -1;
Packit 709fb3
              c = tr (trans, *tp++);
Packit 709fb3
Packit 709fb3
              for (tree = trie->links; c != tree->label; )
Packit 709fb3
                {
Packit 709fb3
                  tree = c < tree->label ? tree->llink : tree->rlink;
Packit 709fb3
                  if (! tree)
Packit 709fb3
                    {
Packit 709fb3
                      trie = trie->fail;
Packit 709fb3
                      if (!trie)
Packit 709fb3
                        {
Packit 709fb3
                          trie = kwset->next[c];
Packit 709fb3
                          if (trie)
Packit 709fb3
                            goto have_trie;
Packit 709fb3
                          if (tp >= lim)
Packit 709fb3
                            return -1;
Packit 709fb3
                          goto next_c;
Packit 709fb3
                        }
Packit 709fb3
                      if (trie->accepting)
Packit 709fb3
                        {
Packit 709fb3
                          --tp;
Packit 709fb3
                          goto match;
Packit 709fb3
                        }
Packit 709fb3
                      tree = trie->links;
Packit 709fb3
                    }
Packit 709fb3
                }
Packit 709fb3
              trie = tree->trie;
Packit 709fb3
            have_trie:;
Packit 709fb3
            }
Packit 709fb3
        next_c:;
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
 match:
Packit 709fb3
  accept = trie;
Packit 709fb3
  while (accept->accepting == SIZE_MAX)
Packit 709fb3
    accept = accept->fail;
Packit 709fb3
  left = tp - accept->depth;
Packit 709fb3
Packit 709fb3
  /* Try left-most longest match.  */
Packit 709fb3
  if (longest)
Packit 709fb3
    {
Packit 709fb3
      while (tp < lim)
Packit 709fb3
        {
Packit 709fb3
          struct trie const *accept1;
Packit 709fb3
          char const *left1;
Packit 709fb3
          unsigned char c = tr (trans, *tp++);
Packit 709fb3
Packit 709fb3
          do
Packit 709fb3
            {
Packit 709fb3
              tree = trie->links;
Packit 709fb3
              while (tree && c != tree->label)
Packit 709fb3
                tree = c < tree->label ? tree->llink : tree->rlink;
Packit 709fb3
            }
Packit 709fb3
          while (!tree && (trie = trie->fail) && accept->depth <= trie->depth);
Packit 709fb3
Packit 709fb3
          if (!tree)
Packit 709fb3
            break;
Packit 709fb3
          trie = tree->trie;
Packit 709fb3
          if (trie->accepting)
Packit 709fb3
            {
Packit 709fb3
              accept1 = trie;
Packit 709fb3
              while (accept1->accepting == SIZE_MAX)
Packit 709fb3
                accept1 = accept1->fail;
Packit 709fb3
              left1 = tp - accept1->depth;
Packit 709fb3
              if (left1 <= left)
Packit 709fb3
                {
Packit 709fb3
                  left = left1;
Packit 709fb3
                  accept = accept1;
Packit 709fb3
                }
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  kwsmatch->index = accept->accepting / 2;
Packit 709fb3
  kwsmatch->offset[0] = left - text;
Packit 709fb3
  kwsmatch->size[0] = accept->depth;
Packit 709fb3
Packit 709fb3
  return left - text;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Hairy multiple string search with Aho-Corasick algorithm.  */
Packit 709fb3
static ptrdiff_t
Packit 709fb3
acexec (kwset_t kwset, char const *text, ptrdiff_t size,
Packit 709fb3
        struct kwsmatch *kwsmatch, bool longest)
Packit 709fb3
{
Packit 709fb3
  assume (0 <= size);
Packit 709fb3
  /* Help the compiler inline in two ways, depending on whether
Packit 709fb3
     kwset->trans is null.  */
Packit 709fb3
  return (IGNORE_DUPLICATE_BRANCH_WARNING
Packit 709fb3
          (kwset->trans
Packit 709fb3
           ? acexec_trans (kwset, text, size, kwsmatch, longest)
Packit 709fb3
           : acexec_trans (kwset, text, size, kwsmatch, longest)));
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Find the first instance of a KWSET member in TEXT, which has SIZE bytes.
Packit 709fb3
   Return the offset (into TEXT) of the first byte of the matching substring,
Packit 709fb3
   or -1 if no match is found.  Upon a match, store details in
Packit 709fb3
   *KWSMATCH: index of matched keyword, start offset (same as the return
Packit 709fb3
   value), and length.  If LONGEST, find the longest match; otherwise
Packit 709fb3
   any match will do.  */
Packit 709fb3
ptrdiff_t
Packit 709fb3
kwsexec (kwset_t kwset, char const *text, ptrdiff_t size,
Packit 709fb3
         struct kwsmatch *kwsmatch, bool longest)
Packit 709fb3
{
Packit 709fb3
  return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Free the components of the given keyword set.  */
Packit 709fb3
void
Packit 709fb3
kwsfree (kwset_t kwset)
Packit 709fb3
{
Packit 709fb3
  obstack_free (&kwset->obstack, NULL);
Packit 709fb3
  free (kwset);
Packit 709fb3
}