Blame lib/str-kmp.h

Packit 33f14e
/* Substring search in a NUL terminated string of UNIT elements,
Packit 33f14e
   using the Knuth-Morris-Pratt algorithm.
Packit 33f14e
   Copyright (C) 2005-2017 Free Software Foundation, Inc.
Packit 33f14e
   Written by Bruno Haible <bruno@clisp.org>, 2005.
Packit 33f14e
Packit 33f14e
   This program is free software; you can redistribute it and/or modify
Packit 33f14e
   it under the terms of the GNU General Public License as published by
Packit 33f14e
   the Free Software Foundation; either version 3, or (at your option)
Packit 33f14e
   any later version.
Packit 33f14e
Packit 33f14e
   This program is distributed in the hope that it will be useful,
Packit 33f14e
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 33f14e
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 33f14e
   GNU General Public License for more details.
Packit 33f14e
Packit 33f14e
   You should have received a copy of the GNU General Public License
Packit 33f14e
   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
Packit 33f14e
Packit 33f14e
/* Before including this file, you need to define:
Packit 33f14e
     UNIT                    The element type of the needle and haystack.
Packit 33f14e
     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
Packit 33f14e
                             it has been fetched from needle or haystack.
Packit 33f14e
                             The argument is of type UNIT; the result must be
Packit 33f14e
                             of type UNIT as well.  */
Packit 33f14e
Packit 33f14e
/* Knuth-Morris-Pratt algorithm.
Packit 33f14e
   See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
Packit 33f14e
   HAYSTACK is the NUL terminated string in which to search for.
Packit 33f14e
   NEEDLE is the string to search for in HAYSTACK, consisting of NEEDLE_LEN
Packit 33f14e
   units.
Packit 33f14e
   Return a boolean indicating success:
Packit 33f14e
   Return true and set *RESULTP if the search was completed.
Packit 33f14e
   Return false if it was aborted because not enough memory was available.  */
Packit 33f14e
static bool
Packit 33f14e
knuth_morris_pratt (const UNIT *haystack,
Packit 33f14e
                    const UNIT *needle, size_t needle_len,
Packit 33f14e
                    const UNIT **resultp)
Packit 33f14e
{
Packit 33f14e
  size_t m = needle_len;
Packit 33f14e
Packit 33f14e
  /* Allocate the table.  */
Packit 33f14e
  size_t *table = (size_t *) nmalloca (m, sizeof (size_t));
Packit 33f14e
  if (table == NULL)
Packit 33f14e
    return false;
Packit 33f14e
  /* Fill the table.
Packit 33f14e
     For 0 < i < m:
Packit 33f14e
       0 < table[i] <= i is defined such that
Packit 33f14e
       forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
Packit 33f14e
       and table[i] is as large as possible with this property.
Packit 33f14e
     This implies:
Packit 33f14e
     1) For 0 < i < m:
Packit 33f14e
          If table[i] < i,
Packit 33f14e
          needle[table[i]..i-1] = needle[0..i-1-table[i]].
Packit 33f14e
     2) For 0 < i < m:
Packit 33f14e
          rhaystack[0..i-1] == needle[0..i-1]
Packit 33f14e
          and exists h, i <= h < m: rhaystack[h] != needle[h]
Packit 33f14e
          implies
Packit 33f14e
          forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
Packit 33f14e
     table[0] remains uninitialized.  */
Packit 33f14e
  {
Packit 33f14e
    size_t i, j;
Packit 33f14e
Packit 33f14e
    /* i = 1: Nothing to verify for x = 0.  */
Packit 33f14e
    table[1] = 1;
Packit 33f14e
    j = 0;
Packit 33f14e
Packit 33f14e
    for (i = 2; i < m; i++)
Packit 33f14e
      {
Packit 33f14e
        /* Here: j = i-1 - table[i-1].
Packit 33f14e
           The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
Packit 33f14e
           for x < table[i-1], by induction.
Packit 33f14e
           Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
Packit 33f14e
        UNIT b = CANON_ELEMENT (needle[i - 1]);
Packit 33f14e
Packit 33f14e
        for (;;)
Packit 33f14e
          {
Packit 33f14e
            /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
Packit 33f14e
               is known to hold for x < i-1-j.
Packit 33f14e
               Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
Packit 33f14e
            if (b == CANON_ELEMENT (needle[j]))
Packit 33f14e
              {
Packit 33f14e
                /* Set table[i] := i-1-j.  */
Packit 33f14e
                table[i] = i - ++j;
Packit 33f14e
                break;
Packit 33f14e
              }
Packit 33f14e
            /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
Packit 33f14e
               for x = i-1-j, because
Packit 33f14e
                 needle[i-1] != needle[j] = needle[i-1-x].  */
Packit 33f14e
            if (j == 0)
Packit 33f14e
              {
Packit 33f14e
                /* The inequality holds for all possible x.  */
Packit 33f14e
                table[i] = i;
Packit 33f14e
                break;
Packit 33f14e
              }
Packit 33f14e
            /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
Packit 33f14e
               for i-1-j < x < i-1-j+table[j], because for these x:
Packit 33f14e
                 needle[x..i-2]
Packit 33f14e
                 = needle[x-(i-1-j)..j-1]
Packit 33f14e
                 != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
Packit 33f14e
                    = needle[0..i-2-x],
Packit 33f14e
               hence needle[x..i-1] != needle[0..i-1-x].
Packit 33f14e
               Furthermore
Packit 33f14e
                 needle[i-1-j+table[j]..i-2]
Packit 33f14e
                 = needle[table[j]..j-1]
Packit 33f14e
                 = needle[0..j-1-table[j]]  (by definition of table[j]).  */
Packit 33f14e
            j = j - table[j];
Packit 33f14e
          }
Packit 33f14e
        /* Here: j = i - table[i].  */
Packit 33f14e
      }
Packit 33f14e
  }
Packit 33f14e
Packit 33f14e
  /* Search, using the table to accelerate the processing.  */
Packit 33f14e
  {
Packit 33f14e
    size_t j;
Packit 33f14e
    const UNIT *rhaystack;
Packit 33f14e
    const UNIT *phaystack;
Packit 33f14e
Packit 33f14e
    *resultp = NULL;
Packit 33f14e
    j = 0;
Packit 33f14e
    rhaystack = haystack;
Packit 33f14e
    phaystack = haystack;
Packit 33f14e
    /* Invariant: phaystack = rhaystack + j.  */
Packit 33f14e
    while (*phaystack != 0)
Packit 33f14e
      if (CANON_ELEMENT (needle[j]) == CANON_ELEMENT (*phaystack))
Packit 33f14e
        {
Packit 33f14e
          j++;
Packit 33f14e
          phaystack++;
Packit 33f14e
          if (j == m)
Packit 33f14e
            {
Packit 33f14e
              /* The entire needle has been found.  */
Packit 33f14e
              *resultp = rhaystack;
Packit 33f14e
              break;
Packit 33f14e
            }
Packit 33f14e
        }
Packit 33f14e
      else if (j > 0)
Packit 33f14e
        {
Packit 33f14e
          /* Found a match of needle[0..j-1], mismatch at needle[j].  */
Packit 33f14e
          rhaystack += table[j];
Packit 33f14e
          j -= table[j];
Packit 33f14e
        }
Packit 33f14e
      else
Packit 33f14e
        {
Packit 33f14e
          /* Found a mismatch at needle[0] already.  */
Packit 33f14e
          rhaystack++;
Packit 33f14e
          phaystack++;
Packit 33f14e
        }
Packit 33f14e
  }
Packit 33f14e
Packit 33f14e
  freea (table);
Packit 33f14e
  return true;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#undef CANON_ELEMENT