Blame src/kwsearch.c

Packit 709fb3
/* kwsearch.c - searching subroutines using kwset for grep.
Packit 709fb3
   Copyright 1992, 1998, 2000, 2007, 2009-2017 Free Software Foundation, Inc.
Packit 709fb3
Packit 709fb3
   This program is free software; you can redistribute it and/or modify
Packit 709fb3
   it under the terms of the GNU General Public License as published by
Packit 709fb3
   the Free Software Foundation; either version 3, or (at your option)
Packit 709fb3
   any later version.
Packit 709fb3
Packit 709fb3
   This program is distributed in the hope that it will be useful,
Packit 709fb3
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 709fb3
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 709fb3
   GNU General Public License for more details.
Packit 709fb3
Packit 709fb3
   You should have received a copy of the GNU General Public License
Packit 709fb3
   along with this program; if not, write to the Free Software
Packit 709fb3
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
Packit 709fb3
   02110-1301, USA.  */
Packit 709fb3
Packit 709fb3
/* Written August 1992 by Mike Haertel. */
Packit 709fb3
Packit 709fb3
#include <config.h>
Packit 709fb3
#include "search.h"
Packit 709fb3
Packit 709fb3
/* A compiled -F pattern list.  */
Packit 709fb3
Packit 709fb3
struct kwsearch
Packit 709fb3
{
Packit 709fb3
  /* The kwset for this pattern list.  */
Packit 709fb3
  kwset_t kwset;
Packit 709fb3
Packit 709fb3
  /* The number of user-specified patterns.  This is less than
Packit 709fb3
     'kwswords (kwset)' when some extra one-character words have been
Packit 709fb3
     appended, one for each troublesome character that will require a
Packit 709fb3
     DFA search.  */
Packit 709fb3
  ptrdiff_t words;
Packit 709fb3
Packit 709fb3
  /* The user's pattern and its size in bytes.  */
Packit 709fb3
  char *pattern;
Packit 709fb3
  size_t size;
Packit 709fb3
Packit 709fb3
  /* The user's pattern compiled as a regular expression,
Packit 709fb3
     or null if it has not been compiled.  */
Packit 709fb3
  void *re;
Packit 709fb3
};
Packit 709fb3
Packit 709fb3
/* Compile the -F style PATTERN, containing SIZE bytes.  Return a
Packit 709fb3
   description of the compiled pattern.  */
Packit 709fb3
Packit 709fb3
void *
Packit 709fb3
Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
Packit 709fb3
{
Packit 709fb3
  kwset_t kwset;
Packit 709fb3
  ptrdiff_t total = size;
Packit 709fb3
  char *buf = NULL;
Packit 709fb3
  size_t bufalloc = 0;
Packit 709fb3
Packit 709fb3
  kwset = kwsinit (true);
Packit 709fb3
Packit 709fb3
  char const *p = pattern;
Packit 709fb3
  do
Packit 709fb3
    {
Packit 709fb3
      ptrdiff_t len;
Packit 709fb3
      char const *sep = memchr (p, '\n', total);
Packit 709fb3
      if (sep)
Packit 709fb3
        {
Packit 709fb3
          len = sep - p;
Packit 709fb3
          sep++;
Packit 709fb3
          total -= (len + 1);
Packit 709fb3
        }
Packit 709fb3
      else
Packit 709fb3
        {
Packit 709fb3
          len = total;
Packit 709fb3
          total = 0;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      if (match_lines)
Packit 709fb3
        {
Packit 709fb3
          if (eolbyte == '\n' && pattern < p && sep)
Packit 709fb3
            p--;
Packit 709fb3
          else
Packit 709fb3
            {
Packit 709fb3
              if (bufalloc < len + 2)
Packit 709fb3
                {
Packit 709fb3
                  free (buf);
Packit 709fb3
                  bufalloc = len + 2;
Packit 709fb3
                  buf = x2realloc (NULL, &bufalloc);
Packit 709fb3
                  buf[0] = eolbyte;
Packit 709fb3
                }
Packit 709fb3
              memcpy (buf + 1, p, len);
Packit 709fb3
              buf[len + 1] = eolbyte;
Packit 709fb3
              p = buf;
Packit 709fb3
            }
Packit 709fb3
          len += 2;
Packit 709fb3
        }
Packit 709fb3
      kwsincr (kwset, p, len);
Packit 709fb3
Packit 709fb3
      p = sep;
Packit 709fb3
    }
Packit 709fb3
  while (p);
Packit 709fb3
Packit 709fb3
  free (buf);
Packit 709fb3
  ptrdiff_t words = kwswords (kwset);
Packit 709fb3
Packit 709fb3
  if (match_icase)
Packit 709fb3
    {
Packit 709fb3
      /* For each pattern character C that has a case folded
Packit 709fb3
         counterpart F that is multibyte and so cannot easily be
Packit 709fb3
         implemented via translating a single byte, append a pattern
Packit 709fb3
         containing just F.  That way, if the data contains F, the
Packit 709fb3
         matcher can fall back on DFA.  For example, if C is 'i' and
Packit 709fb3
         the locale is en_US.utf8, append a pattern containing just
Packit 709fb3
         the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
Packit 709fb3
         Fexecute will use a DFA if the data contain U+0131.  */
Packit 709fb3
      mbstate_t mbs = { 0 };
Packit 709fb3
      char checked[NCHAR] = {0,};
Packit 709fb3
      for (p = pattern; p < pattern + size; p++)
Packit 709fb3
        {
Packit 709fb3
          unsigned char c = *p;
Packit 709fb3
          if (checked[c])
Packit 709fb3
            continue;
Packit 709fb3
          checked[c] = true;
Packit 709fb3
Packit 709fb3
          wint_t wc = localeinfo.sbctowc[c];
Packit 709fb3
          wchar_t folded[CASE_FOLDED_BUFSIZE];
Packit 709fb3
Packit 709fb3
          for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
Packit 709fb3
            {
Packit 709fb3
              char s[MB_LEN_MAX];
Packit 709fb3
              int nbytes = wcrtomb (s, folded[i], &mbs);
Packit 709fb3
              if (1 < nbytes)
Packit 709fb3
                kwsincr (kwset, s, nbytes);
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  kwsprep (kwset);
Packit 709fb3
Packit 709fb3
  struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
Packit 709fb3
  kwsearch->kwset = kwset;
Packit 709fb3
  kwsearch->words = words;
Packit 709fb3
  kwsearch->pattern = pattern;
Packit 709fb3
  kwsearch->size = size;
Packit 709fb3
  kwsearch->re = NULL;
Packit 709fb3
  return kwsearch;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
Packit 709fb3
   If found, return the offset of the first match and store its
Packit 709fb3
   size into *MATCH_SIZE.  If not found, return SIZE_MAX.
Packit 709fb3
   If START_PTR is nonnull, start searching there.  */
Packit 709fb3
size_t
Packit 709fb3
Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
Packit 709fb3
          char const *start_ptr)
Packit 709fb3
{
Packit 709fb3
  char const *beg, *end, *mb_start;
Packit 709fb3
  ptrdiff_t len;
Packit 709fb3
  char eol = eolbyte;
Packit 709fb3
  struct kwsmatch kwsmatch;
Packit 709fb3
  size_t ret_val;
Packit 709fb3
  bool mb_check;
Packit 709fb3
  bool longest;
Packit 709fb3
  struct kwsearch *kwsearch = vcp;
Packit 709fb3
  kwset_t kwset = kwsearch->kwset;
Packit 709fb3
Packit 709fb3
  if (match_lines)
Packit 709fb3
    mb_check = longest = false;
Packit 709fb3
  else
Packit 709fb3
    {
Packit 709fb3
      mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
Packit 709fb3
      longest = mb_check | !!start_ptr | match_words;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
Packit 709fb3
    {
Packit 709fb3
      ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
Packit 709fb3
                                  buf + size - beg + match_lines, &kwsmatch,
Packit 709fb3
                                  longest);
Packit 709fb3
      if (offset < 0)
Packit 709fb3
        break;
Packit 709fb3
      len = kwsmatch.size[0] - 2 * match_lines;
Packit 709fb3
Packit 709fb3
      if (kwsearch->words <= kwsmatch.index)
Packit 709fb3
        {
Packit 709fb3
          /* The data contain a multibyte character that matches
Packit 709fb3
             some pattern character that is a case folded counterpart.
Packit 709fb3
             Since the kwset code cannot handle this case, fall back
Packit 709fb3
             on the DFA code, which can.  */
Packit 709fb3
          if (! kwsearch->re)
Packit 709fb3
            {
Packit 709fb3
              fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
Packit 709fb3
              kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
Packit 709fb3
                                         RE_SYNTAX_GREP);
Packit 709fb3
            }
Packit 709fb3
          return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
Packit 709fb3
        {
Packit 709fb3
          /* We have matched a single byte that is not at the beginning of a
Packit 709fb3
             multibyte character.  mb_goback has advanced MB_START past that
Packit 709fb3
             multibyte character.  Now, we want to position BEG so that the
Packit 709fb3
             next kwsexec search starts there.  Thus, to compensate for the
Packit 709fb3
             for-loop's BEG++, above, subtract one here.  This code is
Packit 709fb3
             unusually hard to reach, and exceptionally, let's show how to
Packit 709fb3
             trigger it here:
Packit 709fb3
Packit 709fb3
               printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
Packit 709fb3
Packit 709fb3
             That assumes the named locale is installed.
Packit 709fb3
             Note that your system's shift-JIS locale may have a different
Packit 709fb3
             name, possibly including "sjis".  */
Packit 709fb3
          beg = mb_start - 1;
Packit 709fb3
          continue;
Packit 709fb3
        }
Packit 709fb3
      beg += offset;
Packit 709fb3
      if (!!start_ptr & !match_words)
Packit 709fb3
        goto success_in_beg_and_len;
Packit 709fb3
      if (match_lines)
Packit 709fb3
        {
Packit 709fb3
          len += start_ptr == NULL;
Packit 709fb3
          goto success_in_beg_and_len;
Packit 709fb3
        }
Packit 709fb3
      if (! match_words)
Packit 709fb3
        goto success;
Packit 709fb3
Packit 709fb3
      /* Succeed if the preceding and following characters are word
Packit 709fb3
         constituents.  If the following character is not a word
Packit 709fb3
         constituent, keep trying with shorter matches.  */
Packit 709fb3
      char const *bol = memrchr (mb_start, eol, beg - mb_start);
Packit 709fb3
      if (bol)
Packit 709fb3
        mb_start = bol + 1;
Packit 709fb3
      if (! wordchar_prev (mb_start, beg, buf + size))
Packit 709fb3
        for (;;)
Packit 709fb3
          {
Packit 709fb3
            if (! wordchar_next (beg + len, buf + size))
Packit 709fb3
              {
Packit 709fb3
                if (start_ptr)
Packit 709fb3
                  goto success_in_beg_and_len;
Packit 709fb3
                else
Packit 709fb3
                  goto success;
Packit 709fb3
              }
Packit 709fb3
            if (!len)
Packit 709fb3
              break;
Packit 709fb3
            offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
Packit 709fb3
            if (offset != 0)
Packit 709fb3
              break;
Packit 709fb3
            len = kwsmatch.size[0];
Packit 709fb3
          }
Packit 709fb3
Packit 709fb3
      /* No word match was found at BEG.  Skip past word constituents,
Packit 709fb3
         since they cannot precede the next match and not skipping
Packit 709fb3
         them could make things much slower.  */
Packit 709fb3
      beg += wordchars_size (beg, buf + size);
Packit 709fb3
      mb_start = beg;
Packit 709fb3
    } /* for (beg in buf) */
Packit 709fb3
Packit 709fb3
  return -1;
Packit 709fb3
Packit 709fb3
 success:
Packit 709fb3
  end = memchr (beg + len, eol, (buf + size) - (beg + len));
Packit 709fb3
  end = end ? end + 1 : buf + size;
Packit 709fb3
  beg = memrchr (buf, eol, beg - buf);
Packit 709fb3
  beg = beg ? beg + 1 : buf;
Packit 709fb3
  len = end - beg;
Packit 709fb3
 success_in_beg_and_len:;
Packit 709fb3
  size_t off = beg - buf;
Packit 709fb3
Packit 709fb3
  *match_size = len;
Packit 709fb3
  ret_val = off;
Packit 709fb3
  return ret_val;
Packit 709fb3
}