Blame src/dfasearch.c

Packit 709fb3
/* dfasearch.c - searching subroutines using dfa and regex for grep.
Packit 709fb3
   Copyright 1992, 1998, 2000, 2007, 2009-2017 Free Software Foundation, Inc.
Packit 709fb3
Packit 709fb3
   This program is free software; you can redistribute it and/or modify
Packit 709fb3
   it under the terms of the GNU General Public License as published by
Packit 709fb3
   the Free Software Foundation; either version 3, or (at your option)
Packit 709fb3
   any later version.
Packit 709fb3
Packit 709fb3
   This program is distributed in the hope that it will be useful,
Packit 709fb3
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 709fb3
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 709fb3
   GNU General Public License for more details.
Packit 709fb3
Packit 709fb3
   You should have received a copy of the GNU General Public License
Packit 709fb3
   along with this program; if not, write to the Free Software
Packit 709fb3
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
Packit 709fb3
   02110-1301, USA.  */
Packit 709fb3
Packit 709fb3
/* Written August 1992 by Mike Haertel. */
Packit 709fb3
Packit 709fb3
#include <config.h>
Packit 709fb3
#include "intprops.h"
Packit 709fb3
#include "search.h"
Packit 709fb3
#include "die.h"
Packit 709fb3
#include <error.h>
Packit 709fb3
Packit 709fb3
struct dfa_comp
Packit 709fb3
{
Packit 709fb3
  /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
Packit 709fb3
     a list of strings, at least one of which is known to occur in
Packit 709fb3
     any string matching the regexp. */
Packit 709fb3
  kwset_t kwset;
Packit 709fb3
Packit 709fb3
  /* DFA compiled regexp. */
Packit 709fb3
  struct dfa *dfa;
Packit 709fb3
Packit 709fb3
  /* Regex compiled regexps. */
Packit 709fb3
  struct re_pattern_buffer* patterns;
Packit 709fb3
  size_t pcount;
Packit 709fb3
  struct re_registers regs;
Packit 709fb3
Packit 709fb3
  /* Number of compiled fixed strings known to exactly match the regexp.
Packit 709fb3
     If kwsexec returns < kwset_exact_matches, then we don't need to
Packit 709fb3
     call the regexp matcher at all. */
Packit 709fb3
  ptrdiff_t kwset_exact_matches;
Packit 709fb3
Packit 709fb3
  bool begline;
Packit 709fb3
};
Packit 709fb3
Packit 709fb3
void
Packit 709fb3
dfaerror (char const *mesg)
Packit 709fb3
{
Packit 709fb3
  die (EXIT_TROUBLE, 0, "%s", mesg);
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* For now, the sole dfawarn-eliciting condition (use of a regexp
Packit 709fb3
   like '[:lower:]') is unequivocally an error, so treat it as such,
Packit 709fb3
   when possible.  */
Packit 709fb3
void
Packit 709fb3
dfawarn (char const *mesg)
Packit 709fb3
{
Packit 709fb3
  if (!getenv ("POSIXLY_CORRECT"))
Packit 709fb3
    dfaerror (mesg);
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* If the DFA turns out to have some set of fixed strings one of
Packit 709fb3
   which must occur in the match, then we build a kwset matcher
Packit 709fb3
   to find those strings, and thus quickly filter out impossible
Packit 709fb3
   matches. */
Packit 709fb3
static void
Packit 709fb3
kwsmusts (struct dfa_comp *dc)
Packit 709fb3
{
Packit 709fb3
  struct dfamust *dm = dfamust (dc->dfa);
Packit 709fb3
  if (!dm)
Packit 709fb3
    return;
Packit 709fb3
  dc->kwset = kwsinit (false);
Packit 709fb3
  if (dm->exact)
Packit 709fb3
    {
Packit 709fb3
      /* Prepare a substring whose presence implies a match.
Packit 709fb3
         The kwset matcher will return the index of the matching
Packit 709fb3
         string that it chooses. */
Packit 709fb3
      ++dc->kwset_exact_matches;
Packit 709fb3
      ptrdiff_t old_len = strlen (dm->must);
Packit 709fb3
      ptrdiff_t new_len = old_len + dm->begline + dm->endline;
Packit 709fb3
      char *must = xmalloc (new_len);
Packit 709fb3
      char *mp = must;
Packit 709fb3
      *mp = eolbyte;
Packit 709fb3
      mp += dm->begline;
Packit 709fb3
      dc->begline |= dm->begline;
Packit 709fb3
      memcpy (mp, dm->must, old_len);
Packit 709fb3
      if (dm->endline)
Packit 709fb3
        mp[old_len] = eolbyte;
Packit 709fb3
      kwsincr (dc->kwset, must, new_len);
Packit 709fb3
      free (must);
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    {
Packit 709fb3
      /* Otherwise, filtering with this substring should help reduce the
Packit 709fb3
         search space, but we'll still have to use the regexp matcher.  */
Packit 709fb3
      kwsincr (dc->kwset, dm->must, strlen (dm->must));
Packit 709fb3
    }
Packit 709fb3
  kwsprep (dc->kwset);
Packit 709fb3
  dfamustfree (dm);
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
void *
Packit 709fb3
GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
Packit 709fb3
{
Packit 709fb3
  char *motif;
Packit 709fb3
  struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
Packit 709fb3
Packit 709fb3
  dc->dfa = dfaalloc ();
Packit 709fb3
Packit 709fb3
  if (match_icase)
Packit 709fb3
    syntax_bits |= RE_ICASE;
Packit 709fb3
  re_set_syntax (syntax_bits);
Packit 709fb3
  int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
Packit 709fb3
  dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
Packit 709fb3
Packit 709fb3
  /* For GNU regex, pass the patterns separately to detect errors like
Packit 709fb3
     "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
Packit 709fb3
     this should be a syntax error.  The same for backref, where the
Packit 709fb3
     backref should be local to each pattern.  */
Packit 709fb3
  char const *p = pattern;
Packit 709fb3
  char const *patlim = pattern + size;
Packit 709fb3
  bool compilation_failed = false;
Packit 709fb3
  size_t palloc = 0;
Packit 709fb3
Packit 709fb3
  do
Packit 709fb3
    {
Packit 709fb3
      size_t len;
Packit 709fb3
      char const *sep = memchr (p, '\n', patlim - p);
Packit 709fb3
      if (sep)
Packit 709fb3
        {
Packit 709fb3
          len = sep - p;
Packit 709fb3
          sep++;
Packit 709fb3
        }
Packit 709fb3
      else
Packit 709fb3
        len = patlim - p;
Packit 709fb3
Packit 709fb3
      if (palloc <= dc->pcount)
Packit 709fb3
        dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
Packit 709fb3
      struct re_pattern_buffer *pat = &dc->patterns[dc->pcount];
Packit 709fb3
      pat->buffer = NULL;
Packit 709fb3
      pat->allocated = 0;
Packit 709fb3
Packit 709fb3
      /* Do not use a fastmap with -i, to work around glibc Bug#20381.  */
Packit 709fb3
      pat->fastmap = match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
Packit 709fb3
Packit 709fb3
      pat->translate = NULL;
Packit 709fb3
Packit 709fb3
      char const *err = re_compile_pattern (p, len, pat);
Packit 709fb3
      if (err)
Packit 709fb3
        {
Packit 709fb3
          /* With patterns specified only on the command line, emit the bare
Packit 709fb3
             diagnostic.  Otherwise, include a filename:lineno: prefix.  */
Packit 709fb3
          size_t lineno;
Packit 709fb3
          char const *pat_filename = pattern_file_name (dc->pcount + 1,
Packit 709fb3
                                                        &lineno);
Packit 709fb3
          if (*pat_filename == '\0')
Packit 709fb3
            error (0, 0, "%s", err);
Packit 709fb3
          else
Packit 709fb3
            error (0, 0, "%s:%zu: %s", pat_filename, lineno, err);
Packit 709fb3
          compilation_failed = true;
Packit 709fb3
        }
Packit 709fb3
      dc->pcount++;
Packit 709fb3
      p = sep;
Packit 709fb3
    }
Packit 709fb3
  while (p);
Packit 709fb3
Packit 709fb3
  if (compilation_failed)
Packit 709fb3
    exit (EXIT_TROUBLE);
Packit 709fb3
Packit 709fb3
  /* In the match_words and match_lines cases, we use a different pattern
Packit 709fb3
     for the DFA matcher that will quickly throw out cases that won't work.
Packit 709fb3
     Then if DFA succeeds we do some hairy stuff using the regex matcher
Packit 709fb3
     to decide whether the match should really count. */
Packit 709fb3
  if (match_words || match_lines)
Packit 709fb3
    {
Packit 709fb3
      static char const line_beg_no_bk[] = "^(";
Packit 709fb3
      static char const line_end_no_bk[] = ")$";
Packit 709fb3
      static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
Packit 709fb3
      static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
Packit 709fb3
      static char const line_beg_bk[] = "^\\(";
Packit 709fb3
      static char const line_end_bk[] = "\\)$";
Packit 709fb3
      static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
Packit 709fb3
      static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
Packit 709fb3
      int bk = !(syntax_bits & RE_NO_BK_PARENS);
Packit 709fb3
      char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
Packit 709fb3
Packit 709fb3
      strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
Packit 709fb3
                             : (bk ? word_beg_bk : word_beg_no_bk));
Packit 709fb3
      size_t total = strlen (n);
Packit 709fb3
      memcpy (n + total, pattern, size);
Packit 709fb3
      total += size;
Packit 709fb3
      strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
Packit 709fb3
                                     : (bk ? word_end_bk : word_end_no_bk));
Packit 709fb3
      total += strlen (n + total);
Packit 709fb3
      pattern = motif = n;
Packit 709fb3
      size = total;
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    motif = NULL;
Packit 709fb3
Packit 709fb3
  dfacomp (pattern, size, dc->dfa, 1);
Packit 709fb3
  kwsmusts (dc);
Packit 709fb3
Packit 709fb3
  free (motif);
Packit 709fb3
Packit 709fb3
  return dc;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
size_t
Packit 709fb3
EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
Packit 709fb3
           char const *start_ptr)
Packit 709fb3
{
Packit 709fb3
  char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
Packit 709fb3
  char eol = eolbyte;
Packit 709fb3
  regoff_t start;
Packit 709fb3
  size_t len, best_len;
Packit 709fb3
  struct kwsmatch kwsm;
Packit 709fb3
  size_t i;
Packit 709fb3
  struct dfa_comp *dc = vdc;
Packit 709fb3
  struct dfa *superset = dfasuperset (dc->dfa);
Packit 709fb3
  bool dfafast = dfaisfast (dc->dfa);
Packit 709fb3
Packit 709fb3
  mb_start = buf;
Packit 709fb3
  buflim = buf + size;
Packit 709fb3
Packit 709fb3
  for (beg = end = buf; end < buflim; beg = end)
Packit 709fb3
    {
Packit 709fb3
      end = buflim;
Packit 709fb3
Packit 709fb3
      if (!start_ptr)
Packit 709fb3
        {
Packit 709fb3
          char const *next_beg, *dfa_beg = beg;
Packit 709fb3
          size_t count = 0;
Packit 709fb3
          bool exact_kwset_match = false;
Packit 709fb3
          bool backref = false;
Packit 709fb3
Packit 709fb3
          /* Try matching with KWset, if it's defined.  */
Packit 709fb3
          if (dc->kwset)
Packit 709fb3
            {
Packit 709fb3
              char const *prev_beg;
Packit 709fb3
Packit 709fb3
              /* Find a possible match using the KWset matcher.  */
Packit 709fb3
              ptrdiff_t offset = kwsexec (dc->kwset, beg - dc->begline,
Packit 709fb3
                                          buflim - beg + dc->begline,
Packit 709fb3
                                          &kwsm, true);
Packit 709fb3
              if (offset < 0)
Packit 709fb3
                goto failure;
Packit 709fb3
              match = beg + offset;
Packit 709fb3
              prev_beg = beg;
Packit 709fb3
Packit 709fb3
              /* Narrow down to the line containing the possible match.  */
Packit 709fb3
              beg = memrchr (buf, eol, match - buf);
Packit 709fb3
              beg = beg ? beg + 1 : buf;
Packit 709fb3
              dfa_beg = beg;
Packit 709fb3
Packit 709fb3
              /* Determine the end pointer to give the DFA next.  Typically
Packit 709fb3
                 this is after the first newline after MATCH; but if the KWset
Packit 709fb3
                 match is not exact, the DFA is fast, and the offset from
Packit 709fb3
                 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
Packit 709fb3
                 greater of the latter two values; this temporarily prefers
Packit 709fb3
                 the DFA to KWset.  */
Packit 709fb3
              exact_kwset_match = kwsm.index < dc->kwset_exact_matches;
Packit 709fb3
              end = ((exact_kwset_match || !dfafast
Packit 709fb3
                      || MAX (16, match - beg) < (match - prev_beg) >> 2)
Packit 709fb3
                     ? match
Packit 709fb3
                     : MAX (16, match - beg) < (buflim - prev_beg) >> 2
Packit 709fb3
                     ? prev_beg + 4 * MAX (16, match - beg)
Packit 709fb3
                     : buflim);
Packit 709fb3
              end = memchr (end, eol, buflim - end);
Packit 709fb3
              end = end ? end + 1 : buflim;
Packit 709fb3
Packit 709fb3
              if (exact_kwset_match)
Packit 709fb3
                {
Packit 709fb3
                  if (!localeinfo.multibyte | localeinfo.using_utf8)
Packit 709fb3
                    goto success;
Packit 709fb3
                  if (mb_start < beg)
Packit 709fb3
                    mb_start = beg;
Packit 709fb3
                  if (mb_goback (&mb_start, match, buflim) == 0)
Packit 709fb3
                    goto success;
Packit 709fb3
                  /* The matched line starts in the middle of a multibyte
Packit 709fb3
                     character.  Perform the DFA search starting from the
Packit 709fb3
                     beginning of the next character.  */
Packit 709fb3
                  dfa_beg = mb_start;
Packit 709fb3
                }
Packit 709fb3
            }
Packit 709fb3
Packit 709fb3
          /* Try matching with the superset of DFA, if it's defined.  */
Packit 709fb3
          if (superset && !exact_kwset_match)
Packit 709fb3
            {
Packit 709fb3
              /* Keep using the superset while it reports multiline
Packit 709fb3
                 potential matches; this is more likely to be fast
Packit 709fb3
                 than falling back to KWset would be.  */
Packit 709fb3
              next_beg = dfaexec (superset, dfa_beg, (char *) end, 0,
Packit 709fb3
                                  &count, NULL);
Packit 709fb3
              if (next_beg == NULL || next_beg == end)
Packit 709fb3
                continue;
Packit 709fb3
Packit 709fb3
              /* Narrow down to the line we've found.  */
Packit 709fb3
              if (count != 0)
Packit 709fb3
                {
Packit 709fb3
                  beg = memrchr (buf, eol, next_beg - buf);
Packit 709fb3
                  beg++;
Packit 709fb3
                  dfa_beg = beg;
Packit 709fb3
                }
Packit 709fb3
              end = memchr (next_beg, eol, buflim - next_beg);
Packit 709fb3
              end = end ? end + 1 : buflim;
Packit 709fb3
Packit 709fb3
              count = 0;
Packit 709fb3
            }
Packit 709fb3
Packit 709fb3
          /* Try matching with DFA.  */
Packit 709fb3
          next_beg = dfaexec (dc->dfa, dfa_beg, (char *) end, 0, &count,
Packit 709fb3
                              &backref);
Packit 709fb3
Packit 709fb3
          /* If there's no match, or if we've matched the sentinel,
Packit 709fb3
             we're done.  */
Packit 709fb3
          if (next_beg == NULL || next_beg == end)
Packit 709fb3
            continue;
Packit 709fb3
Packit 709fb3
          /* Narrow down to the line we've found.  */
Packit 709fb3
          if (count != 0)
Packit 709fb3
            {
Packit 709fb3
              beg = memrchr (buf, eol, next_beg - buf);
Packit 709fb3
              beg++;
Packit 709fb3
            }
Packit 709fb3
          end = memchr (next_beg, eol, buflim - next_beg);
Packit 709fb3
          end = end ? end + 1 : buflim;
Packit 709fb3
Packit 709fb3
          /* Successful, no backreferences encountered! */
Packit 709fb3
          if (!backref)
Packit 709fb3
            goto success;
Packit 709fb3
          ptr = beg;
Packit 709fb3
        }
Packit 709fb3
      else
Packit 709fb3
        {
Packit 709fb3
          /* We are looking for the leftmost (then longest) exact match.
Packit 709fb3
             We will go through the outer loop only once.  */
Packit 709fb3
          ptr = start_ptr;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      /* If the "line" is longer than the maximum regexp offset,
Packit 709fb3
         die as if we've run out of memory.  */
Packit 709fb3
      if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
Packit 709fb3
        xalloc_die ();
Packit 709fb3
Packit 709fb3
      /* Run the possible match through Regex.  */
Packit 709fb3
      best_match = end;
Packit 709fb3
      best_len = 0;
Packit 709fb3
      for (i = 0; i < dc->pcount; i++)
Packit 709fb3
        {
Packit 709fb3
          dc->patterns[i].not_eol = 0;
Packit 709fb3
          dc->patterns[i].newline_anchor = eolbyte == '\n';
Packit 709fb3
          start = re_search (&dc->patterns[i], beg, end - beg - 1,
Packit 709fb3
                             ptr - beg, end - ptr - 1, &dc->regs);
Packit 709fb3
          if (start < -1)
Packit 709fb3
            xalloc_die ();
Packit 709fb3
          else if (0 <= start)
Packit 709fb3
            {
Packit 709fb3
              len = dc->regs.end[0] - start;
Packit 709fb3
              match = beg + start;
Packit 709fb3
              if (match > best_match)
Packit 709fb3
                continue;
Packit 709fb3
              if (start_ptr && !match_words)
Packit 709fb3
                goto assess_pattern_match;
Packit 709fb3
              if ((!match_lines && !match_words)
Packit 709fb3
                  || (match_lines && len == end - ptr - 1))
Packit 709fb3
                {
Packit 709fb3
                  match = ptr;
Packit 709fb3
                  len = end - ptr;
Packit 709fb3
                  goto assess_pattern_match;
Packit 709fb3
                }
Packit 709fb3
              /* If -w and not -x, check whether the match aligns with
Packit 709fb3
                 word boundaries.  Do this iteratively because:
Packit 709fb3
                 (a) the line may contain more than one occurrence of the
Packit 709fb3
                 pattern, and
Packit 709fb3
                 (b) Several alternatives in the pattern might be valid at a
Packit 709fb3
                 given point, and we may need to consider a shorter one to
Packit 709fb3
                 find a word boundary.  */
Packit 709fb3
              if (!match_lines && match_words)
Packit 709fb3
                while (match <= best_match)
Packit 709fb3
                  {
Packit 709fb3
                    regoff_t shorter_len = 0;
Packit 709fb3
                    if (! wordchar_next (match + len, end - 1)
Packit 709fb3
                        && ! wordchar_prev (beg, match, end - 1))
Packit 709fb3
                      goto assess_pattern_match;
Packit 709fb3
                    if (len > 0)
Packit 709fb3
                      {
Packit 709fb3
                        /* Try a shorter length anchored at the same place. */
Packit 709fb3
                        --len;
Packit 709fb3
                        dc->patterns[i].not_eol = 1;
Packit 709fb3
                        shorter_len = re_match (&dc->patterns[i], beg,
Packit 709fb3
                                                match + len - ptr, match - beg,
Packit 709fb3
                                                &dc->regs);
Packit 709fb3
                        if (shorter_len < -1)
Packit 709fb3
                          xalloc_die ();
Packit 709fb3
                      }
Packit 709fb3
                    if (0 < shorter_len)
Packit 709fb3
                      len = shorter_len;
Packit 709fb3
                    else
Packit 709fb3
                      {
Packit 709fb3
                        /* Try looking further on. */
Packit 709fb3
                        if (match == end - 1)
Packit 709fb3
                          break;
Packit 709fb3
                        match++;
Packit 709fb3
                        dc->patterns[i].not_eol = 0;
Packit 709fb3
                        start = re_search (&dc->patterns[i], beg, end - beg - 1,
Packit 709fb3
                                           match - beg, end - match - 1,
Packit 709fb3
                                           &dc->regs);
Packit 709fb3
                        if (start < 0)
Packit 709fb3
                          {
Packit 709fb3
                            if (start < -1)
Packit 709fb3
                              xalloc_die ();
Packit 709fb3
                            break;
Packit 709fb3
                          }
Packit 709fb3
                        len = dc->regs.end[0] - start;
Packit 709fb3
                        match = beg + start;
Packit 709fb3
                      }
Packit 709fb3
                  } /* while (match <= best_match) */
Packit 709fb3
              continue;
Packit 709fb3
            assess_pattern_match:
Packit 709fb3
              if (!start_ptr)
Packit 709fb3
                {
Packit 709fb3
                  /* Good enough for a non-exact match.
Packit 709fb3
                     No need to look at further patterns, if any.  */
Packit 709fb3
                  goto success;
Packit 709fb3
                }
Packit 709fb3
              if (match < best_match || (match == best_match && len > best_len))
Packit 709fb3
                {
Packit 709fb3
                  /* Best exact match:  leftmost, then longest.  */
Packit 709fb3
                  best_match = match;
Packit 709fb3
                  best_len = len;
Packit 709fb3
                }
Packit 709fb3
            } /* if re_search >= 0 */
Packit 709fb3
        } /* for Regex patterns.  */
Packit 709fb3
        if (best_match < end)
Packit 709fb3
          {
Packit 709fb3
            /* We have found an exact match.  We were just
Packit 709fb3
               waiting for the best one (leftmost then longest).  */
Packit 709fb3
            beg = best_match;
Packit 709fb3
            len = best_len;
Packit 709fb3
            goto success_in_len;
Packit 709fb3
          }
Packit 709fb3
    } /* for (beg = end ..) */
Packit 709fb3
Packit 709fb3
 failure:
Packit 709fb3
  return -1;
Packit 709fb3
Packit 709fb3
 success:
Packit 709fb3
  len = end - beg;
Packit 709fb3
 success_in_len:;
Packit 709fb3
  size_t off = beg - buf;
Packit 709fb3
  *match_size = len;
Packit 709fb3
  return off;
Packit 709fb3
}