|
Packit |
709fb3 |
/* kwsearch.c - searching subroutines using kwset for grep.
|
|
Packit |
709fb3 |
Copyright 1992, 1998, 2000, 2007, 2009-2017 Free Software Foundation, Inc.
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
This program is free software; you can redistribute it and/or modify
|
|
Packit |
709fb3 |
it under the terms of the GNU General Public License as published by
|
|
Packit |
709fb3 |
the Free Software Foundation; either version 3, or (at your option)
|
|
Packit |
709fb3 |
any later version.
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
This program is distributed in the hope that it will be useful,
|
|
Packit |
709fb3 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
709fb3 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit |
709fb3 |
GNU General Public License for more details.
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
You should have received a copy of the GNU General Public License
|
|
Packit |
709fb3 |
along with this program; if not, write to the Free Software
|
|
Packit |
709fb3 |
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
|
|
Packit |
709fb3 |
02110-1301, USA. */
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* Written August 1992 by Mike Haertel. */
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
#include <config.h>
|
|
Packit |
709fb3 |
#include "search.h"
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* A compiled -F pattern list. */
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
struct kwsearch
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
/* The kwset for this pattern list. */
|
|
Packit |
709fb3 |
kwset_t kwset;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* The number of user-specified patterns. This is less than
|
|
Packit |
709fb3 |
'kwswords (kwset)' when some extra one-character words have been
|
|
Packit |
709fb3 |
appended, one for each troublesome character that will require a
|
|
Packit |
709fb3 |
DFA search. */
|
|
Packit |
709fb3 |
ptrdiff_t words;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* The user's pattern and its size in bytes. */
|
|
Packit |
709fb3 |
char *pattern;
|
|
Packit |
709fb3 |
size_t size;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* The user's pattern compiled as a regular expression,
|
|
Packit |
709fb3 |
or null if it has not been compiled. */
|
|
Packit |
709fb3 |
void *re;
|
|
Packit |
709fb3 |
};
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* Compile the -F style PATTERN, containing SIZE bytes. Return a
|
|
Packit |
709fb3 |
description of the compiled pattern. */
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
void *
|
|
Packit |
709fb3 |
Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
kwset_t kwset;
|
|
Packit |
709fb3 |
ptrdiff_t total = size;
|
|
Packit |
709fb3 |
char *buf = NULL;
|
|
Packit |
709fb3 |
size_t bufalloc = 0;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
kwset = kwsinit (true);
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
char const *p = pattern;
|
|
Packit |
709fb3 |
do
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
ptrdiff_t len;
|
|
Packit |
709fb3 |
char const *sep = memchr (p, '\n', total);
|
|
Packit |
709fb3 |
if (sep)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
len = sep - p;
|
|
Packit |
709fb3 |
sep++;
|
|
Packit |
709fb3 |
total -= (len + 1);
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
else
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
len = total;
|
|
Packit |
709fb3 |
total = 0;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
if (match_lines)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
if (eolbyte == '\n' && pattern < p && sep)
|
|
Packit |
709fb3 |
p--;
|
|
Packit |
709fb3 |
else
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
if (bufalloc < len + 2)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
free (buf);
|
|
Packit |
709fb3 |
bufalloc = len + 2;
|
|
Packit |
709fb3 |
buf = x2realloc (NULL, &bufalloc);
|
|
Packit |
709fb3 |
buf[0] = eolbyte;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
memcpy (buf + 1, p, len);
|
|
Packit |
709fb3 |
buf[len + 1] = eolbyte;
|
|
Packit |
709fb3 |
p = buf;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
len += 2;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
kwsincr (kwset, p, len);
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
p = sep;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
while (p);
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
free (buf);
|
|
Packit |
709fb3 |
ptrdiff_t words = kwswords (kwset);
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
if (match_icase)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
/* For each pattern character C that has a case folded
|
|
Packit |
709fb3 |
counterpart F that is multibyte and so cannot easily be
|
|
Packit |
709fb3 |
implemented via translating a single byte, append a pattern
|
|
Packit |
709fb3 |
containing just F. That way, if the data contains F, the
|
|
Packit |
709fb3 |
matcher can fall back on DFA. For example, if C is 'i' and
|
|
Packit |
709fb3 |
the locale is en_US.utf8, append a pattern containing just
|
|
Packit |
709fb3 |
the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
|
|
Packit |
709fb3 |
Fexecute will use a DFA if the data contain U+0131. */
|
|
Packit |
709fb3 |
mbstate_t mbs = { 0 };
|
|
Packit |
709fb3 |
char checked[NCHAR] = {0,};
|
|
Packit |
709fb3 |
for (p = pattern; p < pattern + size; p++)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
unsigned char c = *p;
|
|
Packit |
709fb3 |
if (checked[c])
|
|
Packit |
709fb3 |
continue;
|
|
Packit |
709fb3 |
checked[c] = true;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
wint_t wc = localeinfo.sbctowc[c];
|
|
Packit |
709fb3 |
wchar_t folded[CASE_FOLDED_BUFSIZE];
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
char s[MB_LEN_MAX];
|
|
Packit |
709fb3 |
int nbytes = wcrtomb (s, folded[i], &mbs);
|
|
Packit |
709fb3 |
if (1 < nbytes)
|
|
Packit |
709fb3 |
kwsincr (kwset, s, nbytes);
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
kwsprep (kwset);
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
|
|
Packit |
709fb3 |
kwsearch->kwset = kwset;
|
|
Packit |
709fb3 |
kwsearch->words = words;
|
|
Packit |
709fb3 |
kwsearch->pattern = pattern;
|
|
Packit |
709fb3 |
kwsearch->size = size;
|
|
Packit |
709fb3 |
kwsearch->re = NULL;
|
|
Packit |
709fb3 |
return kwsearch;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
|
|
Packit |
709fb3 |
If found, return the offset of the first match and store its
|
|
Packit |
709fb3 |
size into *MATCH_SIZE. If not found, return SIZE_MAX.
|
|
Packit |
709fb3 |
If START_PTR is nonnull, start searching there. */
|
|
Packit |
709fb3 |
size_t
|
|
Packit |
709fb3 |
Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
|
|
Packit |
709fb3 |
char const *start_ptr)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
char const *beg, *end, *mb_start;
|
|
Packit |
709fb3 |
ptrdiff_t len;
|
|
Packit |
709fb3 |
char eol = eolbyte;
|
|
Packit |
709fb3 |
struct kwsmatch kwsmatch;
|
|
Packit |
709fb3 |
size_t ret_val;
|
|
Packit |
709fb3 |
bool mb_check;
|
|
Packit |
709fb3 |
bool longest;
|
|
Packit |
709fb3 |
struct kwsearch *kwsearch = vcp;
|
|
Packit |
709fb3 |
kwset_t kwset = kwsearch->kwset;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
if (match_lines)
|
|
Packit |
709fb3 |
mb_check = longest = false;
|
|
Packit |
709fb3 |
else
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
|
|
Packit |
709fb3 |
longest = mb_check | !!start_ptr | match_words;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
|
|
Packit |
709fb3 |
buf + size - beg + match_lines, &kwsmatch,
|
|
Packit |
709fb3 |
longest);
|
|
Packit |
709fb3 |
if (offset < 0)
|
|
Packit |
709fb3 |
break;
|
|
Packit |
709fb3 |
len = kwsmatch.size[0] - 2 * match_lines;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
if (kwsearch->words <= kwsmatch.index)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
/* The data contain a multibyte character that matches
|
|
Packit |
709fb3 |
some pattern character that is a case folded counterpart.
|
|
Packit |
709fb3 |
Since the kwset code cannot handle this case, fall back
|
|
Packit |
709fb3 |
on the DFA code, which can. */
|
|
Packit |
709fb3 |
if (! kwsearch->re)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
|
|
Packit |
709fb3 |
kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
|
|
Packit |
709fb3 |
RE_SYNTAX_GREP);
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
/* We have matched a single byte that is not at the beginning of a
|
|
Packit |
709fb3 |
multibyte character. mb_goback has advanced MB_START past that
|
|
Packit |
709fb3 |
multibyte character. Now, we want to position BEG so that the
|
|
Packit |
709fb3 |
next kwsexec search starts there. Thus, to compensate for the
|
|
Packit |
709fb3 |
for-loop's BEG++, above, subtract one here. This code is
|
|
Packit |
709fb3 |
unusually hard to reach, and exceptionally, let's show how to
|
|
Packit |
709fb3 |
trigger it here:
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
That assumes the named locale is installed.
|
|
Packit |
709fb3 |
Note that your system's shift-JIS locale may have a different
|
|
Packit |
709fb3 |
name, possibly including "sjis". */
|
|
Packit |
709fb3 |
beg = mb_start - 1;
|
|
Packit |
709fb3 |
continue;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
beg += offset;
|
|
Packit |
709fb3 |
if (!!start_ptr & !match_words)
|
|
Packit |
709fb3 |
goto success_in_beg_and_len;
|
|
Packit |
709fb3 |
if (match_lines)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
len += start_ptr == NULL;
|
|
Packit |
709fb3 |
goto success_in_beg_and_len;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
if (! match_words)
|
|
Packit |
709fb3 |
goto success;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* Succeed if the preceding and following characters are word
|
|
Packit |
709fb3 |
constituents. If the following character is not a word
|
|
Packit |
709fb3 |
constituent, keep trying with shorter matches. */
|
|
Packit |
709fb3 |
char const *bol = memrchr (mb_start, eol, beg - mb_start);
|
|
Packit |
709fb3 |
if (bol)
|
|
Packit |
709fb3 |
mb_start = bol + 1;
|
|
Packit |
709fb3 |
if (! wordchar_prev (mb_start, beg, buf + size))
|
|
Packit |
709fb3 |
for (;;)
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
if (! wordchar_next (beg + len, buf + size))
|
|
Packit |
709fb3 |
{
|
|
Packit |
709fb3 |
if (start_ptr)
|
|
Packit |
709fb3 |
goto success_in_beg_and_len;
|
|
Packit |
709fb3 |
else
|
|
Packit |
709fb3 |
goto success;
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
if (!len)
|
|
Packit |
709fb3 |
break;
|
|
Packit |
709fb3 |
offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
|
|
Packit |
709fb3 |
if (offset != 0)
|
|
Packit |
709fb3 |
break;
|
|
Packit |
709fb3 |
len = kwsmatch.size[0];
|
|
Packit |
709fb3 |
}
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
/* No word match was found at BEG. Skip past word constituents,
|
|
Packit |
709fb3 |
since they cannot precede the next match and not skipping
|
|
Packit |
709fb3 |
them could make things much slower. */
|
|
Packit |
709fb3 |
beg += wordchars_size (beg, buf + size);
|
|
Packit |
709fb3 |
mb_start = beg;
|
|
Packit |
709fb3 |
} /* for (beg in buf) */
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
return -1;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
success:
|
|
Packit |
709fb3 |
end = memchr (beg + len, eol, (buf + size) - (beg + len));
|
|
Packit |
709fb3 |
end = end ? end + 1 : buf + size;
|
|
Packit |
709fb3 |
beg = memrchr (buf, eol, beg - buf);
|
|
Packit |
709fb3 |
beg = beg ? beg + 1 : buf;
|
|
Packit |
709fb3 |
len = end - beg;
|
|
Packit |
709fb3 |
success_in_beg_and_len:;
|
|
Packit |
709fb3 |
size_t off = beg - buf;
|
|
Packit |
709fb3 |
|
|
Packit |
709fb3 |
*match_size = len;
|
|
Packit |
709fb3 |
ret_val = off;
|
|
Packit |
709fb3 |
return ret_val;
|
|
Packit |
709fb3 |
}
|