/* kwsearch.c - searching subroutines using kwset for grep. Copyright 1992, 1998, 2000, 2007, 2009-2017 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ /* Written August 1992 by Mike Haertel. */ #include #include "search.h" /* A compiled -F pattern list. */ struct kwsearch { /* The kwset for this pattern list. */ kwset_t kwset; /* The number of user-specified patterns. This is less than 'kwswords (kwset)' when some extra one-character words have been appended, one for each troublesome character that will require a DFA search. */ ptrdiff_t words; /* The user's pattern and its size in bytes. */ char *pattern; size_t size; /* The user's pattern compiled as a regular expression, or null if it has not been compiled. */ void *re; }; /* Compile the -F style PATTERN, containing SIZE bytes. Return a description of the compiled pattern. */ void * Fcompile (char *pattern, size_t size, reg_syntax_t ignored) { kwset_t kwset; ptrdiff_t total = size; char *buf = NULL; size_t bufalloc = 0; kwset = kwsinit (true); char const *p = pattern; do { ptrdiff_t len; char const *sep = memchr (p, '\n', total); if (sep) { len = sep - p; sep++; total -= (len + 1); } else { len = total; total = 0; } if (match_lines) { if (eolbyte == '\n' && pattern < p && sep) p--; else { if (bufalloc < len + 2) { free (buf); bufalloc = len + 2; buf = x2realloc (NULL, &bufalloc); buf[0] = eolbyte; } memcpy (buf + 1, p, len); buf[len + 1] = eolbyte; p = buf; } len += 2; } kwsincr (kwset, p, len); p = sep; } while (p); free (buf); ptrdiff_t words = kwswords (kwset); if (match_icase) { /* For each pattern character C that has a case folded counterpart F that is multibyte and so cannot easily be implemented via translating a single byte, append a pattern containing just F. That way, if the data contains F, the matcher can fall back on DFA. For example, if C is 'i' and the locale is en_US.utf8, append a pattern containing just the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that Fexecute will use a DFA if the data contain U+0131. */ mbstate_t mbs = { 0 }; char checked[NCHAR] = {0,}; for (p = pattern; p < pattern + size; p++) { unsigned char c = *p; if (checked[c]) continue; checked[c] = true; wint_t wc = localeinfo.sbctowc[c]; wchar_t folded[CASE_FOLDED_BUFSIZE]; for (int i = case_folded_counterparts (wc, folded); 0 <= --i; ) { char s[MB_LEN_MAX]; int nbytes = wcrtomb (s, folded[i], &mbs); if (1 < nbytes) kwsincr (kwset, s, nbytes); } } } kwsprep (kwset); struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch); kwsearch->kwset = kwset; kwsearch->words = words; kwsearch->pattern = pattern; kwsearch->size = size; kwsearch->re = NULL; return kwsearch; } /* Use the compiled pattern VCP to search the buffer BUF of size SIZE. If found, return the offset of the first match and store its size into *MATCH_SIZE. If not found, return SIZE_MAX. If START_PTR is nonnull, start searching there. */ size_t Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *beg, *end, *mb_start; ptrdiff_t len; char eol = eolbyte; struct kwsmatch kwsmatch; size_t ret_val; bool mb_check; bool longest; struct kwsearch *kwsearch = vcp; kwset_t kwset = kwsearch->kwset; if (match_lines) mb_check = longest = false; else { mb_check = localeinfo.multibyte & !localeinfo.using_utf8; longest = mb_check | !!start_ptr | match_words; } for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { ptrdiff_t offset = kwsexec (kwset, beg - match_lines, buf + size - beg + match_lines, &kwsmatch, longest); if (offset < 0) break; len = kwsmatch.size[0] - 2 * match_lines; if (kwsearch->words <= kwsmatch.index) { /* The data contain a multibyte character that matches some pattern character that is a case folded counterpart. Since the kwset code cannot handle this case, fall back on the DFA code, which can. */ if (! kwsearch->re) { fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size); kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size, RE_SYNTAX_GREP); } return EGexecute (kwsearch->re, buf, size, match_size, start_ptr); } if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0) { /* We have matched a single byte that is not at the beginning of a multibyte character. mb_goback has advanced MB_START past that multibyte character. Now, we want to position BEG so that the next kwsexec search starts there. Thus, to compensate for the for-loop's BEG++, above, subtract one here. This code is unusually hard to reach, and exceptionally, let's show how to trigger it here: printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A That assumes the named locale is installed. Note that your system's shift-JIS locale may have a different name, possibly including "sjis". */ beg = mb_start - 1; continue; } beg += offset; if (!!start_ptr & !match_words) goto success_in_beg_and_len; if (match_lines) { len += start_ptr == NULL; goto success_in_beg_and_len; } if (! match_words) goto success; /* Succeed if the preceding and following characters are word constituents. If the following character is not a word constituent, keep trying with shorter matches. */ char const *bol = memrchr (mb_start, eol, beg - mb_start); if (bol) mb_start = bol + 1; if (! wordchar_prev (mb_start, beg, buf + size)) for (;;) { if (! wordchar_next (beg + len, buf + size)) { if (start_ptr) goto success_in_beg_and_len; else goto success; } if (!len) break; offset = kwsexec (kwset, beg, --len, &kwsmatch, true); if (offset != 0) break; len = kwsmatch.size[0]; } /* No word match was found at BEG. Skip past word constituents, since they cannot precede the next match and not skipping them could make things much slower. */ beg += wordchars_size (beg, buf + size); mb_start = beg; } /* for (beg in buf) */ return -1; success: end = memchr (beg + len, eol, (buf + size) - (beg + len)); end = end ? end + 1 : buf + size; beg = memrchr (buf, eol, beg - buf); beg = beg ? beg + 1 : buf; len = end - beg; success_in_beg_and_len:; size_t off = beg - buf; *match_size = len; ret_val = off; return ret_val; }