|
Packit |
575503 |
/*
 * re.c - compile regular expressions.
 */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
 * Copyright (C) 1991-2017 the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
#include "awk.h"
|
|
Packit |
575503 |
|
|
Packit |
575503 |
#include "localeinfo.h"
|
|
Packit |
575503 |
|
|
Packit |
575503 |
static reg_syntax_t syn;
|
|
Packit |
575503 |
static void check_bracket_exp(char *s, size_t len);
|
|
Packit |
575503 |
const char *regexflags2str(int flags);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
static struct localeinfo localeinfo;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* make_regexp --- generate compiled regular expressions */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
Regexp *
|
|
Packit |
575503 |
make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
static char metas[] = ".*+(){}[]|?^$\\";
|
|
Packit |
575503 |
Regexp *rp;
|
|
Packit |
575503 |
const char *rerr;
|
|
Packit |
575503 |
const char *src = s;
|
|
Packit |
575503 |
static char *buf = NULL;
|
|
Packit |
575503 |
static size_t buflen;
|
|
Packit |
575503 |
const char *end = s + len;
|
|
Packit |
575503 |
char *dest;
|
|
Packit |
575503 |
int c, c2;
|
|
Packit |
575503 |
static bool first = true;
|
|
Packit |
575503 |
static bool no_dfa = false;
|
|
Packit |
575503 |
int i;
|
|
Packit |
575503 |
static struct dfa* dfaregs[2] = { NULL, NULL };
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* The number of bytes in the current multibyte character.
|
|
Packit |
575503 |
* It is 0, when the current character is a singlebyte character.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
size_t is_multibyte = 0;
|
|
Packit |
575503 |
mbstate_t mbs;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (first) {
|
|
Packit |
575503 |
/* for debugging and testing */
|
|
Packit |
575503 |
no_dfa = (getenv("GAWK_NO_DFA") != NULL);
|
|
Packit |
575503 |
/* don't set first to false here, we do it below */
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* always check */
|
|
Packit |
575503 |
check_bracket_exp((char *) s, len);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* Handle escaped characters first. */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Build a copy of the string (in buf) with the
|
|
Packit |
575503 |
* escaped characters translated, and generate the regex
|
|
Packit |
575503 |
* from that.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
if (buf == NULL) {
|
|
Packit |
575503 |
emalloc(buf, char *, len + 1, "make_regexp");
|
|
Packit |
575503 |
buflen = len;
|
|
Packit |
575503 |
} else if (len > buflen) {
|
|
Packit |
575503 |
erealloc(buf, char *, len + 1, "make_regexp");
|
|
Packit |
575503 |
buflen = len;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
dest = buf;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
while (src < end) {
|
|
Packit |
575503 |
if (gawk_mb_cur_max > 1 && ! is_multibyte) {
|
|
Packit |
575503 |
/* The previous byte is a singlebyte character, or last byte
|
|
Packit |
575503 |
of a multibyte character. We check the next character. */
|
|
Packit |
575503 |
is_multibyte = mbrlen(src, end - src, &mbs);
|
|
Packit |
575503 |
if ( is_multibyte == 1
|
|
Packit |
575503 |
|| is_multibyte == (size_t) -1
|
|
Packit |
575503 |
|| is_multibyte == (size_t) -2
|
|
Packit |
575503 |
|| is_multibyte == 0) {
|
|
Packit |
575503 |
/* We treat it as a single-byte character. */
|
|
Packit |
575503 |
is_multibyte = 0;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* We skip multibyte character, since it must not be a special
|
|
Packit |
575503 |
character. */
|
|
Packit |
575503 |
if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
|
|
Packit |
575503 |
(*src == '\\')) {
|
|
Packit |
575503 |
c = *++src;
|
|
Packit |
575503 |
switch (c) {
|
|
Packit |
575503 |
case 'a':
|
|
Packit |
575503 |
case 'b':
|
|
Packit |
575503 |
case 'f':
|
|
Packit |
575503 |
case 'n':
|
|
Packit |
575503 |
case 'r':
|
|
Packit |
575503 |
case 't':
|
|
Packit |
575503 |
case 'v':
|
|
Packit |
575503 |
case 'x':
|
|
Packit |
575503 |
case '0':
|
|
Packit |
575503 |
case '1':
|
|
Packit |
575503 |
case '2':
|
|
Packit |
575503 |
case '3':
|
|
Packit |
575503 |
case '4':
|
|
Packit |
575503 |
case '5':
|
|
Packit |
575503 |
case '6':
|
|
Packit |
575503 |
case '7':
|
|
Packit |
575503 |
c2 = parse_escape(&src;;
|
|
Packit |
575503 |
if (c2 < 0)
|
|
Packit |
575503 |
cant_happen();
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Unix awk treats octal (and hex?) chars
|
|
Packit |
575503 |
* literally in re's, so escape regexp
|
|
Packit |
575503 |
* metacharacters.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
if (do_traditional
|
|
Packit |
575503 |
&& ! do_posix
|
|
Packit |
575503 |
&& (isdigit(c) || c == 'x')
|
|
Packit |
575503 |
&& strchr("()|*+?.^$\\[]", c2) != NULL)
|
|
Packit |
575503 |
*dest++ = '\\';
|
|
Packit |
575503 |
*dest++ = (char) c2;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
case '8':
|
|
Packit |
575503 |
case '9': /* a\9b not valid */
|
|
Packit |
575503 |
*dest++ = c;
|
|
Packit |
575503 |
src++;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
case 'y': /* normally \b */
|
|
Packit |
575503 |
/* gnu regex op */
|
|
Packit |
575503 |
if (! do_traditional) {
|
|
Packit |
575503 |
*dest++ = '\\';
|
|
Packit |
575503 |
*dest++ = 'b';
|
|
Packit |
575503 |
src++;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
/* else, fall through */
|
|
Packit |
575503 |
default:
|
|
Packit |
575503 |
*dest++ = '\\';
|
|
Packit |
575503 |
*dest++ = (char) c;
|
|
Packit |
575503 |
src++;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
} /* switch */
|
|
Packit |
575503 |
} else {
|
|
Packit |
575503 |
c = *src;
|
|
Packit |
575503 |
*dest++ = *src++; /* not '\\' */
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
if (gawk_mb_cur_max > 1 && is_multibyte)
|
|
Packit |
575503 |
is_multibyte--;
|
|
Packit |
575503 |
} /* while */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
*dest = '\0';
|
|
Packit |
575503 |
len = dest - buf;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
ezalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
|
|
Packit |
575503 |
rp->pat.allocated = 0; /* regex will allocate the buffer */
|
|
Packit |
575503 |
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
|
|
Packit |
575503 |
* was going to turn out to be, I wouldn't have bothered with it.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* In the case where we have a multibyte character set, we have no
|
|
Packit |
575503 |
* choice but to use RE_ICASE, since the casetable is for single-byte
|
|
Packit |
575503 |
* character sets only.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* On the other hand, if we do have a single-byte character set,
|
|
Packit |
575503 |
* using the casetable should give a performance improvement, since
|
|
Packit |
575503 |
* it's computed only once, not each time a regex is compiled. We
|
|
Packit |
575503 |
* also think it's probably better for portability. See the
|
|
Packit |
575503 |
* discussion by the definition of casetable[] in eval.c.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
|
|
Packit |
575503 |
ignorecase = !! ignorecase; /* force to 1 or 0 */
|
|
Packit |
575503 |
if (ignorecase) {
|
|
Packit |
575503 |
if (gawk_mb_cur_max > 1) {
|
|
Packit |
575503 |
syn |= RE_ICASE;
|
|
Packit |
575503 |
rp->pat.translate = NULL;
|
|
Packit |
575503 |
} else {
|
|
Packit |
575503 |
syn &= ~RE_ICASE;
|
|
Packit |
575503 |
rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
} else {
|
|
Packit |
575503 |
rp->pat.translate = NULL;
|
|
Packit |
575503 |
syn &= ~RE_ICASE;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* initialize dfas to hold syntax */
|
|
Packit |
575503 |
if (first) {
|
|
Packit |
575503 |
first = false;
|
|
Packit |
575503 |
dfaregs[0] = dfaalloc();
|
|
Packit |
575503 |
dfaregs[1] = dfaalloc();
|
|
Packit |
575503 |
dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
|
|
Packit |
575503 |
dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
re_set_syntax(syn);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
|
|
Packit |
575503 |
refree(rp);
|
|
Packit |
575503 |
if (! canfatal) {
|
|
Packit |
575503 |
/* rerr already gettextized inside regex routines */
|
|
Packit |
575503 |
error("%s: /%s/", rerr, buf);
|
|
Packit |
575503 |
return NULL;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
fatal("%s: /%s/", rerr, buf);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* gack. this must be done *after* re_compile_pattern */
|
|
Packit |
575503 |
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
|
|
Packit |
575503 |
if (dfa && ! no_dfa) {
|
|
Packit |
575503 |
rp->dfareg = dfaalloc();
|
|
Packit |
575503 |
dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
|
|
Packit |
575503 |
dfacomp(buf, len, rp->dfareg, true);
|
|
Packit |
575503 |
} else
|
|
Packit |
575503 |
rp->dfareg = NULL;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* Additional flags that help with RS as regexp. */
|
|
Packit |
575503 |
for (i = 0; i < len; i++) {
|
|
Packit |
575503 |
if (strchr(metas, buf[i]) != NULL) {
|
|
Packit |
575503 |
rp->has_meta = true;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
for (i = len - 1; i >= 0; i--) {
|
|
Packit |
575503 |
if (strchr("*+|?", buf[i]) != NULL) {
|
|
Packit |
575503 |
rp->maybe_long = true;
|
|
Packit |
575503 |
break;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
return rp;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* research --- do a regexp search. use dfa if possible */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
int
|
|
Packit |
575503 |
research(Regexp *rp, char *str, int start,
|
|
Packit |
575503 |
size_t len, int flags)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
const char *ret = str;
|
|
Packit |
575503 |
bool try_backref = false;
|
|
Packit |
575503 |
int need_start;
|
|
Packit |
575503 |
int no_bol;
|
|
Packit |
575503 |
int res;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
need_start = ((flags & RE_NEED_START) != 0);
|
|
Packit |
575503 |
no_bol = ((flags & RE_NO_BOL) != 0);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (no_bol)
|
|
Packit |
575503 |
rp->pat.not_bol = 1;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Always do dfa search if can; if it fails, then even if
|
|
Packit |
575503 |
* need_start is true, we won't bother with the regex search.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* The dfa matcher doesn't have a no_bol flag, so don't bother
|
|
Packit |
575503 |
* trying it in that case.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* 7/2008: Skip the dfa matcher if need_start. The dfa matcher
|
|
Packit |
575503 |
* has bugs in certain multibyte cases and it's too difficult
|
|
Packit |
575503 |
* to try to special case things.
|
|
Packit |
575503 |
* 7/2017: Apparently there are some cases where DFA gets
|
|
Packit |
575503 |
* stuck, even in the C locale, so we use dfa only if not need_start.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* Should that issue ever get resolved, note this comment:
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* 7/2016: The dfa matcher can't handle a case where searching
|
|
Packit |
575503 |
* starts in the middle of a string, so don't bother trying it
|
|
Packit |
575503 |
* in that case.
|
|
Packit |
575503 |
* if (rp->dfa && ! no_bol && start == 0) ...
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
if (rp->dfareg != NULL && ! no_bol && ! need_start) {
|
|
Packit |
575503 |
struct dfa *superset = dfasuperset(rp->dfareg);
|
|
Packit |
575503 |
if (superset)
|
|
Packit |
575503 |
ret = dfaexec(superset, str+start, str+start+len,
|
|
Packit |
575503 |
true, NULL, NULL);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (ret && (! need_start
|
|
Packit |
575503 |
|| (! superset && dfaisfast(rp->dfareg))))
|
|
Packit |
575503 |
ret = dfaexec(rp->dfareg, str+start, str+start+len,
|
|
Packit |
575503 |
true, NULL, &try_backref);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (ret) {
|
|
Packit |
575503 |
if ( rp->dfareg == NULL
|
|
Packit |
575503 |
|| start != 0
|
|
Packit |
575503 |
|| no_bol
|
|
Packit |
575503 |
|| need_start
|
|
Packit |
575503 |
|| try_backref) {
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Passing NULL as last arg speeds up search for cases
|
|
Packit |
575503 |
* where we don't need the start/end info.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
res = re_search(&(rp->pat), str, start+len,
|
|
Packit |
575503 |
start, len, need_start ? &(rp->regs) : NULL);
|
|
Packit |
575503 |
} else
|
|
Packit |
575503 |
res = 1;
|
|
Packit |
575503 |
} else
|
|
Packit |
575503 |
res = -1;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
rp->pat.not_bol = 0;
|
|
Packit |
575503 |
return res;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* refree --- free up the dynamic memory used by a compiled regexp */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
void
|
|
Packit |
575503 |
refree(Regexp *rp)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
if (rp == NULL)
|
|
Packit |
575503 |
return;
|
|
Packit |
575503 |
rp->pat.translate = NULL;
|
|
Packit |
575503 |
regfree(& rp->pat);
|
|
Packit |
575503 |
if (rp->regs.start)
|
|
Packit |
575503 |
free(rp->regs.start);
|
|
Packit |
575503 |
if (rp->regs.end)
|
|
Packit |
575503 |
free(rp->regs.end);
|
|
Packit |
575503 |
if (rp->dfareg != NULL) {
|
|
Packit |
575503 |
dfafree(rp->dfareg);
|
|
Packit |
575503 |
free(rp->dfareg);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
efree(rp);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* dfaerror --- print an error message for the dfa routines */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
void
|
|
Packit |
575503 |
dfaerror(const char *s)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
fatal("%s", s);
|
|
Packit |
575503 |
exit(EXIT_FATAL); /* for DJGPP */
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* re_update --- recompile a dynamic regexp */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
Regexp *
|
|
Packit |
575503 |
re_update(NODE *t)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
NODE *t1;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (t->type == Node_val && (t->flags & REGEX) != 0)
|
|
Packit |
575503 |
return t->typed_re->re_reg[IGNORECASE];
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if ((t->re_flags & CONSTANT) != 0) {
|
|
Packit |
575503 |
/* it's a constant, so just return it as is */
|
|
Packit |
575503 |
assert(t->type == Node_regex);
|
|
Packit |
575503 |
return t->re_reg[IGNORECASE];
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
t1 = t->re_exp;
|
|
Packit |
575503 |
if (t->re_text != NULL) {
|
|
Packit |
575503 |
/* if contents haven't changed, just return it */
|
|
Packit |
575503 |
if (cmp_nodes(t->re_text, t1, true) == 0)
|
|
Packit |
575503 |
return t->re_reg[IGNORECASE];
|
|
Packit |
575503 |
/* things changed, fall through to recompile */
|
|
Packit |
575503 |
unref(t->re_text);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
/* get fresh copy of the text of the regexp */
|
|
Packit |
575503 |
t->re_text = dupnode(t1);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* text changed */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* free old */
|
|
Packit |
575503 |
if (t->re_reg[0] != NULL)
|
|
Packit |
575503 |
refree(t->re_reg[0]);
|
|
Packit |
575503 |
if (t->re_reg[1] != NULL)
|
|
Packit |
575503 |
refree(t->re_reg[1]);
|
|
Packit |
575503 |
if (t->re_cnt > 0)
|
|
Packit |
575503 |
t->re_cnt++;
|
|
Packit |
575503 |
if (t->re_cnt > 10)
|
|
Packit |
575503 |
t->re_cnt = 0;
|
|
Packit |
575503 |
if (t->re_text == NULL) {
|
|
Packit |
575503 |
/* reset regexp text if needed */
|
|
Packit |
575503 |
t1 = t->re_exp;
|
|
Packit |
575503 |
unref(t->re_text);
|
|
Packit |
575503 |
t->re_text = dupnode(t1);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
/* compile it */
|
|
Packit |
575503 |
t->re_reg[0] = make_regexp(t->re_text->stptr, t->re_text->stlen,
|
|
Packit |
575503 |
false, t->re_cnt, true);
|
|
Packit |
575503 |
t->re_reg[1] = make_regexp(t->re_text->stptr, t->re_text->stlen,
|
|
Packit |
575503 |
true, t->re_cnt, true);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
return t->re_reg[IGNORECASE];
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* resetup --- choose what kind of regexps we match */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
void
|
|
Packit |
575503 |
resetup()
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
// init localeinfo for dfa
|
|
Packit |
575503 |
init_localeinfo(& localeinfo);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Syntax bits: _that_ is yet another mind trip. Recreational drugs
|
|
Packit |
575503 |
* are helpful for recovering from the experience.
|
|
Packit |
575503 |
*
|
|
Packit |
575503 |
* Aharon Robbins <arnold@skeeve.com>
|
|
Packit |
575503 |
* Sun, 21 Oct 2007 23:55:33 +0200
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
if (do_posix)
|
|
Packit |
575503 |
syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
|
|
Packit |
575503 |
else if (do_traditional)
|
|
Packit |
575503 |
syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
|
|
Packit |
575503 |
else
|
|
Packit |
575503 |
syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
|
|
Packit |
575503 |
* Interval expressions are now on by default, as POSIX is
|
|
Packit |
575503 |
* wide-spread enough that people want it. The do_intervals
|
|
Packit |
575503 |
* variable remains for use with --traditional.
|
|
Packit |
575503 |
*/
|
|
Packit |
575503 |
if (do_intervals)
|
|
Packit |
575503 |
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
(void) re_set_syntax(syn);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* using_utf8 --- are we using utf8 */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
bool
|
|
Packit |
575503 |
using_utf8(void)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
return localeinfo.using_utf8;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* reisstring --- return true if the RE match is a simple string match */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
int
|
|
Packit |
575503 |
reisstring(const char *text, size_t len, Regexp *re, const char *buf)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
int res;
|
|
Packit |
575503 |
const char *matched;
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* simple checking for meta characters in re */
|
|
Packit |
575503 |
if (re->has_meta)
|
|
Packit |
575503 |
return false; /* give up early, can't be string match */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* make accessable to gdb */
|
|
Packit |
575503 |
matched = &buf[RESTART(re, buf)];
|
|
Packit |
575503 |
|
|
Packit |
575503 |
res = (memcmp(text, matched, len) == 0);
|
|
Packit |
575503 |
|
|
Packit |
575503 |
return res;
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/* reflags2str --- make a regex flags value readable */
|
|
Packit |
575503 |
|
|
Packit |
575503 |
const char *
|
|
Packit |
575503 |
reflags2str(int flagval)
|
|
Packit |
575503 |
{
|
|
Packit |
575503 |
static const struct flagtab values[] = {
|
|
Packit |
575503 |
{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
|
|
Packit |
575503 |
{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
|
|
Packit |
575503 |
{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
|
|
Packit |
575503 |
{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
|
|
Packit |
575503 |
{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
|
|
Packit |
575503 |
{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
|
|
Packit |
575503 |
{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
|
|
Packit |
575503 |
{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
|
|
Packit |
575503 |
{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
|
|
Packit |
575503 |
{ RE_INTERVALS, "RE_INTERVALS" },
|
|
Packit |
575503 |
{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
|
|
Packit |
575503 |
{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
|
|
Packit |
575503 |
{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
|
|
Packit |
575503 |
{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
|
|
Packit |
575503 |
{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
|
|
Packit |
575503 |
{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
|
|
Packit |
575503 |
{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
|
|
Packit |
575503 |
{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
|
|
Packit |
575503 |
{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
|
|
Packit |
575503 |
{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
|
|
Packit |
575503 |
{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
|
|
Packit |
575503 |
{ RE_ICASE, "RE_ICASE" },
|
|
Packit |
575503 |
{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
|
|
Packit |
575503 |
{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
|
|
Packit |
575503 |
{ RE_NO_SUB, "RE_NO_SUB" },
|
|
Packit |
575503 |
{ 0, NULL },
|
|
Packit |
575503 |
};
|
|
Packit |
575503 |
|
|
Packit |
575503 |
if (flagval == RE_SYNTAX_EMACS) /* == 0 */
|
|
Packit |
575503 |
return "RE_SYNTAX_EMACS";
|
|
Packit |
575503 |
|
|
Packit |
575503 |
return genflags2str(flagval, values);
|
|
Packit |
575503 |
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
 * dfawarn --- warning hook for the dfa routines
 *
 * The dfa code expects its user to supply a dfawarn(); this is gawk's.
 */

void
dfawarn(const char *dfa_warning)
{
	/*
	 * This routine does nothing, since gawk does its own
	 * (better) check for bad [[:foo:]] syntax.
	 */
	(void) dfa_warning;	/* deliberately unused */
}
|
|
Packit |
575503 |
|
|
Packit |
575503 |
/*
 * check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/
 *
 * Scans the LENGTH bytes at S for bracket expressions.  Whenever a
 * complete bracket expression is exactly the text of a POSIX character
 * class name (e.g. "[:alpha:]"), a warning is issued, at most once per
 * class name for the life of the program.  Scanning stops at the first
 * unterminated bracket expression or embedded NUL byte.
 *
 * The text is only read; nothing at or beyond s[length] is touched.
 */

static void
check_bracket_exp(char *s, size_t length)
{
	static struct reclass {
		const char *name;
		size_t len;
		bool warned;	/* true once this class has been reported */
	} classes[] = {
		/*
		 * Ordered by what we hope is frequency,
		 * since it's linear searched.
		 */
		{ "[:alpha:]", 9, false },
		{ "[:digit:]", 9, false },
		{ "[:alnum:]", 9, false },
		{ "[:upper:]", 9, false },
		{ "[:lower:]", 9, false },
		{ "[:space:]", 9, false },
		{ "[:xdigit:]", 10, false },
		{ "[:punct:]", 9, false },
		{ "[:print:]", 9, false },
		{ "[:graph:]", 9, false },
		{ "[:cntrl:]", 9, false },
		{ "[:blank:]", 9, false },
		{ NULL, 0, false }
	};
	int i;
	int len;	/* int, because it is passed for "%.*s" */
	int count;	/* nesting depth of '[' still open */
	bool literal;
	char *sp, *sp2, *end;

	if (length == 0)
		return;

	end = s + length;
	sp = s;

	while (sp < end) {
		/* sp2 remembers where this bracket expression starts */
		sp = sp2 = memchr(sp, '[', (end - sp));
		if (sp == NULL)
			return;

		count = 1;
		for (sp++; sp < end && *sp != '\0'; sp++) {
			if (*sp == '[')
				count++;
			else if (*sp == ']') {
				/*
				 * A ']' does not close anything when it is
				 * a literal member of the list:
				 *	] as first char after open [
				 *	\]
				 *	[^]]
				 */
				literal = (   sp[-1] == '['
					   || sp[-1] == '\\'
					   || (   (sp - sp2) >= 2
					       && sp[-1] == '^'
					       && sp[-2] == '['));
				if (! literal)
					count--;
			}

			if (count == 0) {
				sp++;	/* skip past ']' */
				break;
			}
		}

		if (count > 0)	/* bad regex, give up */
			return;

		/* [sp2, sp) is one complete bracket expression */
		for (i = 0; classes[i].name != NULL; i++) {
			if (classes[i].warned)
				continue;
			len = classes[i].len;
			if (   len == (sp - sp2)
			    && memcmp(sp2, classes[i].name, len) == 0) {
				warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
						len, sp2, len, sp2);
				classes[i].warned = true;
				break;
			}
		}
	}
}
|