From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Apr 2010 14:50:23 +0200 Subject: [PATCH 1/2] dfa: optimize UTF-8 period Backport of upstream commits 7a0ad00 and 42ac56a. * src/dfa.h (struct dfa): Add utf8_anychar_classes. * src/dfa.c (add_utf8_anychar): New. (atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR in UTF-8 locales. (dfaoptimize): Abort on ANYCHAR. --- src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- src/dfa.h | 1 + 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index ba78b08..e13c361 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc) } #endif +static void +add_utf8_anychar (void) +{ + static const charclass utf8_classes[5] = { + { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */ + { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */ + { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */ + { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */ + { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */ + }; + const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]); + unsigned int i; + + /* Define the five character classes that are needed below. */ + if (dfa->utf8_anychar_classes[0] == 0) + for (i = 0; i < n; i++) + { + charclass c; + memcpy (c, utf8_classes[i], sizeof c); + if (i == 1) + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + clrbit (eolbyte, c); + if (syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', c); + } + dfa->utf8_anychar_classes[i] = CSET + charclass_index(c); + } + + /* A valid UTF-8 character is + + ([0x00-0x7f] + |[0xc2-0xdf][0x80-0xbf] + |[0xe0-0xef[0x80-0xbf][0x80-0xbf] + |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) + + which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf] + and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse + Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ + for (i = 1; i < n; i++) + addtok (dfa->utf8_anychar_classes[i]); + while (--i > 1) + { + addtok (dfa->utf8_anychar_classes[0]); + addtok (CAT); + addtok (OR); + } +} + /* The grammar understood by the parser is as follows. regexp: @@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc) static void atom (void) { + if (0) + { + /* empty */ + } #ifdef MBS_SUPPORT - if (tok == WCHAR) + else if (tok == WCHAR) { addtok_wc (case_fold ? towlower(wctok) : wctok); #ifndef GREP @@ -1242,16 +1295,28 @@ atom (void) #endif tok = lex(); - return; + } + + else if (tok == ANYCHAR && using_utf8()) + { + /* For UTF-8 expand the period to a series of CSETs that define a valid + UTF-8 character. This avoids using the slow multibyte path. I'm + pretty sure it would be both profitable and correct to do it for + any encoding; however, the optimization must be done manually as + it is done above in add_utf8_anychar. So, let's start with + UTF-8: it is the most used, and the structure of the encoding + makes the correctness more obvious. */ + add_utf8_anychar(); + tok = lex(); } #endif /* MBS_SUPPORT */ - if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF - || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD + else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF + || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD #ifdef MBS_SUPPORT - || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ + || tok == ANYCHAR || tok == MBCSET #endif /* MBS_SUPPORT */ - || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) + || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok(tok); tok = lex(); @@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d) for (i = 0; i < d->tindex; ++i) { switch(d->tokens[i]) - { - case ANYCHAR: - case MBCSET: - /* Requires multi-byte algorithm. */ - return; - default: - break; - } + { + case ANYCHAR: + /* Lowered. */ + abort (); + case MBCSET: + /* Requires multi-byte algorithm. */ + return; + default: + break; + } } free_mbdata (d); diff --git a/src/dfa.h b/src/dfa.h index 1c85207..42c177a 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -283,6 +283,7 @@ struct dfa with dfaparse(). */ #ifdef MBS_SUPPORT unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */ + int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ /* The following are used only if MB_CUR_MAX > 1. */ -- 1.6.6.1