commit f591ddde8913633972409b9ebb3967738007730e
Author: David Lutterkort
Date:   Sun Nov 13 19:38:39 2011 -0800

    * src/fa.c (totalize): handle case-insensitive FA's properly

    The convention for case-insensitive FA's is that they do not contain any
    transitions on [A-Z], effectively removing upper case letters from the
    alphabet.

    totalize used to create transitions into the crash state that did
    transition on upper case letters, violating the convention.

diff --git a/src/fa.c b/src/fa.c
index ecfe8f4..40194e3 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -60,6 +60,10 @@ int fa_minimization_algorithm = FA_MIN_HOPCROFT;
  * fa_as_regexp, we store regexps on transitions in the re field of each
  * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
  * produce proper graphs of an automaton transitioning on regexps.
+ *
+ * For case-insensitive regexps (nocase == 1), the FA never has transitions
+ * on uppercase letters [A-Z], effectively removing these letters from the
+ * alphabet.
  */
 struct fa {
     struct state *initial;
@@ -2344,6 +2348,34 @@ int fa_contains(struct fa *fa1, struct fa *fa2) {
     goto done;
 }
 
+static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
+                           int min, int max) {
+    int result;
+
+    if (fa->nocase) {
+        /* Never transition on anything in [A-Z] */
+        if (min > 'Z' || max < 'A') {
+            result = add_new_trans(s, crash, min, max);
+        } else if (min >= 'A' && max <= 'Z') {
+            result = 0;
+        } else if (max <= 'Z') {
+            /* min < 'A' */
+            result = add_new_trans(s, crash, min, 'A' - 1);
+        } else if (min >= 'A') {
+            /* max > 'Z' */
+            result = add_new_trans(s, crash, 'Z' + 1, max);
+        } else {
+            /* min < 'A' && max > 'Z' */
+            result = add_new_trans(s, crash, min, 'A' - 1);
+            if (result == 0)
+                result = add_new_trans(s, crash, 'Z' + 1, max);
+        }
+    } else {
+        result = add_new_trans(s, crash, min, max);
+    }
+    return result;
+}
+
 static int totalize(struct fa *fa) {
     int r;
     struct state *crash = add_state(fa, 0);
@@ -2352,42 +2384,25 @@ static int totalize(struct fa *fa) {
     F(mark_reachable(fa));
     sort_transition_intervals(fa);
 
-    if (fa->nocase) {
-        r = add_new_trans(crash, crash, UCHAR_MIN, 'A' - 1);
-        if (r < 0)
-            return -1;
-        r = add_new_trans(crash, crash, 'Z' + 1, UCHAR_MAX);
-        if (r < 0)
-            return -1;
-    } else {
-        r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
-        if (r < 0)
-            return -1;
-    }
+    r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
+    if (r < 0)
+        return -1;
 
     list_for_each(s, fa->initial) {
         int next = UCHAR_MIN;
         int tused = s->tused;
         for (int i=0; i < tused; i++) {
             uchar min = s->trans[i].min, max = s->trans[i].max;
-            if (fa->nocase) {
-                /* Don't add transitions on [A-Z] into crash */
-                if (isupper(min)) min = 'A';
-                if (isupper(max)) max = 'Z';
-            }
             if (min > next) {
-                r = add_new_trans(s, crash, next, min - 1);
+                r = add_crash_trans(fa, s, crash, next, min - 1);
                 if (r < 0)
                     return -1;
             }
-            if (max + 1 > next) {
+            if (max + 1 > next)
                 next = max + 1;
-                if (fa->nocase && isupper(next))
-                    next = 'Z' + 1;
-            }
         }
         if (next <= UCHAR_MAX) {
-            r = add_new_trans(s, crash, next, UCHAR_MAX);
+            r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
             if (r < 0)
                 return -1;
         }
@@ -3019,6 +3034,10 @@ int fa_nocase(struct fa *fa) {
                 /* t->min < 'A' */
                 t->max = 'A' - 1;
                 F(add_new_trans(s, t->to, lc_min, lc_max));
+            } else if (t->min >= 'A') {
+                /* t->max > 'Z' */
+                t->min = 'Z' + 1;
+                F(add_new_trans(s, t->to, lc_min, lc_max));
             } else {
                 /* t->min < 'A' && t->max > 'Z' */
                 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
diff --git a/tests/fatest.c b/tests/fatest.c
index be4460b..e3658ab 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -581,6 +581,24 @@ static void testExpandNoCase(CuTest *tc) {
     free(s);
 }
 
+static void testNoCaseComplement(CuTest *tc) {
+    const char *key_s = "keY";
+    struct fa *key = make_good_fa(tc, key_s);
+    struct fa *isect = NULL;
+
+    fa_nocase(key);
+
+    struct fa *comp = mark(fa_complement(key));
+
+    key = make_good_fa(tc, key_s);
+
+    /* We used to have a bug in totalize that caused the intersection
+     * to contain "keY" */
+    isect = fa_intersect(key, comp);
+
+    CuAssertIntEquals(tc, 1, fa_is_basic(isect, FA_EMPTY));
+}
+
 int main(int argc, char **argv) {
     if (argc == 1) {
         char *output = NULL;
@@ -605,6 +623,7 @@ int main(int argc, char **argv) {
         SUITE_ADD_TEST(suite, testExpandCharRanges);
         SUITE_ADD_TEST(suite, testNoCase);
         SUITE_ADD_TEST(suite, testExpandNoCase);
+        SUITE_ADD_TEST(suite, testNoCaseComplement);
 
         CuSuiteRun(suite);
         CuSuiteSummary(suite, &output);
diff --git a/tests/modules/pass_nocase.aug b/tests/modules/pass_nocase.aug
index 6d254a3..ef248f4 100644
--- a/tests/modules/pass_nocase.aug
+++ b/tests/modules/pass_nocase.aug
@@ -10,7 +10,7 @@ test lns1 get "KEY" = { "1" = "KEY" }
 test lns1 get "KeY" = { "1" = "KeY" }
 
 let lns2 =
-  let re = /[a-z]/i - /Key/i in
+  let re = /[A-Za-z]+/ - /Key/i in
   [ label "1" . store re ] | [ label "2" . store /Key/i ]
 
 test lns2 get "Key" = { "2" = "Key" }