Matthew Booth 00c1a6
commit f591ddde8913633972409b9ebb3967738007730e
Matthew Booth 00c1a6
Author: David Lutterkort <lutter@redhat.com>
Matthew Booth 00c1a6
Date:   Sun Nov 13 19:38:39 2011 -0800
Matthew Booth 00c1a6
Matthew Booth 00c1a6
    * src/fa.c (totalize): handle case-insensitive FAs properly
Matthew Booth 00c1a6
    
Matthew Booth 00c1a6
    The convention for case-insensitive FAs is that they do not contain any
Matthew Booth 00c1a6
    transitions on [A-Z], effectively removing upper case letters from the
Matthew Booth 00c1a6
    alphabet.
Matthew Booth 00c1a6
    
Matthew Booth 00c1a6
    totalize used to create transitions into the crash state that did
Matthew Booth 00c1a6
    transition on upper case letters, violating the convention.
Matthew Booth 00c1a6
Matthew Booth 00c1a6
diff --git a/src/fa.c b/src/fa.c
Matthew Booth 00c1a6
index ecfe8f4..40194e3 100644
Matthew Booth 00c1a6
--- a/src/fa.c
Matthew Booth 00c1a6
+++ b/src/fa.c
Matthew Booth 00c1a6
@@ -60,6 +60,10 @@ int fa_minimization_algorithm = FA_MIN_HOPCROFT;
Matthew Booth 00c1a6
  * fa_as_regexp, we store regexps on transitions in the re field of each
Matthew Booth 00c1a6
  * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
Matthew Booth 00c1a6
  * produce proper graphs of an automaton transitioning on regexps.
Matthew Booth 00c1a6
+ *
Matthew Booth 00c1a6
+ * For case-insensitive regexps (nocase == 1), the FA never has transitions
Matthew Booth 00c1a6
+ * on uppercase letters [A-Z], effectively removing these letters from the
Matthew Booth 00c1a6
+ * alphabet.
Matthew Booth 00c1a6
  */
Matthew Booth 00c1a6
 struct fa {
Matthew Booth 00c1a6
     struct state *initial;
Matthew Booth 00c1a6
@@ -2344,6 +2348,34 @@ int fa_contains(struct fa *fa1, struct fa *fa2) {
Matthew Booth 00c1a6
     goto done;
Matthew Booth 00c1a6
 }
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
+static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
Matthew Booth 00c1a6
+                           int min, int max) {
Matthew Booth 00c1a6
+    int result;
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    if (fa->nocase) {
Matthew Booth 00c1a6
+        /* Never transition on anything in [A-Z] */
Matthew Booth 00c1a6
+        if (min > 'Z' || max < 'A') {
Matthew Booth 00c1a6
+            result = add_new_trans(s, crash, min, max);
Matthew Booth 00c1a6
+        } else if (min >= 'A' && max <= 'Z') {
Matthew Booth 00c1a6
+            result = 0;
Matthew Booth 00c1a6
+        } else if (max <= 'Z') {
Matthew Booth 00c1a6
+            /* min < 'A' */
Matthew Booth 00c1a6
+            result = add_new_trans(s, crash, min, 'A' - 1);
Matthew Booth 00c1a6
+        } else if (min >= 'A') {
Matthew Booth 00c1a6
+            /* max > 'Z' */
Matthew Booth 00c1a6
+            result = add_new_trans(s, crash, 'Z' + 1, max);
Matthew Booth 00c1a6
+        } else {
Matthew Booth 00c1a6
+            /* min < 'A' && max > 'Z' */
Matthew Booth 00c1a6
+            result = add_new_trans(s, crash, min, 'A' - 1);
Matthew Booth 00c1a6
+            if (result == 0)
Matthew Booth 00c1a6
+                result = add_new_trans(s, crash, 'Z' + 1, max);
Matthew Booth 00c1a6
+        }
Matthew Booth 00c1a6
+    } else {
Matthew Booth 00c1a6
+        result = add_new_trans(s, crash, min, max);
Matthew Booth 00c1a6
+    }
Matthew Booth 00c1a6
+    return result;
Matthew Booth 00c1a6
+}
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
 static int totalize(struct fa *fa) {
Matthew Booth 00c1a6
     int r;
Matthew Booth 00c1a6
     struct state *crash = add_state(fa, 0);
Matthew Booth 00c1a6
@@ -2352,42 +2384,25 @@ static int totalize(struct fa *fa) {
Matthew Booth 00c1a6
     F(mark_reachable(fa));
Matthew Booth 00c1a6
     sort_transition_intervals(fa);
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
-    if (fa->nocase) {
Matthew Booth 00c1a6
-        r = add_new_trans(crash, crash, UCHAR_MIN, 'A' - 1);
Matthew Booth 00c1a6
-        if (r < 0)
Matthew Booth 00c1a6
-            return -1;
Matthew Booth 00c1a6
-        r = add_new_trans(crash, crash, 'Z' + 1, UCHAR_MAX);
Matthew Booth 00c1a6
-        if (r < 0)
Matthew Booth 00c1a6
-            return -1;
Matthew Booth 00c1a6
-    } else {
Matthew Booth 00c1a6
-        r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
Matthew Booth 00c1a6
-        if (r < 0)
Matthew Booth 00c1a6
-            return -1;
Matthew Booth 00c1a6
-    }
Matthew Booth 00c1a6
+    r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
Matthew Booth 00c1a6
+    if (r < 0)
Matthew Booth 00c1a6
+        return -1;
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
     list_for_each(s, fa->initial) {
Matthew Booth 00c1a6
         int next = UCHAR_MIN;
Matthew Booth 00c1a6
         int tused = s->tused;
Matthew Booth 00c1a6
         for (int i=0; i < tused; i++) {
Matthew Booth 00c1a6
             uchar min = s->trans[i].min, max = s->trans[i].max;
Matthew Booth 00c1a6
-            if (fa->nocase) {
Matthew Booth 00c1a6
-                /* Don't add transitions on [A-Z] into crash */
Matthew Booth 00c1a6
-                if (isupper(min)) min = 'A';
Matthew Booth 00c1a6
-                if (isupper(max)) max = 'Z';
Matthew Booth 00c1a6
-            }
Matthew Booth 00c1a6
             if (min > next) {
Matthew Booth 00c1a6
-                r = add_new_trans(s, crash, next, min - 1);
Matthew Booth 00c1a6
+                r = add_crash_trans(fa, s, crash, next, min - 1);
Matthew Booth 00c1a6
                 if (r < 0)
Matthew Booth 00c1a6
                     return -1;
Matthew Booth 00c1a6
             }
Matthew Booth 00c1a6
-            if (max + 1 > next) {
Matthew Booth 00c1a6
+            if (max + 1 > next)
Matthew Booth 00c1a6
                 next = max + 1;
Matthew Booth 00c1a6
-                if (fa->nocase && isupper(next))
Matthew Booth 00c1a6
-                    next = 'Z' + 1;
Matthew Booth 00c1a6
-            }
Matthew Booth 00c1a6
         }
Matthew Booth 00c1a6
         if (next <= UCHAR_MAX) {
Matthew Booth 00c1a6
-            r = add_new_trans(s, crash, next, UCHAR_MAX);
Matthew Booth 00c1a6
+            r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
Matthew Booth 00c1a6
             if (r < 0)
Matthew Booth 00c1a6
                 return -1;
Matthew Booth 00c1a6
         }
Matthew Booth 00c1a6
@@ -3019,6 +3034,10 @@ int fa_nocase(struct fa *fa) {
Matthew Booth 00c1a6
                 /* t->min < 'A' */
Matthew Booth 00c1a6
                 t->max = 'A' - 1;
Matthew Booth 00c1a6
                 F(add_new_trans(s, t->to, lc_min, lc_max));
Matthew Booth 00c1a6
+            } else if (t->min >= 'A') {
Matthew Booth 00c1a6
+                /* t->max > 'Z' */
Matthew Booth 00c1a6
+                t->min = 'Z' + 1;
Matthew Booth 00c1a6
+                F(add_new_trans(s, t->to, lc_min, lc_max));
Matthew Booth 00c1a6
             } else {
Matthew Booth 00c1a6
                 /* t->min < 'A' && t->max > 'Z' */
Matthew Booth 00c1a6
                 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
Matthew Booth 00c1a6
diff --git a/tests/fatest.c b/tests/fatest.c
Matthew Booth 00c1a6
index be4460b..e3658ab 100644
Matthew Booth 00c1a6
--- a/tests/fatest.c
Matthew Booth 00c1a6
+++ b/tests/fatest.c
Matthew Booth 00c1a6
@@ -581,6 +581,24 @@ static void testExpandNoCase(CuTest *tc) {
Matthew Booth 00c1a6
     free(s);
Matthew Booth 00c1a6
 }
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
+static void testNoCaseComplement(CuTest *tc) {
Matthew Booth 00c1a6
+    const char *key_s = "keY";
Matthew Booth 00c1a6
+    struct fa *key = make_good_fa(tc, key_s);
Matthew Booth 00c1a6
+    struct fa *isect = NULL;
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    fa_nocase(key);
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    struct fa *comp = mark(fa_complement(key));
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    key = make_good_fa(tc, key_s);
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    /* We used to have a bug in totalize that caused the intersection
Matthew Booth 00c1a6
+     * to contain "keY" */
Matthew Booth 00c1a6
+    isect = fa_intersect(key, comp);
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
+    CuAssertIntEquals(tc, 1, fa_is_basic(isect, FA_EMPTY));
Matthew Booth 00c1a6
+}
Matthew Booth 00c1a6
+
Matthew Booth 00c1a6
 int main(int argc, char **argv) {
Matthew Booth 00c1a6
     if (argc == 1) {
Matthew Booth 00c1a6
         char *output = NULL;
Matthew Booth 00c1a6
@@ -605,6 +623,7 @@ int main(int argc, char **argv) {
Matthew Booth 00c1a6
         SUITE_ADD_TEST(suite, testExpandCharRanges);
Matthew Booth 00c1a6
         SUITE_ADD_TEST(suite, testNoCase);
Matthew Booth 00c1a6
         SUITE_ADD_TEST(suite, testExpandNoCase);
Matthew Booth 00c1a6
+        SUITE_ADD_TEST(suite, testNoCaseComplement);
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
         CuSuiteRun(suite);
Matthew Booth 00c1a6
         CuSuiteSummary(suite, &output);
Matthew Booth 00c1a6
diff --git a/tests/modules/pass_nocase.aug b/tests/modules/pass_nocase.aug
Matthew Booth 00c1a6
index 6d254a3..ef248f4 100644
Matthew Booth 00c1a6
--- a/tests/modules/pass_nocase.aug
Matthew Booth 00c1a6
+++ b/tests/modules/pass_nocase.aug
Matthew Booth 00c1a6
@@ -10,7 +10,7 @@ test lns1 get "KEY" = { "1" = "KEY" }
Matthew Booth 00c1a6
 test lns1 get "KeY" = { "1" = "KeY" }
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
 let lns2 =
Matthew Booth 00c1a6
-  let re = /[a-z]/i - /Key/i in
Matthew Booth 00c1a6
+  let re = /[A-Za-z]+/ - /Key/i in
Matthew Booth 00c1a6
   [ label "1" . store re ] | [ label "2" . store /Key/i ]
Matthew Booth 00c1a6
 
Matthew Booth 00c1a6
 test lns2 get "Key" = { "2" = "Key" }