Jaroslav Škarvada 6ccb10
From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
Jaroslav Škarvada 6ccb10
From: Paolo Bonzini <bonzini@gnu.org>
Jaroslav Škarvada 6ccb10
Date: Mon, 19 Apr 2010 14:50:23 +0200
Jaroslav Škarvada 6ccb10
Subject: [PATCH 1/2] dfa: optimize UTF-8 period
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
Backport of upstream commits 7a0ad00 and 42ac56a.
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
* src/dfa.h (struct dfa): Add utf8_anychar_classes.
Jaroslav Škarvada 6ccb10
* src/dfa.c (add_utf8_anychar): New.
Jaroslav Škarvada 6ccb10
(atom): Simplify if/else nesting.  Call add_utf8_anychar for ANYCHAR
Jaroslav Škarvada 6ccb10
in UTF-8 locales.
Jaroslav Škarvada 6ccb10
(dfaoptimize): Abort on ANYCHAR.
Jaroslav Škarvada 6ccb10
---
Jaroslav Škarvada 6ccb10
 src/dfa.c |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Jaroslav Škarvada 6ccb10
 src/dfa.h |    1 +
Jaroslav Škarvada 6ccb10
 2 files changed, 82 insertions(+), 14 deletions(-)
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
diff --git a/src/dfa.c b/src/dfa.c
Jaroslav Škarvada 6ccb10
index ba78b08..e13c361 100644
Jaroslav Škarvada 6ccb10
--- a/src/dfa.c
Jaroslav Škarvada 6ccb10
+++ b/src/dfa.c
Jaroslav Škarvada 6ccb10
@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
Jaroslav Škarvada 6ccb10
 }
Jaroslav Škarvada 6ccb10
 #endif
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
+static void
Jaroslav Škarvada 6ccb10
+add_utf8_anychar (void)
Jaroslav Škarvada 6ccb10
+{
Jaroslav Škarvada 6ccb10
+  static const charclass utf8_classes[5] = {
Jaroslav Škarvada 6ccb10
+      {  0,  0,  0,  0, ~0, ~0, 0, 0 },            /* 80-bf: non-lead bytes */
Jaroslav Škarvada 6ccb10
+      { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 },   /* 00-bf, f8-ff: 1-byte/invalid */
Jaroslav Škarvada 6ccb10
+      {  0,  0,  0,  0,  0,  0, ~0, 0 },           /* c0-df: 2-byte sequence */
Jaroslav Škarvada 6ccb10
+      {  0,  0,  0,  0,  0,  0,  0, 0xffff },      /* e0-ef: 3-byte sequence */
Jaroslav Škarvada 6ccb10
+      {  0,  0,  0,  0,  0,  0,  0, 0xff0000 }     /* f0-f7: 4-byte sequence */
Jaroslav Škarvada 6ccb10
+  };
Jaroslav Škarvada 6ccb10
+  const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
Jaroslav Škarvada 6ccb10
+  unsigned int i;
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  /* Define the five character classes that are needed below.  */
Jaroslav Škarvada 6ccb10
+  if (dfa->utf8_anychar_classes[0] == 0)
Jaroslav Škarvada 6ccb10
+    for (i = 0; i < n; i++)
Jaroslav Škarvada 6ccb10
+      {
Jaroslav Škarvada 6ccb10
+        charclass c;
Jaroslav Škarvada 6ccb10
+        memcpy (c, utf8_classes[i], sizeof c);
Jaroslav Škarvada 6ccb10
+        if (i == 1)
Jaroslav Škarvada 6ccb10
+          {
Jaroslav Škarvada 6ccb10
+            if (!(syntax_bits & RE_DOT_NEWLINE))
Jaroslav Škarvada 6ccb10
+              clrbit (eolbyte, c);
Jaroslav Škarvada 6ccb10
+            if (syntax_bits & RE_DOT_NOT_NULL)
Jaroslav Škarvada 6ccb10
+              clrbit ('\0', c);
Jaroslav Škarvada 6ccb10
+          }
Jaroslav Škarvada 6ccb10
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
Jaroslav Škarvada 6ccb10
+      }
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  /* A valid UTF-8 character is
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+          ([0x00-0x7f]
Jaroslav Škarvada 6ccb10
+           |[0xc2-0xdf][0x80-0xbf]
Jaroslav Škarvada 6ccb10
+           |[0xe0-0xef][0x80-0xbf][0x80-0xbf]
Jaroslav Škarvada 6ccb10
+           |[0xf0-0xf7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x80-0xbf]
Jaroslav Škarvada 6ccb10
+     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
Jaroslav Škarvada 6ccb10
+     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
Jaroslav Škarvada 6ccb10
+  for (i = 1; i < n; i++)
Jaroslav Škarvada 6ccb10
+    addtok (dfa->utf8_anychar_classes[i]);
Jaroslav Škarvada 6ccb10
+  while (--i > 1)
Jaroslav Škarvada 6ccb10
+    {
Jaroslav Škarvada 6ccb10
+      addtok (dfa->utf8_anychar_classes[0]);
Jaroslav Škarvada 6ccb10
+      addtok (CAT);
Jaroslav Škarvada 6ccb10
+      addtok (OR);
Jaroslav Škarvada 6ccb10
+    }
Jaroslav Škarvada 6ccb10
+}
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
 /* The grammar understood by the parser is as follows.
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
    regexp:
Jaroslav Škarvada 6ccb10
@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
Jaroslav Škarvada 6ccb10
 static void
Jaroslav Škarvada 6ccb10
 atom (void)
Jaroslav Škarvada 6ccb10
 {
Jaroslav Škarvada 6ccb10
+  if (0)
Jaroslav Škarvada 6ccb10
+    {
Jaroslav Škarvada 6ccb10
+      /* empty */
Jaroslav Škarvada 6ccb10
+    }
Jaroslav Škarvada 6ccb10
 #ifdef MBS_SUPPORT
Jaroslav Škarvada 6ccb10
-  if (tok == WCHAR)
Jaroslav Škarvada 6ccb10
+  else if (tok == WCHAR)
Jaroslav Škarvada 6ccb10
     {
Jaroslav Škarvada 6ccb10
       addtok_wc (case_fold ? towlower(wctok) : wctok);
Jaroslav Škarvada 6ccb10
 #ifndef GREP
Jaroslav Škarvada 6ccb10
@@ -1242,16 +1295,28 @@ atom (void)
Jaroslav Škarvada 6ccb10
 #endif
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
       tok = lex();
Jaroslav Škarvada 6ccb10
-      return;
Jaroslav Škarvada 6ccb10
+    }
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  else if (tok == ANYCHAR && using_utf8())
Jaroslav Škarvada 6ccb10
+    {
Jaroslav Škarvada 6ccb10
+      /* For UTF-8 expand the period to a series of CSETs that define a valid
Jaroslav Škarvada 6ccb10
+	 UTF-8 character.  This avoids using the slow multibyte path.  I'm
Jaroslav Škarvada 6ccb10
+	 pretty sure it would be both profitable and correct to do it for
Jaroslav Škarvada 6ccb10
+	 any encoding; however, the optimization must be done manually as
Jaroslav Škarvada 6ccb10
+	 it is done above in add_utf8_anychar.	So, let's start with
Jaroslav Škarvada 6ccb10
+	 UTF-8: it is the most used, and the structure of the encoding
Jaroslav Škarvada 6ccb10
+	 makes the correctness more obvious.  */
Jaroslav Škarvada 6ccb10
+      add_utf8_anychar();
Jaroslav Škarvada 6ccb10
+      tok = lex();
Jaroslav Škarvada 6ccb10
     }
Jaroslav Škarvada 6ccb10
 #endif /* MBS_SUPPORT  */
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
-  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
Jaroslav Škarvada 6ccb10
-      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
Jaroslav Škarvada 6ccb10
+  else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
Jaroslav Škarvada 6ccb10
+       	   || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
Jaroslav Škarvada 6ccb10
 #ifdef MBS_SUPPORT
Jaroslav Škarvada 6ccb10
-      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
Jaroslav Škarvada 6ccb10
+     	   || tok == ANYCHAR || tok == MBCSET
Jaroslav Škarvada 6ccb10
 #endif /* MBS_SUPPORT */
Jaroslav Škarvada 6ccb10
-      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
Jaroslav Škarvada 6ccb10
+	   || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
Jaroslav Škarvada 6ccb10
     {
Jaroslav Škarvada 6ccb10
       addtok(tok);
Jaroslav Škarvada 6ccb10
       tok = lex();
Jaroslav Škarvada 6ccb10
@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
Jaroslav Škarvada 6ccb10
   for (i = 0; i < d->tindex; ++i)
Jaroslav Škarvada 6ccb10
     {
Jaroslav Škarvada 6ccb10
       switch(d->tokens[i])
Jaroslav Škarvada 6ccb10
-	{
Jaroslav Škarvada 6ccb10
-	case ANYCHAR:
Jaroslav Škarvada 6ccb10
-	case MBCSET:
Jaroslav Škarvada 6ccb10
-	  /* Requires multi-byte algorithm.  */
Jaroslav Škarvada 6ccb10
-	  return;
Jaroslav Škarvada 6ccb10
-	default:
Jaroslav Škarvada 6ccb10
-	  break;
Jaroslav Škarvada 6ccb10
-	}
Jaroslav Škarvada 6ccb10
+        {
Jaroslav Škarvada 6ccb10
+        case ANYCHAR:
Jaroslav Škarvada 6ccb10
+          /* Lowered.  */
Jaroslav Škarvada 6ccb10
+          abort ();
Jaroslav Škarvada 6ccb10
+        case MBCSET:
Jaroslav Škarvada 6ccb10
+          /* Requires multi-byte algorithm.  */
Jaroslav Škarvada 6ccb10
+          return;
Jaroslav Škarvada 6ccb10
+        default:
Jaroslav Škarvada 6ccb10
+          break;
Jaroslav Škarvada 6ccb10
+        }
Jaroslav Škarvada 6ccb10
     }
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
   free_mbdata (d);
Jaroslav Škarvada 6ccb10
diff --git a/src/dfa.h b/src/dfa.h
Jaroslav Škarvada 6ccb10
index 1c85207..42c177a 100644
Jaroslav Škarvada 6ccb10
--- a/src/dfa.h
Jaroslav Škarvada 6ccb10
+++ b/src/dfa.h
Jaroslav Škarvada 6ccb10
@@ -283,6 +283,7 @@ struct dfa
Jaroslav Škarvada 6ccb10
 				   with dfaparse(). */
Jaroslav Škarvada 6ccb10
 #ifdef MBS_SUPPORT
Jaroslav Škarvada 6ccb10
   unsigned int mb_cur_max;	/* Cached value of MB_CUR_MAX.  */
Jaroslav Škarvada 6ccb10
+  int utf8_anychar_classes[5];	/* To lower ANYCHAR in UTF-8 locales.  */
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
   /* The following are used only if MB_CUR_MAX > 1.  */
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
-- 
Jaroslav Škarvada 6ccb10
1.6.6.1
Jaroslav Škarvada 6ccb10