Boyer-Moore algorithm is faster than Beate Commentz-Walter, in especially
worst case, because Galil rule is applicable only for Boyer-Moore
algorithm.

This patch enables to use Boyer-Moore algorithm for case-insensitive
matching.

Norihiro
From 069d55705973ddecbd5f04a41097893224d55095 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 20 Mar 2014 08:07:25 +0900
Subject: [PATCH 1/2] grep: may also use Boyer-Moore algorithm for
 case-insensitive matching

* src/kwset.c (bmexec): Use character translation table.
(kwsexec): Call bmexec for case-insensitive matching.
(kwsprep): Change the `if' condition.
---
 src/kwset.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 173 insertions(+), 42 deletions(-)

diff --git a/src/kwset.c b/src/kwset.c
index d5db12f..8d8aa07 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -434,7 +434,7 @@ kwsprep (kwset_t kwset)
 
   /* Check if we can use the simple boyer-moore algorithm, instead
      of the hairy commentz-walter algorithm. */
-  if (!trans && kwset->words == 1)
+  if (kwset->words == 1)
     {
       /* Looking for just one string.  Extract it from the trie. */
       kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
@@ -473,6 +473,7 @@ bmexec (kwset_t kwset, char const *text, size_t size)
   int d, i, skip;
   char gc1, gc2;
   int len = kwset->mind;
+  char const *trans = kwset->trans;
 
   if (len == 0)
     return 0;
@@ -480,15 +481,33 @@ bmexec (kwset_t kwset, char const *text, size_t size)
     return -1;
   if (len == 1)
     {
-      tp = memchr (text, kwset->target[0], size);
-      return tp ? tp - text : -1;
+      if (trans)
+        {
+          for (tp = text; tp < text + size; tp++)
+            if (trans[U(*tp)] == kwset->target[0])
+              return tp - text;
+          return -1;
+        }
+      else
+        {
+          tp = memchr (text, kwset->target[0], size);
+          return tp ? tp - text : -1;
+        }
     }
 
   d1 = kwset->delta;
   sp = kwset->target + len;
-  gc1 = sp[-1];
-  gc2 = sp[-2];
   tp = text + len;
+  if (trans)
+    {
+      gc1 = trans[U(sp[-1])];
+      gc2 = trans[U(sp[-2])];
+    }
+  else
+    {
+      gc1 = sp[-1];
+      gc2 = sp[-2];
+    }
   skip = 0;
 
   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
@@ -518,30 +537,86 @@ bmexec (kwset_t kwset, char const *text, size_t size)
         break;
       found:
         d = len;
-        while (true)
+        if (trans)
           {
-            if (tp[-2] == gc2)
+            while (true)
               {
-                for (i = 3; i <= d && tp[-i] == sp[-i]; ++i)
-                  continue;
-                if (i > d)
+                if (trans[U(tp[-2])] == gc2)
                   {
-                    for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i)
-                      continue;
-                    if (i > len)
-                      return tp - len - text;
+                    for (i = 3; i <= d; ++i)
+                      if (trans[U(tp[-i])] != trans[U(sp[-i])])
+                        break;
+                    if (i > d)
+                      {
+                        for (i = d + skip + 1; i <= len; ++i)
+                          if (trans[U(tp[-i])] != trans[U(sp[-i])])
+                            break;
+                        if (i > len)
+                          return tp - len - text;
+                      }
+                    d = kwset->shift[i - 2]; tp += d;
+                    if (tp > ep)
+                      break;
+                    if (trans[U(tp[-1])] != gc1)
+                      {
+                        d = d1[U(tp[-1])], tp += d;
+                        break;
+                      }
+                    skip = i - 1;
+                  }
+                else
+                  {
+                    d = kwset->shift[0]; tp += d;
+                    if (tp > ep)
+                      break;
+                    if (trans[U(tp[-1])] != gc1)
+                      {
+                        d = d1[U(tp[-1])], tp += d;
+                        break;
+                      }
+                    skip = 1;
                   }
-                d = kwset->shift[i - 2]; tp += d;
-                if (tp > ep || tp[-1] != gc1)
-                  break;
-                skip = i - 1;
               }
-            else
+          }
+        else
+          {
+            while (true)
               {
-                d = kwset->shift[0]; tp += d;
-                if (tp > ep || tp[-1] != gc1)
-                  break;
-                skip = 1;
+                if (tp[-2] == gc2)
+                  {
+                    for (i = 3; i <= d; ++i)
+                      if (tp[-i] != sp[-i])
+                        break;
+                    if (i > d)
+                      {
+                        for (i = d + skip + 1; i <= len; ++i)
+                          if (tp[-i] != sp[-i])
+                            break;
+                        if (i > len)
+                          return tp - len - text;
+                      }
+                    d = kwset->shift[i - 2]; tp += d;
+                    if (tp > ep)
+                      break;
+                    if (tp[-1] != gc1)
+                      {
+                        d = d1[U(tp[-1])], tp += d;
+                        break;
+                      }
+                    skip = i - 1;
+                  }
+                else
+                  {
+                    d = kwset->shift[0]; tp += d;
+                    if (tp > ep)
+                      break;
+                    if (tp[-1] != gc1)
+                      {
+                        d = d1[U(tp[-1])], tp += d;
+                        break;
+                      }
+                    skip = 1;
+                  }
               }
           }
       }
@@ -556,30 +631,86 @@ bmexec (kwset_t kwset, char const *text, size_t size)
       if (d != 0)
         continue;
       d = len;
-      while (true)
+      if (trans)
         {
-          if (tp[-2] == gc2)
+          while (true)
             {
-              for (i = 3; i <= d && tp[-i] == sp[-i]; ++i)
-                continue;
-              if (i > d)
+              if (trans[U(tp[-2])] == gc2)
+                {
+                  for (i = 3; i <= d; ++i)
+                if (trans[U(tp[-i])] != trans[U(sp[-i])])
+                  break;
+                  if (i > d)
+                    {
+                      for (i = d + skip + 1; i <= len; ++i)
+                        if (trans[U(tp[-i])] != trans[U(sp[-i])])
+                          break;
+                      if (i > len)
+                        return tp - len - text;
+                    }
+                  d = kwset->shift[i - 2]; tp += d;
+                  if (tp > ep)
+                    break;
+                  if (trans[U(tp[-1])] != gc1)
+                    {
+                      d = d1[U(tp[-1])];
+                      break;
+                    }
+                  skip = i - 1;
+                }
+              else
                 {
-                  for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i)
-                    continue;
-                  if (i > len)
-                    return tp - len - text;
+                  d = kwset->shift[0]; tp += d;
+                  if (tp > ep)
+                    break;
+                  if (trans[U(tp[-1])] != gc1)
+                    {
+                      d = d1[U(tp[-1])];
+                      break;
+                    }
+                  skip = 1;
                 }
-              d = kwset->shift[i - 2]; tp += d;
-              if (tp > ep || tp[-1] != gc1)
-                break;
-              skip = i - 1;
             }
-          else
+        }
+      else
+        {
+          while (true)
             {
-              d = kwset->shift[0]; tp += d;
-              if (tp > ep || tp[-1] != gc1)
-                break;
-              skip = 1;
+              if (tp[-2] == gc2)
+                {
+                  for (i = 3; i <= d; ++i)
+                    if (tp[-i] != sp[-i])
+                      break;
+                  if (i > d)
+                    {
+                      for (i = d + skip + 1; i <= len; ++i)
+                        if (tp[-i] != sp[-i])
+                          break;
+                      if (i > len)
+                        return tp - len - text;
+                    }
+                  d = kwset->shift[i - 2]; tp += d;
+                  if (tp > ep)
+                    break;
+                  if (tp[-1] != gc1)
+                    {
+                      d = d1[U(tp[-1])];
+                      break;
+                    }
+                  skip = i - 1;
+                }
+              else
+                {
+                  d = kwset->shift[0]; tp += d;
+                  if (tp > ep)
+                    break;
+                  if (tp[-1] != gc1)
+                    {
+                      d = d1[U(tp[-1])];
+                      break;
+                    }
+                  skip = 1;
+                }
             }
         }
     }
@@ -752,7 +883,7 @@ size_t
 kwsexec (kwset_t kwset, char const *text, size_t size,
          struct kwsmatch *kwsmatch)
 {
-  if (kwset->words == 1 && kwset->trans == NULL)
+  if (kwset->words == 1)
     {
       size_t ret = bmexec (kwset, text, size);
       if (ret != (size_t) -1)
-- 
1.9.1

Reply via email to