Norihiro Tanaka wrote:
Now I wrote the additional patch.

Sorry, I don't understand that patch. I applied your earlier patch (attachment 1) and then merged that patch (attachment 2), but the resulting 'grep' failed 'make check'; I got a "FAIL: fedora" and the test after "file" never finished. This was on Fedora 20 x86-64 with my own installation of GCC 4.9.0.

Instead of trying to understand that patch, I wrote a different patch (attachment 3) that simplifies the code and removes the part that is x86-specific. This improves the performance quite a bit for the test case given in the ChangeLog entry. I also tested performance on Sparc Solaris 10, and memchr was a big win there. Perhaps there are platforms where memchr is a big loss, but we can cross that bridge if we come to it.

Perhaps I merged that patch incorrectly, so I'll leave this bug open for now. To be honest I hope that patch doesn't improve performance significantly if we get it to work correctly, as it looks tricky.

From 0fb2a531011eeca893a2d766b016325399bff8b1 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 3 Apr 2014 22:57:09 +0900
Subject: [PATCH 1/2] grep: speed-up by using memchr() in Boyer-Moore searching

memchr() of glibc is faster than seeking by delta1 on some platforms.
When there is no chance to match for a while, use it on them.
* src/kwset.c (bmexec): Use memchr() in Boyer-Moore searching.
---
 src/kwset.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/kwset.c b/src/kwset.c
index 69b0a55..78fb0b2 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -582,8 +582,27 @@ bmexec (kwset_t kwset, char const *text, size_t size)
             d = d1[U(tp[-1])], tp += d;
             if (d == 0)
               goto found;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
+            /* memchar() of glibc is faster than seeking by delta1 on
+               some platforms.  When there is no chance to match for a
+               while, use it on them.  */
+#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__))
+            if (!trans)
+              {
+                tp = memchr (tp - 1, gc1, size + text - tp + 1);
+                if (tp)
+                  {
+                    ++tp;
+                    goto found;
+                  }
+                else
+                  return -1;
+              }
+            else
+#endif
+              {
+                d = d1[U(tp[-1])], tp += d;
+                d = d1[U(tp[-1])], tp += d;
+              }
           }
         break;
       found:
-- 
1.9.0

From dd0de0dce596add9098e1fb6e294c190c9b04f18 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Thu, 10 Apr 2014 20:52:54 +0900
Subject: [PATCH] grep: speed-up by replacing `incr' to `add' in x86 and x86-64

The increment instrument is more faster than the add instuction in x86
and x86-64 architecture.  So prefer the add instrument to the increment
instrument for them.
* src/kwset.c (SHIFT): New macro.  It uses the increment instrument
instead of the add instrument for x86 and x86-64.
(bmexec, cwexec): Use it.
---
 src/kwset.c | 73 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/src/kwset.c b/src/kwset.c
index 78fb0b2..ce127e2 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -55,6 +55,35 @@
 
 #define U(c) (to_uchar (c))
 
+#if defined(__i386__) || defined(__x86_64__)
+#define SHIFT(p, d)    \
+  switch (d)           \
+    {                  \
+    default:           \
+      p += d;          \
+      break;           \
+    case 8:            \
+      ++p;             \
+    case 7:            \
+      ++p;             \
+    case 6:            \
+      ++p;             \
+    case 5:            \
+      ++p;             \
+    case 4:            \
+      ++p;             \
+    case 3:            \
+      ++p;             \
+    case 2:            \
+      ++p;             \
+    case 1:            \
+      ++p;             \
+    }
+#else
+#define SHIFT(p, d)    \
+  p += d;
+#endif
+
 /* Balanced tree of edges and labels leaving a given trie node. */
 struct tree
 {
@@ -508,13 +537,14 @@ bm_delta2_search (char const **tpp, char const *ep, char 
const *sp, int len,
             }
         }
 
-      tp += d = kwset->shift[i - 2];
+      d = kwset->shift[i - 2];
+      SHIFT(tp, d);
       if (tp > ep)
         break;
       if (tr (trans, tp[-1]) != gc1)
         {
           if (d1)
-            tp += d1[U(tp[-1])];
+            SHIFT(tp, d1[U(tp[-1])]);
           break;
         }
       skip = i - 1;
@@ -568,18 +598,18 @@ bmexec (kwset_t kwset, char const *text, size_t size)
       {
         while (tp <= ep)
           {
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
             if (d == 0)
               goto found;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
             if (d == 0)
               goto found;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
+            d = d1[U(tp[-1])]; SHIFT(tp, d);
             if (d == 0)
               goto found;
             /* memchar() of glibc is faster than seeking by delta1 on
@@ -600,8 +630,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
             else
 #endif
               {
-                d = d1[U(tp[-1])], tp += d;
-                d = d1[U(tp[-1])], tp += d;
+                d = d1[U(tp[-1])]; SHIFT(tp, d);
+                d = d1[U(tp[-1])]; SHIFT(tp, d);
               }
           }
         break;
@@ -616,7 +646,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
   d = d1[U(tp[-1])];
   while (d <= ep - tp)
     {
-      d = d1[U((tp += d)[-1])];
+      SHIFT(tp, d);
+      d = d1[U(tp[-1])];
       if (d != 0)
         continue;
       if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
@@ -670,17 +701,18 @@ cwexec (kwset_t kwset, char const *text, size_t len, 
struct kwsmatch *kwsmatch)
     {
       if (qlim && end <= qlim)
         {
-          end += d - 1;
           while ((d = delta[c = *end]) && end < qlim)
             {
-              end += d;
-              end += delta[U(*end)];
-              end += delta[U(*end)];
+              SHIFT(end, d);
+              d = delta[U(end[-1])]; SHIFT(end, d);
+              d = delta[U(end[-1])]; SHIFT(end, d);
             }
-          ++end;
         }
       else
-        d = delta[c = (end += d)[-1]];
+        {
+          SHIFT(end, d);
+          d = delta[c = end[-1]];
+        }
       if (d)
         continue;
       beg = end - 1;
@@ -729,7 +761,8 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct 
kwsmatch *kwsmatch)
   d = 1;
   while (lim - end >= d)
     {
-      if ((d = delta[c = (end += d)[-1]]) != 0)
+      SHIFT(end, d);
+      if ((d = delta[c = end[-1]]) != 0)
         continue;
       beg = end - 1;
       if (!(trie = next[c]))
-- 
1.9.0

From 205a0e7aae41fb819c8dbd78e145f34161ec8a4c Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 22 Apr 2014 23:34:22 -0700
Subject: [PATCH 2/2] kwset: simplify and speed up Boyer-Moore unibyte -i in
 some cases

This improves the performance of, for example,
yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 | grep -i jk
in a unibyte locale.
* src/kwset.c (memchr_trans): New function.
(bmexec): Use it.  Simplify the code and remove some of the
confusing gotos and breaks and labels.  Do not treat glibc memchr
as a special case; if non-glibc memchr is slow, that is lower
priority and I suppose we can try to work around the problem in
gnulib.
---
 src/kwset.c | 77 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 33 insertions(+), 44 deletions(-)

diff --git a/src/kwset.c b/src/kwset.c
index 78fb0b2..f86ee03 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -524,6 +524,20 @@ bm_delta2_search (char const **tpp, char const *ep, char 
const *sp, int len,
   return false;
 }
 
+/* Return the address of the first byte in the buffer S that equals C.
+   S contains N bytes.  If TRANS is nonnull, use it to transliterate
+   S's bytes before comparing them.  */
+static char const *
+memchr_trans (char const *s, char c, size_t n, char const *trans)
+{
+  if (! trans)
+    return memchr (s, c, n);
+  char const *slim = s + n;
+  for (; s < slim; s++)
+    if (trans[U(*s)] == c)
+      return s;
+  return NULL;
+}
 
 /* Fast boyer-moore search. */
 static size_t _GL_ATTRIBUTE_PURE
@@ -541,18 +555,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
     return -1;
   if (len == 1)
     {
-      if (trans)
-        {
-          for (tp = text; tp < text + size; tp++)
-            if (trans[U(*tp)] == kwset->target[0])
-              return tp - text;
-          return -1;
-        }
-      else
-        {
-          tp = memchr (text, kwset->target[0], size);
-          return tp ? tp - text : -1;
-        }
+      tp = memchr_trans (text, kwset->target[0], size, trans);
+      return tp ? tp - text : -1;
     }
 
   d1 = kwset->delta;
@@ -564,48 +568,33 @@ bmexec (kwset_t kwset, char const *text, size_t size)
   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
   if (size > 12 * len)
     /* 11 is not a bug, the initial offset happens only once. */
-    for (ep = text + size - 11 * len;;)
+    for (ep = text + size - 11 * len; tp <= ep; )
       {
-        while (tp <= ep)
+        d = d1[U(tp[-1])], tp += d;
+        d = d1[U(tp[-1])], tp += d;
+        if (d != 0)
           {
             d = d1[U(tp[-1])], tp += d;
             d = d1[U(tp[-1])], tp += d;
-            if (d == 0)
-              goto found;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            if (d == 0)
-              goto found;
-            d = d1[U(tp[-1])], tp += d;
             d = d1[U(tp[-1])], tp += d;
-            d = d1[U(tp[-1])], tp += d;
-            if (d == 0)
-              goto found;
-            /* memchar() of glibc is faster than seeking by delta1 on
-               some platforms.  When there is no chance to match for a
-               while, use it on them.  */
-#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__))
-            if (!trans)
-              {
-                tp = memchr (tp - 1, gc1, size + text - tp + 1);
-                if (tp)
-                  {
-                    ++tp;
-                    goto found;
-                  }
-                else
-                  return -1;
-              }
-            else
-#endif
+            if (d != 0)
               {
                 d = d1[U(tp[-1])], tp += d;
                 d = d1[U(tp[-1])], tp += d;
+                d = d1[U(tp[-1])], tp += d;
+                if (d != 0)
+                  {
+                    /* Typically memchr is faster than seeking by
+                       delta1 when there is no chance to match for
+                       a while.  */
+                    tp--;
+                    tp = memchr_trans (tp, gc1, text + size - tp, trans);
+                    if (! tp)
+                      return -1;
+                    tp++;
+                  }
               }
           }
-        break;
-      found:
         if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
           return tp - text;
       }
-- 
1.9.0

Reply via email to