Norihiro Tanaka wrote:
Now I wrote the additional patch.
Sorry, I don't understand that patch. I applied your earlier patch (attachment 1) and then merged that patch (attachment 2), but the resulting 'grep' failed 'make check'; I got a "FAIL: fedora" and the test after "file" never finished. This was on Fedora 20 x86-64 with my own installation of GCC 4.9.0.
Instead of trying to understand that patch, I wrote a different patch (attachment 3) that simplifies the code and removes the part that is x86-specific. This improves the performance quite a bit for the test case given in the ChangeLog entry. I also tested performance on Sparc Solaris 10, and memchr was a big win there. Perhaps there are platforms where memchr is a big loss, but we can cross that bridge if we come to it.
Perhaps I merged that patch incorrectly, so I'll leave this bug open for now. To be honest I hope that patch doesn't improve performance significantly if we get it to work correctly, as it looks tricky.
From 0fb2a531011eeca893a2d766b016325399bff8b1 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Thu, 3 Apr 2014 22:57:09 +0900 Subject: [PATCH 1/2] grep: speed-up by using memchr() in Boyer-Moore searching memchr() of glibc is faster than seeking by delta1 on some platforms. When there is no chance to match for a while, use it on them. * src/kwset.c (bmexec): Use memchr() in Boyer-Moore searching. --- src/kwset.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 69b0a55..78fb0b2 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -582,8 +582,27 @@ bmexec (kwset_t kwset, char const *text, size_t size) d = d1[U(tp[-1])], tp += d; if (d == 0) goto found; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; + /* memchar() of glibc is faster than seeking by delta1 on + some platforms. When there is no chance to match for a + while, use it on them. */ +#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__)) + if (!trans) + { + tp = memchr (tp - 1, gc1, size + text - tp + 1); + if (tp) + { + ++tp; + goto found; + } + else + return -1; + } + else +#endif + { + d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])], tp += d; + } } break; found: -- 1.9.0
From dd0de0dce596add9098e1fb6e294c190c9b04f18 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Thu, 10 Apr 2014 20:52:54 +0900 Subject: [PATCH] grep: speed-up by replacing `incr' to `add' in x86 and x86-64 The increment instrument is more faster than the add instuction in x86 and x86-64 architecture. So prefer the add instrument to the increment instrument for them. * src/kwset.c (SHIFT): New macro. It uses the increment instrument instead of the add instrument for x86 and x86-64. (bmexec, cwexec): Use it. --- src/kwset.c | 73 ++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 78fb0b2..ce127e2 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -55,6 +55,35 @@ #define U(c) (to_uchar (c)) +#if defined(__i386__) || defined(__x86_64__) +#define SHIFT(p, d) \ + switch (d) \ + { \ + default: \ + p += d; \ + break; \ + case 8: \ + ++p; \ + case 7: \ + ++p; \ + case 6: \ + ++p; \ + case 5: \ + ++p; \ + case 4: \ + ++p; \ + case 3: \ + ++p; \ + case 2: \ + ++p; \ + case 1: \ + ++p; \ + } +#else +#define SHIFT(p, d) \ + p += d; +#endif + /* Balanced tree of edges and labels leaving a given trie node. */ struct tree { @@ -508,13 +537,14 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len, } } - tp += d = kwset->shift[i - 2]; + d = kwset->shift[i - 2]; + SHIFT(tp, d); if (tp > ep) break; if (tr (trans, tp[-1]) != gc1) { if (d1) - tp += d1[U(tp[-1])]; + SHIFT(tp, d1[U(tp[-1])]); break; } skip = i - 1; @@ -568,18 +598,18 @@ bmexec (kwset_t kwset, char const *text, size_t size) { while (tp <= ep) { - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); if (d == 0) goto found; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); if (d == 0) goto found; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); if (d == 0) goto found; /* memchar() of glibc is faster than seeking by delta1 on @@ -600,8 +630,8 @@ bmexec (kwset_t kwset, char const *text, size_t size) else #endif { - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])]; SHIFT(tp, d); + d = d1[U(tp[-1])]; SHIFT(tp, d); } } break; @@ -616,7 +646,8 @@ bmexec (kwset_t kwset, char const *text, size_t size) d = d1[U(tp[-1])]; while (d <= ep - tp) { - d = d1[U((tp += d)[-1])]; + SHIFT(tp, d); + d = d1[U(tp[-1])]; if (d != 0) continue; if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset)) @@ -670,17 +701,18 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) { if (qlim && end <= qlim) { - end += d - 1; while ((d = delta[c = *end]) && end < qlim) { - end += d; - end += delta[U(*end)]; - end += delta[U(*end)]; + SHIFT(end, d); + d = delta[U(end[-1])]; SHIFT(end, d); + d = delta[U(end[-1])]; SHIFT(end, d); } - ++end; } else - d = delta[c = (end += d)[-1]]; + { + SHIFT(end, d); + d = delta[c = end[-1]]; + } if (d) continue; beg = end - 1; @@ -729,7 +761,8 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) d = 1; while (lim - end >= d) { - if ((d = delta[c = (end += d)[-1]]) != 0) + SHIFT(end, d); + if ((d = delta[c = end[-1]]) != 0) continue; beg = end - 1; if (!(trie = next[c])) -- 1.9.0
From 205a0e7aae41fb819c8dbd78e145f34161ec8a4c Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Tue, 22 Apr 2014 23:34:22 -0700 Subject: [PATCH 2/2] kwset: simplify and speed up Boyer-Moore unibyte -i in some cases This improves the performance of, for example, yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 | grep -i jk in a unibyte locale. * src/kwset.c (memchr_trans): New function. (bmexec): Use it. Simplify the code and remove some of the confusing gotos and breaks and labels. Do not treat glibc memchr as a special case; if non-glibc memchr is slow, that is lower priority and I suppose we can try to work around the problem in gnulib. --- src/kwset.c | 77 ++++++++++++++++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 78fb0b2..f86ee03 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -524,6 +524,20 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len, return false; } +/* Return the address of the first byte in the buffer S that equals C. + S contains N bytes. If TRANS is nonnull, use it to transliterate + S's bytes before comparing them. */ +static char const * +memchr_trans (char const *s, char c, size_t n, char const *trans) +{ + if (! trans) + return memchr (s, c, n); + char const *slim = s + n; + for (; s < slim; s++) + if (trans[U(*s)] == c) + return s; + return NULL; +} /* Fast boyer-moore search. */ static size_t _GL_ATTRIBUTE_PURE @@ -541,18 +555,8 @@ bmexec (kwset_t kwset, char const *text, size_t size) return -1; if (len == 1) { - if (trans) - { - for (tp = text; tp < text + size; tp++) - if (trans[U(*tp)] == kwset->target[0]) - return tp - text; - return -1; - } - else - { - tp = memchr (text, kwset->target[0], size); - return tp ? tp - text : -1; - } + tp = memchr_trans (text, kwset->target[0], size, trans); + return tp ? tp - text : -1; } d1 = kwset->delta; @@ -564,48 +568,33 @@ bmexec (kwset_t kwset, char const *text, size_t size) /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ if (size > 12 * len) /* 11 is not a bug, the initial offset happens only once. */ - for (ep = text + size - 11 * len;;) + for (ep = text + size - 11 * len; tp <= ep; ) { - while (tp <= ep) + d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])], tp += d; + if (d != 0) { d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; - if (d == 0) - goto found; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; - if (d == 0) - goto found; - d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; - d = d1[U(tp[-1])], tp += d; - if (d == 0) - goto found; - /* memchar() of glibc is faster than seeking by delta1 on - some platforms. When there is no chance to match for a - while, use it on them. */ -#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__)) - if (!trans) - { - tp = memchr (tp - 1, gc1, size + text - tp + 1); - if (tp) - { - ++tp; - goto found; - } - else - return -1; - } - else -#endif + if (d != 0) { d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; + d = d1[U(tp[-1])], tp += d; + if (d != 0) + { + /* Typically memchr is faster than seeking by + delta1 when there is no chance to match for + a while. */ + tp--; + tp = memchr_trans (tp, gc1, text + size - tp, trans); + if (! tp) + return -1; + tp++; + } } } - break; - found: if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset)) return tp - text; } -- 1.9.0
