Thanks for the test cases and patch. In my tests, switching to macros
does not help performance, and that SWITCH macro's implementation
actually slows things down a bit, which is what I'd expect. If there is
a reason to use macros I'd like to see a patch that simply changes
functions to macros without changing the algorithm, so that we can
measure this effect separately from the algorithm change.
I attempted to suss out the performance improvements in that patch
without using macros, and installed the attached changes. With these
changes grep performs about as well as it does with that patch, on the
benchmarks you mentioned that I tried (as before, I'm using the default
optimization with GCC 4.9.0 x86-64 on an AMD Phenom II X4 910e). Quite
possibly I've missed something, of course. The two "advance_*"
constants used in the heuristics are guesses: I haven't measured
rigorously to try to come up with good values.
From c3f2a84935ca08e276a34276186022ffcf7d79be Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 25 Apr 2014 18:31:47 -0700
Subject: [PATCH 1/2] kwset: improve performance when large Boyer-Moore key
doesn't match
* src/kwset.c (bmexec): As a heuristic, prefer memchr to seeking
by delta1 only when the latter doesn't advance much.
---
src/kwset.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/src/kwset.c b/src/kwset.c
index f86ee03..406fa0e 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -570,6 +570,7 @@ bmexec (kwset_t kwset, char const *text, size_t size)
/* 11 is not a bug, the initial offset happens only once. */
for (ep = text + size - 11 * len; tp <= ep; )
{
+ char const *tp0 = tp;
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
if (d != 0)
@@ -584,9 +585,14 @@ bmexec (kwset_t kwset, char const *text, size_t size)
d = d1[U(tp[-1])], tp += d;
if (d != 0)
{
- /* Typically memchr is faster than seeking by
- delta1 when there is no chance to match for
- a while. */
+ d = d1[U(tp[-1])], tp += d;
+ d = d1[U(tp[-1])], tp += d;
+
+ /* As a heuristic, prefer memchr to seeking by
+ delta1 when the latter doesn't advance much. */
+ int advance_heuristic = 4 * sizeof (long);
+ if (advance_heuristic <= tp - tp0)
+ goto big_advance;
tp--;
tp = memchr_trans (tp, gc1, text + size - tp, trans);
if (! tp)
@@ -597,6 +603,7 @@ bmexec (kwset_t kwset, char const *text, size_t size)
}
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
return tp - text;
+ big_advance:;
}
/* Now we have only a few characters left to search. We
--
1.9.0
From 873dd9d626853e195011eedd18b6f7a21c9faec4 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 26 Apr 2014 18:18:59 -0700
Subject: [PATCH 2/2] kwset: speed up by using memchr2
Idea suggested by Eric Blake in: http://bugs.gnu.org/17229#43
* bootstrap.conf (gnulib_modules): Add memchr2.
* src/kwset.c: Include stdint.h, for uintptr_t. Include memchr2.h.
(struct kwset): New members gc1, gc2, gc1help.
(tr): Move earlier, so it can be used earlier.
(kwsprep): Initialize struct kwset's new members.
(memchr_kwset): Rename from memchr_trans. Combine C and TRANS args into
new arg KWSET. All uses changed. Use memchr2 when appropriate.
(bmexec): Use new members instead of recomputing their values.
Increase advance_heuristic; it's just a guess, but memchr2 probably
makes it reasonable to increase it.
---
bootstrap.conf | 1 +
src/kwset.c | 87 ++++++++++++++++++++++++++++++++++++++++++++--------------
2 files changed, 67 insertions(+), 21 deletions(-)
diff --git a/bootstrap.conf b/bootstrap.conf
index 367427d..9f3457d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -57,6 +57,7 @@ manywarnings
mbrlen
mbrtowc
memchr
+memchr2
mempcpy
minmax
obstack
diff --git a/src/kwset.c b/src/kwset.c
index 406fa0e..8270f05 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -36,8 +36,10 @@
#include "kwset.h"
#include <stdbool.h>
+#include <stdint.h>
#include <sys/types.h>
#include "system.h"
+#include "memchr2.h"
#include "obstack.h"
#include "xalloc.h"
@@ -91,8 +93,33 @@ struct kwset
char *target; /* Target string if there's only one. */
int *shift; /* Used in Boyer-Moore search for one string. */
char const *trans; /* Character translation table. */
+
+ /* If there's only one string, this is the string's last byte,
+ translated via TRANS if TRANS is nonnull. */
+ char gc1;
+
+ /* Likewise for the string's penultimate byte, if it has two or more
+ bytes. */
+ char gc2;
+
+ /* If there's only one string, this helps to match the string's last byte.
+ If GC1HELP is negative, only GC1 matches the string's last byte;
+ otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
+ If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
+ such matches, and GC1HELP is the other match after conversion to
+ unsigned char. If GC1HELP is at least NCHAR, there are three or
+ more such matches; e.g., Greek has three sigma characters that
+ all match when case-folding. */
+ int gc1help;
};
+/* Use TRANS to transliterate C. A null TRANS does no transliteration. */
+static char
+tr (char const *trans, char c)
+{
+ return trans ? trans[U(c)] : c;
+}
+
/* Allocate and initialize a keyword set object, returning an opaque
pointer to it. */
kwset_t
@@ -456,6 +483,27 @@ kwsprep (kwset_t kwset)
curr = curr->next;
}
}
+
+ char gc1 = tr (trans, kwset->target[kwset->mind - 1]);
+
+ /* Set GC1HELP according to whether exactly one, exactly two, or
+ three-or-more characters match GC1. */
+ int gc1help = -1;
+ if (trans)
+ {
+ char const *equiv1 = memchr (trans, gc1, NCHAR);
+ char const *equiv2 = memchr (equiv1 + 1, gc1,
+ trans + NCHAR - (equiv1 + 1));
+ if (equiv2)
+ gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1))
+ ? NCHAR
+ : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans));
+ }
+
+ kwset->gc1 = gc1;
+ kwset->gc1help = gc1help;
+ if (kwset->mind > 1)
+ kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
}
/* Fix things up for any translation table. */
@@ -464,13 +512,6 @@ kwsprep (kwset_t kwset)
kwset->delta[i] = delta[U(trans[i])];
}
-/* Use TRANS to transliterate C. A null TRANS does no transliteration. */
-static char
-tr (char const *trans, char c)
-{
- return trans ? trans[U(c)] : c;
-}
-
/* Delta2 portion of a Boyer-Moore search. *TP is the string text
pointer; it is updated in place. EP is the end of the string text,
and SP the end of the pattern. LEN is the pattern length; it must
@@ -524,19 +565,23 @@ bm_delta2_search (char const **tpp, char const *ep, char
const *sp, int len,
return false;
}
-/* Return the address of the first byte in the buffer S that equals C.
- S contains N bytes. If TRANS is nonnull, use it to transliterate
- S's bytes before comparing them. */
+/* Return the address of the first byte in the buffer S (of size N)
+ that matches the last byte specified by KWSET, a singleton. */
static char const *
-memchr_trans (char const *s, char c, size_t n, char const *trans)
+memchr_kwset (char const *s, size_t n, kwset_t kwset)
{
- if (! trans)
- return memchr (s, c, n);
- char const *slim = s + n;
+ if (kwset->gc1help < 0)
+ return memchr (s, kwset->gc1, n);
+ int small_heuristic = 2;
+ int small = (- (uintptr_t) s % sizeof (long)
+ + small_heuristic * sizeof (long));
+ size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n;
+ char const *slim = s + ntrans;
for (; s < slim; s++)
- if (trans[U(*s)] == c)
+ if (kwset->trans[U(*s)] == kwset->gc1)
return s;
- return NULL;
+ n -= ntrans;
+ return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n);
}
/* Fast boyer-moore search. */
@@ -555,15 +600,15 @@ bmexec (kwset_t kwset, char const *text, size_t size)
return -1;
if (len == 1)
{
- tp = memchr_trans (text, kwset->target[0], size, trans);
+ tp = memchr_kwset (text, size, kwset);
return tp ? tp - text : -1;
}
d1 = kwset->delta;
sp = kwset->target + len;
tp = text + len;
- char gc1 = tr (trans, sp[-1]);
- char gc2 = tr (trans, sp[-2]);
+ char gc1 = kwset->gc1;
+ char gc2 = kwset->gc2;
/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
if (size > 12 * len)
@@ -590,11 +635,11 @@ bmexec (kwset_t kwset, char const *text, size_t size)
/* As a heuristic, prefer memchr to seeking by
delta1 when the latter doesn't advance much. */
- int advance_heuristic = 4 * sizeof (long);
+ int advance_heuristic = 16 * sizeof (long);
if (advance_heuristic <= tp - tp0)
goto big_advance;
tp--;
- tp = memchr_trans (tp, gc1, text + size - tp, trans);
+ tp = memchr_kwset (tp, text + size - tp, kwset);
if (! tp)
return -1;
tp++;
--
1.9.0