On 04/23/2014 10:51 AM, Norihiro Tanaka wrote:
Paul, thanks for a lot of reviews and commits. However, it may be wrong.
I ran several tests for the worst case.
$ yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 >../k
$ env LANG=C time -p src/grep -i kjjjjjjjjjjjjjjjjjjj ../k
My version:
real 0.74
user 0.43
sys 0.30
Simplified version:
real 2.06
user 1.74
sys 0.31
It's slower than DFA.
That's odd, as I'm not observing this slowdown. I'm running on Fedora
20 x86-64 with GCC 4.9.0. The current master
(b0a087b4c7196782d8e76f00c45de1540daf5b91) runs a bit faster on your
benchmark than the branch with your patch
(dd0de0dce596add9098e1fb6e294c190c9b04f18): about 1.167 seconds versus
about 1.230 seconds real time, both in the C locale. I'm not using any
special compiler flags, just the -g -O2 that 'configure' deduces. I'll
attach the exact difference between these two versions, as the two
branches have diverged with time and perhaps something else is affecting
these numbers.
More generally, I don't even understand why your patch would speed
things up. To my mind, it should slow things down a bit, which is what
I'm observing. And (as I mentioned) it causes 'make check' to fail.
Possibly I'm simply not understanding the proposed change.
As far as Solaris and HP-UX goes, I'd like to wait until we've figured
out the GNU situation. Those platforms are lower priority, and the
code's correct for them, it's just a performance issue.
diff --git a/src/dfa.c b/src/dfa.c
index 42a9736..65fc03d 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -339,9 +339,8 @@ struct dfa
with dfaparse. */
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
- mbstate_t mbs; /* Multibyte conversion state. */
- /* The following are valid only if mb_cur_max > 1. */
+ /* The following are used only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
if tokens[i] < NOTCHAR
@@ -423,7 +422,7 @@ struct dfa
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
- position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
+ position_set *mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
on demand. */
int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or
MBCSET. */
@@ -463,34 +462,33 @@ dfambcache (struct dfa *d)
}
}
-/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
- and updating the conversion state in *D. On conversion error,
- convert just a single byte as-is. Return the number of bytes
- converted.
+/* Given the dfa D, store into *PWC the result of converting the
+ leading bytes of the multibyte buffer S of length N bytes, updating
+ the conversion state in *MBS. On conversion error, convert just a
+ single byte as-is. Return the number of bytes converted.
- This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
+ This differs from mbrtowc (PWC, S, N, MBS) as follows:
- * The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ * Extra arg D, containing an mbrtowc_cache for speed.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
* The return value is always in the range 1..N.
- * D->mbs is always valid afterwards.
+ * *MBS is always valid afterwards.
* *PWC is always set to something. */
static size_t
-mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d)
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+ mbstate_t *mbs)
{
unsigned char uc = s[0];
wint_t wc = d->mbrtowc_cache[uc];
if (wc == WEOF)
{
- size_t nbytes = mbrtowc (pwc, s, n, &d->mbs);
+ size_t nbytes = mbrtowc (pwc, s, n, mbs);
if (0 < nbytes && nbytes < (size_t) -2)
return nbytes;
- memset (&d->mbs, 0, sizeof d->mbs);
+ memset (mbs, 0, sizeof *mbs);
wc = uc;
}
@@ -840,6 +838,7 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
/* These variables are used only if (MB_CUR_MAX > 1). */
+static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
@@ -857,7 +856,7 @@ static wchar_t wctok; /* Wide character representation of the current
else \
{ \
wchar_t _wc; \
- size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
+ size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
cur_mb_len = nbytes; \
(wc) = _wc; \
(c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
@@ -1933,7 +1932,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
if (MB_CUR_MAX > 1)
{
cur_mb_len = 0;
- memset (&d->mbs, 0, sizeof d->mbs);
+ memset (&mbs, 0, sizeof mbs);
}
if (!syntax_bits_set)
@@ -3113,7 +3112,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
s2 = s1;
rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
}
- copy (&d->states[s1].elems, &d->mb_follows);
+ /* Copy the positions contained by 's1' to the set 'd->mb_follows'. */
+ copy (&(d->states[s1].elems), d->mb_follows);
/* Add all of the positions which can be reached from 's' by consuming
a single character. */
@@ -3123,7 +3123,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
j++)
insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
- &d->mb_follows);
+ d->mb_follows);
}
/* FIXME: this return value is always ignored. */
@@ -3151,7 +3151,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
We check whether each of them can match or not. */
{
/* Note: caller must free the return value of this function. */
- mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+ mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
wc, mbclen);
@@ -3179,7 +3179,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
}
/* This state has some operators which can match a multibyte character. */
- d->mb_follows.nelem = 0;
+ d->mb_follows->nelem = 0;
/* 'maxlen' may be longer than the length of a character, because it may
not be a character but a (multi character) collating element.
@@ -3187,12 +3187,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
'maxlen' bytes. */
transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
- s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+ s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
while (*pp - p1 < maxlen)
{
- mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+ mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
for (i = 0; i < nelem; i++)
@@ -3201,10 +3201,10 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
for (j = 0;
j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
- &d->mb_follows);
+ d->mb_follows);
}
- s1 = state_index (d, &d->mb_follows, wchar_context (wc));
+ s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
}
return s1;
@@ -3244,9 +3244,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
{
- memset (&d->mbs, 0, sizeof d->mbs);
- d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
- alloc_position_set (&d->mb_follows, d->nleaves);
+ static bool mb_alloc = false;
+ memset (&mbs, 0, sizeof (mbstate_t));
+ if (!mb_alloc)
+ {
+ d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
+ d->mb_follows = xmalloc (sizeof *d->mb_follows);
+ alloc_position_set (d->mb_follows, d->nleaves);
+ mb_alloc = true;
+ }
}
for (;;)
@@ -3271,8 +3277,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
character. */
wchar_t wc;
while (mbp < p)
- mbp += mbs_to_wchar (&wc, (char const *) mbp,
- end - (char const *) mbp, d);
+ mbp += mbs_to_wchar (d, &wc, (char const *) mbp,
+ end - (char const *) mbp, &mbs);
p = mbp;
if ((char *) p >= end)
@@ -3401,6 +3407,7 @@ free_mbdata (struct dfa *d)
size_t i;
free (d->multibyte_prop);
+ d->multibyte_prop = NULL;
for (i = 0; i < d->nmbcsets; ++i)
{
@@ -3420,7 +3427,14 @@ free_mbdata (struct dfa *d)
}
free (d->mbcsets);
- free (d->mb_follows.elems);
+ d->mbcsets = NULL;
+ d->nmbcsets = 0;
+
+ if (d->mb_follows)
+ {
+ free (d->mb_follows->elems);
+ free (d->mb_follows);
+ }
free (d->mb_match_lens);
}
diff --git a/src/kwset.c b/src/kwset.c
index f86ee03..ce127e2 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -55,6 +55,35 @@
#define U(c) (to_uchar (c))
+#if defined(__i386__) || defined(__x86_64__)
+#define SHIFT(p, d) \
+ switch (d) \
+ { \
+ default: \
+ p += d; \
+ break; \
+ case 8: \
+ ++p; \
+ case 7: \
+ ++p; \
+ case 6: \
+ ++p; \
+ case 5: \
+ ++p; \
+ case 4: \
+ ++p; \
+ case 3: \
+ ++p; \
+ case 2: \
+ ++p; \
+ case 1: \
+ ++p; \
+ }
+#else
+#define SHIFT(p, d) \
+ p += d;
+#endif
+
/* Balanced tree of edges and labels leaving a given trie node. */
struct tree
{
@@ -508,13 +537,14 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
}
}
- tp += d = kwset->shift[i - 2];
+ d = kwset->shift[i - 2];
+ SHIFT(tp, d);
if (tp > ep)
break;
if (tr (trans, tp[-1]) != gc1)
{
if (d1)
- tp += d1[U(tp[-1])];
+ SHIFT(tp, d1[U(tp[-1])]);
break;
}
skip = i - 1;
@@ -524,20 +554,6 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
return false;
}
-/* Return the address of the first byte in the buffer S that equals C.
- S contains N bytes. If TRANS is nonnull, use it to transliterate
- S's bytes before comparing them. */
-static char const *
-memchr_trans (char const *s, char c, size_t n, char const *trans)
-{
- if (! trans)
- return memchr (s, c, n);
- char const *slim = s + n;
- for (; s < slim; s++)
- if (trans[U(*s)] == c)
- return s;
- return NULL;
-}
/* Fast boyer-moore search. */
static size_t _GL_ATTRIBUTE_PURE
@@ -555,8 +571,18 @@ bmexec (kwset_t kwset, char const *text, size_t size)
return -1;
if (len == 1)
{
- tp = memchr_trans (text, kwset->target[0], size, trans);
- return tp ? tp - text : -1;
+ if (trans)
+ {
+ for (tp = text; tp < text + size; tp++)
+ if (trans[U(*tp)] == kwset->target[0])
+ return tp - text;
+ return -1;
+ }
+ else
+ {
+ tp = memchr (text, kwset->target[0], size);
+ return tp ? tp - text : -1;
+ }
}
d1 = kwset->delta;
@@ -568,33 +594,48 @@ bmexec (kwset_t kwset, char const *text, size_t size)
/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
if (size > 12 * len)
/* 11 is not a bug, the initial offset happens only once. */
- for (ep = text + size - 11 * len; tp <= ep; )
+ for (ep = text + size - 11 * len;;)
{
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- if (d != 0)
+ while (tp <= ep)
{
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- if (d != 0)
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ if (d == 0)
+ goto found;
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ if (d == 0)
+ goto found;
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ if (d == 0)
+ goto found;
+ /* memchar() of glibc is faster than seeking by delta1 on
+ some platforms. When there is no chance to match for a
+ while, use it on them. */
+#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__))
+ if (!trans)
{
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- if (d != 0)
+ tp = memchr (tp - 1, gc1, size + text - tp + 1);
+ if (tp)
{
- /* Typically memchr is faster than seeking by
- delta1 when there is no chance to match for
- a while. */
- tp--;
- tp = memchr_trans (tp, gc1, text + size - tp, trans);
- if (! tp)
- return -1;
- tp++;
+ ++tp;
+ goto found;
}
+ else
+ return -1;
+ }
+ else
+#endif
+ {
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
+ d = d1[U(tp[-1])]; SHIFT(tp, d);
}
}
+ break;
+ found:
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
return tp - text;
}
@@ -605,7 +646,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
d = d1[U(tp[-1])];
while (d <= ep - tp)
{
- d = d1[U((tp += d)[-1])];
+ SHIFT(tp, d);
+ d = d1[U(tp[-1])];
if (d != 0)
continue;
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
@@ -659,17 +701,18 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
{
if (qlim && end <= qlim)
{
- end += d - 1;
while ((d = delta[c = *end]) && end < qlim)
{
- end += d;
- end += delta[U(*end)];
- end += delta[U(*end)];
+ SHIFT(end, d);
+ d = delta[U(end[-1])]; SHIFT(end, d);
+ d = delta[U(end[-1])]; SHIFT(end, d);
}
- ++end;
}
else
- d = delta[c = (end += d)[-1]];
+ {
+ SHIFT(end, d);
+ d = delta[c = end[-1]];
+ }
if (d)
continue;
beg = end - 1;
@@ -718,7 +761,8 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
d = 1;
while (lim - end >= d)
{
- if ((d = delta[c = (end += d)[-1]]) != 0)
+ SHIFT(end, d);
+ if ((d = delta[c = end[-1]]) != 0)
continue;
beg = end - 1;
if (!(trie = next[c]))