The previous version of 2nd patch wasn't so better still, because it was
slow for below in order that it never fills beg == prev_beg.
$ yes 'abcdabc
jjjjjjj' | head -50000000 >k
$ env LC_ALL=C time -p src/grep abcd.bd k
Accordingly, I have fixed it. It works below if dfafast is true.
- If `beg - prev_beg' is small, overhead by calls of KWset and DFA is
high. So use constant 64 as minimum shift of the `end' pointer.
- If ratio of `match - beg' to `beg - prev_beg' is large, KWset isn't
effective. So consider it.
As far as I have tested, It improves the performance of the worst cases
with little slowdown to others.
Thanks,
Norihiro
From 6a73a36b4ffa55a9f62018b863b066790b046c03 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 28 Apr 2014 21:13:57 +0900
Subject: [PATCH 1/2] grep: simplification for superset
* src/dfa.h (dfahint): Remove it.
(dfasuperset): New function.
* src/dfa.c (dfahint): Remove it.
(dfassbuild): Rename from dfasuperset.
(dfasuperset): New function. it returns the superset oneself of D.
* src/dfasearch.c: Substitute dfahint to dfasuperset and simplify.
---
src/dfa.c | 26 ++++----------
src/dfa.h | 13 ++-----
src/dfasearch.c | 106 +++++++++++++++++++++++++++++++-------------------------
3 files changed, 69 insertions(+), 76 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 362de2c..c9070fe 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3391,24 +3391,12 @@ dfaexec (struct dfa *d, char const *begin, char *end,
return (char *) p;
}
-/* Search through a buffer looking for a potential match for D.
- Return the offset of the byte after the first potential match.
- If there is no match, return (size_t) -1. If D lacks a superset
- so it's not known whether there is a match, return (size_t) -2.
- BEGIN points to the beginning of the buffer, and END points to the
- first byte after its end. Store a sentinel byte (usually newline)
- in *END, so the actual buffer must be one byte longer. If COUNT is
- non-NULL, increment *COUNT once for each newline processed. */
-size_t
-dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
+/* Return superset for D, which searchs through a buffer looking for a
+ potential match. */
+struct dfa *
+dfasuperset (struct dfa *d)
{
- if (! d->superset)
- return -2;
- else
- {
- char const *match = dfaexec (d->superset, begin, end, 1, count, NULL);
- return match ? match - begin : -1;
- }
+ return d->superset;
}
bool
@@ -3495,7 +3483,7 @@ dfaoptimize (struct dfa *d)
}
static void
-dfasuperset (struct dfa *d)
+dfassbuild (struct dfa *d)
{
size_t i, j;
charclass ccl;
@@ -3579,7 +3567,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int
searchflag)
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
- dfasuperset (d);
+ dfassbuild (d);
dfaanalyze (d, searchflag);
if (d->superset)
dfaanalyze (d->superset, searchflag);
diff --git a/src/dfa.h b/src/dfa.h
index fbcb191..2de7ba8 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -71,16 +71,9 @@ extern void dfacomp (char const *, size_t, struct dfa *,
int);
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
-/* Search through a buffer looking for a potential match for D.
- Return the offset of the byte after the first potential match.
- If there is no match, return (size_t) -1. If D lacks a superset
- so it's not known whether there is a match, return (size_t) -2.
- BEGIN points to the beginning of the buffer, and END points to the
- first byte after its end. Store a sentinel byte (usually newline)
- in *END, so the actual buffer must be one byte longer. If COUNT is
- non-NULL, increment *COUNT once for each newline processed. */
-extern size_t dfahint (struct dfa *d, char const *begin, char *end,
- size_t *count);
+/* Return superset for D, which searchs through a buffer looking for a
+ potential match. */
+extern struct dfa *dfasuperset (struct dfa *d);
/* Return true if the DFA is likely to be fast. */
extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 7edc96b..c1822b4 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -213,6 +213,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
size_t len, best_len;
struct kwsmatch kwsm;
size_t i;
+ struct dfa *superset = dfasuperset (dfa);
bool dfafast = dfaisfast (dfa);
mb_start = buf;
@@ -220,26 +221,37 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
for (beg = end = buf; end < buflim; beg = end)
{
+ end = buflim;
+
if (!start_ptr)
{
- /* We don't care about an exact match. */
+ char const *next_beg, *dfa_beg = beg;
+ size_t count = 0;
+ bool narrowed = false;
+
+ /* Try matching with KWset, if it's defined. */
if (kwset)
{
+ char const *prev_beg;
+
/* Find a possible match using the KWset matcher. */
size_t offset = kwsexec (kwset, beg - begline,
buflim - beg + begline, &kwsm);
if (offset == (size_t) -1)
goto failure;
match = beg + offset;
-
- /* Narrow down to the line containing the candidate. */
+ prev_beg = beg;
+
+ /* Narrow down to the line we've found. */
beg = memrchr (buf, eol, match - buf);
beg = beg ? beg + 1 : buf;
- end = memchr (match, eol, buflim - match);
- end = end ? end + 1 : buflim;
+ dfa_beg = beg;
if (kwsm.index < kwset_exact_matches)
{
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+
if (MB_CUR_MAX == 1)
goto success;
if (mb_start < beg)
@@ -250,61 +262,63 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
beginning of the next character. */
- if (! dfaexec (dfa, mb_start, (char *) end, 0, NULL,
- &backref))
- continue;
+ dfa_beg = mb_start;
+ narrowed = true;
}
else if (!dfafast)
{
- if (dfahint (dfa, beg, (char *) end, NULL) == (size_t) -1)
- continue;
- if (! dfaexec (dfa, beg, (char *) end, 0, NULL, &backref))
- continue;
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
}
}
- if (!kwset || dfafast)
+
+ /* Try matching with the superset of DFA, if it's defined. */
+ if (superset && !(kwset && kwsm.index < kwset_exact_matches))
{
- /* No good fixed strings or DFA is fast; use DFA. */
- size_t offset, count;
- char const *next_beg;
- count = 0;
- offset = dfahint (dfa, beg, (char *) buflim, &count);
- if (offset == (size_t) -1)
- goto failure;
- if (offset == (size_t) -2)
+ next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, &count,
NULL);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ if (!narrowed)
{
- /* No use hint. */
- next_beg = dfaexec (dfa, beg, (char *) buflim, 0,
- NULL, &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == buflim)
- goto failure;
+ /* Narrow down to the line we've found. */
+ if (count != 0)
+ {
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg = beg ? beg + 1 : buf;
+
+ /* If dfaexec may match in multiple lines, try to
+ match in one line. */
+ end = beg;
+ continue;
+ }
+ end = memchr (next_beg, eol, buflim - next_beg);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
}
- else
- next_beg = beg + offset;
+ }
+
+ /* Try matching with DFA. */
+ next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref);
+
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ if (!narrowed)
+ {
/* Narrow down to the line we've found. */
- beg = memrchr (buf, eol, next_beg - buf);
- beg = beg ? beg + 1 : buf;
if (count != 0)
{
- /* If dfahint may match in multiple lines, try to
- match in one line. */
- end = beg;
- continue;
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg = beg ? beg + 1 : buf;
}
end = memchr (next_beg, eol, buflim - next_beg);
end = end ? end + 1 : buflim;
- if (offset != (size_t) -2)
- {
- next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL,
- &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == end)
- continue;
- }
}
+
/* Successful, no backreferences encountered! */
if (!backref)
goto success;
@@ -314,8 +328,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
{
/* We are looking for the leftmost (then longest) exact match.
We will go through the outer loop only once. */
- beg = buf;
- end = buflim;
ptr = start_ptr;
}
--
1.9.2
From cee9b6b9efa6db90566f32408dab60680a7b3a9f Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 28 Apr 2014 21:22:04 +0900
Subject: [PATCH 2/2] grep: [PATCH] grep: ajustment of timing back to kwset
when dfaisfast is true
If failed in DFA after succeed in kwset, doesn't return to kwset until
reaches the end of the buffer or find a match. By that, thought some
cases speed up, there is also a case slowdown.
This patch ajusts timing back to kwset when dfaisfast is true.
* src/dfasearch.c (EGexecute): Do it.
---
src/dfasearch.c | 30 +++++++++++++++++++++++++++++-
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/src/dfasearch.c b/src/dfasearch.c
index c1822b4..6d3765b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -265,7 +265,35 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
dfa_beg = mb_start;
narrowed = true;
}
- else if (!dfafast)
+ else if (dfafast)
+ {
+ /* Determine the end pointer to start with DFA. It's
+ `match' by default. If the offset from prev_beg is
+ lesser than (match - beg) * 4 and/or 64, set it to
+ greatest value in them. A larger offset than the
+ default means that DFA is preferred to KWset. */
+ offset = (match - beg) * 4;
+ if (offset < 64)
+ offset = 64;
+ if (prev_beg + offset > match)
+ {
+ end = prev_beg + offset;
+ if (end < buflim)
+ {
+ end = memchr (end, eol, buflim - end);
+ end = end ? end + 1 : buflim;
+ }
+ else
+ end = buflim;
+ }
+ else
+ {
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
+ }
+ }
+ else
{
end = memchr (match, eol, buflim - match);
end = end ? end + 1 : buflim;
--
1.9.2