Thanks for working on that. A few comments. First, the ChangeLog
entries can be improved; I put a proposed rewrite of them into the first
two attached patches (the code's the same). Second, building with
--enable-gcc-warnings causes a complaint about dfasuperset needing to be
declared pure. Third, EGexecute still has quite a bit of duplicate
code, even with the patch. I took at shot at simplifying it (and fixing
the 2nd problem) in the third attached patch. This passes "make check"
but I have not benchmarked it.
From c9d48419f36e852d4f6736b302bbd890e01a7584 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 28 Apr 2014 21:13:57 +0900
Subject: [PATCH 1/3] grep: simplify superset
* src/dfa.h (dfahint): Remove decl.
(dfasuperset): New decl.
* src/dfa.c (dfahint): Remove.
(dfassbuild): Rename from dfasuperset.
(dfasuperset): New function. it returns the superset of D.
* src/dfasearch.c: Use dfasuperset instead of dfahint, and simplify.
---
src/dfa.c | 26 ++++----------
src/dfa.h | 13 ++-----
src/dfasearch.c | 106 +++++++++++++++++++++++++++++++-------------------------
3 files changed, 69 insertions(+), 76 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 362de2c..c9070fe 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3391,24 +3391,12 @@ dfaexec (struct dfa *d, char const *begin, char *end,
return (char *) p;
}
-/* Search through a buffer looking for a potential match for D.
- Return the offset of the byte after the first potential match.
- If there is no match, return (size_t) -1. If D lacks a superset
- so it's not known whether there is a match, return (size_t) -2.
- BEGIN points to the beginning of the buffer, and END points to the
- first byte after its end. Store a sentinel byte (usually newline)
- in *END, so the actual buffer must be one byte longer. If COUNT is
- non-NULL, increment *COUNT once for each newline processed. */
-size_t
-dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
+/* Return superset for D, which searchs through a buffer looking for a
+ potential match. */
+struct dfa *
+dfasuperset (struct dfa *d)
{
- if (! d->superset)
- return -2;
- else
- {
- char const *match = dfaexec (d->superset, begin, end, 1, count, NULL);
- return match ? match - begin : -1;
- }
+ return d->superset;
}
bool
@@ -3495,7 +3483,7 @@ dfaoptimize (struct dfa *d)
}
static void
-dfasuperset (struct dfa *d)
+dfassbuild (struct dfa *d)
{
size_t i, j;
charclass ccl;
@@ -3579,7 +3567,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int
searchflag)
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
- dfasuperset (d);
+ dfassbuild (d);
dfaanalyze (d, searchflag);
if (d->superset)
dfaanalyze (d->superset, searchflag);
diff --git a/src/dfa.h b/src/dfa.h
index fbcb191..2de7ba8 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -71,16 +71,9 @@ extern void dfacomp (char const *, size_t, struct dfa *,
int);
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
-/* Search through a buffer looking for a potential match for D.
- Return the offset of the byte after the first potential match.
- If there is no match, return (size_t) -1. If D lacks a superset
- so it's not known whether there is a match, return (size_t) -2.
- BEGIN points to the beginning of the buffer, and END points to the
- first byte after its end. Store a sentinel byte (usually newline)
- in *END, so the actual buffer must be one byte longer. If COUNT is
- non-NULL, increment *COUNT once for each newline processed. */
-extern size_t dfahint (struct dfa *d, char const *begin, char *end,
- size_t *count);
+/* Return superset for D, which searchs through a buffer looking for a
+ potential match. */
+extern struct dfa *dfasuperset (struct dfa *d);
/* Return true if the DFA is likely to be fast. */
extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 7edc96b..c1822b4 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -213,6 +213,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
size_t len, best_len;
struct kwsmatch kwsm;
size_t i;
+ struct dfa *superset = dfasuperset (dfa);
bool dfafast = dfaisfast (dfa);
mb_start = buf;
@@ -220,26 +221,37 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
for (beg = end = buf; end < buflim; beg = end)
{
+ end = buflim;
+
if (!start_ptr)
{
- /* We don't care about an exact match. */
+ char const *next_beg, *dfa_beg = beg;
+ size_t count = 0;
+ bool narrowed = false;
+
+ /* Try matching with KWset, if it's defined. */
if (kwset)
{
+ char const *prev_beg;
+
/* Find a possible match using the KWset matcher. */
size_t offset = kwsexec (kwset, beg - begline,
buflim - beg + begline, &kwsm);
if (offset == (size_t) -1)
goto failure;
match = beg + offset;
-
- /* Narrow down to the line containing the candidate. */
+ prev_beg = beg;
+
+ /* Narrow down to the line we've found. */
beg = memrchr (buf, eol, match - buf);
beg = beg ? beg + 1 : buf;
- end = memchr (match, eol, buflim - match);
- end = end ? end + 1 : buflim;
+ dfa_beg = beg;
if (kwsm.index < kwset_exact_matches)
{
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+
if (MB_CUR_MAX == 1)
goto success;
if (mb_start < beg)
@@ -250,61 +262,63 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
beginning of the next character. */
- if (! dfaexec (dfa, mb_start, (char *) end, 0, NULL,
- &backref))
- continue;
+ dfa_beg = mb_start;
+ narrowed = true;
}
else if (!dfafast)
{
- if (dfahint (dfa, beg, (char *) end, NULL) == (size_t) -1)
- continue;
- if (! dfaexec (dfa, beg, (char *) end, 0, NULL, &backref))
- continue;
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
}
}
- if (!kwset || dfafast)
+
+ /* Try matching with the superset of DFA, if it's defined. */
+ if (superset && !(kwset && kwsm.index < kwset_exact_matches))
{
- /* No good fixed strings or DFA is fast; use DFA. */
- size_t offset, count;
- char const *next_beg;
- count = 0;
- offset = dfahint (dfa, beg, (char *) buflim, &count);
- if (offset == (size_t) -1)
- goto failure;
- if (offset == (size_t) -2)
+ next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, &count,
NULL);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ if (!narrowed)
{
- /* No use hint. */
- next_beg = dfaexec (dfa, beg, (char *) buflim, 0,
- NULL, &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == buflim)
- goto failure;
+ /* Narrow down to the line we've found. */
+ if (count != 0)
+ {
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg = beg ? beg + 1 : buf;
+
+ /* If dfaexec may match in multiple lines, try to
+ match in one line. */
+ end = beg;
+ continue;
+ }
+ end = memchr (next_beg, eol, buflim - next_beg);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
}
- else
- next_beg = beg + offset;
+ }
+
+ /* Try matching with DFA. */
+ next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref);
+
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ if (!narrowed)
+ {
/* Narrow down to the line we've found. */
- beg = memrchr (buf, eol, next_beg - buf);
- beg = beg ? beg + 1 : buf;
if (count != 0)
{
- /* If dfahint may match in multiple lines, try to
- match in one line. */
- end = beg;
- continue;
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg = beg ? beg + 1 : buf;
}
end = memchr (next_beg, eol, buflim - next_beg);
end = end ? end + 1 : buflim;
- if (offset != (size_t) -2)
- {
- next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL,
- &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == end)
- continue;
- }
}
+
/* Successful, no backreferences encountered! */
if (!backref)
goto success;
@@ -314,8 +328,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
{
/* We are looking for the leftmost (then longest) exact match.
We will go through the outer loop only once. */
- beg = buf;
- end = buflim;
ptr = start_ptr;
}
--
1.9.0
From a27d0e6f3edc5a13e26896165198a3a6664b72fb Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 28 Apr 2014 21:22:04 +0900
Subject: [PATCH 2/3] grep: adjust timing back to kwset when dfaisfast is true
* src/dfasearch.c (EGexecute): If DFA fails after kwset succeeds,
the code doesn't return to kwset until it reaches the end of the buffer
or finds a match. Because of this, although some cases speed up,
others slow down.
Adjust the heuristic for switching to the DFA, so that it
is more likely to switch at the right times.
---
src/dfasearch.c | 30 +++++++++++++++++++++++++++++-
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/src/dfasearch.c b/src/dfasearch.c
index c1822b4..6d3765b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -265,7 +265,35 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
dfa_beg = mb_start;
narrowed = true;
}
- else if (!dfafast)
+ else if (dfafast)
+ {
+ /* Determine the end pointer to start with DFA. It's
+ `match' by default. If the offset from prev_beg is
+ lesser than (match - beg) * 4 and/or 64, set it to
+ greatest value in them. A larger offset than the
+ default means that DFA is preferred to KWset. */
+ offset = (match - beg) * 4;
+ if (offset < 64)
+ offset = 64;
+ if (prev_beg + offset > match)
+ {
+ end = prev_beg + offset;
+ if (end < buflim)
+ {
+ end = memchr (end, eol, buflim - end);
+ end = end ? end + 1 : buflim;
+ }
+ else
+ end = buflim;
+ }
+ else
+ {
+ end = memchr (match, eol, buflim - match);
+ end = end ? end + 1 : buflim;
+ narrowed = true;
+ }
+ }
+ else
{
end = memchr (match, eol, buflim - match);
end = end ? end + 1 : buflim;
--
1.9.0
From bcfff8f71c8f21b90b899269914a62ff43f74f1c Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Wed, 30 Apr 2014 00:03:18 -0700
Subject: [PATCH 3/3] grep: simplify EGexecute further
* src/dfa.c, src/dfa.h (dfasuperset): Arg is now const pointer.
Now pure.
* src/dfasearch.c (EGexecute): Coalesce some duplicate code.
Don't worry about memrchr returning NULL when that's impossible.
---
src/dfa.c | 4 +-
src/dfa.h | 8 ++--
src/dfasearch.c | 111 +++++++++++++++++++++-----------------------------------
3 files changed, 47 insertions(+), 76 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index c9070fe..43ad3cf 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3391,10 +3391,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
return (char *) p;
}
-/* Return superset for D, which searchs through a buffer looking for a
- potential match. */
struct dfa *
-dfasuperset (struct dfa *d)
+dfasuperset (struct dfa const *d)
{
return d->superset;
}
diff --git a/src/dfa.h b/src/dfa.h
index 2de7ba8..e3ab700 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -71,9 +71,11 @@ extern void dfacomp (char const *, size_t, struct dfa *,
int);
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
-/* Return superset for D, which searchs through a buffer looking for a
- potential match. */
-extern struct dfa *dfasuperset (struct dfa *d);
+/* Return a superset for D. The superset matches everything that D
+ matches, along with some other strings (though the latter should be
+ rare, for efficiency reasons). Return a null pointer if no useful
+ superset is available. */
+extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE;
/* Return true if the DFA is likely to be fast. */
extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 6d3765b..5ad6c9d 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -227,7 +227,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
{
char const *next_beg, *dfa_beg = beg;
size_t count = 0;
- bool narrowed = false;
+ bool exact_kwset_match = false;
/* Try matching with KWset, if it's defined. */
if (kwset)
@@ -241,17 +241,30 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
goto failure;
match = beg + offset;
prev_beg = beg;
-
- /* Narrow down to the line we've found. */
+
+ /* Narrow down to the line containing the possible match. */
beg = memrchr (buf, eol, match - buf);
beg = beg ? beg + 1 : buf;
dfa_beg = beg;
- if (kwsm.index < kwset_exact_matches)
- {
- end = memchr (match, eol, buflim - match);
- end = end ? end + 1 : buflim;
+ /* Determine the end pointer to give the DFA next. Typically
+ this is after the first newline after MATCH; but if the KWset
+ match is not exact, the DFA is fast, and the offset from
+ PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
+ greater of the latter two values; this temporarily prefers
+ the DFA to KWset. */
+ exact_kwset_match = kwsm.index < kwset_exact_matches;
+ end = ((exact_kwset_match || !dfafast
+ || MAX (16, match - beg) < (match - prev_beg) >> 2)
+ ? match
+ : (buflim - prev_beg) >> 2 <= match - beg
+ ? buflim
+ : prev_beg + 4 * (match - beg));
+ end = memchr (end, eol, buflim - end);
+ end = end ? end + 1 : buflim;
+ if (exact_kwset_match)
+ {
if (MB_CUR_MAX == 1)
goto success;
if (mb_start < beg)
@@ -263,69 +276,30 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
character. Perform the DFA search starting from the
beginning of the next character. */
dfa_beg = mb_start;
- narrowed = true;
- }
- else if (dfafast)
- {
- /* Determine the end pointer to start with DFA. It's
- `match' by default. If the offset from prev_beg is
- lesser than (match - beg) * 4 and/or 64, set it to
- greatest value in them. A larger offset than the
- default means that DFA is preferred to KWset. */
- offset = (match - beg) * 4;
- if (offset < 64)
- offset = 64;
- if (prev_beg + offset > match)
- {
- end = prev_beg + offset;
- if (end < buflim)
- {
- end = memchr (end, eol, buflim - end);
- end = end ? end + 1 : buflim;
- }
- else
- end = buflim;
- }
- else
- {
- end = memchr (match, eol, buflim - match);
- end = end ? end + 1 : buflim;
- narrowed = true;
- }
- }
- else
- {
- end = memchr (match, eol, buflim - match);
- end = end ? end + 1 : buflim;
- narrowed = true;
}
}
/* Try matching with the superset of DFA, if it's defined. */
- if (superset && !(kwset && kwsm.index < kwset_exact_matches))
+ if (superset && !exact_kwset_match)
{
- next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, &count,
NULL);
+ next_beg = dfaexec (superset, dfa_beg, (char *) end, 1,
+ &count, NULL);
/* If there's no match, or if we've matched the sentinel,
we're done. */
if (next_beg == NULL || next_beg == end)
continue;
- if (!narrowed)
+
+ /* Narrow down to the line we've found. */
+ if (count != 0)
{
- /* Narrow down to the line we've found. */
- if (count != 0)
- {
- beg = memrchr (buf, eol, next_beg - buf);
- beg = beg ? beg + 1 : buf;
-
- /* If dfaexec may match in multiple lines, try to
- match in one line. */
- end = beg;
- continue;
- }
- end = memchr (next_beg, eol, buflim - next_beg);
- end = end ? end + 1 : buflim;
- narrowed = true;
+ /* If dfaexec may match in multiple lines, try to
+ match in one line. */
+ end = memrchr (buf, eol, next_beg - buf);
+ end++;
+ continue;
}
+ end = memchr (next_beg, eol, buflim - next_beg);
+ end = end ? end + 1 : buflim;
}
/* Try matching with DFA. */
@@ -335,17 +309,15 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
we're done. */
if (next_beg == NULL || next_beg == end)
continue;
- if (!narrowed)
+
+ /* Narrow down to the line we've found. */
+ if (count != 0)
{
- /* Narrow down to the line we've found. */
- if (count != 0)
- {
- beg = memrchr (buf, eol, next_beg - buf);
- beg = beg ? beg + 1 : buf;
- }
- end = memchr (next_beg, eol, buflim - next_beg);
- end = end ? end + 1 : buflim;
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg++;
}
+ end = memchr (next_beg, eol, buflim - next_beg);
+ end = end ? end + 1 : buflim;
/* Successful, no backreferences encountered! */
if (!backref)
@@ -364,8 +336,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
xalloc_die ();
- /* If we've made it to this point, this means DFA has seen
- a probable match, and we need to run it through Regex. */
+ /* Run the possible match through Regex. */
best_match = end;
best_len = 0;
for (i = 0; i < pcount; i++)
--
1.9.0