Norihiro Tanaka wrote:
> s0: The position set is none.
> s1: The position set is 1:a
> s2: The position set is 1:a 2:CSET
> s3: The position set is 2:CSET 3:b (accepted)
> s4: The position set is 2:CSET
Sorry, it was wrong yet. It should be as follows.
s0: The position set is none.
s1: The position set is 1:a 2:CSET 3:b
s2: The position set is 1:a 2:CSET 3:b (accepted)
First of all, I noticed that the example is wrong, because if fixed
strings are included in the pattern, kwset is used for the pattern,
and dfahint() is called for each single line.
--
By the way, I compared two cases, which are CSET* and
CSET{1,mb_cur_max}. I used `\(a\|z\)[x-y]\{10\}\(b\|z\)' as the pattern.
`[x-z]' is MBCSET in UTF-8 locale.
If replace MBCSET to CSET*, the pattern in the superset is
`\(a\|z\)\(CSET*\)\{10\}\(b\|z\)'.
If replace MBCSET to CSET{1,mb_cur_max}, the pattern in the superset is
`\(a\|z\)\(CSET\{1,6\}\)\{10\}\(b\|z\)'.
In order to simplify, use `\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' in
POSIX locale for later.
Before testing, apply the patch attached on this mail, Set `DDEBUG' to
CPPFLAGS, and re-build. It outputs more debugging information.
$ yes `printf '%0100d\n'` | head -320 | \
sed -e '1s/^0/a/; $s/0$/b/' | \
env LANG=en_US.UTF-8 src/grep '\(a\|z\)[x-y]\{10\}\(b\|z\)' 2>debug1.log
$ yes `printf '%0100d\n'` | head -320 | \
sed -e '1s/^0/a/; $s/0$/b/' | \
env LANG=C src/grep '\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' 2>debug2.log
(`320' originates in the default size of a buffer.)
The formar is only built 3 states.
OTOH, the later is built 224 dfastates.
i.e. even if matched multi-lines, many states aren't necessarily built.
Next, I checked performance for each case in worst text.
It's best in 10 times.
yes `printf '%0100d\n'` | head -2 | sed -e '1s/^0/a/; $s/0$/b/' >t
yes t | head -1000000 | xargs cat >u
time -p env LANG=C src/grep '\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' u
real 1.26 user 0.95 sys 0.28
time -p env LANG=en_US.UTF-8 src/grep '\(a\|z\)[x-y]\{10\}\(b\|z\)' u
real 0.74 user 0.45 sys 0.27
Though, the later is fast, I like the former, because it's very simple.
the later is complicated, also needs to update nleaves and depth.
Norihiro
From df6da5d40a47abbc6e3451cb9fab7a8c9ede12cc Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 28 Mar 2014 19:27:00 -0700
Subject: [PATCH 1/3] dfa: improve port to freestanding DJGPP
Suggested by Aharon Robbins (Bug#17056).
* src/dfa.c (setlocale) [!LC_ALL]: Return NULL, not "C",
reverting part of a recent change.
(using_simple_locale): Return true if setlocale returns null.
---
src/dfa.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 4ed2189..b22fe97 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -37,7 +37,7 @@
/* Gawk doesn't use Gnulib, so don't assume that setlocale and
static_assert are present. */
#ifndef LC_ALL
-# define setlocale(category, locale) "C"
+# define setlocale(category, locale) NULL
#endif
#ifndef static_assert
# define static_assert(cond, diagnostic) \
@@ -850,8 +850,9 @@ using_simple_locale (void)
if (unibyte_c < 0)
{
char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (locale && (STREQ (locale, "C")
- || STREQ (locale, "POSIX")));
+ unibyte_c = (!locale
+ || STREQ (locale, "C")
+ || STREQ (locale, "POSIX"));
}
return unibyte_c;
}
--
1.9.1
From d80c9a5da2c5e04844ac39f8cdd45e6425b2dde6 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Tue, 1 Apr 2014 11:18:44 +0200
Subject: [PATCH 2/3] dfa: avoid re-building a state built previously
* src/dfa.c (dfaexec): Avoid to re-build a state built previously.
---
src/dfa.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/dfa.c b/src/dfa.c
index b22fe97..b6fbd58 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3537,7 +3537,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (s >= 0)
{
- build_state (s, d);
+ if (!d->trans[s])
+ build_state (s, d);
trans = d->trans;
continue;
}
--
1.9.1
From f18a096be8a0be2941f97111a3870e14a8abb613 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 22 Mar 2014 17:00:36 +0900
Subject: [PATCH 3/3] grep: print the detail of DFA states
* src/dfa.c (prtok): replace `%c' to `%02x' in format of printf().
(state_index): print detail of new state.
(dfastate) print detail of the process.
---
src/dfa.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 51 insertions(+), 1 deletion(-)
diff --git a/src/dfa.c b/src/dfa.c
index b6fbd58..fc4edcb 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -556,7 +556,7 @@ prtok (token t)
else if (t < NOTCHAR)
{
int ch = t;
- fprintf (stderr, "%c", ch);
+ fprintf (stderr, "0x%02x", ch);
}
else
{
@@ -2155,6 +2155,28 @@ state_index (struct dfa *d, position_set const *s, int
context)
return i;
}
+#ifdef DEBUG
+ fprintf (stderr, "new state %d:\n position:", i);
+ for (j = 0; j < s->nelem; ++j)
+ {
+ fprintf (stderr, " %d:", s->elems[j].index);
+ prtok (d->tokens[s->elems[j].index]);
+ }
+ fprintf (stderr, "\n context:");
+ if (context & CTX_ANY)
+ fprintf (stderr, " CTX_ANY");
+ else
+ {
+ if (context & CTX_NONE)
+ fprintf (stderr, " CTX_NONE");
+ if (context & CTX_NEWLINE)
+ fprintf (stderr, " CTX_LETTER");
+ if (context & CTX_NEWLINE)
+ fprintf (stderr, " CTX_NEWLINE");
+ }
+ fprintf (stderr, "\n");
+#endif
+
/* We'll have to create a new state. */
REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
d->states[i].hash = hash;
@@ -2626,6 +2648,10 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
size_t i, j, k;
+#ifdef DEBUG
+ fprintf (stderr, "*** DFA state %d ***\n", s);
+#endif
+
MALLOC (grps, NOTCHAR);
MALLOC (labels, NOTCHAR);
@@ -2678,6 +2704,14 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
continue;
}
+#ifdef DEBUG
+ fprintf (stderr, "position %d:", pos.index);
+ for (j = 0; j < NOTCHAR; j++)
+ if (tstbit (j, matches))
+ fprintf (stderr, " 0x%02x", j);
+ fprintf (stderr, "\n");
+#endif
+
for (j = 0; j < ngrps; ++j)
{
/* If matches contains a single character only, and the current
@@ -2836,6 +2870,22 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
else
state_letter = state;
+#ifdef DEBUG
+ fprintf (stderr, "group %d:\n nextpos :", i);
+ for (j = 0; j < grps[i].nelem; ++j)
+ {
+ fprintf (stderr, " %d:", grps[i].elems[j]);
+ prtok (d->tokens[grps[i].elems[j]]);
+ }
+ fprintf (stderr, "\n follows :");
+ for (j = 0; j < follows.nelem; ++j)
+ {
+ fprintf (stderr, " %d:", follows.elems[j].index);
+ prtok (d->tokens[follows.elems[j].index]);
+ }
+ fprintf (stderr, "\n nextstate: %d, %d, %d\n", state, state_newline,
state_letter);
+#endif
+
/* Set the transitions for each character in the current label. */
for (j = 0; j < CHARCLASS_INTS; ++j)
for (k = 0; k < INTBITS; ++k)
--
1.9.1