Thanks, I pushed that with minor changes to the ChangeLog message (first
attached patch). It's a tad faster to combine the relevant flags at
compile-time so I did that (second attached patch).
"Iff" means "if and only if"; see <http://en.wikipedia.org/wiki/Iff>. I
got the habit of trying to be logically-precise from RMS, who I guess
got it back in the days when he was doing AI. Anyway, I reworded the
text in a different way to avoid the misuse of "if" (third attached patch).
From 13523ef8c728abe35922d6178d94dc1008aa7675 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 5 May 2014 15:14:34 +0900
Subject: [PATCH 1/3] dfa: speed up 'dfaisfast'
* src/dfa.c (struct dfa): New member 'has_backref'.
(addtok_mb): Set it.
(dfaisfast): Use it.
---
src/dfa.c | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 273d3d1..12fbdda 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -337,6 +337,7 @@ struct dfa
size_t nleaves; /* Number of leaves on the parse tree. */
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
+ bool has_backref; /* True if has BACKREF in tokens. */
bool multibyte; /* True iff MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
@@ -1593,6 +1594,8 @@ addtok_mb (token t, int mbprop)
--depth;
break;
+ case BACKREF:
+ dfa->has_backref = true;
default:
++dfa->nleaves;
case EMPTY:
@@ -3419,18 +3422,7 @@ dfasuperset (struct dfa const *d)
bool
dfaisfast (struct dfa const *d)
{
- if (d->superset)
- return true;
- else if (d->multibyte)
- return false;
- else
- {
- size_t i;
- for (i = 0; i < d->tindex; i++)
- if (d->tokens[i] == BACKREF)
- return false;
- return true;
- }
+ return d->superset || (!d->multibyte && !d->has_backref);
}
static void
--
1.9.0
From f082198257733b11195a32d342ae52e95f910f23 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 6 May 2014 07:23:03 -0700
Subject: [PATCH 2/3] dfa: minor performance improvement for previous change
* src/dfa.c (struct dfa): New member 'fast'. Remove 'has_backref'.
All uses changed.
---
src/dfa.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 12fbdda..5b3704e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -337,7 +337,7 @@ struct dfa
size_t nleaves; /* Number of leaves on the parse tree. */
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
- bool has_backref; /* True if has BACKREF in tokens. */
+ bool fast; /* The DFA is fast. */
bool multibyte; /* True iff MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
@@ -1595,7 +1595,7 @@ addtok_mb (token t, int mbprop)
break;
case BACKREF:
- dfa->has_backref = true;
+ dfa->fast = false;
default:
++dfa->nleaves;
case EMPTY:
@@ -3422,7 +3422,7 @@ dfasuperset (struct dfa const *d)
bool
dfaisfast (struct dfa const *d)
{
- return d->superset || (!d->multibyte && !d->has_backref);
+ return d->fast;
}
static void
@@ -3462,6 +3462,7 @@ dfainit (struct dfa *d)
{
memset (d, 0, sizeof *d);
d->multibyte = MB_CUR_MAX > 1;
+ d->fast = !d->multibyte;
}
static void
@@ -3579,7 +3580,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int
searchflag)
dfassbuild (d);
dfaanalyze (d, searchflag);
if (d->superset)
- dfaanalyze (d->superset, searchflag);
+ {
+ d->fast = true;
+ dfaanalyze (d->superset, searchflag);
+ }
}
/* Free the storage held by the components of a dfa. */
--
1.9.0
From 3c3468b15bb7d15869f6f1688f28bf943172572e Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 6 May 2014 07:40:52 -0700
Subject: [PATCH 3/3] dfa: clarify use of "if"
The phrase "Y is true if X" is logically equivalent to "X implies Y",
but often "X if and only if Y" was intended.
* src/dfa.c, src/dfa.h: Reword to avoid the incorrect use of "if".
---
src/dfa.c | 52 ++++++++++++++++++++++++----------------------------
src/dfa.h | 2 +-
2 files changed, 25 insertions(+), 29 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 5b3704e..fc2acdc 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -46,7 +46,6 @@
host does not conform to Posix. */
#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
-/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
#include "gettext.h"
#define _(str) gettext (str)
@@ -183,27 +182,25 @@ enum
a backtracking matcher. */
BEGLINE, /* BEGLINE is a terminal symbol that matches
- the empty string if it is at the beginning
- of a line. */
+ the empty string at the beginning of a
+ line. */
ENDLINE, /* ENDLINE is a terminal symbol that matches
- the empty string if it is at the end of
- a line. */
+ the empty string at the end of a line. */
BEGWORD, /* BEGWORD is a terminal symbol that matches
- the empty string if it is at the beginning
- of a word. */
+ the empty string at the beginning of a
+ word. */
ENDWORD, /* ENDWORD is a terminal symbol that matches
- the empty string if it is at the end of
- a word. */
+ the empty string at the end of a word. */
LIMWORD, /* LIMWORD is a terminal symbol that matches
- the empty string if it is at the beginning
- or the end of a word. */
+ the empty string at the beginning or the
+ end of a word. */
NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
- matches the empty string if it is not at
+ matches the empty string not at
the beginning or end of a word. */
QMARK, /* QMARK is an operator of one argument that
@@ -284,8 +281,8 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool has_backref; /* True if this state matches a \<digit>. */
- bool has_mbcset; /* True if this state matches a MBCSET. */
+ bool has_backref; /* This state matches a \<digit>. */
+ bool has_mbcset; /* This state matches a MBCSET. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -338,7 +335,7 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* True iff MB_CUR_MAX > 1. */
+ bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
@@ -389,7 +386,7 @@ struct dfa
matching the given position in a string
matching the regexp. Allocated to the
maximum possible position index. */
- bool searchflag; /* True if we are supposed to build a searching
+ bool searchflag; /* We are supposed to build a searching
as opposed to an exact matcher. A searching
matcher finds the first and shortest string
matching a regexp anywhere in the buffer,
@@ -432,11 +429,10 @@ struct dfa
/* Some macros for user access to dfa internals. */
-/* ACCEPTING returns true if s could possibly be an accepting state of r. */
+/* S could possibly be an accepting state of R. */
#define ACCEPTING(s, r) ((r).states[s].constraint)
-/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
- specified context. */
+/* STATE accepts in the specified context. */
#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
@@ -626,9 +622,9 @@ equal (charclass const s1, charclass const s2)
and return its new address. Although PTR may be null, the returned
value is never null.
- The array holds *NALLOC items; *NALLOC must be zero if PTR is null,
- and is updated on reallocation. ITEMSIZE is the size of one item.
- Avoid O(N**2) behavior on arrays growing linearly. */
+ The array holds *NALLOC items; *NALLOC is updated on reallocation.
+ ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays
+ growing linearly. */
static void *
maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
{
@@ -691,7 +687,7 @@ static charclass newline;
# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
#endif
-/* Return non-zero if C is a "word-constituent" byte; zero otherwise. */
+/* C is a "word-constituent" byte. */
#define IS_WORD_CONSTITUENT(C) \
(is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
@@ -786,7 +782,7 @@ using_utf8 (void)
return utf8;
}
-/* Return true if the current locale is known to be a unibyte locale
+/* The current locale is known to be a unibyte locale
without multicharacter collating sequences and where range
comparisons simply use the native encoding. These locales can be
processed more efficiently. */
@@ -794,7 +790,7 @@ using_utf8 (void)
static bool
using_simple_locale (void)
{
- /* True if the native character set is known to be compatible with
+ /* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
enough in practice, as only ASCII and EBCDIC are in common use
and this test correctly accepts ASCII and rejects EBCDIC. */
@@ -834,7 +830,7 @@ using_simple_locale (void)
static char const *lexptr; /* Pointer to next input character. */
static size_t lexleft; /* Number of characters remaining. */
static token lasttok; /* Previous token returned; initially END. */
-static bool laststart; /* True if we're separated from beginning or (,
+static bool laststart; /* We're separated from beginning or (,
| only by zero-width characters. */
static size_t parens; /* Count of outstanding left parens. */
static int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -976,7 +972,7 @@ parse_bracket_exp (void)
int c, c1, c2;
charclass ccl;
- /* True if this is a bracket expression that dfaexec is known to
+ /* This is a bracket expression that dfaexec is known to
process correctly. */
bool known_bracket_exp = true;
@@ -2560,7 +2556,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_num state; /* New state. */
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
- bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */
+ bool next_isnt_1st_byte = false; /* We can't add state0. */
size_t i, j, k;
zeroset (matches);
diff --git a/src/dfa.h b/src/dfa.h
index e3ab700..f30c3cb 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -77,7 +77,7 @@ extern char *dfaexec (struct dfa *d, char const *begin, char
*end,
superset is available. */
extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE;
-/* Return true if the DFA is likely to be fast. */
+/* The DFA is likely to be fast. */
extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
/* Free the storage held by the components of a struct dfa. */
--
1.9.0