I tested below on CentOS 5.10 Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz
GCC 4.1.2.
$ env LANG=C time -p src/grep -f regex.re input_lines.txt
Command exited with non-zero status 1
real 4.81
user 4.40
sys 0.09
I apply an attachment, and tested again.
$ env LANG=C time -p src/grep -f regex.re input_lines.txt
Command exited with non-zero status 1
real 4.63
user 4.46
sys 0.10
Therefore, I don't think that the patch is effective.
diff --git a/src/dfa.c b/src/dfa.c
index 273d3d1..389fc59 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2072,6 +2072,128 @@ delete (position p, position_set * s)
s->elems[i] = s->elems[i + 1];
}
+/* Merge implementation of sorted sets, then number of sets smore than 2.
+ Priority queue aka 'heap' data structure used to select maximum of
+ many elements */
+
+typedef struct {
+ position p;
+ size_t set_index; /* Remember which set this element was taken from */
+} heap_elem;
+
+static void
+heap_add (heap_elem *heap, size_t *heap_size, heap_elem e)
+{
+ size_t i = *heap_size;
+ while (i)
+ {
+ size_t parent = (i - 1) / 2;
+ if (e.p.index < heap[parent].p.index)
+ break;
+ heap[i] = heap[parent];
+ i = parent;
+ }
+ heap[i] = e;
+ (*heap_size)++;
+}
+
+static void
+heap_remove_top_and_add (heap_elem *heap, size_t heap_size, heap_elem e)
+{
+ size_t i = 0;
+ size_t lchild = 2 * i + 1;
+ size_t n = heap_size;
+ while (lchild < n)
+ {
+ size_t rchild = lchild + 1;
+ size_t nextchild = lchild;
+ if (rchild < n && heap[lchild].p.index < heap[rchild].p.index)
+ nextchild = rchild;
+ if (e.p.index >= heap[nextchild].p.index)
+ break;
+ heap[i] = heap[nextchild];
+ i = nextchild;
+ lchild = 2 * i + 1;
+ }
+ heap[i] = e;
+}
+
+static void
+heap_remove_top (heap_elem *heap, size_t *heap_size)
+{
+ if (--(*heap_size) == 0)
+ return;
+ heap_remove_top_and_add (heap, *heap_size, heap[*heap_size]);
+}
+
+static void
+heap_merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+ size_t heap_size = 0;
+ position_set r;
+ heap_elem *heap = xnmalloc (n, sizeof *heap);
+ size_t *ids = xnmalloc (n, sizeof *ids);
+ size_t sum_of_sizes = 0;
+ size_t i;
+
+ for (i = 0; i < n; ++i)
+ {
+ sum_of_sizes += sets[i].nelem;
+ ids[i] = (size_t) -1;
+ if (sets[i].nelem)
+ {
+ heap_elem e = { sets[i].elems[0], i };
+ heap_add (heap, &heap_size, e);
+ if (sets[i].nelem > 1)
+ ids[i] = 1;
+ }
+ }
+
+ result->elems = maybe_realloc (result->elems, result->nelem,
+ &result->alloc, sizeof *result->elems);
+ r = *result;
+ r.nelem = 0;
+
+ while (heap_size)
+ {
+ heap_elem e = heap[0];
+ if (r.nelem && r.elems[r.nelem-1].index == e.p.index)
+ r.elems[r.nelem-1].constraint |= e.p.constraint;
+ else
+ r.elems[r.nelem++] = e.p;
+ i = e.set_index;
+ if (ids[i] != (size_t) -1)
+ {
+ heap_elem e2 = { sets[i].elems[ids[i]], i };
+ heap_remove_top_and_add (heap, heap_size, e2);
+ ++ids[i];
+ if (ids[i] == sets[i].nelem)
+ ids[i] = (size_t) -1;
+ }
+ else
+ {
+ heap_remove_top (heap, &heap_size);
+ }
+ }
+ *result = r;
+ free (ids);
+ free (heap);
+}
+
+static void
+merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+ /* optimized solutions for special/trivial cases */
+ if (n == 0)
+ result->nelem = 0;
+ else if (n == 1)
+ copy (&sets[0], result);
+ else if (n == 2)
+ merge (&sets[0], &sets[1], result);
+ else
+ heap_merge_many_sets (sets, n, result);
+}
+
/* Find the index of the state corresponding to the given position set with
the given preceding context, or create a new state if there is no such
state. Context tells whether we got here on a newline or letter. */
@@ -2558,6 +2680,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */
+ position_set *sets = 0; /* Temporary array of sets, to pass many sets
to merge function */
+ size_t sets_alloc = 0; /* Sets array capacity */
size_t i, j, k;
zeroset (matches);
@@ -2706,13 +2830,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (i = 0; i < ngrps; ++i)
{
- follows.nelem = 0;
-
- /* Find the union of the follows of the positions of the group.
- This is a hideously inefficient loop. Fix it someday. */
+ /* Find the union of the follows of the positions of the group. */
+ sets = maybe_realloc (sets, grps[i].nelem, &sets_alloc, sizeof *sets);
for (j = 0; j < grps[i].nelem; ++j)
- for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
- insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+ sets[j] = d->follows[grps[i].elems[j]];
+ merge_many_sets (sets, grps[i].nelem, &follows);
if (d->multibyte)
{