I have polished the patch (
faster code for simple cases of merging 2 or 1 sets,
fixes of code style, like fancy 2+2 indentation,
important fix: memory leaks under valgrind - I improperly returned array
from a function )
New patch available on URL https://gist.github.com/dobrokot/6337339 and
included as attach
> One thing that'd help would be a test case that illustrates the need for the
patch.
Not sure how to properly to send test-case, and how to reference grep compiled
old/new binary. I had put it on URL
http://dobrokot.ru/dump/slow_dfa_merge.2013-08-26.tar.gz
I can send it in a attach for a "history", if binary/large attaches are allowed
in maillists.
Real regex contains sensitive private data, and it's huge. So I had little obfuscated it and reduce to kilobytes. The runtime difference is not so great as in real example (1 to 100), but still large
( 2-3 times faster ).
Times for new/old version: 2.3sec / 8.7sec
> I assume you're willing to assign copyright to the FSF for the change? (I
can send you copies of the paperwork, if so.)
You mean, I should somewhere explicitly state, that I am agree with GPL and
give FSF rights to distribute and use the code from patch?
If so, I am surely agree with the license, and (proudly) give the permission to
use/distribute code from my patch :)
Or you mean some modification of authorship lines in the AUTHORS/THANKS and
beginning of dfa.c, which should contain my name now ?
Which paperwork do you mean? Real paper which I should sign with pen and ink?
Thanks for attention, Ivan Yanikov.
On 25.08.2013 11:02, Paul Eggert wrote:
Ivan Yanikov wrote:
How I can properly send/commit it for review?
You've already done that, sort of, with the pointers
to the patch. One thing that'd help would be a test
case that illustrates the need for the patch. Also,
I assume you're willing to assign copyright to the
FSF for the change? (I can send you copies of the
paperwork, if so.)
Index: dfa.c
===================================================================
--- dfa.c (revision grep-2.14-faster-dfa)
+++ dfa.c (revision grep-2.14)
@@ -1992,6 +1992,133 @@
s->elems[i] = s->elems[i + 1];
}
+/* Merge implementation of sorted sets, then number of sets smore than 2.
+ Priority queue aka 'heap' data structure used to select maximum of
+ many elements */
+
+typedef struct {
+ position p;
+ size_t set_index; /* Remember which set this element was taken from */
+} heap_elem;
+
+
+static void
+heap_add (heap_elem *heap, size_t *heap_size, heap_elem e)
+{
+ size_t i = *heap_size;
+ while (i)
+ {
+ size_t parent = (i-1) / 2;
+ if (e.p.index < heap[parent].p.index)
+ break;
+
+ heap[i] = heap[parent];
+ i = parent;
+ }
+ heap[i] = e;
+ *heap_size += 1;
+}
+
+static void
+heap_remove_top_and_add (heap_elem *heap, size_t heap_size, heap_elem e)
+{
+ size_t i = 0;
+ size_t lchild = 2*i + 1;
+ size_t n = heap_size;
+ while (lchild < n)
+ {
+ size_t rchild = lchild + 1;
+ size_t nextchild = lchild;
+ if (rchild < n && heap[lchild].p.index < heap[rchild].p.index)
+ nextchild = rchild;
+ if (e.p.index >= heap[nextchild].p.index)
+ break;
+ heap[i] = heap[nextchild];
+ i = nextchild;
+ lchild = 2*i + 1;
+ }
+ heap[i] = e;
+}
+
+
+static void
+heap_remove_top (heap_elem *heap, size_t *heap_size)
+{
+ *heap_size -= 1;
+ if (*heap_size == 0)
+ return;
+ heap_remove_top_and_add (heap, *heap_size, heap[*heap_size]);
+}
+
+static void
+heap_merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+ size_t heap_size = 0;
+ position_set r; /* faster access to local variables */
+ heap_elem *heap = XNMALLOC (n, heap_elem);
+ size_t *ids = XNMALLOC (n, size_t);
+ size_t sum_of_sizes = 0;
+ size_t i;
+
+ for (i = 0; i < n; ++i)
+ {
+ sum_of_sizes += sets[i].nelem;
+ /* special value for "end of set", to reduce
+ dependent memory reads */
+ ids[i] = ~(size_t)0;
+ if (sets[i].nelem)
+ {
+ heap_elem e = { sets[i].elems[0], i };
+ heap_add (heap, &heap_size, e);
+ if (sets[i].nelem > 1)
+ ids[i] = 1;
+ }
+ }
+
+ REALLOC_IF_NECESSARY(result->elems, result->alloc, sum_of_sizes);
+ r = *result;
+ r.nelem = 0;
+
+ while (heap_size)
+ {
+ heap_elem e = heap[0];
+ if (r.nelem && r.elems[r.nelem-1].index == e.p.index)
+ r.elems[r.nelem-1].constraint |= e.p.constraint;
+ else
+ r.elems[r.nelem++] = e.p;
+ i = e.set_index;
+ if (ids[i] != ~(size_t)0)
+ {
+ heap_elem e2 = { sets[i].elems[ids[i]], i };
+ heap_remove_top_and_add (heap, heap_size, e2);
+ ++ids[i];
+ if (ids[i] == sets[i].nelem)
+ ids[i] = ~(size_t)0;
+ }
+ else
+ {
+ heap_remove_top (heap, &heap_size);
+ }
+ }
+ *result = r;
+ free (ids);
+ free (heap);
+}
+
+static void
+merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+ /* optimized solutions for special/trivial cases */
+ if (n == 0)
+ result->nelem = 0;
+ else if (n == 1)
+ copy (&sets[0], result);
+ else if (n == 2)
+ merge (&sets[0], &sets[1], result);
+ else
+ heap_merge_many_sets(sets, n, result);
+}
+
/* Find the index of the state corresponding to the given position set with
the given preceding context, or create a new state if there is no such
state. Context tells whether we got here on a newline or letter. */
@@ -2482,13 +2609,16 @@
charclass leftovers; /* Stuff in the label that didn't match. */
int leftoversf; /* True if leftovers is nonempty. */
position_set follows; /* Union of the follows of some group. */
- position_set tmp; /* Temporary space for merging sets. */
+ position_set follows_tmp; /* Temporary space for merging sets. */
int possible_contexts; /* Contexts that this group can match. */
int separate_contexts; /* Context that new state wants to know. */
state_num state; /* New state. */
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
+ position_set *sets = 0; /* Temporary array of sets, to pass many sets
to merge function */
+ size_t sets_n = 0; /* Sets array size */
+ size_t sets_alloc = 0; /* Sets array capacity */
size_t i, j, k;
MALLOC (grps, NOTCHAR);
@@ -2607,7 +2737,7 @@
}
alloc_position_set (&follows, d->nleaves);
- alloc_position_set (&tmp, d->nleaves);
+ alloc_position_set (&follows_tmp, d->nleaves);
/* If we are a searching matcher, the default transition is to a state
containing the positions of state 0, otherwise the default transition
@@ -2637,13 +2767,12 @@
for (i = 0; i < ngrps; ++i)
{
- follows.nelem = 0;
-
- /* Find the union of the follows of the positions of the group.
- This is a hideously inefficient loop. Fix it someday. */
+ /* Find the union of the follows of the positions of the group. */
+ sets_n = grps[i].nelem;
+ REALLOC_IF_NECESSARY(sets, sets_alloc, sets_n);
for (j = 0; j < grps[i].nelem; ++j)
- for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
- insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+ sets[j] = d->follows[grps[i].elems[j]];
+ merge_many_sets (sets, sets_n, &follows_tmp);
if (d->mb_cur_max > 1)
{
@@ -2666,9 +2795,9 @@
<mb A>, so we cannot add state[0]. */
next_isnt_1st_byte = 0;
- for (j = 0; j < follows.nelem; ++j)
+ for (j = 0; j < follows_tmp.nelem; ++j)
{
- if (!(d->multibyte_prop[follows.elems[j].index] & 1))
+ if (!(d->multibyte_prop[follows_tmp.elems[j].index] & 1))
{
next_isnt_1st_byte = 1;
break;
@@ -2680,8 +2809,7 @@
of state 0 as well. */
if (d->searchflag
&& (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
- for (j = 0; j < d->states[0].elems.nelem; ++j)
- insert (d->states[0].elems.elems[j], &follows);
+ merge (&follows_tmp, &d->states[0].elems, &follows);
/* Find out if the new state will want any context information. */
possible_contexts = charclass_context (labels[i]);
@@ -2720,7 +2848,8 @@
for (i = 0; i < ngrps; ++i)
free (grps[i].elems);
free (follows.elems);
- free (tmp.elems);
+ free (sets);
+ free (follows_tmp.elems);
free (grps);
free (labels);
}