I tried it on grep-2.14, and I confirmed that the patch is effective
certainly.  However, I see that it's fixed in bug#17377.

$ env LC_ALL=C time -p ../grep-2.19/src/grep -E -f regex.re input_lines.txt

grep-2.14
  before: real 7.66  user 7.56  sys 0.09
  after : real 2.06  user 1.96  sys 0.09

grep-2.18.146-ebf3
  before: real 1.59  user 1.50  sys 0.08
  after : real 1.99  user 1.86  sys 0.12

If my rewriting of the patch is right, it causes slowdown rather by
building heap.

Norihiro
diff --git a/src/dfa.c b/src/dfa.c
index 3c9cb75..991b176 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2073,6 +2073,133 @@ merge (position_set const *s1, position_set const *s2, 
position_set * m)
     m->elems[m->nelem++] = s2->elems[j++];
 }
 
+/* Merge implementation of sorted sets, then number of sets smore than 2.
+ * Priority queue aka 'heap' data structure used to select maximum of
+ * many elements */
+
+typedef struct {
+  position p;
+  size_t set_index;   /* Remember which set this element was taken from.  */
+} heap_elem;
+
+static void
+heap_add (heap_elem *heap, size_t *heap_size, heap_elem e)
+{
+  size_t i = *heap_size;
+  while (i)
+    {
+      size_t parent = (i-1) / 2;
+      if (e.p.index < heap[parent].p.index)
+        break;
+
+      heap[i] = heap[parent];
+      i = parent;
+    }
+  heap[i] = e;
+  *heap_size += 1;
+}
+
+static void
+heap_remove_top_and_add (heap_elem *heap, size_t heap_size, heap_elem e)
+{
+  size_t i = 0;
+  size_t lchild  = 2*i + 1;
+  size_t n = heap_size;
+  while (lchild < n)
+    {
+      size_t rchild = lchild + 1;
+      size_t nextchild = lchild;
+      if (rchild < n && heap[lchild].p.index < heap[rchild].p.index)
+        nextchild = rchild;
+      if (e.p.index >= heap[nextchild].p.index)
+        break;
+      heap[i] = heap[nextchild];
+      i = nextchild;
+      lchild = 2*i + 1;
+    }
+  heap[i] = e;
+}
+
+
+static void
+heap_remove_top (heap_elem *heap, size_t *heap_size)
+{
+  *heap_size -= 1;
+  if (*heap_size == 0)
+    return;
+  heap_remove_top_and_add (heap, *heap_size, heap[*heap_size]);
+}
+
+static void
+heap_merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+  size_t heap_size = 0;
+  position_set r; /* faster access to local variables */
+  heap_elem *heap = XNMALLOC (n, heap_elem);
+  size_t *ids = XNMALLOC (n, size_t);
+  size_t sum_of_sizes = 0;
+  size_t i;
+
+  for (i = 0; i < n; ++i)
+    {
+      sum_of_sizes += sets[i].nelem;
+      /* special value for "end of set", to reduce dependent memory
+         reads.  */
+      ids[i] = ~(size_t)0;
+      if (sets[i].nelem)
+        {
+          heap_elem e = { sets[i].elems[0], i };
+          heap_add (heap, &heap_size, e);
+          if (sets[i].nelem > 1)
+            ids[i] = 1;
+        }
+    }
+
+  maybe_realloc (result->elems, sum_of_sizes, &result->alloc,
+                 sizeof *result->elems); 
+  r = *result;
+  r.nelem = 0;
+
+  while (heap_size)
+    {
+      heap_elem e = heap[0];
+      if (r.nelem && r.elems[r.nelem-1].index == e.p.index)
+        r.elems[r.nelem-1].constraint |= e.p.constraint;
+      else
+        r.elems[r.nelem++] = e.p;
+      i = e.set_index;
+      if (ids[i] != ~(size_t)0)
+        {
+          heap_elem e2 = { sets[i].elems[ids[i]], i };
+          heap_remove_top_and_add (heap, heap_size, e2);
+          ++ids[i];
+          if (ids[i] == sets[i].nelem)
+            ids[i] = ~(size_t)0;
+        }
+      else
+        {
+          heap_remove_top (heap, &heap_size);
+        }
+    }
+  *result = r;
+  free (ids);
+  free (heap);
+}
+
+static void
+merge_many_sets (position_set *sets, size_t n, position_set *result)
+{
+  /* optimized solutions for special/trivial cases */
+  if (n == 0)
+    result->nelem = 0;
+  else if (n == 1)
+    copy (&sets[0], result);
+  else if (n == 2)
+    merge (&sets[0], &sets[1], result);
+  else
+    heap_merge_many_sets(sets, n, result);
+}
+
 /* Delete a position from a set.  */
 static void
 delete (position p, position_set * s)
@@ -2574,6 +2701,10 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
   state_num state_newline;      /* New state on a newline transition.  */
   state_num state_letter;       /* New state on a letter transition.  */
   bool next_isnt_1st_byte = false; /* We can't add state0.  */
+  position_set *sets = NULL;    /* Temporary array of sets, to pass many
+                                   sets to merge function.  */
+  size_t sets_n;                /* Sets array size.  */
+  size_t sets_alloc = 0;        /* Sets array capacity.  */
   size_t i, j, k;
 
   zeroset (matches);
@@ -2722,13 +2853,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
   for (i = 0; i < ngrps; ++i)
     {
-      follows.nelem = 0;
-
-      /* Find the union of the follows of the positions of the group.
-         This is a hideously inefficient loop.  Fix it someday.  */
+      /* Find the union of the follows of the positions of the group.  */
+      sets_n = grps[i].nelem;
+      sets = maybe_realloc (sets, sets_n, &sets_alloc, sizeof *sets);
       for (j = 0; j < grps[i].nelem; ++j)
-        for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
-          insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+        sets[j] = d->follows[grps[i].elems[j]];
+      merge_many_sets (sets, sets_n, &follows);
 
       if (d->multibyte)
         {

Reply via email to