dfamust() uses a log of memory for long pattern.

  $ echo abc | grep -f /usr/share/dict/words

/usr/share/dict/words is about 5MB.  However, dfamust() will require
memory over 100MB.

Norihiro
From 57e7bdc5a7d9ecf3efef979112317894cf26d8bd Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 12 Apr 2014 14:20:11 +0900
Subject: [PATCH] grep: waste of a lot of memory for a large pattern in dfamust

* src/dfa.c (struct must): New member `prev'.  It has the pointer to
previous must.
(allocmust): New function.
(freemust): New function.
(dfamust): Use it.
---
 src/dfa.c | 125 +++++++++++++++++++++++++++++---------------------------------
 1 file changed, 58 insertions(+), 67 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 2b6c5d6..9268faa 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -4012,13 +4012,30 @@ inboth (char **left, char **right)
   return both;
 }
 
-typedef struct
+typedef struct must must;
+
+struct must
 {
   char **in;
   char *left;
   char *right;
   char *is;
-} must;
+  must *prev;
+};
+
+static must *
+allocmust (must *prev)
+{
+  must *mp = xmalloc (sizeof *prev);
+  mp->in = xmalloc (sizeof *mp->in);
+  mp->left = xmalloc (2);
+  mp->right = xmalloc (2);
+  mp->is = xmalloc (2);
+  mp->left[0] = mp->right[0] = mp->is[0] = '\0';
+  mp->in[0] = NULL;
+  mp->prev = prev;
+  return mp;
+}
 
 static void
 resetmust (must * mp)
@@ -4028,42 +4045,30 @@ resetmust (must * mp)
 }
 
 static void
+freemust (must *mp)
+{
+  freelist (mp->in);
+  free (mp->in);
+  free (mp->left);
+  free (mp->right);
+  free (mp->is);
+  free (mp);
+}
+
+static void
 dfamust (struct dfa *d)
 {
-  must *musts;
-  must *mp;
+  must *mp = NULL;
   char *result;
   size_t ri;
   size_t i;
   bool exact;
-  static must must0;
   struct dfamust *dm;
   static char empty_string[] = "";
 
   result = empty_string;
   exact = false;
-  MALLOC (musts, d->tindex + 1);
-  mp = musts;
-  for (i = 0; i <= d->tindex; ++i)
-    mp[i] = must0;
-  for (i = 0; i <= d->tindex; ++i)
-    {
-      mp[i].in = xmalloc (sizeof *mp[i].in);
-      mp[i].left = xmalloc (2);
-      mp[i].right = xmalloc (2);
-      mp[i].is = xmalloc (2);
-      mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
-      mp[i].in[0] = NULL;
-    }
-#ifdef DEBUG
-  fprintf (stderr, "dfamust:\n");
-  for (i = 0; i < d->tindex; ++i)
-    {
-      fprintf (stderr, " %zd:", i);
-      prtok (d->tokens[i]);
-    }
-  putc ('\n', stderr);
-#endif
+
   for (ri = 0; ri < d->tindex; ++ri)
     {
       token t = d->tokens[ri];
@@ -4073,11 +4078,6 @@ dfamust (struct dfa *d)
         case RPAREN:
           assert (!"neither LPAREN nor RPAREN may appear here");
 
-        case STAR:
-        case QMARK:
-          assert (musts < mp);
-          --mp;
-          /* Fall through.  */
         case EMPTY:
         case BEGLINE:
         case ENDLINE:
@@ -4088,19 +4088,24 @@ dfamust (struct dfa *d)
         case BACKREF:
         case ANYCHAR:
         case MBCSET:
+          mp = allocmust (mp);
+          /* Fall through.  */
+        case STAR:
+        case QMARK:
+          assert (mp != NULL);
           resetmust (mp);
           break;
 
         case OR:
-          assert (&musts[2] <= mp);
+          assert (mp && mp->prev);
           {
             char **new;
             must *lmp;
             must *rmp;
             size_t j, ln, rn, n;
 
-            rmp = --mp;
-            lmp = --mp;
+            rmp = mp;
+            lmp = mp = mp->prev;
             /* Guaranteed to be.  Unlikely, but ...  */
             if (!STREQ (lmp->is, rmp->is))
               lmp->is[0] = '\0';
@@ -4127,32 +4132,35 @@ dfamust (struct dfa *d)
             freelist (lmp->in);
             free (lmp->in);
             lmp->in = new;
+            freemust (rmp);
           }
           break;
 
         case PLUS:
-          assert (musts < mp);
-          --mp;
+          assert (mp != NULL);
           mp->is[0] = '\0';
           break;
 
         case END:
-          assert (mp == &musts[1]);
-          for (i = 0; musts[0].in[i] != NULL; ++i)
-            if (strlen (musts[0].in[i]) > strlen (result))
-              result = musts[0].in[i];
-          if (STREQ (result, musts[0].is))
-            exact = true;
+          assert (!mp || !mp->prev);
+          if (mp)
+            {
+              for (i = 0; mp->in[i] != NULL; ++i)
+                if (strlen (mp->in[i]) > strlen (result))
+                  result = mp->in[i];
+              if (STREQ (result, mp->is))
+                exact = true;
+            }
           goto done;
 
         case CAT:
-          assert (&musts[2] <= mp);
+          assert (mp && mp->prev);
           {
             must *lmp;
             must *rmp;
 
-            rmp = --mp;
-            lmp = --mp;
+            rmp = mp;
+            lmp = mp = mp->prev;
             /* In.  Everything in left, plus everything in
                right, plus concatenation of
                left's right and right's left.  */
@@ -4192,6 +4200,7 @@ dfamust (struct dfa *d)
               }
             else
               lmp->is[0] = '\0';
+            freemust (rmp);
           }
           break;
 
@@ -4200,6 +4209,7 @@ dfamust (struct dfa *d)
           goto done;
 
         default:
+          mp = allocmust (mp);
           resetmust (mp);
           if (CSET <= t)
             {
@@ -4231,17 +4241,6 @@ dfamust (struct dfa *d)
             goto done;
           break;
         }
-#ifdef DEBUG
-      fprintf (stderr, " node: %zd:", ri);
-      prtok (d->tokens[ri]);
-      fprintf (stderr, "\n  in:");
-      for (i = 0; mp->in[i]; ++i)
-        fprintf (stderr, " \"%s\"", mp->in[i]);
-      fprintf (stderr, "\n  is: \"%s\"\n", mp->is);
-      fprintf (stderr, "  left: \"%s\"\n", mp->left);
-      fprintf (stderr, "  right: \"%s\"\n", mp->right);
-#endif
-      ++mp;
     }
 done:
   if (strlen (result))
@@ -4252,16 +4251,8 @@ done:
       dm->next = d->musts;
       d->musts = dm;
     }
-  mp = musts;
-  for (i = 0; i <= d->tindex; ++i)
-    {
-      freelist (mp[i].in);
-      free (mp[i].in);
-      free (mp[i].left);
-      free (mp[i].right);
-      free (mp[i].is);
-    }
-  free (mp);
+  if (mp)
+    freemust (mp);
 }
 
 struct dfa *
-- 
1.9.2

Reply via email to