[PATCH] Round 2 of mod_include/find_start_sequence...

Justin Erenkrantz Wed, 05 Sep 2001 11:42:35 -0700
Replaced Rabin-Karp with the bndm algorithm as implemented by 
Sascha.  Seems to work.  

Can people please test/review?  =-)  

(I'll take care of the formatting and the pre-compilation if
this looks good...) -- justin

Index: modules/filters/mod_include.c
===================================================================
RCS file: /home/cvs/httpd-2.0/modules/filters/mod_include.c,v
retrieving revision 1.143
diff -u -r1.143 mod_include.c
--- modules/filters/mod_include.c       2001/09/04 02:13:58     1.143
+++ modules/filters/mod_include.c       2001/09/05 19:04:10
@@ -206,6 +206,86 @@
 
 /* --------------------------- Parser functions --------------------------- */
 
+typedef struct {
+       unsigned int T[256];
+       unsigned int x;
+} bndm_t;
+
+static void bndm_compile(bndm_t *t, char *n, int nl)
+{
+       unsigned int x;
+       char *ne = n + nl;
+       
+       memset(t->T, 0, sizeof(unsigned int) * 256);
+       
+       for (x = 1; n < ne; x <<= 1)
+               t->T[(unsigned char) *n++] |= x;
+       
+       t->x = x - 1;
+       
+       return;
+}
+
+static int bndm(const char *n, int nl, const char *h, int hl, bndm_t *t)
+{
+       char *he = h + hl;
+       int skip;
+       unsigned int d;
+       char *p, *pi;
+       unsigned int *T = t->T;
+       unsigned int x = t->x << 1;
+
+       pi = h - 1; /* pi: p initial */
+       p = pi + nl; /* compare window right to left. point to the first char */
+
+       do {
+               skip = nl;
+               d = x;
+               do {
+                       d = (d >> 1) & T[(unsigned char) *p--];
+                       if ((d & 1)) {
+                               if (p != pi) {
+                                       skip = p - pi;
+                               } else {
+                    return p - h + 1;
+                                       /* matches++; */
+                               }
+                       }
+               } while (d > 1);
+               p = (pi += skip) + nl; 
+       } while (p < he);
+
+       return hl;
+}
+
+/* We've now found a start sequence tag... */
+static apr_bucket* found_start_sequence(apr_bucket *dptr,
+                                          include_ctx_t *ctx, 
+                                          int tagStart)
+{
+    /* We want to split the bucket at the '<'. */
+    ctx->state = PARSE_DIRECTIVE;
+    ctx->tag_length = 0;
+    ctx->parse_pos = 0;
+    ctx->tag_start_bucket = dptr;
+    ctx->tag_start_index = tagStart;
+    if (ctx->head_start_index > 0) {
+        apr_bucket *tmp_bkt;
+
+        /* Split the bucket with the start of the tag in it */
+        apr_bucket_split(ctx->head_start_bucket, ctx->head_start_index);
+        tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
+        /* If it was a one bucket match */
+        if (dptr == ctx->head_start_bucket) {
+            ctx->tag_start_bucket = tmp_bkt;
+            ctx->tag_start_index = tagStart - ctx->head_start_index;
+        }
+        ctx->head_start_bucket = tmp_bkt;
+        ctx->head_start_index = 0;
+    }
+    return ctx->head_start_bucket;
+}
+
 /* This function returns either a pointer to the split bucket containing the
  * first byte of the BEGINNING_SEQUENCE (after finding a complete match) or it
  * returns NULL if no match found.
@@ -217,9 +297,13 @@
     const char *c;
     const char *buf;
     const char *str = STARTING_SEQUENCE;
-    apr_bucket *tmp_bkt;
-    apr_size_t  start_index;
+    int slen = strlen(str);
+    bndm_t spat;
+    int pos;
 
+    /* compile the pattern */
+       bndm_compile(&spat, str, slen); 
+
     *do_cleanup = 0;
 
     do {
@@ -269,8 +353,53 @@
         if (len == 0) { /* end of pipe? */
             break;
         }
+
+        /* Set our buffer to use. */
+
         c = buf;
-        while (c < buf + len) {
+        /* The last bucket had a left over partial match that we need to
+         * complete. 
+         */
+        if (ctx->state == PARSE_HEAD)
+        {
+            apr_size_t tmpLen;
+            tmpLen = (len > (slen - 1)) ? len : (slen - 1);
+
+            while (c < buf + tmpLen && *c == str[ctx->parse_pos])
+            {
+                c++; 
+                ctx->parse_pos++;
+            }
+
+            if (str[ctx->parse_pos] == '\0')
+            {
+                ctx->bytes_parsed += c - buf;
+                return found_start_sequence(dptr, ctx, c - buf);
+            }
+
+            /* False alarm... */
+            ctx->state = PRE_HEAD;
+        }
+
+        if (len)
+        {
+                   pos = bndm(str, slen, c, len, &spat);
+            if (pos != len)
+            {
+                ctx->head_start_bucket = dptr;
+                ctx->head_start_index = pos;
+                ctx->bytes_parsed += pos + slen;
+                return found_start_sequence(dptr, ctx, pos + slen);
+            }
+        }
+        
+        /* Consider the case where we have <!-- at the end of the bucket. */
+        ctx->bytes_parsed += len - slen;
+        ctx->parse_pos = 0;
+
+        c = buf + len - slen + 1;
+        while (c < buf + len)
+        {
             if (*c == str[ctx->parse_pos]) {
                 if (ctx->state == PRE_HEAD) {
                     ctx->state = PARSE_HEAD;
@@ -279,48 +408,24 @@
                 }
                 ctx->parse_pos++;
             }
-            else {
-                if (str[ctx->parse_pos] == '\0') {
-                    /* We want to split the bucket at the '<'. */
-                    ctx->bytes_parsed++;
-                    ctx->state = PARSE_DIRECTIVE;
-                    ctx->tag_length = 0;
-                    ctx->parse_pos = 0;
-                    ctx->tag_start_bucket = dptr;
-                    ctx->tag_start_index = c - buf;
-                    if (ctx->head_start_index > 0) {
-                        start_index = (c - buf) - ctx->head_start_index;
-                        apr_bucket_split(ctx->head_start_bucket, 
-                                         ctx->head_start_index);
-                        tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket);
-                        if (dptr == ctx->head_start_bucket) {
-                            ctx->tag_start_bucket = tmp_bkt;
-                            ctx->tag_start_index = start_index;
-                        }
-                        ctx->head_start_bucket = tmp_bkt;
-                        ctx->head_start_index = 0;
-                    }
-                    return ctx->head_start_bucket;
+            else if (ctx->parse_pos != 0) 
+            {
+                /* The reason for this, is that we need to make sure 
+                 * that we catch cases like <<!--#.  This makes the 
+                 * second check after the original check fails.
+                 * If parse_pos was already 0 then we already checked this.
+                 */
+                /* FIXME: Why? */
+                *do_cleanup = 1;
+                if (*c == str[0]) {
+                    ctx->parse_pos = 1;
+                    ctx->head_start_index = c - buf;
                 }
-                else if (ctx->parse_pos != 0) {
-                    /* The reason for this, is that we need to make sure 
-                     * that we catch cases like <<!--#.  This makes the 
-                     * second check after the original check fails.
-                     * If parse_pos was already 0 then we already checked this.
-                     */
-                    *do_cleanup = 1;
-                    if (*c == str[0]) {
-                        ctx->parse_pos = 1;
-                        ctx->state = PARSE_HEAD;
-                        ctx->head_start_bucket = dptr;
-                        ctx->head_start_index = c - buf;
-                    }
-                    else {
-                        ctx->parse_pos = 0;
-                        ctx->state = PRE_HEAD;
-                        ctx->head_start_bucket = NULL;
-                        ctx->head_start_index = 0;
-                    }
+                else {
+                    ctx->parse_pos = 0;
+                    ctx->state = PRE_HEAD;
+                    ctx->head_start_bucket = NULL;
+                    ctx->head_start_index = 0;
                 }
             }
             c++;
[PATCH] Round 2 of mod_include/find_start_sequence...

Reply via email to