On Wed, 5 Sep 2001, Sander Striker wrote:

> > I'm not totally sure I'm sold on this approach being better.  But,
> > I'm not sure that it is any worse either.  Don't have time to
> > benchmark this right now.  I'm going to throw it to the wolves and
> > see what you think.
>
> Me neither.  Rabin-Karp introduces a lot of * and %.
> I'll try Boyer-Moore with precalced tables for '<!--#' and '--->'.

    Well, there are more advanced algorithms than BM available
    today which are even easier to implement that the original BM
    algo.

    I'd suggest looking at BNDM which combines the advantages of
    bit-parallelism (shift-and/-or algorithms) and suffix
    automata (BM-style).

    http://citeseer.nj.nec.com/navarro01fast.html

    To give you an idea on how a bndm implementation looks like,
    I'm appending an unpolished implementation I did some time
    ago which includes a test-case for locating '<!--#'.

    - Sascha                                     Experience IRCG
      http://schumann.cx/                http://schumann.cx/ircg
#include <sys/types.h>
#include <string.h>
#include <stdio.h>

typedef struct {
        unsigned int T[256];
        unsigned int x;
} bndm_t;

static void bndm_compile(bndm_t *t, char *n, int nl)
{
        unsigned int x;
        char *ne = n + nl;
        
        memset(t->T, 0, sizeof(unsigned int) * 256);
        
        for (x = 1; n < ne; x <<= 1)
                t->T[(unsigned char) *n++] |= x;
        
        t->x = x - 1;
        
        return;
}


static inline int bndm(char *n, int nl, char *h, int hl, bndm_t *t)
{
        char *he = h + hl;
        int skip;
        unsigned int d;
        char *p, *pi;
        int matches = 0;
        unsigned int *T = t->T;
        unsigned int x = t->x << 1;

        pi = h - 1; /* pi: p initial */
        p = pi + nl; /* compare window right to left. point to the first char */

        do {
                skip = nl;
                d = x;
                do {
                        d = (d >> 1) & T[(unsigned char) *p--];
                        if ((d & 1)) {
                                if (p != pi) {
                                        skip = p - pi;
                                } else {
                                        matches++;
                                }
                        }
                } while (d > 1);
                p = (pi += skip) + nl; 
        } while (p < he);

        return matches;
}

#if 0

#define PAT "aabbaab"
#define STRX "abbabaabbaab"

#else

#define PAT "<!--#"
#define STRX 
"blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#"


#endif


#define STR STRX STRX STRX STRX

int main(void)
{
        int n = 500000;
        int d;
        bndm_t foo;

        bndm_compile(&foo, PAT, sizeof(PAT)-1); /* compile the pattern */

        while (n--)
                d = bndm(PAT, sizeof(PAT)-1, STR, sizeof(STR)-1, &foo);

        printf("matches: %d\n", d);

        return (0);
}

Reply via email to