On Wed, 5 Sep 2001, Sander Striker wrote: > > I'm not totally sure I'm sold on this approach being better. But, > > I'm not sure that it is any worse either. Don't have time to > > benchmark this right now. I'm going to throw it to the wolves and > > see what you think. > > Me neither. Rabin-Karp introduces a lot of * and %. > I'll try Boyer-Moore with precalced tables for '<!--#' and '--->'. Well, there are more advanced algorithms than BM available today which are even easier to implement that the original BM algo. I'd suggest looking at BNDM which combines the advantages of bit-parallelism (shift-and/-or algorithms) and suffix automata (BM-style). http://citeseer.nj.nec.com/navarro01fast.html To give you an idea on how a bndm implementation looks like, I'm appending an unpolished implementation I did some time ago which includes a test-case for locating '<!--#'. - Sascha Experience IRCG http://schumann.cx/ http://schumann.cx/ircg
#include <sys/types.h> #include <string.h> #include <stdio.h> typedef struct { unsigned int T[256]; unsigned int x; } bndm_t; static void bndm_compile(bndm_t *t, char *n, int nl) { unsigned int x; char *ne = n + nl; memset(t->T, 0, sizeof(unsigned int) * 256); for (x = 1; n < ne; x <<= 1) t->T[(unsigned char) *n++] |= x; t->x = x - 1; return; } static inline int bndm(char *n, int nl, char *h, int hl, bndm_t *t) { char *he = h + hl; int skip; unsigned int d; char *p, *pi; int matches = 0; unsigned int *T = t->T; unsigned int x = t->x << 1; pi = h - 1; /* pi: p initial */ p = pi + nl; /* compare window right to left. point to the first char */ do { skip = nl; d = x; do { d = (d >> 1) & T[(unsigned char) *p--]; if ((d & 1)) { if (p != pi) { skip = p - pi; } else { matches++; } } } while (d > 1); p = (pi += skip) + nl; } while (p < he); return matches; } #if 0 #define PAT "aabbaab" #define STRX "abbabaabbaab" #else #define PAT "<!--#" #define STRX "blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#blafoobarasdsadsadsadsadsad<!--#" #endif #define STR STRX STRX STRX STRX int main(void) { int n = 500000; int d; bndm_t foo; bndm_compile(&foo, PAT, sizeof(PAT)-1); /* compile the pattern */ while (n--) d = bndm(PAT, sizeof(PAT)-1, STR, sizeof(STR)-1, &foo); printf("matches: %d\n", d); return (0); }