Norihiro Tanaka wrote:
In second patch, I changed so that Boyer-Moore algorithm could be used also to case-insensitive matching if MB_CUR_MAX == 1.
Thanks, I merged this patch into the savannah git master (attachment 1), applied a fixup patch for clarity and to fix some minor style issues (attachment 2), and fixed some longstanding kwset memory-allocation infelicities mostly having to do with unecessary code (attachment 3).
From ad03d4eb22be1b907221af78f4a3b4bcd85e1fc2 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Sat, 15 Mar 2014 14:41:52 +0900 Subject: [PATCH 1/3] grep: use the Galil rule for Boyer-Moore algorithm in KWSet The Boyer-Moore algorithm is O(m*n), which means it may be much slower than the DFA. Its Galil rule variant is O(n) and increases efficiency in the typical case; it skips sections that are known to match and does not compare more than once for a position in the text. To use the Galil rule, look for the delta2 shift at each position from the trie instead of the 'mind2' value. * src/kwset.c (struct kwset): Replace member 'mind2' with 'shift'. (kwsprep): Look for the delta2 shift. (bmexec): Use it. --- src/kwset.c | 214 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 127 insertions(+), 87 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 410e046..e0bf6b2 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -83,7 +83,7 @@ struct kwset unsigned char delta[NCHAR]; /* Delta table for rapid search. */ struct trie *next[NCHAR]; /* Table of children of the root. */ char *target; /* Target string if there's only one. */ - int mind2; /* Used in Boyer-Moore search for one string. */ + int *shift; /* Used in Boyer-Moore search for one string. */ char const *trans; /* Character translation table. */ }; @@ -397,12 +397,70 @@ kwsprep (kwset_t kws) node at which an outgoing edge is labeled by that character. */ memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); + struct trie *fail; + struct trie *last, *next[NCHAR]; + + /* Traverse the nodes of the trie in level order, simultaneously + computing the delta table, failure function, and shift function. */ + for (curr = last = kwset->trie; curr; curr = curr->next) + { + /* Enqueue the immediate descendants in the level order queue. */ + enqueue(curr->links, &last); + + curr->shift = kwset->mind; + curr->maxshift = kwset->mind; + + /* Update the delta table for the descendants of this node. */ + treedelta(curr->links, curr->depth, delta); + + /* Compute the failure function for the descendants of this node. */ + treefails(curr->links, curr->fail, kwset->trie); + + /* Update the shifts at each node in the current node's chain + of fails back to the root. */ + for (fail = curr->fail; fail; fail = fail->fail) + { + /* If the current node has some outgoing edge that the fail + doesn't, then the shift at the fail should be no larger + than the difference of their depths. */ + if (!hasevery(fail->links, curr->links)) + if (curr->depth - fail->depth < fail->shift) + fail->shift = curr->depth - fail->depth; + + /* If the current node is accepting then the shift at the + fail and its descendants should be no larger than the + difference of their depths. */ + if (curr->accepting && fail->maxshift > curr->depth - fail->depth) + fail->maxshift = curr->depth - fail->depth; + } + } + + /* Traverse the trie in level order again, fixing up all nodes whose + shift exceeds their inherited maxshift. */ + for (curr = kwset->trie->next; curr; curr = curr->next) + { + if (curr->maxshift > curr->parent->maxshift) + curr->maxshift = curr->parent->maxshift; + if (curr->shift > curr->maxshift) + curr->shift = curr->maxshift; + } + + /* Create a vector, indexed by character code, of the outgoing links + from the root node. */ + for (i = 0; i < NCHAR; ++i) + next[i] = NULL; + treenext(kwset->trie->links, next); + + if ((trans = kwset->trans) != NULL) + for (i = 0; i < NCHAR; ++i) + kwset->next[i] = next[U(trans[i])]; + else + memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); + /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ if (kwset->words == 1 && kwset->trans == NULL) { - char c; - /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); if (!kwset->target) @@ -410,80 +468,22 @@ kwsprep (kwset_t kws) for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) { kwset->target[i] = curr->links->label; - curr = curr->links->trie; + curr = curr->next; } - /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ - for (i = 0; i < kwset->mind; ++i) - delta[U(kwset->target[i])] = kwset->mind - (i + 1); - /* Find the minimal delta2 shift that we might make after - a backwards match has failed. */ - c = kwset->target[kwset->mind - 1]; - for (i = kwset->mind - 2; i >= 0; --i) - if (kwset->target[i] == c) - break; - kwset->mind2 = kwset->mind - (i + 1); - } - else - { - struct trie *fail; - struct trie *last, *next[NCHAR]; - - /* Traverse the nodes of the trie in level order, simultaneously - computing the delta table, failure function, and shift function. */ - for (curr = last = kwset->trie; curr; curr = curr->next) + /* Looking for the delta2 shift that we might make after a + backwards match has failed. Extract it from the trie. */ + if (kwset->mind > 1) { - /* Enqueue the immediate descendants in the level order queue. */ - enqueue(curr->links, &last); - - curr->shift = kwset->mind; - curr->maxshift = kwset->mind; - - /* Update the delta table for the descendants of this node. */ - treedelta(curr->links, curr->depth, delta); - - /* Compute the failure function for the descendants of this node. */ - treefails(curr->links, curr->fail, kwset->trie); - - /* Update the shifts at each node in the current node's chain - of fails back to the root. */ - for (fail = curr->fail; fail; fail = fail->fail) + kwset->shift = obstack_alloc(&kwset->obstack, + sizeof (*kwset->shift) * (kwset->mind - 1)); + if (!kwset->shift) + return _("memory exhausted"); + for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) { - /* If the current node has some outgoing edge that the fail - doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery(fail->links, curr->links)) - if (curr->depth - fail->depth < fail->shift) - fail->shift = curr->depth - fail->depth; - - /* If the current node is accepting then the shift at the - fail and its descendants should be no larger than the - difference of their depths. */ - if (curr->accepting && fail->maxshift > curr->depth - fail->depth) - fail->maxshift = curr->depth - fail->depth; + kwset->shift[i] = curr->shift; + curr = curr->next; } } - - /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ - for (curr = kwset->trie->next; curr; curr = curr->next) - { - if (curr->maxshift > curr->parent->maxshift) - curr->maxshift = curr->parent->maxshift; - if (curr->shift > curr->maxshift) - curr->shift = curr->maxshift; - } - - /* Create a vector, indexed by character code, of the outgoing links - from the root node. */ - for (i = 0; i < NCHAR; ++i) - next[i] = NULL; - treenext(kwset->trie->links, next); - - if ((trans = kwset->trans) != NULL) - for (i = 0; i < NCHAR; ++i) - kwset->next[i] = next[U(trans[i])]; - else - memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); } /* Fix things up for any translation table. */ @@ -503,7 +503,8 @@ bmexec (kwset_t kws, char const *text, size_t size) struct kwset const *kwset; unsigned char const *d1; char const *ep, *sp, *tp; - int d, gc, i, len, md2; + int d, i, len, skip; + char gc1, gc2; kwset = (struct kwset const *) kws; len = kwset->mind; @@ -520,9 +521,10 @@ bmexec (kwset_t kws, char const *text, size_t size) d1 = kwset->delta; sp = kwset->target + len; - gc = U(sp[-2]); - md2 = kwset->mind2; + gc1 = sp[-1]; + gc2 = sp[-2]; tp = text + len; + skip = 0; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ if (size > 12 * len) @@ -550,14 +552,33 @@ bmexec (kwset_t kws, char const *text, size_t size) } break; found: - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - tp += md2; } /* Now we have only a few characters left to search. We @@ -569,14 +590,33 @@ bmexec (kwset_t kws, char const *text, size_t size) d = d1[U((tp += d)[-1])]; if (d != 0) continue; - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - d = md2; } return -1; -- 1.9.0
From 6546f25525f2912c0733e1e19e4e38e410df28b0 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Mon, 7 Apr 2014 12:18:33 -0700 Subject: [PATCH 2/3] grep: minor cleanups for Galil speedups * src/kwset.c: Update citations. Include stdbool.h. (kwsincr, kwsprep): Clarify by using C99 decls after statements. (kwsprep): Clarify by using MIN. Avoid a couple of buffer copies when !TRANS. (bmexec): Use bool for boolean. Prefer "continue;" to ";". --- src/kwset.c | 121 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 59 insertions(+), 62 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index e0bf6b2..6c07249 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -23,13 +23,16 @@ /* The algorithm implemented by these routines bears a startling resemblance to one discovered by Beate Commentz-Walter, although it is not identical. - See "A String Matching Algorithm Fast on the Average," Technical Report, - IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 - Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient - String Matching: An Aid to Bibliographic Search," CACM June 1975, - Vol. 18, No. 6, which describes the failure function used below. */ + See: Commentz-Walter B. A string matching algorithm fast on the average. + Lecture Notes in Computer Science 71 (1979), 118-32 + <http://dx.doi.org/10.1007/3-540-09510-1_10>. + See also: Aho AV, Corasick MJ. Efficient string matching: an aid to + bibliographic search. CACM 18, 6 (1975), 333-40 + <http://dx.doi.org/10.1145/360825.360855>, which describes the + failure function used below. */ #include <config.h> +#include <stdbool.h> #include <sys/types.h> #include "system.h" #include "kwset.h" @@ -131,32 +134,28 @@ kwsalloc (char const *trans) const char * kwsincr (kwset_t kws, char const *text, size_t len) { - struct kwset *kwset; - struct trie *trie; - unsigned char label; - struct tree *link; - int depth; - struct tree *links[DEPTH_SIZE]; - enum { L, R } dirs[DEPTH_SIZE]; - struct tree *t, *r, *l, *rl, *lr; + struct kwset *kwset = kws; + struct trie *trie = kwset->trie; + char const *trans = kwset->trans; - kwset = (struct kwset *) kws; - trie = kwset->trie; text += len; /* Descend the trie (built of reversed keywords) character-by-character, installing new nodes when necessary. */ while (len--) { - label = kwset->trans ? kwset->trans[U(*--text)] : *--text; + unsigned char uc = *--text; + unsigned char label = trans ? trans[uc] : uc; /* Descend the tree of outgoing links for this trie node, looking for the current character and keeping track of the path followed. */ - link = trie->links; + struct tree *link = trie->links; + struct tree *links[DEPTH_SIZE]; + enum { L, R } dirs[DEPTH_SIZE]; links[0] = (struct tree *) &trie->links; dirs[0] = L; - depth = 1; + int depth = 1; while (link && label != link->label) { @@ -215,6 +214,8 @@ kwsincr (kwset_t kws, char const *text, size_t len) if (depth && ((dirs[depth] == L && --links[depth]->balance) || (dirs[depth] == R && ++links[depth]->balance))) { + struct tree *t, *r, *l, *rl, *lr; + switch (links[depth]->balance) { case (char) -2: @@ -384,59 +385,56 @@ treenext (struct tree const *tree, struct trie *next[]) const char * kwsprep (kwset_t kws) { - struct kwset *kwset; + struct kwset *kwset = kws; + char const *trans = kwset->trans; int i; - struct trie *curr; - char const *trans; - unsigned char delta[NCHAR]; - - kwset = (struct kwset *) kws; + unsigned char deltabuf[NCHAR]; + unsigned char *delta = trans ? deltabuf : kwset->delta; /* Initial values for the delta table; will be changed later. The delta entry for a given character is the smallest depth of any node at which an outgoing edge is labeled by that character. */ - memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); - - struct trie *fail; - struct trie *last, *next[NCHAR]; + memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf); /* Traverse the nodes of the trie in level order, simultaneously - computing the delta table, failure function, and shift function. */ + computing the delta table, failure function, and shift function. */ + struct trie *curr, *last; for (curr = last = kwset->trie; curr; curr = curr->next) { - /* Enqueue the immediate descendants in the level order queue. */ - enqueue(curr->links, &last); + /* Enqueue the immediate descendants in the level order queue. */ + enqueue (curr->links, &last); curr->shift = kwset->mind; curr->maxshift = kwset->mind; - /* Update the delta table for the descendants of this node. */ - treedelta(curr->links, curr->depth, delta); + /* Update the delta table for the descendants of this node. */ + treedelta (curr->links, curr->depth, delta); /* Compute the failure function for the descendants of this node. */ - treefails(curr->links, curr->fail, kwset->trie); + treefails (curr->links, curr->fail, kwset->trie); /* Update the shifts at each node in the current node's chain - of fails back to the root. */ + of fails back to the root. */ + struct trie *fail; for (fail = curr->fail; fail; fail = fail->fail) { /* If the current node has some outgoing edge that the fail doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery(fail->links, curr->links)) + than the difference of their depths. */ + if (!hasevery (fail->links, curr->links)) if (curr->depth - fail->depth < fail->shift) fail->shift = curr->depth - fail->depth; /* If the current node is accepting then the shift at the fail and its descendants should be no larger than the - difference of their depths. */ + difference of their depths. */ if (curr->accepting && fail->maxshift > curr->depth - fail->depth) fail->maxshift = curr->depth - fail->depth; } } /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ + shift exceeds their inherited maxshift. */ for (curr = kwset->trie->next; curr; curr = curr->next) { if (curr->maxshift > curr->parent->maxshift) @@ -446,20 +444,18 @@ kwsprep (kwset_t kws) } /* Create a vector, indexed by character code, of the outgoing links - from the root node. */ - for (i = 0; i < NCHAR; ++i) - next[i] = NULL; - treenext(kwset->trie->links, next); - - if ((trans = kwset->trans) != NULL) + from the root node. */ + struct trie *nextbuf[NCHAR]; + struct trie **next = trans ? nextbuf : kwset->next; + memset (next, 0, sizeof nextbuf); + treenext (kwset->trie->links, next); + if (trans) for (i = 0; i < NCHAR; ++i) kwset->next[i] = next[U(trans[i])]; - else - memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ - if (kwset->words == 1 && kwset->trans == NULL) + if (!trans && kwset->words == 1) { /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); @@ -471,11 +467,12 @@ kwsprep (kwset_t kws) curr = curr->next; } /* Looking for the delta2 shift that we might make after a - backwards match has failed. Extract it from the trie. */ + backwards match has failed. Extract it from the trie. */ if (kwset->mind > 1) { - kwset->shift = obstack_alloc(&kwset->obstack, - sizeof (*kwset->shift) * (kwset->mind - 1)); + kwset->shift + = obstack_alloc (&kwset->obstack, + sizeof *kwset->shift * (kwset->mind - 1)); if (!kwset->shift) return _("memory exhausted"); for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) @@ -487,11 +484,9 @@ kwsprep (kwset_t kws) } /* Fix things up for any translation table. */ - if ((trans = kwset->trans) != NULL) + if (trans) for (i = 0; i < NCHAR; ++i) kwset->delta[i] = delta[U(trans[i])]; - else - memcpy(kwset->delta, delta, NCHAR); return NULL; } @@ -553,16 +548,16 @@ bmexec (kwset_t kws, char const *text, size_t size) break; found: d = len; - while (1) + while (true) { if (tp[-2] == gc2) { for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - ; + continue; if (i > d) { for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) - ; + continue; if (i > len) return tp - len - text; } @@ -591,16 +586,16 @@ bmexec (kwset_t kws, char const *text, size_t size) if (d != 0) continue; d = len; - while (1) + while (true) { if (tp[-2] == gc2) { for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - ; + continue; if (i > d) { for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) - ; + continue; if (i > len) return tp - len - text; } @@ -691,7 +686,8 @@ cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) d = trie->shift; while (beg > text) { - c = trans ? trans[U(*--beg)] : *--beg; + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; tree = trie->links; while (tree && c != tree->label) if (c < tree->label) @@ -742,7 +738,8 @@ cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) d = trie->shift; while (beg > text) { - c = trans ? trans[U(*--beg)] : *--beg; + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; tree = trie->links; while (tree && c != tree->label) if (c < tree->label) -- 1.9.0
From 22af87b1a8de1958d504b5e6e17553ee11930b6c Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Mon, 7 Apr 2014 19:48:33 -0700 Subject: [PATCH 3/3] grep: simplify memory allocation in kwset * src/kwset.c: Include kwset.h first, to check its prereqs. Include xalloc.h, for xmalloc. (kwsalloc): Use xmalloc, not malloc, so that the caller need not worry about memory allocation failure. (kwsalloc, kwsincr, kwsprep): Do not worry about obstack_alloc returning NULL, as that's not possible. (kwsalloc, kwsincr, kwsprep, bmexec, cwexec, kwsexec, kwsfree): Omit unnecessary conversion between struct kwset * and kwset_t. (kwsincr, kwsprep): Return void since memory-allocation failure is not possible now. All uses changed. * src/kwset.h: Include <stddef.h>, for size_t, so that this include file doesn't require other files to be included first. --- src/dfasearch.c | 10 ++----- src/kwsearch.c | 7 ++--- src/kwset.c | 93 ++++++++++++++++++--------------------------------------- src/kwset.h | 17 +++++------ 4 files changed, 42 insertions(+), 85 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index adec4e2..44360b6 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -90,7 +90,6 @@ kwsmusts (void) struct dfamust const *dm = dfamusts (dfa); if (dm) { - char const *err; kwsinit (&kwset); /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index @@ -100,8 +99,7 @@ kwsmusts (void) if (!dm->exact) continue; ++kwset_exact_matches; - if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + kwsincr (kwset, dm->must, strlen (dm->must)); } /* Now, we compile the substrings that will require the use of the regexp matcher. */ @@ -109,11 +107,9 @@ kwsmusts (void) { if (dm->exact) continue; - if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + kwsincr (kwset, dm->must, strlen (dm->must)); } - if ((err = kwsprep (kwset)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + kwsprep (kwset); } } diff --git a/src/kwsearch.c b/src/kwsearch.c index dd01518..df94951 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -32,7 +32,6 @@ static kwset_t kwset; void Fcompile (char const *pattern, size_t size) { - char const *err; size_t psize = size; mb_len_map_t *map = NULL; char const *pat = (match_icase && MB_CUR_MAX > 1 @@ -65,14 +64,12 @@ Fcompile (char const *pattern, size_t size) #endif } - if ((err = kwsincr (kwset, beg, end - beg)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + kwsincr (kwset, beg, end - beg); beg = lim; } while (beg < pat + psize); - if ((err = kwsprep (kwset)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + kwsprep (kwset); } /* Apply the MAP (created by mbtoupper) to the uppercase-buffer-relative diff --git a/src/kwset.c b/src/kwset.c index 6c07249..d5db12f 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -32,11 +32,14 @@ failure function used below. */ #include <config.h> + +#include "kwset.h" + #include <stdbool.h> #include <sys/types.h> #include "system.h" -#include "kwset.h" #include "obstack.h" +#include "xalloc.h" #define link kwset_link @@ -91,25 +94,15 @@ struct kwset }; /* Allocate and initialize a keyword set object, returning an opaque - pointer to it. Return NULL if memory is not available. */ + pointer to it. */ kwset_t kwsalloc (char const *trans) { - struct kwset *kwset; + struct kwset *kwset = xmalloc (sizeof *kwset); - kwset = (struct kwset *) malloc(sizeof (struct kwset)); - if (!kwset) - return NULL; - - obstack_init(&kwset->obstack); + obstack_init (&kwset->obstack); kwset->words = 0; - kwset->trie - = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); - if (!kwset->trie) - { - kwsfree((kwset_t) kwset); - return NULL; - } + kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie); kwset->trie->accepting = 0; kwset->trie->links = NULL; kwset->trie->parent = NULL; @@ -122,19 +115,17 @@ kwsalloc (char const *trans) kwset->target = NULL; kwset->trans = trans; - return (kwset_t) kwset; + return kwset; } /* This upper bound is valid for CHAR_BIT >= 4 and exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ #define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2) -/* Add the given string to the contents of the keyword set. Return NULL - for success, an error message otherwise. */ -const char * -kwsincr (kwset_t kws, char const *text, size_t len) +/* Add the given string to the contents of the keyword set. */ +void +kwsincr (kwset_t kwset, char const *text, size_t len) { - struct kwset *kwset = kws; struct trie *trie = kwset->trie; char const *trans = kwset->trans; @@ -171,19 +162,10 @@ kwsincr (kwset_t kws, char const *text, size_t len) a link in the current trie node's tree. */ if (!link) { - link = (struct tree *) obstack_alloc(&kwset->obstack, - sizeof (struct tree)); - if (!link) - return _("memory exhausted"); + link = obstack_alloc (&kwset->obstack, sizeof *link); link->llink = NULL; link->rlink = NULL; - link->trie = (struct trie *) obstack_alloc(&kwset->obstack, - sizeof (struct trie)); - if (!link->trie) - { - obstack_free(&kwset->obstack, link); - return _("memory exhausted"); - } + link->trie = obstack_alloc (&kwset->obstack, sizeof *link->trie); link->trie->accepting = 0; link->trie->links = NULL; link->trie->parent = trie; @@ -283,8 +265,6 @@ kwsincr (kwset_t kws, char const *text, size_t len) kwset->mind = trie->depth; if (trie->depth > kwset->maxd) kwset->maxd = trie->depth; - - return NULL; } /* Enqueue the trie nodes referenced from the given tree in the @@ -382,10 +362,9 @@ treenext (struct tree const *tree, struct trie *next[]) /* Compute the shift for each trie node, as well as the delta table and next cache for the given keyword set. */ -const char * -kwsprep (kwset_t kws) +void +kwsprep (kwset_t kwset) { - struct kwset *kwset = kws; char const *trans = kwset->trans; int i; unsigned char deltabuf[NCHAR]; @@ -458,9 +437,7 @@ kwsprep (kwset_t kws) if (!trans && kwset->words == 1) { /* Looking for just one string. Extract it from the trie. */ - kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); - if (!kwset->target) - return _("memory exhausted"); + kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) { kwset->target[i] = curr->links->label; @@ -473,8 +450,6 @@ kwsprep (kwset_t kws) kwset->shift = obstack_alloc (&kwset->obstack, sizeof *kwset->shift * (kwset->mind - 1)); - if (!kwset->shift) - return _("memory exhausted"); for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) { kwset->shift[i] = curr->shift; @@ -487,22 +462,17 @@ kwsprep (kwset_t kws) if (trans) for (i = 0; i < NCHAR; ++i) kwset->delta[i] = delta[U(trans[i])]; - - return NULL; } /* Fast boyer-moore search. */ static size_t _GL_ATTRIBUTE_PURE -bmexec (kwset_t kws, char const *text, size_t size) +bmexec (kwset_t kwset, char const *text, size_t size) { - struct kwset const *kwset; unsigned char const *d1; char const *ep, *sp, *tp; - int d, i, len, skip; + int d, i, skip; char gc1, gc2; - - kwset = (struct kwset const *) kws; - len = kwset->mind; + int len = kwset->mind; if (len == 0) return 0; @@ -619,9 +589,8 @@ bmexec (kwset_t kws, char const *text, size_t size) /* Hairy multiple string search. */ static size_t _GL_ARG_NONNULL ((4)) -cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) +cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) { - struct kwset const *kwset; struct trie * const *next; struct trie const *trie; struct trie const *accept; @@ -638,7 +607,6 @@ cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) #endif /* Initialize register copies and look for easy ways out. */ - kwset = (struct kwset *) kws; if (len < kwset->mind) return -1; next = kwset->next; @@ -775,18 +743,18 @@ cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) return mch - text; } -/* Search TEXT for a match of any member of the keyword set, KWS. +/* Search TEXT for a match of any member of KWSET. Return the offset (into TEXT) of the first byte of the matching substring, or (size_t) -1 if no match is found. Upon a match, store details in *KWSMATCH: index of matched keyword, start offset (same as the return value), and length. */ size_t -kwsexec (kwset_t kws, char const *text, size_t size, struct kwsmatch *kwsmatch) +kwsexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) { - struct kwset const *kwset = (struct kwset *) kws; if (kwset->words == 1 && kwset->trans == NULL) { - size_t ret = bmexec (kws, text, size); + size_t ret = bmexec (kwset, text, size); if (ret != (size_t) -1) { kwsmatch->index = 0; @@ -796,16 +764,13 @@ kwsexec (kwset_t kws, char const *text, size_t size, struct kwsmatch *kwsmatch) return ret; } else - return cwexec(kws, text, size, kwsmatch); + return cwexec (kwset, text, size, kwsmatch); } /* Free the components of the given keyword set. */ void -kwsfree (kwset_t kws) +kwsfree (kwset_t kwset) { - struct kwset *kwset; - - kwset = (struct kwset *) kws; - obstack_free(&kwset->obstack, NULL); - free(kws); + obstack_free (&kwset->obstack, NULL); + free (kwset); } diff --git a/src/kwset.h b/src/kwset.h index 2d3cdd9..12afb8e 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -21,6 +21,8 @@ The author may be reached (Email) at the address [email protected], or (US mail) as Mike Haertel c/o Free Software Foundation. */ +#include <stddef.h> + struct kwsmatch { size_t index; /* Index number of matching keyword. */ @@ -33,20 +35,17 @@ struct kwsmatch struct kwset; typedef struct kwset *kwset_t; -/* Return an opaque pointer to a newly allocated keyword set, or NULL - if enough memory cannot be obtained. The argument if non-NULL +/* Return an opaque pointer to a newly allocated keyword set. A nonnull arg specifies a table of character translations to be applied to all - pattern and search text. */ + pattern and search text. */ extern kwset_t kwsalloc (char const *); /* Incrementally extend the keyword set to include the given string. - Return NULL for success, or an error message. Remember an index - number for each keyword included in the set. */ -extern const char *kwsincr (kwset_t, char const *, size_t); + Remember an index number for each keyword included in the set. */ +extern void kwsincr (kwset_t, char const *, size_t); -/* When the keyword set has been completely built, prepare it for - use. Return NULL for success, or an error message. */ -extern const char *kwsprep (kwset_t); +/* When the keyword set has been completely built, prepare it for use. */ +extern void kwsprep (kwset_t); /* Search through the given buffer for a member of the keyword set. Return a pointer to the leftmost longest match found, or NULL if -- 1.9.0
