Boyer-Moore algorithm is faster than Beate Commentz-Walter, in especially worst case, because Galil rule is applicable only for Boyer-Moore algorithm.
This patch enables to use Boyer-Moore algorithm for case-insensitive matching. Norihiro
From 069d55705973ddecbd5f04a41097893224d55095 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Thu, 20 Mar 2014 08:07:25 +0900 Subject: [PATCH 1/2] grep: may also use Boyer-Moore algorithm for case-insensitive matching * src/kwset.c (bmexec): Use character translation table. (kwsexec): Call bmexec for case-insensitive matching. (kwsprep): Change the `if' condition. --- src/kwset.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 173 insertions(+), 42 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index d5db12f..8d8aa07 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -434,7 +434,7 @@ kwsprep (kwset_t kwset) /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ - if (!trans && kwset->words == 1) + if (kwset->words == 1) { /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); @@ -473,6 +473,7 @@ bmexec (kwset_t kwset, char const *text, size_t size) int d, i, skip; char gc1, gc2; int len = kwset->mind; + char const *trans = kwset->trans; if (len == 0) return 0; @@ -480,15 +481,33 @@ bmexec (kwset_t kwset, char const *text, size_t size) return -1; if (len == 1) { - tp = memchr (text, kwset->target[0], size); - return tp ? tp - text : -1; + if (trans) + { + for (tp = text; tp < text + size; tp++) + if (trans[U(*tp)] == kwset->target[0]) + return tp - text; + return -1; + } + else + { + tp = memchr (text, kwset->target[0], size); + return tp ? tp - text : -1; + } } d1 = kwset->delta; sp = kwset->target + len; - gc1 = sp[-1]; - gc2 = sp[-2]; tp = text + len; + if (trans) + { + gc1 = trans[U(sp[-1])]; + gc2 = trans[U(sp[-2])]; + } + else + { + gc1 = sp[-1]; + gc2 = sp[-2]; + } skip = 0; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ @@ -518,30 +537,86 @@ bmexec (kwset_t kwset, char const *text, size_t size) break; found: d = len; - while (true) + if (trans) { - if (tp[-2] == gc2) + while (true) { - for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - continue; - if (i > d) + if (trans[U(tp[-2])] == gc2) { - for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) - continue; - if (i > len) - return tp - len - text; + for (i = 3; i <= d; ++i) + if (trans[U(tp[-i])] != trans[U(sp[-i])]) + break; + if (i > d) + { + for (i = d + skip + 1; i <= len; ++i) + if (trans[U(tp[-i])] != trans[U(sp[-i])]) + break; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep) + break; + if (trans[U(tp[-1])] != gc1) + { + d = d1[U(tp[-1])], tp += d; + break; + } + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep) + break; + if (trans[U(tp[-1])] != gc1) + { + d = d1[U(tp[-1])], tp += d; + break; + } + skip = 1; } - d = kwset->shift[i - 2]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = i - 1; } - else + } + else + { + while (true) { - d = kwset->shift[0]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = 1; + if (tp[-2] == gc2) + { + for (i = 3; i <= d; ++i) + if (tp[-i] != sp[-i]) + break; + if (i > d) + { + for (i = d + skip + 1; i <= len; ++i) + if (tp[-i] != sp[-i]) + break; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep) + break; + if (tp[-1] != gc1) + { + d = d1[U(tp[-1])], tp += d; + break; + } + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep) + break; + if (tp[-1] != gc1) + { + d = d1[U(tp[-1])], tp += d; + break; + } + skip = 1; + } } } } @@ -556,30 +631,86 @@ bmexec (kwset_t kwset, char const *text, size_t size) if (d != 0) continue; d = len; - while (true) + if (trans) { - if (tp[-2] == gc2) + while (true) { - for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - continue; - if (i > d) + if (trans[U(tp[-2])] == gc2) + { + for (i = 3; i <= d; ++i) + if (trans[U(tp[-i])] != trans[U(sp[-i])]) + break; + if (i > d) + { + for (i = d + skip + 1; i <= len; ++i) + if (trans[U(tp[-i])] != trans[U(sp[-i])]) + break; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep) + break; + if (trans[U(tp[-1])] != gc1) + { + d = d1[U(tp[-1])]; + break; + } + skip = i - 1; + } + else { - for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) - continue; - if (i > len) - return tp - len - text; + d = kwset->shift[0]; tp += d; + if (tp > ep) + break; + if (trans[U(tp[-1])] != gc1) + { + d = d1[U(tp[-1])]; + break; + } + skip = 1; } - d = kwset->shift[i - 2]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = i - 1; } - else + } + else + { + while (true) { - d = kwset->shift[0]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = 1; + if (tp[-2] == gc2) + { + for (i = 3; i <= d; ++i) + if (tp[-i] != sp[-i]) + break; + if (i > d) + { + for (i = d + skip + 1; i <= len; ++i) + if (tp[-i] != sp[-i]) + break; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep) + break; + if (tp[-1] != gc1) + { + d = d1[U(tp[-1])]; + break; + } + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep) + break; + if (tp[-1] != gc1) + { + d = d1[U(tp[-1])]; + break; + } + skip = 1; + } } } } @@ -752,7 +883,7 @@ size_t kwsexec (kwset_t kwset, char const *text, size_t size, struct kwsmatch *kwsmatch) { - if (kwset->words == 1 && kwset->trans == NULL) + if (kwset->words == 1) { size_t ret = bmexec (kwset, text, size); if (ret != (size_t) -1) -- 1.9.1
