c-bindings-wip2 - Implement POSIX RegexTokenizer

nwellnhof Sat, 02 Mar 2013 12:06:41 -0800

Updated Branches:
  refs/heads/c-bindings-wip2 [created] 24d06ccd8


Implement POSIX RegexTokenizer


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/24d06ccd
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/24d06ccd
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/24d06ccd

Branch: refs/heads/c-bindings-wip2
Commit: 24d06ccd88d6d093dbf54638cbc8b9e81f770d34
Parents: 0fc39d5
Author: Nick Wellnhofer <[email protected]>
Authored: Sat Mar 2 21:03:52 2013 +0100
Committer: Nick Wellnhofer <[email protected]>
Committed: Sat Mar 2 21:03:52 2013 +0100

----------------------------------------------------------------------
 c/src/Lucy/Analysis/RegexTokenizer.c |  114 ++++++++++++++++++++++++++++-
 1 files changed, 110 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/24d06ccd/c/src/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index 2f21afb..46b0fe4 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -17,15 +17,62 @@
 #define C_LUCY_REGEXTOKENIZER
 
 #include "CFBind.h"
+
+#include <string.h>
+
+#if defined(CHY_HAS_REGEX_H)
+  #include <regex.h>
+#elif defined(CHY_HAS_PCREPOSIX_H)
+  #include <pcreposix.h>
+#else
+  #error No regex headers found.
+#endif
+
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
 
+// Forward declaration: counts the code points in the first `len` bytes of
+// `string`, stepping by the lengths in lucy_StrHelp_UTF8_COUNT.
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
 lucy_RegexTokenizer*
 lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
                          const lucy_CharBuf *pattern) {
-    THROW(LUCY_ERR, "TODO");
-    UNREACHABLE_RETURN(lucy_RegexTokenizer*);
+    /* Initialize `self` with a compiled POSIX extended regex.
+     *
+     * `pattern` may be NULL, in which case a default expression is used
+     * (runs of alphanumerics, optionally joined by apostrophes). Throws
+     * LUCY_ERR carrying the regerror() message if compilation fails.
+     * Returns `self`.
+     */
+    lucy_Analyzer_init((lucy_Analyzer*)self);
+
+    // Keep a private copy of the pattern and compile from that copy.
+    const char *pattern_ptr;
+    if (pattern) {
+        self->pattern = Lucy_CB_Clone(pattern);
+        pattern_ptr = (char*)Lucy_CB_Get_Ptr8(self->pattern);
+    }
+    else {
+        pattern_ptr = "[[:alnum:]]+('[[:alnum:]]+)*";
+        self->pattern
+            = lucy_CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
+    }
+
+    // TODO: Make sure that we use a UTF-8 locale.
+
+    int flags = REG_EXTENDED;
+#ifdef CHY_HAS_REG_ENHANCED
+    flags |= REG_ENHANCED;
+#endif
+    regex_t *re = (regex_t*)LUCY_MALLOCATE(sizeof(regex_t));
+    int errcode = regcomp(re, pattern_ptr, flags);
+    if (errcode) {
+        // Format into a stack buffer: THROW does not return, so a heap
+        // buffer could never be released. regerror() truncates to fit.
+        char errbuf[256];
+        regerror(errcode, re, errbuf, sizeof(errbuf));
+        // POSIX leaves `re` undefined after a failed regcomp(), so it must
+        // not be handed to regfree(); only the struct itself is released.
+        LUCY_FREEMEM(re);
+        THROW(LUCY_ERR, "%s", errbuf);
+    }
+
+    self->token_re = re;
+
+    return self;
 }
 
 void
@@ -35,14 +82,73 @@ lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
 
 void
 lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
-    THROW(LUCY_ERR, "TODO");
+    // Release the compiled expression, if one is installed.
+    regex_t *compiled = (regex_t*)self->token_re;
+    if (compiled != NULL) {
+        regfree(compiled);
+        LUCY_FREEMEM(compiled);
+    }
+
+    // Drop this object's reference to the pattern, then hand off to
+    // LUCY_SUPER_DESTROY.
+    CFISH_DECREF(self->pattern);
+    LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER);
 }
 
 void
 lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
                                  const char *string, size_t string_len,
                                  lucy_Inversion *inversion) {
-    THROW(LUCY_ERR, "TODO");
+    /* Append one Token to `inversion` for every non-empty match of the
+     * compiled regex in the first `string_len` bytes of `string`. Token
+     * offsets are measured in code points, not bytes. Throws LUCY_ERR with
+     * the regerror() message if regexec() reports anything other than
+     * REG_NOMATCH.
+     *
+     * regexec() needs a NUL-terminated subject, so matching runs on a
+     * private copy. This could be avoided by using the non-standard
+     * REG_STARTEND flag.
+     */
+    char *c_string = (char*)LUCY_MALLOCATE(string_len + 1);
+    memcpy(c_string, string, string_len);
+    c_string[string_len] = '\0';
+
+    regex_t    *re  = (regex_t*)self->token_re;
+    const char *ptr = c_string;
+    uint32_t    off = 0; // Code points consumed before `ptr`.
+    regmatch_t  match;
+
+    int errcode = regexec(re, ptr, 1, &match, 0);
+    while (errcode == 0) {
+        const char *match_start = ptr + match.rm_so;
+        size_t      match_len   = (size_t)(match.rm_eo - match.rm_so);
+        uint32_t    start_off   = off + S_count_code_points(ptr, match.rm_so);
+
+        if (match_len == 0) {
+            // An empty match (e.g. pattern "a*") would leave `ptr` where it
+            // is and loop forever. Emit no token and step over one code
+            // point by hand; at the terminator there is nothing left.
+            if (*match_start == '\0') { break; }
+            const char *next = match_start + 1;
+            // Skip UTF-8 continuation bytes (10xxxxxx); the NUL terminator
+            // is not one, so this cannot run past the buffer.
+            while (((uint8_t)*next & 0xC0) == 0x80) { ++next; }
+            ptr = next;
+            off = start_off + 1;
+        }
+        else {
+            uint32_t end_off
+                = start_off + S_count_code_points(match_start, match_len);
+
+            // Add a token to the new inversion.
+            lucy_Token *token = lucy_Token_new(match_start, match_len,
+                                               start_off, end_off, 1.0f, 1);
+            Lucy_Inversion_Append(inversion, token);
+
+            ptr += match.rm_eo;
+            off  = end_off;
+        }
+
+        // Later searches no longer start at the beginning of the text.
+        errcode = regexec(re, ptr, 1, &match, REG_NOTBOL);
+    }
+
+    if (errcode != 0 && errcode != REG_NOMATCH) {
+        // THROW does not return: build the message on the stack and free
+        // the working copy first so nothing is leaked.
+        char errbuf[256];
+        regerror(errcode, re, errbuf, sizeof(errbuf));
+        LUCY_FREEMEM(c_string);
+        THROW(LUCY_ERR, "%s", errbuf);
+    }
+
+    LUCY_FREEMEM(c_string);
 }
 
+// Count the code points in the first `len` bytes of `string`. Throws
+// LUCY_ERR when the walk does not land exactly on `len`.
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t count = 0;
+    size_t   pos;
+
+    // Advance by the per-byte length from lucy_StrHelp_UTF8_COUNT, tallying
+    // one code point per step.
+    for (pos = 0; pos < len; ++count) {
+        pos += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[pos])];
+    }
+
+    // Stepping past `len` means the range ended inside a sequence.
+    if (pos != len) {
+        THROW(LUCY_ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return count;
+}

[lucy-commits] [15/15] git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Reply via email to