master - Implement RegexTokenizer for C bindings using PCRE

nwellnhof Mon, 11 Mar 2013 14:52:50 -0700

Implement RegexTokenizer for C bindings using PCRE


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/49dbbec6
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/49dbbec6
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/49dbbec6

Branch: refs/heads/master
Commit: 49dbbec6e81420e2eb7ae815846f522b244e82d0
Parents: 049bd82
Author: Nick Wellnhofer <[email protected]>
Authored: Sat Mar 2 21:03:52 2013 +0100
Committer: Nick Wellnhofer <[email protected]>
Committed: Sat Mar 9 17:51:55 2013 +0100

----------------------------------------------------------------------
 c/src/Lucy/Analysis/RegexTokenizer.c |  153 ++++++++++++++++++++++++++---
 common/charmonizer.main              |    6 +-
 2 files changed, 145 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/49dbbec6/c/src/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index 2f21afb..ae4c7d1 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -15,34 +15,161 @@
  */
 
 #define C_LUCY_REGEXTOKENIZER
+#define CHY_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+
+#include "charmony.h"
+
+#include <string.h>
 
-#include "CFBind.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
+#include "Clownfish/CharBuf.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
 
-lucy_RegexTokenizer*
-lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
-                         const lucy_CharBuf *pattern) {
-    THROW(LUCY_ERR, "TODO");
-    UNREACHABLE_RETURN(lucy_RegexTokenizer*);
+#if defined(HAS_PCRE_H)
+
+#include <pcre.h>
+
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, const CharBuf *pattern) {
+    Analyzer_init((Analyzer*)self);
+
+    const char *pattern_ptr;
+    if (pattern) {
+        self->pattern = CB_Clone(pattern);
+        pattern_ptr = (char*)CB_Get_Ptr8(self->pattern);
+    }
+    else {
+        pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*";
+        self->pattern
+            = CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
+    }
+
+    int options = PCRE_BSR_UNICODE
+                | PCRE_NEWLINE_LF
+                | PCRE_UTF8
+                | PCRE_NO_UTF8_CHECK;
+    const char *err_ptr;
+    int err_offset;
+    pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
+    if (!re) {
+        THROW(ERR, "%s", err_ptr);
+    }
+
+    // TODO: Check whether pcre_study improves performance
+
+    self->token_re = re;
+
+    return self;
 }
 
 void
-lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
-    THROW(LUCY_ERR, "TODO");
+RegexTokenizer_set_token_re(RegexTokenizer *self, void *token_re) {
+    THROW(ERR, "TODO");
 }
 
 void
-lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
-    THROW(LUCY_ERR, "TODO");
+RegexTokenizer_destroy(RegexTokenizer *self) {
+    DECREF(self->pattern);
+    pcre *re = (pcre*)self->token_re;
+    if (re) {
+        pcre_free(re);
+    }
+    SUPER_DESTROY(self, REGEXTOKENIZER);
 }
 
 void
-lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
+RegexTokenizer_tokenize_str(RegexTokenizer *self,
                                  const char *string, size_t string_len,
-                                 lucy_Inversion *inversion) {
-    THROW(LUCY_ERR, "TODO");
+                                 Inversion *inversion) {
+    pcre      *re          = (pcre*)self->token_re;
+    int        byte_offset = 0;
+    uint32_t   cp_offset   = 0; // Code points
+    int        options     = PCRE_NO_UTF8_CHECK;
+    int        ovector[3];
+
+    int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    while (return_code >= 0) {
+        const char *match     = string + ovector[0];
+        size_t      match_len = ovector[1] - ovector[0];
+
+        uint32_t cp_before  = S_count_code_points(string + byte_offset,
+                                                  ovector[0] - byte_offset);
+        uint32_t cp_start   = cp_offset + cp_before;
+        uint32_t cp_matched = S_count_code_points(match, match_len);
+        uint32_t cp_end     = cp_start + cp_matched;
+
+        // Add a token to the new inversion.
+        Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
+        Inversion_Append(inversion, token);
+
+        byte_offset = ovector[1];
+        cp_offset   = cp_end;
+        return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    }
+
+    if (return_code != PCRE_ERROR_NOMATCH) {
+        THROW(ERR, "pcre_exec failed: %d", return_code);
+    }
+}
+
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t num_code_points = 0;
+    size_t i = 0;
+
+    while (i < len) {
+        i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
+        ++num_code_points;
+    }
+
+    if (i != len) {
+        THROW(ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return num_code_points;
+}
+
+#else // HAS_PCRE_H
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, const CharBuf *pattern) {
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+    UNREACHABLE_RETURN(RegexTokenizer*);
+}
+
+void
+RegexTokenizer_set_token_re(RegexTokenizer *self, void *token_re) {
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_destroy(RegexTokenizer *self) {
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_tokenize_str(RegexTokenizer *self, const char *string,
+                            size_t string_len, Inversion *inversion) {
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
 }
 
+#endif // HAS_PCRE_H
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/49dbbec6/common/charmonizer.main
----------------------------------------------------------------------
diff --git a/common/charmonizer.main b/common/charmonizer.main
index 83de908..c0d30b8 100644
--- a/common/charmonizer.main
+++ b/common/charmonizer.main
@@ -278,8 +278,12 @@ S_write_makefile() {
     chaz_MakeRule_add_prereq(rule, json_parser_c);
     chaz_MakeRule_add_prereq(rule, "$(AUTOGEN_DIR)");
 
+    const char *link_flags = "";
+    if (chaz_HeadCheck_check_header("pcre.h")) {
+        link_flags = "-lpcre";
+    }
     chaz_MakeFile_add_shared_obj(makefile, "$(LUCY_SHOBJ)", "$(LUCY_OBJS)",
-                                 "");
+                                 link_flags);
 
     chaz_MakeFile_add_rule(makefile, "$(TEST_LUCY_OBJS)", "$(AUTOGEN_DIR)");

[lucy-commits] [14/18] git commit: refs/heads/master - Implement RegexTokenizer for C bindings using PCRE

Reply via email to