[8/9] lucy git commit: Port RegexTokenizer to Go and CGO.

marvin Mon, 03 Aug 2015 14:59:40 -0700

Port RegexTokenizer to Go and CGO.

Use Go's regular expression engine, the `regexp` package.  Store the
compiled Go `regexp` objects in the object registry, which allows them
to be referenced from C by integer ID.



Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/5f00a213
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/5f00a213
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/5f00a213

Branch: refs/heads/master
Commit: 5f00a21335c3304c074223b2dba4567a62d9c97a
Parents: 7749e59
Author: Marvin Humphrey <[email protected]>
Authored: Mon Jul 20 12:41:34 2015 -0700
Committer: Marvin Humphrey <[email protected]>
Committed: Fri Jul 31 18:21:28 2015 -0700

----------------------------------------------------------------------
 go/lucy/lucy.go      | 81 ++++++++++++++++++++++++++++++++++++++++++++++-
 go/lucy/lucy_test.go | 10 ++++++
 2 files changed, 90 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/5f00a213/go/lucy/lucy.go
----------------------------------------------------------------------
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 556235e..bc2e9f8 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -38,6 +38,11 @@ package lucy
 #include "Clownfish/Hash.h"
 #include "Clownfish/HashIterator.h"
 #include "Clownfish/Vector.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Analysis/Analyzer.h"
+#include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Analysis/Token.h"
 #include "Lucy/Document/HitDoc.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
@@ -133,6 +138,35 @@ GOLUCY_glue_exported_symbols() {
        GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc;
 }
 
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t num_code_points = 0;
+    size_t i = 0;
+
+    while (i < len) {
+        i += cfish_StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
+        ++num_code_points;
+    }
+
+    if (i != len) {
+        CFISH_THROW(CFISH_ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return num_code_points;
+}
+
+// Returns the number of code points through the end of the match.
+static int
+push_token(const char *str, int start, int end, int last_end,
+           int cp_count, lucy_Inversion *inversion) {
+       const char *match = str + start;
+       int match_len = end - start;
+       int cp_start = cp_count + S_count_code_points(str + last_end, start - last_end);
+       int cp_end   = cp_start + S_count_code_points(match, match_len);
+       lucy_Token *token = lucy_Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
+       LUCY_Inversion_Append(inversion, token);
+       return cp_end;
+}
 
 static void
 null_terminate_string(char *string, size_t len) {
@@ -143,25 +177,70 @@ null_terminate_string(char *string, size_t len) {
 import "C"
 import "unsafe"
 import "fmt"
+import "regexp"
 import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
+var registry *objRegistry
+
 func init() {
        C.GOLUCY_glue_exported_symbols()
        C.lucy_bootstrap_parcel()
+       registry = newObjRegistry(16)
 }
 
 //export GOLUCY_RegexTokenizer_init
 func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer {
-       return nil
+       C.lucy_Analyzer_init(((*C.lucy_Analyzer)(unsafe.Pointer(rt))))
+
+       ivars := C.lucy_RegexTokenizer_IVARS(rt)
+       ivars.pattern = C.CFISH_Str_Clone(pattern)
+
+       var patternGo string
+       if pattern == nil {
+               patternGo = "\\w+(?:['\\x{2019}]\\w+)*"
+       } else {
+               patternGo = clownfish.CFStringToGo(unsafe.Pointer(pattern))
+       }
+       rx, err := regexp.Compile(patternGo)
+       if err != nil {
+               panic(err)
+       }
+       rxID := registry.store(rx)
+       ivars.token_re = unsafe.Pointer(rxID)
+
+       return rt
 }
 
 //export GOLUCY_RegexTokenizer_Destroy
 func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
+       ivars := C.lucy_RegexTokenizer_IVARS(rt)
+       rxID := uintptr(ivars.token_re)
+       registry.delete(rxID)
+       C.cfish_super_destroy(unsafe.Pointer(rt), C.LUCY_REGEXTOKENIZER)
 }
 
 //export GOLUCY_RegexTokenizer_Tokenize_Utf8
 func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
        stringLen C.size_t, inversion *C.lucy_Inversion) {
+
+       ivars := C.lucy_RegexTokenizer_IVARS(rt)
+       rxID := uintptr(ivars.token_re)
+       rx, ok := registry.fetch(rxID).(*regexp.Regexp)
+       if !ok {
+               mess := fmt.Sprintf("Failed to Fetch *RegExp with id %d and pattern %s",
+                       rxID, clownfish.CFStringToGo(unsafe.Pointer(ivars.pattern)))
+               panic(clownfish.NewErr(mess))
+       }
+
+       buf := C.GoBytes(unsafe.Pointer(str), C.int(stringLen))
+       found := rx.FindAllIndex(buf, int(stringLen))
+       lastEnd := 0
+       cpCount := 0
+       for _, startEnd := range found {
+               cpCount = int(C.push_token(str, C.int(startEnd[0]), C.int(startEnd[1]),
+                       C.int(lastEnd), C.int(cpCount), inversion))
+               lastEnd = startEnd[1]
+       }
 }
 
 func NewDoc(docID int32) Doc {

http://git-wip-us.apache.org/repos/asf/lucy/blob/5f00a213/go/lucy/lucy_test.go
----------------------------------------------------------------------
diff --git a/go/lucy/lucy_test.go b/go/lucy/lucy_test.go
index 94e4f0a..82ba878 100644
--- a/go/lucy/lucy_test.go
+++ b/go/lucy/lucy_test.go
@@ -18,6 +18,7 @@ package lucy
 
 import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 import "testing"
+import "reflect"
 
 func TestStuff(t *testing.T) {
        NewSchema()
@@ -29,3 +30,12 @@ func TestOpenIndexer(t *testing.T) {
                t.Error("Didn't catch exception opening indexer")
        }
 }
+
+func TestRegex(t *testing.T) {
+       tokenizer := NewRegexTokenizer("\\S+")
+       var expected []interface{} = []interface{}{"foo", "bar", "baz"}
+       got := tokenizer.Split("foo bar baz")
+       if !reflect.DeepEqual(got, expected) {
+               t.Errorf("Expected %v, got %v", expected, got)
+       }
+}

[8/9] lucy git commit: Port RegexTokenizer to Go and CGO.

Reply via email to