Port RegexTokenizer stubs to CGO.
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/44fc440f Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/44fc440f Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/44fc440f Branch: refs/heads/master Commit: 44fc440fdc419b655fb4c482afb63b9020138011 Parents: dab9a88 Author: Marvin Humphrey <[email protected]> Authored: Sun Jul 19 12:57:13 2015 -0700 Committer: Marvin Humphrey <[email protected]> Committed: Fri Jul 31 17:39:28 2015 -0700 ---------------------------------------------------------------------- go/cfext/lucy.c | 157 +++------------------------------------------------ go/lucy/lucy.go | 48 ++++++++++++++++ 2 files changed, 56 insertions(+), 149 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/cfext/lucy.c ---------------------------------------------------------------------- diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index d1044df..5773f16 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -55,175 +55,34 @@ #include "Lucy/Store/OutStream.h" #include "Lucy/Util/Freezer.h" -#if defined(CHY_HAS_PCRE_H) - -#include <pcre.h> - -static uint32_t -S_count_code_points(const char *string, size_t len); - bool RegexTokenizer_is_available(void) { - return true; + return false; } RegexTokenizer* -RegexTokenizer_init(RegexTokenizer *self, String *pattern) { - Analyzer_init((Analyzer*)self); - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - - char *pattern_buf = NULL; - const char *pattern_ptr; - if (pattern) { - ivars->pattern = Str_Clone(pattern); - pattern_buf = Str_To_Utf8(ivars->pattern); - pattern_ptr = pattern_buf; - } - else { - pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*"; - ivars->pattern - = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); - } - - int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK; -#ifdef PCRE_BSR_UNICODE - // Available since PCRE 7.4 - options 
|= PCRE_BSR_UNICODE; -#endif -#ifdef PCRE_NEWLINE_LF - // Available since PCRE 6.7 - options |= PCRE_NEWLINE_LF; -#endif - const char *err_ptr; - int err_offset; - pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL); - if (pattern_buf) { - FREEMEM(pattern_buf); - } - if (!re) { - THROW(ERR, "%s", err_ptr); - } - - // TODO: Check whether pcre_study improves performance - - ivars->token_re = re; - - return self; -} - -void -RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - DECREF(ivars->pattern); - pcre *re = (pcre*)ivars->token_re; - if (re) { - pcre_free(re); - } - SUPER_DESTROY(self, REGEXTOKENIZER); -} - -void -RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, - size_t string_len, Inversion *inversion) { - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - pcre *re = (pcre*)ivars->token_re; - int byte_offset = 0; - uint32_t cp_offset = 0; // Code points - int options = PCRE_NO_UTF8_CHECK; - int ovector[3]; - - int return_code = pcre_exec(re, NULL, string, string_len, byte_offset, - options, ovector, 3); - while (return_code >= 0) { - const char *match = string + ovector[0]; - size_t match_len = ovector[1] - ovector[0]; - - uint32_t cp_before = S_count_code_points(string + byte_offset, - ovector[0] - byte_offset); - uint32_t cp_start = cp_offset + cp_before; - uint32_t cp_matched = S_count_code_points(match, match_len); - uint32_t cp_end = cp_start + cp_matched; - - // Add a token to the new inversion. 
- Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1); - Inversion_Append(inversion, token); - - byte_offset = ovector[1]; - cp_offset = cp_end; - return_code = pcre_exec(re, NULL, string, string_len, byte_offset, - options, ovector, 3); - } - - if (return_code != PCRE_ERROR_NOMATCH) { - THROW(ERR, "pcre_exec failed: %d", return_code); - } -} - -static uint32_t -S_count_code_points(const char *string, size_t len) { - uint32_t num_code_points = 0; - size_t i = 0; - - while (i < len) { - i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; - ++num_code_points; - } - - if (i != len) { - THROW(ERR, "Match between code point boundaries in '%s'", string); - } - - return num_code_points; -} - -#else // CHY_HAS_PCRE_H - -bool -RegexTokenizer_is_available(void) { - return false; -} +(*GOLUCY_RegexTokenizer_init_BRIDGE)(RegexTokenizer *self, String *pattern); RegexTokenizer* RegexTokenizer_init(RegexTokenizer *self, String *pattern) { - UNUSED_VAR(self); - UNUSED_VAR(pattern); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); - UNREACHABLE_RETURN(RegexTokenizer*); + return GOLUCY_RegexTokenizer_init_BRIDGE(self, pattern); } -void -RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) { - UNUSED_VAR(self); - UNUSED_VAR(token_re); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); -} +RegexTokenizer_Destroy_t GOLUCY_RegexTokenizer_Destroy_BRIDGE; void RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { - UNUSED_VAR(self); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); + GOLUCY_RegexTokenizer_Destroy_BRIDGE(self); } +RegexTokenizer_Tokenize_Utf8_t GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE; + void RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, size_t string_len, Inversion *inversion) { - UNUSED_VAR(self); - UNUSED_VAR(string); - UNUSED_VAR(string_len); - UNUSED_VAR(inversion); - 
THROW(ERR, -          "RegexTokenizer is not available because Lucy was compiled" -          " without PCRE."); +    GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE(self, string, string_len, inversion); } -#endif // CHY_HAS_PCRE_H - /********************************** Doc ********************************/ Doc* http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/lucy/lucy.go ---------------------------------------------------------------------- diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 908599a..13bdafa 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -17,11 +17,59 @@ package lucy /* +#define C_LUCY_REGEXTOKENIZER + #include "lucy_parcel.h" +#include "Lucy/Analysis/RegexTokenizer.h" + +extern lucy_RegexTokenizer* +GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern); +extern lucy_RegexTokenizer* +(*GOLUCY_RegexTokenizer_init_BRIDGE)(lucy_RegexTokenizer *self, + cfish_String *pattern); +extern void +GOLUCY_RegexTokenizer_Destroy(lucy_RegexTokenizer *self); +extern void +(*GOLUCY_RegexTokenizer_Destroy_BRIDGE)(lucy_RegexTokenizer *self); +extern void +GOLUCY_RegexTokenizer_Tokenize_Utf8(lucy_RegexTokenizer *self, char *str, + size_t string_len, lucy_Inversion *inversion); +extern void +(*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str, + size_t string_len, lucy_Inversion *inversion); + + +// C symbols linked into a Go-built package archive are not visible to +// external C code -- but internal code *can* see symbols from outside. +// This allows us to fake up symbol export by assigning values only known +// internally to external symbols during Go package initialization. 
+static CFISH_INLINE void +GOLUCY_glue_exported_symbols() { + GOLUCY_RegexTokenizer_init_BRIDGE = GOLUCY_RegexTokenizer_init; + GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy; + GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE + = (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8; +} + */ import "C" import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" func init() { + C.GOLUCY_glue_exported_symbols() C.lucy_bootstrap_parcel() } + +//export GOLUCY_RegexTokenizer_init +func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer { + return nil +} + +//export GOLUCY_RegexTokenizer_Destroy +func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) { +} + +//export GOLUCY_RegexTokenizer_Tokenize_Utf8 +func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char, + stringLen C.size_t, inversion *C.lucy_Inversion) { +}
