Updated Branches: refs/heads/c-bindings-wip2 [created] 24d06ccd8
Implement POSIX RegexTokenizer Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/24d06ccd Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/24d06ccd Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/24d06ccd Branch: refs/heads/c-bindings-wip2 Commit: 24d06ccd88d6d093dbf54638cbc8b9e81f770d34 Parents: 0fc39d5 Author: Nick Wellnhofer <[email protected]> Authored: Sat Mar 2 21:03:52 2013 +0100 Committer: Nick Wellnhofer <[email protected]> Committed: Sat Mar 2 21:03:52 2013 +0100 ---------------------------------------------------------------------- c/src/Lucy/Analysis/RegexTokenizer.c | 114 ++++++++++++++++++++++++++++- 1 files changed, 110 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/24d06ccd/c/src/Lucy/Analysis/RegexTokenizer.c ---------------------------------------------------------------------- diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c index 2f21afb..46b0fe4 100644 --- a/c/src/Lucy/Analysis/RegexTokenizer.c +++ b/c/src/Lucy/Analysis/RegexTokenizer.c @@ -17,15 +17,62 @@ #define C_LUCY_REGEXTOKENIZER #include "CFBind.h" + +#include <string.h> + +#if defined(CHY_HAS_REGEX_H) + #include <regex.h> +#elif defined(CHY_HAS_PCREPOSIX_H) + #include <pcreposix.h> +#else + #error No regex headers found. +#endif + +#include "Clownfish/Util/Memory.h" +#include "Clownfish/Util/StringHelper.h" #include "Lucy/Analysis/RegexTokenizer.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" +static uint32_t +S_count_code_points(const char *string, size_t len); + lucy_RegexTokenizer* lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, const lucy_CharBuf *pattern) { - THROW(LUCY_ERR, "TODO"); - UNREACHABLE_RETURN(lucy_RegexTokenizer*); + lucy_Analyzer_init((lucy_Analyzer*)self); + + const char *pattern_ptr; + if (pattern) { + self->pattern = Lucy_CB_Clone(pattern); + pattern_ptr = (char*)Lucy_CB_Get_Ptr8(self->pattern); + } + else { + pattern_ptr = "[[:alnum:]]+('[[:alnum:]]+)*"; + self->pattern + = lucy_CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); + } + + // TODO: Make sure that we use a UTF-8 locale. + + int flags = REG_EXTENDED; +#ifdef CHY_HAS_REG_ENHANCED + flags |= REG_ENHANCED; +#endif + regex_t *re = LUCY_MALLOCATE(sizeof(regex_t)); + int errcode = regcomp(re, pattern_ptr, flags); + if (errcode) { + size_t errbuf_size = regerror(errcode, re, NULL, 0); + char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size); + regerror(errcode, re, errbuf, errbuf_size); + regfree(re); + LUCY_FREEMEM(re); + THROW(LUCY_ERR, "%s", errbuf); + } + + self->token_re = re; + + return self; } void @@ -35,14 +82,73 @@ lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) { void lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) { - THROW(LUCY_ERR, "TODO"); + CFISH_DECREF(self->pattern); + regex_t *re = (regex_t*)self->token_re; + if (re) { + regfree(re); + LUCY_FREEMEM(re); + } + LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER); } void lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self, const char *string, size_t string_len, lucy_Inversion *inversion) { - THROW(LUCY_ERR, "TODO"); + /* Null-terminate string. This could be avoided by using the non-standard + * REG_STARTEND flag. + */ + char *c_string = (char*)LUCY_MALLOCATE(string_len + 1); + memcpy(c_string, string, string_len); + c_string[string_len] = '\0'; + + regex_t *re = (regex_t*)self->token_re; + const char *ptr = c_string; + uint32_t off = 0; // Code points + regmatch_t match; + + int errcode = regexec(re, ptr, 1, &match, 0); + while (errcode == 0) { + const char *match_start = ptr + match.rm_so; + size_t match_len = match.rm_eo - match.rm_so; + uint32_t start_off = off + S_count_code_points(ptr, match.rm_so); + uint32_t end_off = start_off + S_count_code_points(match_start, + match_len); + + // Add a token to the new inversion. + lucy_Token *token = lucy_Token_new(match_start, match_len, start_off, + end_off, 1.0f, 1); + Lucy_Inversion_Append(inversion, token); + + ptr += match.rm_eo; + off = end_off; + errcode = regexec(re, ptr, 1, &match, REG_NOTBOL); + } + + if (errcode != REG_NOMATCH) { + size_t errbuf_size = regerror(errcode, re, NULL, 0); + char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size); + regerror(errcode, re, errbuf, errbuf_size); + THROW(LUCY_ERR, "%s", errbuf); + } + + LUCY_FREEMEM(c_string); } +static uint32_t +S_count_code_points(const char *string, size_t len) { + uint32_t num_code_points = 0; + size_t i = 0; + + while (i < len) { + i += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; + ++num_code_points; + } + + if (i != len) { + THROW(LUCY_ERR, "Match between code point boundaries in '%s'", string); + } + + return num_code_points; +}
