Use short names in RegexTokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/824146d6 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/824146d6 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/824146d6 Branch: refs/heads/c-bindings-wip2 Commit: 824146d66b20e30040b532c3b0534e94608c1392 Parents: 0ed6815 Author: Nick Wellnhofer <[email protected]> Authored: Thu Mar 7 19:20:26 2013 +0100 Committer: Nick Wellnhofer <[email protected]> Committed: Thu Mar 7 19:20:26 2013 +0100 ---------------------------------------------------------------------- c/src/Lucy/Analysis/RegexTokenizer.c | 66 +++++++++++++++------------- 1 files changed, 35 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/824146d6/c/src/Lucy/Analysis/RegexTokenizer.c ---------------------------------------------------------------------- diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c index 46b0fe4..5584d8c 100644 --- a/c/src/Lucy/Analysis/RegexTokenizer.c +++ b/c/src/Lucy/Analysis/RegexTokenizer.c @@ -15,42 +15,46 @@ */ #define C_LUCY_REGEXTOKENIZER +#define CHY_USE_SHORT_NAMES +#define LUCY_USE_SHORT_NAMES -#include "CFBind.h" +#include "charmony.h" #include <string.h> -#if defined(CHY_HAS_REGEX_H) +#if defined(HAS_REGEX_H) #include <regex.h> -#elif defined(CHY_HAS_PCREPOSIX_H) +#elif defined(HAS_PCREPOSIX_H) #include <pcreposix.h> #else #error No regex headers found. #endif +#include "Lucy/Analysis/RegexTokenizer.h" +#include "Clownfish/CharBuf.h" +#include "Clownfish/Err.h" #include "Clownfish/Util/Memory.h" #include "Clownfish/Util/StringHelper.h" -#include "Lucy/Analysis/RegexTokenizer.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" static uint32_t S_count_code_points(const char *string, size_t len); -lucy_RegexTokenizer* -lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, - const lucy_CharBuf *pattern) { - lucy_Analyzer_init((lucy_Analyzer*)self); +RegexTokenizer* +RegexTokenizer_init(RegexTokenizer *self, + const CharBuf *pattern) { + Analyzer_init((Analyzer*)self); const char *pattern_ptr; if (pattern) { - self->pattern = Lucy_CB_Clone(pattern); - pattern_ptr = (char*)Lucy_CB_Get_Ptr8(self->pattern); + self->pattern = CB_Clone(pattern); + pattern_ptr = (char*)CB_Get_Ptr8(self->pattern); } else { pattern_ptr = "[[:alnum:]]+('[[:alnum:]]+)*"; self->pattern - = lucy_CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); + = CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); } // TODO: Make sure that we use a UTF-8 locale. @@ -59,15 +63,15 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, #ifdef CHY_HAS_REG_ENHANCED flags |= REG_ENHANCED; #endif - regex_t *re = LUCY_MALLOCATE(sizeof(regex_t)); + regex_t *re = MALLOCATE(sizeof(regex_t)); int errcode = regcomp(re, pattern_ptr, flags); if (errcode) { size_t errbuf_size = regerror(errcode, re, NULL, 0); - char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size); + char *errbuf = (char*)MALLOCATE(errbuf_size); regerror(errcode, re, errbuf, errbuf_size); regfree(re); - LUCY_FREEMEM(re); - THROW(LUCY_ERR, "%s", errbuf); + FREEMEM(re); + THROW(ERR, "%s", errbuf); } self->token_re = re; @@ -76,29 +80,29 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, } void -lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) { - THROW(LUCY_ERR, "TODO"); +RegexTokenizer_set_token_re(RegexTokenizer *self, void *token_re) { + THROW(ERR, "TODO"); } void -lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) { - CFISH_DECREF(self->pattern); +RegexTokenizer_destroy(RegexTokenizer *self) { + DECREF(self->pattern); regex_t *re = (regex_t*)self->token_re; if (re) { regfree(re); - LUCY_FREEMEM(re); + FREEMEM(re); } - LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER); + SUPER_DESTROY(self, REGEXTOKENIZER); } void -lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self, +RegexTokenizer_tokenize_str(RegexTokenizer *self, const char *string, size_t string_len, - lucy_Inversion *inversion) { + Inversion *inversion) { /* Null-terminate string. This could be avoided by using the non-standard * REG_STARTEND flag. */ - char *c_string = (char*)LUCY_MALLOCATE(string_len + 1); + char *c_string = (char*)MALLOCATE(string_len + 1); memcpy(c_string, string, string_len); c_string[string_len] = '\0'; @@ -116,9 +120,9 @@ lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self, match_len); // Add a token to the new inversion. - lucy_Token *token = lucy_Token_new(match_start, match_len, start_off, + Token *token = Token_new(match_start, match_len, start_off, end_off, 1.0f, 1); - Lucy_Inversion_Append(inversion, token); + Inversion_Append(inversion, token); ptr += match.rm_eo; off = end_off; @@ -127,12 +131,12 @@ lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self, if (errcode != REG_NOMATCH) { size_t errbuf_size = regerror(errcode, re, NULL, 0); - char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size); + char *errbuf = (char*)MALLOCATE(errbuf_size); regerror(errcode, re, errbuf, errbuf_size); - THROW(LUCY_ERR, "%s", errbuf); + THROW(ERR, "%s", errbuf); } - LUCY_FREEMEM(c_string); + FREEMEM(c_string); } static uint32_t @@ -141,12 +145,12 @@ S_count_code_points(const char *string, size_t len) { size_t i = 0; while (i < len) { - i += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; + i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; ++num_code_points; } if (i != len) { - THROW(LUCY_ERR, "Match between code point boundaries in '%s'", string); + THROW(ERR, "Match between code point boundaries in '%s'", string); } return num_code_points;
