Fix end offsets for edge-case highlight data. Under some circumstances (outside the most common code paths), the end offset for the last token in a field may have been too high, because bytes were counted rather than code points in UTF-8 source data. However, Highlighter uses this data only for heat mapping; it uses safe string iteration when actually choosing excerpt boundaries, so it cannot overrun.
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/080c33ac Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/080c33ac Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/080c33ac Branch: refs/heads/master Commit: 080c33acda88cb5d35a8a91541873b906ad38310 Parents: d6135b5 Author: Marvin Humphrey <[email protected]> Authored: Tue May 3 18:59:04 2016 -0700 Committer: Marvin Humphrey <[email protected]> Committed: Wed May 4 19:21:36 2016 -0700 ---------------------------------------------------------------------- core/Lucy/Analysis/Analyzer.c | 5 ++++- core/Lucy/Analysis/PolyAnalyzer.c | 7 ++++++- core/Lucy/Index/Inverter.c | 3 ++- perl/buildlib/Lucy/Build/Binding/Analysis.pm | 3 ++- 4 files changed, 14 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Analysis/Analyzer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/Analyzer.c b/core/Lucy/Analysis/Analyzer.c index 5560531..3676fc8 100644 --- a/core/Lucy/Analysis/Analyzer.c +++ b/core/Lucy/Analysis/Analyzer.c @@ -31,8 +31,11 @@ Analyzer_init(Analyzer *self) { Inversion* Analyzer_Transform_Text_IMP(Analyzer *self, String *text) { size_t token_len = Str_Get_Size(text); + if (token_len >= INT32_MAX) { + THROW(ERR, "Text too long: %u64", (uint64_t)token_len); + } Token *seed = Token_new(Str_Get_Ptr8(text), token_len, 0, - token_len, 1.0, 1); + (uint32_t)Str_Length(text), 1.0, 1); Inversion *starter = Inversion_new(seed); Inversion *retval = Analyzer_Transform(self, starter); DECREF(seed); http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Analysis/PolyAnalyzer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/PolyAnalyzer.c b/core/Lucy/Analysis/PolyAnalyzer.c index 695c2d7..d295981 100644 --- 
a/core/Lucy/Analysis/PolyAnalyzer.c +++ b/core/Lucy/Analysis/PolyAnalyzer.c @@ -93,7 +93,12 @@ PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) { if (num_analyzers == 0) { size_t token_len = Str_Get_Size(text); const char *buf = Str_Get_Ptr8(text); - Token *seed = Token_new(buf, token_len, 0, token_len, 1.0f, 1); + if (token_len >= INT32_MAX) { + THROW(ERR, "Can't process string over 2GB: %u64", + (uint64_t)token_len); + } + Token *seed + = Token_new(buf, token_len, 0, (uint32_t)Str_Length(text),1.0f, 1); retval = Inversion_new(seed); DECREF(seed); } http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Index/Inverter.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Inverter.c b/core/Lucy/Index/Inverter.c index 32ce69d..42b325e 100644 --- a/core/Lucy/Index/Inverter.c +++ b/core/Lucy/Index/Inverter.c @@ -185,8 +185,9 @@ Inverter_Add_Field_IMP(Inverter *self, InverterEntry *entry) { else if (entry_ivars->indexed || entry_ivars->highlightable) { String *value = (String*)entry_ivars->value; size_t token_len = Str_Get_Size(value); + size_t cp_len = Str_Length(value); Token *seed = Token_new(Str_Get_Ptr8(value), - token_len, 0, token_len, 1.0f, 1); + token_len, 0, (uint32_t)cp_len, 1.0f, 1); DECREF(entry_ivars->inversion); entry_ivars->inversion = Inversion_new(seed); DECREF(seed); http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/perl/buildlib/Lucy/Build/Binding/Analysis.pm ---------------------------------------------------------------------- diff --git a/perl/buildlib/Lucy/Build/Binding/Analysis.pm b/perl/buildlib/Lucy/Build/Binding/Analysis.pm index ff2e3f2..3d58652 100644 --- a/perl/buildlib/Lucy/Build/Binding/Analysis.pm +++ b/perl/buildlib/Lucy/Build/Binding/Analysis.pm @@ -156,7 +156,8 @@ CODE: if (XSBind_sv_defined(aTHX_ text_sv)) { STRLEN len; char *text = SvPVutf8(text_sv, len); - starter_token = lucy_Token_new(text, len, 0, len, 1.0, 1); + STRLEN length = 
utf8_length((U8*)text, (U8*)text + len); + starter_token = lucy_Token_new(text, len, 0, length, 1.0, 1); } RETVAL = CFISH_OBJ_TO_SV_NOINC(lucy_Inversion_new(starter_token));
