Fix end offsets for edge-case highlight data.

Under some circumstances (outside the most common code paths), the end
offset for the last token in a field may have been too high, as a result
of counting bytes rather than code points in UTF-8 source data.
However, Highlighter only uses this data for heat mapping; it uses safe
string iteration when actually choosing excerpt boundaries, and cannot
overrun.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/080c33ac
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/080c33ac
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/080c33ac

Branch: refs/heads/master
Commit: 080c33acda88cb5d35a8a91541873b906ad38310
Parents: d6135b5
Author: Marvin Humphrey <[email protected]>
Authored: Tue May 3 18:59:04 2016 -0700
Committer: Marvin Humphrey <[email protected]>
Committed: Wed May 4 19:21:36 2016 -0700

----------------------------------------------------------------------
 core/Lucy/Analysis/Analyzer.c                | 5 ++++-
 core/Lucy/Analysis/PolyAnalyzer.c            | 7 ++++++-
 core/Lucy/Index/Inverter.c                   | 3 ++-
 perl/buildlib/Lucy/Build/Binding/Analysis.pm | 3 ++-
 4 files changed, 14 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Analysis/Analyzer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/Analyzer.c b/core/Lucy/Analysis/Analyzer.c
index 5560531..3676fc8 100644
--- a/core/Lucy/Analysis/Analyzer.c
+++ b/core/Lucy/Analysis/Analyzer.c
@@ -31,8 +31,11 @@ Analyzer_init(Analyzer *self) {
 Inversion*
 Analyzer_Transform_Text_IMP(Analyzer *self, String *text) {
     size_t token_len = Str_Get_Size(text);
+    if (token_len >= INT32_MAX) {
+        THROW(ERR, "Text too long: %u64", (uint64_t)token_len);
+    }
     Token *seed = Token_new(Str_Get_Ptr8(text), token_len, 0,
-                            token_len, 1.0, 1);
+                            (uint32_t)Str_Length(text), 1.0, 1);
     Inversion *starter = Inversion_new(seed);
     Inversion *retval  = Analyzer_Transform(self, starter);
     DECREF(seed);

http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Analysis/PolyAnalyzer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/PolyAnalyzer.c b/core/Lucy/Analysis/PolyAnalyzer.c
index 695c2d7..d295981 100644
--- a/core/Lucy/Analysis/PolyAnalyzer.c
+++ b/core/Lucy/Analysis/PolyAnalyzer.c
@@ -93,7 +93,12 @@ PolyAnalyzer_Transform_Text_IMP(PolyAnalyzer *self, String *text) {
     if (num_analyzers == 0) {
         size_t      token_len = Str_Get_Size(text);
         const char *buf       = Str_Get_Ptr8(text);
-        Token *seed = Token_new(buf, token_len, 0, token_len, 1.0f, 1);
+        if (token_len >= INT32_MAX) {
+            THROW(ERR, "Can't process string over 2GB: %u64",
+                  (uint64_t)token_len);
+        }
+        Token *seed
+            = Token_new(buf, token_len, 0, (uint32_t)Str_Length(text),1.0f, 1);
         retval = Inversion_new(seed);
         DECREF(seed);
     }

http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/core/Lucy/Index/Inverter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Inverter.c b/core/Lucy/Index/Inverter.c
index 32ce69d..42b325e 100644
--- a/core/Lucy/Index/Inverter.c
+++ b/core/Lucy/Index/Inverter.c
@@ -185,8 +185,9 @@ Inverter_Add_Field_IMP(Inverter *self, InverterEntry *entry) {
     else if (entry_ivars->indexed || entry_ivars->highlightable) {
         String *value = (String*)entry_ivars->value;
         size_t token_len = Str_Get_Size(value);
+        size_t cp_len = Str_Length(value);
         Token *seed = Token_new(Str_Get_Ptr8(value),
-                                token_len, 0, token_len, 1.0f, 1);
+                                token_len, 0, (uint32_t)cp_len, 1.0f, 1);
         DECREF(entry_ivars->inversion);
         entry_ivars->inversion = Inversion_new(seed);
         DECREF(seed);

http://git-wip-us.apache.org/repos/asf/lucy/blob/080c33ac/perl/buildlib/Lucy/Build/Binding/Analysis.pm
----------------------------------------------------------------------
diff --git a/perl/buildlib/Lucy/Build/Binding/Analysis.pm b/perl/buildlib/Lucy/Build/Binding/Analysis.pm
index ff2e3f2..3d58652 100644
--- a/perl/buildlib/Lucy/Build/Binding/Analysis.pm
+++ b/perl/buildlib/Lucy/Build/Binding/Analysis.pm
@@ -156,7 +156,8 @@ CODE:
     if (XSBind_sv_defined(aTHX_ text_sv)) {
         STRLEN len;
         char *text = SvPVutf8(text_sv, len);
-        starter_token = lucy_Token_new(text, len, 0, len, 1.0, 1);
+        STRLEN length = utf8_length((U8*)text, (U8*)text + len);
+        starter_token = lucy_Token_new(text, len, 0, length, 1.0, 1);
     }
 
     RETVAL = CFISH_OBJ_TO_SV_NOINC(lucy_Inversion_new(starter_token));

Reply via email to