jenkins-bot has submitted this change and it was merged.

Change subject: Fix when merging overlapping matches
......................................................................


Fix when merging overlapping matches

If matches overlap we now extend the original match's end offset to make
sure that it includes the overlapping end offset.

We also add some documentation and tests around long regex matches.

Change-Id: I342d63cdcdcdaf2247dd15a618e6a834ee8e1c8a
---
M README.md
M 
experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java
M 
experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java
M 
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
4 files changed, 42 insertions(+), 11 deletions(-)

Approvals:
  Jdouglas: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/README.md b/README.md
index 770c086..d36cb45 100644
--- a/README.md
+++ b/README.md
@@ -249,6 +249,7 @@
     }
   }
 ```
+If a regex match is wider than the allowed snippet size it won't be returned.
 
 The ```max_determinized_states``` option can be used to limit the complexity
 explosion that comes from compiling Lucene Regular Expressions into DFAs.  It
diff --git 
a/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java
 
b/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java
index d3737a4..981b7f5 100644
--- 
a/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java
+++ 
b/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java
@@ -1,12 +1,13 @@
 package org.wikimedia.search.highlighter.experimental.hit;
 
+import static java.lang.Math.max;
+
 import org.wikimedia.search.highlighter.experimental.HitEnum;
 
 /**
  * HitEnum that merges hits that are "on top" of one another according to start
- * and end offset. Always takes the maximum weight. Delegate must be in order 
of
- * offsets (startOffset first, endOffset second). Just keeps the first position
- * value.
+ * and end offset. Always takes the maximum weight and end offset. Delegate 
must
+ * be in order of offsets (startOffset first, endOffset second).
  */
 public class OverlapMergingHitEnumWrapper implements HitEnum {
     /**
@@ -54,11 +55,11 @@
                 break;
             }
             // Merge overlapping hits
-            endOffset = delegate.endOffset();
+            endOffset = max(endOffset, delegate.endOffset());
             assert delegate.startOffset() <= delegate.endOffset();
             assert startOffset <= endOffset;
-            queryWeight = Math.max(queryWeight, delegate.queryWeight());
-            corpusWeight = Math.max(corpusWeight, delegate.corpusWeight());
+            queryWeight = max(queryWeight, delegate.queryWeight());
+            corpusWeight = max(corpusWeight, delegate.corpusWeight());
             /*
              * If both hits can't be traced back to the same source we declare
              * that they are from a new source by merging the hashes. This 
might
diff --git 
a/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java
 
b/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java
index d2e7bea..ac34109 100644
--- 
a/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java
+++ 
b/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java
@@ -77,17 +77,17 @@
     }
 
     @Test
-    public void overlapPicksMaxWeight() {
+    public void overlapPicksMaxWeightAndSize() {
         ReplayingHitEnum replaying = new ReplayingHitEnum();
         // The first overlapping hit has corpus weight of 3
-        replaying.record(0, 0, 2, 2, 3, 1);
+        replaying.record(0, 0, 5, 2, 3, 1);
         // The second has query weight of 3
         replaying.record(0, 1, 3, 3, 2, 2);
         HitEnum e = new OverlapMergingHitEnumWrapper(replaying);
         assertThat(e, advances());
         assertThat(e, allOf(
                 // So together they should have a multiplied weight of 9 (3*3)
-                atWeight(9), atPosition(0), atStartOffset(0), atEndOffset(3), 
atSource(33)));
+                atWeight(9), atPosition(0), atStartOffset(0), atEndOffset(5), 
atSource(33)));
         assertThat(e, isEmpty());
     }
 }
diff --git 
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
 
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
index 9448e3e..0d3c265 100644
--- 
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
+++ 
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
@@ -1,7 +1,11 @@
 package org.wikimedia.highlighter.experimental.elasticsearch.integration;
 
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
-import static org.hamcrest.Matchers.*;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNotHighlighted;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
 
 import java.io.IOException;
 import java.util.HashMap;
@@ -222,4 +226,29 @@
         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
         // I can't find any way from here to mark it otherwise.
     }
+
+    @Test
+    public void overlapMerge() throws IOException {
+        buildIndex();
+        indexTestData("This test has overlapping highlights.");
+
+        Map<String, Object> options = new HashMap<>();
+        options.put("regex", "T.+\\.");
+        SearchRequestBuilder search = 
testSearch().setHighlighterOptions(options);
+        SearchResponse response = search.get();
+        assertHighlight(response, 0, "test", 0,
+                equalTo("<em>This test has overlapping highlights.</em>"));
+    }
+
+    @Test
+    public void longResultAreNotReturned() throws IOException {
+        buildIndex();
+        indexTestData("This test is much longer than the window but we match 
all of it. Isn't that a shame?");
+
+        Map<String, Object> options = new HashMap<>();
+        options.put("regex", "T.+shame\\?");
+        SearchRequestBuilder search = 
testSearch().setHighlighterOptions(options).setHighlighterFragmentSize(30);
+        SearchResponse response = search.get();
+        assertNotHighlighted(response, 0, "test");
+    }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/206522
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I342d63cdcdcdaf2247dd15a618e6a834ee8e1c8a
Gerrit-PatchSet: 1
Gerrit-Project: search/highlighter
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Chad <ch...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Jdouglas <jdoug...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to