jenkins-bot has submitted this change and it was merged. Change subject: Fix when merging overlapping matches ......................................................................
Fix when merging overlapping matches If matches overlap we now extend the original match's end offset to make sure that it includes the overlapping end offset. We also add some documentation and tests around long regex matches. Change-Id: I342d63cdcdcdaf2247dd15a618e6a834ee8e1c8a --- M README.md M experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java M experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java 4 files changed, 42 insertions(+), 11 deletions(-) Approvals: Jdouglas: Looks good to me, approved jenkins-bot: Verified diff --git a/README.md b/README.md index 770c086..d36cb45 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,7 @@ } } ``` +If a regex match is wider than the allowed snippet size it won't be returned. The ```max_determinized_states``` option can be used to limit the complexity explosion that comes from compiling Lucene Regular Expressions into DFAs. It diff --git a/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java b/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java index d3737a4..981b7f5 100644 --- a/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java +++ b/experimental-highlighter-core/src/main/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapper.java @@ -1,12 +1,13 @@ package org.wikimedia.search.highlighter.experimental.hit; +import static java.lang.Math.max; + import org.wikimedia.search.highlighter.experimental.HitEnum; /** * HitEnum that merges hits that are "on top" of one another according to start - * and end offset. Always takes the maximum weight. Delegate must be in order of - * offsets (startOffset first, endOffset second). Just keeps the first position - * value. + * and end offset. Always takes the maximum weight and end offset. Delegate must + * be in order of offsets (startOffset first, endOffset second). */ public class OverlapMergingHitEnumWrapper implements HitEnum { /** @@ -54,11 +55,11 @@ break; } // Merge overlapping hits - endOffset = delegate.endOffset(); + endOffset = max(endOffset, delegate.endOffset()); assert delegate.startOffset() <= delegate.endOffset(); assert startOffset <= endOffset; - queryWeight = Math.max(queryWeight, delegate.queryWeight()); - corpusWeight = Math.max(corpusWeight, delegate.corpusWeight()); + queryWeight = max(queryWeight, delegate.queryWeight()); + corpusWeight = max(corpusWeight, delegate.corpusWeight()); /* * If both hits can't be traced back to the same source we declare * that they are from a new source by merging the hashes. This might diff --git a/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java b/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java index d2e7bea..ac34109 100644 --- a/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java +++ b/experimental-highlighter-core/src/test/java/org/wikimedia/search/highlighter/experimental/hit/OverlapMergingHitEnumWrapperTest.java @@ -77,17 +77,17 @@ } @Test - public void overlapPicksMaxWeight() { + public void overlapPicksMaxWeightAndSize() { ReplayingHitEnum replaying = new ReplayingHitEnum(); // The first overlapping hit has corpus weight of 3 - replaying.record(0, 0, 2, 2, 3, 1); + replaying.record(0, 0, 5, 2, 3, 1); // The second has query weight of 3 replaying.record(0, 1, 3, 3, 2, 2); HitEnum e = new OverlapMergingHitEnumWrapper(replaying); assertThat(e, advances()); assertThat(e, allOf( // So together they should have a multiplied weight of 9 (3*3) - atWeight(9), atPosition(0), atStartOffset(0), atEndOffset(3), atSource(33))); + atWeight(9), atPosition(0), atStartOffset(0), atEndOffset(5), atSource(33))); assertThat(e, isEmpty()); } } diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java index 9448e3e..0d3c265 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java @@ -1,7 +1,11 @@ package org.wikimedia.highlighter.experimental.elasticsearch.integration; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*; -import static org.hamcrest.Matchers.*; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNotHighlighted; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; import java.io.IOException; import java.util.HashMap; @@ -222,4 +226,29 @@ // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but // I can't find any way from here to mark it otherwise. } + + @Test + public void overlapMerge() throws IOException { + buildIndex(); + indexTestData("This test has overlapping highlights."); + + Map<String, Object> options = new HashMap<>(); + options.put("regex", "T.+\\."); + SearchRequestBuilder search = testSearch().setHighlighterOptions(options); + SearchResponse response = search.get(); + assertHighlight(response, 0, "test", 0, + equalTo("<em>This test has overlapping highlights.</em>")); + } + + @Test + public void longResultAreNotReturned() throws IOException { + buildIndex(); + indexTestData("This test is much longer than the window but we match all of it. Isn't that a shame?"); + + Map<String, Object> options = new HashMap<>(); + options.put("regex", "T.+shame\\?"); + SearchRequestBuilder search = testSearch().setHighlighterOptions(options).setHighlighterFragmentSize(30); + SearchResponse response = search.get(); + assertNotHighlighted(response, 0, "test"); + } } -- To view, visit https://gerrit.wikimedia.org/r/206522 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I342d63cdcdcdaf2247dd15a618e6a834ee8e1c8a Gerrit-PatchSet: 1 Gerrit-Project: search/highlighter Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Chad <ch...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Jdouglas <jdoug...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits