DCausse has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371747 )
Change subject: Add new formatter to output offsets+text snippets ...................................................................... Add new formatter to output offsets+text snippets Bug: T173231 Change-Id: I69deeca8589138fe512f6fc782ecc80897cf26bb --- M experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighter.java A experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/OffsetAugmenterSnippetFormatter.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java 3 files changed, 85 insertions(+), 40 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/highlighter refs/changes/47/371747/1 diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighter.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighter.java index 6163e36..a430d72 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighter.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighter.java @@ -1,16 +1,5 @@ package org.wikimedia.highlighter.experimental.elasticsearch; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - import org.apache.logging.log4j.Logger; import org.apache.lucene.search.Query; import org.apache.lucene.util.automaton.RegExp; @@ -45,6 +34,17 @@ import org.wikimedia.search.highlighter.experimental.tools.GraphvizHit; import org.wikimedia.search.highlighter.experimental.tools.GraphvizHitEnum; import org.wikimedia.search.highlighter.experimental.tools.GraphvizSnippetFormatter; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; public class ExperimentalHighlighter implements Highlighter { public static final String NAME = "experimental"; @@ -134,6 +134,7 @@ static class HighlightExecutionContext { private static final String OPTION_RETURN_DEBUG_GRAPH = "return_debug_graph"; + private static final String OPTION_RETURN_SNIPPETS_WITH_OFFSET = "return_snippets_and_offsets"; private static final int DEFAULT_MAX_DETERMINIZED_STATES = 20000; private final HighlighterContext context; private final CacheEntry cache; @@ -522,6 +523,9 @@ formatter = new OffsetSnippetFormatter(); } else if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) { formatter = new GraphvizSnippetFormatter(defaultField.buildSourceExtracter()); + } else if (getOption(OPTION_RETURN_SNIPPETS_WITH_OFFSET, false)) { + formatter = new OffsetAugmenterSnippetFormatter(new SnippetFormatter.Default(defaultField.buildSourceExtracter(), context.field.fieldOptions().preTags()[0], + context.field.fieldOptions().postTags()[0])); } else { formatter = new SnippetFormatter.Default(defaultField.buildSourceExtracter(), context.field.fieldOptions().preTags()[0], context.field.fieldOptions().postTags()[0]); diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/OffsetAugmenterSnippetFormatter.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/OffsetAugmenterSnippetFormatter.java new file mode 100644 index 0000000..e56caad --- /dev/null +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/OffsetAugmenterSnippetFormatter.java @@ -0,0 +1,20 @@ +package org.wikimedia.highlighter.experimental.elasticsearch; + +import org.wikimedia.search.highlighter.experimental.Snippet; +import org.wikimedia.search.highlighter.experimental.SnippetFormatter; + +public class OffsetAugmenterSnippetFormatter implements SnippetFormatter { + private static final OffsetSnippetFormatter OFFSETS = new OffsetSnippetFormatter(); + private final SnippetFormatter formatter; + + public OffsetAugmenterSnippetFormatter(SnippetFormatter formatter) { + this.formatter = formatter; + } + + @Override + public String format(Snippet snippet) { + StringBuilder sb = new StringBuilder(); + sb.append(OFFSETS.format(snippet)); + return sb.append('|').append(formatter.format(snippet)).toString(); + } +} diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java index 21eb5f4..002d307 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java @@ -1,8 +1,35 @@ package org.wikimedia.highlighter.experimental.elasticsearch.integration; -import static org.elasticsearch.index.query.QueryBuilders.idsQuery; +import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableList; +import com.google.common.io.Resources; +import org.elasticsearch.action.admin.indices.forcemerge.ForceMergeResponse; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.search.SearchRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.StopWatch; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.junit.Test; +import org.wikimedia.highlighter.experimental.elasticsearch.AbstractExperimentalHighlighterIntegrationTestBase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ExecutionException; + import static org.elasticsearch.index.query.QueryBuilders.boolQuery; import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery; +import static org.elasticsearch.index.query.QueryBuilders.idsQuery; import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery; import static org.elasticsearch.index.query.QueryBuilders.matchQuery; import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; @@ -19,34 +46,6 @@ import static org.hamcrest.Matchers.both; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.ExecutionException; - -import com.google.common.collect.ImmutableList; - -import org.elasticsearch.action.admin.indices.forcemerge.ForceMergeResponse; -import org.elasticsearch.action.bulk.BulkRequestBuilder; -import org.elasticsearch.action.index.IndexRequestBuilder; -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.StopWatch; -import org.elasticsearch.common.unit.Fuzziness; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.json.JsonXContent; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; -import org.junit.Test; -import org.wikimedia.highlighter.experimental.elasticsearch.AbstractExperimentalHighlighterIntegrationTestBase; - -import com.google.common.base.Charsets; -import com.google.common.io.Resources; /** * Miscellaneous integration test that don't really have a good home. @@ -431,6 +430,28 @@ } @Test + public void offsetsAugmenter() throws IOException { + buildIndex(); + indexTestData(); + Map<String, Object> options = new HashMap<String, Object>(); + options.put("return_snippets_and_offsets", true); + SearchResponse response = testSearch(matchQuery("test.english", "test"), + x -> x.options(options).field("test.english")).get(); + assertHighlight(response, 0, "test.english", 0, equalTo("0:0-5,18-22:22|<em>tests</em> very simple <em>test</em>")); + } + + @Test + public void offsetsAugmenterWithEmptyArray() throws IOException { + buildIndex(); + indexTestData(Arrays.asList("", "after_empty_array")); + Map<String, Object> options = new HashMap<String, Object>(); + options.put("return_snippets_and_offsets", true); + SearchResponse response = testSearch(matchQuery("test.english", "after_empty_array"), + x -> x.options(options).field("test.english")).get(); + assertHighlight(response, 0, "test.english", 0, equalTo("1:1-18:18|<em>after_empty_array</em>")); + } + + @Test public void returnOffsetsMultiValued() throws IOException { buildIndex(); indexTestData(ImmutableList.of("tests very simple test", "with more test")); -- To view, visit https://gerrit.wikimedia.org/r/371747 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I69deeca8589138fe512f6fc782ecc80897cf26bb Gerrit-PatchSet: 1 Gerrit-Project: search/highlighter Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits