jenkins-bot has submitted this change and it was merged.
Change subject: Fix function_score and common_terms queries
......................................................................
Fix function_score and common_terms queries
also add a the max_expanded_terms option and change its default from 100 to
1024.
function_score is pretty much a passthrough for highlighting.
common_terms queries are handled by only highlighting the _uncommon_ terms
by default but the remove_high_freq_terms_from_common_terms option can be
set to false to leave them in.
Closes #14
Closes #13
Change-Id: I722f9a18eafb9f1c0af36ae03eaacbd227cbee32
---
M README.md
M
experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
M
experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java
A
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java
M experimental-highlighter-lucene/pom.xml
M
experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java
M
experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java
M
experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java
12 files changed, 461 insertions(+), 52 deletions(-)
Approvals:
Manybubbles: Looks good to me, approved
Jdouglas: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/README.md b/README.md
index d36cb45..cd49151 100644
--- a/README.md
+++ b/README.md
@@ -282,6 +282,20 @@
* category will only be highlighted if there isn't a match in section_heading,
redirect, or title.
+The ```remove_high_freq_terms_from_common_terms``` option can be used to
+highlight common terms when using the ```common_terms``` query. It defaults to
+```true``` meaning common terms will not be highlighted. Setting it to
+```false``` will highlight common terms in ```common_terms``` queries. Note
+that this behavior was added in 1.3.1, 1.4.3, and 1.5.0 and before that common
+terms were always highlighted by the ```common_terms``` query.
+
+The ```max_expanded_terms``` option can be used to control how many terms the
+highlighter expands multi term queries into. The default is 1024 which is the
+same as the ```fvh```'s default. Note that the highlighter doesn't need to
+expand all multi term queries because it has special handling for many of them.
+But when it does, this is how many terms it expands them into. This was added
+in 1.3.1, 1.4.3, and 1.5.0 and before the value was hard coded to 100.
+
Offsets in postings or term vectors
-----------------------------------
Since adding offsets to the postings (set ```index_options``` to ```offsets```
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
index 5cb5c55..a53e0f1 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
@@ -91,10 +91,15 @@
static class QueryCacheKey {
private final Query query;
+ private final int maxExpandedTerms;
private final boolean phraseAsTerms;
- public QueryCacheKey(Query query, boolean phraseAsTerms) {
+ private final boolean removeHighFrequencyTermsFromCommonTerms;
+
+ public QueryCacheKey(Query query, int maxExpandedTerms, boolean
phraseAsTerms, boolean removeHighFrequencyTermsFromCommonTerms) {
this.query = query;
+ this.maxExpandedTerms = maxExpandedTerms;
this.phraseAsTerms = phraseAsTerms;
+ this.removeHighFrequencyTermsFromCommonTerms =
removeHighFrequencyTermsFromCommonTerms;
}
@Override
@@ -102,6 +107,8 @@
final int prime = 31;
int result = 1;
result = prime * result + (phraseAsTerms ? 1231 : 1237);
+ result = prime * result + maxExpandedTerms;
+ result = prime * result + (removeHighFrequencyTermsFromCommonTerms
? 1231 : 1237);
result = prime * result + ((query == null) ? 0 : query.hashCode());
return result;
}
@@ -115,7 +122,11 @@
if (getClass() != obj.getClass())
return false;
QueryCacheKey other = (QueryCacheKey) obj;
+ if (maxExpandedTerms != other.maxExpandedTerms)
+ return false;
if (phraseAsTerms != other.phraseAsTerms)
+ return false;
+ if (removeHighFrequencyTermsFromCommonTerms !=
other.removeHighFrequencyTermsFromCommonTerms)
return false;
if (query == null) {
if (other.query != null)
@@ -231,7 +242,13 @@
}
return context.field.fieldOptions().options().get(key);
}
-
+
+ <T> T getOption(String key, T defaultValue) {
+ @SuppressWarnings("unchecked")
+ T value = (T) getOption(key);
+ return value == null ? defaultValue : value;
+ }
+
boolean scoreMatters() {
return scoreMatters;
}
@@ -240,13 +257,11 @@
if (weigher != null) {
return;
}
- boolean phraseAsTerms = false;
+ boolean phraseAsTerms = getOption("phrase_as_terms", false);
+ boolean removeHighFrequencyTermsFromCommonTerms =
getOption("remove_high_freq_terms_from_common_terms", true);
+ int maxExpandedTerms = getOption("max_expanded_terms", 1024);
// TODO simplify
- Boolean phraseAsTermsOption = (Boolean)
getOption("phrase_as_terms");
- if (phraseAsTermsOption != null) {
- phraseAsTerms = phraseAsTermsOption;
- }
- QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), phraseAsTerms);
+ QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms,
removeHighFrequencyTermsFromCommonTerms);
weigher = cache.queryWeighers.get(key);
if (weigher != null) {
return;
@@ -257,13 +272,13 @@
BigArrays.NON_RECYCLING_INSTANCE);
// context.context.addReleasable(infos);
weigher = new BasicQueryWeigher(
- new ElasticsearchQueryFlattener(100, phraseAsTerms), infos,
+ new ElasticsearchQueryFlattener(maxExpandedTerms,
phraseAsTerms, removeHighFrequencyTermsFromCommonTerms), infos,
context.hitContext.topLevelReader(),
context.query.originalQuery());
// Build the QueryWeigher with the top level reader to get all
// the frequency information
cache.queryWeighers.put(key, weigher);
}
-
+
/**
* Builds the hit enum including any required wrappers.
*/
@@ -307,13 +322,13 @@
}
Boolean caseInsensitiveOption = (Boolean)
getOption("regex_case_insensitive");
boolean caseInsensitive = caseInsensitiveOption == null ? false :
caseInsensitiveOption;
-
+
List<HitEnum> hitEnums = new ArrayList<>();
List<String> fieldValues = defaultField.getFieldValues();
if (fieldValues.size() == 0) {
return hitEnums;
}
-
+
for (String regex : getRegexes()) {
if (luceneRegex) {
if (caseInsensitive) {
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java
index 3c42ef6..5513966 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java
@@ -7,11 +7,20 @@
import org.apache.lucene.search.Query;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.common.lucene.search.XFilteredQuery;
+import
org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
+import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
import org.wikimedia.highlighter.experimental.lucene.QueryFlattener;
public class ElasticsearchQueryFlattener extends QueryFlattener {
- public ElasticsearchQueryFlattener(int maxMultiTermQueryTerms, boolean
phraseAsTerms) {
- super(maxMultiTermQueryTerms, phraseAsTerms);
+ /**
+ * Default configuration.
+ */
+ public ElasticsearchQueryFlattener() {
+ super();
+ }
+
+ public ElasticsearchQueryFlattener(int maxMultiTermQueryTerms, boolean
phraseAsTerms, boolean removeHighFrequencyTermsFromCommonTerms) {
+ super(maxMultiTermQueryTerms, phraseAsTerms,
removeHighFrequencyTermsFromCommonTerms);
}
@Override
@@ -23,6 +32,16 @@
}
if (query instanceof MultiPhrasePrefixQuery) {
flattenQuery((MultiPhrasePrefixQuery) query, pathBoost,
sourceOverride, reader,
+ callback);
+ return true;
+ }
+ if (query instanceof FunctionScoreQuery) {
+ flattenQuery((FunctionScoreQuery) query, pathBoost,
sourceOverride, reader,
+ callback);
+ return true;
+ }
+ if (query instanceof FiltersFunctionScoreQuery) {
+ flattenQuery((FiltersFunctionScoreQuery) query, pathBoost,
sourceOverride, reader,
callback);
return true;
}
@@ -77,4 +96,18 @@
callback.endPhrase(query.getField(), query.getSlop(), boost);
}
}
+
+ protected void flattenQuery(FunctionScoreQuery query, float pathBoost,
+ Object sourceOverride, IndexReader reader, Callback callback) {
+ if (query.getSubQuery() != null) {
+ flatten(query.getSubQuery(), pathBoost * query.getBoost(),
sourceOverride, reader, callback);
+ }
+ }
+
+ protected void flattenQuery(FiltersFunctionScoreQuery query, float
pathBoost,
+ Object sourceOverride, IndexReader reader, Callback callback) {
+ if (query.getSubQuery() != null) {
+ flatten(query.getSubQuery(), pathBoost * query.getBoost(),
sourceOverride, reader, callback);
+ }
+ }
}
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java
index b73cf0a..8ccc782 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java
@@ -51,6 +51,7 @@
throws IOException {
XContentBuilder mapping = jsonBuilder().startObject();
mapping.startObject("test").startObject("properties");
+ mapping.startObject("bar").field("type", "integer").endObject();
addField(mapping, "test", offsetsInPostings, fvhLikeTermVectors);
addField(mapping, "test2", offsetsInPostings, fvhLikeTermVectors);
mapping.startObject("foo").field("type").value("object").startObject("properties");
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java
index bc5ce5b..2d3ca22 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java
@@ -16,13 +16,22 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
+import
org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction;
+import
org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
+import
org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery.FilterFunction;
+import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+import org.elasticsearch.common.lucene.search.function.ScoreFunction;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
import org.wikimedia.highlighter.experimental.lucene.QueryFlattener.Callback;
public class ElasticsearchQueryFlattenerTest {
+ private final Term bar = new Term("foo", "bar");
+ private final ScoreFunction scoreFunction = new
FieldValueFactorFunction("foo", 1, FieldValueFactorFunction.Modifier.LN, null);
+
@Test
public void phrasePrefixQueryPhraseAsPhrase() {
phrasePrefixQueryTestCase(false);
@@ -31,6 +40,21 @@
@Test
public void phrasePrefixQueryPhraseAsTerms() {
phrasePrefixQueryTestCase(true);
+ }
+
+ @Test
+ public void functionScoreQuery() {
+ Callback callback = mock(Callback.class);
+ new ElasticsearchQueryFlattener().flatten(new FunctionScoreQuery(new
TermQuery(bar), scoreFunction), null, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
+ }
+
+ @Test
+ public void filtersFunctionScoreQuery() {
+ Callback callback = mock(Callback.class);
+ Query query = new FiltersFunctionScoreQuery(new TermQuery(bar), null,
new FilterFunction[] {}, 1);
+ new ElasticsearchQueryFlattener().flatten(query, null, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
}
private void phrasePrefixQueryTestCase(boolean phraseAsTerms) {
@@ -45,7 +69,7 @@
query.add(new Term[] { bar, anoth });
Callback callback = mock(Callback.class);
- new ElasticsearchQueryFlattener(1, phraseAsTerms).flatten(query, null,
callback);
+ new ElasticsearchQueryFlattener(1, phraseAsTerms, true).flatten(query,
null, callback);
// The first positions are sent as terms
verify(callback).flattened(foo.bytes(), phraseAsTerms ? 1f : 0, null);
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java
index 59235c8..37f3506 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java
@@ -1,8 +1,9 @@
package org.wikimedia.highlighter.experimental.elasticsearch.integration;
+import static org.elasticsearch.index.query.FilterBuilders.termFilter;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
+import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery;
-import static org.elasticsearch.index.query.QueryBuilders.queryString;
import static org.elasticsearch.index.query.QueryBuilders.rangeQuery;
import static org.elasticsearch.index.query.QueryBuilders.regexpQuery;
import static org.elasticsearch.index.query.QueryBuilders.spanFirstQuery;
@@ -12,6 +13,7 @@
import static org.elasticsearch.index.query.QueryBuilders.spanTermQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery;
+import static
org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders.fieldValueFactorFunction;
import static
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.Matchers.equalTo;
@@ -182,4 +184,30 @@
equalTo("<em>tests</em> very simple <em>test</em>"));
}
}
+
+ @Test
+ public void functionScoreQueryWithoutFilter() throws IOException {
+ buildIndex();
+ client().prepareIndex("test", "test", "1").setSource("test", "test",
"bar", 2).get();
+ refresh();
+
+ SearchRequestBuilder search =
testSearch(functionScoreQuery(termQuery("test",
"test")).add(fieldValueFactorFunction("bar")));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("<em>test</em>"));
+ }
+ }
+
+ @Test
+ public void functionScoreQueryWithFilter() throws IOException {
+ buildIndex();
+ client().prepareIndex("test", "test", "1").setSource("test", "test",
"bar", 2).get();
+ refresh();
+
+ SearchRequestBuilder search =
testSearch(functionScoreQuery(termQuery("test", "test")).add(termFilter("test",
"test"), fieldValueFactorFunction("bar")));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("<em>test</em>"));
+ }
+ }
}
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java
index 5090b77..9b896e7 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java
@@ -6,6 +6,7 @@
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.prefixQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryString;
+import static org.elasticsearch.index.query.QueryBuilders.rangeQuery;
import static org.elasticsearch.index.query.QueryBuilders.regexpQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery;
@@ -20,8 +21,10 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Locale;
+import java.util.Map;
import java.util.concurrent.ExecutionException;
import org.elasticsearch.action.admin.indices.optimize.OptimizeResponse;
@@ -355,6 +358,27 @@
assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em>
simple test"));
}
+ /**
+ * max_expanded_terms should control how many terms we expand multi term
+ * queries into when we expand multi term queries.
+ */
+ @Test
+ public void singleRangeQueryWithSmallRewrites() throws IOException {
+ buildIndex();
+ client().prepareIndex("test", "test", "2").setSource("test",
"test").get();
+ indexTestData();
+
+ Map<String, Object> options = new HashMap<String, Object>();
+ options.put("max_expanded_terms", 1);
+ SearchRequestBuilder search =
testSearch(rangeQuery("test").from("teso").to("tesz")).setHighlighterOptions(options);
+ for (String hitSource : HIT_SOURCES) {
+ options.put("hit_source", hitSource);
+ SearchResponse response = search.get();
+ assertHighlight(response, 0, "test", 0,
+ equalTo("tests very simple <em>test</em>"));
+ }
+ }
+
// TODO matched_fields with different hit source
- // TODO infer proper hit source
+ // TODO infer proper hit source
}
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java
new file mode 100644
index 0000000..bbc83e5
--- /dev/null
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java
@@ -0,0 +1,98 @@
+package org.wikimedia.highlighter.experimental.elasticsearch.integration;
+
+import static org.elasticsearch.index.query.FilterBuilders.idsFilter;
+import static org.elasticsearch.index.query.QueryBuilders.filteredQuery;
+import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
+import static
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
+import static org.hamcrest.Matchers.equalTo;
+
+import java.io.IOException;
+
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.index.query.MultiMatchQueryBuilder;
+import org.junit.Test;
+import
org.wikimedia.highlighter.experimental.elasticsearch.AbstractExperimentalHighlighterIntegrationTestBase;
+
+public class MultimatchTest extends
AbstractExperimentalHighlighterIntegrationTestBase {
+ @Test
+ public void multiMatch() throws IOException {
+ buildIndex();
+ indexTestData();
+
+ SearchRequestBuilder search = testSearch(multiMatchQuery("test",
"test"));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests very simple
<em>test</em>"));
+ }
+ }
+
+ @Test
+ public void multiMatchCutoffAllLow() throws IOException {
+ buildIndex(true, true, 1);
+ indexTestData();
+
+ SearchRequestBuilder search = testSearch(multiMatchQuery("very test",
"test").cutoffFrequency(1f));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests
<em>very</em> simple <em>test</em>"));
+ }
+ }
+
+ @Test
+ public void multiMatchCutoffHighAndLow() throws IOException {
+ buildIndex(true, true, 1);
+ client().prepareIndex("test", "test", "2").setSource("test",
"test").get();
+ indexTestData();
+
+ SearchRequestBuilder search = testSearch(multiMatchQuery("very test",
"test").cutoffFrequency(1f));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests
<em>very</em> simple test"));
+ }
+ }
+
+ @Test
+ public void multiMatchPhraseCutoffHighAndLow() throws IOException {
+ buildIndex(true, true, 1);
+ client().prepareIndex("test", "test", "2").setSource("test",
"simple").get();
+ indexTestData();
+
+ // Looks like phrase doesn't respect cutoff in multimatch
+ SearchRequestBuilder search = testSearch(multiMatchQuery("very
simple", "test").type(MultiMatchQueryBuilder.Type.PHRASE)
+ .cutoffFrequency(1));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests
<em>very</em> <em>simple</em> test"));
+ }
+ }
+
+ @Test
+ public void multiMatchPhrasePrefixCutoffHighAndLow() throws IOException {
+ buildIndex(true, true, 1);
+ client().prepareIndex("test", "test", "2").setSource("test",
"simple").get();
+ indexTestData();
+
+ // Looks like phrase doesn't respect cutoff in multimatch
+ SearchRequestBuilder search = testSearch(multiMatchQuery("very
simple", "test").type(MultiMatchQueryBuilder.Type.PHRASE_PREFIX)
+ .cutoffFrequency(1));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests
<em>very</em> <em>simple</em> test"));
+ }
+ }
+
+ @Test
+ public void multiMatchCutoffAllHigh() throws IOException {
+ buildIndex(true, true, 1);
+ client().prepareIndex("test", "test", "2").setSource("test", "very
test").get();
+ indexTestData();
+
+ SearchRequestBuilder search =
testSearch(filteredQuery(multiMatchQuery("very test",
"test").cutoffFrequency(1f),
+ idsFilter("test").addIds("1")));
+ for (String hitSource : HIT_SOURCES) {
+ SearchResponse response = setHitSource(search, hitSource).get();
+ assertHighlight(response, 0, "test", 0, equalTo("tests
<em>very</em> simple <em>test</em>"));
+ }
+ }
+}
diff --git a/experimental-highlighter-lucene/pom.xml
b/experimental-highlighter-lucene/pom.xml
index 3635c1b..fd7a245 100644
--- a/experimental-highlighter-lucene/pom.xml
+++ b/experimental-highlighter-lucene/pom.xml
@@ -30,6 +30,11 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queries</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
<artifactId>lucene-memory</artifactId>
<version>${lucene.version}</version>
<scope>test</scope>
diff --git
a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java
b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java
index 59296ea..09e651d 100644
---
a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java
+++
b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java
@@ -7,7 +7,9 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
@@ -44,16 +46,25 @@
private final Set<Object> sentAutomata = new HashSet<Object>();
private final int maxMultiTermQueryTerms;
private final boolean phraseAsTerms;
+ private final boolean removeHighFrequencyTermsFromCommonTerms;
- public QueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms) {
+ /**
+ * Default configuration.
+ */
+ public QueryFlattener() {
+ this(1000, false, true);
+ }
+
+ public QueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms,
boolean removeHighFrequencyTermsFromCommonTerms) {
this.maxMultiTermQueryTerms = maxMultiTermQueryTerms;
this.phraseAsTerms = phraseAsTerms;
+ this.removeHighFrequencyTermsFromCommonTerms =
removeHighFrequencyTermsFromCommonTerms;
}
public interface Callback {
/**
* Called once per query containing the term.
- *
+ *
* @param term the term
* @param boost weight of the term
* @param sourceOverride null if the source of the term is the query
@@ -65,7 +76,7 @@
/**
* Called with each new automaton. QueryFlattener makes an effort to
* only let the first copy of any duplicate automata through.
- *
+ *
* @param automaton automaton from the query
* @param boost weight of terms matchign the automaton
* @param source hashcode of the source. Automata don't have a hashcode
@@ -94,7 +105,7 @@
/**
* Should phrase queries be returned as terms?
- *
+ *
* @return true mean skip startPhrase and endPhrase and give the terms in a
* phrase the weight of the whole phrase
*/
@@ -130,19 +141,10 @@
flattenQuery((WildcardQuery) query, pathBoost, sourceOverride,
reader, callback);
} else if (query instanceof PrefixQuery) {
flattenQuery((PrefixQuery) query, pathBoost, sourceOverride,
reader, callback);
+ } else if (query instanceof CommonTermsQuery) {
+ flattenQuery((CommonTermsQuery) query, pathBoost, sourceOverride,
reader, callback);
} else if (!flattenUnknown(query, pathBoost, sourceOverride, reader,
callback)) {
- if (query instanceof MultiTermQuery) {
- MultiTermQuery copy = (MultiTermQuery) query.clone();
- copy.setRewriteMethod(new
MultiTermQuery.TopTermsScoringBooleanQueryRewrite(
- maxMultiTermQueryTerms));
- query = copy;
- }
- Query newRewritten;
- try {
- newRewritten = query.rewrite(reader);
- } catch (IOException e) {
- throw new WrappedExceptionFromLucene(e);
- }
+ Query newRewritten = rewriteQuery(query, pathBoost,
sourceOverride, reader);
if (newRewritten != query) {
// only rewrite once and then flatten again - the rewritten
// query could have a special treatment
@@ -377,6 +379,71 @@
callback.flattened(automaton, boost, source.hashCode());
}
+ protected void flattenQuery(CommonTermsQuery query, float pathBoost,
Object sourceOverride,
+ IndexReader reader, Callback callback) {
+ Query rewritten = rewriteQuery(query, pathBoost, sourceOverride,
reader);
+ if (!removeHighFrequencyTermsFromCommonTerms) {
+ flatten(rewritten, pathBoost, sourceOverride, reader, callback);
+ return;
+ }
+ /*
+ * Try to figure out if the query was rewritten into a list of low and
+ * high frequency terms. If it was, remove the high frequency terms.
+ *
+ * Note that this only works if high frequency terms are set to
+ * Occur.SHOULD and low frequency terms are set to Occur.MUST. That is
+ * usually the way it is done though.
+ */
+ if (!(rewritten instanceof BooleanQuery)) {
+ // Nope - its a term query or something more exotic
+ flatten(rewritten, pathBoost, sourceOverride, reader, callback);
+ }
+ BooleanQuery bq = (BooleanQuery) rewritten;
+ BooleanClause[] clauses = bq.getClauses();
+ if (clauses.length != 2) {
+ // Nope - its just a list of terms.
+ flattenQuery(bq, pathBoost, sourceOverride, reader, callback);
+ return;
+ }
+ if (clauses[0].getOccur() != Occur.SHOULD || clauses[1].getOccur() !=
Occur.MUST) {
+ // Nope - just a two term query
+ flattenQuery(bq, pathBoost, sourceOverride, reader, callback);
+ return;
+ }
+ if (!(clauses[0].getQuery() instanceof BooleanQuery &&
clauses[1].getQuery() instanceof BooleanQuery)) {
+ // Nope - terms of the wrong type. not sure how that happened.
+ flattenQuery(bq, pathBoost, sourceOverride, reader, callback);
+ return;
+ }
+ BooleanQuery lowFrequency = (BooleanQuery) clauses[1].getQuery();
+ flattenQuery(lowFrequency, pathBoost, sourceOverride, reader,
callback);
+ }
+
+ protected Query rewriteQuery(MultiTermQuery query, float pathBoost, Object
sourceOverride, IndexReader reader) {
+ query = (MultiTermQuery) query.clone();
+ query.setRewriteMethod(new
MultiTermQuery.TopTermsScoringBooleanQueryRewrite(
+ maxMultiTermQueryTerms));
+ return rewritePreparedQuery(query, pathBoost, sourceOverride, reader);
+ }
+
+ protected Query rewriteQuery(Query query, float pathBoost, Object
sourceOverride, IndexReader reader) {
+ if (query instanceof MultiTermQuery) {
+ return rewriteQuery((MultiTermQuery) query, pathBoost,
sourceOverride, reader);
+ }
+ return rewritePreparedQuery(query, pathBoost, sourceOverride, reader);
+ }
+
+ /**
+ * Rewrites a query that's already ready for rewriting.
+ */
+ protected Query rewritePreparedQuery(Query query, float pathBoost, Object
sourceOverride, IndexReader reader) {
+ try {
+ return query.rewrite(reader);
+ } catch (IOException e) {
+ throw new WrappedExceptionFromLucene(e);
+ }
+ }
+
private static class FuzzyQueryInfo {
private final String term;
private final int maxEdits;
diff --git
a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java
b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java
index 5edef9f..2a14445 100644
---
a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java
+++
b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java
@@ -37,7 +37,7 @@
private CompiledAutomaton acceptable;
public BasicQueryWeigher(IndexReader reader, Query query) {
- this(new QueryFlattener(1000, false), new HashMapTermInfos(), reader,
query);
+ this(new QueryFlattener(1000, false, true), new HashMapTermInfos(),
reader, query);
}
public BasicQueryWeigher(QueryFlattener flattener, final TermInfos
termInfos, IndexReader reader, Query query) {
diff --git
a/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java
b/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java
index cad5462..f3f943e 100644
---
a/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java
+++
b/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java
@@ -1,7 +1,6 @@
package org.wikimedia.highlighter.experimental.lucene;
import static org.hamcrest.Matchers.not;
-import static org.junit.Assert.assertThat;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyFloat;
import static org.mockito.Matchers.anyInt;
@@ -15,9 +14,20 @@
import static org.mockito.Mockito.when;
import static
org.wikimedia.highlighter.experimental.lucene.LuceneMatchers.recognises;
+import java.io.Closeable;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
@@ -28,20 +38,26 @@
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.hamcrest.Matcher;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
import org.wikimedia.highlighter.experimental.lucene.QueryFlattener.Callback;
-public class QueryFlattenerTest {
+import com.google.common.collect.Lists;
+
+public class QueryFlattenerTest extends LuceneTestCase {
+ private final List<Closeable> toClose = new ArrayList<>();
private final Term bar = new Term("foo", "bar");
private final Term baz = new Term("foo", "baz");
@Test
public void termQuery() {
Callback callback = mock(Callback.class);
- new QueryFlattener(1, false).flatten(new TermQuery(bar), null,
callback);
+ new QueryFlattener().flatten(new TermQuery(bar), null, callback);
verify(callback).flattened(bar.bytes(), 1f, null);
}
@@ -60,7 +76,7 @@
PhraseQuery q = new PhraseQuery();
q.add(bar);
q.add(baz);
- new QueryFlattener(1, phraseAsTerms).flatten(q, null, callback);
+ new QueryFlattener(1, phraseAsTerms, true).flatten(q, null, callback);
verify(callback).flattened(bar.bytes(), phraseAsTerms ? 1f : 0, null);
verify(callback).flattened(baz.bytes(), phraseAsTerms ? 1f : 0, null);
if (phraseAsTerms) {
@@ -82,7 +98,7 @@
BooleanQuery bq = new BooleanQuery();
bq.add(new BooleanClause(new TermQuery(bar), Occur.MUST));
bq.add(new BooleanClause(new TermQuery(baz), Occur.MUST_NOT));
- new QueryFlattener(1, false).flatten(bq, null, callback);
+ new QueryFlattener().flatten(bq, null, callback);
verify(callback).flattened(bar.bytes(), 1f, null);
verify(callback, never()).flattened(eq(baz.bytes()), anyFloat(),
isNull(Query.class));
}
@@ -92,53 +108,137 @@
Callback callback = mock(Callback.class);
Query rewritten = mock(Query.class);
when(rewritten.rewrite(null)).thenReturn(new TermQuery(bar));
- new QueryFlattener(1, false).flatten(rewritten, null, callback);
+ new QueryFlattener().flatten(rewritten, null, callback);
verify(callback).flattened(bar.bytes(), 1f, rewritten);
}
@Test
public void fuzzyQuery() {
- flattenedToAutomatonThatMatches(new FuzzyQuery(bar), recognises(bar),
recognises(baz),
- recognises("barr"), recognises("bor"),
not(recognises("barrrr")));
+ flattenedToAutomatonThatMatches(new FuzzyQuery(bar), recognises(bar),
recognises(baz), recognises("barr"), recognises("bor"),
+ not(recognises("barrrr")));
}
@Test
public void fuzzyQueryShorterThenPrefix() {
Callback callback = mock(Callback.class);
- new QueryFlattener(1, false).flatten(new FuzzyQuery(bar, 2, 100),
null, callback);
+ new QueryFlattener().flatten(new FuzzyQuery(bar, 2, 100), null,
callback);
verify(callback).flattened(bar.bytes(), 1f, null);
verify(callback, never()).flattened(any(Automaton.class), anyFloat(),
anyInt());
}
@Test
public void regexpQuery() {
- flattenedToAutomatonThatMatches(new RegexpQuery(new Term("test",
"ba[zr]")),
- recognises(bar), recognises(baz), not(recognises("barr")));
+ flattenedToAutomatonThatMatches(new RegexpQuery(new Term("test",
"ba[zr]")), recognises(bar), recognises(baz),
+ not(recognises("barr")));
}
@Test
public void wildcardQuery() {
- flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test",
"ba?")),
- recognises(bar), recognises(baz), not(recognises("barr")));
+ flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test",
"ba?")), recognises(bar), recognises(baz),
+ not(recognises("barr")));
- flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test",
"ba*")),
- recognises(bar), recognises(baz), recognises("barr"),
not(recognises("bor")));
+ flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test",
"ba*")), recognises(bar), recognises(baz), recognises("barr"),
+ not(recognises("bor")));
}
@Test
public void prefixQuery() {
- flattenedToAutomatonThatMatches(new PrefixQuery(new Term("test",
"ba")), recognises(bar),
- recognises(baz), recognises("barr"), not(recognises("bor")));
+ flattenedToAutomatonThatMatches(new PrefixQuery(new Term("test",
"ba")), recognises(bar), recognises(baz), recognises("barr"),
+ not(recognises("bor")));
+ }
+
+ @Test
+ public void commonTermsQueryNoRemove() {
+ IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 20),
baz, randomIntBetween(1, 20));
+ Callback callback = mock(Callback.class);
+ CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
10f);
+ q.add(bar);
+ q.add(baz);
+ new QueryFlattener(100, false, false).flatten(q, reader, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
+ verify(callback).flattened(baz.bytes(), 1f, null);
+ }
+
+ @Test
+ public void commonTermsQueryAllCommon() {
+ IndexReader reader = readerWithTerms(bar, randomIntBetween(11, 20),
baz, randomIntBetween(11, 20));
+ Callback callback = mock(Callback.class);
+ CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
10f);
+ q.add(bar);
+ q.add(baz);
+ new QueryFlattener().flatten(q, reader, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
+ verify(callback).flattened(baz.bytes(), 1f, null);
+ }
+
+ @Test
+ public void commonTermsQueryAllUncommon() {
+ IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 10),
baz, randomIntBetween(1, 10));
+ Callback callback = mock(Callback.class);
+ CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
10f);
+ q.add(bar);
+ q.add(baz);
+ new QueryFlattener().flatten(q, reader, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
+ verify(callback).flattened(baz.bytes(), 1f, null);
+ }
+
+ @Test
+ public void commonTermsQueryOneUncommon() {
+ IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 10),
baz, randomIntBetween(11, 20));
+ Callback callback = mock(Callback.class);
+ CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
10f);
+ q.add(bar);
+ q.add(baz);
+ new QueryFlattener().flatten(q, reader, callback);
+ verify(callback).flattened(bar.bytes(), 1f, null);
+ verify(callback, never()).flattened(eq(baz.bytes()), anyFloat(),
any(Object.class));
}
@SafeVarargs
private final void flattenedToAutomatonThatMatches(Query query,
Matcher<Automaton>... matchers) {
Callback callback = mock(Callback.class);
- new QueryFlattener(1, false).flatten(query, null, callback);
+ new QueryFlattener().flatten(query, null, callback);
ArgumentCaptor<Automaton> a = ArgumentCaptor.forClass(Automaton.class);
verify(callback).flattened(a.capture(), eq(1f), anyInt());
for (Matcher<Automaton> matcher : matchers) {
assertThat(a.getValue(), matcher);
}
}
+
+ private IndexReader readerWithTerms(Object... termsAndFreqs) {
+ try {
+ assertEquals("Expected an even number of terms and freqs", 0,
termsAndFreqs.length % 2);
+ Directory dir = newDirectory();
+ toClose.add(dir);
+ try (IndexWriter writer = new IndexWriter(dir,
newIndexWriterConfig(new KeywordAnalyzer()))) {
+ for (int i = 0; i < termsAndFreqs.length; i += 2) {
+ Term term = (Term) termsAndFreqs[i];
+ int freq = ((Number) termsAndFreqs[i + 1]).intValue();
+ for (int f = 0; f < freq; f++) {
+ writer.addDocument(Collections.singleton(new
TextField(term.field(), term.text(), Field.Store.NO)));
+ }
+ }
+ }
+ IndexReader reader = DirectoryReader.open(dir);
+ toClose.add(reader);
+ return reader;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private int randomIntBetween(int min, int max) {
+ return TestUtil.nextInt(random(), min, max);
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ super.tearDown();
+
+ for (Closeable c : Lists.reverse(toClose)) {
+ c.close();
+ }
+ toClose.clear();
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/207432
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I722f9a18eafb9f1c0af36ae03eaacbd227cbee32
Gerrit-PatchSet: 3
Gerrit-Project: search/highlighter
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Jdouglas <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits