Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/207144
Change subject: Add field_value_factor_with_default ...................................................................... Add field_value_factor_with_default This is a backport of an elasticsearch feature to add the "missing" field to field_value_factor. That field contains a value that is used by the field_value_factor when the document is missing the field. Change-Id: Ic465b9fe88caf1ce6520f8b376956f2737695269 --- M README.md A docs/field_value_factor_with_default.md M src/main/java/org/wikimedia/search/extra/ExtraPlugin.java A src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefault.java A src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultBuilder.java A src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultParser.java A src/main/java/org/wikimedia/search/extra/fieldvaluefactor/package-info.java M src/main/java/org/wikimedia/search/extra/idhashmod/IdHashModFilter.java A src/main/java/org/wikimedia/search/extra/idhashmod/package-info.java A src/test/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorWithDefaultTest.java 10 files changed, 414 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/extra refs/changes/44/207144/1 diff --git a/README.md b/README.md index 2cb292a..ff45ea6 100644 --- a/README.md +++ b/README.md @@ -11,13 +11,19 @@ * [id_hash_mod](docs/id_hash_mod.md) - Filter used to select all documents independantly. For example, it can be used by multiple processes to reindex all documents without any interprocess communication. Added in 1.5.0, 1.4.1, -and 1.3.1. +and 1.3.0. Queries: * [safer](docs/safer.md) - Wraps other queries and analyzes them for potentially expensive constructs. Expensive constructs either cause errors to be sent back to the user or are degraded into cheaper, less precise constructs. 
+Score Functions: +* [field_value_factor_with_default](docs/field_value_factor_with_default.md) - +Just like field_value_factor except it supports a ```missing``` parameter that +is the value used if the field is missing from the document being scored. Added +in 1.5.0, 1.4.1, and 1.3.0. + | Extra Queries and Filters Plugin | ElasticSearch | |----------------------------------|-----------------| | 1.4.0, master branch | 1.4.1 -> 1.4.X | diff --git a/docs/field_value_factor_with_default.md b/docs/field_value_factor_with_default.md new file mode 100644 index 0000000..a5e6101 --- /dev/null +++ b/docs/field_value_factor_with_default.md @@ -0,0 +1,7 @@ +field_value_factor_with_default +=============================== + +The ```field_value_factor_with_default``` is a backport of [an Elasticsearch feature](https://github.com/elastic/elasticsearch/issues/10841) +that will be available in Elasticsearch 1.6.0 and 2.0.0 to support a +```missing``` parameter that functions as a default value to use when scoring +documents that are missing the field used to score the ```field_value_factor```. 
diff --git a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java index 232abfa..3408663 100644 --- a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java +++ b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java @@ -8,8 +8,10 @@ import org.elasticsearch.common.inject.multibindings.Multibinder; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.query.QueryParser; +import org.elasticsearch.index.query.functionscore.FunctionScoreModule; import org.elasticsearch.indices.query.IndicesQueriesModule; import org.elasticsearch.plugins.AbstractPlugin; +import org.wikimedia.search.extra.fieldvaluefactor.FieldValueFactorFunctionWithDefaultParser; import org.wikimedia.search.extra.idhashmod.IdHashModFilterParser; import org.wikimedia.search.extra.regex.SourceRegexFilterParser; import org.wikimedia.search.extra.safer.ActionModuleParser; @@ -41,6 +43,13 @@ module.addQuery((Class<QueryParser>) (Class<?>)SaferQueryParser.class); } + /** + * Register our parsers. + */ + public void onModule(FunctionScoreModule module) { + module.registerParser(FieldValueFactorFunctionWithDefaultParser.class); + } + @Override public Collection<Class<? extends Module>> modules() { return ImmutableList.<Class<? 
extends Module>>of(SafeifierActionsModule.class); diff --git a/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefault.java b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefault.java new file mode 100644 index 0000000..86a4f7e --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefault.java @@ -0,0 +1,150 @@ +package org.wikimedia.search.extra.fieldvaluefactor; + +import java.util.Locale; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.Explanation; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.lucene.search.function.CombineFunction; +import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; +import org.elasticsearch.common.lucene.search.function.ScoreFunction; +import org.elasticsearch.index.fielddata.IndexNumericFieldData; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; + +/** + * Implements field_value_factor_with_default. Basically a copy of Elasticsearch's + * FieldValueFactorParser in 1.4 with + * https://github.com/elastic/elasticsearch/pull/10845 applied. 
+ */ +public class FieldValueFactorFunctionWithDefault extends ScoreFunction { + private final String field; + private final float boostFactor; + private final FieldValueFactorFunction.Modifier modifier; + private final Double missing; + private final IndexNumericFieldData indexFieldData; + private SortedNumericDoubleValues values; + + public FieldValueFactorFunctionWithDefault(String field, float boostFactor, FieldValueFactorFunction.Modifier modifierType, + Double missing, IndexNumericFieldData indexFieldData) { + super(CombineFunction.MULT); + this.field = field; + this.boostFactor = boostFactor; + this.modifier = modifierType; + this.missing = missing; + this.indexFieldData = indexFieldData; + } + + @Override + public void setNextReader(AtomicReaderContext context) { + this.values = this.indexFieldData.load(context).getDoubleValues(); + } + + @Override + public double score(int docId, float subQueryScore) { + this.values.setDocument(docId); + final int numValues = this.values.count(); + double value; + if (numValues > 0) { + value = this.values.valueAt(0); + } else if (missing != null) { + value = missing; + } else { + throw new ElasticsearchException("Missing value for field [" + field + "]"); + } + double val = value * boostFactor; + double result = modifier.apply(val); + if (Double.isNaN(result) || Double.isInfinite(result)) { + throw new ElasticsearchException("Result of field modification [" + modifier.toString() + "(" + val + ")] must be a number"); + } + return result; + } + + @Override + public Explanation explainScore(int docId, float subQueryScore) { + Explanation exp = new Explanation(); + String modifierStr = modifier != null ? modifier.toString() : ""; + String defaultStr = missing != null ? 
"?:" + missing : ""; + double score = score(docId, subQueryScore); + exp.setValue(CombineFunction.toFloat(score)); + exp.setDescription(String.format(Locale.ROOT, "field value function: %s(doc['%s'].value%s * factor=%s)", modifierStr, field, + defaultStr, boostFactor)); + return exp; + } + + /** + * The Type class encapsulates the modification types that can be applied to + * the score/value product. + */ + public enum Modifier { + NONE { + @Override + public double apply(double n) { + return n; + } + }, + LOG { + @Override + public double apply(double n) { + return Math.log10(n); + } + }, + LOG1P { + @Override + public double apply(double n) { + return Math.log10(n + 1); + } + }, + LOG2P { + @Override + public double apply(double n) { + return Math.log10(n + 2); + } + }, + LN { + @Override + public double apply(double n) { + return Math.log(n); + } + }, + LN1P { + @Override + public double apply(double n) { + return Math.log1p(n); + } + }, + LN2P { + @Override + public double apply(double n) { + return Math.log1p(n + 1); + } + }, + SQUARE { + @Override + public double apply(double n) { + return Math.pow(n, 2); + } + }, + SQRT { + @Override + public double apply(double n) { + return Math.sqrt(n); + } + }, + RECIPROCAL { + @Override + public double apply(double n) { + return 1.0 / n; + } + }; + + public abstract double apply(double n); + + @Override + public String toString() { + if (this == NONE) { + return ""; + } + return super.toString().toLowerCase(Locale.ROOT); + } + } +} diff --git a/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultBuilder.java b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultBuilder.java new file mode 100644 index 0000000..5b72b0c --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultBuilder.java @@ -0,0 +1,66 @@ +package org.wikimedia.search.extra.fieldvaluefactor; + +import 
java.io.IOException; +import java.util.Locale; + +import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilder; + +/** + * Builds field_value_factor_with_default. Basically a copy of Elasticsearch's + * FieldValueFactorParser in 1.4 with + * https://github.com/elastic/elasticsearch/pull/10845 applied. + */ +public class FieldValueFactorFunctionWithDefaultBuilder extends ScoreFunctionBuilder { + private String field = null; + private Float factor = null; + private FieldValueFactorFunction.Modifier modifier = null; + private Double missing = null; + + public FieldValueFactorFunctionWithDefaultBuilder(String fieldName) { + this.field = fieldName; + } + + @Override + public String getName() { + return FieldValueFactorFunctionWithDefaultParser.NAMES[0]; + } + + public FieldValueFactorFunctionWithDefaultBuilder factor(float boostFactor) { + this.factor = boostFactor; + return this; + } + + public FieldValueFactorFunctionWithDefaultBuilder modifier(FieldValueFactorFunction.Modifier modifier) { + this.modifier = modifier; + return this; + } + + public FieldValueFactorFunctionWithDefaultBuilder missing(double missing) { + this.missing = missing; + return this; + } + + @Override + public void doXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(getName()); + if (field != null) { + builder.field("field", field); + } + + if (factor != null) { + builder.field("factor", factor); + } + + if (modifier != null) { + builder.field("modifier", modifier.toString().toLowerCase(Locale.ROOT)); + } + + if (missing != null) { + builder.field("missing", missing); + } + + builder.endObject(); + } +} diff --git a/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultParser.java 
b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultParser.java new file mode 100644 index 0000000..8e65667 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorFunctionWithDefaultParser.java @@ -0,0 +1,70 @@ +package org.wikimedia.search.extra.fieldvaluefactor; + +import java.io.IOException; +import java.util.Locale; + +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; +import org.elasticsearch.common.lucene.search.function.ScoreFunction; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.fielddata.IndexNumericFieldData; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.index.query.QueryParsingException; +import org.elasticsearch.index.query.functionscore.ScoreFunctionParser; +import org.elasticsearch.search.internal.SearchContext; + +/** + * Parses field_value_factor_with_default. Basically a copy of Elasticsearch's + * FieldValueFactorParser in 1.4 with + * https://github.com/elastic/elasticsearch/pull/10845 applied. 
+ */ +public class FieldValueFactorFunctionWithDefaultParser implements ScoreFunctionParser { + public static String[] NAMES = { "field_value_factor_with_default", "fieldValueFactorWithDefault" }; + + @Override + public ScoreFunction parse(QueryParseContext parseContext, XContentParser parser) throws IOException, QueryParsingException { + + String currentFieldName = null; + String field = null; + float boostFactor = 1; + FieldValueFactorFunction.Modifier modifier = FieldValueFactorFunction.Modifier.NONE; + Double missing = null; + XContentParser.Token token; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token.isValue()) { + if ("field".equals(currentFieldName)) { + field = parser.text(); + } else if ("factor".equals(currentFieldName)) { + boostFactor = parser.floatValue(); + } else if ("modifier".equals(currentFieldName)) { + modifier = FieldValueFactorFunction.Modifier.valueOf(parser.text().toUpperCase(Locale.ROOT)); + } else if ("missing".equals(currentFieldName)) { + missing = parser.doubleValue(); + } else { + throw new QueryParsingException(parseContext.index(), NAMES[0] + " query does not support [" + currentFieldName + "]"); + } + } + } + + if (field == null) { + throw new QueryParsingException(parseContext.index(), "[" + NAMES[0] + "] required field 'field' missing"); + } + + SearchContext searchContext = SearchContext.current(); + @SuppressWarnings("rawtypes") + FieldMapper mapper = searchContext.mapperService().smartNameFieldMapper(field); + if (mapper == null) { + throw new ElasticsearchException("Unable to find a field mapper for field [" + field + "]"); + } + return new FieldValueFactorFunctionWithDefault(field, boostFactor, modifier, missing, (IndexNumericFieldData) searchContext.fieldData() + .getForField(mapper)); + } + + @Override + public String[] getNames() { + return NAMES; + } +} diff --git 
a/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/package-info.java b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/package-info.java new file mode 100644 index 0000000..695c1da --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/fieldvaluefactor/package-info.java @@ -0,0 +1,5 @@ +/** + * Default value support for field_value_factor named field_value_factor_default. + * Backport of https://github.com/elastic/elasticsearch/pull/10845. + */ +package org.wikimedia.search.extra.fieldvaluefactor; \ No newline at end of file diff --git a/src/main/java/org/wikimedia/search/extra/idhashmod/IdHashModFilter.java b/src/main/java/org/wikimedia/search/extra/idhashmod/IdHashModFilter.java index 7bbfa11..8fcf69a 100644 --- a/src/main/java/org/wikimedia/search/extra/idhashmod/IdHashModFilter.java +++ b/src/main/java/org/wikimedia/search/extra/idhashmod/IdHashModFilter.java @@ -13,11 +13,11 @@ import org.elasticsearch.index.fielddata.ScriptDocValues; /** - * Filters to fields who's _uid's hash matches a number mod some other number. + * Filters to document's who's _uid's hash matches a number mod some other number. * Its a simple way of slicing the index into chunks that can be processed * totally independently. Its used by CirrusSearch to reindex in multiple * Independent processes. Its the same as the following script: - * + * * <pre> * {@code * "filter" : { @@ -31,7 +31,7 @@ * } * } * </pre> - * + * * Note that using the reader's native docIds won't give you a consistent view * across all shards but would be faster. It might work in a scroll context * which is how you'd use this query anyway. 
On the other hand this is fast diff --git a/src/main/java/org/wikimedia/search/extra/idhashmod/package-info.java b/src/main/java/org/wikimedia/search/extra/idhashmod/package-info.java new file mode 100644 index 0000000..fb0c94c --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/idhashmod/package-info.java @@ -0,0 +1,6 @@ +/** + * Filters to document's who's _uid's hash matches a number mod some other number. + * Its a simple way of slicing the index into chunks that can be processed + * totally independently. + */ +package org.wikimedia.search.extra.idhashmod; \ No newline at end of file diff --git a/src/test/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorWithDefaultTest.java b/src/test/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorWithDefaultTest.java new file mode 100644 index 0000000..5284624 --- /dev/null +++ b/src/test/java/org/wikimedia/search/extra/fieldvaluefactor/FieldValueFactorWithDefaultTest.java @@ -0,0 +1,91 @@ +package org.wikimedia.search.extra.fieldvaluefactor; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; +import static org.elasticsearch.index.query.QueryBuilders.simpleQueryString; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertOrderedSearchHits; + +import java.io.IOException; + +import org.elasticsearch.action.search.SearchPhaseExecutionException; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; +import org.junit.Test; +import org.wikimedia.search.extra.AbstractPluginIntegrationTest; + +/** + * Tests field_value_factor_with_default. 
Basically a copy of Elasticsearch's + * FunctionScoreFieldValueTests with + * https://github.com/elastic/elasticsearch/pull/10845 applied. + */ +public class FieldValueFactorWithDefaultTest extends AbstractPluginIntegrationTest { + @Test + public void testFieldValueFactor() throws IOException { + assertAcked(prepareCreate("test").addMapping( + "type1", + jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("test") + .field("type", randomFrom(new String[] { "short", "float", "long", "integer", "double" })).endObject() + .startObject("body").field("type", "string").endObject().endObject().endObject().endObject()).get()); + ensureYellow(); + + client().prepareIndex("test", "type1", "1").setSource("test", 5, "body", "foo").get(); + client().prepareIndex("test", "type1", "2").setSource("test", 17, "body", "foo").get(); + client().prepareIndex("test", "type1", "3").setSource("body", "bar").get(); + + refresh(); + + // document 2 scores higher because 17 > 5 + SearchResponse response = client().prepareSearch("test").setExplain(randomBoolean()) + .setQuery(functionScoreQuery(simpleQueryString("foo"), new FieldValueFactorFunctionWithDefaultBuilder("test"))).get(); + assertOrderedSearchHits(response, "2", "1"); + + // document 1 scores higher because 1/5 > 1/17 + response = client() + .prepareSearch("test") + .setExplain(randomBoolean()) + .setQuery( + functionScoreQuery(simpleQueryString("foo"), new FieldValueFactorFunctionWithDefaultBuilder("test") + .modifier(FieldValueFactorFunction.Modifier.RECIPROCAL))).get(); + assertOrderedSearchHits(response, "1", "2"); + + // doc 3 doesn't have a "test" field, so an exception will be thrown + try { + response = client().prepareSearch("test").setExplain(randomBoolean()) + .setQuery(functionScoreQuery(matchAllQuery(), new FieldValueFactorFunctionWithDefaultBuilder("test"))).get(); + assertFailures(response); + } catch (SearchPhaseExecutionException e) { + // We are expecting an exception, because 3 
has no field + } + + // doc 3 doesn't have a "test" field but we're defaulting it to 100 so + // it should be last + response = client() + .prepareSearch("test") + .setExplain(randomBoolean()) + .setQuery( + functionScoreQuery( + matchAllQuery(), + new FieldValueFactorFunctionWithDefaultBuilder("test").modifier( + FieldValueFactorFunction.Modifier.RECIPROCAL).missing(100))).get(); + assertOrderedSearchHits(response, "1", "2", "3"); + + // n divided by 0 is infinity, which should provoke an exception. + try { + response = client() + .prepareSearch("test") + .setExplain(randomBoolean()) + .setQuery( + functionScoreQuery( + simpleQueryString("foo"), + new FieldValueFactorFunctionWithDefaultBuilder("test").modifier( + FieldValueFactorFunction.Modifier.RECIPROCAL).factor(0))).get(); + assertFailures(response); + } catch (SearchPhaseExecutionException e) { + // This is fine, the query will throw an exception if executed + // locally, instead of just having failures + } + } +} -- To view, visit https://gerrit.wikimedia.org/r/207144 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic465b9fe88caf1ce6520f8b376956f2737695269 Gerrit-PatchSet: 1 Gerrit-Project: search/extra Gerrit-Branch: master Gerrit-Owner: Manybubbles <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
