Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/165480
Change subject: WIP: stuff ...................................................................... WIP: stuff 1. Javadoc 2. reject_unaccelerated to reject regexes that cannot be accelerated 3. test for changing gram size NOCOMMIT: The tests for changing the gram size are a bit flaky right now- they fail 1/3 of the time or so. Change-Id: I5fa944ab285b410103cce0cb30ec53fbc0d5705e --- M src/main/java/org/wikimedia/search/extra/ExtraPlugin.java A src/main/java/org/wikimedia/search/extra/package-info.java M src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java M src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterBuilder.java M src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterParser.java A src/main/java/org/wikimedia/search/extra/regex/UnableToAccelerateRegexException.java M src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java M src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java M src/main/java/org/wikimedia/search/extra/regex/expression/ExpressionSource.java M src/main/java/org/wikimedia/search/extra/regex/expression/False.java M src/main/java/org/wikimedia/search/extra/regex/expression/True.java M src/main/java/org/wikimedia/search/extra/regex/ngram/AutomatonTooComplexException.java M src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java M src/main/java/org/wikimedia/search/extra/regex/ngram/NGramExtractor.java M src/main/java/org/wikimedia/search/extra/util/FieldValues.java M src/main/java/org/wikimedia/search/extra/util/Functions.java A src/main/java/org/wikimedia/search/extra/util/package-info.java M src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java 18 files changed, 312 insertions(+), 81 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/extra refs/changes/80/165480/1 diff --git a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java index 70f0c95..f0a5b92 100644 --- a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java +++ b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java @@ -4,6 +4,9 @@ import org.elasticsearch.plugins.AbstractPlugin; import org.wikimedia.search.extra.regex.SourceRegexFilterParser; +/** + * Setup the Elasticsearch plugin. + */ public class ExtraPlugin extends AbstractPlugin { @Override public String description() { @@ -15,6 +18,9 @@ return "wikimedia-extra"; } + /** + * Register our parsers. + */ public void onModule(IndicesQueriesModule module) { module.addFilter(new SourceRegexFilterParser()); } diff --git a/src/main/java/org/wikimedia/search/extra/package-info.java b/src/main/java/org/wikimedia/search/extra/package-info.java new file mode 100644 index 0000000..910721c --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/package-info.java @@ -0,0 +1,4 @@ +/** + * Utilities. + */ +package org.wikimedia.search.extra; \ No newline at end of file diff --git a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java index 6461ca0..6f36195 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java +++ b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java @@ -33,13 +33,14 @@ private final int maxInspect; private final boolean caseSensitive; private final Locale locale; + private final boolean rejectUnaccelerated; private int inspected = 0; private Filter prefilter; private CharacterRunAutomaton charRun; public SourceRegexFilter(String fieldPath, FieldValues.Loader loader, String regex, String ngramFieldPath, int gramSize, int maxExpand, - int maxStatesTraced, int maxInspect, boolean caseSensitive, Locale locale) { + int maxStatesTraced, int maxInspect, boolean caseSensitive, Locale locale, boolean rejectUnaccelerated) { this.fieldPath = fieldPath; this.loader = loader; this.regex = regex; @@ -50,6 +51,7 @@ this.maxInspect = maxInspect; this.caseSensitive = caseSensitive; this.locale = locale; + this.rejectUnaccelerated = rejectUnaccelerated; } @Override @@ -73,6 +75,9 @@ Automaton automaton = new RegExp(regex.toLowerCase(locale), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(); Expression<String> expression = new NGramExtractor(gramSize, maxExpand, maxStatesTraced).extract(automaton).simplify(); if (expression.alwaysTrue()) { + if (rejectUnaccelerated) { + throw new UnableToAccelerateRegexException(regex, gramSize, ngramFieldPath); + } prefilter = Queries.MATCH_ALL_FILTER; } else if (expression.alwaysFalse()) { prefilter = Queries.MATCH_NO_FILTER; @@ -103,6 +108,7 @@ @Override protected boolean match(int docid) { if (inspected >= maxInspect) { + // TODO hook into the generic timeout mechanism when it is ready return false; } inspected++; diff --git a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterBuilder.java b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterBuilder.java index 14fee4d..356b361 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterBuilder.java +++ b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterBuilder.java @@ -6,6 +6,9 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.query.BaseFilterBuilder; +/** + * Builds source_regex filters. + */ public class SourceRegexFilterBuilder extends BaseFilterBuilder { private final String field; private final String regex; @@ -17,22 +20,43 @@ private Integer maxInspect; private Boolean caseSensitive; private Locale locale; + private Boolean rejectUnaccelerated; + /** + * Start building. + * @param field the field to load and run the regex against + * @param regex the regex to run + */ public SourceRegexFilterBuilder(String field, String regex) { this.field = field; this.regex = regex; } + /** + * @param loadFromSource should field be loaded from source (true) or from a + * stored field (false)? + * @return this for chaining + */ public SourceRegexFilterBuilder loadFromSource(boolean loadFromSource) { this.loadFromSource = loadFromSource; return this; } + /** + * @param ngramField field containing ngrams used to prefilter checked + * documents. If not set then no ngram acceleration is performed. + * @return this for chaining + */ public SourceRegexFilterBuilder ngramField(String ngramField) { this.ngramField = ngramField; return this; } + /** + * @param gramSize size of the gram. Defaults to 3 because everyone loves + * trigrams. + * @return this for chaining + */ public SourceRegexFilterBuilder gramSize(int gramSize) { this.gramSize = gramSize; return this; @@ -79,6 +103,16 @@ return this; } + /** + * @param rejectUnaccelerated should the filter reject regexes it cannot + * accelerate? + * @return this for chaining + */ + public SourceRegexFilterBuilder rejectUnaccelerated(boolean rejectUnaccelerated) { + this.rejectUnaccelerated = rejectUnaccelerated; + return this; + } + @Override protected void doXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject("source-regex"); @@ -109,6 +143,9 @@ if (locale != null) { builder.field("locale", locale); } + if (rejectUnaccelerated != null) { + builder.field("reject_unaccelerated", rejectUnaccelerated); + } builder.endObject(); } diff --git a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterParser.java b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterParser.java index 2c0f824..9c1c3e9 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterParser.java +++ b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilterParser.java @@ -11,6 +11,9 @@ import org.elasticsearch.index.query.QueryParsingException; import org.wikimedia.search.extra.util.FieldValues; +/** + * Parses source_regex filters. + */ public class SourceRegexFilterParser implements org.elasticsearch.index.query.FilterParser { private static final String[] NAMES = new String[] { "source_regex", "source-regex", "sourceRegex" }; @@ -32,6 +35,7 @@ int maxInspect = Integer.MAX_VALUE; boolean caseSensitive = false; Locale locale = Locale.ROOT; + boolean rejectUnaccelerated = false; // Stuff all filters have String filterName = null; @@ -69,6 +73,8 @@ caseSensitive = parser.booleanValue(); } else if ("locale".equals(currentFieldName)) { locale = LocaleUtils.parse(parser.text()); + } else if ("reject_unaccelerated".equals(currentFieldName) || "rejectUnaccelerated".equals(currentFieldName)) { + rejectUnaccelerated = parser.booleanValue(); } else if ("_cache".equals(currentFieldName)) { cache = parser.booleanValue(); } else if ("_name".equals(currentFieldName)) { @@ -82,14 +88,14 @@ } } - if (regex == null) { + if (regex == null || "".equals(regex)) { throw new QueryParsingException(parseContext.index(), "[source-regex] filter must specify [regex]"); } if (fieldPath == null) { throw new QueryParsingException(parseContext.index(), "[source-regex] filter must specify [field]"); } Filter filter = new SourceRegexFilter(fieldPath, loader, regex, ngramFieldPath, gramSize, maxExpand, maxStatesTraced, maxInspect, - caseSensitive, locale); + caseSensitive, locale, rejectUnaccelerated); if (cache) { filter = parseContext.cacheFilter(filter, cacheKey); } diff --git a/src/main/java/org/wikimedia/search/extra/regex/UnableToAccelerateRegexException.java b/src/main/java/org/wikimedia/search/extra/regex/UnableToAccelerateRegexException.java new file mode 100644 index 0000000..4c96d3e --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/regex/UnableToAccelerateRegexException.java @@ -0,0 +1,15 @@ +package org.wikimedia.search.extra.regex; + +import java.util.Locale; + +/** + * Thrown when the filter is unable to accelerate a regex and + * rejectUnaccelerated is set. + */ +public class UnableToAccelerateRegexException extends RuntimeException { + private static final long serialVersionUID = 2685216158813374775L; + + public UnableToAccelerateRegexException(String regex, int gramSize, String ngramField) { + super(String.format(Locale.ROOT, "Unable to accelerate %s with %s sized grams stored in %s", regex, gramSize, ngramField)); + } +} diff --git a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java index 5679047..33356e1 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java +++ b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java @@ -20,6 +20,11 @@ this.components = components; } + /** + * Transform the components of this composite expression. Used by subclasses when implementing transform. + * @param transformer transformer to use + * @return result of transforming components + */ protected <J> ImmutableSet<J> transformComponents(Expression.Transformer<T, J> transformer) { ImmutableSet.Builder<J> builder = ImmutableSet.builder(); for (Expression<T> component : components) { diff --git a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java index 57d91f6..f123201 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java +++ b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java @@ -7,6 +7,8 @@ * transformed. Simplifying expressions eliminates extraneous terms and factors * out common terms. Transformation allows client code to convert the expression * to some other (maybe evaluable) form. + * + * @param <T> type stored in leaves */ public interface Expression<T> { /** @@ -38,14 +40,51 @@ /** * Transform this expression into another form. + * + * @param <T> type stored in leaves + * @param <J> result of the transformation. */ <J> J transform(Transformer<T, J> transformer); + /** + * Transformer for expression components. + * + * @param <T> type stored in leaves + * @param <J> result of the transformation. + */ interface Transformer<T, J> { + /** + * Transform an expression that is always true. + */ J alwaysTrue(); + + /** + * Transform an expression that is always false. + */ J alwaysFalse(); + + /** + * Transform a leaf expression. + * + * @param t data stored in the leaf + * @return result of the transform + */ J leaf(T t); + + /** + * Transform an and expression. + * + * @param js transformed sub-expressions + * @return result of the transform + */ J and(ImmutableSet<J> js); + + /** + * Transform an or expression. + * + * @param js transformed sub-expressions + * @return result of the transform + */ J or(ImmutableSet<J> js); } } diff --git a/src/main/java/org/wikimedia/search/extra/regex/expression/ExpressionSource.java b/src/main/java/org/wikimedia/search/extra/regex/expression/ExpressionSource.java index fd0b9e1..9e824c5 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/expression/ExpressionSource.java +++ b/src/main/java/org/wikimedia/search/extra/regex/expression/ExpressionSource.java @@ -1,5 +1,14 @@ package org.wikimedia.search.extra.regex.expression; +/** + * Type that can be expressed as an expression. + * + * @param <T> type stored in leaves + */ public interface ExpressionSource<T> { + /** + * This expressed as an expression. The result might not be simplified so + * call simplify on it if you need it simplified. + */ Expression<T> expression(); } diff --git a/src/main/java/org/wikimedia/search/extra/regex/expression/False.java b/src/main/java/org/wikimedia/search/extra/regex/expression/False.java index c81d0a5..b5fcd75 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/expression/False.java +++ b/src/main/java/org/wikimedia/search/extra/regex/expression/False.java @@ -6,6 +6,9 @@ public class False<T> implements Expression<T> { private static final False<Object> TRUE = new False<>(); + /** + * There is only one false. + */ @SuppressWarnings("unchecked") public static <T> False<T> instance() { return (False<T>) TRUE; diff --git a/src/main/java/org/wikimedia/search/extra/regex/expression/True.java b/src/main/java/org/wikimedia/search/extra/regex/expression/True.java index 734084a..3a38d53 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/expression/True.java +++ b/src/main/java/org/wikimedia/search/extra/regex/expression/True.java @@ -6,6 +6,9 @@ public class True<T> implements Expression<T> { private static final True<Object> TRUE = new True<>(); + /** + * There is only one True. + */ @SuppressWarnings("unchecked") public static <T> True<T> instance() { return (True<T>) TRUE; diff --git a/src/main/java/org/wikimedia/search/extra/regex/ngram/AutomatonTooComplexException.java b/src/main/java/org/wikimedia/search/extra/regex/ngram/AutomatonTooComplexException.java index e0f2960..38cc9f0 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/ngram/AutomatonTooComplexException.java +++ b/src/main/java/org/wikimedia/search/extra/regex/ngram/AutomatonTooComplexException.java @@ -1,8 +1,15 @@ package org.wikimedia.search.extra.regex.ngram; +/** + * Thrown when the automaton is too complex to convert to ngrams (as measured by + * maxExpand). + */ public class AutomatonTooComplexException extends IllegalArgumentException { private static final long serialVersionUID = -4686819368713525883L; + /** + * Build it. + */ public AutomatonTooComplexException() { super("The supplied automaton is too complex to extract ngrams"); } diff --git a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java index 4e925f5..5b7d30f 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java +++ b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java @@ -33,6 +33,18 @@ private final List<NGramState> acceptStates = new ArrayList<>(); private final Map<NGramState, NGramState> states = new HashMap<>(); + /** + * Build it. + * @param source automaton to convert into an ngram automaton + * @param gramSize size of the grams to extract + * @param maxExpand Maximum size of range transitions to expand into single + * transitions. Its roughly analogous to the number of character + * in a character class before it is considered a wildcard for + * optimization purposes. + * @param maxStatesTraced maximum number of states traced during automaton + * functions. Higher number allow more complex automata to be + * converted to ngram expressions at the cost of more time. + */ public NGramAutomaton(Automaton source, int gramSize, int maxExpand, int maxStatesTraced) { this.source = source.getNumberedStates(); this.gramSize = gramSize; diff --git a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramExtractor.java b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramExtractor.java index 0c6f7e0..d4ff675 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramExtractor.java +++ b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramExtractor.java @@ -7,17 +7,35 @@ import org.wikimedia.search.extra.regex.expression.Leaf; import org.wikimedia.search.extra.regex.expression.True; +/** + * Extracts ngrams from automatons. + */ public class NGramExtractor { private final int gramSize; private final int maxExpand; private final int maxStatesTraced; + /** + * Build it. + * + * @param gramSize size of the ngram. The "n" in ngram. + * @param maxExpand Maximum size of range transitions to expand into single + * transitions. Its roughly analogous to the number of character + * in a character class before it is considered a wildcard for + * optimization purposes. + * @param maxStatesTraced maximum number of states traced during automaton + * functions. Higher number allow more complex automata to be + * converted to ngram expressions at the cost of more time. + */ public NGramExtractor(int gramSize, int maxExpand, int maxStatesTraced) { this.gramSize = gramSize; this.maxExpand = maxExpand; this.maxStatesTraced = maxStatesTraced; } + /** + * Extract an Expression containing ngrams from an automaton. + */ public Expression<String> extract(Automaton automaton) { if (automaton.getSingleton() != null) { int end = automaton.getSingleton().length() - gramSize + 1; diff --git a/src/main/java/org/wikimedia/search/extra/util/FieldValues.java b/src/main/java/org/wikimedia/search/extra/util/FieldValues.java index 7158a54..dc16f68 100644 --- a/src/main/java/org/wikimedia/search/extra/util/FieldValues.java +++ b/src/main/java/org/wikimedia/search/extra/util/FieldValues.java @@ -14,11 +14,17 @@ import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor; import org.elasticsearch.index.fieldvisitor.JustSourceFieldsVisitor; +/** + * Hub for fetching field values. + */ public abstract class FieldValues { /** * Loads field values. */ public interface Loader { + /** + * Load the value of the string at path from reader for docId. + */ List<String> load(String path, IndexReader reader, int docId) throws IOException; } diff --git a/src/main/java/org/wikimedia/search/extra/util/Functions.java b/src/main/java/org/wikimedia/search/extra/util/Functions.java index 9f6ea87..2790df3 100644 --- a/src/main/java/org/wikimedia/search/extra/util/Functions.java +++ b/src/main/java/org/wikimedia/search/extra/util/Functions.java @@ -6,6 +6,10 @@ * Some useful functions. */ public class Functions { + /** + * Call toString on the provided object. This normally comes with Guava but + * Elasticsearch shades it and that somehow makes it invisible. + */ public static Function<Object, String> toStringFunction() { return TO_STRING; } @@ -14,10 +18,6 @@ // Utils class } - /** - * Call toString on the provided object. This normally comes with Guava but - * Elasticsearch shades it and that somehow makes it invisible. - */ private static final Function<Object, String> TO_STRING = new Function<Object, String>() { @Override public String apply(Object obj) { diff --git a/src/main/java/org/wikimedia/search/extra/util/package-info.java b/src/main/java/org/wikimedia/search/extra/util/package-info.java new file mode 100644 index 0000000..bbf237a --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/util/package-info.java @@ -0,0 +1,4 @@ +/** + * Extracts ngrams from an {@link org.apache.lucene.util.automaton.Automaton}. + */ +package org.wikimedia.search.extra.util; \ No newline at end of file diff --git a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java index 4108342..4c6a309 100644 --- a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java +++ b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java @@ -28,13 +28,12 @@ @ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE, transportClientRatio = 0.0) public class SourceRegexFilterTest extends ElasticsearchIntegrationTest { @Test - public void basic() throws InterruptedException, ExecutionException, IOException { + public void basicUnacceleratedRegex() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(false, doc("findme", "test")); indexChaff(between(0, 10000)); - // Result is found by regex without acceleration - SearchResponse response = search("t..t").get(); + SearchResponse response = search(filter("t..t")).get(); assertSearchHits(response, "findme"); client().prepareDelete("test", "test", "findme").get(); @@ -42,68 +41,60 @@ refresh(); // Result isn't found when it is deleted - response = search("t..t").get(); + response = search(filter("t..t")).get(); assertHitCount(response, 0); } - /** - * Regex can match the whole string. - */ @Test - public void wholeString() throws InterruptedException, ExecutionException, IOException { + public void regexMatchesWholeString() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "test")); - SearchResponse response = search("test").get(); + SearchResponse response = search(filter("test")).get(); assertSearchHits(response, "findme"); } - /** - * Regex can match a word in the string. - */ @Test - public void instring() throws InterruptedException, ExecutionException, IOException { + public void regexMatchesPartOfString() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "I have the test in me.")); - SearchResponse response = search("test").get(); + SearchResponse response = search(filter("test")).get(); assertSearchHits(response, "findme"); } - /** - * Regex can match unicode characters. - */ @Test - public void unicode() throws InterruptedException, ExecutionException, IOException { + public void regexMatchesUnicodeCharacters() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "solved using only λ+μ function")); - SearchResponse response = search("only λ\\+μ").get(); + SearchResponse response = search(filter("only λ\\+μ")).get(); + assertSearchHits(response, "findme"); + + // It even works with ngram extraction! + response = search(filter("on[ly]y λ\\+μ")).get(); assertSearchHits(response, "findme"); } - /** - * maxStatesTraced limits the complexity of the regexes. - */ @Test - public void maxStatesTraced() throws InterruptedException, ExecutionException, IOException { + public void maxStatesTracedLimitsComplexityOfRegexes() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "test")); - SearchResponse response = search(new SourceRegexFilterBuilder("test", "te[st]t").ngramField("test.trigrams").maxStatesTraced(10)).get(); + SearchResponse response = search(filter("te[st]t").maxStatesTraced(10)).get(); assertHitCount(response, 1); - response = search(new SourceRegexFilterBuilder("test", "test").ngramField("test.trigrams").maxStatesTraced(0)).get(); + // maxStatesTraced isn't used when the regex is just a sequence of + // characters + response = search(filter("test").maxStatesTraced(0)).get(); assertHitCount(response, 1); + // But it is used when there are any complex things in it + assertFailures(search(filter("te[st]t").maxStatesTraced(0)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("complex")); // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but // I can't find any way from here to mark it otherwise. - assertFailures(search(new SourceRegexFilterBuilder("test", "te[st]t").ngramField("test.trigrams").maxStatesTraced(0)), - RestStatus.INTERNAL_SERVER_ERROR, containsString("complex")); } - /** - * maxInspect limits the number of matches. - */ @Test - public void maxInspect() throws InterruptedException, ExecutionException, IOException { + public void maxInspectLimitsNumberOfMatches() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "test")); - SearchResponse response = search(new SourceRegexFilterBuilder("test", "test").maxInspect(0)).get(); + SearchResponse response = search(filter("test").maxInspect(0)).get(); assertHitCount(response, 0); List<IndexRequestBuilder> builders = new ArrayList<>(); @@ -111,45 +102,91 @@ builders.add(doc("findme" + i, "test")); } indexRandom(true, builders); - response = search(new SourceRegexFilterBuilder("test", "test").maxInspect(10)).get(); + response = search(filter("test").maxInspect(10)).get(); assertHitCount(response, 10); } - /** - * Regex can can insensitively. - */ @Test - public void caseInsensitive() throws InterruptedException, ExecutionException, IOException { + public void rejectEmptyRegex() throws InterruptedException, ExecutionException, + IOException { + setup(); + // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but + // I can't find any way from here to mark it otherwise. + assertFailures(search(filter("")), + RestStatus.BAD_REQUEST, containsString("filter must specify [regex]")); + } + + @Test + public void rejectUnacceleratedCausesFailuresWhenItCannotAccelerateTheRegex() throws InterruptedException, ExecutionException, + IOException { + setup(); + indexRandom(true, doc("findme", "test")); + // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but + // I can't find any way from here to mark it otherwise. + + assertFailures(search(filter("...").rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + assertFailures(search(filter("t.p").rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + assertFailures(search(filter(".+pa").rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + assertFailures(search(filter("p").rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + } + + @Test + public void caseInsensitiveMatching() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "I have the test in me.")); - SearchResponse response = search("i h[ai]ve").get(); + SearchResponse response = search(filter("i h[ai]ve")).get(); assertSearchHits(response, "findme"); - response = search("I h[ai]ve").get(); + response = search(filter("I h[ai]ve")).get(); assertSearchHits(response, "findme"); } - /** - * Regex can match case sensitively. - */ @Test - public void caseSensitive() throws InterruptedException, ExecutionException, IOException { + public void caseSensitiveMatching() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "I have the test in me.")); - SearchResponse response = search(new SourceRegexFilterBuilder("test", "i h[ai]ve").ngramField("test.trigrams").caseSensitive(true)).get(); + SearchResponse response = search(filter("i h[ai]ve").caseSensitive(true)).get(); assertHitCount(response, 0); - response = search(new SourceRegexFilterBuilder("test", "I h[ai]ve").ngramField("test.trigrams").caseSensitive(true)).get(); + response = search(filter("I h[ai]ve").caseSensitive(true)).get(); assertSearchHits(response, "findme"); } - /** - * More complex regexes work. - */ @Test public void complex() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "I have the test in me.")); - SearchResponse response = search("h[efas] te.*me").get(); + SearchResponse response = search(filter("h[efas] te.*me")).get(); assertSearchHits(response, "findme"); + } + + @Test + public void changingGramSize() throws InterruptedException, ExecutionException, IOException { + setup(); + indexRandom(true, doc("findme", "I have the test in me.")); + + // You can change the gram size to allow more degenerate regexes! + SearchResponse response = search(filter("te.*me").gramSize(2).ngramField("test.bigram").rejectUnaccelerated(true)).get(); + assertSearchHits(response, "findme"); + + // Proof the regex would fail without the new gram size: + assertFailures(search(filter("te.*me").rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but + // I can't find any way from here to mark it otherwise. + + // You can also raise the gram size + response = search(filter("test.*me").gramSize(4).ngramField("test.quadgram").rejectUnaccelerated(true)).get(); + assertSearchHits(response, "findme"); + + // But that limits the regexes you can accelerate to those from which + // appropriate grams can be extracted + assertFailures(search(filter("tes.*me").gramSize(4).rejectUnaccelerated(true)), + RestStatus.INTERNAL_SERVER_ERROR, containsString("Unable to accelerate")); + // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but + // I can't find any way from here to mark it otherwise. } /** @@ -185,10 +222,10 @@ start = System.currentTimeMillis(); for (int i = 0; i < rounds; i++) { - SearchResponse response = search(regex).get(); + SearchResponse response = search(filter(regex)).get(); assertSearchHits(response, "findme"); } - logger.info("Accel: {}", (System.currentTimeMillis() - start) / rounds); + logger.info("Accelerated: {}", (System.currentTimeMillis() - start) / rounds); } private IndexRequestBuilder doc(String id, String fieldValue) { @@ -209,10 +246,10 @@ } } - private SearchRequestBuilder search(String regex) { + private SourceRegexFilterBuilder filter(String regex) { SourceRegexFilterBuilder builder = new SourceRegexFilterBuilder("test", regex); - builder.ngramField("test.trigrams"); - return search(builder); + builder.ngramField("test.trigram"); + return builder; } private SearchRequestBuilder search(SourceRegexFilterBuilder builder) { @@ -224,34 +261,25 @@ mapping.startObject("test").startObject("properties"); mapping.startObject("test"); mapping.field("type", "string"); - { - mapping.startObject("fields").startObject("trigrams"); - mapping.field("type", "string"); - mapping.field("analyzer", "trigram"); - mapping.endObject().endObject(); - } + mapping.startObject("fields"); + buildSubfield(mapping, "bigram"); + buildSubfield(mapping, "trigram"); + buildSubfield(mapping, "quadgram"); + mapping.endObject(); mapping.endObject(); XContentBuilder settings = jsonBuilder().startObject().startObject("index"); settings.field("number_of_shards", 1); settings.startObject("analysis"); settings.startObject("analyzer"); - { - settings.startObject("trigram"); - settings.field("type", "custom"); - settings.field("tokenizer", "trigram"); - settings.field("filter", new String[] {"lowercase"}); - settings.endObject(); - } + buildNgramAnalyzer(settings, "bigram"); + buildNgramAnalyzer(settings, "trigram"); + buildNgramAnalyzer(settings, "quadgram"); settings.endObject(); settings.startObject("tokenizer"); - { - settings.startObject("trigram"); - settings.field("type", "nGram"); - settings.field("min_gram", "3"); - settings.field("max_gram", "3"); - settings.endObject(); - } + buildNgramTokenizer(settings, "bigram", 2); + buildNgramTokenizer(settings, "trigram", 3); + buildNgramTokenizer(settings, "quadgram", 4); settings.endObject(); settings.endObject().endObject(); // System.err.println(settings.string()); @@ -260,6 +288,29 @@ ensureYellow(); } + private void buildSubfield(XContentBuilder mapping, String name) throws IOException { + mapping.startObject(name); + mapping.field("type", "string"); + mapping.field("analyzer", name); + mapping.endObject(); + } + + private void buildNgramAnalyzer(XContentBuilder settings, String name) throws IOException { + settings.startObject(name); + settings.field("type", "custom"); + settings.field("tokenizer", name); + settings.field("filter", new String[] {"lowercase"}); + settings.endObject(); + } + + private void buildNgramTokenizer(XContentBuilder settings, String name, int size) throws IOException { + settings.startObject(name); + settings.field("type", "nGram"); + settings.field("min_gram", size); + settings.field("max_gram", size); + settings.endObject(); + } + /** * Enable plugin loading. */ -- To view, visit https://gerrit.wikimedia.org/r/165480 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5fa944ab285b410103cce0cb30ec53fbc0d5705e Gerrit-PatchSet: 1 Gerrit-Project: search/extra Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits