DCausse has uploaded a new change for review. https://gerrit.wikimedia.org/r/219182
Change subject: WIP: Add Safeifier ActionModule to verify wildcard before compiling ...................................................................... WIP: Add Safeifier ActionModule to verify wildcard before compiling I will comment this patch in the phab ticket Bug: T102589 Change-Id: Ifcc9e55a4d81d48c05a76ca1afdd9b3adbf86006 --- M pom.xml A src/main/java/org/elasticsearch/index/query/UnsafeSaferQueryParseContext.java M src/main/java/org/wikimedia/search/extra/ExtraPlugin.java M src/main/java/org/wikimedia/search/extra/safer/Safeifier.java M src/main/java/org/wikimedia/search/extra/safer/SaferQueryBuilder.java M src/main/java/org/wikimedia/search/extra/safer/SaferQueryParser.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsAction.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModule.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModuleParser.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/DelayedWildcardQuery.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/SaferMapperQueryParser.java A src/main/java/org/wikimedia/search/extra/safer/wildcard/TooManyWildcardsException.java A src/test/java/org/wikimedia/search/extra/safer/CheckWildcardsTest.java M src/test/java/org/wikimedia/search/extra/safer/SaferQueryStringTest.java 14 files changed, 749 insertions(+), 30 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/extra refs/changes/82/219182/1 diff --git a/pom.xml b/pom.xml index ec14044..a01aead 100644 --- a/pom.xml +++ b/pom.xml @@ -43,7 +43,7 @@ <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - <elasticsearch.version>1.3.4</elasticsearch.version> + <elasticsearch.version>1.3.6</elasticsearch.version> <lucene.version>4.9.1</lucene.version> </properties> diff --git a/src/main/java/org/elasticsearch/index/query/UnsafeSaferQueryParseContext.java b/src/main/java/org/elasticsearch/index/query/UnsafeSaferQueryParseContext.java new file mode 100644 index 0000000..54dfa4c --- /dev/null +++ b/src/main/java/org/elasticsearch/index/query/UnsafeSaferQueryParseContext.java @@ -0,0 +1,290 @@ +package org.elasticsearch.index.query; + +import java.io.IOException; +import java.util.Collection; +import java.util.EnumSet; +import java.util.Set; + +import org.apache.lucene.queryparser.classic.MapperQueryParser; +import org.apache.lucene.queryparser.classic.QueryParserSettings; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.similarities.Similarity; +import org.elasticsearch.cache.recycler.CacheRecycler; +import org.elasticsearch.common.ParseField.Flag; +import org.elasticsearch.common.collect.ImmutableMap; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.common.lucene.search.NoCacheQuery; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.cache.filter.support.CacheKeyFilter.Key; +import org.elasticsearch.index.cache.query.parser.QueryParserCache; +import org.elasticsearch.index.engine.IndexEngine; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.MapperService.SmartNameFieldMappers; +import org.elasticsearch.index.mapper.MapperService.SmartNameObjectMapper; +import org.elasticsearch.index.search.child.CustomQueryWrappingFilter; +import org.elasticsearch.index.similarity.SimilarityService; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.search.lookup.SearchLookup; +import org.wikimedia.search.extra.safer.SaferQueryParser; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsActionModule; +import org.wikimedia.search.extra.safer.wildcard.SaferMapperQueryParser; + +/** + * Hack for {@link CheckWildcardsActionModule} + * + * This class is just here to allow a custom MapperQueryParser to be used into {@link QueryStringQueryParser} + * XXX: Potential flaw, this object is stored in a thread local cache in {@link SaferQueryParser} + * + * The normal QueryParseContext seems to be "index scoped" (see the thread local cache {@link IndexQueryParserService}) + * but this one will be node scoped (it's not clear to me + * + * in org.elasticsearch.index.query because we need package visibility for {@link QueryParseContext#indexQueryParser} + */ +public class UnsafeSaferQueryParseContext extends QueryParseContext { + private QueryParseContext parseContext = null; + private final SaferMapperQueryParser queryParser; + + public UnsafeSaferQueryParseContext() { + // XXX: Don't store anything in the base class + super(null, null); + this.queryParser = new SaferMapperQueryParser(this); + } + + /** + * XXX: don't forget to {@link #cleanUp()} + * @param parseContext + */ + public void setDelegate(QueryParseContext parseContext) { + this.parseContext = parseContext; + } + + public QueryParseContext getDelegate() { + return parseContext; + } + + /** + * Release the inner parseContext instance + */ + public void cleanUp() { + this.parseContext = null; + } + + /** + * XXX: we send our SaferMapperQueryParser + */ + public MapperQueryParser queryParser(QueryParserSettings settings) { + queryParser.reset(settings); + return queryParser; + } + + /** + * XXX: not sure + */ + @Override + public void reset(XContentParser jp) { + parseContext.reset(jp); + } + + /** + * Copy/Paste from {@link QueryParseContext#parseInnerQuery()}. + * + * This method will fail if NoCacheQuery is returned of if CustomQueryWrappingFilter.shouldUseCustomQueryWrappingFilter + * returns true. This is because we cannot maintain properly the state of the QueryParseContext (propagateNoCache and + * requireCustomQueryWrappingFilter booleans) + * + * @param parseContext + * @return the query + * @throws IOException + * @throws QueryParsingException + */ + @Override + public Query parseInnerQuery() throws IOException, QueryParsingException { + XContentParser parser = parseContext.parser(); + Index index = parseContext.index(); + + // move to START object + XContentParser.Token token; + if (parser.currentToken() != XContentParser.Token.START_OBJECT) { + token = parser.nextToken(); + if (token != XContentParser.Token.START_OBJECT) { + throw new QueryParsingException(index, "[_na] query malformed, must start with start_object"); + } + } + token = parser.nextToken(); + if (token != XContentParser.Token.FIELD_NAME) { + throw new QueryParsingException(index, "[_na] query malformed, no field after start_object"); + } + String queryName = parser.currentName(); + // move to the next START_OBJECT + token = parser.nextToken(); + if (token != XContentParser.Token.START_OBJECT && token != XContentParser.Token.START_ARRAY) { + throw new QueryParsingException(index, "[_na] query malformed, no field after start_object"); + } + + QueryParser queryParser = parseContext.indexQueryParser.queryParser(queryName); + if (queryParser == null) { + throw new QueryParsingException(index, "No query registered for [" + queryName + "]"); + } + // XXX: hack is here, if we detect a query_string we wrap a new QueryParseContext that + // will send our own MapperQueryParser. + final Query result; + if(queryParser instanceof QueryStringQueryParser) { + // This could potentially propagate our UnsafeSaferQueryParseContext but hopefully + // query_string doesn't have inner ES dsl queries. + result = queryParser.parse(this); + } else { + result = queryParser.parse(parseContext); + } + if (parser.currentToken() == XContentParser.Token.END_OBJECT || parser.currentToken() == XContentParser.Token.END_ARRAY) { + // if we are at END_OBJECT, move to the next one... + parser.nextToken(); + } + + // XXX: We fail here because we can't maintain the state of QueryParseContext properly + if (result instanceof NoCacheQuery) { + throw new QueryParsingException(index, "Safeifier does not support NoCacheQuery"); + } + if (CustomQueryWrappingFilter.shouldUseCustomQueryWrappingFilter(result)) { + throw new QueryParsingException(index, "Safeifier does not support NoCacheQuery"); + } + return result; + } + + @Override + public void parseFlags(EnumSet<Flag> parseFlags) { + parseContext.parseFlags(parseFlags); + } + @Override + public EnumSet<Flag> parseFlags() { + return parseContext.parseFlags(); + } + @Override + public Index index() { + return parseContext.index(); + } + @Override + public void parser(XContentParser parser) { + parseContext.parser(parser); + } + @Override + public XContentParser parser() { + return parseContext.parser(); + } + @Override + public AnalysisService analysisService() { + return parseContext.analysisService(); + } + @Override + public CacheRecycler cacheRecycler() { + return parseContext.cacheRecycler(); + } + @Override + public ScriptService scriptService() { + return parseContext.scriptService(); + } + @Override + public MapperService mapperService() { + return parseContext.mapperService(); + } + @Override + public IndexEngine indexEngine() { + return parseContext.indexEngine(); + } + @Override + public SimilarityService similarityService() { + return parseContext.similarityService(); + } + @Override + public Similarity searchSimilarity() { + return parseContext.searchSimilarity(); + } + @Override + public QueryParserCache queryParserCache() { + return parseContext.queryParserCache(); + } + @Override + public String defaultField() { + return parseContext.defaultField(); + } + @Override + public boolean queryStringLenient() { + return parseContext.queryStringLenient(); + } + @Override + public Filter cacheFilter(Filter filter, Key cacheKey) { + return parseContext.cacheFilter(filter, cacheKey); + } + @Override + public <IFD extends IndexFieldData<?>> IFD getForField( + FieldMapper<?> mapper) { + return parseContext.getForField(mapper); + } + @Override + public void addNamedFilter(String name, Filter filter) { + parseContext.addNamedFilter(name, filter); + } + @Override + public void addNamedQuery(String name, Query query) { + parseContext.addNamedQuery(name, query); + } + @Override + public ImmutableMap<String, Filter> copyNamedFilters() { + return parseContext.copyNamedFilters(); + } + @Override + public Filter parseInnerFilter() throws IOException, + QueryParsingException { + return parseContext.parseInnerFilter(); + } + @Override + public Filter parseInnerFilter(String filterName) throws IOException, + QueryParsingException { + return parseContext.parseInnerFilter(filterName); + } + @Override + public FieldMapper fieldMapper(String name) { + return parseContext.fieldMapper(name); + } + @Override + public String indexName(String name) { + return parseContext.indexName(name); + } + @Override + public Set<String> simpleMatchToIndexNames(String pattern) { + return parseContext.simpleMatchToIndexNames(pattern); + } + @Override + public SmartNameFieldMappers smartFieldMappers(String name) { + return parseContext.smartFieldMappers(name); + } + @Override + public FieldMapper smartNameFieldMapper(String name) { + return parseContext.smartNameFieldMapper(name); + } + @Override + public SmartNameObjectMapper smartObjectMapper(String name) { + return parseContext.smartObjectMapper(name); + } + @Override + public Collection<String> queryTypes() { + return parseContext.queryTypes(); + } + @Override + public SearchLookup lookup() { + return parseContext.lookup(); + } + @Override + public long nowInMillis() { + return parseContext.nowInMillis(); + } + @Override + public boolean requireCustomQueryWrappingFilter() { + return parseContext.requireCustomQueryWrappingFilter(); + } +} \ No newline at end of file diff --git a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java index abaa8dd..937525d 100644 --- a/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java +++ b/src/main/java/org/wikimedia/search/extra/ExtraPlugin.java @@ -19,6 +19,7 @@ import org.wikimedia.search.extra.safer.SaferQueryParser; import org.wikimedia.search.extra.safer.phrase.PhraseTooLargeActionModuleParser; import org.wikimedia.search.extra.safer.simple.SimpleActionModuleParser; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsActionModuleParser; import org.wikimedia.search.extra.superdetectnoop.ChangeHandler; import org.wikimedia.search.extra.superdetectnoop.SetHandler; import org.wikimedia.search.extra.superdetectnoop.SuperDetectNoopScript; @@ -78,6 +79,7 @@ Multibinder<ActionModuleParser> moduleParsers = Multibinder.newSetBinder(binder(), ActionModuleParser.class); moduleParsers.addBinding().to(PhraseTooLargeActionModuleParser.class).asEagerSingleton(); moduleParsers.addBinding().to(SimpleActionModuleParser.class).asEagerSingleton(); + moduleParsers.addBinding().to(CheckWildcardsActionModuleParser.class).asEagerSingleton(); } } diff --git a/src/main/java/org/wikimedia/search/extra/safer/Safeifier.java b/src/main/java/org/wikimedia/search/extra/safer/Safeifier.java index 0261843..e0b2465 100644 --- a/src/main/java/org/wikimedia/search/extra/safer/Safeifier.java +++ b/src/main/java/org/wikimedia/search/extra/safer/Safeifier.java @@ -59,4 +59,13 @@ } return action.apply(this, q); } + + /** + * Inspect registry + * @param action + * @return true if the safeifier handles this type of query + */ + public boolean hasActionFor(Class<? extends Query> queryType) { + return registry.containsKey(queryType); + } } diff --git a/src/main/java/org/wikimedia/search/extra/safer/SaferQueryBuilder.java b/src/main/java/org/wikimedia/search/extra/safer/SaferQueryBuilder.java index 681a54b..01c725c 100644 --- a/src/main/java/org/wikimedia/search/extra/safer/SaferQueryBuilder.java +++ b/src/main/java/org/wikimedia/search/extra/safer/SaferQueryBuilder.java @@ -8,6 +8,7 @@ import org.elasticsearch.index.query.QueryBuilder; import org.wikimedia.search.extra.safer.phrase.PhraseTooLargeAction; import org.wikimedia.search.extra.safer.simple.SimpleActionModule.Option; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsAction; public class SaferQueryBuilder extends BaseQueryBuilder { private final QueryBuilder delegate; @@ -16,6 +17,8 @@ private PhraseTooLargeAction phraseTooLargeAction; private Option termRangeQuery; private Option numericRangeQuery; + private Integer maxWildcards; + private CheckWildcardsAction checkWildcardsAction; public SaferQueryBuilder(QueryBuilder delegate) { this.delegate = delegate; @@ -46,6 +49,16 @@ return this; } + public SaferQueryBuilder maxWildcards(int wildcards) { + maxWildcards = wildcards; + return this; + } + + public SaferQueryBuilder checkWildCardAction(CheckWildcardsAction checkWildcardsAction) { + this.checkWildcardsAction = checkWildcardsAction; + return this; + } + @Override protected void doXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject("safer"); @@ -69,6 +82,14 @@ builder.field("numeric_range", numericRangeQuery.toString().toLowerCase(Locale.ROOT)); } builder.endObject(); + builder.startObject("wildcards"); + if(maxWildcards != null) { + builder.field("max_wildcards", maxWildcards); + } + if(checkWildcardsAction != null) { + builder.field("check_wildcards_action", checkWildcardsAction.toString().toLowerCase(Locale.ROOT)); + } + builder.endObject(); builder.endObject(); } } diff --git a/src/main/java/org/wikimedia/search/extra/safer/SaferQueryParser.java b/src/main/java/org/wikimedia/search/extra/safer/SaferQueryParser.java index 19b3cfe..233db2d 100644 --- a/src/main/java/org/wikimedia/search/extra/safer/SaferQueryParser.java +++ b/src/main/java/org/wikimedia/search/extra/safer/SaferQueryParser.java @@ -6,7 +6,9 @@ import java.util.Map; import java.util.Set; +import org.apache.lucene.queryparser.classic.MapperQueryParser; import org.apache.lucene.search.Query; +import org.elasticsearch.common.Nullable; import org.elasticsearch.common.collect.ImmutableMap; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.xcontent.XContentParser; @@ -14,10 +16,26 @@ import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryParser; import org.elasticsearch.index.query.QueryParsingException; +import org.elasticsearch.index.query.UnsafeSaferQueryParseContext; import org.wikimedia.search.extra.safer.Safeifier.ActionModule; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsActionModule; +import org.wikimedia.search.extra.safer.wildcard.DelayedWildcardQuery; public class SaferQueryParser implements QueryParser { private final Map<String, ActionModuleParser<?>> moduleParsers; + + /** + * Hack for {@link CheckWildcardsActionModule} + * + * We need to cache the {@link UnsafeSaferQueryParseContext} instances mainly because they contain an + * instance of a {@link MapperQueryParser} that may be slow to build. + */ + private ThreadLocal<UnsafeSaferQueryParseContext> cache = new ThreadLocal<UnsafeSaferQueryParseContext>() { + @Override + protected UnsafeSaferQueryParseContext initialValue() { + return new UnsafeSaferQueryParseContext(); + } + }; @Inject public SaferQueryParser(@SuppressWarnings("rawtypes") Set<ActionModuleParser> parsers) { @@ -37,45 +55,71 @@ public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException { boolean errorOnUnknownQueryType = true; List<ActionModule> actionModules = new ArrayList<ActionModule>(); + UnsafeSaferQueryParseContext unsafeContext = null; Query delegate = null; XContentParser parser = parseContext.parser(); String currentFieldName = null; XContentParser.Token token; - while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { - if (token == XContentParser.Token.FIELD_NAME) { - currentFieldName = parser.currentName(); - } else if (token == Token.START_OBJECT) { - switch (currentFieldName) { - case "query": - if (delegate != null) { - throw new QueryParsingException(parseContext.index(), "[safer] Can only wrap a single query ([" + currentFieldName - + "]) is the second query."); + try { + unsafeContext = getUnsafeContext(parseContext); + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token == Token.START_OBJECT) { + switch (currentFieldName) { + case "query": + if (delegate != null) { + throw new QueryParsingException(parseContext.index(), "[safer] Can only wrap a single query ([" + currentFieldName + + "]) is the second query."); + } + // It'd be nice to know if the CheckWildcards module is needed before wrapping... but how? + // 2 pass parsing? + delegate = unsafeContext.parseInnerQuery(); + break; + default: + ActionModuleParser<?> moduleParser = moduleParsers.get(currentFieldName); + if (moduleParser == null) { + throw new QueryParsingException(parseContext.index(), "[safer] query does not support the object [" + currentFieldName + "]"); + } + actionModules.add(moduleParser.parse(parseContext)); } - delegate = parseContext.parseInnerQuery(); - break; - default: - ActionModuleParser<?> moduleParser = moduleParsers.get(currentFieldName); - if (moduleParser == null) { - throw new QueryParsingException(parseContext.index(), "[safer] query does not support the object [" + currentFieldName + "]"); + } else if (token.isValue()) { + switch (currentFieldName) { + case "error_on_unknown": + case "errorOnUnknown": + errorOnUnknownQueryType = parser.booleanValue(); + break; + default: + throw new QueryParsingException(parseContext.index(), "[safer] query does not support the field [" + currentFieldName + "]"); } - actionModules.add(moduleParser.parse(parseContext)); - } - } else if (token.isValue()) { - switch (currentFieldName) { - case "error_on_unknown": - case "errorOnUnknown": - errorOnUnknownQueryType = parser.booleanValue(); - break; - default: - throw new QueryParsingException(parseContext.index(), "[safer] query does not support the field [" + currentFieldName + "]"); } } - } - if (delegate == null) { - throw new QueryParsingException(parseContext.index(), "[safer] requires a query"); + if (delegate == null) { + throw new QueryParsingException(parseContext.index(), "[safer] requires a query"); + } + + Safeifier safeifier = new Safeifier(errorOnUnknownQueryType, actionModules); + + // Hack for CheckWildcards if the module is not requested: we must remove all DelayedWildcardQuery. + if(!safeifier.hasActionFor(DelayedWildcardQuery.class)) { + safeifier.register(DelayedWildcardQuery.class, CheckWildcardsActionModule.PARSE_DELAYED_WILDCARD); + } + + return safeifier.safeify(delegate); + } finally { + if(unsafeContext != null) { + // Prevent memleak + unsafeContext.cleanUp(); + } } - return new Safeifier(errorOnUnknownQueryType, actionModules).safeify(delegate); + } + + @Nullable + private UnsafeSaferQueryParseContext getUnsafeContext(QueryParseContext parseContext) throws IOException, QueryParsingException { + UnsafeSaferQueryParseContext unsafe = cache.get(); + unsafe.setDelegate(parseContext); + return unsafe; } } \ No newline at end of file diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsAction.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsAction.java new file mode 100644 index 0000000..ff21f26 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsAction.java @@ -0,0 +1,97 @@ +package org.wikimedia.search.extra.safer.wildcard; + +import java.util.Locale; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.ElasticsearchParseException; + +public enum CheckWildcardsAction { + /** + * Cut the string + */ + BREAK { + @Override + public Query perform(DelayedWildcardQuery query, int maxWildcards) { + int index = getNthWildcardIndex(query.getTermText(), maxWildcards+1); + String term = query.getTermText(); + if(index > 0) { + term = query.getTermText().substring(0, index); + } else if(index == 0) { + throw new ElasticsearchParseException("Wildcard query cannot be compiled (max_wildcards = 0 ??)"); + } + try { + return query.compileWildcard(term); + } catch (ParseException e) { + throw new ElasticsearchParseException(e.getMessage(), e); + } + } + }, + /** + * throw an error + */ + ERROR { + @Override + public Query perform(DelayedWildcardQuery query, int maxWildcards) { + int index = getNthWildcardIndex(query.getTermText(), maxWildcards+1); + if(index >= 0) { + throw new TooManyWildcardsException("WildCard query is too complex : " + query.getTermText()); + } + try { + return query.compileWildcard(); + } catch (ParseException e) { + throw new ElasticsearchParseException(e.getMessage(), e); + } + } + }; + + public abstract Query perform(DelayedWildcardQuery query, int maxWildcards); + + public static CheckWildcardsAction parse(String text) { + if("break".equals(text)) { + return BREAK; + } + if("error".equals(text)) { + return ERROR; + } + throw new ElasticsearchIllegalArgumentException(String.format(Locale.ROOT, "Invalid check_wildcards_action mode: %s", text)); + } + + /** + * Find the nth wildcard (* or ?) in the text + * + * @param text + * @param nth + * @return return the index of the nth wildcards in the text or -1 if not found + */ + public static int getNthWildcardIndex(String text, int nth) { + if(nth < 1) { + throw new IllegalArgumentException("nth must be > 0"); + } + // Code borrowed from WildcardQuery + int wildcards = 0; + for (int i = 0; i < text.length();) { + final int c = text.codePointAt(i); + int length = Character.charCount(c); + switch (c) { + case WildcardQuery.WILDCARD_STRING: + case WildcardQuery.WILDCARD_CHAR: + wildcards++; + break; + case WildcardQuery.WILDCARD_ESCAPE: + if (i + length < text.length()) { + final int nextChar = text.codePointAt(i + length); + length += Character.charCount(nextChar); + break; + } + } + if(wildcards == nth) { + return i; + } + i += length; + } + return -1; + } +} diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModule.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModule.java new file mode 100644 index 0000000..179682c --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModule.java @@ -0,0 +1,51 @@ +package org.wikimedia.search.extra.safer.wildcard; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.Query; +import org.wikimedia.search.extra.safer.Safeifier; +import org.wikimedia.search.extra.safer.Safeifier.Action; +import org.wikimedia.search.extra.safer.Safeifier.ActionModule; + +/** + * Register the CheckWildcardsAction + * @author nomoa + * + */ +public class CheckWildcardsActionModule implements ActionModule { + /** + * Maximum number of wildcards allowed in a wildcard query. + */ + public final static int DEFAULT_MAX_WILDCARDS = 5; + + public final static Action<DelayedWildcardQuery, Query> PARSE_DELAYED_WILDCARD = new Action<DelayedWildcardQuery, Query>() { + public Query apply(Safeifier safeifier, DelayedWildcardQuery q) { + try { + return q.compileWildcard(); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + }; + + private int maxWildcards = DEFAULT_MAX_WILDCARDS; + private CheckWildcardsAction checkWildcardsAction = CheckWildcardsAction.ERROR; + + @Override + public void register(Safeifier safeifier) { + safeifier.register(DelayedWildcardQuery.class, new Action<DelayedWildcardQuery, Query>() { + @Override + public Query apply(Safeifier safeifier, DelayedWildcardQuery query) { + return checkWildcardsAction.perform(query, maxWildcards); + } + }); + } + + public CheckWildcardsActionModule maxWildcards(int maxWildcards) { + this.maxWildcards = maxWildcards; + return this; + } + + public void checkWildcardsAction(CheckWildcardsAction checkWildcardsAction) { + this.checkWildcardsAction = checkWildcardsAction; + } +} diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModuleParser.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModuleParser.java new file mode 100644 index 0000000..1fa4150 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/CheckWildcardsActionModuleParser.java @@ -0,0 +1,46 @@ +package org.wikimedia.search.extra.safer.wildcard; + +import java.io.IOException; + +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.index.query.QueryParsingException; +import org.wikimedia.search.extra.safer.ActionModuleParser; + +public class CheckWildcardsActionModuleParser implements ActionModuleParser<CheckWildcardsActionModule> { + + @Override + public String moduleName() { + return "wildcards"; + } + + @Override + public CheckWildcardsActionModule parse(QueryParseContext parseContext) throws IOException { + CheckWildcardsActionModule module = new CheckWildcardsActionModule(); + XContentParser parser = parseContext.parser(); + String currentFieldName = null; + XContentParser.Token token; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token.isValue()) { + switch (currentFieldName) { + case "max_wildcards": + case "maxWildcards": + module.maxWildcards(parser.intValue()); + break; + case "check_wildcards_action": + case "checkWildcardsAction": + module.checkWildcardsAction(CheckWildcardsAction.parse(parser.text())); + break; + default: + throw new QueryParsingException(parseContext.index(), "[safer][wildcards] query does not support the field [" + + currentFieldName + "]"); + } + } else { + throw new QueryParsingException(parseContext.index(), "[safer][wildcards] only supports values, not objects."); + } + } + return module; + } +} diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/DelayedWildcardQuery.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/DelayedWildcardQuery.java new file mode 100644 index 0000000..d29720f --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/DelayedWildcardQuery.java @@ -0,0 +1,50 @@ +package org.wikimedia.search.extra.safer.wildcard; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; + +/** + * Just hold the query parser waiting for Safeifier to check the wildcard + */ +public class DelayedWildcardQuery extends Query { + private final Term term; + private RewriteMethod rewriteMethod; + + public DelayedWildcardQuery(Term term, RewriteMethod rewriteMethod) { + this.term = term; + this.rewriteMethod = rewriteMethod; + } + + @Override + public String toString(String field) { + return this.getClass().getSimpleName(); + } + + public Query compileWildcard() throws ParseException { + WildcardQuery wq = new WildcardQuery(term); + wq.setRewriteMethod(rewriteMethod); + return wq; + } + + public Query compileWildcard(String newTerm) throws ParseException { + WildcardQuery wq = new WildcardQuery(new Term(term.field(), newTerm)); + wq.setRewriteMethod(rewriteMethod); + return wq; + } + + public String getTermText() { + return term.text(); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + // Oops + throw new RuntimeException("This query must not be here..."); + } +} diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/SaferMapperQueryParser.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/SaferMapperQueryParser.java new file mode 100644 index 0000000..9d869e6 --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/SaferMapperQueryParser.java @@ -0,0 +1,28 @@ +package org.wikimedia.search.extra.safer.wildcard; + +import org.apache.lucene.queryparser.classic.MapperQueryParser; +import org.apache.lucene.queryparser.classic.QueryParserSettings; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; +import org.apache.lucene.search.Query; +import org.elasticsearch.index.query.QueryParseContext; + +/** + * Wrapper around MapperQueryParser. + * Hack for {@link CheckWildcardsActionModule} that need to prevent those queries from being compiled + */ +public class SaferMapperQueryParser extends MapperQueryParser { + private RewriteMethod rewriteMethod = null; + public SaferMapperQueryParser(QueryParseContext parseContext) { + super(parseContext); + } + + public void reset(QueryParserSettings settings) { + super.reset(settings); + this.rewriteMethod = settings.rewriteMethod(); + } + + @Override + protected Query newWildcardQuery(org.apache.lucene.index.Term t) { + return new DelayedWildcardQuery(t, rewriteMethod); + } +} diff --git a/src/main/java/org/wikimedia/search/extra/safer/wildcard/TooManyWildcardsException.java b/src/main/java/org/wikimedia/search/extra/safer/wildcard/TooManyWildcardsException.java new file mode 100644 index 0000000..d44916e --- /dev/null +++ b/src/main/java/org/wikimedia/search/extra/safer/wildcard/TooManyWildcardsException.java @@ -0,0 +1,12 @@ +package org.wikimedia.search.extra.safer.wildcard; + +/** + * Too many wildcards detected + */ +public class TooManyWildcardsException extends RuntimeException { + private static final long serialVersionUID = 5575496998211483109L; + + public TooManyWildcardsException(String message) { + super(message); + } +} diff --git a/src/test/java/org/wikimedia/search/extra/safer/CheckWildcardsTest.java b/src/test/java/org/wikimedia/search/extra/safer/CheckWildcardsTest.java new file mode 100644 index 0000000..a1ef39a --- /dev/null +++ b/src/test/java/org/wikimedia/search/extra/safer/CheckWildcardsTest.java @@ -0,0 +1,37 @@ +package org.wikimedia.search.extra.safer; + +import org.junit.Assert; + +import org.junit.Test; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsAction; + +public class CheckWildcardsTest { + @Test + public void test() { + assertSubstring("A*B", "A*B*", 2); + assertSubstring("A*B*", "A*B*", 3); + assertSubstring("A*B\\*", "A*B\\*", 2); + assertSubstring("A*B\\*C", "A*B\\*C*D", 2); + assertSubstring("A*B", "A*B?", 2); + assertSubstring("A*B\\?C", "A*B\\?C*D", 2); + assertSubstring("A*B\\?C", "A*B\\?C?D", 2); + assertSubstring("", "*", 1); + assertSubstring("\ud802\udd10*\ud802\udd10", "\ud802\udd10*\ud802\udd10*", 2); + assertSubstring("\ud802\udd10\\*\ud802\udd10*\ud802\udd10*", "\ud802\udd10\\*\ud802\udd10*\ud802\udd10*", 3); + assertSubstring("*\ud802\udd10\\*\ud802\udd10", "*\ud802\udd10\\*\ud802\udd10*\ud802\udd10*", 2); + } + + @Test(expected=IllegalArgumentException.class) + public void testEdge() { + assertSubstring(null, "", 0); + } + + public void assertSubstring(String expected, String text, int nth) { + int r = CheckWildcardsAction.getNthWildcardIndex(text, nth); + String substring = text; + if(r >= 0) { + substring = text.substring(0, r); + } + Assert.assertEquals(expected, substring); + } +} diff --git a/src/test/java/org/wikimedia/search/extra/safer/SaferQueryStringTest.java b/src/test/java/org/wikimedia/search/extra/safer/SaferQueryStringTest.java index 20f837e..187094d 100644 --- a/src/test/java/org/wikimedia/search/extra/safer/SaferQueryStringTest.java +++ b/src/test/java/org/wikimedia/search/extra/safer/SaferQueryStringTest.java @@ -23,6 +23,7 @@ import org.wikimedia.search.extra.AbstractPluginIntegrationTest; import org.wikimedia.search.extra.safer.phrase.PhraseTooLargeAction; import org.wikimedia.search.extra.safer.simple.SimpleActionModule.Option; +import org.wikimedia.search.extra.safer.wildcard.CheckWildcardsAction; /** * Tests the safer query wrapping query_string queries similar to how <a >CirrusSearch</a> works. @@ -134,6 +135,37 @@ assertSearchByString("findme:\"CaptureAAAA Test\""); } + @Test + public void wildcards() { + QueryStringQueryBuilder qs = queryString("_all:Capt*u*r*e*AAAA"); + SaferQueryBuilder b = new SaferQueryBuilder(qs); + client().prepareSearch("test").setQuery(b); + SearchRequestBuilder search = client().prepareSearch("test").setQuery(b); + assertSearchHits(search.get(), "delimited1"); + + qs = queryString("_all:Capt*u*r*e*AAAAITSME"); + b = new SaferQueryBuilder(qs); + b.maxWildcards(3); + b.checkWildCardAction(CheckWildcardsAction.BREAK); + search = client().prepareSearch("test").setQuery(b) + .addHighlightedField("_all").setQuery(b) + .addHighlightedField("findme").setQuery(b); + assertSearchHits(search.get()); + + qs = queryString("_all:Cap*tu*AAA*THIS_SHOULD_BE_IGNORED"); + b = new SaferQueryBuilder(qs); + b.maxWildcards(2); + b.checkWildCardAction(CheckWildcardsAction.BREAK); + search = client().prepareSearch("test").setQuery(b); + assertSearchHits(search.get(), "delimited1", "delimited2"); + + qs = queryString("_all:0*1*2*3*4*5*6*7*8*9*1*1*3*4*5*2*3*4"); + b = new SaferQueryBuilder(qs); + b.maxWildcards(2); + search = client().prepareSearch("test").setQuery(b); + assertFailures(search, RestStatus.BAD_REQUEST, containsString("WildCard query is too complex")); + } + private void assertSearchByFieldsFails(boolean total, String query, String... fields) { QueryStringQueryBuilder qs = queryString(query); SaferQueryBuilder b = new SaferQueryBuilder(qs).maxTermsPerPhraseQuery(5).maxTermsInAllPhraseQueries(8); -- To view, visit https://gerrit.wikimedia.org/r/219182 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ifcc9e55a4d81d48c05a76ca1afdd9b3adbf86006 Gerrit-PatchSet: 1 Gerrit-Project: search/extra Gerrit-Branch: 1.3 Gerrit-Owner: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits