DCausse has submitted this change and it was merged.
Change subject: Upgrade to 1.7.0
......................................................................
Upgrade to 1.7.0
Elasticsearch 1.7.0's testing framework hates non-Serializable Exceptions
so we reword an offending exception to no longer contain non-Serializable
stuff.
Change-Id: Ia1b4da9840b0598b402ee8efa086659e5b18eab9
---
M experimental-highlighter-core/pom.xml
M experimental-highlighter-elasticsearch-plugin/pom.xml
M
experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
A
experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
M experimental-highlighter-lucene/pom.xml
M pom.xml
7 files changed, 76 insertions(+), 52 deletions(-)
Approvals:
DCausse: Looks good to me, approved
jenkins-bot: Verified
diff --git a/experimental-highlighter-core/pom.xml
b/experimental-highlighter-core/pom.xml
index f562892..13c682f 100644
--- a/experimental-highlighter-core/pom.xml
+++ b/experimental-highlighter-core/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-core</artifactId>
<packaging>jar</packaging>
diff --git a/experimental-highlighter-elasticsearch-plugin/pom.xml
b/experimental-highlighter-elasticsearch-plugin/pom.xml
index 23041c7..47fa295 100644
--- a/experimental-highlighter-elasticsearch-plugin/pom.xml
+++ b/experimental-highlighter-elasticsearch-plugin/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-elasticsearch-plugin</artifactId>
<packaging>jar</packaging>
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
index c842fd2..058eb35 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
@@ -12,8 +12,8 @@
import java.util.regex.Pattern;
import org.apache.lucene.search.Query;
-import org.apache.lucene.util.automaton.XAutomaton;
import org.apache.lucene.util.automaton.XRegExp;
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
import org.elasticsearch.common.base.Function;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.logging.ESLogger;
@@ -39,10 +39,10 @@
import
org.wikimedia.highlighter.experimental.lucene.hit.weight.BasicQueryWeigher;
import org.wikimedia.search.highlighter.experimental.HitEnum;
import org.wikimedia.search.highlighter.experimental.Snippet;
+import org.wikimedia.search.highlighter.experimental.Snippet.HitBuilder;
import org.wikimedia.search.highlighter.experimental.SnippetChooser;
import org.wikimedia.search.highlighter.experimental.SnippetFormatter;
import org.wikimedia.search.highlighter.experimental.SnippetWeigher;
-import org.wikimedia.search.highlighter.experimental.Snippet.HitBuilder;
import org.wikimedia.search.highlighter.experimental.hit.ConcatHitEnum;
import org.wikimedia.search.highlighter.experimental.hit.EmptyHitEnum;
import org.wikimedia.search.highlighter.experimental.hit.MergingHitEnum;
@@ -89,8 +89,7 @@
}
} catch (Exception e) {
log.error("Failed to highlight field [{}]", e, context.fieldName);
- throw new FetchPhaseExecutionException(context.context, "Failed to
highlight field ["
- + context.fieldName + "]", e);
+ throw new FetchPhaseExecutionException(context.context, "Failed to
highlight field [" + context.fieldName + "]", e);
}
}
@@ -173,7 +172,8 @@
return null;
}
- // TODO it might be possible to not build the weigher at all if
just using regex highlighting
+ // TODO it might be possible to not build the weigher at all if
just
+ // using regex highlighting
ensureWeigher();
scoreMatters = context.field.fieldOptions().scoreOrdered();
if (!scoreMatters) {
@@ -186,8 +186,7 @@
numberOfSnippets = 1;
}
segmenter = new DelayedSegmenter(defaultField);
- List<Snippet> snippets = buildChooser().choose(segmenter,
buildHitEnum(),
- numberOfSnippets);
+ List<Snippet> snippets = buildChooser().choose(segmenter,
buildHitEnum(), numberOfSnippets);
if (snippets.size() != 0) {
cache.lastMatched = true;
return new HighlightField(context.fieldName,
formatSnippets(snippets));
@@ -201,19 +200,19 @@
if (fieldValues.isEmpty()) {
return null;
}
- Text fragment = new StringText(getSegmenterFactory()
- .extractNoMatchFragment(fieldValues.get(0), noMatchSize));
- return new HighlightField(context.fieldName, new Text[]
{fragment});
+ Text fragment = new
StringText(getSegmenterFactory().extractNoMatchFragment(fieldValues.get(0),
noMatchSize));
+ return new HighlightField(context.fieldName, new Text[] { fragment
});
}
private boolean shouldSkip() {
- // Maintain lastMatched - it should be false if we shift to a new
doc.
+ // Maintain lastMatched - it should be false if we shift to a new
+ // doc.
if (cache.lastDocId != context.hitContext.docId()) {
cache.lastMatched = false;
cache.lastDocId = context.hitContext.docId();
}
- Boolean skipIfLastMatched =
(Boolean)getOption("skip_if_last_matched");
+ Boolean skipIfLastMatched = (Boolean)
getOption("skip_if_last_matched");
return skipIfLastMatched == null ? false : skipIfLastMatched &&
cache.lastMatched;
}
@@ -274,19 +273,18 @@
boolean removeHighFrequencyTermsFromCommonTerms =
getOption("remove_high_freq_terms_from_common_terms", true);
int maxExpandedTerms = getOption("max_expanded_terms", 1024);
// TODO simplify
- QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms,
removeHighFrequencyTermsFromCommonTerms);
+ QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms,
+ removeHighFrequencyTermsFromCommonTerms);
weigher = cache.queryWeighers.get(key);
if (weigher != null) {
return;
}
// TODO recycle. But addReleasble doesn't seem to close it properly
// later. I believe this is fixed in later Elasticsearch versions.
- BytesRefHashTermInfos infos = new BytesRefHashTermInfos(
- BigArrays.NON_RECYCLING_INSTANCE);
+ BytesRefHashTermInfos infos = new
BytesRefHashTermInfos(BigArrays.NON_RECYCLING_INSTANCE);
// context.context.addReleasable(infos);
- weigher = new BasicQueryWeigher(
- new ElasticsearchQueryFlattener(maxExpandedTerms,
phraseAsTerms, removeHighFrequencyTermsFromCommonTerms), infos,
- context.hitContext.topLevelReader(),
context.query.originalQuery());
+ weigher = new BasicQueryWeigher(new
ElasticsearchQueryFlattener(maxExpandedTerms, phraseAsTerms,
+ removeHighFrequencyTermsFromCommonTerms), infos,
context.hitContext.topLevelReader(), context.query.originalQuery());
// Build the QueryWeigher with the top level reader to get all
// the frequency information
cache.queryWeighers.put(key, weigher);
@@ -302,7 +300,7 @@
// analyzers that make overlaps.
e = new OverlapMergingHitEnumWrapper(e);
- if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
+ if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
e = new GraphvizHitEnum(e);
}
return e;
@@ -353,9 +351,7 @@
}
AutomatonHitEnum.Factory factory =
cache.automatonHitEnumFactories.get(regex);
if (factory == null) {
- XRegExp regexp = new XRegExp(regex);
- XAutomaton automaton =
regexp.toAutomaton(getMaxDeterminizedStates());
- factory = AutomatonHitEnum.factory(automaton);
+ factory = buildFactoryForRegex(new XRegExp(regex));
cache.automatonHitEnumFactories.put(regex, factory);
}
hitEnums.add(buildLuceneRegexHitEnumForRegex(factory,
fieldValues, caseInsensitive));
@@ -368,6 +364,23 @@
}
}
return hitEnums;
+ }
+
+ private AutomatonHitEnum.Factory buildFactoryForRegex(XRegExp regex) {
+ try {
+ return
AutomatonHitEnum.factory(regex.toAutomaton(getMaxDeterminizedStates()));
+ } catch (XTooComplexToDeterminizeException e) {
+ /*
+ * Elasticsearch forces us to wrap the exception in a fully
+ * Serializable exception and throw out the stack trace so we
+             * give our future selves the opportunity to log it when we need
+ * it.
+ */
+ if (log.isDebugEnabled()) {
+ log.debug("Regex too complex", e);
+ }
+ throw new RegexTooComplexException(e);
+ }
}
private int getMaxDeterminizedStates() {
@@ -395,11 +408,11 @@
if (regexes instanceof String) {
return Collections.singletonList((String) regexes);
}
- return (List<String>)regexes;
+ return (List<String>) regexes;
}
- private HitEnum buildLuceneRegexHitEnumForRegex(final
AutomatonHitEnum.Factory factory,
- List<String> fieldValues, final boolean caseInsensitive) {
+ private HitEnum buildLuceneRegexHitEnumForRegex(final
AutomatonHitEnum.Factory factory, List<String> fieldValues,
+ final boolean caseInsensitive) {
final int positionGap = defaultField.getPositionGap();
if (fieldValues.size() == 1) {
String fieldValue = fieldValues.get(0);
@@ -408,8 +421,8 @@
}
return factory.build(fieldValue);
} else {
- Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(
- fieldValues.iterator(), new Function<String,
HitEnumAndLength>() {
+ Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(fieldValues.iterator(),
+ new Function<String, HitEnumAndLength>() {
@Override
public HitEnumAndLength apply(String fieldValue) {
if (caseInsensitive) {
@@ -427,12 +440,11 @@
if (fieldValues.size() == 1) {
return new RegexHitEnum(pattern.matcher(fieldValues.get(0)));
} else {
- Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(
- fieldValues.iterator(), new Function<String,
HitEnumAndLength>() {
+ Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(fieldValues.iterator(),
+ new Function<String, HitEnumAndLength>() {
@Override
public HitEnumAndLength apply(String fieldValue) {
- return new HitEnumAndLength(new
RegexHitEnum(pattern
- .matcher(fieldValue)),
fieldValue.length());
+ return new HitEnumAndLength(new
RegexHitEnum(pattern.matcher(fieldValue)), fieldValue.length());
}
});
return new ConcatHitEnum(hitEnumsFromStreams, positionGap, 1);
@@ -486,7 +498,7 @@
private SnippetChooser buildChooser() {
HitBuilder hitBuilder = Snippet.DEFAULT_HIT_BUILDER;
- if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
+ if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
hitBuilder = GraphvizHit.GRAPHVIZ_HIT_BUILDER;
}
if (context.field.fieldOptions().scoreOrdered()) {
@@ -545,12 +557,11 @@
final SnippetFormatter formatter;
if (getOption("return_offsets", false)) {
formatter = new OffsetSnippetFormatter();
- } else if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)){
+ } else if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
formatter = new
GraphvizSnippetFormatter(defaultField.buildSourceExtracter());
} else {
- formatter = new
SnippetFormatter.Default(defaultField.buildSourceExtracter(),
- context.field.fieldOptions().preTags()[0],
context.field.fieldOptions()
- .postTags()[0]);
+ formatter = new
SnippetFormatter.Default(defaultField.buildSourceExtracter(),
context.field.fieldOptions().preTags()[0],
+ context.field.fieldOptions().postTags()[0]);
}
List<FieldWrapper> fetchFields = buildFetchFields();
@@ -570,7 +581,7 @@
for (Snippet snippet : snippets) {
result[i++] = new StringText(formatter.format(snippet));
int index = picker.index(snippet);
- for (FieldWrapper fetchField: fetchFields) {
+ for (FieldWrapper fetchField : fetchFields) {
List<String> values = fetchField.getFieldValues();
if (index >= 0 && index < values.size()) {
result[i++] = new StringText(values.get(index));
@@ -583,7 +594,8 @@
}
/**
- * Return FieldWrappers for all fetch_fields or null if there aren't
any.
+ * Return FieldWrappers for all fetch_fields or null if there aren't
+ * any.
*/
private List<FieldWrapper> buildFetchFields() {
@SuppressWarnings("unchecked")
@@ -606,8 +618,7 @@
}
}
if (!found) {
- FieldWrapper fieldWrapper = new FieldWrapper(this,
context, weigher,
- fetchField);
+ FieldWrapper fieldWrapper = new FieldWrapper(this,
context, weigher, fetchField);
newExtraFields.add(fieldWrapper);
fetchFieldWrappers.add(fieldWrapper);
}
@@ -629,8 +640,7 @@
}
if (options.fragmenter() == null ||
options.fragmenter().equals("scan")) {
// TODO boundaryChars
- return new
CharScanningSegmenterFactory(options.fragmentCharSize(),
- options.boundaryMaxScan());
+ return new
CharScanningSegmenterFactory(options.fragmentCharSize(),
options.boundaryMaxScan());
}
if (options.fragmenter().equals("sentence")) {
return new SentenceIteratorSegmenterFactory(getLocale(),
options.boundaryMaxScan());
@@ -638,8 +648,7 @@
if (options.fragmenter().equals("none")) {
return new WholeSourceSegmenterFactory();
}
- throw new IllegalArgumentException("Unknown fragmenter: '" +
options.fragmenter()
- + "'. Options are 'scan' or 'sentence'.");
+ throw new IllegalArgumentException("Unknown fragmenter: '" +
options.fragmenter() + "'. Options are 'scan' or 'sentence'.");
}
private Locale getLocale() {
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
new file mode 100644
index 0000000..4a61dfa
--- /dev/null
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
@@ -0,0 +1,15 @@
+package org.elasticsearch.search.highlight;
+
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+
+/**
+ * Wraps Lucene's XTooComplexToDeterminizeException to be serializable to be
+ * thrown over the wire.
+ */
+public class RegexTooComplexException extends RuntimeException {
+ private static final long serialVersionUID = -41975279199116247L;
+
+ public RegexTooComplexException(XTooComplexToDeterminizeException e) {
+ super(e.getMessage());
+ }
+}
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
index 0d3c265..b9aee20 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
@@ -212,7 +212,7 @@
options.put("regex_case_sensitive", true);
options.put("regex_flavor", "lucene");
assertFailures(testSearch().setHighlighterOptions(options),
- RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing automaton would result in more than"));
+ RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing [^]]*alt=[^]\\|}]{80,} would result in more
than"));
// Some regexes with explosive state growth still run because they
// don't explode into too many states.
options.put("regex", ".*te*s[tabclse]{1,16}.*");
@@ -222,7 +222,7 @@
options.put("regex", ".*te*s[tabcse]{1,16}.*");
options.put("max_determinized_states", 100);
assertFailures(testSearch().setHighlighterOptions(options),
- RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing automaton would result in more than 100"));
+ RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing .*te*s[tabcse]{1,16}.* would result in more than
100"));
        // It's unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
// I can't find any way from here to mark it otherwise.
}
diff --git a/experimental-highlighter-lucene/pom.xml
b/experimental-highlighter-lucene/pom.xml
index eb36b8a..b2214b9 100644
--- a/experimental-highlighter-lucene/pom.xml
+++ b/experimental-highlighter-lucene/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-lucene</artifactId>
<packaging>jar</packaging>
diff --git a/pom.xml b/pom.xml
index 10f5e7d..1867ef0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,7 +10,7 @@
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
<packaging>pom</packaging>
<modules>
@@ -49,12 +49,12 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <elasticsearch.version>1.6.0</elasticsearch.version>
+ <elasticsearch.version>1.7.0</elasticsearch.version>
<!-- For the Elasticsearch plugin to work this should match the version of
Lucene that Elasticsearch
uses. -->
<lucene.version>4.10.4</lucene.version>
<!-- Note that this has to be compatible with the Elasticsearch version
but won't match it. -->
- <elasticsearch.icu.version>2.5.0</elasticsearch.icu.version>
+ <elasticsearch.icu.version>2.7.0</elasticsearch.icu.version>
</properties>
<build>
--
To view, visit https://gerrit.wikimedia.org/r/225355
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia1b4da9840b0598b402ee8efa086659e5b18eab9
Gerrit-PatchSet: 4
Gerrit-Project: search/highlighter
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits