DCausse has submitted this change and it was merged.
Change subject: Upgrade to 1.7.0
......................................................................
Upgrade to 1.7.0
Elasticsearch 1.7.0's testing framework hates non-Serializable Exceptions
so we reword an offending exception to no longer contain non-Serializable
stuff.
Change-Id: Ia1b4da9840b0598b402ee8efa086659e5b18eab9
---
M experimental-highlighter-core/pom.xml
M experimental-highlighter-elasticsearch-plugin/pom.xml
M
experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
A
experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
M
experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
M experimental-highlighter-lucene/pom.xml
M pom.xml
7 files changed, 76 insertions(+), 52 deletions(-)
Approvals:
DCausse: Looks good to me, approved
jenkins-bot: Verified
diff --git a/experimental-highlighter-core/pom.xml
b/experimental-highlighter-core/pom.xml
index f562892..13c682f 100644
--- a/experimental-highlighter-core/pom.xml
+++ b/experimental-highlighter-core/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-core</artifactId>
<packaging>jar</packaging>
diff --git a/experimental-highlighter-elasticsearch-plugin/pom.xml
b/experimental-highlighter-elasticsearch-plugin/pom.xml
index 23041c7..47fa295 100644
--- a/experimental-highlighter-elasticsearch-plugin/pom.xml
+++ b/experimental-highlighter-elasticsearch-plugin/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-elasticsearch-plugin</artifactId>
<packaging>jar</packaging>
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
index c842fd2..058eb35 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java
@@ -12,8 +12,8 @@
import java.util.regex.Pattern;
import org.apache.lucene.search.Query;
-import org.apache.lucene.util.automaton.XAutomaton;
import org.apache.lucene.util.automaton.XRegExp;
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
import org.elasticsearch.common.base.Function;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.logging.ESLogger;
@@ -39,10 +39,10 @@
import
org.wikimedia.highlighter.experimental.lucene.hit.weight.BasicQueryWeigher;
import org.wikimedia.search.highlighter.experimental.HitEnum;
import org.wikimedia.search.highlighter.experimental.Snippet;
+import org.wikimedia.search.highlighter.experimental.Snippet.HitBuilder;
import org.wikimedia.search.highlighter.experimental.SnippetChooser;
import org.wikimedia.search.highlighter.experimental.SnippetFormatter;
import org.wikimedia.search.highlighter.experimental.SnippetWeigher;
-import org.wikimedia.search.highlighter.experimental.Snippet.HitBuilder;
import org.wikimedia.search.highlighter.experimental.hit.ConcatHitEnum;
import org.wikimedia.search.highlighter.experimental.hit.EmptyHitEnum;
import org.wikimedia.search.highlighter.experimental.hit.MergingHitEnum;
@@ -89,8 +89,7 @@
}
} catch (Exception e) {
log.error("Failed to highlight field [{}]", e, context.fieldName);
- throw new FetchPhaseExecutionException(context.context, "Failed to
highlight field ["
- + context.fieldName + "]", e);
+ throw new FetchPhaseExecutionException(context.context, "Failed to
highlight field [" + context.fieldName + "]", e);
}
}
@@ -173,7 +172,8 @@
return null;
}
- // TODO it might be possible to not build the weigher at all if
just using regex highlighting
+ // TODO it might be possible to not build the weigher at all if
just
+ // using regex highlighting
ensureWeigher();
scoreMatters = context.field.fieldOptions().scoreOrdered();
if (!scoreMatters) {
@@ -186,8 +186,7 @@
numberOfSnippets = 1;
}
segmenter = new DelayedSegmenter(defaultField);
- List<Snippet> snippets = buildChooser().choose(segmenter,
buildHitEnum(),
- numberOfSnippets);
+ List<Snippet> snippets = buildChooser().choose(segmenter,
buildHitEnum(), numberOfSnippets);
if (snippets.size() != 0) {
cache.lastMatched = true;
return new HighlightField(context.fieldName,
formatSnippets(snippets));
@@ -201,19 +200,19 @@
if (fieldValues.isEmpty()) {
return null;
}
- Text fragment = new StringText(getSegmenterFactory()
- .extractNoMatchFragment(fieldValues.get(0), noMatchSize));
- return new HighlightField(context.fieldName, new Text[]
{fragment});
+ Text fragment = new
StringText(getSegmenterFactory().extractNoMatchFragment(fieldValues.get(0),
noMatchSize));
+ return new HighlightField(context.fieldName, new Text[] { fragment
});
}
private boolean shouldSkip() {
- // Maintain lastMatched - it should be false if we shift to a new
doc.
+ // Maintain lastMatched - it should be false if we shift to a new
+ // doc.
if (cache.lastDocId != context.hitContext.docId()) {
cache.lastMatched = false;
cache.lastDocId = context.hitContext.docId();
}
- Boolean skipIfLastMatched =
(Boolean)getOption("skip_if_last_matched");
+ Boolean skipIfLastMatched = (Boolean)
getOption("skip_if_last_matched");
return skipIfLastMatched == null ? false : skipIfLastMatched &&
cache.lastMatched;
}
@@ -274,19 +273,18 @@
boolean removeHighFrequencyTermsFromCommonTerms =
getOption("remove_high_freq_terms_from_common_terms", true);
int maxExpandedTerms = getOption("max_expanded_terms", 1024);
// TODO simplify
- QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms,
removeHighFrequencyTermsFromCommonTerms);
+ QueryCacheKey key = new
QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms,
+ removeHighFrequencyTermsFromCommonTerms);
weigher = cache.queryWeighers.get(key);
if (weigher != null) {
return;
}
// TODO recycle. But addReleasble doesn't seem to close it properly
// later. I believe this is fixed in later Elasticsearch versions.
- BytesRefHashTermInfos infos = new BytesRefHashTermInfos(
- BigArrays.NON_RECYCLING_INSTANCE);
+ BytesRefHashTermInfos infos = new
BytesRefHashTermInfos(BigArrays.NON_RECYCLING_INSTANCE);
// context.context.addReleasable(infos);
- weigher = new BasicQueryWeigher(
- new ElasticsearchQueryFlattener(maxExpandedTerms,
phraseAsTerms, removeHighFrequencyTermsFromCommonTerms), infos,
- context.hitContext.topLevelReader(),
context.query.originalQuery());
+ weigher = new BasicQueryWeigher(new
ElasticsearchQueryFlattener(maxExpandedTerms, phraseAsTerms,
+ removeHighFrequencyTermsFromCommonTerms), infos,
context.hitContext.topLevelReader(), context.query.originalQuery());
// Build the QueryWeigher with the top level reader to get all
// the frequency information
cache.queryWeighers.put(key, weigher);
@@ -302,7 +300,7 @@
// analyzers that make overlaps.
e = new OverlapMergingHitEnumWrapper(e);
- if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
+ if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
e = new GraphvizHitEnum(e);
}
return e;
@@ -353,9 +351,7 @@
}
AutomatonHitEnum.Factory factory =
cache.automatonHitEnumFactories.get(regex);
if (factory == null) {
- XRegExp regexp = new XRegExp(regex);
- XAutomaton automaton =
regexp.toAutomaton(getMaxDeterminizedStates());
- factory = AutomatonHitEnum.factory(automaton);
+ factory = buildFactoryForRegex(new XRegExp(regex));
cache.automatonHitEnumFactories.put(regex, factory);
}
hitEnums.add(buildLuceneRegexHitEnumForRegex(factory,
fieldValues, caseInsensitive));
@@ -368,6 +364,23 @@
}
}
return hitEnums;
+ }
+
+ private AutomatonHitEnum.Factory buildFactoryForRegex(XRegExp regex) {
+ try {
+ return
AutomatonHitEnum.factory(regex.toAutomaton(getMaxDeterminizedStates()));
+ } catch (XTooComplexToDeterminizeException e) {
+ /*
+ * Elasticsearch forces us to wrap the exception in a fully
+ * Serializable exception and throw out the stack trace so we
+             * give our future selves the opportunity to log it when we need
+ * it.
+ */
+ if (log.isDebugEnabled()) {
+ log.debug("Regex too complex", e);
+ }
+ throw new RegexTooComplexException(e);
+ }
}
private int getMaxDeterminizedStates() {
@@ -395,11 +408,11 @@
if (regexes instanceof String) {
return Collections.singletonList((String) regexes);
}
- return (List<String>)regexes;
+ return (List<String>) regexes;
}
- private HitEnum buildLuceneRegexHitEnumForRegex(final
AutomatonHitEnum.Factory factory,
- List<String> fieldValues, final boolean caseInsensitive) {
+ private HitEnum buildLuceneRegexHitEnumForRegex(final
AutomatonHitEnum.Factory factory, List<String> fieldValues,
+ final boolean caseInsensitive) {
final int positionGap = defaultField.getPositionGap();
if (fieldValues.size() == 1) {
String fieldValue = fieldValues.get(0);
@@ -408,8 +421,8 @@
}
return factory.build(fieldValue);
} else {
- Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(
- fieldValues.iterator(), new Function<String,
HitEnumAndLength>() {
+ Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(fieldValues.iterator(),
+ new Function<String, HitEnumAndLength>() {
@Override
public HitEnumAndLength apply(String fieldValue) {
if (caseInsensitive) {
@@ -427,12 +440,11 @@
if (fieldValues.size() == 1) {
return new RegexHitEnum(pattern.matcher(fieldValues.get(0)));
} else {
- Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(
- fieldValues.iterator(), new Function<String,
HitEnumAndLength>() {
+ Iterator<HitEnumAndLength> hitEnumsFromStreams =
Iterators.transform(fieldValues.iterator(),
+ new Function<String, HitEnumAndLength>() {
@Override
public HitEnumAndLength apply(String fieldValue) {
- return new HitEnumAndLength(new
RegexHitEnum(pattern
- .matcher(fieldValue)),
fieldValue.length());
+ return new HitEnumAndLength(new
RegexHitEnum(pattern.matcher(fieldValue)), fieldValue.length());
}
});
return new ConcatHitEnum(hitEnumsFromStreams, positionGap, 1);
@@ -486,7 +498,7 @@
private SnippetChooser buildChooser() {
HitBuilder hitBuilder = Snippet.DEFAULT_HIT_BUILDER;
- if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
+ if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
hitBuilder = GraphvizHit.GRAPHVIZ_HIT_BUILDER;
}
if (context.field.fieldOptions().scoreOrdered()) {
@@ -545,12 +557,11 @@
final SnippetFormatter formatter;
if (getOption("return_offsets", false)) {
formatter = new OffsetSnippetFormatter();
- } else if(getOption(OPTION_RETURN_DEBUG_GRAPH, false)){
+ } else if (getOption(OPTION_RETURN_DEBUG_GRAPH, false)) {
formatter = new
GraphvizSnippetFormatter(defaultField.buildSourceExtracter());
} else {
- formatter = new
SnippetFormatter.Default(defaultField.buildSourceExtracter(),
- context.field.fieldOptions().preTags()[0],
context.field.fieldOptions()
- .postTags()[0]);
+ formatter = new
SnippetFormatter.Default(defaultField.buildSourceExtracter(),
context.field.fieldOptions().preTags()[0],
+ context.field.fieldOptions().postTags()[0]);
}
List<FieldWrapper> fetchFields = buildFetchFields();
@@ -570,7 +581,7 @@
for (Snippet snippet : snippets) {
result[i++] = new StringText(formatter.format(snippet));
int index = picker.index(snippet);
- for (FieldWrapper fetchField: fetchFields) {
+ for (FieldWrapper fetchField : fetchFields) {
List<String> values = fetchField.getFieldValues();
if (index >= 0 && index < values.size()) {
result[i++] = new StringText(values.get(index));
@@ -583,7 +594,8 @@
}
/**
- * Return FieldWrappers for all fetch_fields or null if there aren't
any.
+ * Return FieldWrappers for all fetch_fields or null if there aren't
+ * any.
*/
private List<FieldWrapper> buildFetchFields() {
@SuppressWarnings("unchecked")
@@ -606,8 +618,7 @@
}
}
if (!found) {
- FieldWrapper fieldWrapper = new FieldWrapper(this,
context, weigher,
- fetchField);
+ FieldWrapper fieldWrapper = new FieldWrapper(this,
context, weigher, fetchField);
newExtraFields.add(fieldWrapper);
fetchFieldWrappers.add(fieldWrapper);
}
@@ -629,8 +640,7 @@
}
if (options.fragmenter() == null ||
options.fragmenter().equals("scan")) {
// TODO boundaryChars
- return new
CharScanningSegmenterFactory(options.fragmentCharSize(),
- options.boundaryMaxScan());
+ return new
CharScanningSegmenterFactory(options.fragmentCharSize(),
options.boundaryMaxScan());
}
if (options.fragmenter().equals("sentence")) {
return new SentenceIteratorSegmenterFactory(getLocale(),
options.boundaryMaxScan());
@@ -638,8 +648,7 @@
if (options.fragmenter().equals("none")) {
return new WholeSourceSegmenterFactory();
}
- throw new IllegalArgumentException("Unknown fragmenter: '" +
options.fragmenter()
- + "'. Options are 'scan' or 'sentence'.");
+ throw new IllegalArgumentException("Unknown fragmenter: '" +
options.fragmenter() + "'. Options are 'scan' or 'sentence'.");
}
private Locale getLocale() {
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
new file mode 100644
index 0000000..4a61dfa
--- /dev/null
+++
b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/RegexTooComplexException.java
@@ -0,0 +1,15 @@
+package org.elasticsearch.search.highlight;
+
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+
+/**
+ * Wraps Lucene's XTooComplexToDeterminizeException to be serializable to be
+ * thrown over the wire.
+ */
+public class RegexTooComplexException extends RuntimeException {
+ private static final long serialVersionUID = -41975279199116247L;
+
+ public RegexTooComplexException(XTooComplexToDeterminizeException e) {
+ super(e.getMessage());
+ }
+}
diff --git
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
index 0d3c265..b9aee20 100644
---
a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
+++
b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/RegexTest.java
@@ -212,7 +212,7 @@
options.put("regex_case_sensitive", true);
options.put("regex_flavor", "lucene");
assertFailures(testSearch().setHighlighterOptions(options),
- RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing automaton would result in more than"));
+ RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing [^]]*alt=[^]\\|}]{80,} would result in more
than"));
// Some regexes with explosive state growth still run because they
// don't explode into too many states.
options.put("regex", ".*te*s[tabclse]{1,16}.*");
@@ -222,7 +222,7 @@
options.put("regex", ".*te*s[tabcse]{1,16}.*");
options.put("max_determinized_states", 100);
assertFailures(testSearch().setHighlighterOptions(options),
- RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing automaton would result in more than 100"));
+ RestStatus.INTERNAL_SERVER_ERROR,
containsString("Determinizing .*te*s[tabcse]{1,16}.* would result in more than
100"));
        // It's unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
// I can't find any way from here to mark it otherwise.
}
diff --git a/experimental-highlighter-lucene/pom.xml
b/experimental-highlighter-lucene/pom.xml
index eb36b8a..b2214b9 100644
--- a/experimental-highlighter-lucene/pom.xml
+++ b/experimental-highlighter-lucene/pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
</parent>
<artifactId>experimental-highlighter-lucene</artifactId>
<packaging>jar</packaging>
diff --git a/pom.xml b/pom.xml
index 10f5e7d..1867ef0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,7 +10,7 @@
<groupId>org.wikimedia.search.highlighter</groupId>
<artifactId>experimental</artifactId>
- <version>1.6.1-SNAPSHOT</version>
+ <version>1.7.0-SNAPSHOT</version>
<packaging>pom</packaging>
<modules>
@@ -49,12 +49,12 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <elasticsearch.version>1.6.0</elasticsearch.version>
+ <elasticsearch.version>1.7.0</elasticsearch.version>
<!-- For the Elasticsearch plugin to work this should match the version of
Lucene that Elasticsearch
uses. -->
<lucene.version>4.10.4</lucene.version>
<!-- Note that this has to be compatible with the Elasticsearch version
but won't match it. -->
- <elasticsearch.icu.version>2.5.0</elasticsearch.icu.version>
+ <elasticsearch.icu.version>2.7.0</elasticsearch.icu.version>
</properties>
<build>
--
To view, visit https://gerrit.wikimedia.org/r/225355
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia1b4da9840b0598b402ee8efa086659e5b18eab9
Gerrit-PatchSet: 4
Gerrit-Project: search/highlighter
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits