Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/225346

Change subject: Update master to 1.7.0
......................................................................

Update master to 1.7.0

Elasticsearch 1.7.0's testing framework requires that all exceptions sent
over the wire be serializable. So we introduce another exception to wrap
one from Lucene that isn't serializable. Well, not wrap so much. More like
repackage its message.

Change-Id: I2ad43ebfff6e6649d9550838573defe2e67fc8d3
---
M pom.xml
A src/main/java/org/wikimedia/search/extra/regex/RegexTooComplexException.java
M src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java
A 
src/test/java/org/wikimedia/search/extra/regex/ExceptionsAreSerializableTest.java
M src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java
5 files changed, 70 insertions(+), 12 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/extra 
refs/changes/46/225346/1

diff --git a/pom.xml b/pom.xml
index 8093484..0c9dc82 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,7 +10,7 @@
 
   <groupId>org.wikimedia.search</groupId>
   <artifactId>extra</artifactId>
-  <version>1.6.1-SNAPSHOT</version>
+  <version>1.7.0-SNAPSHOT</version>
   <description>Extra queries and filters for Elasticsearch.</description>
 
   <licenses>
@@ -43,7 +43,7 @@
 
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <elasticsearch.version>1.6.0</elasticsearch.version>
+    <elasticsearch.version>1.7.0</elasticsearch.version>
     <lucene.version>4.10.4</lucene.version>
   </properties>
 
diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/RegexTooComplexException.java 
b/src/main/java/org/wikimedia/search/extra/regex/RegexTooComplexException.java
new file mode 100644
index 0000000..3eca5a8
--- /dev/null
+++ 
b/src/main/java/org/wikimedia/search/extra/regex/RegexTooComplexException.java
@@ -0,0 +1,15 @@
+package org.wikimedia.search.extra.regex;
+
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+
+/**
+ * Wraps Lucene's XTooComplexToDeterminizeException to be serializable to be
+ * thrown over the wire.
+ */
+public class RegexTooComplexException extends RuntimeException {
+    private static final long serialVersionUID = -41975279199116247L;
+
+    public RegexTooComplexException(XTooComplexToDeterminizeException e) {
+        super(e.getMessage());
+    }
+}
diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java 
b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java
index 7b2e483..4de4107 100644
--- a/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java
+++ b/src/main/java/org/wikimedia/search/extra/regex/SourceRegexFilter.java
@@ -14,7 +14,10 @@
 import org.apache.lucene.util.automaton.XAutomaton;
 import org.apache.lucene.util.automaton.XCharacterRunAutomaton;
 import org.apache.lucene.util.automaton.XRegExp;
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.ESLoggerFactory;
 import org.elasticsearch.common.lucene.docset.AllDocIdSet;
 import org.elasticsearch.common.lucene.search.Queries;
 import org.wikimedia.search.extra.regex.expression.Expression;
@@ -23,6 +26,7 @@
 import org.wikimedia.search.extra.util.FieldValues;
 
 public class SourceRegexFilter extends Filter {
+    private static final ESLogger log = 
ESLoggerFactory.getLogger(SourceRegexFilter.class.getPackage().getName());
     private final String fieldPath;
     private final String ngramFieldPath;
     private final String regex;
@@ -33,7 +37,8 @@
     private Filter prefilter;
     private XCharacterRunAutomaton charRun;
 
-    public SourceRegexFilter(String fieldPath, String ngramFieldPath, String 
regex, FieldValues.Loader loader, Settings settings, int gramSize) {
+    public SourceRegexFilter(String fieldPath, String ngramFieldPath, String 
regex, FieldValues.Loader loader, Settings settings,
+            int gramSize) {
         this.fieldPath = fieldPath;
         this.ngramFieldPath = ngramFieldPath;
         this.regex = regex;
@@ -62,11 +67,12 @@
         }
         if (prefilter == null) {
             try {
-                // The accelerating filter is always assumed to be case 
insensitive/always lowercased
-                XAutomaton automaton = new 
XRegExp(regex.toLowerCase(settings.getLocale()),
-                        XRegExp.ALL ^ 
XRegExp.AUTOMATON).toAutomaton(settings.getMaxDeterminizedStates());
-                Expression<String> expression = new NGramExtractor(gramSize, 
settings.getMaxExpand(),
-                        settings.getMaxStatesTraced(), 
settings.getMaxNgramsExtracted()).extract(automaton).simplify();
+                // The accelerating filter is always assumed to be case
+                // insensitive/always lowercased
+                XAutomaton automaton = regexToAutomaton(new 
XRegExp(regex.toLowerCase(settings.getLocale()), XRegExp.ALL
+                        ^ XRegExp.AUTOMATON));
+                Expression<String> expression = new NGramExtractor(gramSize, 
settings.getMaxExpand(), settings.getMaxStatesTraced(),
+                        
settings.getMaxNgramsExtracted()).extract(automaton).simplify();
                 if (expression.alwaysTrue()) {
                     if (settings.getRejectUnaccelerated()) {
                         throw new UnableToAccelerateRegexException(regex, 
gramSize, ngramFieldPath);
@@ -84,6 +90,21 @@
             }
         }
         return prefilter.getDocIdSet(context, acceptDocs);
+    }
+
+    private XAutomaton regexToAutomaton(XRegExp regex) {
+        try {
+            return regex.toAutomaton(settings.getMaxDeterminizedStates());
+        } catch (XTooComplexToDeterminizeException e) {
+            /*
+             * Since we're going to lose the stack trace we give our future
+             * selves an opportunity to log it in case we need it.
+             */
+            if (log.isDebugEnabled()) {
+                log.debug("Regex too complex to determinize", e);
+            }
+            throw new RegexTooComplexException(e);
+        }
     }
 
     /**
@@ -110,8 +131,7 @@
                 if (!settings.getCaseSensitive()) {
                     regexString = 
regexString.toLowerCase(settings.getLocale());
                 }
-                XAutomaton automaton = new XRegExp(".*" + regexString + ".*", 
XRegExp.ALL ^ XRegExp.AUTOMATON).toAutomaton(settings
-                        .getMaxDeterminizedStates());
+                XAutomaton automaton = regexToAutomaton(new XRegExp(".*" + 
regexString + ".*", XRegExp.ALL ^ XRegExp.AUTOMATON));
                 charRun = new XCharacterRunAutomaton(automaton);
             }
             List<String> values = load(docid);
diff --git 
a/src/test/java/org/wikimedia/search/extra/regex/ExceptionsAreSerializableTest.java
 
b/src/test/java/org/wikimedia/search/extra/regex/ExceptionsAreSerializableTest.java
new file mode 100644
index 0000000..bf763d3
--- /dev/null
+++ 
b/src/test/java/org/wikimedia/search/extra/regex/ExceptionsAreSerializableTest.java
@@ -0,0 +1,23 @@
+package org.wikimedia.search.extra.regex;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+import org.elasticsearch.common.io.ThrowableObjectOutputStream;
+import org.junit.Test;
+
+/**
+ * Exceptions in Elasticsearch must be Serializable by
+ * ThrowableObjectOutputStream.
+ */
+public class ExceptionsAreSerializableTest {
+    @Test
+    public void unableToAccelerateRegexExceptionIsSerializable() {
+        assertTrue(ThrowableObjectOutputStream.canSerialize(new 
UnableToAccelerateRegexException("cat", 3, "trigrams")));
+    }
+
+    @Test
+    public void regexTooComplexExceptionIsSerializable() {
+        assertTrue(ThrowableObjectOutputStream.canSerialize(new 
RegexTooComplexException(new XTooComplexToDeterminizeException(null, 10))));
+    }
+}
diff --git 
a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java 
b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java
index 01b1d33..ddf1123 100644
--- a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java
+++ b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexFilterTest.java
@@ -92,14 +92,14 @@
         indexRandom(true, doc("findme", "test"));
         // The default is good enough to prevent craziness
         assertFailures(search(filter("[^]]*alt=[^]\\|}]{80,}")),
-                RestStatus.INTERNAL_SERVER_ERROR, 
containsString("Determinizing automaton would result in more than"));
+                RestStatus.INTERNAL_SERVER_ERROR, 
containsString("Determinizing [^]]*alt=[^]\\|}]{80,} would result in more 
than"));
         // Some regexes with explosive state growth still run because they
         // don't explode into too many states.
         SearchResponse response = search(filter("te*s[tabclse]{1,16}")).get();
         assertHitCount(response, 1);
         // But you can stop them by lowering maxStatesTraced
         
assertFailures(search(filter("te*s[tabcse]{1,16}").maxDeterminizedStates(100)),
-                RestStatus.INTERNAL_SERVER_ERROR, 
containsString("Determinizing automaton would result in more than 100"));
+                RestStatus.INTERNAL_SERVER_ERROR, 
containsString("Determinizing .*te*s[tabcse]{1,16}.* would result in more than 
100"));
         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
         // I can't find any way from here to mark it otherwise.
     }

-- 
To view, visit https://gerrit.wikimedia.org/r/225346
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2ad43ebfff6e6649d9550838573defe2e67fc8d3
Gerrit-PatchSet: 1
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to