jenkins-bot has submitted this change and it was merged.

Change subject: Randomized testing for NGramAutomaton
......................................................................


Randomized testing for NGramAutomaton

Found an error dealing with empty automata.  Fixed it.
Found another error when trying to extract common components from exact
duplicates.  Fixed it too.

Change-Id: I313b12d4d67513dff40ac0bdc26cbf5b492ebf06
---
M 
src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
M src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
M src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
M src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
M src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
M src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
M src/test/java/org/apache/lucene/util/automaton/TestOperations.java
R src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
M src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
M src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
10 files changed, 68 insertions(+), 25 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
 
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
index b17fae6..cd5ea5d 100644
--- 
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
+++ 
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
@@ -194,6 +194,12 @@
                     
extractedComponents.add(composite.newFrom(ImmutableSet.copyOf(Sets.difference(composite.components,
 sharedComponents)))
                             .simplify());
                 }
+                // In the rare case that there aren't any composites but all 
the
+                // non-composits are the same we should just return that
+                // non-composite
+                if (composite == null) {
+                    return firstNonComposite;
+                }
                 // Add True to represent the extracted common component
                 extractedComponents.add(True.<T>instance());
                 
sharedComponents.add(newFrom(extractedComponents.build()).simplify());
diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java 
b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
index f123201..6c6c334 100644
--- a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
+++ b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
@@ -41,7 +41,6 @@
     /**
      * Transform this expression into another form.
      *
-     * @param <T> type stored in leaves
      * @param <J> result of the transformation.
      */
     <J> J transform(Transformer<T, J> transformer);
diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java 
b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
index 599d6f9..f5b8ea5 100644
--- a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
+++ b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
@@ -49,6 +49,9 @@
         this.maxExpand = maxExpand;
         this.maxStatesTraced = maxStatesTraced;
         this.maxTransitions = maxTransitions;
+        if (source.getNumStates() == 0) {
+            return;
+        }
         // Build the initial states using the first gramSize transitions
         int[] codePoints = new int[gramSize - 1];
         buildInitial(codePoints, 0, 0);
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java 
b/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
index 9658eb5..4905b98 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -33,7 +33,7 @@
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.XUnicodeUtil;
-import 
org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
+import 
org.apache.lucene.util.automaton.XAutomatonTestUtil.RandomAcceptedStrings;
 import org.apache.lucene.util.fst.XUtil;
 
 import static 
org.apache.lucene.util.automaton.XOperations.DEFAULT_MAX_DETERMINIZED_STATES;
@@ -256,7 +256,7 @@
   public void testReverseRandom1() throws Exception {
     int ITERS = atLeast(100);
     for(int i=0;i<ITERS;i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
       XAutomaton ra = XOperations.reverse(a);
       XAutomaton rra = XOperations.reverse(ra);
       assertTrue(XOperations.sameLanguage(
@@ -269,7 +269,7 @@
     int ITERS = atLeast(100);
     for(int iter=0;iter<ITERS;iter++) {
       //System.out.println("TEST: iter=" + iter);
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
       if (random().nextBoolean()) {
         a = XOperations.removeDeadStates(a);
       }
@@ -336,7 +336,7 @@
   public void testBuilderRandom() throws Exception {
     int ITERS = atLeast(100);
     for(int iter=0;iter<ITERS;iter++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
 
       // Just get all transitions, shuffle, and build a new automaton with the 
same transitions:
       List<XTransition> allTrans = new ArrayList<>();
@@ -1031,7 +1031,7 @@
       }
 
       assertSame(terms, a);
-      assertEquals(AutomatonTestUtil.isDeterministicSlow(a), 
a.isDeterministic());
+      assertEquals(XAutomatonTestUtil.isDeterministicSlow(a), 
a.isDeterministic());
     }
 
     assertSame(terms, a);
diff --git 
a/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java 
b/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
index cc8e143..a7ba645 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
@@ -31,7 +31,7 @@
   public void testRegexps() throws Exception {
       int num = atLeast(500);
       for (int i = 0; i < num; i++) {
-        assertAutomaton(new XRegExp(AutomatonTestUtil.randomRegexp(random()), 
XRegExp.NONE).toAutomaton());
+        assertAutomaton(new XRegExp(XAutomatonTestUtil.randomRegexp(random()), 
XRegExp.NONE).toAutomaton());
       }
   }
   
@@ -39,8 +39,8 @@
   public void testAgainstSimple() throws Exception {
     int num = atLeast(200);
     for (int i = 0; i < num; i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
-      a = AutomatonTestUtil.determinizeSimple(a);
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+      a = XAutomatonTestUtil.determinizeSimple(a);
       XAutomaton b = XOperations.determinize(a, 
DEFAULT_MAX_DETERMINIZED_STATES);
       // TODO: more verifications possible?
       assertTrue(XOperations.sameLanguage(a, b));
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java 
b/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
index 7789eb3..adf0c19 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
@@ -29,7 +29,7 @@
   public void testBasic() {
     int num = atLeast(200);
     for (int i = 0; i < num; i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
       XAutomaton la = XOperations.determinize(XOperations.removeDeadStates(a),
         DEFAULT_MAX_DETERMINIZED_STATES);
       XAutomaton lb = XMinimizationOperations.minimize(a,
@@ -44,8 +44,8 @@
   public void testAgainstBrzozowski() {
     int num = atLeast(200);
     for (int i = 0; i < num; i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
-      a = AutomatonTestUtil.minimizeSimple(a);
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+      a = XAutomatonTestUtil.minimizeSimple(a);
       XAutomaton b = XMinimizationOperations.minimize(a,
         DEFAULT_MAX_DETERMINIZED_STATES);
       assertTrue(XOperations.sameLanguage(a, b));
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestOperations.java 
b/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
index d6ad8ba..13a87bb 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
@@ -97,12 +97,12 @@
     final int ITER2 = atLeast(100);
     for(int i=0;i<ITER1;i++) {
 
-      final XRegExp re = new XRegExp(AutomatonTestUtil.randomRegexp(random()), 
XRegExp.NONE);
+      final XRegExp re = new 
XRegExp(XAutomatonTestUtil.randomRegexp(random()), XRegExp.NONE);
       //System.out.println("TEST i=" + i + " re=" + re);
       final XAutomaton a = XOperations.determinize(re.toAutomaton(), 
DEFAULT_MAX_DETERMINIZED_STATES);
       assertFalse(XOperations.isEmpty(a));
 
-      final AutomatonTestUtil.RandomAcceptedStrings rx = new 
AutomatonTestUtil.RandomAcceptedStrings(a);
+      final XAutomatonTestUtil.RandomAcceptedStrings rx = new 
XAutomatonTestUtil.RandomAcceptedStrings(a);
       for(int j=0;j<ITER2;j++) {
         //System.out.println("TEST: j=" + j);
         int[] acc = null;
@@ -130,8 +130,8 @@
   public void testIsFinite() {
     int num = atLeast(200);
     for (int i = 0; i < num; i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
-      assertEquals(AutomatonTestUtil.isFiniteSlow(a), XOperations.isFinite(a));
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+      assertEquals(XAutomatonTestUtil.isFiniteSlow(a), 
XOperations.isFinite(a));
     }
   }
 
@@ -140,7 +140,7 @@
   private Set<IntsRef> getFiniteStrings(XAutomaton a, int limit, boolean 
testRecursive) {
     Set<IntsRef> result = XOperations.getFiniteStrings(a, limit);
     if (testRecursive) {
-      assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), 
result);
+      assertEquals(XAutomatonTestUtil.getFiniteStringsRecursive(a, limit), 
result);
     }
     return result;
   }
@@ -258,7 +258,7 @@
     // automaton:
     int iters = atLeast(100);
     for(int i=0;i<iters;i++) {
-      XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+      XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
       try {
         // Must pass a limit because the random automaton
         // can accept MANY strings:
@@ -273,7 +273,7 @@
   }
 
   public void testInvalidLimit() {
-    XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+    XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
     try {
       XOperations.getFiniteStrings(a, -7);
       fail("did not hit exception");
@@ -283,7 +283,7 @@
   }
 
   public void testInvalidLimit2() {
-    XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+    XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
     try {
       XOperations.getFiniteStrings(a, 0);
       fail("did not hit exception");
diff --git 
a/src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java 
b/src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
similarity index 98%
rename from 
src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
rename to src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
index 199ac5f..6acf121 100644
--- a/src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
+++ b/src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
@@ -39,7 +39,7 @@
  * and automata, and also provides a number of very
  * basic unoptimized implementations (*slow) for testing.
  */
-public class AutomatonTestUtil {
+public class XAutomatonTestUtil {
   /**
    * Default maximum number of states that {@link XOperations#determinize} 
should create.
    */
@@ -260,12 +260,12 @@
   /** return a random NFA/DFA for testing */
   public static XAutomaton randomAutomaton(Random random) {
     // get two random Automata from regexps
-    XAutomaton a1 = new XRegExp(AutomatonTestUtil.randomRegexp(random), 
XRegExp.NONE).toAutomaton();
+    XAutomaton a1 = new XRegExp(XAutomatonTestUtil.randomRegexp(random), 
XRegExp.NONE).toAutomaton();
     if (random.nextBoolean()) {
       a1 = XOperations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
     }
     
-    XAutomaton a2 = new XRegExp(AutomatonTestUtil.randomRegexp(random), 
XRegExp.NONE).toAutomaton();
+    XAutomaton a2 = new XRegExp(XAutomatonTestUtil.randomRegexp(random), 
XRegExp.NONE).toAutomaton();
     if (random.nextBoolean()) {
       a2 = XOperations.complement(a2, DEFAULT_MAX_DETERMINIZED_STATES);
     }
diff --git 
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java 
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
index 1397aa4..39c7d27 100644
--- 
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
+++ 
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
@@ -49,4 +49,10 @@
                         foo
                 ).simplify());
     }
+
+    @Test
+    public void extractDuplicates() {
+        assertEquals(foo, new Or<>(foo, new And<>(foo)).simplify());
+    }
+
 }
diff --git 
a/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java 
b/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
index f1dc1d0..00f8237 100644
--- 
a/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
+++ 
b/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
@@ -1,11 +1,12 @@
 package org.wikimedia.search.extra.regex.ngram;
 
-import static org.junit.Assert.assertEquals;
 import static org.wikimedia.search.extra.regex.expression.Leaf.leaves;
 
 import org.apache.lucene.util.automaton.XAutomaton;
+import org.apache.lucene.util.automaton.XAutomatonTestUtil;
 import org.apache.lucene.util.automaton.XRegExp;
 import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+import org.elasticsearch.test.ElasticsearchTestCase;
 import org.junit.Test;
 import org.wikimedia.search.extra.regex.expression.And;
 import org.wikimedia.search.extra.regex.expression.Expression;
@@ -13,7 +14,9 @@
 import org.wikimedia.search.extra.regex.expression.Or;
 import org.wikimedia.search.extra.regex.expression.True;
 
-public class NGramAutomatonTest {
+import com.carrotsearch.randomizedtesting.annotations.Repeat;
+
+public class NGramAutomatonTest extends ElasticsearchTestCase {
     @Test
     public void simple() {
         assertTrigramExpression("cat", new Leaf<>("cat"));
@@ -188,6 +191,31 @@
         assertTrigramExpression("[ac]*a[de]{50,80}", null);
     }
 
+    @Test
+    @Repeat(iterations=100)
+    public void randomRegexp() {
+        // Some of the regex strings don't actually compile so just retry 
until we get a good one.
+        String str;
+        while (true) {
+            try {
+                str = XAutomatonTestUtil.randomRegexp(getRandom());
+                new XRegExp(str);
+                break;
+            } catch (Exception e) {
+                // retry now
+            }
+        }
+        assertTrigramExpression(str, null);
+    }
+
+    @Test
+    @Repeat(iterations=100)
+    public void randomAutomaton() {
+        NGramAutomaton ngramAutomaton = new 
NGramAutomaton(XAutomatonTestUtil.randomAutomaton(getRandom()), between(2, 7), 
4, 10000, 500);
+        Expression<String> expression = ngramAutomaton.expression();
+        expression = expression.simplify();
+    }
+
     /**
      * Asserts that the provided regex extracts the expected expression when
      * configured to extract trigrams. Uses 4 as maxExpand just because I had 
to
@@ -203,6 +231,7 @@
      * pick something and 4 seemed pretty good.
      */
     private void assertExpression(String regex, int gramSize, 
Expression<String> expected) {
+//         System.err.println(regex);
         XAutomaton automaton = new XRegExp(regex).toAutomaton(20000);
 //         System.err.println(automaton.toDot());
         NGramAutomaton ngramAutomaton = new NGramAutomaton(automaton, 
gramSize, 4, 10000, 500);

-- 
To view, visit https://gerrit.wikimedia.org/r/171467
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I313b12d4d67513dff40ac0bdc26cbf5b492ebf06
Gerrit-PatchSet: 3
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: BryanDavis <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to