jenkins-bot has submitted this change and it was merged.
Change subject: Randomized testing for NGramAutomaton
......................................................................
Randomized testing for NGramAutomaton
Found an error dealing with empty automata. Fixed it.
Found another error when trying to extract common components from exact
duplicates. Fixed it too.
Change-Id: I313b12d4d67513dff40ac0bdc26cbf5b492ebf06
---
M
src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
M src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
M src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
M src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
M src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
M src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
M src/test/java/org/apache/lucene/util/automaton/TestOperations.java
R src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
M src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
M src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
10 files changed, 68 insertions(+), 25 deletions(-)
Approvals:
Chad: Looks good to me, approved
jenkins-bot: Verified
diff --git
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
index b17fae6..cd5ea5d 100644
---
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
+++
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
@@ -194,6 +194,12 @@
extractedComponents.add(composite.newFrom(ImmutableSet.copyOf(Sets.difference(composite.components,
sharedComponents)))
.simplify());
}
+ // In the rare case that there aren't any composites but all
the
+ // non-composits are the same we should just return that
+ // non-composite
+ if (composite == null) {
+ return firstNonComposite;
+ }
// Add True to represent the extracted common component
extractedComponents.add(True.<T>instance());
sharedComponents.add(newFrom(extractedComponents.build()).simplify());
diff --git
a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
index f123201..6c6c334 100644
--- a/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
+++ b/src/main/java/org/wikimedia/search/extra/regex/expression/Expression.java
@@ -41,7 +41,6 @@
/**
* Transform this expression into another form.
*
- * @param <T> type stored in leaves
* @param <J> result of the transformation.
*/
<J> J transform(Transformer<T, J> transformer);
diff --git
a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
index 599d6f9..f5b8ea5 100644
--- a/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
+++ b/src/main/java/org/wikimedia/search/extra/regex/ngram/NGramAutomaton.java
@@ -49,6 +49,9 @@
this.maxExpand = maxExpand;
this.maxStatesTraced = maxStatesTraced;
this.maxTransitions = maxTransitions;
+ if (source.getNumStates() == 0) {
+ return;
+ }
// Build the initial states using the first gramSize transitions
int[] codePoints = new int[gramSize - 1];
buildInitial(codePoints, 0, 0);
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
b/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
index 9658eb5..4905b98 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -33,7 +33,7 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.XUnicodeUtil;
-import
org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
+import
org.apache.lucene.util.automaton.XAutomatonTestUtil.RandomAcceptedStrings;
import org.apache.lucene.util.fst.XUtil;
import static
org.apache.lucene.util.automaton.XOperations.DEFAULT_MAX_DETERMINIZED_STATES;
@@ -256,7 +256,7 @@
public void testReverseRandom1() throws Exception {
int ITERS = atLeast(100);
for(int i=0;i<ITERS;i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
XAutomaton ra = XOperations.reverse(a);
XAutomaton rra = XOperations.reverse(ra);
assertTrue(XOperations.sameLanguage(
@@ -269,7 +269,7 @@
int ITERS = atLeast(100);
for(int iter=0;iter<ITERS;iter++) {
//System.out.println("TEST: iter=" + iter);
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
if (random().nextBoolean()) {
a = XOperations.removeDeadStates(a);
}
@@ -336,7 +336,7 @@
public void testBuilderRandom() throws Exception {
int ITERS = atLeast(100);
for(int iter=0;iter<ITERS;iter++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
// Just get all transitions, shuffle, and build a new automaton with the
same transitions:
List<XTransition> allTrans = new ArrayList<>();
@@ -1031,7 +1031,7 @@
}
assertSame(terms, a);
- assertEquals(AutomatonTestUtil.isDeterministicSlow(a),
a.isDeterministic());
+ assertEquals(XAutomatonTestUtil.isDeterministicSlow(a),
a.isDeterministic());
}
assertSame(terms, a);
diff --git
a/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
b/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
index cc8e143..a7ba645 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestDeterminism.java
@@ -31,7 +31,7 @@
public void testRegexps() throws Exception {
int num = atLeast(500);
for (int i = 0; i < num; i++) {
- assertAutomaton(new XRegExp(AutomatonTestUtil.randomRegexp(random()),
XRegExp.NONE).toAutomaton());
+ assertAutomaton(new XRegExp(XAutomatonTestUtil.randomRegexp(random()),
XRegExp.NONE).toAutomaton());
}
}
@@ -39,8 +39,8 @@
public void testAgainstSimple() throws Exception {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
- a = AutomatonTestUtil.determinizeSimple(a);
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+ a = XAutomatonTestUtil.determinizeSimple(a);
XAutomaton b = XOperations.determinize(a,
DEFAULT_MAX_DETERMINIZED_STATES);
// TODO: more verifications possible?
assertTrue(XOperations.sameLanguage(a, b));
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
b/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
index 7789eb3..adf0c19 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestMinimize.java
@@ -29,7 +29,7 @@
public void testBasic() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
XAutomaton la = XOperations.determinize(XOperations.removeDeadStates(a),
DEFAULT_MAX_DETERMINIZED_STATES);
XAutomaton lb = XMinimizationOperations.minimize(a,
@@ -44,8 +44,8 @@
public void testAgainstBrzozowski() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
- a = AutomatonTestUtil.minimizeSimple(a);
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+ a = XAutomatonTestUtil.minimizeSimple(a);
XAutomaton b = XMinimizationOperations.minimize(a,
DEFAULT_MAX_DETERMINIZED_STATES);
assertTrue(XOperations.sameLanguage(a, b));
diff --git a/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
b/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
index d6ad8ba..13a87bb 100644
--- a/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
+++ b/src/test/java/org/apache/lucene/util/automaton/TestOperations.java
@@ -97,12 +97,12 @@
final int ITER2 = atLeast(100);
for(int i=0;i<ITER1;i++) {
- final XRegExp re = new XRegExp(AutomatonTestUtil.randomRegexp(random()),
XRegExp.NONE);
+ final XRegExp re = new
XRegExp(XAutomatonTestUtil.randomRegexp(random()), XRegExp.NONE);
//System.out.println("TEST i=" + i + " re=" + re);
final XAutomaton a = XOperations.determinize(re.toAutomaton(),
DEFAULT_MAX_DETERMINIZED_STATES);
assertFalse(XOperations.isEmpty(a));
- final AutomatonTestUtil.RandomAcceptedStrings rx = new
AutomatonTestUtil.RandomAcceptedStrings(a);
+ final XAutomatonTestUtil.RandomAcceptedStrings rx = new
XAutomatonTestUtil.RandomAcceptedStrings(a);
for(int j=0;j<ITER2;j++) {
//System.out.println("TEST: j=" + j);
int[] acc = null;
@@ -130,8 +130,8 @@
public void testIsFinite() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
- assertEquals(AutomatonTestUtil.isFiniteSlow(a), XOperations.isFinite(a));
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
+ assertEquals(XAutomatonTestUtil.isFiniteSlow(a),
XOperations.isFinite(a));
}
}
@@ -140,7 +140,7 @@
private Set<IntsRef> getFiniteStrings(XAutomaton a, int limit, boolean
testRecursive) {
Set<IntsRef> result = XOperations.getFiniteStrings(a, limit);
if (testRecursive) {
- assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit),
result);
+ assertEquals(XAutomatonTestUtil.getFiniteStringsRecursive(a, limit),
result);
}
return result;
}
@@ -258,7 +258,7 @@
// automaton:
int iters = atLeast(100);
for(int i=0;i<iters;i++) {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
try {
// Must pass a limit because the random automaton
// can accept MANY strings:
@@ -273,7 +273,7 @@
}
public void testInvalidLimit() {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
try {
XOperations.getFiniteStrings(a, -7);
fail("did not hit exception");
@@ -283,7 +283,7 @@
}
public void testInvalidLimit2() {
- XAutomaton a = AutomatonTestUtil.randomAutomaton(random());
+ XAutomaton a = XAutomatonTestUtil.randomAutomaton(random());
try {
XOperations.getFiniteStrings(a, 0);
fail("did not hit exception");
diff --git
a/src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
b/src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
similarity index 98%
rename from
src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
rename to src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
index 199ac5f..6acf121 100644
--- a/src/test/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
+++ b/src/test/java/org/apache/lucene/util/automaton/XAutomatonTestUtil.java
@@ -39,7 +39,7 @@
* and automata, and also provides a number of very
* basic unoptimized implementations (*slow) for testing.
*/
-public class AutomatonTestUtil {
+public class XAutomatonTestUtil {
/**
* Default maximum number of states that {@link XOperations#determinize}
should create.
*/
@@ -260,12 +260,12 @@
/** return a random NFA/DFA for testing */
public static XAutomaton randomAutomaton(Random random) {
// get two random Automata from regexps
- XAutomaton a1 = new XRegExp(AutomatonTestUtil.randomRegexp(random),
XRegExp.NONE).toAutomaton();
+ XAutomaton a1 = new XRegExp(XAutomatonTestUtil.randomRegexp(random),
XRegExp.NONE).toAutomaton();
if (random.nextBoolean()) {
a1 = XOperations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
}
- XAutomaton a2 = new XRegExp(AutomatonTestUtil.randomRegexp(random),
XRegExp.NONE).toAutomaton();
+ XAutomaton a2 = new XRegExp(XAutomatonTestUtil.randomRegexp(random),
XRegExp.NONE).toAutomaton();
if (random.nextBoolean()) {
a2 = XOperations.complement(a2, DEFAULT_MAX_DETERMINIZED_STATES);
}
diff --git
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
index 1397aa4..39c7d27 100644
---
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
+++
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
@@ -49,4 +49,10 @@
foo
).simplify());
}
+
+ @Test
+ public void extractDuplicates() {
+ assertEquals(foo, new Or<>(foo, new And<>(foo)).simplify());
+ }
+
}
diff --git
a/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
b/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
index f1dc1d0..00f8237 100644
---
a/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
+++
b/src/test/java/org/wikimedia/search/extra/regex/ngram/NGramAutomatonTest.java
@@ -1,11 +1,12 @@
package org.wikimedia.search.extra.regex.ngram;
-import static org.junit.Assert.assertEquals;
import static org.wikimedia.search.extra.regex.expression.Leaf.leaves;
import org.apache.lucene.util.automaton.XAutomaton;
+import org.apache.lucene.util.automaton.XAutomatonTestUtil;
import org.apache.lucene.util.automaton.XRegExp;
import org.apache.lucene.util.automaton.XTooComplexToDeterminizeException;
+import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import org.wikimedia.search.extra.regex.expression.And;
import org.wikimedia.search.extra.regex.expression.Expression;
@@ -13,7 +14,9 @@
import org.wikimedia.search.extra.regex.expression.Or;
import org.wikimedia.search.extra.regex.expression.True;
-public class NGramAutomatonTest {
+import com.carrotsearch.randomizedtesting.annotations.Repeat;
+
+public class NGramAutomatonTest extends ElasticsearchTestCase {
@Test
public void simple() {
assertTrigramExpression("cat", new Leaf<>("cat"));
@@ -188,6 +191,31 @@
assertTrigramExpression("[ac]*a[de]{50,80}", null);
}
+ @Test
+ @Repeat(iterations=100)
+ public void randomRegexp() {
+ // Some of the regex strings don't actually compile so just retry
until we get a good one.
+ String str;
+ while (true) {
+ try {
+ str = XAutomatonTestUtil.randomRegexp(getRandom());
+ new XRegExp(str);
+ break;
+ } catch (Exception e) {
+ // retry now
+ }
+ }
+ assertTrigramExpression(str, null);
+ }
+
+ @Test
+ @Repeat(iterations=100)
+ public void randomAutomaton() {
+ NGramAutomaton ngramAutomaton = new
NGramAutomaton(XAutomatonTestUtil.randomAutomaton(getRandom()), between(2, 7),
4, 10000, 500);
+ Expression<String> expression = ngramAutomaton.expression();
+ expression = expression.simplify();
+ }
+
/**
* Asserts that the provided regex extracts the expected expression when
* configured to extract trigrams. Uses 4 as maxExpand just because I had
to
@@ -203,6 +231,7 @@
* pick something and 4 seemed pretty good.
*/
private void assertExpression(String regex, int gramSize,
Expression<String> expected) {
+// System.err.println(regex);
XAutomaton automaton = new XRegExp(regex).toAutomaton(20000);
// System.err.println(automaton.toDot());
NGramAutomaton ngramAutomaton = new NGramAutomaton(automaton,
gramSize, 4, 10000, 500);
--
To view, visit https://gerrit.wikimedia.org/r/171467
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I313b12d4d67513dff40ac0bdc26cbf5b492ebf06
Gerrit-PatchSet: 3
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: BryanDavis <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits