jenkins-bot has submitted this change and it was merged.

Change subject: Improve common terms extraction
......................................................................


Improve common terms extraction

Add support for stuff like
(a AND (a OR b)) -> (a AND (TRUE or b))
which allows the expression to ultimately reduce to
a

Change-Id: I6b5d9401ec5ba1f704323a2d829e633d63bae46b
---
M 
src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
M src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
2 files changed, 108 insertions(+), 36 deletions(-)

Approvals:
  BearND: Looks good to me, approved
  jenkins-bot: Verified



diff --git 
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
 
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
index 5679047..dbba4ec 100644
--- 
a/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
+++ 
b/src/main/java/org/wikimedia/search/extra/regex/expression/AbstractCompositeExpression.java
@@ -1,9 +1,7 @@
 package org.wikimedia.search.extra.regex.expression;
 
-import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Set;
 
 import org.elasticsearch.common.base.Joiner;
@@ -20,6 +18,11 @@
         this.components = components;
     }
 
+    /**
+     * Transform the components of this composite expression.  Used by 
subclasses when implementing transform.
+     * @param transformer transformer to use
+     * @return result of transforming components
+     */
     protected <J> ImmutableSet<J> 
transformComponents(Expression.Transformer<T, J> transformer) {
         ImmutableSet.Builder<J> builder = ImmutableSet.builder();
         for (Expression<T> component : components) {
@@ -62,7 +65,7 @@
     @Override
     public Expression<T> simplify() {
         Iterator<Expression<T>> componentsItr = components.iterator();
-        List<Expression<T>> newComponents = new ArrayList<>();
+        Set<Expression<T>> newComponents = new HashSet<>();
         boolean changed = false;
         while (componentsItr.hasNext()) {
             Expression<T> expression = componentsItr.next();
@@ -76,7 +79,7 @@
             if (forcedOutcome != null) {
                 return forcedOutcome;
             }
-            if (simplified.getClass() == this.getClass()) {
+            if (simplified.getClass() == getClass()) {
                 newComponents.addAll(((AbstractCompositeExpression<T>) 
simplified).components);
                 continue;
             }
@@ -86,52 +89,112 @@
         case 0:
             return True.instance();
         case 1:
-            return newComponents.get(0);
+            return newComponents.iterator().next();
         default:
         }
-        boolean composedOfSameComposites = newComponents.get(0).isComposite();
-        if (composedOfSameComposites) {
-            Class<?> componentClass = newComponents.get(0).getClass();
-            for (int i = 1; composedOfSameComposites && i < 
newComponents.size(); i++) {
-                composedOfSameComposites &= newComponents.get(i).isComposite() 
&& (componentClass == newComponents.get(i).getClass());
+
+        // Are all composite subexpressions of the same type?
+        boolean allCompositesOfSameType = true;
+        // Are all the non-composite subexpressions of this type the same?
+        boolean nonCompositesAreSame = true;
+        // First non-composite subexpression.
+        Expression<T> firstNonComposite = null;
+        // If all composite subexpressions are of the same type then what is 
that type?
+        Class<?> compositesClass = null;
+        for (Expression<T> current : newComponents) {
+            if (current.isComposite()) {
+                if (compositesClass == null) {
+                    compositesClass = current.getClass();
+                } else {
+                    allCompositesOfSameType &= compositesClass == 
current.getClass();
+                }
+            } else {
+                if (firstNonComposite == null) {
+                    firstNonComposite = current;
+                } else {
+                    nonCompositesAreSame &= firstNonComposite.equals(current);
+                }
             }
         }
-        if (composedOfSameComposites) {
-            // If this component is composed of composite components we can
-            // attempt to factor out any equivalent parts
-            AbstractCompositeExpression<T> first = 
(AbstractCompositeExpression<T>) newComponents.get(0);
-            Set<Expression<T>> sharedComponents = null;
-            for (Expression<T> component : first.components) {
-                boolean shared = true;
-                for (int i = 1; i < newComponents.size(); i++) {
-                    shared &= ((AbstractCompositeExpression<T>) 
newComponents.get(i)).components.contains(component);
-                }
-                if (shared) {
-                    if (sharedComponents == null) {
-                        sharedComponents = new HashSet<Expression<T>>();
+
+        if (firstNonComposite == null) {
+            // Everything is composite and they are all of the same type.
+            if (allCompositesOfSameType) {
+                // If this component is composed of composite components we can
+                // attempt to factor out any equivalent parts
+                AbstractCompositeExpression<T> first = 
(AbstractCompositeExpression<T>) newComponents.iterator().next();
+                Set<Expression<T>> sharedComponents = null;
+                for (Expression<T> component : first.components) {
+                    boolean shared = true;
+                    Iterator<Expression<T>> current = newComponents.iterator();
+                    while (shared && current.hasNext()) {
+                        shared &= ((AbstractCompositeExpression<T>) 
current.next()).components.contains(component);
                     }
-                    sharedComponents.add(component);
+                    if (shared) {
+                        if (sharedComponents == null) {
+                            sharedComponents = new HashSet<>();
+                        }
+                        sharedComponents.add(component);
+                    }
+                }
+                if (sharedComponents != null) {
+                    // Build all the subcomponents with the common part 
extracted
+                    ImmutableSet.Builder<Expression<T>> extractedComponents = 
ImmutableSet.builder();
+                    AbstractCompositeExpression<T> composite = null;
+                    for (Expression<T> component : newComponents) {
+                        composite = (AbstractCompositeExpression<T>) component;
+                        
extractedComponents.add(composite.newFrom(ImmutableSet.copyOf(Sets.difference(composite.components,
 sharedComponents)))
+                                .simplify());
+                    }
+                    
sharedComponents.add(newFrom(extractedComponents.build()).simplify());
+                    return 
composite.newFrom(ImmutableSet.copyOf(sharedComponents)).simplify();
                 }
             }
-            if (sharedComponents != null) {
-                // Build all the subcomponents with the common part extracted
-                ImmutableSet.Builder<Expression<T>> extractedComponents = 
ImmutableSet.builder();
+        } else {
+            if (allCompositesOfSameType && nonCompositesAreSame && 
canFactorOut(newComponents, firstNonComposite)) {
+                Set<Expression<T>> sharedComponents = new HashSet<>();
+                sharedComponents.add(firstNonComposite);
                 AbstractCompositeExpression<T> composite = null;
+                ImmutableSet.Builder<Expression<T>> extractedComponents = 
ImmutableSet.builder();
                 for (Expression<T> component : newComponents) {
-                    composite = ((AbstractCompositeExpression<T>) component);
+                    if (!component.isComposite()) {
+                        continue;
+                    }
+                    composite = (AbstractCompositeExpression<T>) component;
                     
extractedComponents.add(composite.newFrom(ImmutableSet.copyOf(Sets.difference(composite.components,
 sharedComponents)))
                             .simplify());
                 }
+                // Add True to represent the extracted common component
+                extractedComponents.add(True.<T>instance());
                 
sharedComponents.add(newFrom(extractedComponents.build()).simplify());
                 return 
composite.newFrom(ImmutableSet.copyOf(sharedComponents)).simplify();
             }
         }
+
         if (!changed) {
             return this;
         }
         return newFrom(ImmutableSet.copyOf(newComponents));
     }
 
+    /**
+     * Can we factor commonComposite out of all subexpressions?
+     */
+    private boolean canFactorOut(Set<Expression<T>> subexpressions, 
Expression<T> commonComposite) {
+        for (Expression<T> current : subexpressions) {
+            if (!current.equals(commonComposite)) {
+                if (!current.isComposite()) {
+                    return false;
+                }
+                AbstractCompositeExpression<T> composite = 
(AbstractCompositeExpression<T>) current;
+                if (!composite.components.contains(commonComposite)) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
     @Override
     public boolean isComposite() {
         return true;
diff --git 
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java 
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
index 17a13e6..1397aa4 100644
--- 
a/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
+++ 
b/src/test/java/org/wikimedia/search/extra/regex/expression/ExpressionTest.java
@@ -25,19 +25,28 @@
 
     @Test
     public void extract() {
-        assertEquals(new And<String>(foo, new Or<String>(bar, baz)),
-                new Or<String>(
-                        new And<String>(foo, bar),
-                        new And<String>(foo, baz)
+        assertEquals(new And<>(foo, new Or<>(bar, baz)),
+                new Or<>(
+                        new And<>(foo, bar),
+                        new And<>(foo, baz)
                 ).simplify());
     }
 
     @Test
     public void extractToEmpty() {
-        assertEquals(new And<String>(foo, bar),
-                new Or<String>(
-                        new And<String>(foo, bar),
-                        new And<String>(foo, bar, baz)
+        assertEquals(new And<>(foo, bar),
+                new Or<>(
+                        new And<>(foo, bar),
+                        new And<>(foo, bar, baz)
+                ).simplify());
+    }
+
+    @Test
+    public void extractSingle() {
+        assertEquals(foo,
+                new Or<>(
+                        new And<>(foo, bar),
+                        foo
                 ).simplify());
     }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/165515
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6b5d9401ec5ba1f704323a2d829e633d63bae46b
Gerrit-PatchSet: 4
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: BryanDavis <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to