This is an automated email from the ASF dual-hosted git repository.

tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new dd8be2a229 add TextMatchFilterOptimizer to maximally push down 
`text_match` filters to Lucene (#12339)
dd8be2a229 is described below

commit dd8be2a2298a3cdc091e342004e6eaca9d399ed9
Author: Christopher Peck <27231838+itschrisp...@users.noreply.github.com>
AuthorDate: Wed Jan 31 13:40:37 2024 -0800

    add TextMatchFilterOptimizer to maximally push down `text_match` filters to 
Lucene (#12339)
    
    * add TextMatchFilterOptimizer
    
    * fix equivalence for all not
---
 .../pinot/core/query/optimizer/QueryOptimizer.java |   8 +-
 .../optimizer/filter/TextMatchFilterOptimizer.java | 186 +++++++++++++++++++++
 .../core/query/optimizer/QueryOptimizerTest.java   |  55 ++++++
 3 files changed, 246 insertions(+), 3 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
index 0b98cdb88f..dbd9b5c70c 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
@@ -30,6 +30,7 @@ import 
org.apache.pinot.core.query.optimizer.filter.IdenticalPredicateFilterOpti
 import org.apache.pinot.core.query.optimizer.filter.MergeEqInFilterOptimizer;
 import org.apache.pinot.core.query.optimizer.filter.MergeRangeFilterOptimizer;
 import org.apache.pinot.core.query.optimizer.filter.NumericalFilterOptimizer;
+import org.apache.pinot.core.query.optimizer.filter.TextMatchFilterOptimizer;
 import 
org.apache.pinot.core.query.optimizer.filter.TimePredicateFilterOptimizer;
 import org.apache.pinot.core.query.optimizer.statement.StatementOptimizer;
 import 
org.apache.pinot.core.query.optimizer.statement.StringPredicateFilterOptimizer;
@@ -39,14 +40,15 @@ import org.apache.pinot.spi.data.Schema;
 
 public class QueryOptimizer {
   // DO NOT change the order of these optimizers.
-  // - MergeEqInFilterOptimizer and MergeRangeFilterOptimizer relies on 
FlattenAndOrFilterOptimizer to flatten the
-  //   AND/OR predicate so that the children are on the same level to be merged
+  // - MergeEqInFilterOptimizer, MergeRangeFilterOptimizer, and 
TextMatchFilterOptimizer each rely on
+  //   FlattenAndOrFilterOptimizer to flatten the AND/OR predicate so that the 
children are on the same level to
+  //   be merged
   // - TimePredicateFilterOptimizer and MergeRangeFilterOptimizer relies on 
NumericalFilterOptimizer to convert the
   //   values to the proper format so that they can be properly parsed
   private static final List<FilterOptimizer> FILTER_OPTIMIZERS =
       Arrays.asList(new FlattenAndOrFilterOptimizer(), new 
IdenticalPredicateFilterOptimizer(),
           new MergeEqInFilterOptimizer(), new NumericalFilterOptimizer(), new 
TimePredicateFilterOptimizer(),
-          new MergeRangeFilterOptimizer());
+          new MergeRangeFilterOptimizer(), new TextMatchFilterOptimizer());
 
   private static final List<StatementOptimizer> STATEMENT_OPTIMIZERS =
       Collections.singletonList(new StringPredicateFilterOptimizer());
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
new file mode 100644
index 0000000000..aca4e2d5cc
--- /dev/null
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.query.optimizer.filter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.pinot.common.request.Expression;
+import org.apache.pinot.common.request.ExpressionType;
+import org.apache.pinot.common.request.Function;
+import org.apache.pinot.common.utils.request.RequestUtils;
+import org.apache.pinot.spi.data.Schema;
+import org.apache.pinot.sql.FilterKind;
+
+
+/**
+ * The {@code TextMatchFilterOptimizer} merges `TEXT_MATCH` predicates on the 
same column within an `OR` or `AND`,
+ * maximizing the amount of the query that can be pushed down to Lucene
+ *
+ * NOTE: This optimizer follows the {@link FlattenAndOrFilterOptimizer}, so 
all the AND/OR filters are already
+ *       flattened.
+ */
+public class TextMatchFilterOptimizer implements FilterOptimizer {
+  private static final String SPACE = " ";
+
+  @Override
+  public Expression optimize(Expression filterExpression, @Nullable Schema 
schema) {
+    return filterExpression.getType() == ExpressionType.FUNCTION ? 
optimize(filterExpression) : filterExpression;
+  }
+
+  private Expression optimize(Expression filterExpression) {
+    Function function = filterExpression.getFunctionCall();
+    if (function == null) {
+      return filterExpression;
+    }
+
+    // no optimization can be performed unless the function is an OR, AND, or 
NOT
+    String operator = function.getOperator();
+    if (!operator.equals(FilterKind.OR.name()) && 
!operator.equals(FilterKind.AND.name()) && !operator.equals(
+        FilterKind.NOT.name())) {
+      return filterExpression;
+    }
+
+    List<Expression> children = function.getOperands();
+    children.replaceAll(this::optimize);
+
+    List<Expression> newChildren = new ArrayList<>();
+    Map<Expression, List<Expression>> textMatchMap = new HashMap<>();
+    boolean recreateFilter = false;
+
+    // iterate over all child expressions to collect TEXT_MATCH filters for 
each identifier
+    for (Expression child : children) {
+      Function childFunction = child.getFunctionCall();
+      if (childFunction == null) {
+        newChildren.add(child);
+      } else {
+        String childOperator = childFunction.getOperator();
+        if (childOperator.equals(FilterKind.TEXT_MATCH.name())) {
+          List<Expression> operands = childFunction.getOperands();
+          Expression identifier = operands.get(0);
+          textMatchMap.computeIfAbsent(identifier, k -> new 
ArrayList<>()).add(child);
+        } else if (childOperator.equals(FilterKind.NOT.name())) {
+          assert childFunction.getOperands().size() == 1;
+          Expression operand = childFunction.getOperands().get(0);
+          Function notChildFunction = operand.getFunctionCall();
+          if (notChildFunction == null) {
+            newChildren.add(child);
+            continue;
+          }
+          if 
(notChildFunction.getOperator().equals(FilterKind.TEXT_MATCH.name())) {
+            List<Expression> operands = notChildFunction.getOperands();
+            Expression identifier = operands.get(0);
+            textMatchMap.computeIfAbsent(identifier, k -> new 
ArrayList<>()).add(child);
+            continue;
+          }
+          newChildren.add(child);
+        } else {
+          Expression newChild = optimize(child);
+          if (!newChild.equals(child)) {
+            recreateFilter = true;
+          }
+          newChildren.add(optimize(child));
+        }
+      }
+    }
+
+    for (List<Expression> values : textMatchMap.values()) {
+      if (values.size() > 1) {
+        recreateFilter = true;
+        break;
+      }
+    }
+    if (recreateFilter) {
+      return getNewFilter(operator, newChildren, textMatchMap);
+    }
+    return filterExpression;
+  }
+
+  private Expression getNewFilter(String operator, List<Expression> 
newChildren,
+      Map<Expression, List<Expression>> textMatchMap) {
+    // for each key in textMatchMap, build a TEXT_MATCH expression (merge list 
of filters)
+    for (Map.Entry<Expression, List<Expression>> entry : 
textMatchMap.entrySet()) {
+      // special case: if all expressions are NOT, then wrap the merged text 
match inside a NOT. otherwise, push the
+      // NOT down into the text match expression
+      boolean allNot = true;
+      for (Expression expression : entry.getValue()) {
+        if 
(!expression.getFunctionCall().getOperator().equals(FilterKind.NOT.name())) {
+          allNot = false;
+          break;
+        }
+      }
+
+      List<String> literals = new ArrayList<>();
+      if (allNot) {
+        for (Expression expression : entry.getValue()) {
+          Expression operand = 
expression.getFunctionCall().getOperands().get(0);
+          
literals.add(operand.getFunctionCall().getOperands().get(1).getLiteral().getStringValue());
+        }
+      } else {
+        for (Expression expression : entry.getValue()) {
+          if 
(expression.getFunctionCall().getOperator().equals(FilterKind.NOT.name())) {
+            Expression operand = 
expression.getFunctionCall().getOperands().get(0);
+            literals.add(FilterKind.NOT.name() + SPACE + 
operand.getFunctionCall().getOperands().get(1).getLiteral()
+                .getStringValue());
+            continue;
+          }
+          assert 
expression.getFunctionCall().getOperator().equals(FilterKind.TEXT_MATCH.name());
+          
literals.add(expression.getFunctionCall().getOperands().get(1).getLiteral().getStringValue());
+        }
+      }
+
+      // build the merged TEXT_MATCH expression
+      String mergedTextMatchFilter;
+      if (allNot) {
+        assert operator.equals(FilterKind.AND.name()) || 
operator.equals(FilterKind.OR.name());
+        if (operator.equals(FilterKind.AND.name())) {
+          mergedTextMatchFilter = String.join(SPACE + FilterKind.OR.name() + 
SPACE, literals);
+        } else {
+          mergedTextMatchFilter = String.join(SPACE + FilterKind.AND.name() + 
SPACE, literals);
+        }
+      } else {
+        mergedTextMatchFilter = String.join(SPACE + operator + SPACE, 
literals);
+      }
+      Expression mergedTextMatchExpression = 
RequestUtils.getFunctionExpression(FilterKind.TEXT_MATCH.name());
+      Expression mergedTextMatchFilterExpression = 
RequestUtils.getLiteralExpression(mergedTextMatchFilter);
+      mergedTextMatchExpression.getFunctionCall()
+          .setOperands(Arrays.asList(entry.getKey(), 
mergedTextMatchFilterExpression));
+
+      if (allNot) {
+        Expression notExpression = 
RequestUtils.getFunctionExpression(FilterKind.NOT.name());
+        
notExpression.getFunctionCall().setOperands(Collections.singletonList(mergedTextMatchExpression));
+        newChildren.add(notExpression);
+        continue;
+      }
+      newChildren.add(mergedTextMatchExpression);
+    }
+
+    if (newChildren.size() == 1) {
+      return newChildren.get(0);
+    }
+    assert operator.equals(FilterKind.OR.name()) || 
operator.equals(FilterKind.AND.name());
+    Expression newExpression = RequestUtils.getFunctionExpression(operator);
+    newExpression.getFunctionCall().setOperands(newChildren);
+    return newExpression;
+  }
+}
diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
 
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
index 631b4ee9d2..337075f974 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
@@ -166,6 +166,21 @@ public class QueryOptimizerTest {
     assertEquals(fourthChildChildren.get(1).getFunctionCall().getOperator(), 
FilterKind.LESS_THAN.name());
   }
 
+  @Test
+  public void testMergeTextMatchFilter() {
+    String query =
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND 
TEXT_MATCH(string, 'bar') OR TEXT_MATCH(string, "
+            + "'baz')";
+    PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(query);
+    OPTIMIZER.optimize(pinotQuery, SCHEMA);
+    Function filterFunction = 
pinotQuery.getFilterExpression().getFunctionCall();
+    assertEquals(filterFunction.getOperator(), FilterKind.TEXT_MATCH.name());
+    List<Expression> operands = filterFunction.getOperands();
+    assertEquals(operands.size(), 2);
+    assertEquals(operands.get(0), 
RequestUtils.getIdentifierExpression("string"));
+    assertEquals(operands.get(1), RequestUtils.getLiteralExpression("foo AND 
bar OR baz"));
+  }
+
   private static Expression getRangeFilterExpression(String column, String 
rangeString) {
     Expression rangeFilterExpression = 
RequestUtils.getFunctionExpression(FilterKind.RANGE.name());
     rangeFilterExpression.getFunctionCall().setOperands(
@@ -250,6 +265,39 @@ public class QueryOptimizerTest {
 
     testQuery("SELECT * FROM testTable WHERE 1=1 AND true", "SELECT * FROM 
testTable WHERE true");
     testQuery("SELECT * FROM testTable WHERE \"a\"=\"a\" AND true", "SELECT * 
FROM testTable WHERE true");
+
+    // TextMatchFilterOptimizer
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND 
TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo AND bar')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\"') 
AND TEXT_MATCH(string, 'baz')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\" AND 
baz')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\"') 
AND TEXT_MATCH(string, '/.*ooba.*/')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\" AND 
/.*ooba.*/')");
+    testQuery("SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string, 
'foo') AND TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string, 'foo AND 
bar')");
+    testQuery("SELECT * FROM testTable WHERE int = 1 OR TEXT_MATCH(string, 
'foo') AND TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE int = 1 OR TEXT_MATCH(string, 'foo AND 
bar')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND NOT 
TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo AND NOT bar')");
+    testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') AND 
TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'NOT foo AND bar')");
+    testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') AND 
NOT TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo OR bar')");
+    testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') OR 
NOT TEXT_MATCH(string, 'bar')",
+        "SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo AND bar')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND 
TEXT_MATCH(string, 'bar') OR "
+        + "TEXT_MATCH(string, 'baz')", "SELECT * FROM testTable WHERE 
TEXT_MATCH(string, 'foo AND bar OR baz')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1') AND 
TEXT_MATCH(string1, 'bar1') OR "
+            + "TEXT_MATCH(string1, 'baz1') AND TEXT_MATCH(string2, 'foo')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1 AND bar1') OR 
TEXT_MATCH(string1, 'baz1') AND "
+            + "TEXT_MATCH(string2, 'foo')");
+    testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1') AND 
TEXT_MATCH(string1, 'bar1')"
+            + "AND TEXT_MATCH(string2, 'foo2') AND TEXT_MATCH(string2, 
'bar2')",
+        "SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1 AND bar1') 
AND TEXT_MATCH(string2, 'foo2 AND bar2')");
+    testCannotOptimizeQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1, 
'foo') OR TEXT_MATCH(string2, 'bar')");
+    testCannotOptimizeQuery(
+        "SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string, 'foo') 
OR TEXT_MATCH(string, 'bar')");
+    testCannotOptimizeQuery("SELECT * FROM testTable WHERE NOT 
TEXT_MATCH(string, 'foo')");
   }
 
   private static void testQuery(String actual, String expected) {
@@ -262,6 +310,13 @@ public class QueryOptimizerTest {
     comparePinotQuery(actualPinotQuery, expectedPinotQuery);
   }
 
+  private static void testCannotOptimizeQuery(String query) {
+    PinotQuery actualPinotQuery = CalciteSqlParser.compileToPinotQuery(query);
+    OPTIMIZER.optimize(actualPinotQuery, SCHEMA);
+    PinotQuery expectedPinotQuery = 
CalciteSqlParser.compileToPinotQuery(query);
+    comparePinotQuery(actualPinotQuery, expectedPinotQuery);
+  }
+
   private static void comparePinotQuery(PinotQuery actual, PinotQuery 
expected) {
     if (expected.getFilterExpression() == null) {
       assertNull(actual.getFilterExpression());


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to