This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new dd8be2a229 add TextMatchFilterOptimizer to maximally push down
`text_match` filters to Lucene (#12339)
dd8be2a229 is described below
commit dd8be2a2298a3cdc091e342004e6eaca9d399ed9
Author: Christopher Peck <[email protected]>
AuthorDate: Wed Jan 31 13:40:37 2024 -0800
add TextMatchFilterOptimizer to maximally push down `text_match` filters to
Lucene (#12339)
* add TextMatchFilterOptimizer
* fix equivalence for all not
---
.../pinot/core/query/optimizer/QueryOptimizer.java | 8 +-
.../optimizer/filter/TextMatchFilterOptimizer.java | 186 +++++++++++++++++++++
.../core/query/optimizer/QueryOptimizerTest.java | 55 ++++++
3 files changed, 246 insertions(+), 3 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
index 0b98cdb88f..dbd9b5c70c 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/QueryOptimizer.java
@@ -30,6 +30,7 @@ import
org.apache.pinot.core.query.optimizer.filter.IdenticalPredicateFilterOpti
import org.apache.pinot.core.query.optimizer.filter.MergeEqInFilterOptimizer;
import org.apache.pinot.core.query.optimizer.filter.MergeRangeFilterOptimizer;
import org.apache.pinot.core.query.optimizer.filter.NumericalFilterOptimizer;
+import org.apache.pinot.core.query.optimizer.filter.TextMatchFilterOptimizer;
import
org.apache.pinot.core.query.optimizer.filter.TimePredicateFilterOptimizer;
import org.apache.pinot.core.query.optimizer.statement.StatementOptimizer;
import
org.apache.pinot.core.query.optimizer.statement.StringPredicateFilterOptimizer;
@@ -39,14 +40,15 @@ import org.apache.pinot.spi.data.Schema;
public class QueryOptimizer {
// DO NOT change the order of these optimizers.
- // - MergeEqInFilterOptimizer and MergeRangeFilterOptimizer relies on
FlattenAndOrFilterOptimizer to flatten the
- // AND/OR predicate so that the children are on the same level to be merged
+ // - MergeEqInFilterOptimizer, MergeRangeFilterOptimizer, and
TextMatchFilterOptimizer each rely on
+ // FlattenAndOrFilterOptimizer to flatten the AND/OR predicate so that the
children are on the same level to
+ // be merged
// - TimePredicateFilterOptimizer and MergeRangeFilterOptimizer relies on
NumericalFilterOptimizer to convert the
// values to the proper format so that they can be properly parsed
private static final List<FilterOptimizer> FILTER_OPTIMIZERS =
Arrays.asList(new FlattenAndOrFilterOptimizer(), new
IdenticalPredicateFilterOptimizer(),
new MergeEqInFilterOptimizer(), new NumericalFilterOptimizer(), new
TimePredicateFilterOptimizer(),
- new MergeRangeFilterOptimizer());
+ new MergeRangeFilterOptimizer(), new TextMatchFilterOptimizer());
private static final List<StatementOptimizer> STATEMENT_OPTIMIZERS =
Collections.singletonList(new StringPredicateFilterOptimizer());
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
new file mode 100644
index 0000000000..aca4e2d5cc
--- /dev/null
+++
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/TextMatchFilterOptimizer.java
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.query.optimizer.filter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.pinot.common.request.Expression;
+import org.apache.pinot.common.request.ExpressionType;
+import org.apache.pinot.common.request.Function;
+import org.apache.pinot.common.utils.request.RequestUtils;
+import org.apache.pinot.spi.data.Schema;
+import org.apache.pinot.sql.FilterKind;
+
+
+/**
+ * The {@code TextMatchFilterOptimizer} merges `TEXT_MATCH` predicates on the
same column within an `OR` or `AND`,
+ * maximizing the amount of the query that can be pushed down to Lucene
+ *
+ * NOTE: This optimizer follows the {@link FlattenAndOrFilterOptimizer}, so
all the AND/OR filters are already
+ * flattened.
+ */
+public class TextMatchFilterOptimizer implements FilterOptimizer {
+ private static final String SPACE = " ";
+
+ @Override
+ public Expression optimize(Expression filterExpression, @Nullable Schema
schema) {
+ return filterExpression.getType() == ExpressionType.FUNCTION ?
optimize(filterExpression) : filterExpression;
+ }
+
+ private Expression optimize(Expression filterExpression) {
+ Function function = filterExpression.getFunctionCall();
+ if (function == null) {
+ return filterExpression;
+ }
+
+ // no optimization can be performed unless the function is an OR, AND, or
NOT
+ String operator = function.getOperator();
+ if (!operator.equals(FilterKind.OR.name()) &&
!operator.equals(FilterKind.AND.name()) && !operator.equals(
+ FilterKind.NOT.name())) {
+ return filterExpression;
+ }
+
+ List<Expression> children = function.getOperands();
+ children.replaceAll(this::optimize);
+
+ List<Expression> newChildren = new ArrayList<>();
+ Map<Expression, List<Expression>> textMatchMap = new HashMap<>();
+ boolean recreateFilter = false;
+
+ // iterate over all child expressions to collect TEXT_MATCH filters for
each identifier
+ for (Expression child : children) {
+ Function childFunction = child.getFunctionCall();
+ if (childFunction == null) {
+ newChildren.add(child);
+ } else {
+ String childOperator = childFunction.getOperator();
+ if (childOperator.equals(FilterKind.TEXT_MATCH.name())) {
+ List<Expression> operands = childFunction.getOperands();
+ Expression identifier = operands.get(0);
+ textMatchMap.computeIfAbsent(identifier, k -> new
ArrayList<>()).add(child);
+ } else if (childOperator.equals(FilterKind.NOT.name())) {
+ assert childFunction.getOperands().size() == 1;
+ Expression operand = childFunction.getOperands().get(0);
+ Function notChildFunction = operand.getFunctionCall();
+ if (notChildFunction == null) {
+ newChildren.add(child);
+ continue;
+ }
+ if
(notChildFunction.getOperator().equals(FilterKind.TEXT_MATCH.name())) {
+ List<Expression> operands = notChildFunction.getOperands();
+ Expression identifier = operands.get(0);
+ textMatchMap.computeIfAbsent(identifier, k -> new
ArrayList<>()).add(child);
+ continue;
+ }
+ newChildren.add(child);
+ } else {
+ Expression newChild = optimize(child);
+ if (!newChild.equals(child)) {
+ recreateFilter = true;
+ }
+ newChildren.add(optimize(child));
+ }
+ }
+ }
+
+ for (List<Expression> values : textMatchMap.values()) {
+ if (values.size() > 1) {
+ recreateFilter = true;
+ break;
+ }
+ }
+ if (recreateFilter) {
+ return getNewFilter(operator, newChildren, textMatchMap);
+ }
+ return filterExpression;
+ }
+
+ private Expression getNewFilter(String operator, List<Expression>
newChildren,
+ Map<Expression, List<Expression>> textMatchMap) {
+ // for each key in textMatchMap, build a TEXT_MATCH expression (merge list
of filters)
+ for (Map.Entry<Expression, List<Expression>> entry :
textMatchMap.entrySet()) {
+ // special case: if all expressions are NOT, then wrap the merged text
match inside a NOT. otherwise, push the
+ // NOT down into the text match expression
+ boolean allNot = true;
+ for (Expression expression : entry.getValue()) {
+ if
(!expression.getFunctionCall().getOperator().equals(FilterKind.NOT.name())) {
+ allNot = false;
+ break;
+ }
+ }
+
+ List<String> literals = new ArrayList<>();
+ if (allNot) {
+ for (Expression expression : entry.getValue()) {
+ Expression operand =
expression.getFunctionCall().getOperands().get(0);
+
literals.add(operand.getFunctionCall().getOperands().get(1).getLiteral().getStringValue());
+ }
+ } else {
+ for (Expression expression : entry.getValue()) {
+ if
(expression.getFunctionCall().getOperator().equals(FilterKind.NOT.name())) {
+ Expression operand =
expression.getFunctionCall().getOperands().get(0);
+ literals.add(FilterKind.NOT.name() + SPACE +
operand.getFunctionCall().getOperands().get(1).getLiteral()
+ .getStringValue());
+ continue;
+ }
+ assert
expression.getFunctionCall().getOperator().equals(FilterKind.TEXT_MATCH.name());
+
literals.add(expression.getFunctionCall().getOperands().get(1).getLiteral().getStringValue());
+ }
+ }
+
+ // build the merged TEXT_MATCH expression
+ String mergedTextMatchFilter;
+ if (allNot) {
+ assert operator.equals(FilterKind.AND.name()) ||
operator.equals(FilterKind.OR.name());
+ if (operator.equals(FilterKind.AND.name())) {
+ mergedTextMatchFilter = String.join(SPACE + FilterKind.OR.name() +
SPACE, literals);
+ } else {
+ mergedTextMatchFilter = String.join(SPACE + FilterKind.AND.name() +
SPACE, literals);
+ }
+ } else {
+ mergedTextMatchFilter = String.join(SPACE + operator + SPACE,
literals);
+ }
+ Expression mergedTextMatchExpression =
RequestUtils.getFunctionExpression(FilterKind.TEXT_MATCH.name());
+ Expression mergedTextMatchFilterExpression =
RequestUtils.getLiteralExpression(mergedTextMatchFilter);
+ mergedTextMatchExpression.getFunctionCall()
+ .setOperands(Arrays.asList(entry.getKey(),
mergedTextMatchFilterExpression));
+
+ if (allNot) {
+ Expression notExpression =
RequestUtils.getFunctionExpression(FilterKind.NOT.name());
+
notExpression.getFunctionCall().setOperands(Collections.singletonList(mergedTextMatchExpression));
+ newChildren.add(notExpression);
+ continue;
+ }
+ newChildren.add(mergedTextMatchExpression);
+ }
+
+ if (newChildren.size() == 1) {
+ return newChildren.get(0);
+ }
+ assert operator.equals(FilterKind.OR.name()) ||
operator.equals(FilterKind.AND.name());
+ Expression newExpression = RequestUtils.getFunctionExpression(operator);
+ newExpression.getFunctionCall().setOperands(newChildren);
+ return newExpression;
+ }
+}
diff --git
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
index 631b4ee9d2..337075f974 100644
---
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/QueryOptimizerTest.java
@@ -166,6 +166,21 @@ public class QueryOptimizerTest {
assertEquals(fourthChildChildren.get(1).getFunctionCall().getOperator(),
FilterKind.LESS_THAN.name());
}
+ @Test
+ public void testMergeTextMatchFilter() {
+ String query =
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND
TEXT_MATCH(string, 'bar') OR TEXT_MATCH(string, "
+ + "'baz')";
+ PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(query);
+ OPTIMIZER.optimize(pinotQuery, SCHEMA);
+ Function filterFunction =
pinotQuery.getFilterExpression().getFunctionCall();
+ assertEquals(filterFunction.getOperator(), FilterKind.TEXT_MATCH.name());
+ List<Expression> operands = filterFunction.getOperands();
+ assertEquals(operands.size(), 2);
+ assertEquals(operands.get(0),
RequestUtils.getIdentifierExpression("string"));
+ assertEquals(operands.get(1), RequestUtils.getLiteralExpression("foo AND
bar OR baz"));
+ }
+
private static Expression getRangeFilterExpression(String column, String
rangeString) {
Expression rangeFilterExpression =
RequestUtils.getFunctionExpression(FilterKind.RANGE.name());
rangeFilterExpression.getFunctionCall().setOperands(
@@ -250,6 +265,39 @@ public class QueryOptimizerTest {
testQuery("SELECT * FROM testTable WHERE 1=1 AND true", "SELECT * FROM
testTable WHERE true");
testQuery("SELECT * FROM testTable WHERE \"a\"=\"a\" AND true", "SELECT *
FROM testTable WHERE true");
+
+ // TextMatchFilterOptimizer
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND
TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo AND bar')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\"')
AND TEXT_MATCH(string, 'baz')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\" AND
baz')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\"')
AND TEXT_MATCH(string, '/.*ooba.*/')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, '\"foo bar\" AND
/.*ooba.*/')");
+ testQuery("SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string,
'foo') AND TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string, 'foo AND
bar')");
+ testQuery("SELECT * FROM testTable WHERE int = 1 OR TEXT_MATCH(string,
'foo') AND TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE int = 1 OR TEXT_MATCH(string, 'foo AND
bar')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND NOT
TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo AND NOT bar')");
+ testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') AND
TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string, 'NOT foo AND bar')");
+ testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') AND
NOT TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo OR bar')");
+ testQuery("SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo') OR
NOT TEXT_MATCH(string, 'bar')",
+ "SELECT * FROM testTable WHERE NOT TEXT_MATCH(string, 'foo AND bar')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string, 'foo') AND
TEXT_MATCH(string, 'bar') OR "
+ + "TEXT_MATCH(string, 'baz')", "SELECT * FROM testTable WHERE
TEXT_MATCH(string, 'foo AND bar OR baz')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1') AND
TEXT_MATCH(string1, 'bar1') OR "
+ + "TEXT_MATCH(string1, 'baz1') AND TEXT_MATCH(string2, 'foo')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1 AND bar1') OR
TEXT_MATCH(string1, 'baz1') AND "
+ + "TEXT_MATCH(string2, 'foo')");
+ testQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1') AND
TEXT_MATCH(string1, 'bar1')"
+ + "AND TEXT_MATCH(string2, 'foo2') AND TEXT_MATCH(string2,
'bar2')",
+ "SELECT * FROM testTable WHERE TEXT_MATCH(string1, 'foo1 AND bar1')
AND TEXT_MATCH(string2, 'foo2 AND bar2')");
+ testCannotOptimizeQuery("SELECT * FROM testTable WHERE TEXT_MATCH(string1,
'foo') OR TEXT_MATCH(string2, 'bar')");
+ testCannotOptimizeQuery(
+ "SELECT * FROM testTable WHERE int = 1 AND TEXT_MATCH(string, 'foo')
OR TEXT_MATCH(string, 'bar')");
+ testCannotOptimizeQuery("SELECT * FROM testTable WHERE NOT
TEXT_MATCH(string, 'foo')");
}
private static void testQuery(String actual, String expected) {
@@ -262,6 +310,13 @@ public class QueryOptimizerTest {
comparePinotQuery(actualPinotQuery, expectedPinotQuery);
}
+ private static void testCannotOptimizeQuery(String query) {
+ PinotQuery actualPinotQuery = CalciteSqlParser.compileToPinotQuery(query);
+ OPTIMIZER.optimize(actualPinotQuery, SCHEMA);
+ PinotQuery expectedPinotQuery =
CalciteSqlParser.compileToPinotQuery(query);
+ comparePinotQuery(actualPinotQuery, expectedPinotQuery);
+ }
+
private static void comparePinotQuery(PinotQuery actual, PinotQuery
expected) {
if (expected.getFilterExpression() == null) {
assertNull(actual.getFilterExpression());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]