This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 782e769f6c Add BETWEEN support to NumericalFilterOptimizer (#14163)
782e769f6c is described below
commit 782e769f6c683e181903d5b6e1fc469b12193fc0
Author: Yash Mayya <[email protected]>
AuthorDate: Fri Oct 11 11:12:48 2024 +0530
Add BETWEEN support to NumericalFilterOptimizer (#14163)
---
.../optimizer/filter/NumericalFilterOptimizer.java | 277 ++++++++++++++++-----
.../filter/NumericalFilterOptimizerTest.java | 68 +++++
2 files changed, 282 insertions(+), 63 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
index 7d2336a1a0..961dfa3b71 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java
@@ -24,7 +24,6 @@ import javax.annotation.Nullable;
import org.apache.pinot.common.request.Expression;
import org.apache.pinot.common.request.ExpressionType;
import org.apache.pinot.common.request.Function;
-import org.apache.pinot.common.request.Literal;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.data.Schema;
@@ -32,34 +31,44 @@ import org.apache.pinot.sql.FilterKind;
/**
- * Numerical expressions of form "column <operator> literal", where operator
can be '=', '!=', '>', '>=', '<', or '<=',
- * can compare a column of one datatype (say INT) with a literal of different
datatype (say DOUBLE). These expressions
- * can not be evaluated on the Server. Hence, we rewrite such expressions into
an equivalent expression whose LHS and
- * RHS are of the same datatype.
- *
+ * Numerical expressions of the form "column [operator] literal", where
operator can be '=', '!=', '>', '>=', '<', '<=',
+ * or 'BETWEEN' can compare a column of one datatype (say INT) with a literal
of different datatype (say DOUBLE). These
+ * expressions can not be evaluated on the Server. Hence, we rewrite such
expressions into an equivalent expression
+ * whose LHS and RHS are of the same datatype.
+ * <p>
* Simple predicate examples:
- * 1) WHERE "intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
- * 2) WHERE "intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
- * 3) WHERE "intColumn = 5.5" gets rewritten to "WHERE false" because INT
values can not match 5.5.
- * 4) WHERE "intColumn = 3000000000" gets rewritten to "WHERE false" because
INT values can not match 3000000000.
- * 5) WHERE "intColumn != 3000000000" gets rewritten to "WHERE true" because
INT values always not equal to 3000000000.
- * 6) WHERE "intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
- * 7) WHERE "intColumn > -3E9" gets rewritten to "WHERE true" because int
values are always greater than -3E9.
- *
+ * <ol>
+ * <li> "WHERE intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
+ * <li> "WHERE intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
+ * <li> "WHERE intColumn = 5.5" gets rewritten to "WHERE false" because INT
values can not match 5.5.
+ * <li> "WHERE intColumn = 3000000000" gets rewritten to "WHERE false"
because INT values can not match 3000000000.
+ * <li> "WHERE intColumn != 3000000000" gets rewritten to "WHERE true"
because INT values always not equal to
+ * 3000000000.
+ * <li> "WHERE intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
+ * <li> "WHERE intColumn > -3E9" gets rewritten to "WHERE true" because int
values are always greater than -3E9.
+ * <li> "WHERE intColumn BETWEEN 2.5 AND 7.5" gets rewritten to "WHERE
intColumn BETWEEN 3 AND 7"
+ * <li> "WHERE intColumn BETWEEN 5.5 AND 3000000000" gets rewritten to "WHERE
intColumn BETWEEN 6 AND 2147483647" since
+ * 3000000000 is greater than Integer.MAX_VALUE.
+ * <li> "WHERE intColumn BETWEEN 10 AND 0" gets rewritten to "WHERE false"
because lower bound is greater than upper
+ * bound.
+ * </ol>
+ * <p>
* Compound predicate examples:
- * 8) WHERE "intColumn1 = 5.5 AND intColumn2 = intColumn3"
+ * <ol>
+ * <li> "WHERE intColumn1 = 5.5 AND intColumn2 = intColumn3"
* rewrite to "WHERE false AND intColumn2 = intColumn3"
* rewrite to "WHERE intColumn2 = intColumn3"
- * 9) WHERE "intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out
of bounds for integer column)
+ * <li> "WHERE intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is
out of bounds for integer column)
* rewrite to "WHERE true OR false"
* rewrite to "WHERE true"
* rewrite to query without any WHERE clause.
- *
+ * </ol>
+ * <p>
* When entire predicate gets rewritten to false (Example 3 above), the query
will not return any data. Hence, it is
* better for the Broker itself to return an empty response rather than
sending the query to servers for further
* evaluation.
- *
- * TODO: Add support for BETWEEN, IN, and NOT IN operators.
+ * <p>
+ * TODO: Add support for IN, and NOT IN operators.
*/
public class NumericalFilterOptimizer extends BaseAndOrBooleanFilterOptimizer {
@@ -74,33 +83,39 @@ public class NumericalFilterOptimizer extends
BaseAndOrBooleanFilterOptimizer {
Expression optimizeChild(Expression filterExpression, @Nullable Schema
schema) {
Function function = filterExpression.getFunctionCall();
FilterKind kind = FilterKind.valueOf(function.getOperator());
+
+ if (!kind.isRange() && kind != FilterKind.EQUALS && kind !=
FilterKind.NOT_EQUALS) {
+ return filterExpression;
+ }
+
+ List<Expression> operands = function.getOperands();
+ // Verify that LHS is a numeric column and RHS is a literal before
rewriting.
+ Expression lhs = operands.get(0);
+ Expression rhs = operands.get(1);
+
+ DataType dataType = getDataType(lhs, schema);
+ if (dataType == null || !dataType.isNumeric() || !rhs.isSetLiteral()) {
+ // No rewrite here
+ return filterExpression;
+ }
+
switch (kind) {
- case IS_NULL:
- case IS_NOT_NULL:
- // No need to try to optimize IS_NULL and IS_NOT_NULL operations on
numerical columns.
- break;
- default:
- List<Expression> operands = function.getOperands();
- // Verify that LHS is a numeric column and RHS is a numeric literal
before rewriting.
- Expression lhs = operands.get(0);
- Expression rhs = operands.get(1);
- if (isNumericLiteral(rhs)) {
- DataType dataType = getDataType(lhs, schema);
- if (dataType != null && dataType.isNumeric()) {
- switch (kind) {
- case EQUALS:
- case NOT_EQUALS:
- return rewriteEqualsExpression(filterExpression, kind,
dataType, rhs);
- case GREATER_THAN:
- case GREATER_THAN_OR_EQUAL:
- case LESS_THAN:
- case LESS_THAN_OR_EQUAL:
- return rewriteRangeExpression(filterExpression, kind,
dataType, rhs);
- default:
- break;
- }
- }
+ case BETWEEN: {
+ return rewriteBetweenExpression(filterExpression, dataType);
+ }
+ case EQUALS:
+ case NOT_EQUALS:
+ case GREATER_THAN:
+ case GREATER_THAN_OR_EQUAL:
+ case LESS_THAN:
+ case LESS_THAN_OR_EQUAL: {
+ if (kind.isRange()) {
+ return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
+ } else {
+ return rewriteEqualsExpression(filterExpression, kind, dataType,
rhs);
}
+ }
+ default:
break;
}
return filterExpression;
@@ -346,6 +361,159 @@ public class NumericalFilterOptimizer extends
BaseAndOrBooleanFilterOptimizer {
return range;
}
+ /**
+ * Rewrite expressions of the form "column BETWEEN lower AND upper" to
ensure that lower and upper bounds are the same
+ * datatype as the column (or can be cast to the same datatype in the
server).
+ */
+ private static Expression rewriteBetweenExpression(Expression between,
DataType dataType) {
+ // TODO: Consider unifying logic with rewriteRangeExpression
+ List<Expression> operands = between.getFunctionCall().getOperands();
+ Expression lower = operands.get(1);
+ Expression upper = operands.get(2);
+
+ // The BETWEEN filter predicate currently only supports literals as lower
and upper bounds, but we're still checking
+ // here just in case.
+ if (lower.isSetLiteral()) {
+ switch (lower.getLiteral().getSetField()) {
+ case LONG_VALUE: {
+ long actual = lower.getLiteral().getLongValue();
+ // Other data types can be converted on the server side.
+ if (dataType == DataType.INT) {
+ if (actual > Integer.MAX_VALUE) {
+ // Lower bound literal value is greater than the bounds of INT.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual < Integer.MIN_VALUE) {
+ lower.getLiteral().setIntValue(Integer.MIN_VALUE);
+ }
+ }
+ break;
+ }
+ case DOUBLE_VALUE: {
+ double actual = lower.getLiteral().getDoubleValue();
+
+ switch (dataType) {
+ case INT: {
+ if (actual > Integer.MAX_VALUE) {
+ // Lower bound literal value is greater than the bounds of INT.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual < Integer.MIN_VALUE) {
+ lower.getLiteral().setIntValue(Integer.MIN_VALUE);
+ } else {
+ // Double value is in int range
+ int converted = (int) actual;
+ int comparison =
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+ if (comparison >= 0) {
+ lower.getLiteral().setIntValue(converted);
+ } else {
+ lower.getLiteral().setIntValue(converted + 1);
+ }
+ }
+ break;
+ }
+ case LONG: {
+ if (actual > Long.MAX_VALUE) {
+ // Lower bound literal value is greater than the bounds of
LONG.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual < Long.MIN_VALUE) {
+ lower.getLiteral().setLongValue(Long.MIN_VALUE);
+ } else {
+ // Double value is in long range
+ long converted = (long) actual;
+ int comparison =
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+ if (comparison >= 0) {
+ lower.getLiteral().setLongValue(converted);
+ } else {
+ lower.getLiteral().setLongValue(converted + 1);
+ }
+ }
+ break;
+ }
+ default:
+ // For other numeric data types, the double literal can be
converted on the server side.
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (upper.isSetLiteral()) {
+ switch (upper.getLiteral().getSetField()) {
+ case LONG_VALUE: {
+ long actual = upper.getLiteral().getLongValue();
+ // Other data types can be converted on the server side.
+ if (dataType == DataType.INT) {
+ if (actual < Integer.MIN_VALUE) {
+ // Upper bound literal value is lesser than the bounds of INT.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual > Integer.MAX_VALUE) {
+ upper.getLiteral().setIntValue(Integer.MAX_VALUE);
+ }
+ }
+ break;
+ }
+ case DOUBLE_VALUE: {
+ double actual = upper.getLiteral().getDoubleValue();
+
+ switch (dataType) {
+ case INT: {
+ if (actual < Integer.MIN_VALUE) {
+ // Upper bound literal value is lesser than the bounds of INT.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual > Integer.MAX_VALUE) {
+ upper.getLiteral().setIntValue(Integer.MAX_VALUE);
+ } else {
+ // Double value is in int range
+ int converted = (int) actual;
+ int comparison =
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+ if (comparison <= 0) {
+ upper.getLiteral().setIntValue(converted);
+ } else {
+ upper.getLiteral().setIntValue(converted - 1);
+ }
+ }
+ break;
+ }
+ case LONG: {
+ if (actual < Long.MIN_VALUE) {
+ // Upper bound literal value is lesser than the bounds of LONG.
+ return getExpressionFromBoolean(false);
+ }
+ if (actual > Long.MAX_VALUE) {
+ upper.getLiteral().setLongValue(Long.MAX_VALUE);
+ } else {
+ // Double value is in long range
+ long converted = (long) actual;
+ int comparison =
BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
+ if (comparison <= 0) {
+ upper.getLiteral().setLongValue(converted);
+ } else {
+ upper.getLiteral().setLongValue(converted - 1);
+ }
+ }
+ break;
+ }
+ default:
+ // For other numeric data types, the double literal can be
converted on the server side.
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ return between;
+ }
+
/**
* Helper function to rewrite range operator of a range expression.
* @param range Range expression.
@@ -356,9 +524,9 @@ public class NumericalFilterOptimizer extends
BaseAndOrBooleanFilterOptimizer {
if (comparison > 0) {
// Literal value is greater than the converted value, so rewrite:
// "column > literal" to "column > converted"
- // "column >= literal" to "column >= converted"
+ // "column >= literal" to "column > converted"
// "column < literal" to "column <= converted"
- // "column <= literal" to "column < converted"
+ // "column <= literal" to "column <= converted"
if (kind == FilterKind.GREATER_THAN || kind ==
FilterKind.GREATER_THAN_OR_EQUAL) {
range.getFunctionCall().setOperator(FilterKind.GREATER_THAN.name());
} else if (kind == FilterKind.LESS_THAN || kind ==
FilterKind.LESS_THAN_OR_EQUAL) {
@@ -413,21 +581,4 @@ public class NumericalFilterOptimizer extends
BaseAndOrBooleanFilterOptimizer {
}
return null;
}
-
- /** @return true if expression is a numeric literal; otherwise, false. */
- private static boolean isNumericLiteral(Expression expression) {
- if (expression.getType() == ExpressionType.LITERAL) {
- Literal._Fields type = expression.getLiteral().getSetField();
- switch (type) {
- case INT_VALUE:
- case LONG_VALUE:
- case FLOAT_VALUE:
- case DOUBLE_VALUE:
- return true;
- default:
- break;
- }
- }
- return false;
- }
}
diff --git
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
index e947740e24..363ef8887d 100644
---
a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java
@@ -295,6 +295,74 @@ public class NumericalFilterOptimizerTest {
+ ".223372036854776E18>)]))");
}
+ @Test
+ public void testBetweenRewrites() {
+ // Test LONG literal with INT column.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN 3000000000 AND 4000000000"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN -4000000000 AND -3000000000"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+
+ // No rewrite
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN -2000000000 AND 2000000000"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:intColumn)),
Expression(type:LITERAL, literal:<Literal "
+ + "intValue:-2000000000>), Expression(type:LITERAL,
literal:<Literal intValue:2000000000>)]))");
+
+ // Test INT column with DOUBLE lower bound greater than Integer.MAX_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN 3000000000.0 AND 4000000000.0"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+ // Test INT column with DOUBLE upper bound lesser than Integer.MIN_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN -4000000000.0 AND -3000000000.0"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+ // Test INT column with LONG lower bound lesser than Integer.MIN_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN -4000000000 AND 0"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:intColumn)),
Expression(type:LITERAL, literal:<Literal "
+ + "intValue:-2147483648>), Expression(type:LITERAL,
literal:<Literal intValue:0>)]))");
+ // Test INT column with LONG upper bound greater than Integer.MAX_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN 0 AND 4000000000"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:intColumn)),
Expression(type:LITERAL, literal:<Literal "
+ + "intValue:0>), Expression(type:LITERAL, literal:<Literal
intValue:2147483647>)]))");
+
+ // Test LONG column with DOUBLE lower bound greater than Long.MAX_VALUE.
+ Assert.assertEquals(
+ rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN
9323372036854775808.0 AND 9323372036854775809.0"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+ // Test LONG column with DOUBLE upper bound lesser than Long.MIN_VALUE.
+ Assert.assertEquals(
+ rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN
-9323372036854775809.0 AND -9323372036854775808.0"),
+ "Expression(type:LITERAL, literal:<Literal boolValue:false>)");
+ // Test LONG column with DOUBLE lower bound lesser than Long.MIN_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn
BETWEEN -9323372036854775809.0 AND 0"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:longColumn)),
Expression(type:LITERAL, literal:<Literal "
+ + "longValue:-9223372036854775808>), Expression(type:LITERAL,
literal:<Literal intValue:0>)]))");
+ // Test LONG column with DOUBLE upper bound greater than Long.MAX_VALUE.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn
BETWEEN 0 AND 9323372036854775808.0"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:longColumn)),
Expression(type:LITERAL, literal:<Literal intValue:0>), "
+ + "Expression(type:LITERAL, literal:<Literal
longValue:9223372036854775807>)]))");
+
+ // Test DOUBLE literal rewrite for INT and LONG columns.
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN 2.5 AND 7.5"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:intColumn)),
Expression(type:LITERAL, literal:<Literal intValue:3>), "
+ + "Expression(type:LITERAL, literal:<Literal intValue:7>)]))");
+
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn
BETWEEN 2.1 AND 7.9"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:longColumn)),
Expression(type:LITERAL, literal:<Literal longValue:3>), "
+ + "Expression(type:LITERAL, literal:<Literal longValue:7>)]))");
+
+ Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn
BETWEEN -5.0 AND -3.5"),
+ "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN,
operands:[Expression(type:IDENTIFIER, "
+ + "identifier:Identifier(name:intColumn)),
Expression(type:LITERAL, literal:<Literal intValue:-5>), "
+ + "Expression(type:LITERAL, literal:<Literal intValue:-4>)]))");
+ }
+
@Test
public void testNull() {
// Test column IS NOT NULL.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]