zabetak commented on code in PR #6293:
URL: https://github.com/apache/hive/pull/6293#discussion_r2798870380
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
Review Comment:
The logic in this method is similar to `org.apache.calcite.rex.RexUtil#isLosslessCast(org.apache.calcite.rex.RexNode)`. Since the method here has access to actual ranges and stats, it may be more effective for CASTs that narrow the data type. However, adjusting the boundaries and handling the DECIMAL types adds some complexity that we may not necessarily need at this stage.
Would it be feasible to defer the more complex CAST-removal solution to a follow-up and use `isLosslessCast` for this first iteration? How much do we gain from the special handling of the DECIMAL types?
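A minimal sketch of what that first iteration could look like at the call site; the variable names follow the PR, but the exact integration point is an assumption:
```java
// in computeRangePredicateSelectivity, instead of removeCastIfPossible(...)
RexNode node = operands.get(inputRefOpIndex);
if (node.getKind() == SqlKind.CAST && RexUtil.isLosslessCast(node)) {
  // a lossless cast cannot map values to NULL or reorder them,
  // so we can look through it without adjusting any boundaries
  node = ((RexCall) node).getOperands().get(0);
}
```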
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return cast;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats = tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return cast;
+ }
+
+ // we need to check that the possible values of the input to the cast are all within the type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior (*)
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range range = colStat.getRange();
+ if (range == null)
+ return cast;
+ if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+ return cast;
+ if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+ return cast;
+
+ String type = cast.getType().getSqlTypeName().getName();
+
+ double min;
+ double max;
+ switch (type.toLowerCase()) {
+ case serdeConstants.TINYINT_TYPE_NAME:
+ min = Byte.MIN_VALUE;
+ max = Byte.MAX_VALUE;
+ break;
+ case serdeConstants.SMALLINT_TYPE_NAME:
+ min = Short.MIN_VALUE;
+ max = Short.MAX_VALUE;
+ break;
+ case serdeConstants.INT_TYPE_NAME, "integer":
+ min = Integer.MIN_VALUE;
+ max = Integer.MAX_VALUE;
+ break;
+ case serdeConstants.BIGINT_TYPE_NAME, serdeConstants.TIMESTAMP_TYPE_NAME:
+ min = Long.MIN_VALUE;
+ max = Long.MAX_VALUE;
+ break;
+ case serdeConstants.FLOAT_TYPE_NAME:
+ min = -Float.MAX_VALUE;
+ max = Float.MAX_VALUE;
+ break;
+ case serdeConstants.DOUBLE_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ break;
+ case serdeConstants.DECIMAL_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ adjustBoundariesForDecimal(cast, boundaries, inclusive);
+ break;
+ default:
+ // unknown type, do not remove the cast
+ return cast;
+ }
+
+ // see (*)
+ if (range.minValue.doubleValue() < min)
+ return cast;
+ if (range.maxValue.doubleValue() > max)
+ return cast;
+
+ return op0;
+ }
+
+ /**
+ * Adjust the boundaries for a DECIMAL cast.
+ * <p>
+ * See {@link #removeCastIfPossible(RexCall, HiveTableScan, float[], boolean[])}
+ * for an explanation of the parameters.
+ */
+ private static void adjustBoundariesForDecimal(RexCall cast, float[] boundaries, boolean[] inclusive) {
+ // values outside the representable range are cast to NULL, so adapt the boundaries
+ int precision = cast.getType().getPrecision();
+ int scale = cast.getType().getScale();
+ int digits = precision - scale;
+ // the cast does some rounding, e.g., CAST(99.9499 AS DECIMAL(3,1)) = 99.9
+ // but CAST(99.95 AS DECIMAL(3,1)) = NULL
+ float adjust = (float) (5 * Math.pow(10, -(scale + 1)));
+ // the range of values supported by the type is the interval [-typeRangeExtent, typeRangeExtent] (both inclusive)
+ // e.g., the typeRangeExtent is 99.94999 for DECIMAL(3,1)
+ float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - adjust));
+
+ // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown
+ float adjusted1 = inclusive[0] ? boundaries[0] - adjust : Math.nextDown(boundaries[0] + adjust);
+ float adjusted2 = inclusive[1] ? Math.nextDown(boundaries[1] + adjust) : boundaries[1] - adjust;
+
+ float lowerUniverse = inclusive[0] ? -typeRangeExtent : Math.nextDown(-typeRangeExtent);
+ float upperUniverse = inclusive[1] ? typeRangeExtent : Math.nextUp(typeRangeExtent);
+ boundaries[0] = Math.max(adjusted1, lowerUniverse);
+ boundaries[1] = Math.min(adjusted2, upperUniverse);
+ if (boundaries.length >= 4) {
+ boundaries[2] = lowerUniverse;
+ boundaries[3] = upperUniverse;
+ }
+ }
+
private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
- final boolean isLiteralLeft = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
- final boolean isLiteralRight = call.getOperands().get(1).getKind().equals(SqlKind.LITERAL);
- final boolean isInputRefLeft = call.getOperands().get(0).getKind().equals(SqlKind.INPUT_REF);
- final boolean isInputRefRight = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
+ double defaultSelectivity = ((double) 1 / (double) 3);
+ if (!(childRel instanceof HiveTableScan)) {
+ return defaultSelectivity;
+ }
- if (childRel instanceof HiveTableScan && isLiteralLeft != isLiteralRight && isInputRefLeft != isInputRefRight) {
- final HiveTableScan t = (HiveTableScan) childRel;
- final int inputRefIndex = ((RexInputRef) call.getOperands().get(isInputRefLeft ? 0 : 1)).getIndex();
- final List<ColStatistics> colStats = t.getColStat(Collections.singletonList(inputRefIndex));
+ // search for the literal
+ List<RexNode> operands = call.getOperands();
+ final Optional<Float> leftLiteral = extractLiteral(operands.get(0));
+ final Optional<Float> rightLiteral = extractLiteral(operands.get(1));
+ if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) {
+ return defaultSelectivity;
+ }
+ int literalOpIdx = leftLiteral.isPresent() ? 0 : 1;
+
+ // analyze the predicate
+ float value = leftLiteral.orElseGet(rightLiteral::get);
+ int boundaryIdx;
+ boolean openBound = op == SqlKind.LESS_THAN || op == SqlKind.GREATER_THAN;
+ switch (op) {
+ case LESS_THAN, LESS_THAN_OR_EQUAL:
+ boundaryIdx = literalOpIdx;
+ break;
+ case GREATER_THAN, GREATER_THAN_OR_EQUAL:
+ boundaryIdx = 1 - literalOpIdx;
+ break;
+ default:
+ return defaultSelectivity;
+ }
+ float[] boundaries = new float[] { Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY };
+ boolean[] inclusive = new boolean[] { true, true };
+ inclusive[boundaryIdx] = !openBound;
+ boundaries[boundaryIdx] = value;
+
+ // extract the column index from the other operand
+ final HiveTableScan scan = (HiveTableScan) childRel;
+ int inputRefOpIndex = 1 - literalOpIdx;
+ RexNode node = operands.get(inputRefOpIndex);
+ if (node.getKind().equals(SqlKind.CAST)) {
+ node = removeCastIfPossible((RexCall) node, scan, boundaries, inclusive);
+ }
- if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
- final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
- final Object boundValueObject = ((RexLiteral) call.getOperands().get(isLiteralLeft ? 0 : 1)).getValue();
- final SqlTypeName typeName = call.getOperands().get(isInputRefLeft ? 0 : 1).getType().getSqlTypeName();
- float value = extractLiteral(typeName, boundValueObject);
- boolean closedBound = op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.GREATER_THAN_OR_EQUAL);
-
- double selectivity;
- if (op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.LESS_THAN)) {
- selectivity = closedBound ? lessThanOrEqualSelectivity(kll, value) : lessThanSelectivity(kll, value);
- } else {
- selectivity = closedBound ? greaterThanOrEqualSelectivity(kll, value) : greaterThanSelectivity(kll, value);
- }
+ int inputRefIndex = -1;
+ if (node.getKind().equals(SqlKind.INPUT_REF)) {
+ inputRefIndex = ((RexInputRef) node).getIndex();
+ }
- // selectivity does not account for null values, we multiply for the number of non-null values (getN)
- // and we divide by the total (non-null + null values) to get the overall selectivity.
- //
- // Example: consider a filter "col < 3", and the following table rows:
- // _____
- // | col |
- // |_____|
- // |1 |
- // |null |
- // |null |
- // |3 |
- // |4 |
- // -------
- // kll.getN() would be 3, selectivity 1/3, t.getTable().getRowCount() 5
- // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
- return kll.getN() * selectivity / t.getTable().getRowCount();
- }
+ if (inputRefIndex < 0) {
+ return defaultSelectivity;
+ }
+
+ final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
+ if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
+ return defaultSelectivity;
}
- return ((double) 1 / (double) 3);
+
+ // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...)
+ float left = inclusive[0] ? boundaries[0] : Math.nextUp(boundaries[0]);
+ float right = inclusive[1] ? Math.nextUp(boundaries[1]) : boundaries[1];
+
+ final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
+ double rawSelectivity = rangedSelectivity(kll, left, right);
+
+ // rawSelectivity does not account for null values, so we multiply by the number of non-null values (getN)
+ // and divide by the total (non-null + null values) to get the overall selectivity.
+ //
+ // Example: consider a filter "col < 3", and the following table rows:
+ // _____
+ // | col |
+ // |_____|
+ // |1 |
+ // |null |
+ // |null |
+ // |3 |
+ // |4 |
+ // -------
+ // kll.getN() would be 3, rawSelectivity 1/3, scan.getTable().getRowCount() 5
+ // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
+ return kll.getN() * rawSelectivity / scan.getTable().getRowCount();
Review Comment:
Since we perform this computation multiple times, it may be worth putting it in a private method and keeping all documentation/examples there, instead of copying the comments or adding links to other methods.
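For instance, a hypothetical helper along these lines (names illustrative), which would also be the natural home for the worked example in the code above:
```java
/**
 * Scales a selectivity computed over the histogram (non-null values only)
 * to the whole table: multiply by the number of non-null values (kll.getN())
 * and divide by the total row count.
 */
private static double adjustSelectivityForNulls(double rawSelectivity, KllFloatsSketch kll, HiveTableScan scan) {
  return kll.getN() * rawSelectivity / scan.getTable().getRowCount();
}
```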
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +599,215 @@ public void testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue, inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testComputeRangePredicateSelectivityWithCast() {
+ useFieldWithValues("f_numeric", VALUES, KLL);
+ checkSelectivity(3 / 13.f, castAndCompare(TINYINT, GE, int5));
Review Comment:
The test would be easier to read if the expression were created as:
```java
ge(cast("f_numeric", TINYINT), 5)
```
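For example, with small hypothetical helpers on top of the existing `REX_BUILDER` (`inputRefFor` is an assumed lookup from field name to `RexInputRef`):
```java
private RexNode cast(String fieldName, RelDataType type) {
  return REX_BUILDER.makeCast(type, inputRefFor(fieldName));
}

private RexNode ge(RexNode left, int right) {
  return REX_BUILDER.makeCall(GE, left, REX_BUILDER.makeExactLiteral(BigDecimal.valueOf(right)));
}
```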
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +599,215 @@ public void testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue, inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testComputeRangePredicateSelectivityWithCast() {
+ useFieldWithValues("f_numeric", VALUES, KLL);
+ checkSelectivity(3 / 13.f, castAndCompare(TINYINT, GE, int5));
+ checkSelectivity(10 / 13.f, castAndCompare(TINYINT, LT, int5));
+ checkSelectivity(2 / 13.f, castAndCompare(TINYINT, GT, int5));
+ checkSelectivity(11 / 13.f, castAndCompare(TINYINT, LE, int5));
+
+ checkSelectivity(12 / 13f, castAndCompare(TINYINT, GE, int2));
+ checkSelectivity(1 / 13f, castAndCompare(TINYINT, LT, int2));
+ checkSelectivity(5 / 13f, castAndCompare(TINYINT, GT, int2));
+ checkSelectivity(8 / 13f, castAndCompare(TINYINT, LE, int2));
+
+ // check some types
+ checkSelectivity(3 / 13.f, castAndCompare(INTEGER, GE, int5));
+ checkSelectivity(3 / 13.f, castAndCompare(BIGINT, GE, int5));
+ checkSelectivity(3 / 13.f, castAndCompare(FLOAT, GE, int5));
+ checkSelectivity(3 / 13.f, castAndCompare(DOUBLE, GE, int5));
+ }
+
+ @Test
+ public void testComputeRangePredicateSelectivityWithCast2() {
+ useFieldWithValues("f_numeric", VALUES2, KLL2);
+ checkSelectivity(4 / 28.f, castAndCompare(DECIMAL_3_1, GE, literalFloat(1)));
+
+ // values from -99.94999 to 99.94999 (both inclusive)
+ checkSelectivity(7 / 28.f, castAndCompare(DECIMAL_3_1, LT, literalFloat(100)));
+ checkSelectivity(7 / 28.f, castAndCompare(DECIMAL_3_1, LE, literalFloat(100)));
+ checkSelectivity(0 / 28.f, castAndCompare(DECIMAL_3_1, GT, literalFloat(100)));
+ checkSelectivity(0 / 28.f, castAndCompare(DECIMAL_3_1, GE, literalFloat(100)));
+
+ checkSelectivity(10 / 28.f, castAndCompare(DECIMAL_4_1, LT, literalFloat(100)));
+ checkSelectivity(20 / 28.f, castAndCompare(DECIMAL_4_1, LE, literalFloat(100)));
+ checkSelectivity(3 / 28.f, castAndCompare(DECIMAL_4_1, GT, literalFloat(100)));
+ checkSelectivity(13 / 28.f, castAndCompare(DECIMAL_4_1, GE, literalFloat(100)));
+
+ checkSelectivity(2 / 28.f, castAndCompare(DECIMAL_2_1, LT, literalFloat(100)));
+ checkSelectivity(2 / 28.f, castAndCompare(DECIMAL_2_1, LE, literalFloat(100)));
+ checkSelectivity(0 / 28.f, castAndCompare(DECIMAL_2_1, GT, literalFloat(100)));
+ checkSelectivity(0 / 28.f, castAndCompare(DECIMAL_2_1, GE, literalFloat(100)));
+
+ // expected: 100_000f
+ checkSelectivity(1 / 28.f, castAndCompare(DECIMAL_7_1, GT, literalFloat(10000)));
+
+ // expected: 10_000f, 100_000f, because CAST(1_000_000 AS DECIMAL(7,1)) = NULL, and similar for even larger values
+ checkSelectivity(2 / 28.f, castAndCompare(DECIMAL_7_1, GE, literalFloat(9999)));
+ checkSelectivity(2 / 28.f, castAndCompare(DECIMAL_7_1, GE, literalFloat(10000)));
+
+ // expected: 100_000f
+ checkSelectivity(1 / 28.f, castAndCompare(DECIMAL_7_1, GT, literalFloat(10000)));
+ checkSelectivity(1 / 28.f, castAndCompare(DECIMAL_7_1, GT, literalFloat(10001)));
+
+ // expected 1f, 10f, 99.94998f, 99.94999f
+ checkSelectivity(4 / 28.f, castAndCompare(DECIMAL_3_1, GE, literalFloat(1)));
+ checkSelectivity(3 / 28.f, castAndCompare(DECIMAL_3_1, GT, literalFloat(1)));
+ // expected -99.94999f, -99.94998f, 0f, 1f
+ checkSelectivity(4 / 28.f, castAndCompare(DECIMAL_3_1, LE, literalFloat(1)));
+ checkSelectivity(3 / 28.f, castAndCompare(DECIMAL_3_1, LT, literalFloat(1)));
+
+ // the cast would apply a modulo operation to the values outside the range of the cast
+ // so instead a default selectivity should be returned
+ checkSelectivity(1 / 3.f, castAndCompare(TINYINT, LT, literalFloat(100)));
+ checkSelectivity(1 / 3.f, castAndCompare(TINYINT, LT, literalFloat(100)));
+ }
+
+ @Test
+ public void testComputeRangePredicateSelectivityTimestamp() {
+ useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME);
+
+ checkSelectivity(5 / 7.f, REX_BUILDER.makeCall(GE, currentInputRef, literalTimestamp("2020-11-03")));
+ checkSelectivity(4 / 7.f, REX_BUILDER.makeCall(GT, currentInputRef, literalTimestamp("2020-11-03")));
+ checkSelectivity(5 / 7.f, REX_BUILDER.makeCall(LE, currentInputRef, literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(4 / 7.f, REX_BUILDER.makeCall(LT, currentInputRef, literalTimestamp("2020-11-05T11:23:45Z")));
}
+
+ @Test
+ public void testComputeRangePredicateSelectivityDate() {
+ useFieldWithValues("f_date", VALUES_TIME, KLL_TIME);
+
+ checkSelectivity(5 / 7.f, REX_BUILDER.makeCall(GE, currentInputRef, literalDate("2020-11-03")));
+ checkSelectivity(4 / 7.f, REX_BUILDER.makeCall(GT, currentInputRef, literalDate("2020-11-03")));
+ checkSelectivity(4 / 7.f, REX_BUILDER.makeCall(LE, currentInputRef, literalDate("2020-11-05")));
+ checkSelectivity(4 / 7.f, REX_BUILDER.makeCall(LT, currentInputRef, literalDate("2020-11-05")));
+ }
+
+ @Test
+ public void testComputeRangePredicateSelectivityBetweenWithCast() {
+ useFieldWithValues("f_numeric", VALUES2, KLL2);
+ float total = VALUES2.length;
+
+ {
+ float universe = 2; // the number of values that "survive" the cast
+ RexNode cast = REX_BUILDER.makeCast(DECIMAL_2_1, inputRef0);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 1000f);
+ checkBetweenSelectivity(1, universe, total, cast, 1f, 100f);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ {
+ float universe = 7;
+ RexNode cast = REX_BUILDER.makeCast(DECIMAL_3_1, inputRef0);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 1000f);
+ checkBetweenSelectivity(4, universe, total, cast, 1f, 100f);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ {
+ float universe = 23;
+ RexNode cast = REX_BUILDER.makeCast(DECIMAL_4_1, inputRef0);
+ // the values between -999.94999... and 999.94999... (both inclusive) pass through the cast
+ // the values between 99.95 and 100 are rounded up to 100, so they fulfill the BETWEEN
+ checkBetweenSelectivity(13, universe, total, cast, 100, 1000);
+ checkBetweenSelectivity(14, universe, total, cast, 1f, 100f);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ {
+ float universe = 26;
+ RexNode cast = REX_BUILDER.makeCast(DECIMAL_7_1, inputRef0);
+ checkBetweenSelectivity(14, universe, total, cast, 100, 1000);
+ checkBetweenSelectivity(14, universe, total, cast, 1f, 100f);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+ }
+
+ private void checkSelectivity(float expectedSelectivity, RexNode filter) {
+ FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
+ Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(filter), DELTA);
+
+ // swap equation, e.g., col < 5 becomes 5 > col; selectivity stays the same
+ RexCall call = (RexCall) filter;
+ SqlOperator operator = ((RexCall) filter).getOperator();
+ SqlOperator swappedOp;
+ if (operator == LE) {
+ swappedOp = GE;
+ } else if (operator == LT) {
+ swappedOp = GT;
+ } else if (operator == GE) {
+ swappedOp = LE;
+ } else if (operator == GT) {
+ swappedOp = LT;
+ } else if (operator == BETWEEN) {
+ // BETWEEN cannot be swapped
+ return;
+ } else {
+ throw new UnsupportedOperationException();
+ }
+ RexNode swapped = REX_BUILDER.makeCall(swappedOp, call.getOperands().get(1), call.getOperands().get(0));
+ Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(swapped), DELTA);
+ }
Review Comment:
What's the point of swapping if we are already explicitly testing the inverse operation in the test itself? I think it's better to keep the tests explicit and drop this swapping logic.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
Review Comment:
If we decide to proceed with this implementation, then it may be cleaner and more readable to have a dedicated `private static class Boundaries` instead of passing around arrays and trying to decipher what the indexes mean.
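One possible shape, just to illustrate the idea (field names are illustrative):
```java
private static class Boundaries {
  // boundaries of the range predicate (today: boundaries[0] / boundaries[1])
  float lower = Float.NEGATIVE_INFINITY;
  float upper = Float.POSITIVE_INFINITY;
  // today: inclusive[0] / inclusive[1]
  boolean lowerInclusive = true;
  boolean upperInclusive = true;
  // boundaries of the type range (today: boundaries[2] / boundaries[3])
  float typeLower = Float.NEGATIVE_INFINITY;
  float typeUpper = Float.POSITIVE_INFINITY;
}
```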
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return cast;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats = tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return cast;
+ }
+
+ // we need to check that the possible values of the input to the cast are all within the type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior (*)
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range range = colStat.getRange();
+ if (range == null)
+ return cast;
+ if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+ return cast;
+ if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+ return cast;
+
+ String type = cast.getType().getSqlTypeName().getName();
+
+ double min;
+ double max;
+ switch (type.toLowerCase()) {
+ case serdeConstants.TINYINT_TYPE_NAME:
+ min = Byte.MIN_VALUE;
+ max = Byte.MAX_VALUE;
+ break;
+ case serdeConstants.SMALLINT_TYPE_NAME:
+ min = Short.MIN_VALUE;
+ max = Short.MAX_VALUE;
+ break;
+ case serdeConstants.INT_TYPE_NAME, "integer":
+ min = Integer.MIN_VALUE;
+ max = Integer.MAX_VALUE;
+ break;
+ case serdeConstants.BIGINT_TYPE_NAME, serdeConstants.TIMESTAMP_TYPE_NAME:
+ min = Long.MIN_VALUE;
+ max = Long.MAX_VALUE;
+ break;
+ case serdeConstants.FLOAT_TYPE_NAME:
+ min = -Float.MAX_VALUE;
+ max = Float.MAX_VALUE;
+ break;
+ case serdeConstants.DOUBLE_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ break;
+ case serdeConstants.DECIMAL_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ adjustBoundariesForDecimal(cast, boundaries, inclusive);
+ break;
+ default:
+ // unknown type, do not remove the cast
+ return cast;
+ }
+
+ // see (*)
+ if (range.minValue.doubleValue() < min)
+ return cast;
+ if (range.maxValue.doubleValue() > max)
+ return cast;
+
+ return op0;
+ }
+
+ /**
+ * Adjust the boundaries for a DECIMAL cast.
+ * <p>
+ * See {@link #removeCastIfPossible(RexCall, HiveTableScan, float[], boolean[])}
+ * for an explanation of the parameters.
+ */
+ private static void adjustBoundariesForDecimal(RexCall cast, float[] boundaries, boolean[] inclusive) {
+ // values outside the representable range are cast to NULL, so adapt the boundaries
+ int precision = cast.getType().getPrecision();
+ int scale = cast.getType().getScale();
+ int digits = precision - scale;
+ // the cast does some rounding, e.g., CAST(99.9499 AS DECIMAL(3,1)) = 99.9
+ // but CAST(99.95 AS DECIMAL(3,1)) = NULL
+ float adjust = (float) (5 * Math.pow(10, -(scale + 1)));
+ // the range of values supported by the type is the interval [-typeRangeExtent, typeRangeExtent] (both inclusive)
+ // e.g., the typeRangeExtent is 99.94999 for DECIMAL(3,1)
+ float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - adjust));
+
+ // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown
+ float adjusted1 = inclusive[0] ? boundaries[0] - adjust : Math.nextDown(boundaries[0] + adjust);
+ float adjusted2 = inclusive[1] ? Math.nextDown(boundaries[1] + adjust) : boundaries[1] - adjust;
+
+ float lowerUniverse = inclusive[0] ? -typeRangeExtent : Math.nextDown(-typeRangeExtent);
+ float upperUniverse = inclusive[1] ? typeRangeExtent : Math.nextUp(typeRangeExtent);
+ boundaries[0] = Math.max(adjusted1, lowerUniverse);
+ boundaries[1] = Math.min(adjusted2, upperUniverse);
+ if (boundaries.length >= 4) {
+ boundaries[2] = lowerUniverse;
+ boundaries[3] = upperUniverse;
+ }
+ }
+
private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
- final boolean isLiteralLeft = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
- final boolean isLiteralRight = call.getOperands().get(1).getKind().equals(SqlKind.LITERAL);
- final boolean isInputRefLeft = call.getOperands().get(0).getKind().equals(SqlKind.INPUT_REF);
- final boolean isInputRefRight = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
+ double defaultSelectivity = ((double) 1 / (double) 3);
+ if (!(childRel instanceof HiveTableScan)) {
+ return defaultSelectivity;
+ }
- if (childRel instanceof HiveTableScan && isLiteralLeft != isLiteralRight && isInputRefLeft != isInputRefRight) {
- final HiveTableScan t = (HiveTableScan) childRel;
- final int inputRefIndex = ((RexInputRef) call.getOperands().get(isInputRefLeft ? 0 : 1)).getIndex();
- final List<ColStatistics> colStats = t.getColStat(Collections.singletonList(inputRefIndex));
+ // search for the literal
+ List<RexNode> operands = call.getOperands();
+ final Optional<Float> leftLiteral = extractLiteral(operands.get(0));
+ final Optional<Float> rightLiteral = extractLiteral(operands.get(1));
+ if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) {
+ return defaultSelectivity;
+ }
+ int literalOpIdx = leftLiteral.isPresent() ? 0 : 1;
+
+ // analyze the predicate
+ float value = leftLiteral.orElseGet(rightLiteral::get);
+ int boundaryIdx;
+ boolean openBound = op == SqlKind.LESS_THAN || op == SqlKind.GREATER_THAN;
+ switch (op) {
+ case LESS_THAN, LESS_THAN_OR_EQUAL:
+ boundaryIdx = literalOpIdx;
+ break;
+ case GREATER_THAN, GREATER_THAN_OR_EQUAL:
+ boundaryIdx = 1 - literalOpIdx;
+ break;
+ default:
+ return defaultSelectivity;
+ }
+ float[] boundaries = new float[] { Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY };
+ boolean[] inclusive = new boolean[] { true, true };
+ inclusive[boundaryIdx] = !openBound;
+ boundaries[boundaryIdx] = value;
+
+ // extract the column index from the other operand
+ final HiveTableScan scan = (HiveTableScan) childRel;
+ int inputRefOpIndex = 1 - literalOpIdx;
+ RexNode node = operands.get(inputRefOpIndex);
+ if (node.getKind().equals(SqlKind.CAST)) {
+ node = removeCastIfPossible((RexCall) node, scan, boundaries, inclusive);
Review Comment:
Even when the CAST is not removed, the boundaries may change; is this intentional?
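If it is not intentional, one way to keep the non-removal case side-effect free would be to work on copies and commit them only when the cast is actually removed (sketch):
```java
float[] tmpBoundaries = boundaries.clone();
boolean[] tmpInclusive = inclusive.clone();
RexNode stripped = removeCastIfPossible((RexCall) node, scan, tmpBoundaries, tmpInclusive);
if (stripped != node) {
  // the cast was removed: commit the adjusted boundaries
  System.arraycopy(tmpBoundaries, 0, boundaries, 0, boundaries.length);
  System.arraycopy(tmpInclusive, 0, inclusive, 0, inclusive.length);
  node = stripped;
}
```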
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return cast;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats = tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return cast;
+ }
+
+ // we need to check that the possible values of the input to the cast are all within the type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior (*)
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range range = colStat.getRange();
+ if (range == null)
+ return cast;
+ if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+ return cast;
+ if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+ return cast;
+
+ String type = cast.getType().getSqlTypeName().getName();
+
+ double min;
+ double max;
+ switch (type.toLowerCase()) {
Review Comment:
This class mostly uses Calcite APIs, so since we have the `SqlTypeName` readily available, wouldn't it be better to use that instead?
In addition, there is `org.apache.calcite.sql.type.SqlTypeName#getLimit`, which might be relevant and could potentially replace this switch statement.
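Roughly like this, assuming `getLimit(sign, Limit.OVERFLOW, beyond = false)` yields the extreme representable value (untested; note the limits Calcite reports for approximate types may differ from the Java MIN_VALUE/MAX_VALUE constants used here):
```java
SqlTypeName typeName = cast.getType().getSqlTypeName();
switch (typeName) {
case TINYINT:
case SMALLINT:
case INTEGER:
case BIGINT:
  min = ((Number) typeName.getLimit(false, SqlTypeName.Limit.OVERFLOW, false)).doubleValue();
  max = ((Number) typeName.getLimit(true, SqlTypeName.Limit.OVERFLOW, false)).doubleValue();
  break;
// ... remaining cases as in the PR
}
```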
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -51,24 +56,77 @@
import org.mockito.Mock;
import org.mockito.junit.MockitoJUnitRunner;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalTime;
+import java.time.ZoneOffset;
import java.util.Collections;
+import java.util.Objects;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.betweenSelectivity;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.greaterThanOrEqualSelectivity;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.greaterThanSelectivity;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.isHistogramAvailable;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.lessThanOrEqualSelectivity;
import static
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.lessThanSelectivity;
+import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
@RunWith(MockitoJUnitRunner.class)
public class TestFilterSelectivityEstimator {
+ private static final SqlBinaryOperator GT = SqlStdOperatorTable.GREATER_THAN;
+ private static final SqlBinaryOperator GE = SqlStdOperatorTable.GREATER_THAN_OR_EQUAL;
+ private static final SqlBinaryOperator LT = SqlStdOperatorTable.LESS_THAN;
+ private static final SqlBinaryOperator LE = SqlStdOperatorTable.LESS_THAN_OR_EQUAL;
+ private static final SqlOperator BETWEEN = HiveBetween.INSTANCE;
+
private static final float[] VALUES = { 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 7 };
+ private static final float[] VALUES2 = {
+ // rounding for DECIMAL(3,1)
+ // -99.95f and its two predecessors and successors
+ -99.95001f, -99.950005f, -99.95f, -99.94999f, -99.94998f,
+ // some values
+ 0f, 1f, 10f,
+ // rounding for DECIMAL(3,1)
+ // 99.95f and its two predecessors and successors
+ 99.94998f, 99.94999f, 99.95f, 99.950005f, 99.95001f,
+ // 100f and its two predecessors and successors
+ 99.999985f, 99.99999f, 100f, 100.00001f, 100.000015f,
+ // 100.05f and its two predecessors and successors
+ 100.04999f, 100.049995f, 100.05f, 100.05001f, 100.05002f,
+ // some values
+ 1_000f, 10_000f, 100_000f, 1_000_000f, 10_000_000f };
+
+ /**
+ * Both dates and timestamps are converted to epoch seconds.
+ * <p>
+ * See {@link org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp#evaluate(GenericUDF.DeferredObject[])}.
+ */
+ private static final float[] VALUES_TIME = { timestamp("2020-11-01"), timestamp("2020-11-02"), timestamp("2020-11-03"), timestamp("2020-11-04"),
+ timestamp("2020-11-05T11:23:45Z"), timestamp("2020-11-06"), timestamp("2020-11-07") };
+
private static final KllFloatsSketch KLL = StatisticsTestUtils.createKll(VALUES);
- private static final float DELTA = Float.MIN_VALUE;
+ private static final KllFloatsSketch KLL2 = StatisticsTestUtils.createKll(VALUES2);
+ private static final KllFloatsSketch KLL_TIME = StatisticsTestUtils.createKll(VALUES_TIME);
+ private static final float DELTA = 1e-7f;
private static final RexBuilder REX_BUILDER = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl()));
private static final RelDataTypeFactory TYPE_FACTORY = REX_BUILDER.getTypeFactory();
+
+ public static final RelDataType TINYINT = REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.TINYINT);
+ public static final RelDataType INTEGER = REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.INTEGER);
+ public static final RelDataType BIGINT = REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.BIGINT);
+ public static final RelDataType DECIMAL_2_1 = createDecimalType(2, 1);
+ public static final RelDataType DECIMAL_3_1 = createDecimalType(3, 1);
+ public static final RelDataType DECIMAL_4_1 = createDecimalType(4, 1);
+ public static final RelDataType DECIMAL_7_1 = createDecimalType(7, 1);
+ public static final RelDataType DECIMAL_38_25 = createDecimalType(38, 25);
+ public static final RelDataType FLOAT = REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.FLOAT);
+ public static final RelDataType DOUBLE = REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.DOUBLE);
Review Comment:
Types are not that expensive to create, so we could simply inline the calls where necessary. Moreover, there is no point in making them public because it wouldn't be a good idea to let other classes/tests use them; it would create unnecessary coupling and poor encapsulation without an obvious benefit.
In terms of brevity, we could rename the method:
```java
createDecimalType(2, 1) // current
decimal(2, 1)           // suggested
```
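i.e., a thin wrapper such as (illustrative):
```java
private static RelDataType decimal(int precision, int scale) {
  return TYPE_FACTORY.createSqlType(SqlTypeName.DECIMAL, precision, scale);
}
```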
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -489,7 +686,17 @@ public Double visitLiteral(RexLiteral literal) {
return null;
}
- private static double rangedSelectivity(KllFloatsSketch kll, float val1, float val2) {
+ /**
+ * Returns the selectivity of a predicate "val1 <= column < val2"
+ * @param kll the sketch
+ * @param val1 lower bound (inclusive)
+ * @param val2 upper bound (exclusive)
+ * @return the selectivity of "val1 <= column < val2"
+ */
+ public static double rangedSelectivity(KllFloatsSketch kll, float val1, float val2) {
Review Comment:
Do we need to increase visibility to public?
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return cast;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats = tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return cast;
+ }
+
+ // we need to check that the possible values of the input to the cast are all within the type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior (*)
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range range = colStat.getRange();
+ if (range == null)
+ return cast;
+ if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+ return cast;
+ if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+ return cast;
+
+ String type = cast.getType().getSqlTypeName().getName();
+
+ double min;
+ double max;
+ switch (type.toLowerCase()) {
+ case serdeConstants.TINYINT_TYPE_NAME:
+ min = Byte.MIN_VALUE;
+ max = Byte.MAX_VALUE;
+ break;
+ case serdeConstants.SMALLINT_TYPE_NAME:
+ min = Short.MIN_VALUE;
+ max = Short.MAX_VALUE;
+ break;
+ case serdeConstants.INT_TYPE_NAME, "integer":
+ min = Integer.MIN_VALUE;
+ max = Integer.MAX_VALUE;
+ break;
+ case serdeConstants.BIGINT_TYPE_NAME, serdeConstants.TIMESTAMP_TYPE_NAME:
+ min = Long.MIN_VALUE;
+ max = Long.MAX_VALUE;
+ break;
+ case serdeConstants.FLOAT_TYPE_NAME:
+ min = -Float.MAX_VALUE;
+ max = Float.MAX_VALUE;
+ break;
+ case serdeConstants.DOUBLE_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ break;
+ case serdeConstants.DECIMAL_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ adjustBoundariesForDecimal(cast, boundaries, inclusive);
+ break;
+ default:
+ // unknown type, do not remove the cast
+ return cast;
+ }
+
+ // see (*)
+ if (range.minValue.doubleValue() < min)
+ return cast;
+ if (range.maxValue.doubleValue() > max)
+ return cast;
+
+ return op0;
+ }
+
+ /**
+ * Adjust the boundaries for a DECIMAL cast.
+ * <p>
+ * See {@link #removeCastIfPossible(RexCall, HiveTableScan, float[], boolean[])}
+ * for an explanation of the parameters.
+ */
+ private static void adjustBoundariesForDecimal(RexCall cast, float[] boundaries, boolean[] inclusive) {
Review Comment:
I have not reviewed this part yet; let's wait until we decide whether we are going to include it or not.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * <p>
+ * Special care is taken to support the cast to DECIMAL(precision, scale):
+ * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+ * The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+ * </p>
+ *
+ * @param cast a RexCall of type {@link SqlKind#CAST}
+ * @param tableScan the table that provides the statistics
+ * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+ *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+ * @param inclusive whether the respective boundary is inclusive or exclusive.
+ * @return the operand if the cast can be removed, otherwise the cast itself
+ */
+ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, float[] boundaries, boolean[] inclusive) {
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return cast;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats = tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return cast;
+ }
+
+ // we need to check that the possible values of the input to the cast are all within the type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior (*)
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range range = colStat.getRange();
+ if (range == null)
+ return cast;
+ if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+ return cast;
+ if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+ return cast;
+
+ String type = cast.getType().getSqlTypeName().getName();
+
+ double min;
+ double max;
+ switch (type.toLowerCase()) {
+ case serdeConstants.TINYINT_TYPE_NAME:
+ min = Byte.MIN_VALUE;
+ max = Byte.MAX_VALUE;
+ break;
+ case serdeConstants.SMALLINT_TYPE_NAME:
+ min = Short.MIN_VALUE;
+ max = Short.MAX_VALUE;
+ break;
+ case serdeConstants.INT_TYPE_NAME, "integer":
+ min = Integer.MIN_VALUE;
+ max = Integer.MAX_VALUE;
+ break;
+ case serdeConstants.BIGINT_TYPE_NAME, serdeConstants.TIMESTAMP_TYPE_NAME:
+ min = Long.MIN_VALUE;
+ max = Long.MAX_VALUE;
+ break;
+ case serdeConstants.FLOAT_TYPE_NAME:
+ min = -Float.MAX_VALUE;
+ max = Float.MAX_VALUE;
+ break;
+ case serdeConstants.DOUBLE_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ break;
+ case serdeConstants.DECIMAL_TYPE_NAME:
+ min = -Double.MAX_VALUE;
+ max = Double.MAX_VALUE;
+ adjustBoundariesForDecimal(cast, boundaries, inclusive);
+ break;
+ default:
+ // unknown type, do not remove the cast
+ return cast;
+ }
+
+ // see (*)
+ if (range.minValue.doubleValue() < min)
+ return cast;
+ if (range.maxValue.doubleValue() > max)
+ return cast;
+
+ return op0;
+ }
+
+ /**
+ * Adjust the boundaries for a DECIMAL cast.
+ * <p>
+ * See {@link #removeCastIfPossible(RexCall, HiveTableScan, float[], boolean[])}
+ * for an explanation of the parameters.
+ */
+ private static void adjustBoundariesForDecimal(RexCall cast, float[] boundaries, boolean[] inclusive) {
+ // values outside the representable range are cast to NULL, so adapt the boundaries
+ int precision = cast.getType().getPrecision();
+ int scale = cast.getType().getScale();
+ int digits = precision - scale;
+ // the cast does some rounding, e.g., CAST(99.9499 AS DECIMAL(3,1)) = 99.9
+ // but CAST(99.95 AS DECIMAL(3,1)) = NULL
+ float adjust = (float) (5 * Math.pow(10, -(scale + 1)));
+ // the range of values supported by the type is the interval [-typeRangeExtent, typeRangeExtent] (both inclusive)
+ // e.g., the typeRangeExtent is 99.94999 for DECIMAL(3,1)
+ float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - adjust));
+
+ // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown
+ float adjusted1 = inclusive[0] ? boundaries[0] - adjust : Math.nextDown(boundaries[0] + adjust);
+ float adjusted2 = inclusive[1] ? Math.nextDown(boundaries[1] + adjust) : boundaries[1] - adjust;
+
+ float lowerUniverse = inclusive[0] ? -typeRangeExtent : Math.nextDown(-typeRangeExtent);
+ float upperUniverse = inclusive[1] ? typeRangeExtent : Math.nextUp(typeRangeExtent);
+ boundaries[0] = Math.max(adjusted1, lowerUniverse);
+ boundaries[1] = Math.min(adjusted2, upperUniverse);
+ if (boundaries.length >= 4) {
+ boundaries[2] = lowerUniverse;
+ boundaries[3] = upperUniverse;
+ }
+ }
+
private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
- final boolean isLiteralLeft = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
- final boolean isLiteralRight = call.getOperands().get(1).getKind().equals(SqlKind.LITERAL);
- final boolean isInputRefLeft = call.getOperands().get(0).getKind().equals(SqlKind.INPUT_REF);
- final boolean isInputRefRight = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
+ double defaultSelectivity = ((double) 1 / (double) 3);
+ if (!(childRel instanceof HiveTableScan)) {
+ return defaultSelectivity;
+ }
- if (childRel instanceof HiveTableScan && isLiteralLeft != isLiteralRight && isInputRefLeft != isInputRefRight) {
- final HiveTableScan t = (HiveTableScan) childRel;
- final int inputRefIndex = ((RexInputRef) call.getOperands().get(isInputRefLeft ? 0 : 1)).getIndex();
- final List<ColStatistics> colStats = t.getColStat(Collections.singletonList(inputRefIndex));
+ // search for the literal
+ List<RexNode> operands = call.getOperands();
+ final Optional<Float> leftLiteral = extractLiteral(operands.get(0));
+ final Optional<Float> rightLiteral = extractLiteral(operands.get(1));
+ if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) {
+ return defaultSelectivity;
+ }
+ int literalOpIdx = leftLiteral.isPresent() ? 0 : 1;
+
+ // analyze the predicate
+ float value = leftLiteral.orElseGet(rightLiteral::get);
+ int boundaryIdx;
+ boolean openBound = op == SqlKind.LESS_THAN || op == SqlKind.GREATER_THAN;
+ switch (op) {
+ case LESS_THAN, LESS_THAN_OR_EQUAL:
+ boundaryIdx = literalOpIdx;
+ break;
+ case GREATER_THAN, GREATER_THAN_OR_EQUAL:
+ boundaryIdx = 1 - literalOpIdx;
+ break;
+ default:
+ return defaultSelectivity;
+ }
+ float[] boundaries = new float[] { Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY };
+ boolean[] inclusive = new boolean[] { true, true };
+ inclusive[boundaryIdx] = !openBound;
+ boundaries[boundaryIdx] = value;
+
+ // extract the column index from the other operand
+ final HiveTableScan scan = (HiveTableScan) childRel;
+ int inputRefOpIndex = 1 - literalOpIdx;
+ RexNode node = operands.get(inputRefOpIndex);
+ if (node.getKind().equals(SqlKind.CAST)) {
+ node = removeCastIfPossible((RexCall) node, scan, boundaries, inclusive);
+ }
- if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
- final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
- final Object boundValueObject = ((RexLiteral) call.getOperands().get(isLiteralLeft ? 0 : 1)).getValue();
- final SqlTypeName typeName = call.getOperands().get(isInputRefLeft ? 0 : 1).getType().getSqlTypeName();
- float value = extractLiteral(typeName, boundValueObject);
- boolean closedBound = op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.GREATER_THAN_OR_EQUAL);
-
- double selectivity;
- if (op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.LESS_THAN)) {
- selectivity = closedBound ? lessThanOrEqualSelectivity(kll, value) : lessThanSelectivity(kll, value);
- } else {
- selectivity = closedBound ? greaterThanOrEqualSelectivity(kll, value) : greaterThanSelectivity(kll, value);
- }
+ int inputRefIndex = -1;
+ if (node.getKind().equals(SqlKind.INPUT_REF)) {
+ inputRefIndex = ((RexInputRef) node).getIndex();
+ }
- // selectivity does not account for null values, we multiply for the number of non-null values (getN)
- // and we divide by the total (non-null + null values) to get the overall selectivity.
- //
- // Example: consider a filter "col < 3", and the following table rows:
- // _____
- // | col |
- // |_____|
- // |1 |
- // |null |
- // |null |
- // |3 |
- // |4 |
- // -------
- // kll.getN() would be 3, selectivity 1/3, t.getTable().getRowCount() 5
- // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
- return kll.getN() * selectivity / t.getTable().getRowCount();
- }
+ if (inputRefIndex < 0) {
+ return defaultSelectivity;
+ }
+
+ final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
+ if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
+ return defaultSelectivity;
}
- return ((double) 1 / (double) 3);
+
+ // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...)
+ float left = inclusive[0] ? boundaries[0] : Math.nextUp(boundaries[0]);
+ float right = inclusive[1] ? Math.nextUp(boundaries[1]) : boundaries[1];
+
+ final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
+ double rawSelectivity = rangedSelectivity(kll, left, right);
+
+ // rawSelectivity does not account for null values, so we multiply by the number of non-null values (getN)
+ // and divide by the total (non-null + null values) to get the overall selectivity.
+ //
+ // Example: consider a filter "col < 3", and the following table rows:
+ // _____
+ // | col |
+ // |_____|
+ // |1 |
+ // |null |
+ // |null |
+ // |3 |
+ // |4 |
+ // -------
+ // kll.getN() would be 3, rawSelectivity 1/3, scan.getTable().getRowCount() 5
+ // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
+ return kll.getN() * rawSelectivity / scan.getTable().getRowCount();
}
private Double computeBetweenPredicateSelectivity(RexCall call) {
- final boolean hasLiteralBool = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
- final boolean hasInputRef = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
- final boolean hasLiteralLeft = call.getOperands().get(2).getKind().equals(SqlKind.LITERAL);
- final boolean hasLiteralRight = call.getOperands().get(3).getKind().equals(SqlKind.LITERAL);
+ if (!(childRel instanceof HiveTableScan)) {
+ return computeFunctionSelectivity(call);
+ }
+
+ List<RexNode> operands = call.getOperands();
+ final boolean hasLiteralBool = operands.get(0).getKind().equals(SqlKind.LITERAL);
+ Optional<Float> leftLiteral = extractLiteral(operands.get(2));
+ Optional<Float> rightLiteral = extractLiteral(operands.get(3));
+
+ if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) {
+ final HiveTableScan scan = (HiveTableScan) childRel;
+ float leftValue = leftLiteral.get();
+ float rightValue = rightLiteral.get();
+
+ final Object inverseBoolValueObject = ((RexLiteral) operands.getFirst()).getValue();
+ boolean inverseBool = Boolean.parseBoolean(inverseBoolValueObject.toString());
+ // when they are equal it's an equality predicate, we cannot handle it as "BETWEEN"
+ if (Objects.equals(leftValue, rightValue)) {
+ return inverseBool ? computeNotEqualitySelectivity(call) : computeFunctionSelectivity(call);
+ }
+
+ float[] boundaries = new float[] { leftValue, rightValue, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY };
+ boolean[] inclusive = new boolean[] { true, true };
- if (childRel instanceof HiveTableScan && hasLiteralBool && hasInputRef && hasLiteralLeft && hasLiteralRight) {
- final HiveTableScan t = (HiveTableScan) childRel;
- final int inputRefIndex = ((RexInputRef) call.getOperands().get(1)).getIndex();
- final List<ColStatistics> colStats = t.getColStat(Collections.singletonList(inputRefIndex));
+ RexNode expr = operands.get(1); // expr to be checked by the BETWEEN
+ if (expr.getKind().equals(SqlKind.CAST)) {
+ expr = removeCastIfPossible((RexCall) expr, scan, boundaries, inclusive);
+ }
+
+ int inputRefIndex = -1;
+ if (expr.getKind().equals(SqlKind.INPUT_REF)) {
+ inputRefIndex = ((RexInputRef) expr).getIndex();
+ }
+
+ if (inputRefIndex < 0) {
+ return computeFunctionSelectivity(call);
+ }
+ final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
+ // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...)
+ boundaries[1] = Math.nextUp(boundaries[1]);
+ boundaries[3] = Math.nextUp(boundaries[3]);
+
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
- final SqlTypeName typeName = call.getOperands().get(1).getType().getSqlTypeName();
- final Object inverseBoolValueObject = ((RexLiteral) call.getOperands().get(0)).getValue();
- boolean inverseBool = Boolean.parseBoolean(inverseBoolValueObject.toString());
- final Object leftBoundValueObject = ((RexLiteral) call.getOperands().get(2)).getValue();
- float leftValue = extractLiteral(typeName, leftBoundValueObject);
- final Object rightBoundValueObject = ((RexLiteral) call.getOperands().get(3)).getValue();
- float rightValue = extractLiteral(typeName, rightBoundValueObject);
- // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
+ double rawSelectivity = rangedSelectivity(kll, boundaries[0], boundaries[1]);
if (inverseBool) {
- if (rightValue == leftValue) {
- return computeNotEqualitySelectivity(call);
- } else if (rightValue < leftValue) {
- return 1.0;
- }
- return 1.0 - (kll.getN() * betweenSelectivity(kll, leftValue, rightValue) / t.getTable().getRowCount());
- }
- // when they are equal it's an equality predicate, we cannot handle it as "between"
- if (Double.compare(leftValue, rightValue) != 0) {
- return kll.getN() * betweenSelectivity(kll, leftValue, rightValue) / t.getTable().getRowCount();
+ // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
+ // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
+ double typeRangeSelectivity = rangedSelectivity(kll, boundaries[2], boundaries[3]);
+ rawSelectivity = typeRangeSelectivity - rawSelectivity;
}
+ // rawSelectivity does not account for null values, so adjust them
+ // for a detailed explanation, see comment at computeRangePredicateSelectivity
+ return kll.getN() * rawSelectivity / scan.getTable().getRowCount();
Review Comment:
Refactor into a private method? This is the same null-adjustment computation as in `computeRangePredicateSelectivity`.
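e.g., both call sites could then end with something like (using the hypothetical helper sketched in the earlier comment):
```java
return adjustSelectivityForNulls(rawSelectivity, kll, scan);
```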
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]