Re: [PR] HIVE-29424: CBO plans should use histogram statistics for range predicates with a CAST [hive]

via GitHub Fri, 20 Feb 2026 02:08:17 -0800


thomasrebele commented on code in PR #6293:
URL: https://github.com/apache/hive/pull/6293#discussion_r2832376888



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +188,284 @@ public Double visitCall(RexCall call) {
     return selectivity;
   }
 
+  /**
+   * If the cast can be removed, just return its operand and adjust the boundaries if necessary.
+   *
+   * <p>
+   *   In Hive, if a value cannot be represented by the cast, the result of the cast is NULL,
+   *   and therefore cannot fulfill the predicate. So the possible range of the values
+   *   is limited by the range of possible values of the type.
+   * </p>
+   *
+   * <p>
+   *   Special care is taken to support the cast to DECIMAL(precision, scale):
+   *   The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}.
+   *   The boundaries are adjusted accordingly, without changing the semantics of <code>inclusive</code>.
+   * </p>
+   *
+   * @param cast a RexCall of type {@link SqlKind#CAST}
+   * @param tableScan the table that provides the statistics
+   * @param boundaries indexes 0 and 1 are the boundaries of the range predicate;
+   *                   indexes 2 and 3, if they exist, will be set to the boundaries of the type range
+   * @param inclusive whether the respective boundary is inclusive or exclusive.
+   * @return the operand if the cast can be removed, otherwise the cast itself
+   */
+  private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, 
float[] boundaries, boolean[] inclusive) {
+    RexNode op0 = cast.getOperands().getFirst();
+    if (!(op0 instanceof RexInputRef)) {
+      return cast;
+    }
+    int index = ((RexInputRef) op0).getIndex();
+    final List<ColStatistics> colStats = 
tableScan.getColStat(Collections.singletonList(index));
+    if (colStats.isEmpty()) {
+      return cast;
+    }
+
+    // we need to check that the possible values of the input to the cast are all within the type range of the cast
+    // otherwise the CAST introduces some modulo-like behavior (*)
+    ColStatistics colStat = colStats.getFirst();
+    ColStatistics.Range range = colStat.getRange();
+    if (range == null)
+      return cast;
+    if (range.minValue == null || Double.isNaN(range.minValue.doubleValue()))
+      return cast;
+    if (range.maxValue == null || Double.isNaN(range.maxValue.doubleValue()))
+      return cast;
+
+    String type = cast.getType().getSqlTypeName().getName();
+
+    double min;
+    double max;
+    switch (type.toLowerCase()) {
+    case serdeConstants.TINYINT_TYPE_NAME:
+      min = Byte.MIN_VALUE;
+      max = Byte.MAX_VALUE;
+      break;
+    case serdeConstants.SMALLINT_TYPE_NAME:
+      min = Short.MIN_VALUE;
+      max = Short.MAX_VALUE;
+      break;
+    case serdeConstants.INT_TYPE_NAME, "integer":
+      min = Integer.MIN_VALUE;
+      max = Integer.MAX_VALUE;
+      break;
+    case serdeConstants.BIGINT_TYPE_NAME, serdeConstants.TIMESTAMP_TYPE_NAME:
+      min = Long.MIN_VALUE;
+      max = Long.MAX_VALUE;
+      break;
+    case serdeConstants.FLOAT_TYPE_NAME:
+      min = -Float.MAX_VALUE;
+      max = Float.MAX_VALUE;
+      break;
+    case serdeConstants.DOUBLE_TYPE_NAME:
+      min = -Double.MAX_VALUE;
+      max = Double.MAX_VALUE;
+      break;
+    case serdeConstants.DECIMAL_TYPE_NAME:
+      min = -Double.MAX_VALUE;
+      max = Double.MAX_VALUE;
+      adjustBoundariesForDecimal(cast, boundaries, inclusive);
+      break;
+    default:
+      // unknown type, do not remove the cast
+      return cast;
+    }
+
+    // see (*)
+    if (range.minValue.doubleValue() < min)
+      return cast;
+    if (range.maxValue.doubleValue() > max)
+      return cast;
+
+    return op0;
+  }
+
+  /**
+   * Adjust the boundaries for a DECIMAL cast.
+   * <p>
+   * See {@link #removeCastIfPossible(RexCall, HiveTableScan, float[], boolean[])}
+   * for an explanation of the parameters.
+   */
+  private static void adjustBoundariesForDecimal(RexCall cast, float[] 
boundaries, boolean[] inclusive) {
+    // values outside the representable range are cast to NULL, so adapt the boundaries
+    int precision = cast.getType().getPrecision();
+    int scale = cast.getType().getScale();
+    int digits = precision - scale;
+    // the cast does some rounding, i.e., CAST(99.9499 AS DECIMAL(3,1)) = 99.9
+    // but CAST(99.95 AS DECIMAL(3,1)) = NULL
+    float adjust = (float) (5 * Math.pow(10, -(scale + 1)));
+    // the range of values supported by the type is interval [-typeRangeExtent, typeRangeExtent] (both inclusive)
+    // e.g., the typeRangeExt is 99.94999 for DECIMAL(3,1)
+    float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - 
adjust));
+
+    // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown
+    float adjusted1 = inclusive[0] ? boundaries[0] - adjust : 
Math.nextDown(boundaries[0] + adjust);
+    float adjusted2 = inclusive[1] ? Math.nextDown(boundaries[1] + adjust) : 
boundaries[1] - adjust;
+
+    float lowerUniverse = inclusive[0] ? -typeRangeExtent : 
Math.nextDown(-typeRangeExtent);
+    float upperUniverse = inclusive[1] ? typeRangeExtent : 
Math.nextUp(typeRangeExtent);
+    boundaries[0] = Math.max(adjusted1, lowerUniverse);
+    boundaries[1] = Math.min(adjusted2, upperUniverse);
+    if (boundaries.length >= 4) {
+      boundaries[2] = lowerUniverse;
+      boundaries[3] = upperUniverse;
+    }
+  }
+
   private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
-    final boolean isLiteralLeft = 
call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
-    final boolean isLiteralRight = 
call.getOperands().get(1).getKind().equals(SqlKind.LITERAL);
-    final boolean isInputRefLeft = 
call.getOperands().get(0).getKind().equals(SqlKind.INPUT_REF);
-    final boolean isInputRefRight = 
call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
+    double defaultSelectivity = ((double) 1 / (double) 3);
+    if (!(childRel instanceof HiveTableScan)) {
+      return defaultSelectivity;
+    }
 
-    if (childRel instanceof HiveTableScan && isLiteralLeft != isLiteralRight 
&& isInputRefLeft != isInputRefRight) {
-      final HiveTableScan t = (HiveTableScan) childRel;
-      final int inputRefIndex = ((RexInputRef) 
call.getOperands().get(isInputRefLeft ? 0 : 1)).getIndex();
-      final List<ColStatistics> colStats = 
t.getColStat(Collections.singletonList(inputRefIndex));
+    // search for the literal
+    List<RexNode> operands = call.getOperands();
+    final Optional<Float> leftLiteral = extractLiteral(operands.get(0));
+    final Optional<Float> rightLiteral = extractLiteral(operands.get(1));
+    if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) {
+      return defaultSelectivity;
+    }
+    int literalOpIdx = leftLiteral.isPresent() ? 0 : 1;
+
+    // analyze the predicate
+    float value = leftLiteral.orElseGet(rightLiteral::get);
+    int boundaryIdx;
+    boolean openBound = op == SqlKind.LESS_THAN || op == SqlKind.GREATER_THAN;
+    switch (op) {
+    case LESS_THAN, LESS_THAN_OR_EQUAL:
+      boundaryIdx = literalOpIdx;
+      break;
+    case GREATER_THAN, GREATER_THAN_OR_EQUAL:
+      boundaryIdx = 1 - literalOpIdx;
+      break;
+    default:
+      return defaultSelectivity;
+    }
+    float[] boundaries = new float[] { Float.NEGATIVE_INFINITY, 
Float.POSITIVE_INFINITY };
+    boolean[] inclusive = new boolean[] { true, true };
+    inclusive[boundaryIdx] = !openBound;
+    boundaries[boundaryIdx] = value;
+
+    // extract the column index from the other operator
+    final HiveTableScan scan = (HiveTableScan) childRel;
+    int inputRefOpIndex = 1 - literalOpIdx;
+    RexNode node = operands.get(inputRefOpIndex);
+    if (node.getKind().equals(SqlKind.CAST)) {
+      node = removeCastIfPossible((RexCall) node, scan, boundaries, inclusive);
+    }
 
-      if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
-        final KllFloatsSketch kll = 
KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
-        final Object boundValueObject = ((RexLiteral) 
call.getOperands().get(isLiteralLeft ? 0 : 1)).getValue();
-        final SqlTypeName typeName = call.getOperands().get(isInputRefLeft ? 0 
: 1).getType().getSqlTypeName();
-        float value = extractLiteral(typeName, boundValueObject);
-        boolean closedBound = op.equals(SqlKind.LESS_THAN_OR_EQUAL) || 
op.equals(SqlKind.GREATER_THAN_OR_EQUAL);
-
-        double selectivity;
-        if (op.equals(SqlKind.LESS_THAN_OR_EQUAL) || 
op.equals(SqlKind.LESS_THAN)) {
-          selectivity = closedBound ? lessThanOrEqualSelectivity(kll, value) : 
lessThanSelectivity(kll, value);
-        } else {
-          selectivity = closedBound ? greaterThanOrEqualSelectivity(kll, 
value) : greaterThanSelectivity(kll, value);
-        }
+    int inputRefIndex = -1;
+    if (node.getKind().equals(SqlKind.INPUT_REF)) {
+      inputRefIndex = ((RexInputRef) node).getIndex();
+    }
 
-        // selectivity does not account for null values, we multiply for the 
number of non-null values (getN)
-        // and we divide by the total (non-null + null values) to get the 
overall selectivity.
-        //
-        // Example: consider a filter "col < 3", and the following table rows:
-        //  _____
-        // | col |
-        // |_____|
-        // |1    |
-        // |null |
-        // |null |
-        // |3    |
-        // |4    |
-        // -------
-        // kll.getN() would be 3, selectivity 1/3, t.getTable().getRowCount() 5
-        // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
-        return kll.getN() * selectivity / t.getTable().getRowCount();
-      }
+    if (inputRefIndex < 0) {
+      return defaultSelectivity;
+    }
+
+    final List<ColStatistics> colStats = 
scan.getColStat(Collections.singletonList(inputRefIndex));
+    if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
+      return defaultSelectivity;
     }
-    return ((double) 1 / (double) 3);
+
+    // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...)
+    float left = inclusive[0] ? boundaries[0] : Math.nextUp(boundaries[0]);
+    float right = inclusive[1] ? Math.nextUp(boundaries[1]) : boundaries[1];
+
+    final KllFloatsSketch kll = 
KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
+    double rawSelectivity = rangedSelectivity(kll, left, right);
+
+    // rawSelectivity does not account for null values, we multiply for the number of non-null values (getN)
+    // and we divide by the total (non-null + null values) to get the overall rawSelectivity.
+    //
+    // Example: consider a filter "col < 3", and the following table rows:
+    //  _____
+    // | col |
+    // |_____|
+    // |1    |
+    // |null |
+    // |null |
+    // |3    |
+    // |4    |
+    // -------
+    // kll.getN() would be 3, rawSelectivity 1/3, scan.getTable().getRowCount() 5
+    // so the final result would be 3 * 1/3 / 5 = 1/5, as expected.
+    return kll.getN() * rawSelectivity / scan.getTable().getRowCount();
   }
 
   private Double computeBetweenPredicateSelectivity(RexCall call) {
-    final boolean hasLiteralBool = 
call.getOperands().get(0).getKind().equals(SqlKind.LITERAL);
-    final boolean hasInputRef = 
call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF);
-    final boolean hasLiteralLeft = 
call.getOperands().get(2).getKind().equals(SqlKind.LITERAL);
-    final boolean hasLiteralRight = 
call.getOperands().get(3).getKind().equals(SqlKind.LITERAL);
+    if (!(childRel instanceof HiveTableScan)) {
+      return computeFunctionSelectivity(call);
+    }
+
+    List<RexNode> operands = call.getOperands();
+    final boolean hasLiteralBool = 
operands.get(0).getKind().equals(SqlKind.LITERAL);
+    Optional<Float> leftLiteral = extractLiteral(operands.get(2));
+    Optional<Float> rightLiteral = extractLiteral(operands.get(3));
+
+    if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) 
{
+      final HiveTableScan scan = (HiveTableScan) childRel;
+      float leftValue = leftLiteral.get();
+      float rightValue = rightLiteral.get();
+
+      final Object inverseBoolValueObject = ((RexLiteral) 
operands.getFirst()).getValue();
+      boolean inverseBool = 
Boolean.parseBoolean(inverseBoolValueObject.toString());
+      // when they are equal it's an equality predicate, we cannot handle it as "BETWEEN"
+      if (Objects.equals(leftValue, rightValue)) {
+        return inverseBool ? computeNotEqualitySelectivity(call) : 
computeFunctionSelectivity(call);
+      }
+
+      float[] boundaries = new float[] { leftValue, rightValue, 
Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY };
+      boolean[] inclusive = new boolean[] { true, true };
 
-    if (childRel instanceof HiveTableScan && hasLiteralBool && hasInputRef && 
hasLiteralLeft && hasLiteralRight) {
-      final HiveTableScan t = (HiveTableScan) childRel;
-      final int inputRefIndex = ((RexInputRef) 
call.getOperands().get(1)).getIndex();
-      final List<ColStatistics> colStats = 
t.getColStat(Collections.singletonList(inputRefIndex));
+      RexNode expr = operands.get(1); // expr to be checked by the BETWEEN
+      if (expr.getKind().equals(SqlKind.CAST)) {
+        expr = removeCastIfPossible((RexCall) expr, scan, boundaries, 
inclusive);
+      }
+
+      int inputRefIndex = -1;
+      if (expr.getKind().equals(SqlKind.INPUT_REF)) {
+        inputRefIndex = ((RexInputRef) expr).getIndex();
+      }
+
+      if (inputRefIndex < 0) {
+        return computeFunctionSelectivity(call);
+      }
 
+      final List<ColStatistics> colStats = 
scan.getColStat(Collections.singletonList(inputRefIndex));
       if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
+        // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...)
+        boundaries[1] = Math.nextUp(boundaries[1]);
+        boundaries[3] = Math.nextUp(boundaries[3]);
+
         final KllFloatsSketch kll = 
KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
-        final SqlTypeName typeName = 
call.getOperands().get(1).getType().getSqlTypeName();
-        final Object inverseBoolValueObject = ((RexLiteral) 
call.getOperands().get(0)).getValue();
-        boolean inverseBool = 
Boolean.parseBoolean(inverseBoolValueObject.toString());
-        final Object leftBoundValueObject = ((RexLiteral) 
call.getOperands().get(2)).getValue();
-        float leftValue = extractLiteral(typeName, leftBoundValueObject);
-        final Object rightBoundValueObject = ((RexLiteral) 
call.getOperands().get(3)).getValue();
-        float rightValue = extractLiteral(typeName, rightBoundValueObject);
-        // when inverseBool == true, this is a NOT_BETWEEN and selectivity 
must be inverted
+        double rawSelectivity = rangedSelectivity(kll, boundaries[0], 
boundaries[1]);
         if (inverseBool) {
-          if (rightValue == leftValue) {
-            return computeNotEqualitySelectivity(call);
-          } else if (rightValue < leftValue) {
-            return 1.0;
-          }
-          return 1.0 - (kll.getN() * betweenSelectivity(kll, leftValue, 
rightValue) / t.getTable().getRowCount());
-        }
-        // when they are equal it's an equality predicate, we cannot handle it 
as "between"
-        if (Double.compare(leftValue, rightValue) != 0) {
-          return kll.getN() * betweenSelectivity(kll, leftValue, rightValue) / 
t.getTable().getRowCount();
+          // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
+          // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
+          double typeRangeSelectivity = rangedSelectivity(kll, boundaries[2], 
boundaries[3]);
+          rawSelectivity = typeRangeSelectivity - rawSelectivity;
         }
+        // rawSelectivity does not account for null values, so adjust them
+        // for a detailed explanation, see comment at computeRangePredicateSelectivity
+        return kll.getN() * rawSelectivity / scan.getTable().getRowCount();

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-29424: CBO plans should use histogram statistics for range predicates with a CAST [hive]

Reply via email to