Github user ron8hu commented on a diff in the pull request:
https://github.com/apache/spark/pull/19952#discussion_r156555044
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
---
@@ -147,65 +139,76 @@ object EstimationUtils {
}
/**
- * Returns a percentage of a bin holding values for column value in the
range of
- * [lowerValue, higherValue]
- *
- * @param higherValue a given upper bound value of a specified column
value range
- * @param lowerValue a given lower bound value of a specified column
value range
- * @param bin a single histogram bin
- * @return the percentage of a single bin holding values in [lowerValue,
higherValue].
+ * Returns the possibility of the given histogram bin holding values
within the given range
+ * [lowerBound, upperBound].
*/
- private def getOccupation(
- higherValue: Double,
- lowerValue: Double,
+ private def binHoldingRangePossibility(
+ upperBound: Double,
+ lowerBound: Double,
bin: HistogramBin): Double = {
- assert(bin.lo <= lowerValue && lowerValue <= higherValue &&
higherValue <= bin.hi)
+ assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound
<= bin.hi)
if (bin.hi == bin.lo) {
// the entire bin is covered in the range
1.0
- } else if (higherValue == lowerValue) {
+ } else if (upperBound == lowerBound) {
// set percentage to 1/NDV
1.0 / bin.ndv.toDouble
} else {
// Use proration since the range falls inside this bin.
- math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
+ math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
}
}
/**
- * Returns the number of bins for column values in [lowerValue,
higherValue].
- * The column value distribution is saved in an equi-height histogram.
The return values is a
- * double value is because we may return a portion of a bin. For
example, a predicate
- * "column = 8" may return the number of bins 0.2 if the holding bin has
5 distinct values.
+ * Returns the number of histogram bins holding values within the given
range
+ * [lowerBound, upperBound].
+ *
+ * Note that the returned value is double type, because the range
boundaries usually occupy a
+ * portion of a bin. An extrema case is [value, value] which is
generated by equal predicate
--- End diff --
Typo: "extrema case" should be "extreme case".
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]