Github user ron8hu commented on a diff in the pull request: https://github.com/apache/spark/pull/19952#discussion_r156555044 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala --- @@ -147,65 +139,76 @@ object EstimationUtils { } /** - * Returns a percentage of a bin holding values for column value in the range of - * [lowerValue, higherValue] - * - * @param higherValue a given upper bound value of a specified column value range - * @param lowerValue a given lower bound value of a specified column value range - * @param bin a single histogram bin - * @return the percentage of a single bin holding values in [lowerValue, higherValue]. + * Returns the possibility of the given histogram bin holding values within the given range + * [lowerBound, upperBound]. */ - private def getOccupation( - higherValue: Double, - lowerValue: Double, + private def binHoldingRangePossibility( + upperBound: Double, + lowerBound: Double, bin: HistogramBin): Double = { - assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi) + assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi) if (bin.hi == bin.lo) { // the entire bin is covered in the range 1.0 - } else if (higherValue == lowerValue) { + } else if (upperBound == lowerBound) { // set percentage to 1/NDV 1.0 / bin.ndv.toDouble } else { // Use proration since the range falls inside this bin. - math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0) + math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0) } } /** - * Returns the number of bins for column values in [lowerValue, higherValue]. - * The column value distribution is saved in an equi-height histogram. The return values is a - * double value is because we may return a portion of a bin. For example, a predicate - * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values. 
+ * Returns the number of histogram bins holding values within the given range + * [lowerBound, upperBound]. + * + * Note that the returned value is double type, because the range boundaries usually occupy a + * portion of a bin. An extrema case is [value, value] which is generated by equal predicate --- End diff -- typo: "extrema" should be "extreme"
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org