GitHub user ron8hu commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19952#discussion_r156555044
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala ---
    @@ -147,65 +139,76 @@ object EstimationUtils {
       }
     
       /**
    -   * Returns a percentage of a bin holding values for column value in the 
range of
    -   * [lowerValue, higherValue]
    -   *
    -   * @param higherValue a given upper bound value of a specified column 
value range
    -   * @param lowerValue a given lower bound value of a specified column 
value range
    -   * @param bin a single histogram bin
    -   * @return the percentage of a single bin holding values in [lowerValue, 
higherValue].
    +   * Returns the possibility of the given histogram bin holding values 
within the given range
    +   * [lowerBound, upperBound].
        */
    -  private def getOccupation(
    -      higherValue: Double,
    -      lowerValue: Double,
    +  private def binHoldingRangePossibility(
    +      upperBound: Double,
    +      lowerBound: Double,
           bin: HistogramBin): Double = {
    -    assert(bin.lo <= lowerValue && lowerValue <= higherValue && 
higherValue <= bin.hi)
    +    assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound 
<= bin.hi)
         if (bin.hi == bin.lo) {
           // the entire bin is covered in the range
           1.0
    -    } else if (higherValue == lowerValue) {
    +    } else if (upperBound == lowerBound) {
           // set percentage to 1/NDV
           1.0 / bin.ndv.toDouble
         } else {
           // Use proration since the range falls inside this bin.
    -      math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
    +      math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
         }
       }
     
       /**
    -   * Returns the number of bins for column values in [lowerValue, 
higherValue].
    -   * The column value distribution is saved in an equi-height histogram.  
The return values is a
    -   * double value is because we may return a portion of a bin. For 
example, a predicate
    -   * "column = 8" may return the number of bins 0.2 if the holding bin has 
5 distinct values.
    +   * Returns the number of histogram bins holding values within the given 
range
    +   * [lowerBound, upperBound].
    +   *
    +   * Note that the returned value is double type, because the range 
boundaries usually occupy a
    +   * portion of a bin. An extrema case is [value, value] which is 
generated by equal predicate
    --- End diff --
    
    typo: "extrema" should be "extreme"


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to