Github user wzhfy commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19952#discussion_r156552208
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala ---
    @@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging {
       }
     
       /**
    -   * Returns the selectivity percentage for binary condition in the 
column's
    -   * current valid range [min, max]
    -   *
    -   * @param op a binary comparison operator
    -   * @param histogram a numeric equi-height histogram
    -   * @param max the upper bound of the current valid range for a given 
column
    -   * @param min the lower bound of the current valid range for a given 
column
    -   * @param datumNumber the numeric value of a literal
    -   * @return the selectivity percentage for a condition in the current 
range.
    +   * Computes the possibility of a equal predicate using histogram.
        */
    +  private def computeEqualityPossibilityByHistogram(
    +      literal: Literal, colStat: ColumnStat): Double = {
    +    val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
    +    val histogram = colStat.histogram.get
     
    -  def computePercentByEquiHeightHgm(
    -      op: BinaryComparison,
    -      histogram: Histogram,
    -      max: Double,
    -      min: Double,
    -      datumNumber: Double): Double = {
         // find bins where column's current min and max locate.  Note that a 
column's [min, max]
         // range may change due to another condition applied earlier.
    -    val minBinId = EstimationUtils.findFirstBinForValue(min, 
histogram.bins)
    -    val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
    +    val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
    +    val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
     
         // compute how many bins the column's current valid range [min, max] 
occupies.
    -    // Note that a column's [min, max] range may vary after we apply some 
filter conditions.
    -    val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, 
minBinId, max, min, histogram)
    -
    -    val datumInBinId = op match {
    -      case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
    -        EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
    -      case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
    -        EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
    -    }
    +    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
    +      upperBound = max,
    +      upperBoundInclusive = true,
    +      lowerBound = min,
    +      lowerBoundInclusive = true,
    +      histogram)
    +
    +    val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
    +      upperBound = datum,
    +      upperBoundInclusive = true,
    +      lowerBound = datum,
    +      lowerBoundInclusive = true,
    +      histogram)
    +
    +    numBinsHoldingDatum / numBinsHoldingEntireRange
    +  }
     
    -    op match {
    -      // LessThan and LessThanOrEqual share the same logic,
    -      // but their datumInBinId may be different
    -      case LessThan(_, _) | LessThanOrEqual(_, _) =>
    -        EstimationUtils.getOccupationBins(datumInBinId, minBinId, 
datumNumber, min,
    -          histogram) / minToMaxLength
    -      // GreaterThan and GreaterThanOrEqual share the same logic,
    -      // but their datumInBinId may be different
    -      case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
    -        EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, 
datumNumber,
    -          histogram) / minToMaxLength
    +  /**
    +   * Computes the possibility of a comparison predicate using histogram.
    +   */
    +  private def computeComparisonPossibilityByHistogram(
    +      op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double 
= {
    +    val datum = EstimationUtils.toDecimal(literal.value, 
literal.dataType).toDouble
    +    val histogram = colStat.histogram.get
    +
    +    // find bins where column's current min and max locate.  Note that a 
column's [min, max]
    +    // range may change due to another condition applied earlier.
    +    val min = EstimationUtils.toDecimal(colStat.min.get, 
literal.dataType).toDouble
    +    val max = EstimationUtils.toDecimal(colStat.max.get, 
literal.dataType).toDouble
    +
    +    // compute how many bins the column's current valid range [min, max] 
occupies.
    +    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
    +      max, upperBoundInclusive = true, min, lowerBoundInclusive = true, 
histogram)
    +
    +    val numBinsHoldingDatum = op match {
    --- End diff --
    
    `numBinsHoldingRange`


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to