GitHub user wzhfy commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19783#discussion_r153974940
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala ---
    @@ -471,37 +508,47 @@ case class FilterEstimation(plan: Filter) extends Logging {
           percent = 1.0
         } else {
           // This is the partial overlap case:
    -      // Without advanced statistics like histogram, we assume uniform data distribution.
    -      // We just prorate the adjusted range over the initial range to compute filter selectivity.
    -      assert(max > min)
    -      percent = op match {
    -        case _: LessThan =>
    -          if (numericLiteral == max) {
    -            // If the literal value is right on the boundary, we can subtract the
    -            // boundary value's portion (1/ndv).
    -            1.0 - 1.0 / ndv
    -          } else {
    -            (numericLiteral - min) / (max - min)
    -          }
    -        case _: LessThanOrEqual =>
    -          if (numericLiteral == min) {
    -            // The boundary value is the only satisfying value.
    -            1.0 / ndv
    -          } else {
    -            (numericLiteral - min) / (max - min)
    -          }
    -        case _: GreaterThan =>
    -          if (numericLiteral == min) {
    -            1.0 - 1.0 / ndv
    -          } else {
    -            (max - numericLiteral) / (max - min)
    -          }
    -        case _: GreaterThanOrEqual =>
    -          if (numericLiteral == max) {
    -            1.0 / ndv
    -          } else {
    -            (max - numericLiteral) / (max - min)
    -          }
    +
    +      if (colStat.histogram.isEmpty) {
    +        // Without advanced statistics like histogram, we assume uniform data distribution.
    +        // We just prorate the adjusted range over the initial range to compute filter selectivity.
    +        assert(max > min)
    +        percent = op match {
    +          case _: LessThan =>
    +            if (numericLiteral == max) {
    +              // If the literal value is right on the boundary, we can subtract the
    +              // boundary value's portion (1/ndv).
    +              1.0 - 1.0 / ndv
    +            } else {
    +              (numericLiteral - min) / (max - min)
    +            }
    +          case _: LessThanOrEqual =>
    +            if (numericLiteral == min) {
    +              // The boundary value is the only satisfying value.
    +              1.0 / ndv
    +            } else {
    +              (numericLiteral - min) / (max - min)
    +            }
    +          case _: GreaterThan =>
    +            if (numericLiteral == min) {
    +              1.0 - 1.0 / ndv
    +            } else {
    +              (max - numericLiteral) / (max - min)
    +            }
    +          case _: GreaterThanOrEqual =>
    +            if (numericLiteral == max) {
    +              1.0 / ndv
    +            } else {
    +              (max - numericLiteral) / (max - min)
    +            }
    +        }
    +      } else {
    +        val numericHistogram = colStat.histogram.get
    +        val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
    +        val maxDouble = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
    +        val minDouble = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
    --- End diff --
    
    `max, min` would be good enough names, I think.
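
    For context, the non-histogram branch above reduces to linear interpolation over the column's [min, max] range, with a 1/ndv correction when the literal sits exactly on a boundary. A minimal standalone sketch of that logic (the simplified enum and signature here are mine, not the PR's; Spark pattern-matches on catalyst expression types instead):

    ```scala
    object UniformSelectivity {
      sealed trait Op
      case object Lt extends Op; case object Le extends Op
      case object Gt extends Op; case object Ge extends Op

      // Selectivity of `col op lit` assuming values are uniformly distributed
      // over [min, max] with `ndv` distinct values.
      def estimate(op: Op, lit: Double, min: Double, max: Double, ndv: Double): Double = {
        assert(max > min)
        op match {
          // Literal on the upper bound: all rows qualify except the boundary
          // value itself, which holds roughly 1/ndv of them.
          case Lt if lit == max => 1.0 - 1.0 / ndv
          case Lt               => (lit - min) / (max - min)
          // Literal on the lower bound: the boundary value is the only match.
          case Le if lit == min => 1.0 / ndv
          case Le               => (lit - min) / (max - min)
          case Gt if lit == min => 1.0 - 1.0 / ndv
          case Gt               => (max - lit) / (max - min)
          case Ge if lit == max => 1.0 / ndv
          case Ge               => (max - lit) / (max - min)
        }
      }
    }
    ```

    For example, `UniformSelectivity.estimate(UniformSelectivity.Lt, 25.0, 0.0, 100.0, ndv = 50)` gives 0.25 under the uniformity assumption.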


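    The quoted hunk stops right after these conversions, so the histogram math itself isn't shown here. Purely for intuition, one generic way an equi-height histogram (every bucket holds the same number of rows) can refine a `<` estimate is to count the buckets entirely below the literal and prorate inside the straddling one; this is an illustrative sketch, not the PR's implementation:

    ```scala
    // `bounds` holds the numBuckets + 1 bucket boundaries of an equi-height
    // histogram, sorted ascending. Estimates P(col < lit).
    def lessThanSelectivity(bounds: Array[Double], lit: Double): Double = {
      require(bounds.length >= 2, "need at least one bucket")
      val numBuckets = bounds.length - 1
      if (lit <= bounds.head) 0.0
      else if (lit >= bounds.last) 1.0
      else {
        // Each bucket lying entirely below the literal contributes 1/numBuckets.
        val full = bounds.indexWhere(_ >= lit) - 1
        val (lo, hi) = (bounds(full), bounds(full + 1))
        // Prorate within the straddling bucket, assuming uniformity inside it.
        val partial = if (hi > lo) (lit - lo) / (hi - lo) else 0.0
        (full + partial) / numBuckets
      }
    }
    ```

    With bounds `Array(0.0, 10.0, 20.0, 30.0)` and `lit = 15.0`, this yields (1 + 0.5) / 3 = 0.5. The equi-height property is what makes each bucket worth exactly 1/numBuckets of the rows.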