[GitHub] spark pull request #19783: [SPARK-21322][SQL] support histogram in filter ca...

cloud-fan Thu, 07 Dec 2017 08:11:26 -0800

Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19783#discussion_r155566119
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala ---
    @@ -471,37 +508,47 @@ case class FilterEstimation(plan: Filter) extends Logging {
           percent = 1.0
         } else {
           // This is the partial overlap case:
    -      // Without advanced statistics like histogram, we assume uniform data distribution.
    -      // We just prorate the adjusted range over the initial range to compute filter selectivity.
    -      assert(max > min)
    -      percent = op match {
    -        case _: LessThan =>
    -          if (numericLiteral == max) {
    -            // If the literal value is right on the boundary, we can minus the part of the
    -            // boundary value (1/ndv).
    -            1.0 - 1.0 / ndv
    -          } else {
    -            (numericLiteral - min) / (max - min)
    -          }
    -        case _: LessThanOrEqual =>
    -          if (numericLiteral == min) {
    -            // The boundary value is the only satisfying value.
    -            1.0 / ndv
    -          } else {
    -            (numericLiteral - min) / (max - min)
    -          }
    -        case _: GreaterThan =>
    -          if (numericLiteral == min) {
    -            1.0 - 1.0 / ndv
    -          } else {
    -            (max - numericLiteral) / (max - min)
    -          }
    -        case _: GreaterThanOrEqual =>
    -          if (numericLiteral == max) {
    -            1.0 / ndv
    -          } else {
    -            (max - numericLiteral) / (max - min)
    -          }
    +
    +      if (colStat.histogram.isEmpty) {
    --- End diff --
    
    Yeah, please do it.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #19783: [SPARK-21322][SQL] support histogram in filter ca...

Reply via email to