Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19594#discussion_r157519924
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala ---
    @@ -191,8 +191,19 @@ case class JoinEstimation(join: Join) extends Logging {
           val rInterval = ValueInterval(rightKeyStat.min, rightKeyStat.max, 
rightKey.dataType)
           if (ValueInterval.isIntersected(lInterval, rInterval)) {
             val (newMin, newMax) = ValueInterval.intersect(lInterval, 
rInterval, leftKey.dataType)
    -        val (card, joinStat) = computeByNdv(leftKey, rightKey, newMin, 
newMax)
    -        keyStatsAfterJoin += (leftKey -> joinStat, rightKey -> joinStat)
    +        val (card, joinStat) = (leftKeyStat.histogram, 
rightKeyStat.histogram) match {
    +          case (Some(l: Histogram), Some(r: Histogram)) =>
    +            computeByEquiHeightHistogram(leftKey, rightKey, l, r, newMin, 
newMax)
    +          case _ =>
    +            computeByNdv(leftKey, rightKey, newMin, newMax)
    +        }
    +        keyStatsAfterJoin += (
    +          // Histograms are propagated as unchanged. During future estimation, they should be
    +          // truncated by the updated max/min. In this way, only pointers of the histograms are
    +          // propagated and thus reduce memory consumption.
    +          leftKey -> joinStat.copy(histogram = leftKeyStat.histogram),
    +          rightKey -> joinStat.copy(histogram = rightKeyStat.histogram)
    --- End diff --
    
    i.e. https://github.com/apache/spark/pull/19594/files#diff-6387e7aaeb7d8e0cb1457b9d0fe5cd00R272


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to