Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19594#discussion_r157519924 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala --- @@ -191,8 +191,19 @@ case class JoinEstimation(join: Join) extends Logging { val rInterval = ValueInterval(rightKeyStat.min, rightKeyStat.max, rightKey.dataType) if (ValueInterval.isIntersected(lInterval, rInterval)) { val (newMin, newMax) = ValueInterval.intersect(lInterval, rInterval, leftKey.dataType) - val (card, joinStat) = computeByNdv(leftKey, rightKey, newMin, newMax) - keyStatsAfterJoin += (leftKey -> joinStat, rightKey -> joinStat) + val (card, joinStat) = (leftKeyStat.histogram, rightKeyStat.histogram) match { + case (Some(l: Histogram), Some(r: Histogram)) => + computeByEquiHeightHistogram(leftKey, rightKey, l, r, newMin, newMax) + case _ => + computeByNdv(leftKey, rightKey, newMin, newMax) + } + keyStatsAfterJoin += ( + // Histograms are propagated as unchanged. During future estimation, they should be + // truncated by the updated max/min. In this way, only pointers of the histograms are + // propagated and thus reduce memory consumption. + leftKey -> joinStat.copy(histogram = leftKeyStat.histogram), + rightKey -> joinStat.copy(histogram = rightKeyStat.histogram) --- End diff -- i.e. https://github.com/apache/spark/pull/19594/files#diff-6387e7aaeb7d8e0cb1457b9d0fe5cd00R272
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org