Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19594#discussion_r157519368
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala
---
@@ -225,6 +236,43 @@ case class JoinEstimation(join: Join) extends Logging {
(ceil(card), newStats)
}
+ /** Compute join cardinality using equi-height histograms. */
+ private def computeByEquiHeightHistogram(
+ leftKey: AttributeReference,
+ rightKey: AttributeReference,
+ leftHistogram: Histogram,
+ rightHistogram: Histogram,
+ newMin: Option[Any],
+ newMax: Option[Any]): (BigInt, ColumnStat) = {
+ val overlappedRanges = getOverlappedRanges(
+ leftHistogram = leftHistogram,
+ rightHistogram = rightHistogram,
+ // Only numeric values have equi-height histograms.
+ lowerBound = newMin.get.toString.toDouble,
+ upperBound = newMax.get.toString.toDouble)
+
+ var card: BigDecimal = 0
+ var totalNdv: Double = 0
+ for (i <- overlappedRanges.indices) {
+ val range = overlappedRanges(i)
+ if (i == 0 || range.hi != overlappedRanges(i - 1).hi) {
+ // If range.hi == overlappedRanges(i - 1).hi, that means the
current range has only one
+ // value, and this value is already counted in the previous range.
So there is no need to
+ // count it in this range.
+ totalNdv += math.min(range.leftNdv, range.rightNdv)
+ }
+ // Apply the formula in this overlapped range.
+ card += range.leftNumRows * range.rightNumRows /
math.max(range.leftNdv, range.rightNdv)
+ }
+
+ val leftKeyStat = leftStats.attributeStats(leftKey)
+ val rightKeyStat = rightStats.attributeStats(rightKey)
+ val newMaxLen = math.min(leftKeyStat.maxLen, rightKeyStat.maxLen)
+ val newAvgLen = (leftKeyStat.avgLen + rightKeyStat.avgLen) / 2
--- End diff --
shall we count left/right numRows when calculating this?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]