Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/19594#discussion_r157698793
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala
---
@@ -225,6 +236,43 @@ case class JoinEstimation(join: Join) extends Logging {
(ceil(card), newStats)
}
+ /** Compute join cardinality using equi-height histograms. */
+ private def computeByEquiHeightHistogram(
+ leftKey: AttributeReference,
+ rightKey: AttributeReference,
+ leftHistogram: Histogram,
+ rightHistogram: Histogram,
+ newMin: Option[Any],
+ newMax: Option[Any]): (BigInt, ColumnStat) = {
+ val overlappedRanges = getOverlappedRanges(
+ leftHistogram = leftHistogram,
+ rightHistogram = rightHistogram,
+ // Only numeric values have equi-height histograms.
+ lowerBound = newMin.get.toString.toDouble,
+ upperBound = newMax.get.toString.toDouble)
+
+ var card: BigDecimal = 0
+ var totalNdv: Double = 0
+ for (i <- overlappedRanges.indices) {
+ val range = overlappedRanges(i)
+ if (i == 0 || range.hi != overlappedRanges(i - 1).hi) {
+ // If range.hi == overlappedRanges(i - 1).hi, that means the
current range has only one
+ // value, and this value is already counted in the previous range.
So there is no need to
+ // count it in this range.
+ totalNdv += math.min(range.leftNdv, range.rightNdv)
+ }
+ // Apply the formula in this overlapped range.
+ card += range.leftNumRows * range.rightNumRows /
math.max(range.leftNdv, range.rightNdv)
+ }
+
+ val leftKeyStat = leftStats.attributeStats(leftKey)
+ val rightKeyStat = rightStats.attributeStats(rightKey)
+ val newMaxLen = math.min(leftKeyStat.maxLen, rightKeyStat.maxLen)
+ val newAvgLen = (leftKeyStat.avgLen + rightKeyStat.avgLen) / 2
--- End diff --
how do we use left/right numRows to calculate this? Ideally avgLen is
calculated by total length of keys / numRowsAfterJoin. For string type, we
don't the exact length of the matched keys (we don't support string histogram
yet), for numeric types, their avgLen should be the same. So the equation is a
fair approximation.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]