Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19594#discussion_r157514422
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
---
@@ -212,4 +213,186 @@ object EstimationUtils {
}
}
+ /**
+ * Returns overlapped ranges between two histograms, in the given value range
+ * [lowerBound, upperBound].
+ */
+ def getOverlappedRanges(
+ leftHistogram: Histogram,
+ rightHistogram: Histogram,
+ lowerBound: Double,
+ upperBound: Double): Seq[OverlappedRange] = {
+ val overlappedRanges = new ArrayBuffer[OverlappedRange]()
+ // Only bins whose range intersect [lowerBound, upperBound] have join
possibility.
+ val leftBins = leftHistogram.bins
+ .filter(b => b.lo <= upperBound && b.hi >= lowerBound)
+ val rightBins = rightHistogram.bins
+ .filter(b => b.lo <= upperBound && b.hi >= lowerBound)
+
+ leftBins.foreach { lb =>
+ rightBins.foreach { rb =>
+ val (left, leftHeight) = trimBin(lb, leftHistogram.height,
lowerBound, upperBound)
+ val (right, rightHeight) = trimBin(rb, rightHistogram.height,
lowerBound, upperBound)
+ // Only collect overlapped ranges.
+ if (left.lo <= right.hi && left.hi >= right.lo) {
+ // Collect overlapped ranges.
+ val range = if (left.lo == left.hi) {
+ // Case1: the left bin has only one value
+ OverlappedRange(
+ lo = left.lo,
+ hi = left.lo,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight,
+ rightNumRows = rightHeight / right.ndv
+ )
+ } else if (right.lo == right.hi) {
+ // Case2: the right bin has only one value
+ OverlappedRange(
+ lo = right.lo,
+ hi = right.lo,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight / left.ndv,
+ rightNumRows = rightHeight
+ )
+ } else if (right.lo >= left.lo && right.hi >= left.hi) {
+ // Case3: the left bin is "smaller" than the right bin
+ // left.lo right.lo left.hi
right.hi
+ //
--------+------------------+------------+----------------+------->
+ if (left.hi == right.lo) {
--- End diff --
Yes, this branch (`left.hi == right.lo`) is needed; otherwise we would get a
ratio of 0, which leads to a wrong result.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]