Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19594#discussion_r156392538 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala --- @@ -114,4 +115,183 @@ object EstimationUtils { } } + /** + * Returns overlapped ranges between two histograms, in the given value range [newMin, newMax]. + */ + def getOverlappedRanges( + leftHistogram: Histogram, + rightHistogram: Histogram, + newMin: Double, + newMax: Double): Seq[OverlappedRange] = { + val overlappedRanges = new ArrayBuffer[OverlappedRange]() + // Only bins whose range intersect [newMin, newMax] have join possibility. + val leftBins = leftHistogram.bins + .filter(b => b.lo <= newMax && b.hi >= newMin) + val rightBins = rightHistogram.bins + .filter(b => b.lo <= newMax && b.hi >= newMin) + + leftBins.foreach { lb => + rightBins.foreach { rb => + val (left, leftHeight) = trimBin(lb, leftHistogram.height, newMin, newMax) + val (right, rightHeight) = trimBin(rb, rightHistogram.height, newMin, newMax) + // Only collect overlapped ranges. + if (left.lo <= right.hi && left.hi >= right.lo) { + // Collect overlapped ranges. 
+ val range = if (left.lo == left.hi) { + // Case1: the left bin has only one value + OverlappedRange( + lo = left.lo, + hi = left.lo, + leftNdv = 1, + rightNdv = 1, + leftNumRows = leftHeight, + rightNumRows = rightHeight / right.ndv + ) + } else if (right.lo == right.hi) { + // Case2: the right bin has only one value + OverlappedRange( + lo = right.lo, + hi = right.lo, + leftNdv = 1, + rightNdv = 1, + leftNumRows = leftHeight / left.ndv, + rightNumRows = rightHeight + ) + } else if (right.lo >= left.lo && right.hi >= left.hi) { + // Case3: the left bin is "smaller" than the right bin + // left.lo right.lo left.hi right.hi + // --------+------------------+------------+----------------+-------> + val leftRatio = (left.hi - right.lo) / (left.hi - left.lo) + val rightRatio = (left.hi - right.lo) / (right.hi - right.lo) + if (leftRatio == 0) { --- End diff -- It would be easier to understand if this were written as `if (right.lo == left.hi)`: `leftRatio` is zero exactly when `left.hi - right.lo` is zero, so comparing the two bounds directly states the boundary condition without going through the ratio.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org