GitHub user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19594#discussion_r157513559 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala --- @@ -212,4 +213,186 @@ object EstimationUtils { } } + /** + * Returns overlapped ranges between two histograms, in the given value range + * [lowerBound, upperBound]. + */ + def getOverlappedRanges( + leftHistogram: Histogram, + rightHistogram: Histogram, + lowerBound: Double, + upperBound: Double): Seq[OverlappedRange] = { + val overlappedRanges = new ArrayBuffer[OverlappedRange]() + // Only bins whose range intersect [lowerBound, upperBound] have join possibility. + val leftBins = leftHistogram.bins + .filter(b => b.lo <= upperBound && b.hi >= lowerBound) + val rightBins = rightHistogram.bins + .filter(b => b.lo <= upperBound && b.hi >= lowerBound) + + leftBins.foreach { lb => + rightBins.foreach { rb => + val (left, leftHeight) = trimBin(lb, leftHistogram.height, lowerBound, upperBound) + val (right, rightHeight) = trimBin(rb, rightHistogram.height, lowerBound, upperBound) + // Only collect overlapped ranges. + if (left.lo <= right.hi && left.hi >= right.lo) { + // Collect overlapped ranges. + val range = if (left.lo == left.hi) { + // Case1: the left bin has only one value + OverlappedRange( + lo = left.lo, + hi = left.lo, + leftNdv = 1, + rightNdv = 1, + leftNumRows = leftHeight, + rightNumRows = rightHeight / right.ndv + ) + } else if (right.lo == right.hi) { + // Case2: the right bin has only one value --- End diff -- Do we really need cases 1 and 2? Aren't they already covered by the branches below?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org