Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/19594#discussion_r156847046
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
---
@@ -114,4 +115,183 @@ object EstimationUtils {
}
}
+ /**
+ * Returns overlapped ranges between two histograms, in the given value
range [newMin, newMax].
+ */
+ def getOverlappedRanges(
+ leftHistogram: Histogram,
+ rightHistogram: Histogram,
+ newMin: Double,
+ newMax: Double): Seq[OverlappedRange] = {
+ val overlappedRanges = new ArrayBuffer[OverlappedRange]()
+ // Only bins whose range intersect [newMin, newMax] have join
possibility.
+ val leftBins = leftHistogram.bins
+ .filter(b => b.lo <= newMax && b.hi >= newMin)
+ val rightBins = rightHistogram.bins
+ .filter(b => b.lo <= newMax && b.hi >= newMin)
+
+ leftBins.foreach { lb =>
+ rightBins.foreach { rb =>
+ val (left, leftHeight) = trimBin(lb, leftHistogram.height, newMin,
newMax)
+ val (right, rightHeight) = trimBin(rb, rightHistogram.height,
newMin, newMax)
+ // Only collect overlapped ranges.
+ if (left.lo <= right.hi && left.hi >= right.lo) {
+ // Collect overlapped ranges.
+ val range = if (left.lo == left.hi) {
+ // Case1: the left bin has only one value
+ OverlappedRange(
+ lo = left.lo,
+ hi = left.lo,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight,
+ rightNumRows = rightHeight / right.ndv
+ )
+ } else if (right.lo == right.hi) {
+ // Case2: the right bin has only one value
+ OverlappedRange(
+ lo = right.lo,
+ hi = right.lo,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight / left.ndv,
+ rightNumRows = rightHeight
+ )
+ } else if (right.lo >= left.lo && right.hi >= left.hi) {
+ // Case3: the left bin is "smaller" than the right bin
+ // left.lo right.lo left.hi
right.hi
+ //
--------+------------------+------------+----------------+------->
+ val leftRatio = (left.hi - right.lo) / (left.hi - left.lo)
+ val rightRatio = (left.hi - right.lo) / (right.hi - right.lo)
+ if (leftRatio == 0) {
+ // The overlapped range has only one value.
+ OverlappedRange(
+ lo = right.lo,
+ hi = right.lo,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight / left.ndv,
+ rightNumRows = rightHeight / right.ndv
+ )
+ } else {
+ OverlappedRange(
+ lo = right.lo,
+ hi = left.hi,
+ leftNdv = left.ndv * leftRatio,
+ rightNdv = right.ndv * rightRatio,
+ leftNumRows = leftHeight * leftRatio,
+ rightNumRows = rightHeight * rightRatio
+ )
+ }
+ } else if (right.lo <= left.lo && right.hi <= left.hi) {
+ // Case4: the left bin is "larger" than the right bin
+ // right.lo left.lo right.hi
left.hi
+ //
--------+------------------+------------+----------------+------->
+ val leftRatio = (right.hi - left.lo) / (left.hi - left.lo)
+ val rightRatio = (right.hi - left.lo) / (right.hi - right.lo)
+ if (leftRatio == 0) {
+ // The overlapped range has only one value.
+ OverlappedRange(
+ lo = right.hi,
+ hi = right.hi,
+ leftNdv = 1,
+ rightNdv = 1,
+ leftNumRows = leftHeight / left.ndv,
+ rightNumRows = rightHeight / right.ndv
+ )
+ } else {
+ OverlappedRange(
+ lo = left.lo,
+ hi = right.hi,
+ leftNdv = left.ndv * leftRatio,
+ rightNdv = right.ndv * rightRatio,
+ leftNumRows = leftHeight * leftRatio,
+ rightNumRows = rightHeight * rightRatio
+ )
+ }
+ } else if (right.lo >= left.lo && right.hi <= left.hi) {
+ // Case5: the left bin contains the right bin
+ // left.lo right.lo right.hi
left.hi
+ //
--------+------------------+------------+----------------+------->
+ val leftRatio = (right.hi - right.lo) / (left.hi - left.lo)
+ OverlappedRange(
+ lo = right.lo,
+ hi = right.hi,
+ leftNdv = left.ndv * leftRatio,
+ rightNdv = right.ndv,
+ leftNumRows = leftHeight * leftRatio,
+ rightNumRows = rightHeight
+ )
+ } else {
+ assert(right.lo <= left.lo && right.hi >= left.hi)
+ // Case6: the right bin contains the left bin
+ // right.lo left.lo left.hi
right.hi
+ //
--------+------------------+------------+----------------+------->
+ val rightRatio = (left.hi - left.lo) / (right.hi - right.lo)
+ OverlappedRange(
+ lo = left.lo,
+ hi = left.hi,
+ leftNdv = left.ndv,
+ rightNdv = right.ndv * rightRatio,
+ leftNumRows = leftHeight,
+ rightNumRows = rightHeight * rightRatio
+ )
+ }
+ overlappedRanges += range
+ }
+ }
+ }
+ overlappedRanges
+ }
+
+ /**
+ * Given an original bin and a value range [min, max], returns the
trimmed bin and its number of
+ * rows.
+ */
+ def trimBin(bin: HistogramBin, height: Double, min: Double, max: Double)
+ : (HistogramBin, Double) = {
+ val (lo, hi) = if (bin.lo <= min && bin.hi >= max) {
+ // bin.lo min max bin.hi
+ // --------+------------------+------------+-------------+------->
+ (min, max)
+ } else if (bin.lo <= min && bin.hi >= min) {
+ // bin.lo min bin.hi
--- End diff --
in this case, `max` is after the `bin.hi`, so the trimmed part is `(min,
bin.hi)`. I'll update the figure to indicate that.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]