[GitHub] spark pull request #19594: [SPARK-21984] [SQL] Join estimation based on equi...

cloud-fan Mon, 18 Dec 2017 08:01:02 -0800

Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19594#discussion_r157524477
  
    --- Diff: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala
 ---
    @@ -67,6 +67,222 @@ class JoinEstimationSuite extends 
StatsEstimationTestBase {
         rowCount = 2,
         attributeStats = AttributeMap(Seq("key-1-2", 
"key-2-3").map(nameToColInfo)))
     
    +  private def estimateByHistogram(
    +      leftHistogram: Histogram,
    +      rightHistogram: Histogram,
    +      expectedMin: Double,
    +      expectedMax: Double,
    +      expectedNdv: Long,
    +      expectedRows: Long): Unit = {
    +    val col1 = attr("key1")
    +    val col2 = attr("key2")
    +    val c1 = generateJoinChild(col1, leftHistogram, expectedMin, 
expectedMax)
    +    val c2 = generateJoinChild(col2, rightHistogram, expectedMin, 
expectedMax)
    +
    +    val c1JoinC2 = Join(c1, c2, Inner, Some(EqualTo(col1, col2)))
    +    val c2JoinC1 = Join(c2, c1, Inner, Some(EqualTo(col2, col1)))
    +    val expectedStatsAfterJoin = Statistics(
    +      sizeInBytes = expectedRows * (8 + 2 * 4),
    +      rowCount = Some(expectedRows),
    +      attributeStats = AttributeMap(Seq(
    +        col1 -> c1.stats.attributeStats(col1).copy(
    +          distinctCount = expectedNdv, min = Some(expectedMin), max = 
Some(expectedMax)),
    +        col2 -> c2.stats.attributeStats(col2).copy(
    +          distinctCount = expectedNdv, min = Some(expectedMin), max = 
Some(expectedMax))))
    +    )
    +
    +    // Join order should not affect estimation result.
    +    Seq(c1JoinC2, c2JoinC1).foreach { join =>
    +      assert(join.stats == expectedStatsAfterJoin)
    +    }
    +  }
    +
    +  private def generateJoinChild(
    +      col: Attribute,
    +      histogram: Histogram,
    +      expectedMin: Double,
    +      expectedMax: Double): LogicalPlan = {
    +    val colStat = inferColumnStat(histogram)
    +    StatsTestPlan(
    +      outputList = Seq(col),
    +      rowCount = (histogram.height * histogram.bins.length).toLong,
    +      attributeStats = AttributeMap(Seq(col -> colStat)))
    +  }
    +
    +  /** Column statistics should be consistent with histograms in tests. */
    +  private def inferColumnStat(histogram: Histogram): ColumnStat = {
    +    var ndv = 0L
    +    for (i <- histogram.bins.indices) {
    +      val bin = histogram.bins(i)
    +      if (i == 0 || bin.hi != histogram.bins(i - 1).hi) {
    +        ndv += bin.ndv
    +      }
    +    }
    +    ColumnStat(distinctCount = ndv, min = Some(histogram.bins.head.lo),
    +      max = Some(histogram.bins.last.hi), nullCount = 0, avgLen = 4, 
maxLen = 4,
    +      histogram = Some(histogram))
    +  }
    +
    +  test("equi-height histograms: a bin is contained by another one") {
    +    val histogram1 = Histogram(height = 300, Array(
    +      HistogramBin(lo = 10, hi = 30, ndv = 10), HistogramBin(lo = 30, hi = 
60, ndv = 30)))
    +    val histogram2 = Histogram(height = 100, Array(
    +      HistogramBin(lo = 0, hi = 50, ndv = 50), HistogramBin(lo = 50, hi = 
100, ndv = 40)))
    +    // test bin trimming
    +    val (t0, h0) = trimBin(histogram2.bins(0), height = 100, lowerBound = 
10, upperBound = 60)
    +    assert(t0 == HistogramBin(lo = 10, hi = 50, ndv = 40) && h0 == 80)
    +    val (t1, h1) = trimBin(histogram2.bins(1), height = 100, lowerBound = 
10, upperBound = 60)
    +    assert(t1 == HistogramBin(lo = 50, hi = 60, ndv = 8) && h1 == 20)
    +
    +    val expectedRanges = Seq(
    +      // histogram1.bins(0) overlaps t0
    +      OverlappedRange(10, 30, 10, 40*1/2, 300, 80*1/2),
    +      // histogram1.bins(1) overlaps t0
    +      OverlappedRange(30, 50, 30*2/3, 40*1/2, 300*2/3, 80*1/2),
    +      // histogram1.bins(1) overlaps t1
    +      OverlappedRange(50, 60, 30*1/3, 8, 300*1/3, 20)
    +    )
    +    assert(expectedRanges.equals(
    +      getOverlappedRanges(histogram1, histogram2, lowerBound = 10D, 
upperBound = 60D)))
    --- End diff --
    
    10D looks weird, how about 10.0



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #19594: [SPARK-21984] [SQL] Join estimation based on equi...

Reply via email to