Github user wzhfy commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19783#discussion_r153974257
  
    --- Diff: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
 ---
    @@ -578,6 +590,112 @@ class FilterEstimationSuite extends 
StatsEstimationTestBase {
           expectedRowCount = 5)
       }
     
    +  // The following test cases have histogram information collected for the 
test column
    +  test("Not(cintHgm < 3 AND null)") {
    +    val condition = Not(And(LessThan(attrIntHgm, Literal(3)), 
Literal(null, IntegerType)))
    +    validateEstimatedStats(
    +      Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 6)),
    +      expectedRowCount = 9)
    +  }
    +
    +  test("cintHgm = 5") {
    +    validateEstimatedStats(
    +      Filter(EqualTo(attrIntHgm, Literal(5)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(5), max = 
Some(5),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 4)
    +  }
    +
    +  test("cintHgm = 0") {
    +    // This is an out-of-range case since 0 is outside the range [min, max]
    +    validateEstimatedStats(
    +      Filter(EqualTo(attrIntHgm, Literal(0)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm < 3") {
    +    validateEstimatedStats(
    +      Filter(LessThan(attrIntHgm, Literal(3)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = 
Some(3),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm < 0") {
    +    // This is a corner case since literal 0 is smaller than min.
    +    validateEstimatedStats(
    +      Filter(LessThan(attrIntHgm, Literal(0)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm <= 3") {
    +    validateEstimatedStats(
    +      Filter(LessThanOrEqual(attrIntHgm, Literal(3)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = 
Some(3),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm > 6") {
    +    validateEstimatedStats(
    +      Filter(GreaterThan(attrIntHgm, Literal(6)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 2, min = Some(6), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 2)
    +  }
    +
    +  test("cintHgm > 10") {
    +    // This is a corner case since max value is 10.
    +    validateEstimatedStats(
    +      Filter(GreaterThan(attrIntHgm, Literal(10)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm >= 6") {
    +    validateEstimatedStats(
    +      Filter(GreaterThanOrEqual(attrIntHgm, Literal(6)), 
childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(6), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 4)
    +  }
    +
    +  test("cintHgm IS NULL") {
    +    validateEstimatedStats(
    +      Filter(IsNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Nil,
    +      expectedRowCount = 0)
    +  }
    +
    +  test("cintHgm IS NOT NULL") {
    +    validateEstimatedStats(
    +      Filter(IsNotNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 
10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 6, min = Some(1), max = 
Some(10),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 10)
    +  }
    +
    +  test("cintHgm > 3 AND cintHgm <= 6") {
    +    val condition = And(GreaterThan(attrIntHgm,
    +      Literal(3)), LessThanOrEqual(attrIntHgm, Literal(6)))
    +    validateEstimatedStats(
    +      Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)),
    +      Seq(attrIntHgm -> ColumnStat(distinctCount = 5, min = Some(3), max = 
Some(6),
    +        nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
    +      expectedRowCount = 8)
    +  }
    +
    +  test("cintHgm = 3 OR cintHgm = 6") {
    --- End diff --
    
    I think we don't need test cases for combined conditions such as AND, OR, and 
NOT, because the histogram doesn't affect the estimation logic for them. Instead, 
we need more test cases that exercise the histogram itself, e.g. =, >=, >, <=, <, 
and that cover different distributions, e.g. skewed and non-skewed distributions.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to