Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/19783#discussion_r153973781
--- Diff:
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
---
@@ -578,6 +590,112 @@ class FilterEstimationSuite extends
StatsEstimationTestBase {
expectedRowCount = 5)
}
+ // The following test cases have histogram information collected for the
test column
+ test("Not(cintHgm < 3 AND null)") {
+ val condition = Not(And(LessThan(attrIntHgm, Literal(3)),
Literal(null, IntegerType)))
+ validateEstimatedStats(
+ Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 6)),
+ expectedRowCount = 9)
+ }
+
+ test("cintHgm = 5") {
+ validateEstimatedStats(
+ Filter(EqualTo(attrIntHgm, Literal(5)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(5), max =
Some(5),
+ nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
+ expectedRowCount = 4)
+ }
+
+ test("cintHgm = 0") {
+ // This is an out-of-range case since 0 is outside the range [min, max]
+ validateEstimatedStats(
+ Filter(EqualTo(attrIntHgm, Literal(0)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Nil,
+ expectedRowCount = 0)
+ }
+
+ test("cintHgm < 3") {
+ validateEstimatedStats(
+ Filter(LessThan(attrIntHgm, Literal(3)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max =
Some(3),
+ nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
+ expectedRowCount = 2)
+ }
+
+ test("cintHgm < 0") {
+ // This is a corner case since literal 0 is smaller than min.
+ validateEstimatedStats(
+ Filter(LessThan(attrIntHgm, Literal(0)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Nil,
+ expectedRowCount = 0)
+ }
+
+ test("cintHgm <= 3") {
+ validateEstimatedStats(
+ Filter(LessThanOrEqual(attrIntHgm, Literal(3)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(1), max =
Some(3),
+ nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
+ expectedRowCount = 2)
+ }
+
+ test("cintHgm > 6") {
+ validateEstimatedStats(
+ Filter(GreaterThan(attrIntHgm, Literal(6)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> ColumnStat(distinctCount = 2, min = Some(6), max =
Some(10),
+ nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
+ expectedRowCount = 2)
+ }
+
+ test("cintHgm > 10") {
+ // This is a corner case since max value is 10.
+ validateEstimatedStats(
+ Filter(GreaterThan(attrIntHgm, Literal(10)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Nil,
+ expectedRowCount = 0)
+ }
+
+ test("cintHgm >= 6") {
+ validateEstimatedStats(
+ Filter(GreaterThanOrEqual(attrIntHgm, Literal(6)),
childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(6), max =
Some(10),
+ nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))),
+ expectedRowCount = 4)
+ }
+
+ test("cintHgm IS NULL") {
+ validateEstimatedStats(
+ Filter(IsNull(attrIntHgm), childStatsTestPlan(Seq(attrIntHgm), 10L)),
+ Nil,
+ expectedRowCount = 0)
+ }
+
+ test("cintHgm IS NOT NULL") {
--- End diff --
The histogram does not affect the estimation logic for `IS NULL` and
`IS NOT NULL` filter conditions, so we can remove these two test cases.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]