nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904669869
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala:
##########
@@ -94,6 +97,162 @@ class TestColumnStatsIndexWithSQL extends
ColumnStatIndexTestBase {
setupTable(testCase, metadataOpts, commonOpts, shouldValidate = true,
useShortSchema = true,
validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue",
"c2_minValue", "c3_maxValue", "c3_minValue"))
+
+ // predicate with c2. should prune based on col stats lookup
+ var dataFilter: Expression = EqualTo(attribute("c2"), literal("619sdc"))
+ verifyPruningFileCount(commonOpts, dataFilter)
+ // predicate w/ c5. should not lookup in col stats since the column is not
indexed.
+ var dataFilter1: Expression = GreaterThan(attribute("c5"), literal("70"))
+ verifyPruningFileCount(commonOpts, dataFilter1, false)
+
+ // a mix of two cols, where c2 is indexed and c5 is not indexed. but since
it's 'AND', pruning should kick in.
+ var dataFilter2 = And(dataFilter1, EqualTo(attribute("c2"),
literal("619sdc")))
+ verifyPruningFileCount(commonOpts, dataFilter2, true)
+ // adding an AND clause
+ dataFilter2 = And(dataFilter2, EqualTo(attribute("c5"), literal("100")))
+ verifyPruningFileCount(commonOpts, dataFilter2, true)
+ // adding an OR clause where the col is indexed. expected to prune
+ var dataFilter2_1 = Or(dataFilter2, EqualTo(attribute("c2"),
literal("619sda")))
+ verifyPruningFileCount(commonOpts, dataFilter2_1, true)
+ // adding another Or clause, but this time the col is not indexed. So, no
pruning expected.
+ dataFilter2_1 = Or(dataFilter2_1, EqualTo(attribute("c5"), literal("120")))
+ verifyPruningFileCount(commonOpts, dataFilter2_1, false)
+
+ // a mix of two cols, where c2 is indexed and c5 is not indexed. but since
it's 'OR', pruning should be bypassed.
+ var dataFilter3 = Or(dataFilter1, EqualTo(attribute("c2"),
literal("619sdc")))
+ verifyPruningFileCount(commonOpts, dataFilter3, false)
+ // adding an OR clause
+ dataFilter3 = Or(dataFilter3, EqualTo(attribute("c5"), literal("100")))
+ verifyPruningFileCount(commonOpts, dataFilter3, false)
+ // adding AND clause where the col is indexed. Expected to prune.
+ var dataFilter3_1 = And(dataFilter3, EqualTo(attribute("c2"),
literal("619sda")))
+ verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+ // adding another AND clause where the col is not indexed. Still expected
to prune since c2 = 619sda could still be pruned.
+ dataFilter3_1 = And(dataFilter3_1, EqualTo(attribute("c5"),
literal("200")))
+ verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+ // adding an Or clause where the col is indexed. expected to prune.
+ var dataFilter3_2 = Or(dataFilter3_1, EqualTo(attribute("c2"),
literal("619sda")))
+ verifyPruningFileCount(commonOpts, dataFilter3_2, true)
+ // adding an Or clause where the col is not indexed. not expected to prune
+ dataFilter3_2 = Or(dataFilter3_2, EqualTo(attribute("c5"), literal("250")))
+ verifyPruningFileCount(commonOpts, dataFilter3_2, false)
+ }
+
+ @Test
+ def testTranslateIntoColumnStatsIndexFilterExpr(): Unit = {
Review Comment:
Good list of tests. I have added them all except one: I'm not sure why we need
multi-writer tests. Col stats lookup is designed to be single-threaded, so we
don't need to test for multiple writers. I had to use AtomicBoolean since I
needed something to act as a communication channel between the caller and the
callee. We can't use a list, hence the AtomicBoolean.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]