Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

via GitHub Mon, 06 Jan 2025 13:55:30 -0800


nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904669869



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala:
##########
@@ -94,6 +97,162 @@ class TestColumnStatsIndexWithSQL extends 
ColumnStatIndexTestBase {
     setupTable(testCase, metadataOpts, commonOpts, shouldValidate = true, 
useShortSchema = true,
       validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue",
       "c2_minValue", "c3_maxValue", "c3_minValue"))
+
+    // predicate with c2. should prune based on col stats lookup
+    var dataFilter: Expression = EqualTo(attribute("c2"), literal("619sdc"))
+    verifyPruningFileCount(commonOpts, dataFilter)
+    // predicate w/ c5. should not lookup in col stats since the column is not 
indexed.
+    var dataFilter1: Expression = GreaterThan(attribute("c5"), literal("70"))
+    verifyPruningFileCount(commonOpts, dataFilter1, false)
+
+    // a mix of two cols, where c2 is indexed and c5 is not indexed. but since 
its 'AND', pruning should kick in.
+    var dataFilter2 = And(dataFilter1, EqualTo(attribute("c2"), 
literal("619sdc")))
+    verifyPruningFileCount(commonOpts, dataFilter2, true)
+    // adding an AND clause
+    dataFilter2 = And(dataFilter2, EqualTo(attribute("c5"), literal("100")))
+    verifyPruningFileCount(commonOpts, dataFilter2, true)
+    // adding an OR clause where the col is indexed. expected to prune
+    var dataFilter2_1 = Or(dataFilter2, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter2_1, true)
+    // adding another Or clause, but this time the col is not indexed. So, no 
pruning expected.
+    dataFilter2_1 = Or(dataFilter2_1, EqualTo(attribute("c5"), literal("120")))
+    verifyPruningFileCount(commonOpts, dataFilter2_1, false)
+
+    // a mix of two cols, where c2 is indexed and c5 is not indexed. but since 
its 'OR', pruning should be by passed.
+    var dataFilter3 = Or(dataFilter1, EqualTo(attribute("c2"), 
literal("619sdc")))
+    verifyPruningFileCount(commonOpts, dataFilter3, false)
+    // adding an OR clause
+    dataFilter3 = Or(dataFilter3, EqualTo(attribute("c5"), literal("100")))
+    verifyPruningFileCount(commonOpts, dataFilter3, false)
+    // adding AND clause where the col is indexed. Expected to prune.
+    var dataFilter3_1 = And(dataFilter3, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+    // adding another AND clause where the col is not indexed. Still expected 
to prune since c2 = 619sda could still be pruned.
+    dataFilter3_1 = And(dataFilter3_1, EqualTo(attribute("c5"), 
literal("200")))
+    verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+    // adding an Or clause where the col is indexed. expected to prune.
+    var dataFilter3_2 = Or(dataFilter3_1, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter3_2, true)
+    // adding an Or clause where the col is not indexed. not expected to prune
+    dataFilter3_2 = Or(dataFilter3_2, EqualTo(attribute("c5"), literal("250")))
+    verifyPruningFileCount(commonOpts, dataFilter3_2, false)
+  }
+
+  @Test
+  def testTranslateIntoColumnStatsIndexFilterExpr(): Unit = {

Review Comment:
   Good list of tests. I have added them all except one: I'm not sure why we 
   need multi-writer tests. Col stats lookup is designed to be single-threaded, 
   so we don't need to test for multiple writers. I had to use AtomicBoolean 
   since I need something to communicate between the caller and the callee. We 
   can't use a list, and hence I used AtomicBoolean.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

Reply via email to