Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

via GitHub Tue, 07 Jan 2025 07:03:46 -0800


nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1905590133



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala:
##########
@@ -94,6 +97,162 @@ class TestColumnStatsIndexWithSQL extends 
ColumnStatIndexTestBase {
     setupTable(testCase, metadataOpts, commonOpts, shouldValidate = true, 
useShortSchema = true,
       validationSortColumns = Seq("c1_maxValue", "c1_minValue", "c2_maxValue",
       "c2_minValue", "c3_maxValue", "c3_minValue"))
+
+    // predicate with c2. should prune based on col stats lookup
+    var dataFilter: Expression = EqualTo(attribute("c2"), literal("619sdc"))
+    verifyPruningFileCount(commonOpts, dataFilter)
+    // predicate w/ c5. should not lookup in col stats since the column is not 
indexed.
+    var dataFilter1: Expression = GreaterThan(attribute("c5"), literal("70"))
+    verifyPruningFileCount(commonOpts, dataFilter1, false)
+
+    // a mix of two cols, where c2 is indexed and c5 is not indexed. but since 
it's 'AND', pruning should kick in.
+    var dataFilter2 = And(dataFilter1, EqualTo(attribute("c2"), 
literal("619sdc")))
+    verifyPruningFileCount(commonOpts, dataFilter2, true)
+    // adding an AND clause
+    dataFilter2 = And(dataFilter2, EqualTo(attribute("c5"), literal("100")))
+    verifyPruningFileCount(commonOpts, dataFilter2, true)
+    // adding an OR clause where the col is indexed. expected to prune
+    var dataFilter2_1 = Or(dataFilter2, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter2_1, true)
+    // adding another Or clause, but this time the col is not indexed. So, no 
pruning expected.
+    dataFilter2_1 = Or(dataFilter2_1, EqualTo(attribute("c5"), literal("120")))
+    verifyPruningFileCount(commonOpts, dataFilter2_1, false)
+
+    // a mix of two cols, where c2 is indexed and c5 is not indexed. but since 
it's 'OR', pruning should be bypassed.
+    var dataFilter3 = Or(dataFilter1, EqualTo(attribute("c2"), 
literal("619sdc")))
+    verifyPruningFileCount(commonOpts, dataFilter3, false)
+    // adding an OR clause
+    dataFilter3 = Or(dataFilter3, EqualTo(attribute("c5"), literal("100")))
+    verifyPruningFileCount(commonOpts, dataFilter3, false)
+    // adding AND clause where the col is indexed. Expected to prune.
+    var dataFilter3_1 = And(dataFilter3, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+    // adding another AND clause where the col is not indexed. Still expected 
to prune since c2 = 619sda could still be pruned.
+    dataFilter3_1 = And(dataFilter3_1, EqualTo(attribute("c5"), 
literal("200")))
+    verifyPruningFileCount(commonOpts, dataFilter3_1, true)
+    // adding an Or clause where the col is indexed. expected to prune.
+    var dataFilter3_2 = Or(dataFilter3_1, EqualTo(attribute("c2"), 
literal("619sda")))
+    verifyPruningFileCount(commonOpts, dataFilter3_2, true)
+    // adding an Or clause where the col is not indexed. not expected to prune
+    dataFilter3_2 = Or(dataFilter3_2, EqualTo(attribute("c5"), literal("250")))
+    verifyPruningFileCount(commonOpts, dataFilter3_2, false)
+  }
+
+  @Test
+  def testTranslateIntoColumnStatsIndexFilterExpr(): Unit = {

Review Comment:
   Yes, we could do that as well. I will give that a try.



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala:
##########
@@ -42,28 +45,43 @@ object DataSkippingUtils extends Logging {
    * @param isExpressionIndex whether the index is an expression index
    * @return filter for column-stats index's table
    */
-  def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, 
indexSchema: StructType, isExpressionIndex: Boolean = false): Expression = {
+  def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, 
isExpressionIndex: Boolean = false,
+                                              indexedCols : Seq[String] = 
Seq.empty,
+                                              hasNonIndexedCols : 
AtomicBoolean = new AtomicBoolean(false)): Expression = {
     try {
-      createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, 
indexSchema, isExpressionIndex)
+      createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, 
isExpressionIndex, indexedCols,
+        hasNonIndexedCols)

Review Comment:
   Oh, I see. I will take a look (TAL).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

Reply via email to