Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

via GitHub Mon, 06 Jan 2025 13:15:37 -0800


nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904637258



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala:
##########
@@ -270,35 +344,63 @@ object DataSkippingUtils extends Logging {
       //       lexicographically, we essentially need to check that provided 
literal falls w/in min/max bounds of the
       //       given column
       case StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), v 
@ Literal(_: UTF8String, _)) =>
-        getTargetIndexedColumnName(attrRef, indexSchema)
+        getTargetIndexedColumnName(attrRef, indexedCols)
           .map { colName =>
             val targetExprBuilder: Expression => Expression = 
swapAttributeRefInExpr(sourceExpr, attrRef, _)
             genColumnValuesEqualToExpression(colName, v, targetExprBuilder)
-          }
+          }.orElse({
+          hasNonIndexedCols.set(true)
+          Option.empty
+        })
 
       // Filter "expr(colA) not like 'xxx%'"
       // Translates to "NOT(expr(colA_minValue) like 'xxx%' AND 
expr(colA_maxValue) like 'xxx%')" for index lookup
       // NOTE: This is NOT an inversion of "colA like xxx"
       case Not(StartsWith(sourceExpr @ 
AllowedTransformationExpression(attrRef), value @ Literal(_: UTF8String, _))) =>
-        getTargetIndexedColumnName(attrRef, indexSchema)
+        getTargetIndexedColumnName(attrRef, indexedCols)
           .map { colName =>
             val targetExprBuilder: Expression => Expression = 
swapAttributeRefInExpr(sourceExpr, attrRef, _)
             val minValueExpr = 
targetExprBuilder.apply(genColMinValueExpr(colName))
             val maxValueExpr = 
targetExprBuilder.apply(genColMaxValueExpr(colName))
             Not(And(StartsWith(minValueExpr, value), StartsWith(maxValueExpr, 
value)))
-          }
+          }.orElse({
+          hasNonIndexedCols.set(true)
+          Option.empty
+        })
 
       case or: Or =>
-        val resLeft = createColumnStatsIndexFilterExprInternal(or.left, 
indexSchema)
-        val resRight = createColumnStatsIndexFilterExprInternal(or.right, 
indexSchema)
-
-        Option(Or(resLeft, resRight))
+        val leftHasNonIndexedCols = new AtomicBoolean(false)
+        val resLeft = createColumnStatsIndexFilterExprInternal(or.left, 
isExpressionIndex = isExpressionIndex,
+          indexedCols = indexedCols, hasNonIndexedCols = leftHasNonIndexedCols)
+        val rightHasNonIndexedCols = new AtomicBoolean(false)
+        val resRight = createColumnStatsIndexFilterExprInternal(or.right, 
isExpressionIndex = isExpressionIndex,
+          indexedCols = indexedCols, hasNonIndexedCols = 
rightHasNonIndexedCols)
+        if (leftHasNonIndexedCols.get() || rightHasNonIndexedCols.get()) {
+          hasNonIndexedCols.set(true)
+          None
+        } else {
+          Option(Or(resLeft, resRight))
+        }
 
       case and: And =>
-        val resLeft = createColumnStatsIndexFilterExprInternal(and.left, 
indexSchema)
-        val resRight = createColumnStatsIndexFilterExprInternal(and.right, 
indexSchema)
-
-        Option(And(resLeft, resRight))
+        val leftHasNonIndexedCols = new AtomicBoolean(false)
+        val resLeft = createColumnStatsIndexFilterExprInternal(and.left, 
isExpressionIndex = isExpressionIndex,
+          indexedCols = indexedCols, hasNonIndexedCols = leftHasNonIndexedCols)
+        val rightHasNonIndexedCols = new AtomicBoolean(false)
+        val resRight = createColumnStatsIndexFilterExprInternal(and.right, 
isExpressionIndex = isExpressionIndex,
+          indexedCols = indexedCols, hasNonIndexedCols = 
rightHasNonIndexedCols)
+        // only if both left and right has non indexed cols, we can set 
hasNonIndexedCols to true.
+        // If not, we can still afford to prune files based on col stats 
lookup.
+        if (leftHasNonIndexedCols.get() && !rightHasNonIndexedCols.get()) {
+          Option(resRight)  // if only left has non indexed cols, ignore from 
the expression to be looked up in col stats df
+        } else if (!leftHasNonIndexedCols.get() && 
rightHasNonIndexedCols.get()) {
+          Option(resLeft) // if only right has non indexed cols, ignore from 
the expression to be looked up in col stats df
+        } else if (leftHasNonIndexedCols.get() && 
rightHasNonIndexedCols.get()) {
+          hasNonIndexedCols.set(true)
+          None
+        } else {
+          Option(And(resLeft, resRight))
+        }

Review Comment:
   Sounds good.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

Reply via email to