nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904637258
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala:
##########
@@ -270,35 +344,63 @@ object DataSkippingUtils extends Logging {
// lexicographically, we essentially need to check that the provided
// literal falls within the min/max bounds of the given column
case StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), v
@ Literal(_: UTF8String, _)) =>
- getTargetIndexedColumnName(attrRef, indexSchema)
+ getTargetIndexedColumnName(attrRef, indexedCols)
.map { colName =>
val targetExprBuilder: Expression => Expression =
swapAttributeRefInExpr(sourceExpr, attrRef, _)
genColumnValuesEqualToExpression(colName, v, targetExprBuilder)
- }
+ }.orElse({
+ hasNonIndexedCols.set(true)
+ Option.empty
+ })
// Filter "expr(colA) not like 'xxx%'"
// Translates to "NOT(expr(colA_minValue) like 'xxx%' AND
// expr(colA_maxValue) like 'xxx%')" for index lookup
// NOTE: This is NOT an inversion of "colA like xxx"
case Not(StartsWith(sourceExpr @
AllowedTransformationExpression(attrRef), value @ Literal(_: UTF8String, _))) =>
- getTargetIndexedColumnName(attrRef, indexSchema)
+ getTargetIndexedColumnName(attrRef, indexedCols)
.map { colName =>
val targetExprBuilder: Expression => Expression =
swapAttributeRefInExpr(sourceExpr, attrRef, _)
val minValueExpr =
targetExprBuilder.apply(genColMinValueExpr(colName))
val maxValueExpr =
targetExprBuilder.apply(genColMaxValueExpr(colName))
Not(And(StartsWith(minValueExpr, value), StartsWith(maxValueExpr,
value)))
- }
+ }.orElse({
+ hasNonIndexedCols.set(true)
+ Option.empty
+ })
case or: Or =>
- val resLeft = createColumnStatsIndexFilterExprInternal(or.left,
indexSchema)
- val resRight = createColumnStatsIndexFilterExprInternal(or.right,
indexSchema)
-
- Option(Or(resLeft, resRight))
+ val leftHasNonIndexedCols = new AtomicBoolean(false)
+ val resLeft = createColumnStatsIndexFilterExprInternal(or.left,
isExpressionIndex = isExpressionIndex,
+ indexedCols = indexedCols, hasNonIndexedCols = leftHasNonIndexedCols)
+ val rightHasNonIndexedCols = new AtomicBoolean(false)
+ val resRight = createColumnStatsIndexFilterExprInternal(or.right,
isExpressionIndex = isExpressionIndex,
+ indexedCols = indexedCols, hasNonIndexedCols =
rightHasNonIndexedCols)
+ if (leftHasNonIndexedCols.get() || rightHasNonIndexedCols.get()) {
+ hasNonIndexedCols.set(true)
+ None
+ } else {
+ Option(Or(resLeft, resRight))
+ }
case and: And =>
- val resLeft = createColumnStatsIndexFilterExprInternal(and.left,
indexSchema)
- val resRight = createColumnStatsIndexFilterExprInternal(and.right,
indexSchema)
-
- Option(And(resLeft, resRight))
+ val leftHasNonIndexedCols = new AtomicBoolean(false)
+ val resLeft = createColumnStatsIndexFilterExprInternal(and.left,
isExpressionIndex = isExpressionIndex,
+ indexedCols = indexedCols, hasNonIndexedCols = leftHasNonIndexedCols)
+ val rightHasNonIndexedCols = new AtomicBoolean(false)
+ val resRight = createColumnStatsIndexFilterExprInternal(and.right,
isExpressionIndex = isExpressionIndex,
+ indexedCols = indexedCols, hasNonIndexedCols =
rightHasNonIndexedCols)
+ // Only when both the left and the right side have non-indexed cols do we set
+ // hasNonIndexedCols to true. Otherwise, we can still prune files based on the
+ // col stats lookup using the side that is fully indexed.
+ if (leftHasNonIndexedCols.get() && !rightHasNonIndexedCols.get()) {
+ Option(resRight) // if only left has non indexed cols, ignore from
the expression to be looked up in col stats df
+ } else if (!leftHasNonIndexedCols.get() &&
rightHasNonIndexedCols.get()) {
+ Option(resLeft) // if only right has non indexed cols, ignore from
the expression to be looked up in col stats df
+ } else if (leftHasNonIndexedCols.get() &&
rightHasNonIndexedCols.get()) {
+ hasNonIndexedCols.set(true)
+ None
+ } else {
+ Option(And(resLeft, resRight))
+ }
Review Comment:
Sounds good.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]