nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904636535
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala:
##########
@@ -42,28 +45,43 @@ object DataSkippingUtils extends Logging {
* @param isExpressionIndex whether the index is an expression index
* @return filter for column-stats index's table
*/
- def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression,
indexSchema: StructType, isExpressionIndex: Boolean = false): Expression = {
+ def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression,
isExpressionIndex: Boolean = false,
+ indexedCols : Seq[String] =
Seq.empty,
+ hasNonIndexedCols :
AtomicBoolean = new AtomicBoolean(false)): Expression = {
try {
- createColumnStatsIndexFilterExprInternal(dataTableFilterExpr,
indexSchema, isExpressionIndex)
+ createColumnStatsIndexFilterExprInternal(dataTableFilterExpr,
isExpressionIndex, indexedCols,
+ hasNonIndexedCols)
} catch {
case e: AnalysisException =>
logDebug(s"Failed to translated provided data table filter expr into
column stats one ($dataTableFilterExpr)", e)
throw e
}
}
- private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr:
Expression, indexSchema: StructType, isExpressionIndex: Boolean = false):
Expression = {
+ private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr:
Expression, isExpressionIndex: Boolean = false,
+ indexedCols :
Seq[String],
+ hasNonIndexedCols :
AtomicBoolean = new AtomicBoolean(false)): Expression = {
// Try to transform original Source Table's filter expression into
// Column-Stats Index filter expression
- tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema,
isExpressionIndex) match {
+ tryComposeIndexFilterExpr(dataTableFilterExpr, isExpressionIndex,
indexedCols, hasNonIndexedCols) match {
case Some(e) => e
// NOTE: In case we can't transform source filter expression, we fallback
// to {@code TrueLiteral}, to essentially avoid pruning any indexed
files from scanning
case None => TrueLiteral
}
}
- private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression,
indexSchema: StructType, isExpressionIndex: Boolean = false):
Option[Expression] = {
+ /**
+ * Composes index filter expression to be looked up with col stats index in
MDT.
+ * For eg, a filter from source as "colA = 'abc'" will get transformed to
"colA_minValue <= 'abc' and colA_maxValue >= 'abc'"
+ * @param sourceFilterExpr source filter expression of interest.
+ * @param isExpressionIndex true if this refers to an expression index.
+ * @param indexedCols list of columns indexed with col stats index in MDT.
+ * @param hasNonIndexedCols atomic boolean tracking if there are any non
indexed columns.
+ * @return optionally transformed Expression. Returns None if column of
interest is not indexed nor translatable.
+ */
+ private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression,
isExpressionIndex: Boolean = false,
+ indexedCols : Seq[String],
hasNonIndexedCols : AtomicBoolean = new AtomicBoolean(false)):
Option[Expression] = {
Review Comment:
Yes. The schema was only used to deduce the indexed columns. Now that we have
an explicit Seq of indexed columns, we don't need the schema anymore.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]