Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

via GitHub Mon, 06 Jan 2025 13:15:29 -0800


nsivabalan commented on code in PR #12575:
URL: https://github.com/apache/hudi/pull/12575#discussion_r1904636535



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala:
##########
@@ -42,28 +45,43 @@ object DataSkippingUtils extends Logging {
    * @param isExpressionIndex whether the index is an expression index
    * @return filter for column-stats index's table
    */
-  def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, 
indexSchema: StructType, isExpressionIndex: Boolean = false): Expression = {
+  def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, 
isExpressionIndex: Boolean = false,
+                                              indexedCols : Seq[String] = 
Seq.empty,
+                                              hasNonIndexedCols : 
AtomicBoolean = new AtomicBoolean(false)): Expression = {
     try {
-      createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, 
indexSchema, isExpressionIndex)
+      createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, 
isExpressionIndex, indexedCols,
+        hasNonIndexedCols)
     } catch {
       case e: AnalysisException =>
         logDebug(s"Failed to translated provided data table filter expr into 
column stats one ($dataTableFilterExpr)", e)
         throw e
     }
   }
 
-  private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: 
Expression, indexSchema: StructType, isExpressionIndex: Boolean = false): 
Expression = {
+  private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: 
Expression, isExpressionIndex: Boolean = false,
+                                                       indexedCols : 
Seq[String],
+                                                       hasNonIndexedCols : 
AtomicBoolean = new AtomicBoolean(false)): Expression = {
     // Try to transform original Source Table's filter expression into
     // Column-Stats Index filter expression
-    tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema, 
isExpressionIndex) match {
+    tryComposeIndexFilterExpr(dataTableFilterExpr, isExpressionIndex, 
indexedCols, hasNonIndexedCols) match {
       case Some(e) => e
       // NOTE: In case we can't transform source filter expression, we fallback
       // to {@code TrueLiteral}, to essentially avoid pruning any indexed 
files from scanning
       case None => TrueLiteral
     }
   }
 
-  private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, 
indexSchema: StructType, isExpressionIndex: Boolean = false): 
Option[Expression] = {
+  /**
+   * Composes index filter expression to be looked up with col stats index in 
MDT.
+   * For eg, a filter from source as "colA = 'abc'" will get transformed to 
"colA_minValue <= 'abc' and colA_maxValue >= 'abc'"
+   * @param sourceFilterExpr source filter expression of interest.
+   * @param isExpressionIndex true if this refers to an expression index.
+   * @param indexedCols list of columns indexed with col stats index in MDT.
+   * @param hasNonIndexedCols atomic boolean tracking if there are any non 
indexed columns.
+   * @return optionally transformed Expression. Returns None if column of 
interest it not indexed nor translatable.
+   */
+  private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, 
isExpressionIndex: Boolean = false,
+                                        indexedCols : Seq[String], 
hasNonIndexedCols : AtomicBoolean = new AtomicBoolean(false)): 
Option[Expression] = {

Review Comment:
   Yes. The schema was used only to deduce the indexed cols. Now that we have an 
explicit set of indexed cols (a Seq), we don't need the schema anymore.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8663] Look up in col stats based on indexed cols [hudi]

Reply via email to