alexeykudinkin commented on a change in pull request #4996:
URL: https://github.com/apache/hudi/pull/4996#discussion_r831312871
##########
File path:
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala
##########
@@ -59,147 +62,205 @@ object DataSkippingUtils extends Logging {
}
private def tryComposeIndexFilterExpr(sourceExpr: Expression, indexSchema:
StructType): Option[Expression] = {
- def minValue(colName: String) = col(getMinColumnNameFor(colName)).expr
- def maxValue(colName: String) = col(getMaxColumnNameFor(colName)).expr
- def numNulls(colName: String) = col(getNumNullsColumnNameFor(colName)).expr
-
- def colContainsValuesEqualToLiteral(colName: String, value: Literal):
Expression =
- // Only case when column C contains value V is when min(C) <= V <= max(c)
- And(LessThanOrEqual(minValue(colName), value),
GreaterThanOrEqual(maxValue(colName), value))
-
- def colContainsOnlyValuesEqualToLiteral(colName: String, value: Literal) =
- // Only case when column C contains _only_ value V is when min(C) = V AND
max(c) = V
- And(EqualTo(minValue(colName), value), EqualTo(maxValue(colName), value))
-
+ //
+ // For translation of the Filter Expression for the Data Table into Filter
Expression for Column Stats Index, we're
+ // assuming that
+ // - The column A is queried in the Data Table (hereafter referred to
as "colA")
+ // - Filter Expression is a relational expression (ie "=", "<", "<=",
...) of the following form
+ //
+ // ```transform_expr(colA) = value_expr```
+ //
+ // Where
+ // - "transform_expr" is an expression of the _transformation_
which preserves the ordering of "colA"
+ // - "value_expr" is a "value"-expression (ie one NOT referring to
other attributes/columns or containing sub-queries)
+ //
+ // We translate the original Filter Expr into one querying the Column Stats
Index as follows: let's consider
+ // the equality Filter Expr referred to above:
+ //
+ // ```transform_expr(colA) = value_expr```
+ //
+ // This expression will be translated into the following Filter Expression for
the Column Stats Index:
+ //
+ // ```(transform_expr(colA_minValue) <= value_expr) AND (value_expr <=
transform_expr(colA_maxValue))```
Review comment:
There's a test testing exactly this use-case
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]