huan233usc commented on code in PR #2592:
URL: https://github.com/apache/iceberg-rust/pull/2592#discussion_r3400080817
##########
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs:
##########
@@ -225,17 +225,115 @@ fn to_iceberg_operation(op: Operator) ->
OpTransformedResult {
/// identified by name at runtime, so we need to handle them here.
fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) ->
TransformedResult {
match func_name {
- // TODO: support complex expression arguments to scalar functions
- "isnan" if args.len() == 1 => {
- let operand = to_iceberg_predicate(&args[0]);
- match operand {
- TransformedResult::Column(r) =>
TransformedResult::Predicate(Predicate::Unary(
- UnaryExpression::new(PredicateOperator::IsNan, r),
- )),
- _ => TransformedResult::NotTransformed,
+ "isnan" if args.len() == 1 => match
resolve_nan_preserving_reference(&args[0]) {
+ Some(r) => TransformedResult::Predicate(r.is_nan()),
+ None => TransformedResult::NotTransformed,
+ },
+ _ => TransformedResult::NotTransformed,
+ }
+}
+
+/// Attempts to resolve a numeric expression argument down to a single column
+/// [`Reference`] such that `isnan(arg)` is logically equivalent to
+/// `isnan(reference)`.
+///
+/// Filter pushdown is reported as `Inexact` (see
+/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion
+/// re-applies the original predicate after scanning. We therefore only need
the
+/// pushed-down predicate to be implied by the original filter (it may match
+/// extra rows, but must never drop a matching one). Every transformation
handled
+/// here preserves NaN-ness *exactly* — the result is NaN if and only if the
+/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are
sound:
+///
+/// * negation: `-x` is NaN iff `x` is NaN
+/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN
+/// * casts between numeric types preserve NaN
+/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c`
+/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c`
+///
+/// Multiplication/division by zero and `c / x` are intentionally rejected:
e.g.
+/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN.
+///
+/// [`IcebergTableProvider::supports_filters_pushdown`]:
crate::table::IcebergTableProvider
+fn resolve_nan_preserving_reference(expr: &Expr) -> Option<Reference> {
+ match expr {
+ Expr::Column(column) => Some(Reference::new(column.name())),
+ Expr::Negative(inner) => resolve_nan_preserving_reference(inner),
+ Expr::Cast(cast) => {
+ // Casts to date truncate the value and are not numeric, so they
+ // cannot be treated as NaN-preserving.
+ if cast.data_type == DataType::Date32 || cast.data_type ==
DataType::Date64 {
+ return None;
}
+ resolve_nan_preserving_reference(&cast.expr)
}
- _ => TransformedResult::NotTransformed,
+ Expr::ScalarFunction(ScalarFunction { func, args })
+ if func.name() == "abs" && args.len() == 1 =>
+ {
+ resolve_nan_preserving_reference(&args[0])
+ }
+ Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary),
+ _ => None,
+ }
+}
+
+/// Resolves the column reference from an arithmetic expression that combines a
+/// single column with a finite literal while preserving NaN-ness. See
+/// [`resolve_nan_preserving_reference`] for the soundness argument.
+fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option<Reference> {
Review Comment:
Yes, `(x + 1) * (x - 2)` won't be pushed down in this PR.
It requires more work to support it safely. To support that we need to:
1. Recurse into both operands and confirm they reduce to the same single
column — otherwise e.g. `x + y` reduces to two different columns and can't be
expressed as a single `col IS NAN` (it would need `x IS NAN OR y IS NAN`, and
picking just one would drop rows).
2. Verify the operator combination is NaN-preserving (result is NaN iff that
column is NaN) — e.g. `(x + 1) * (x - 2)` is NaN iff `x` is NaN, so it's safe,
but `(x + 1) - (x - 2)` is NaN when `x = inf` (`inf - inf`) even though `x`
isn't, so it is not.
Maybe we could do it in the follow up, but iiuc Spark won't push down it as
well so in practice this might be a little bit less seen.
##########
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs:
##########
@@ -225,17 +225,115 @@ fn to_iceberg_operation(op: Operator) ->
OpTransformedResult {
/// identified by name at runtime, so we need to handle them here.
fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) ->
TransformedResult {
match func_name {
- // TODO: support complex expression arguments to scalar functions
- "isnan" if args.len() == 1 => {
- let operand = to_iceberg_predicate(&args[0]);
- match operand {
- TransformedResult::Column(r) =>
TransformedResult::Predicate(Predicate::Unary(
- UnaryExpression::new(PredicateOperator::IsNan, r),
- )),
- _ => TransformedResult::NotTransformed,
+ "isnan" if args.len() == 1 => match
resolve_nan_preserving_reference(&args[0]) {
+ Some(r) => TransformedResult::Predicate(r.is_nan()),
+ None => TransformedResult::NotTransformed,
+ },
+ _ => TransformedResult::NotTransformed,
+ }
+}
+
+/// Attempts to resolve a numeric expression argument down to a single column
+/// [`Reference`] such that `isnan(arg)` is logically equivalent to
+/// `isnan(reference)`.
+///
+/// Filter pushdown is reported as `Inexact` (see
+/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion
+/// re-applies the original predicate after scanning. We therefore only need
the
+/// pushed-down predicate to be implied by the original filter (it may match
+/// extra rows, but must never drop a matching one). Every transformation
handled
+/// here preserves NaN-ness *exactly* — the result is NaN if and only if the
+/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are
sound:
+///
+/// * negation: `-x` is NaN iff `x` is NaN
+/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN
+/// * casts between numeric types preserve NaN
+/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c`
+/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c`
+///
+/// Multiplication/division by zero and `c / x` are intentionally rejected:
e.g.
+/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN.
+///
+/// [`IcebergTableProvider::supports_filters_pushdown`]:
crate::table::IcebergTableProvider
+fn resolve_nan_preserving_reference(expr: &Expr) -> Option<Reference> {
+ match expr {
+ Expr::Column(column) => Some(Reference::new(column.name())),
+ Expr::Negative(inner) => resolve_nan_preserving_reference(inner),
+ Expr::Cast(cast) => {
+ // Casts to date truncate the value and are not numeric, so they
+ // cannot be treated as NaN-preserving.
+ if cast.data_type == DataType::Date32 || cast.data_type ==
DataType::Date64 {
+ return None;
}
+ resolve_nan_preserving_reference(&cast.expr)
}
- _ => TransformedResult::NotTransformed,
+ Expr::ScalarFunction(ScalarFunction { func, args })
+ if func.name() == "abs" && args.len() == 1 =>
+ {
+ resolve_nan_preserving_reference(&args[0])
+ }
+ Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary),
+ _ => None,
+ }
+}
+
+/// Resolves the column reference from an arithmetic expression that combines a
+/// single column with a finite literal while preserving NaN-ness. See
+/// [`resolve_nan_preserving_reference`] for the soundness argument.
+fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option<Reference> {
+ let (left, right) = (&binary.left, &binary.right);
+ match binary.op {
+ // `x + c`, `c + x`, `x - c` and `c - x` are NaN iff `x` is NaN, for
any
+ // finite literal `c`. The column may be on either side.
+ Operator::Plus | Operator::Minus => {
+ if finite_literal(right).is_some() {
+ resolve_nan_preserving_reference(left)
+ } else if finite_literal(left).is_some() {
+ resolve_nan_preserving_reference(right)
+ } else {
+ None
+ }
+ }
+
+ // `x * c` and `c * x` are NaN iff `x` is NaN, but only when `c` is
+ // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column
may
+ // be on either side.
+ Operator::Multiply => {
+ if matches!(finite_literal(right), Some(c) if c != 0.0) {
+ resolve_nan_preserving_reference(left)
+ } else if matches!(finite_literal(left), Some(c) if c != 0.0) {
+ resolve_nan_preserving_reference(right)
+ } else {
+ None
+ }
+ }
+
+ // `x / c` is NaN iff `x` is NaN, for a finite non-zero literal `c`.
+ // `c / x` is rejected because it is not NaN-preserving (e.g. `0 / 0`
is
+ // NaN while `0` is not), so the column must be the dividend (left
side).
Review Comment:
Reworded in a7e81bc6 to spell out the IEEE-754 facts (0 is not NaN, but 0 /
0 is NaN).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]