This is an automated email from the ASF dual-hosted git repository.

liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 4f5274a8a feat(datafusion): support isnan predicate pushdown to Iceberg (#2142)
4f5274a8a is described below

commit 4f5274a8afd835e746167e48683207b71b20115a
Author: Kaiqi Dong <[email protected]>
AuthorDate: Thu Feb 26 02:38:34 2026 +0100

    feat(datafusion): support isnan predicate pushdown to Iceberg (#2142)
    
    ## Which issue does this PR close?
    
    Part of the ongoing effort to improve predicate pushdown coverage in the
    DataFusion integration.
    
    - Closes #2143
    
    ## What changes are included in this PR?
    
    This PR adds support for pushing down `isnan()` predicates from
    DataFusion to Iceberg's native `IsNan` / `NotNan` predicate operators.
    
    In DataFusion, `isnan()` is represented as a scalar function
    (`Expr::ScalarFunction`) rather than a dedicated `Expr` variant (unlike
    `IsNull` / `IsNotNull`). This PR introduces a new
    `scalar_function_to_iceberg_predicate` helper in `expr_to_predicate.rs`
    that matches scalar functions by name at runtime and converts
    `isnan(col)` into `Predicate::Unary(IsNan, col)`.
    
    Negation (`NOT isnan(col)`) is handled automatically: the existing
    `Expr::Not` arm wraps the result in `Predicate::Not(...)`, and Iceberg's
    downstream `rewrite_not` visitor normalizes it into
    `Predicate::Unary(NotNan, col)`.
    
    This enables file pruning using `nan_value_counts` in manifest metadata
    for float/double columns, as defined in the [Iceberg spec — Manifest
    Files: field summaries and column
    statistics](https://iceberg.apache.org/spec/#manifests).
    
    
    ## Are these changes tested?
    
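    Yes. New unit tests in `expr_to_predicate.rs` cover `isnan(col)`,
    `NOT isnan(col)`, `isnan(col)` combined with another condition via `AND`,
    and `isnan` applied to a non-column expression (which is not pushed down).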
---
 .../src/physical_plan/expr_to_predicate.rs         | 58 ++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git 
a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs 
b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
index 9f37345f8..17c9416d5 100644
--- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
+++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
@@ -18,6 +18,7 @@
 use std::vec;
 
 use datafusion::arrow::datatypes::DataType;
+use datafusion::logical_expr::expr::ScalarFunction;
 use datafusion::logical_expr::{Expr, Like, Operator};
 use datafusion::scalar::ScalarValue;
 use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, 
UnaryExpression};
@@ -196,6 +197,9 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult {
                 TransformedResult::NotTransformed
             }
         }
+        Expr::ScalarFunction(ScalarFunction { func, args }) => {
+            scalar_function_to_iceberg_predicate(func.name(), args)
+        }
         _ => TransformedResult::NotTransformed,
     }
 }
@@ -216,6 +220,25 @@ fn to_iceberg_operation(op: Operator) -> 
OpTransformedResult {
     }
 }
 
+/// Translates a DataFusion scalar function into an Iceberg predicate.
+/// Unlike dedicated Expr variants (e.g. `Expr::IsNull`), scalar functions are
+/// identified by name at runtime, so we need to handle them here.
+///
+/// Only a function named exactly `isnan` with exactly one argument is handled,
+/// and only when that argument converts to `TransformedResult::Column`. In that
+/// case the result is a unary `PredicateOperator::IsNan` predicate on the
+/// column. Any other name, argument count, or argument shape yields
+/// `TransformedResult::NotTransformed`.
+fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> 
TransformedResult {
+    match func_name {
+        // TODO: support complex expression arguments to scalar functions
+        "isnan" if args.len() == 1 => {
+            // The guard above guarantees that `args[0]` exists.
+            let operand = to_iceberg_predicate(&args[0]);
+            match operand {
+                // A plain column reference becomes `IsNan(column)`.
+                TransformedResult::Column(r) => 
TransformedResult::Predicate(Predicate::Unary(
+                    UnaryExpression::new(PredicateOperator::IsNan, r),
+                )),
+                // Anything other than a column reference is not converted.
+                _ => TransformedResult::NotTransformed,
+            }
+        }
+        // Every other scalar function is left untransformed.
+        _ => TransformedResult::NotTransformed,
+    }
+}
+
 fn to_iceberg_and_predicate(
     left: TransformedResult,
     right: TransformedResult,
@@ -324,6 +347,10 @@ mod tests {
             Field::new("ts", DataType::Timestamp(TimeUnit::Second, None), 
true).with_metadata(
                 HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), 
"3".to_string())]),
             ),
+            Field::new("qux", DataType::Float64, 
true).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                "4".to_string(),
+            )])),
         ]);
         DFSchema::try_from_qualified_schema("my_table", &arrow_schema).unwrap()
     }
@@ -681,4 +708,35 @@ mod tests {
             Reference::new("bar").starts_with(Datum::string("测试"))
         );
     }
+
+    // `isnan(qux)` on a bare column must convert to the `is_nan` predicate on
+    // the `qux` reference.
+    #[test]
+    fn test_predicate_conversion_with_isnan() {
+        let predicate = convert_to_iceberg_predicate("isnan(qux)").unwrap();
+        assert_eq!(predicate, Reference::new("qux").is_nan());
+    }
+
+    // `NOT isnan(qux)` must convert to the negation (`!`) of the `is_nan`
+    // predicate on the `qux` reference.
+    #[test]
+    fn test_predicate_conversion_with_not_isnan() {
+        let predicate = convert_to_iceberg_predicate("NOT 
isnan(qux)").unwrap();
+        assert_eq!(predicate, !Reference::new("qux").is_nan());
+    }
+
+    // An `isnan` call joined to a comparison with `AND` must convert to the
+    // conjunction of both Iceberg predicates.
+    #[test]
+    fn test_predicate_conversion_with_isnan_and_other_condition() {
+        let sql = "isnan(qux) AND foo > 1";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        // Expected: `is_nan(qux)` AND `foo > 1`, in that order.
+        let expected_predicate = Predicate::and(
+            Reference::new("qux").is_nan(),
+            Reference::new("foo").greater_than(Datum::long(1)),
+        );
+        assert_eq!(predicate, expected_predicate);
+    }
+
+    // When the `isnan` argument is not a plain column reference, conversion must
+    // produce no predicate at all.
+    #[test]
+    fn test_predicate_conversion_with_isnan_unsupported_arg() {
+        // isnan on a complex expression (not a bare column) cannot be pushed 
down
+        let sql = "isnan(qux + 1)";
+        let predicate = convert_to_iceberg_predicate(sql);
+        // `None` signals that nothing was converted for pushdown.
+        assert_eq!(predicate, None);
+    }
 }

Reply via email to