This is an automated email from the ASF dual-hosted git repository. findepi pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new fdc54b7a98 Fix predicate simplification for incompatible types in push_down_filter (#17521) fdc54b7a98 is described below commit fdc54b7a9891c4ebe9722778569354fb4ac34b00 Author: Adrian Garcia Badaracco <1755071+adria...@users.noreply.github.com> AuthorDate: Fri Sep 12 08:14:59 2025 -0500 Fix predicate simplification for incompatible types in push_down_filter (#17521) * Fix predicate simplification for incompatible types in push_down_filter Handle the case where scalar values with incompatible types (e.g., TimestampMicrosecond vs Time64Nanosecond) cannot be compared during predicate simplification. The fix catches the comparison error in find_most_restrictive_predicate and returns None to indicate we can't simplify when types are incompatible, preventing the "Uncomparable values" error. Fixes #17512 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <nore...@anthropic.com> * minimize diff * use proposed fix, add test * revert other changes --------- Co-authored-by: Claude <nore...@anthropic.com> --- .../simplify_expressions/simplify_predicates.rs | 97 +++++++++++++++++++++- .../sqllogictest/test_files/push_down_filter.slt | 25 ++++++ 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs b/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs index 9d9e840636..131404e607 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs @@ -26,7 +26,7 @@ //! encompasses the former, resulting in fewer checks during query execution. use datafusion_common::{Column, Result, ScalarValue}; -use datafusion_expr::{BinaryExpr, Cast, Expr, Operator}; +use datafusion_expr::{BinaryExpr, Expr, Operator}; use std::collections::BTreeMap; /// Simplifies a list of predicates by removing redundancies. @@ -239,8 +239,99 @@ fn find_most_restrictive_predicate( fn extract_column_from_expr(expr: &Expr) -> Option<Column> { match expr { Expr::Column(col) => Some(col.clone()), - // Handle cases where the column might be wrapped in a cast or other operation - Expr::Cast(Cast { expr, .. }) => extract_column_from_expr(expr), _ => None, } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::DataType; + use datafusion_expr::{cast, col, lit}; + + #[test] + fn test_simplify_predicates_with_cast() { + // Test that predicates on cast expressions are not grouped with predicates on the raw column + // a < 5 AND CAST(a AS varchar) < 'abc' AND a < 6 + // Should simplify to: + // a < 5 AND CAST(a AS varchar) < 'abc' + + let predicates = vec![ + col("a").lt(lit(5i32)), + cast(col("a"), DataType::Utf8).lt(lit("abc")), + col("a").lt(lit(6i32)), + ]; + + let result = simplify_predicates(predicates).unwrap(); + + // Should have 2 predicates: a < 5 and CAST(a AS varchar) < 'abc' + assert_eq!(result.len(), 2); + + // Check that the cast predicate is preserved + let has_cast_predicate = result.iter().any(|p| { + matches!(p, Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Lt, + right + }) if matches!(left.as_ref(), Expr::Cast(_)) && right == &Box::new(lit("abc"))) + }); + assert!(has_cast_predicate, "Cast predicate should be preserved"); + + // Check that we have the more restrictive column predicate (a < 5) + let has_column_predicate = result.iter().any(|p| { + matches!(p, Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Lt, + right + }) if left == &Box::new(col("a")) && right == &Box::new(lit(5i32))) + }); + assert!(has_column_predicate, "Should have a < 5 predicate"); + } + + #[test] + fn test_extract_column_ignores_cast() { + // Test that extract_column_from_expr does not extract columns from cast expressions + let cast_expr = cast(col("a"), DataType::Utf8); + assert_eq!(extract_column_from_expr(&cast_expr), None); + + // Test that it still extracts from direct column references + let col_expr = col("a"); + assert_eq!(extract_column_from_expr(&col_expr), Some(Column::from("a"))); + } + + #[test] + fn test_simplify_predicates_direct_columns_only() { + // Test that only predicates on direct columns are simplified together + let predicates = vec![ + col("a").lt(lit(5i32)), + col("a").lt(lit(3i32)), + col("b").gt(lit(10i32)), + col("b").gt(lit(20i32)), + ]; + + let result = simplify_predicates(predicates).unwrap(); + + // Should have 2 predicates: a < 3 and b > 20 (most restrictive for each column) + assert_eq!(result.len(), 2); + + // Check for a < 3 + let has_a_predicate = result.iter().any(|p| { + matches!(p, Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Lt, + right + }) if left == &Box::new(col("a")) && right == &Box::new(lit(3i32))) + }); + assert!(has_a_predicate, "Should have a < 3 predicate"); + + // Check for b > 20 + let has_b_predicate = result.iter().any(|p| { + matches!(p, Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Gt, + right + }) if left == &Box::new(col("b")) && right == &Box::new(lit(20i32))) + }); + assert!(has_b_predicate, "Should have b > 20 predicate"); + } +} diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index a7e3338df3..3a6faf4654 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -395,3 +395,28 @@ order by t1.k, t2.v; ---- 1 1 1 10000000 10000000 10000000 + +# Regression test for https://github.com/apache/datafusion/issues/17512 + +query I +COPY ( + SELECT arrow_cast('2025-01-01T00:00:00Z'::timestamptz, 'Timestamp(Microsecond, Some("UTC"))') AS start_timestamp +) +TO 'test_files/scratch/push_down_filter/17512.parquet' +STORED AS PARQUET; +---- +1 + +statement ok +CREATE EXTERNAL TABLE records STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter/17512.parquet'; + +query I +SELECT 1 +FROM ( + SELECT start_timestamp + FROM records + WHERE start_timestamp <= '2025-01-01T00:00:00Z'::timestamptz +) AS t +WHERE t.start_timestamp::time < '00:00:01'::time; +---- +1 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org