[I] Simplify expressions swallows a cast expression [datafusion]

via GitHub Tue, 19 Nov 2024 02:38:03 -0800


gruuya opened a new issue, #13481:
URL: https://github.com/apache/datafusion/issues/13481


   ### Describe the bug
   
   It seems `SimplifyExpressions` will sometimes omit a cast that was 
introduced by the `TypeCoercion` analyzer rule, leading to a loss of 
precision in the output `DataType`.
   
   ### To Reproduce
   
   Run 
   
   ```rust
       #[test]
       fn test_simplify_case_cast_list() {
           use datafusion_functions_nested::expr_fn::make_array;
   
           let element_field = Arc::new(Field::new("element", DataType::Int32, 
true));
           let expr = case(col("condition"))
               .when(
                   lit(false),
                   cast(
                       make_array(vec![lit(2), lit(3)]),
                       DataType::List(element_field.clone()),
                   ),
               )
               .otherwise(col("items"))
               .unwrap()
               .alias("items");
   
           let expected = case(col("condition"))
               .when(
                   lit(false),
                   cast(
                       lit(ScalarValue::List(ScalarValue::new_list_nullable(
                           &[ScalarValue::Int32(Some(2)), 
ScalarValue::Int32(Some(3))],
                           &DataType::Int32,
                       ))),
                       DataType::List(element_field),
                   ),
               )
               .otherwise(col("items"))
               .unwrap()
               .alias("items");
   
           assert_eq!(simplify(expr), expected);
       }
   ```
   
   It will fail because the cast clause gets (unexpectedly) stripped:
   ```rust
   assertion `left == right` failed
     left: Alias(Alias { expr: Case(Case { expr: Some(Column(Column { relation: 
None, name: "condition" })), when_then_expr: [(Literal(Boolean(false)), 
Literal(List([2, 3])))], else_expr: Some(Column(Column { relation: None, name: 
"items" })) }), relation: None, name: "items" })
    right: Alias(Alias { expr: Case(Case { expr: Some(Column(Column { relation: 
None, name: "condition" })), when_then_expr: [(Literal(Boolean(false)), 
Cast(Cast { expr: Literal(List([2, 3])), data_type: List(Field { name: 
"element", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }) }))], else_expr: Some(Column(Column { relation: None, 
name: "items" })) }), relation: None, name: "items" })
   ```
   
   ## Effects
   
   I believe this in turn causes https://github.com/delta-io/delta-rs/pull/2886.
   
   Basically `TypeCoercion` injects the CAST, while `SimplifyExpressions` omits 
it. Consequently `OptimizeProjections` folds the two projections, but the 
schema is computed from the outermost projection expressions, which now lack 
the proper type info (i.e. the non-standard list item field names).
   
   Minimal repro:
   
   ```rust
       #[tokio::test]
       async fn test_list_item() -> Result<()> {
           use datafusion_functions_nested::expr_fn::make_array;
   
           let element_field = Arc::new(Field::new("element", DataType::Int32, 
true));
           let items_field = Field::new(
               "items",
               DataType::List(element_field.clone()),
               true,
           );
           let schema = Schema::new(vec![items_field.clone()]);
   
           let mut items_builder =
               
ListBuilder::new(Int32Builder::new()).with_field(element_field.clone());
           items_builder.append_value([Some(1)]);
           let batch = RecordBatch::try_new(Arc::new(schema), 
vec![Arc::new(items_builder.finish())])?;
   
           let ctx = SessionContext::new();
           let df = ctx.read_batch(batch).expect("source DataFrame")
               .with_column("condition", lit(false))?
               .select(vec![case(col("condition")).when(lit(false), 
make_array(vec![lit(2), lit(3)])).otherwise(col("items"))?.alias("items")])?;
   
   
           let _ = df.create_physical_plan().await?;
   
           Ok(())
       }
   ```
   
   Fails with `Error: Context("Optimizer rule 'optimize_projections' failed", 
Context("optimize_projections", Internal("Failed due to a difference in 
schemas, original schema: DFSchema { inner: Schema { fields: [Field { name: 
\"items\", data_type: List(Field { name: \"element\", data_type: Int32, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, 
field_qualifiers: [None], functional_dependencies: FunctionalDependencies { 
deps: [] } }, new schema: DFSchema { inner: Schema { fields: [Field { name: 
\"items\", data_type: List(Field { name: \"item\", data_type: Int32, nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, 
field_qualifiers: [None], functional_dependencies: FunctionalDependencies { 
deps: [] } }")))`
   
   ### Expected behavior
   
   `SimplifyExpressions` should retain the CAST, and there should be no 
optimizer errors in the second (high-level) example.
   
   ### Additional context
   
   _No response_


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[I] Simplify expressions swallows a cast expression [datafusion]

Reply via email to