rtyler opened a new issue, #10749: URL: https://github.com/apache/datafusion/issues/10749
### Describe the bug When taking two `DataFrame` objects and running `except`, the function fails when there are Structs in the schema, but _succeeds_ with simpler schemas. For example, this works: ```rust let schema = Arc::new(Schema::new(vec![Field::new( "value", DataType::Int32, true), ])); let batch = RecordBatch::try_new( Arc::clone(&schema), vec![ Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), ]) .unwrap(); let updated_batch = RecordBatch::try_new( Arc::clone(&schema), vec![ Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), ], ) .unwrap(); let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]); let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]); let ctx = SessionContext::new(); let before = ctx.read_batch(batch).expect("Failed to make DataFrame"); let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame"); let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff"); assert_eq!(diff.len(), 1); ``` ### To Reproduce ```rust let nested_schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, true), Field::new("lat", DataType::Int32, true), Field::new("long", DataType::Int32, true), ])); let schema = Arc::new(Schema::new(vec![Field::new( "value", DataType::Int32, true), Field::new("nested", DataType::Struct(nested_schema.fields.clone()), true) ])); let batch = RecordBatch::try_new( Arc::clone(&schema), vec![ Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), Arc::new(StructArray::from(vec![ ( Arc::new(Field::new("id", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ( Arc::new(Field::new("lat", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ( Arc::new(Field::new("long", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ])) ]) .unwrap(); let updated_batch = RecordBatch::try_new( Arc::clone(&schema), vec![ 
Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])), Arc::new(StructArray::from(vec![ ( Arc::new(Field::new("id", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ( Arc::new(Field::new("lat", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ( Arc::new(Field::new("long", DataType::Int32, true)), Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef ), ])) ], ) .unwrap(); let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]); let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]); let ctx = SessionContext::new(); let before = ctx.read_batch(batch).expect("Failed to make DataFrame"); let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame"); let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff"); assert_eq!(diff.len(), 1); ``` ### Expected behavior I would expect the above to pass its assertions; instead, this output is produced: ``` running 2 tests test tests::test_simple ... ok test tests::test_with_struct ... 
FAILED failures: ---- tests::test_with_struct stdout ---- +-------+--------------------------+ | value | nested | +-------+--------------------------+ | 1 | {id: 1, lat: 1, long: 1} | | 2 | {id: 2, lat: 2, long: 2} | | 3 | {id: 3, lat: 3, long: 3} | +-------+--------------------------+ +-------+--------------------------+ | value | nested | +-------+--------------------------+ | 1 | {id: 1, lat: 1, long: 1} | | 12 | {id: 2, lat: 2, long: 2} | | 3 | {id: 3, lat: 3, long: 3} | +-------+--------------------------+ thread 'tests::test_with_struct' panicked at except-df-bug/src/lib.rs:74:84: Failed to diff: ArrowError(InvalidArgumentError("Invalid comparison operation: Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) IS NOT DISTINCT FROM Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }])"), None) note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace failures: tests::test_with_struct test result: FAILED. 1 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.01s ``` ### Additional context _No response_ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org