rtyler opened a new issue, #10749:
URL: https://github.com/apache/datafusion/issues/10749
### Describe the bug
When taking two `DataFrame` objects and running `except` on them, the call fails
when there are Structs in the schema, but _succeeds_ with simpler schemas.
For example, this works:
```rust
let schema = Arc::new(Schema::new(vec![Field::new(
"value",
DataType::Int32,
true),
]));
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
])
.unwrap();
let updated_batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
],
)
.unwrap();
let _ =
datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
let _ =
datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);
let ctx = SessionContext::new();
let before = ctx.read_batch(batch).expect("Failed to make
DataFrame");
let after = ctx.read_batch(updated_batch).expect("Failed to make
DataFrame");
let diff = before.except(after).expect("Failed to
except").collect().await.expect("Failed to diff");
assert_eq!(diff.len(), 1);
```
### To Reproduce
```rust
let nested_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, true),
Field::new("lat", DataType::Int32, true),
Field::new("long", DataType::Int32, true),
]));
let schema = Arc::new(Schema::new(vec![Field::new(
"value",
DataType::Int32,
true),
Field::new("nested",
DataType::Struct(nested_schema.fields.clone()),
true)
]));
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
Arc::new(StructArray::from(vec![
(
Arc::new(Field::new("id", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("lat", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("long", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
]))
])
.unwrap();
let updated_batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
Arc::new(StructArray::from(vec![
(
Arc::new(Field::new("id", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("lat", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("long", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
]))
],
)
.unwrap();
let _ =
datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
let _ =
datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);
let ctx = SessionContext::new();
let before = ctx.read_batch(batch).expect("Failed to make
DataFrame");
let after = ctx.read_batch(updated_batch).expect("Failed to make
DataFrame");
let diff = before.except(after).expect("Failed to
except").collect().await.expect("Failed to diff");
assert_eq!(diff.len(), 1);
```
### Expected behavior
I would expect the above to pass its assertions; instead, this output is produced:
```
running 2 tests
test tests::test_simple ... ok
test tests::test_with_struct ... FAILED
failures:
---- tests::test_with_struct stdout ----
+-------+--------------------------+
| value | nested |
+-------+--------------------------+
| 1 | {id: 1, lat: 1, long: 1} |
| 2 | {id: 2, lat: 2, long: 2} |
| 3 | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
+-------+--------------------------+
| value | nested |
+-------+--------------------------+
| 1 | {id: 1, lat: 1, long: 1} |
| 12 | {id: 2, lat: 2, long: 2} |
| 3 | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
thread 'tests::test_with_struct' panicked at except-df-bug/src/lib.rs:74:84:
Failed to diff: ArrowError(InvalidArgumentError("Invalid comparison
operation: Struct([Field { name: \"id\", data_type: Int32, nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\",
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0,
dict_is_ordered: false, metadata: {} }]) IS NOT DISTINCT FROM Struct([Field {
name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered:
false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\",
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }])"), None)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
failures:
tests::test_with_struct
test result: FAILED. 1 passed; 1 failed; 0 ignored; 0 measured; 0 filtered
out; finished in 0.01s
```
### Additional context
_No response_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]