kosiew commented on code in PR #17637:
URL: https://github.com/apache/datafusion/pull/17637#discussion_r2365290536
##########
datafusion/common/src/nested_struct.rs:
##########
@@ -162,6 +185,43 @@ pub fn cast_column(
}
}
+/// Sanitizes a `BinaryView` array so that any element containing invalid UTF-8
+/// is converted to null before casting to a UTF-8 string array.
+///
+/// This only transforms the array's values (not any external statistics). Other
+/// binary array representations are returned unchanged because Arrow's safe
+/// casts already convert invalid UTF-8 sequences to null for those types.
+pub fn sanitize_binary_array_for_utf8(array: ArrayRef) -> ArrayRef {
+ match array.data_type() {
+ DataType::BinaryView => {
+ let binary_view = array.as_binary_view();
+
+ // Check if all bytes are already valid UTF-8
Review Comment:
> the existing cast kernel handles invalid UTF-8 already

It does not handle `BinaryView`:
```rust
/// Demonstrates that Arrow's `BinaryView` -> `Utf8View` cast returns an error
/// when it encounters invalid UTF-8, both with `safe = false` and with
/// `safe = true`, unlike other binary array types, which convert invalid
/// UTF-8 to null under safe casting.
#[test]
fn test_arrow_cast_binaryview_to_utf8view_fails_with_invalid_utf8() {
    use arrow::compute::kernels::cast::{cast_with_options, CastOptions};

    // Create BinaryView with invalid UTF-8
    let binary_data = vec![
        Some("valid".as_bytes()),
        Some(&[0xf0, 0x28, 0x8c, 0x28]), // invalid UTF-8 sequence
        Some("also_valid".as_bytes()),
    ];
    let binary_view_array: ArrayRef = Arc::new(BinaryViewArray::from(binary_data));

    // Try casting with safe=false (should fail).
    // `CastOptions::default()` sets `safe: true`, so the flag must be cleared
    // explicitly for this branch to exercise the unsafe path.
    let cast_options = CastOptions {
        safe: false,
        ..Default::default()
    };
    let result =
        cast_with_options(&binary_view_array, &DataType::Utf8View, &cast_options);
    assert!(
        result.is_err(),
        "Expected BinaryView->Utf8View cast to fail with safe=false"
    );
    assert!(
        result
            .unwrap_err()
            .to_string()
            .contains("Encountered non-UTF-8 data"),
        "Error should mention non-UTF-8 data"
    );

    // Try casting with safe=true (should still fail for BinaryView!)
    let safe_cast_options = CastOptions {
        safe: true,
        ..Default::default()
    };
    let safe_result = cast_with_options(
        &binary_view_array,
        &DataType::Utf8View,
        &safe_cast_options,
    );
    assert!(
        safe_result.is_err(),
        "BinaryView->Utf8View cast fails even with safe=true (unlike other binary types)"
    );
    assert!(
        safe_result
            .unwrap_err()
            .to_string()
            .contains("Encountered non-UTF-8 data"),
        "Safe cast error should also mention non-UTF-8 data"
    );
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]