alamb commented on code in PR #8415:
URL: https://github.com/apache/arrow-rs/pull/8415#discussion_r2373103373
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -6376,6 +6374,43 @@ mod tests {
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
}
+ #[test]
+ fn test_binary_view_to_string_view_with_invalid_utf8() {
+ let binary_view_array = BinaryViewArray::from_iter(vec![
+ Some("valid".as_bytes()),
+ Some(&[0xff]),
+ Some("utf8".as_bytes()),
+ None,
+ ]);
+
+ let strict_options = CastOptions {
+ safe: false,
+ ..Default::default()
+ };
+
+ assert!(
+ cast_with_options(&binary_view_array, &DataType::Utf8View,
&strict_options).is_err()
+ );
+
+ let safe_options = CastOptions {
+ safe: true,
+ ..Default::default()
+ };
+
+ let string_view_array =
+ cast_with_options(&binary_view_array, &DataType::Utf8View,
&safe_options).unwrap();
+ assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+ let values: Vec<_> = string_view_array
+ .as_any()
+ .downcast_ref::<StringViewArray>()
+ .unwrap()
+ .iter()
+ .collect();
+
+ assert_eq!(values, vec![Some("valid"), None, Some("utf8"), None]);
Review Comment:
```suggestion
let values: Vec<_> = string_view_array.as_string_view()
.iter()
.collect();
assert_eq!(values, vec![Some("valid"), None, Some("utf8"), None]);
```
##########
arrow-cast/src/cast/string.rs:
##########
@@ -356,18 +364,33 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
let mut builder =
GenericStringBuilder::<O>::with_capacity(array.len(),
array.value_data().len());
- let iter = array
- .iter()
- .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
- builder.extend(iter);
+ extend_valid_utf8(&mut builder, array.iter());
Ok(Arc::new(builder.finish()))
}
false => Err(e),
},
}
}
+pub(crate) fn cast_binary_view_to_string_view(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
+
+ match array.clone().to_string_view() {
+ Ok(result) => Ok(Arc::new(result)),
+ Err(error) => match cast_options.safe {
Review Comment:
It would be nice to avoid performing the conversion twice when there is
non-UTF-8 data. However, the nice thing about the current implementation is
that I don't think it will regress performance: it only takes the slower path
when we know for sure that there is non-UTF-8 data.
##########
arrow-cast/src/cast/string.rs:
##########
@@ -356,18 +364,33 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
let mut builder =
GenericStringBuilder::<O>::with_capacity(array.len(),
array.value_data().len());
- let iter = array
- .iter()
- .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
- builder.extend(iter);
+ extend_valid_utf8(&mut builder, array.iter());
Ok(Arc::new(builder.finish()))
}
false => Err(e),
},
}
}
+pub(crate) fn cast_binary_view_to_string_view(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
Review Comment:
A minor way to make this more concise:
```suggestion
let array = array.as_binary_view();
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]