alamb commented on code in PR #20787:
URL: https://github.com/apache/datafusion/pull/20787#discussion_r3004701589
##########
datafusion/functions/src/strings.rs:
##########
@@ -120,24 +136,34 @@ impl StringArrayBuilder {
.add_buffer(self.offsets_buffer.into())
.add_buffer(self.value_buffer.into())
.nulls(null_buffer);
- // SAFETY: all data that was appended was valid UTF8 and the values
- // and offsets were created correctly
- let array_data = unsafe { array_builder.build_unchecked() };
- StringArray::from(array_data)
+ if self.tainted {
+ // Raw binary arrays with possible invalid utf-8 were used,
+ // so let ArrayDataBuilder perform validation
+ let array_data = array_builder.build()?;
+ Ok(StringArray::from(array_data))
+ } else {
+ // SAFETY: all data that was appended was valid UTF8 and the values
+ // and offsets were created correctly
+ let array_data = unsafe { array_builder.build_unchecked() };
+ Ok(StringArray::from(array_data))
+ }
}
}
pub struct StringViewArrayBuilder {
builder: StringViewBuilder,
- block: String,
+ block: Vec<u8>,
+ /// If true, a safety check is required during the `append_offset` call
+ tainted: bool,
}
impl StringViewArrayBuilder {
Review Comment:
I don't fully understand why this code doesn't use
https://docs.rs/arrow/latest/arrow/array/type.StringViewBuilder.html
(I realize this was not added in this PR)
##########
datafusion/functions/src/strings.rs:
##########
@@ -75,6 +75,11 @@ impl StringArrayBuilder {
.extend_from_slice(array.value(i).as_bytes());
}
}
+ ColumnarValueRef::NullableBinaryArray(array) => {
Review Comment:
Another potential solution would be to build a BinaryViewArray and then call the
arrow cast function (which does the UTF-8 validation without copying) if the
output was supposed to be UTF-8 and either input wasn't.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]