martin-g commented on code in PR #21538:
URL: https://github.com/apache/datafusion/pull/21538#discussion_r3073443745
##########
datafusion/functions/src/strings.rs:
##########
@@ -214,16 +221,23 @@ impl StringViewArrayBuilder {
}
}
+ /// Finalizes the current row by converting the accumulated data into a
+ /// StringView and appending it to the views buffer.
pub fn append_offset(&mut self) -> Result<()> {
- let block_str = if self.tainted {
+ if self.tainted {
std::str::from_utf8(&self.block)
- .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary
literal"))?
- } else {
- // SAFETY: all data that was appended was valid UTF8
- unsafe { std::str::from_utf8_unchecked(&self.block) }
- };
- self.builder.append_value(block_str);
+ .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary
literal"))?;
+ }
+
+ let v = &self.block;
+ let offset = self.data.len() as u32;
Review Comment:
Is this cast safe here ? It may lead to a silent truncation, no ?
Maybe:
```suggestion
let offset: u32 = self.data.len().try_into().map_err(|_| {
exec_datafusion_err!("StringView data buffer overflow")
})?;
```
##########
datafusion/functions/src/strings.rs:
##########
@@ -233,21 +247,35 @@ impl StringViewArrayBuilder {
///
/// Returns an error when:
///
- /// - the provided `null_buffer` does not match amount of `append_offset`
calls.
- pub fn finish(mut self, null_buffer: Option<NullBuffer>) ->
Result<StringViewArray> {
- let array = self.builder.finish();
- match null_buffer {
- Some(nulls) if nulls.len() != array.len() => {
- internal_err!("Null buffer and views buffer must be the same
length")
- }
- Some(nulls) => {
- let array_builder =
array.into_data().into_builder().nulls(Some(nulls));
- // SAFETY: the underlying data is valid; we are only adding a
null buffer
- let array_data = unsafe { array_builder.build_unchecked() };
- Ok(StringViewArray::from(array_data))
- }
- None => Ok(array),
+ /// - the provided `null_buffer` length does not match the row count.
+ pub fn finish(self, null_buffer: Option<NullBuffer>) ->
Result<StringViewArray> {
+ if let Some(ref nulls) = null_buffer
+ && nulls.len() != self.views.len()
+ {
+ return internal_err!(
+ "Null buffer length ({}) must match row count ({})",
+ nulls.len(),
+ self.views.len()
+ );
}
+
+ let buffers: Vec<Buffer> = if self.data.is_empty() {
+ vec![]
+ } else {
+ vec![Buffer::from(self.data)]
+ };
+
+ // SAFETY: views were constructed with correct lengths, offsets, and
+ // prefixes. UTF-8 validity has also been checked, if the input was
+ // tainted.
Review Comment:
```suggestion
// prefixes. UTF-8 validity was checked in append_offset() for any
row
// where tainted data (e.g., binary literals) was appended.
```
##########
datafusion/functions/src/strings.rs:
##########
@@ -214,16 +221,23 @@ impl StringViewArrayBuilder {
}
}
+ /// Finalizes the current row by converting the accumulated data into a
+ /// StringView and appending it to the views buffer.
pub fn append_offset(&mut self) -> Result<()> {
- let block_str = if self.tainted {
+ if self.tainted {
std::str::from_utf8(&self.block)
- .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary
literal"))?
- } else {
- // SAFETY: all data that was appended was valid UTF8
- unsafe { std::str::from_utf8_unchecked(&self.block) }
- };
- self.builder.append_value(block_str);
+ .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary
literal"))?;
+ }
+
+ let v = &self.block;
+ let offset = self.data.len() as u32;
+ if v.len() > 12 {
+ self.data.extend_from_slice(v);
+ }
+ self.views.push(make_view(v, 0, offset));
Review Comment:
https://github.com/neilconway/datafusion/blob/b17be597d57de2dec77b4fcc4cc34b79d75e1a70/datafusion/functions/src/strings.rs#L436
uses `offset=0` when calling `make_view()` for short strings.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]