neilconway commented on code in PR #21442:
URL: https://github.com/apache/datafusion/pull/21442#discussion_r3094365287
##########
datafusion/functions/src/unicode/common.rs:
##########
@@ -119,83 +120,125 @@ pub(crate) fn general_left_right<F: LeftRightSlicer>(
}
}
-/// `general_left_right` implementation for strings
-fn general_left_right_array<
- 'a,
- T: OffsetSizeTrait,
- V: ArrayAccessor<Item = &'a str>,
- F: LeftRightSlicer,
->(
- string_array: V,
+/// Returns true if all offsets in the array fit in u32, meaning the values
+/// buffer can be referenced by StringView's u32 offset field.
+fn values_fit_in_u32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>)
-> bool {
+ string_array
+ .offsets()
+ .last()
+ .map(|offset| offset.as_usize() <= u32::MAX as usize)
+ .unwrap_or(true)
+}
+
+/// `left`/`right` for Utf8/LargeUtf8 input.
+///
+/// When offsets fit in u32, produces a zero-copy `StringViewArray` with views
+/// pointing into the input values buffer. Otherwise falls back to building a
+/// `StringViewArray` by copying.
+fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
+ string_array: &GenericStringArray<T>,
n_array: &Int64Array,
-) -> datafusion_common::Result<ArrayRef> {
- let iter = ArrayIter::new(string_array);
- let result = iter
- .zip(n_array.iter())
- .map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => {
- let range = F::slice(string, n);
- // Extract a given range from a byte-indexed slice
- Some(&string[range])
- }
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
+) -> Result<ArrayRef> {
+ if !values_fit_in_u32(string_array) {
+ let result = string_array
+ .iter()
+ .zip(n_array.iter())
+ .map(|(string, n)| match (string, n) {
+ (Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
+ _ => None,
+ })
+ .collect::<StringViewArray>();
+ return Ok(Arc::new(result) as ArrayRef);
+ }
+
+ let len = string_array.len();
+ let offsets = string_array.value_offsets();
+ let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
- Ok(Arc::new(result) as ArrayRef)
+ let mut views_buf = Vec::with_capacity(len);
+ let mut has_out_of_line = false;
+
+ for (i, offset) in offsets.iter().enumerate().take(len) {
+ if nulls.as_ref().is_some_and(|n| !n.is_valid(i)) {
Review Comment:
Thanks, fixed.
##########
datafusion/functions/src/unicode/common.rs:
##########
@@ -119,83 +120,125 @@ pub(crate) fn general_left_right<F: LeftRightSlicer>(
}
}
-/// `general_left_right` implementation for strings
-fn general_left_right_array<
- 'a,
- T: OffsetSizeTrait,
- V: ArrayAccessor<Item = &'a str>,
- F: LeftRightSlicer,
->(
- string_array: V,
+/// Returns true if all offsets in the array fit in u32, meaning the values
+/// buffer can be referenced by StringView's u32 offset field.
+fn values_fit_in_u32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>)
-> bool {
+ string_array
+ .offsets()
+ .last()
+ .map(|offset| offset.as_usize() <= u32::MAX as usize)
+ .unwrap_or(true)
+}
+
+/// `left`/`right` for Utf8/LargeUtf8 input.
+///
+/// When offsets fit in u32, produces a zero-copy `StringViewArray` with views
+/// pointing into the input values buffer. Otherwise falls back to building a
+/// `StringViewArray` by copying.
+fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
+ string_array: &GenericStringArray<T>,
n_array: &Int64Array,
-) -> datafusion_common::Result<ArrayRef> {
- let iter = ArrayIter::new(string_array);
- let result = iter
- .zip(n_array.iter())
- .map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => {
- let range = F::slice(string, n);
- // Extract a given range from a byte-indexed slice
- Some(&string[range])
- }
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
+) -> Result<ArrayRef> {
+ if !values_fit_in_u32(string_array) {
+ let result = string_array
+ .iter()
+ .zip(n_array.iter())
+ .map(|(string, n)| match (string, n) {
+ (Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
+ _ => None,
+ })
+ .collect::<StringViewArray>();
+ return Ok(Arc::new(result) as ArrayRef);
+ }
+
+ let len = string_array.len();
+ let offsets = string_array.value_offsets();
+ let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
- Ok(Arc::new(result) as ArrayRef)
+ let mut views_buf = Vec::with_capacity(len);
+ let mut has_out_of_line = false;
+
+ for (i, offset) in offsets.iter().enumerate().take(len) {
+ if nulls.as_ref().is_some_and(|n| !n.is_valid(i)) {
+ views_buf.push(0);
+ continue;
+ }
+
+ // SAFETY: we just checked validity above
+ let string = unsafe { string_array.value_unchecked(i) };
+ let n = n_array.value(i);
+ let range = F::slice(string, n);
+ let result_bytes = &string.as_bytes()[range.clone()];
+ if result_bytes.len() > 12 {
+ has_out_of_line = true;
+ }
+
+ let buf_offset = offset.as_usize() as u32 + range.start as u32;
+ views_buf.push(make_view(result_bytes, 0, buf_offset));
+ }
+
+ let views = ScalarBuffer::from(views_buf);
+ let data_buffers = if has_out_of_line {
+ vec![string_array.values().clone()]
+ } else {
+ vec![]
+ };
+
+ // SAFETY:
+ // - Each view is produced by `make_view` with correct bytes and offset
+ // - Out-of-line views reference buffer index 0, which is the original
+ // values buffer included in data_buffers when has_out_of_line is true
+ // - values_fit_in_u32 guarantees all offsets fit in u32
+ unsafe {
+ let array = StringViewArray::new_unchecked(views, data_buffers, nulls);
+ Ok(Arc::new(array) as ArrayRef)
+ }
}
-/// `general_left_right` implementation for StringViewArray
+/// `general_left_right` for StringViewArray input.
fn general_left_right_view<F: LeftRightSlicer>(
string_view_array: &StringViewArray,
n_array: &Int64Array,
-) -> datafusion_common::Result<ArrayRef> {
- let len = n_array.len();
-
+) -> Result<ArrayRef> {
let views = string_view_array.views();
- // Every string in StringViewArray has one corresponding view in `views`
- debug_assert!(views.len() == string_view_array.len());
-
- // Compose null buffer at once
- let string_nulls = string_view_array.nulls();
- let n_nulls = n_array.nulls();
- let new_nulls = NullBuffer::union(string_nulls, n_nulls);
+ let new_nulls = NullBuffer::union(string_view_array.nulls(),
n_array.nulls());
+ let len = n_array.len();
+ let mut has_out_of_line = false;
let new_views = (0..len)
.map(|idx| {
- let view = views[idx];
-
- let is_valid = match &new_nulls {
- Some(nulls_buf) => nulls_buf.is_valid(idx),
- None => true,
- };
-
- if is_valid {
- let string: &str = string_view_array.value(idx);
- let n = n_array.value(idx);
-
- // Input string comes from StringViewArray, so it should fit
in 32-bit length
- let range = F::slice(string, n);
- let result_bytes = &string.as_bytes()[range.clone()];
-
- let byte_view = ByteView::from(view);
- // New offset starts at 0 for left, and at `range.start` for
right,
- // which is encoded in the given range
- let new_offset = byte_view.offset + (range.start as u32);
- // Reuse buffer
- make_view(result_bytes, byte_view.buffer_index, new_offset)
- } else {
- // For nulls, keep the original view
- view
+ if new_nulls.as_ref().is_some_and(|n| !n.is_valid(idx)) {
Review Comment:
Thanks, fixed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]