This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new beb1034101 move StringArrayType, StringViewArrayBuilder, etc outside
of string module (#12912)
beb1034101 is described below
commit beb103410178708bd43b8069a2f8a29597253eba
Author: Bruce Ritchie <[email protected]>
AuthorDate: Tue Oct 15 07:15:17 2024 -0400
move StringArrayType, StringViewArrayBuilder, etc outside of string module
(#12912)
---
datafusion/functions/src/lib.rs | 2 +
datafusion/functions/src/string/common.rs | 408 +-------------------
datafusion/functions/src/string/concat.rs | 4 +-
datafusion/functions/src/string/concat_ws.rs | 2 +-
datafusion/functions/src/string/repeat.rs | 2 +-
datafusion/functions/src/string/split_part.rs | 3 +-
datafusion/functions/src/strings.rs | 424 +++++++++++++++++++++
.../functions/src/unicode/character_length.rs | 2 +-
datafusion/functions/src/unicode/lpad.rs | 2 +-
datafusion/functions/src/unicode/rpad.rs | 2 +-
datafusion/functions/src/unicode/strpos.rs | 2 +-
datafusion/functions/src/unicode/substr.rs | 2 +-
12 files changed, 441 insertions(+), 414 deletions(-)
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index 81be555266..91f9449953 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -135,6 +135,8 @@ make_stub_package!(unicode, "unicode_expressions");
#[cfg(any(feature = "datetime_expressions", feature = "unicode_expressions"))]
pub mod planner;
+pub mod strings;
+
mod utils;
/// Fluent-style API for creating `Expr`s
diff --git a/datafusion/functions/src/string/common.rs
b/datafusion/functions/src/string/common.rs
index 72447bc68f..0d1f90eb22 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -20,12 +20,12 @@
use std::fmt::{Display, Formatter};
use std::sync::Arc;
+use crate::strings::make_and_append_view;
use arrow::array::{
- make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder,
ArrayIter,
- ArrayRef, ByteView, GenericStringArray, GenericStringBuilder,
LargeStringArray,
- OffsetSizeTrait, StringArray, StringBuilder, StringViewArray,
StringViewBuilder,
+ new_null_array, Array, ArrayRef, GenericStringArray, GenericStringBuilder,
+ OffsetSizeTrait, StringBuilder, StringViewArray,
};
-use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
+use arrow::buffer::Buffer;
use arrow::datatypes::DataType;
use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
@@ -33,42 +33,6 @@ use datafusion_common::Result;
use datafusion_common::{exec_err, ScalarValue};
use datafusion_expr::ColumnarValue;
-/// Append a new view to the views buffer with the given substr
-///
-/// # Safety
-///
-/// original_view must be a valid view (the format described on
-/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray).
-///
-/// # Arguments
-/// - views_buffer: The buffer to append the new view to
-/// - null_builder: The buffer to append the null value to
-/// - original_view: The original view value
-/// - substr: The substring to append. Must be a valid substring of the
original view
-/// - start_offset: The start offset of the substring in the view
-pub(crate) fn make_and_append_view(
- views_buffer: &mut Vec<u128>,
- null_builder: &mut NullBufferBuilder,
- original_view: &u128,
- substr: &str,
- start_offset: u32,
-) {
- let substr_len = substr.len();
- let sub_view = if substr_len > 12 {
- let view = ByteView::from(*original_view);
- make_view(
- substr.as_bytes(),
- view.buffer_index,
- view.offset + start_offset,
- )
- } else {
- // inline value does not need block id or offset
- make_view(substr.as_bytes(), 0, 0)
- };
- views_buffer.push(sub_view);
- null_builder.append_non_null();
-}
-
pub(crate) enum TrimType {
Left,
Right,
@@ -399,370 +363,6 @@ where
}
}
-#[derive(Debug)]
-pub(crate) enum ColumnarValueRef<'a> {
- Scalar(&'a [u8]),
- NullableArray(&'a StringArray),
- NonNullableArray(&'a StringArray),
- NullableLargeStringArray(&'a LargeStringArray),
- NonNullableLargeStringArray(&'a LargeStringArray),
- NullableStringViewArray(&'a StringViewArray),
- NonNullableStringViewArray(&'a StringViewArray),
-}
-
-impl<'a> ColumnarValueRef<'a> {
- #[inline]
- pub fn is_valid(&self, i: usize) -> bool {
- match &self {
- Self::Scalar(_)
- | Self::NonNullableArray(_)
- | Self::NonNullableLargeStringArray(_)
- | Self::NonNullableStringViewArray(_) => true,
- Self::NullableArray(array) => array.is_valid(i),
- Self::NullableStringViewArray(array) => array.is_valid(i),
- Self::NullableLargeStringArray(array) => array.is_valid(i),
- }
- }
-
- #[inline]
- pub fn nulls(&self) -> Option<NullBuffer> {
- match &self {
- Self::Scalar(_)
- | Self::NonNullableArray(_)
- | Self::NonNullableStringViewArray(_)
- | Self::NonNullableLargeStringArray(_) => None,
- Self::NullableArray(array) => array.nulls().cloned(),
- Self::NullableStringViewArray(array) => array.nulls().cloned(),
- Self::NullableLargeStringArray(array) => array.nulls().cloned(),
- }
- }
-}
-
-/// Abstracts iteration over different types of string arrays.
-///
-/// The [`StringArrayType`] trait helps write generic code for string
functions that can work with
-/// different types of string arrays.
-///
-/// Currently three types are supported:
-/// - [`StringArray`]
-/// - [`LargeStringArray`]
-/// - [`StringViewArray`]
-///
-/// It is inspired / copied from [arrow-rs].
-///
-/// [arrow-rs]:
https://github.com/apache/arrow-rs/blob/bf0ea9129e617e4a3cf915a900b747cc5485315f/arrow-string/src/like.rs#L151-L157
-///
-/// # Examples
-/// Generic function that works for [`StringArray`], [`LargeStringArray`]
-/// and [`StringViewArray`]:
-/// ```
-/// # use arrow::array::{StringArray, LargeStringArray, StringViewArray};
-/// # use datafusion_functions::string::common::StringArrayType;
-///
-/// /// Combines string values for any StringArrayType type. It can be invoked
on
-/// /// and combination of `StringArray`, `LargeStringArray` or
`StringViewArray`
-/// fn combine_values<'a, S1, S2>(array1: S1, array2: S2) -> Vec<String>
-/// where S1: StringArrayType<'a>, S2: StringArrayType<'a>
-/// {
-/// // iterate over the elements of the 2 arrays in parallel
-/// array1
-/// .iter()
-/// .zip(array2.iter())
-/// .map(|(s1, s2)| {
-/// // if both values are non null, combine them
-/// if let (Some(s1), Some(s2)) = (s1, s2) {
-/// format!("{s1}{s2}")
-/// } else {
-/// "None".to_string()
-/// }
-/// })
-/// .collect()
-/// }
-///
-/// let string_array = StringArray::from(vec!["foo", "bar"]);
-/// let large_string_array = LargeStringArray::from(vec!["foo2", "bar2"]);
-/// let string_view_array = StringViewArray::from(vec!["foo3", "bar3"]);
-///
-/// // can invoke this function a string array and large string array
-/// assert_eq!(
-/// combine_values(&string_array, &large_string_array),
-/// vec![String::from("foofoo2"), String::from("barbar2")]
-/// );
-///
-/// // Can call the same function with string array and string view array
-/// assert_eq!(
-/// combine_values(&string_array, &string_view_array),
-/// vec![String::from("foofoo3"), String::from("barbar3")]
-/// );
-/// ```
-///
-/// [`LargeStringArray`]: arrow::array::LargeStringArray
-pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
- /// Return an [`ArrayIter`] over the values of the array.
- ///
- /// This iterator iterates returns `Option<&str>` for each item in the
array.
- fn iter(&self) -> ArrayIter<Self>;
-
- /// Check if the array is ASCII only.
- fn is_ascii(&self) -> bool;
-}
-
-impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T>
{
- fn iter(&self) -> ArrayIter<Self> {
- GenericStringArray::<T>::iter(self)
- }
-
- fn is_ascii(&self) -> bool {
- GenericStringArray::<T>::is_ascii(self)
- }
-}
-
-impl<'a> StringArrayType<'a> for &'a StringViewArray {
- fn iter(&self) -> ArrayIter<Self> {
- StringViewArray::iter(self)
- }
-
- fn is_ascii(&self) -> bool {
- StringViewArray::is_ascii(self)
- }
-}
-
-/// Optimized version of the StringBuilder in Arrow that:
-/// 1. Precalculating the expected length of the result, avoiding
reallocations.
-/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
-pub(crate) struct StringArrayBuilder {
- offsets_buffer: MutableBuffer,
- value_buffer: MutableBuffer,
-}
-
-impl StringArrayBuilder {
- pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- let mut offsets_buffer = MutableBuffer::with_capacity(
- (item_capacity + 1) * std::mem::size_of::<i32>(),
- );
- // SAFETY: the first offset value is definitely not going to exceed
the bounds.
- unsafe { offsets_buffer.push_unchecked(0_i32) };
- Self {
- offsets_buffer,
- value_buffer: MutableBuffer::with_capacity(data_capacity),
- }
- }
-
- pub fn write<const CHECK_VALID: bool>(
- &mut self,
- column: &ColumnarValueRef,
- i: usize,
- ) {
- match column {
- ColumnarValueRef::Scalar(s) => {
- self.value_buffer.extend_from_slice(s);
- }
- ColumnarValueRef::NullableArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NullableLargeStringArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NullableStringViewArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NonNullableArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- ColumnarValueRef::NonNullableLargeStringArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- ColumnarValueRef::NonNullableStringViewArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- }
-
- pub fn append_offset(&mut self) {
- let next_offset: i32 = self
- .value_buffer
- .len()
- .try_into()
- .expect("byte array offset overflow");
- unsafe { self.offsets_buffer.push_unchecked(next_offset) };
- }
-
- pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
- let array_builder = ArrayDataBuilder::new(DataType::Utf8)
- .len(self.offsets_buffer.len() / std::mem::size_of::<i32>() - 1)
- .add_buffer(self.offsets_buffer.into())
- .add_buffer(self.value_buffer.into())
- .nulls(null_buffer);
- // SAFETY: all data that was appended was valid UTF8 and the values
- // and offsets were created correctly
- let array_data = unsafe { array_builder.build_unchecked() };
- StringArray::from(array_data)
- }
-}
-
-pub(crate) struct StringViewArrayBuilder {
- builder: StringViewBuilder,
- block: String,
-}
-
-impl StringViewArrayBuilder {
- pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
- let builder = StringViewBuilder::with_capacity(data_capacity);
- Self {
- builder,
- block: String::new(),
- }
- }
-
- pub fn write<const CHECK_VALID: bool>(
- &mut self,
- column: &ColumnarValueRef,
- i: usize,
- ) {
- match column {
- ColumnarValueRef::Scalar(s) => {
- self.block.push_str(std::str::from_utf8(s).unwrap());
- }
- ColumnarValueRef::NullableArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
- }
- }
- ColumnarValueRef::NullableLargeStringArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
- }
- }
- ColumnarValueRef::NullableStringViewArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
- }
- }
- ColumnarValueRef::NonNullableArray(array) => {
- self.block
-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
- }
- ColumnarValueRef::NonNullableLargeStringArray(array) => {
- self.block
-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
- }
- ColumnarValueRef::NonNullableStringViewArray(array) => {
- self.block
-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
- }
- }
- }
-
- pub fn append_offset(&mut self) {
- self.builder.append_value(&self.block);
- self.block = String::new();
- }
-
- pub fn finish(mut self) -> StringViewArray {
- self.builder.finish()
- }
-}
-
-pub(crate) struct LargeStringArrayBuilder {
- offsets_buffer: MutableBuffer,
- value_buffer: MutableBuffer,
-}
-
-impl LargeStringArrayBuilder {
- pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- let mut offsets_buffer = MutableBuffer::with_capacity(
- (item_capacity + 1) * std::mem::size_of::<i64>(),
- );
- // SAFETY: the first offset value is definitely not going to exceed
the bounds.
- unsafe { offsets_buffer.push_unchecked(0_i64) };
- Self {
- offsets_buffer,
- value_buffer: MutableBuffer::with_capacity(data_capacity),
- }
- }
-
- pub fn write<const CHECK_VALID: bool>(
- &mut self,
- column: &ColumnarValueRef,
- i: usize,
- ) {
- match column {
- ColumnarValueRef::Scalar(s) => {
- self.value_buffer.extend_from_slice(s);
- }
- ColumnarValueRef::NullableArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NullableLargeStringArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NullableStringViewArray(array) => {
- if !CHECK_VALID || array.is_valid(i) {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- ColumnarValueRef::NonNullableArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- ColumnarValueRef::NonNullableLargeStringArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- ColumnarValueRef::NonNullableStringViewArray(array) => {
- self.value_buffer
- .extend_from_slice(array.value(i).as_bytes());
- }
- }
- }
-
- pub fn append_offset(&mut self) {
- let next_offset: i64 = self
- .value_buffer
- .len()
- .try_into()
- .expect("byte array offset overflow");
- unsafe { self.offsets_buffer.push_unchecked(next_offset) };
- }
-
- pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
- let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
- .len(self.offsets_buffer.len() / std::mem::size_of::<i64>() - 1)
- .add_buffer(self.offsets_buffer.into())
- .add_buffer(self.value_buffer.into())
- .nulls(null_buffer);
- // SAFETY: all data that was appended was valid Large UTF8 and the
values
- // and offsets were created correctly
- let array_data = unsafe { array_builder.build_unchecked() };
- LargeStringArray::from(array_data)
- }
-}
-
fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) ->
Result<ArrayRef>
where
O: OffsetSizeTrait,
diff --git a/datafusion/functions/src/string/concat.rs
b/datafusion/functions/src/string/concat.rs
index 228fcd460c..33a926863a 100644
--- a/datafusion/functions/src/string/concat.rs
+++ b/datafusion/functions/src/string/concat.rs
@@ -20,8 +20,10 @@ use arrow::datatypes::DataType;
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use crate::string::common::*;
use crate::string::concat;
+use crate::strings::{
+ ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder,
StringViewArrayBuilder,
+};
use datafusion_common::cast::{as_string_array, as_string_view_array};
use datafusion_common::{internal_err, plan_err, Result, ScalarValue};
use datafusion_expr::expr::ScalarFunction;
diff --git a/datafusion/functions/src/string/concat_ws.rs
b/datafusion/functions/src/string/concat_ws.rs
index a20cbf1a16..17361b0733 100644
--- a/datafusion/functions/src/string/concat_ws.rs
+++ b/datafusion/functions/src/string/concat_ws.rs
@@ -21,9 +21,9 @@ use std::sync::{Arc, OnceLock};
use arrow::datatypes::DataType;
-use crate::string::common::*;
use crate::string::concat::simplify_concat;
use crate::string::concat_ws;
+use crate::strings::{ColumnarValueRef, StringArrayBuilder};
use datafusion_common::cast::{as_string_array, as_string_view_array};
use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue};
use datafusion_expr::expr::ScalarFunction;
diff --git a/datafusion/functions/src/string/repeat.rs
b/datafusion/functions/src/string/repeat.rs
index fda9c7a13d..7364c7d36f 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -18,7 +18,7 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_str_type};
use arrow::array::{
ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
diff --git a/datafusion/functions/src/string/split_part.rs
b/datafusion/functions/src/string/split_part.rs
index 2441798c38..cea3b0890f 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+use crate::strings::StringArrayType;
use crate::utils::utf8_to_str_type;
use arrow::array::{
ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, StringViewArray,
@@ -30,8 +31,6 @@ use datafusion_expr::{ScalarUDFImpl, Signature};
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use super::common::StringArrayType;
-
#[derive(Debug)]
pub struct SplitPartFunc {
signature: Signature,
diff --git a/datafusion/functions/src/strings.rs
b/datafusion/functions/src/strings.rs
new file mode 100644
index 0000000000..2e0e2c4839
--- /dev/null
+++ b/datafusion/functions/src/strings.rs
@@ -0,0 +1,424 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+ make_view, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ByteView,
+ GenericStringArray, LargeStringArray, OffsetSizeTrait, StringArray,
StringViewArray,
+ StringViewBuilder,
+};
+use arrow::datatypes::DataType;
+use arrow_buffer::{MutableBuffer, NullBuffer, NullBufferBuilder};
+
+/// Abstracts iteration over different types of string arrays.
+///
+/// The [`StringArrayType`] trait helps write generic code for string
functions that can work with
+/// different types of string arrays.
+///
+/// Currently three types are supported:
+/// - [`StringArray`]
+/// - [`LargeStringArray`]
+/// - [`StringViewArray`]
+///
+/// It is inspired / copied from [arrow-rs].
+///
+/// [arrow-rs]:
https://github.com/apache/arrow-rs/blob/bf0ea9129e617e4a3cf915a900b747cc5485315f/arrow-string/src/like.rs#L151-L157
+///
+/// # Examples
+/// Generic function that works for [`StringArray`], [`LargeStringArray`]
+/// and [`StringViewArray`]:
+/// ```
+/// # use arrow::array::{StringArray, LargeStringArray, StringViewArray};
+/// # use datafusion_functions::strings::StringArrayType;
+///
+/// /// Combines string values for any StringArrayType type. It can be invoked
on
+/// /// and combination of `StringArray`, `LargeStringArray` or
`StringViewArray`
+/// fn combine_values<'a, S1, S2>(array1: S1, array2: S2) -> Vec<String>
+/// where S1: StringArrayType<'a>, S2: StringArrayType<'a>
+/// {
+/// // iterate over the elements of the 2 arrays in parallel
+/// array1
+/// .iter()
+/// .zip(array2.iter())
+/// .map(|(s1, s2)| {
+/// // if both values are non null, combine them
+/// if let (Some(s1), Some(s2)) = (s1, s2) {
+/// format!("{s1}{s2}")
+/// } else {
+/// "None".to_string()
+/// }
+/// })
+/// .collect()
+/// }
+///
+/// let string_array = StringArray::from(vec!["foo", "bar"]);
+/// let large_string_array = LargeStringArray::from(vec!["foo2", "bar2"]);
+/// let string_view_array = StringViewArray::from(vec!["foo3", "bar3"]);
+///
+/// // can invoke this function a string array and large string array
+/// assert_eq!(
+/// combine_values(&string_array, &large_string_array),
+/// vec![String::from("foofoo2"), String::from("barbar2")]
+/// );
+///
+/// // Can call the same function with string array and string view array
+/// assert_eq!(
+/// combine_values(&string_array, &string_view_array),
+/// vec![String::from("foofoo3"), String::from("barbar3")]
+/// );
+/// ```
+///
+/// [`LargeStringArray`]: arrow::array::LargeStringArray
+pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+ /// Return an [`ArrayIter`] over the values of the array.
+ ///
+ /// This iterator iterates returns `Option<&str>` for each item in the
array.
+ fn iter(&self) -> ArrayIter<Self>;
+
+ /// Check if the array is ASCII only.
+ fn is_ascii(&self) -> bool;
+}
+
+impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T>
{
+ fn iter(&self) -> ArrayIter<Self> {
+ GenericStringArray::<T>::iter(self)
+ }
+
+ fn is_ascii(&self) -> bool {
+ GenericStringArray::<T>::is_ascii(self)
+ }
+}
+
+impl<'a> StringArrayType<'a> for &'a StringViewArray {
+ fn iter(&self) -> ArrayIter<Self> {
+ StringViewArray::iter(self)
+ }
+
+ fn is_ascii(&self) -> bool {
+ StringViewArray::is_ascii(self)
+ }
+}
+
+/// Optimized version of the StringBuilder in Arrow that:
+/// 1. Precalculating the expected length of the result, avoiding
reallocations.
+/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
+pub struct StringArrayBuilder {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+}
+
+impl StringArrayBuilder {
+ pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_buffer = MutableBuffer::with_capacity(
+ (item_capacity + 1) * std::mem::size_of::<i32>(),
+ );
+ // SAFETY: the first offset value is definitely not going to exceed
the bounds.
+ unsafe { offsets_buffer.push_unchecked(0_i32) };
+ Self {
+ offsets_buffer,
+ value_buffer: MutableBuffer::with_capacity(data_capacity),
+ }
+ }
+
+ pub fn write<const CHECK_VALID: bool>(
+ &mut self,
+ column: &ColumnarValueRef,
+ i: usize,
+ ) {
+ match column {
+ ColumnarValueRef::Scalar(s) => {
+ self.value_buffer.extend_from_slice(s);
+ }
+ ColumnarValueRef::NullableArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NullableLargeStringArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NullableStringViewArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NonNullableArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ ColumnarValueRef::NonNullableLargeStringArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ ColumnarValueRef::NonNullableStringViewArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ }
+
+ pub fn append_offset(&mut self) {
+ let next_offset: i32 = self
+ .value_buffer
+ .len()
+ .try_into()
+ .expect("byte array offset overflow");
+ unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+ }
+
+ pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
+ let array_builder = ArrayDataBuilder::new(DataType::Utf8)
+ .len(self.offsets_buffer.len() / std::mem::size_of::<i32>() - 1)
+ .add_buffer(self.offsets_buffer.into())
+ .add_buffer(self.value_buffer.into())
+ .nulls(null_buffer);
+ // SAFETY: all data that was appended was valid UTF8 and the values
+ // and offsets were created correctly
+ let array_data = unsafe { array_builder.build_unchecked() };
+ StringArray::from(array_data)
+ }
+}
+
+pub struct StringViewArrayBuilder {
+ builder: StringViewBuilder,
+ block: String,
+}
+
+impl StringViewArrayBuilder {
+ pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
+ let builder = StringViewBuilder::with_capacity(data_capacity);
+ Self {
+ builder,
+ block: String::new(),
+ }
+ }
+
+ pub fn write<const CHECK_VALID: bool>(
+ &mut self,
+ column: &ColumnarValueRef,
+ i: usize,
+ ) {
+ match column {
+ ColumnarValueRef::Scalar(s) => {
+ self.block.push_str(std::str::from_utf8(s).unwrap());
+ }
+ ColumnarValueRef::NullableArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.block.push_str(
+
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+ );
+ }
+ }
+ ColumnarValueRef::NullableLargeStringArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.block.push_str(
+
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+ );
+ }
+ }
+ ColumnarValueRef::NullableStringViewArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.block.push_str(
+
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+ );
+ }
+ }
+ ColumnarValueRef::NonNullableArray(array) => {
+ self.block
+
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ }
+ ColumnarValueRef::NonNullableLargeStringArray(array) => {
+ self.block
+
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ }
+ ColumnarValueRef::NonNullableStringViewArray(array) => {
+ self.block
+
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ }
+ }
+ }
+
+ pub fn append_offset(&mut self) {
+ self.builder.append_value(&self.block);
+ self.block = String::new();
+ }
+
+ pub fn finish(mut self) -> StringViewArray {
+ self.builder.finish()
+ }
+}
+
+pub struct LargeStringArrayBuilder {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+}
+
+impl LargeStringArrayBuilder {
+ pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_buffer = MutableBuffer::with_capacity(
+ (item_capacity + 1) * std::mem::size_of::<i64>(),
+ );
+ // SAFETY: the first offset value is definitely not going to exceed
the bounds.
+ unsafe { offsets_buffer.push_unchecked(0_i64) };
+ Self {
+ offsets_buffer,
+ value_buffer: MutableBuffer::with_capacity(data_capacity),
+ }
+ }
+
+ pub fn write<const CHECK_VALID: bool>(
+ &mut self,
+ column: &ColumnarValueRef,
+ i: usize,
+ ) {
+ match column {
+ ColumnarValueRef::Scalar(s) => {
+ self.value_buffer.extend_from_slice(s);
+ }
+ ColumnarValueRef::NullableArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NullableLargeStringArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NullableStringViewArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NonNullableArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ ColumnarValueRef::NonNullableLargeStringArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ ColumnarValueRef::NonNullableStringViewArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ }
+
+ pub fn append_offset(&mut self) {
+ let next_offset: i64 = self
+ .value_buffer
+ .len()
+ .try_into()
+ .expect("byte array offset overflow");
+ unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+ }
+
+ pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
+ let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
+ .len(self.offsets_buffer.len() / std::mem::size_of::<i64>() - 1)
+ .add_buffer(self.offsets_buffer.into())
+ .add_buffer(self.value_buffer.into())
+ .nulls(null_buffer);
+ // SAFETY: all data that was appended was valid Large UTF8 and the
values
+ // and offsets were created correctly
+ let array_data = unsafe { array_builder.build_unchecked() };
+ LargeStringArray::from(array_data)
+ }
+}
+
+/// Append a new view to the views buffer with the given substr
+///
+/// # Safety
+///
+/// original_view must be a valid view (the format described on
+/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray).
+///
+/// # Arguments
+/// - views_buffer: The buffer to append the new view to
+/// - null_builder: The buffer to append the null value to
+/// - original_view: The original view value
+/// - substr: The substring to append. Must be a valid substring of the
original view
+/// - start_offset: The start offset of the substring in the view
+pub fn make_and_append_view(
+ views_buffer: &mut Vec<u128>,
+ null_builder: &mut NullBufferBuilder,
+ original_view: &u128,
+ substr: &str,
+ start_offset: u32,
+) {
+ let substr_len = substr.len();
+ let sub_view = if substr_len > 12 {
+ let view = ByteView::from(*original_view);
+ make_view(
+ substr.as_bytes(),
+ view.buffer_index,
+ view.offset + start_offset,
+ )
+ } else {
+ // inline value does not need block id or offset
+ make_view(substr.as_bytes(), 0, 0)
+ };
+ views_buffer.push(sub_view);
+ null_builder.append_non_null();
+}
+
+#[derive(Debug)]
+pub enum ColumnarValueRef<'a> {
+ Scalar(&'a [u8]),
+ NullableArray(&'a StringArray),
+ NonNullableArray(&'a StringArray),
+ NullableLargeStringArray(&'a LargeStringArray),
+ NonNullableLargeStringArray(&'a LargeStringArray),
+ NullableStringViewArray(&'a StringViewArray),
+ NonNullableStringViewArray(&'a StringViewArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+ #[inline]
+ pub fn is_valid(&self, i: usize) -> bool {
+ match &self {
+ Self::Scalar(_)
+ | Self::NonNullableArray(_)
+ | Self::NonNullableLargeStringArray(_)
+ | Self::NonNullableStringViewArray(_) => true,
+ Self::NullableArray(array) => array.is_valid(i),
+ Self::NullableStringViewArray(array) => array.is_valid(i),
+ Self::NullableLargeStringArray(array) => array.is_valid(i),
+ }
+ }
+
+ #[inline]
+ pub fn nulls(&self) -> Option<NullBuffer> {
+ match &self {
+ Self::Scalar(_)
+ | Self::NonNullableArray(_)
+ | Self::NonNullableStringViewArray(_)
+ | Self::NonNullableLargeStringArray(_) => None,
+ Self::NullableArray(array) => array.nulls().cloned(),
+ Self::NullableStringViewArray(array) => array.nulls().cloned(),
+ Self::NullableLargeStringArray(array) => array.nulls().cloned(),
+ }
+ }
+}
diff --git a/datafusion/functions/src/unicode/character_length.rs
b/datafusion/functions/src/unicode/character_length.rs
index bfb60bfbe2..6e74135b60 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait,
PrimitiveArray,
diff --git a/datafusion/functions/src/unicode/lpad.rs
b/datafusion/functions/src/unicode/lpad.rs
index 48bd583720..948afd050c 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -27,7 +27,7 @@ use arrow::datatypes::DataType;
use unicode_segmentation::UnicodeSegmentation;
use DataType::{LargeUtf8, Utf8, Utf8View};
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_str_type};
use datafusion_common::cast::as_int64_array;
use datafusion_common::{exec_err, Result};
diff --git a/datafusion/functions/src/unicode/rpad.rs
b/datafusion/functions/src/unicode/rpad.rs
index 9ca65e229c..fd4c1ee6fe 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_str_type};
use arrow::array::{
ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
diff --git a/datafusion/functions/src/unicode/strpos.rs
b/datafusion/functions/src/unicode/strpos.rs
index 660adc7578..e4696e4e5c 100644
--- a/datafusion/functions/src/unicode/strpos.rs
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -18,7 +18,7 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
diff --git a/datafusion/functions/src/unicode/substr.rs
b/datafusion/functions/src/unicode/substr.rs
index 969969ef2f..4e0c293577 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -18,7 +18,7 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use crate::string::common::{make_and_append_view, StringArrayType};
+use crate::strings::{make_and_append_view, StringArrayType};
use crate::utils::{make_scalar_function, utf8_to_str_type};
use arrow::array::{
Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array,
OffsetSizeTrait,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]