This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new beb1034101 move StringArrayType, StringViewArrayBuilder, etc outside of string module (#12912)
beb1034101 is described below

commit beb103410178708bd43b8069a2f8a29597253eba
Author: Bruce Ritchie <[email protected]>
AuthorDate: Tue Oct 15 07:15:17 2024 -0400

    move StringArrayType, StringViewArrayBuilder, etc outside of string module (#12912)
---
 datafusion/functions/src/lib.rs                    |   2 +
 datafusion/functions/src/string/common.rs          | 408 +-------------------
 datafusion/functions/src/string/concat.rs          |   4 +-
 datafusion/functions/src/string/concat_ws.rs       |   2 +-
 datafusion/functions/src/string/repeat.rs          |   2 +-
 datafusion/functions/src/string/split_part.rs      |   3 +-
 datafusion/functions/src/strings.rs                | 424 +++++++++++++++++++++
 .../functions/src/unicode/character_length.rs      |   2 +-
 datafusion/functions/src/unicode/lpad.rs           |   2 +-
 datafusion/functions/src/unicode/rpad.rs           |   2 +-
 datafusion/functions/src/unicode/strpos.rs         |   2 +-
 datafusion/functions/src/unicode/substr.rs         |   2 +-
 12 files changed, 441 insertions(+), 414 deletions(-)

diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index 81be555266..91f9449953 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -135,6 +135,8 @@ make_stub_package!(unicode, "unicode_expressions");
 #[cfg(any(feature = "datetime_expressions", feature = "unicode_expressions"))]
 pub mod planner;
 
+pub mod strings;
+
 mod utils;
 
 /// Fluent-style API for creating `Expr`s
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 72447bc68f..0d1f90eb22 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -20,12 +20,12 @@
 use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
+use crate::strings::make_and_append_view;
 use arrow::array::{
-    make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter,
-    ArrayRef, ByteView, GenericStringArray, GenericStringBuilder, LargeStringArray,
-    OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, StringViewBuilder,
+    new_null_array, Array, ArrayRef, GenericStringArray, GenericStringBuilder,
+    OffsetSizeTrait, StringBuilder, StringViewArray,
 };
-use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
+use arrow::buffer::Buffer;
 use arrow::datatypes::DataType;
 use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
@@ -33,42 +33,6 @@ use datafusion_common::Result;
 use datafusion_common::{exec_err, ScalarValue};
 use datafusion_expr::ColumnarValue;
 
-/// Append a new view to the views buffer with the given substr
-///
-/// # Safety
-///
-/// original_view must be a valid view (the format described on
-/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray).
-///
-/// # Arguments
-/// - views_buffer: The buffer to append the new view to
-/// - null_builder: The buffer to append the null value to
-/// - original_view: The original view value
-/// - substr: The substring to append. Must be a valid substring of the original view
-/// - start_offset: The start offset of the substring in the view
-pub(crate) fn make_and_append_view(
-    views_buffer: &mut Vec<u128>,
-    null_builder: &mut NullBufferBuilder,
-    original_view: &u128,
-    substr: &str,
-    start_offset: u32,
-) {
-    let substr_len = substr.len();
-    let sub_view = if substr_len > 12 {
-        let view = ByteView::from(*original_view);
-        make_view(
-            substr.as_bytes(),
-            view.buffer_index,
-            view.offset + start_offset,
-        )
-    } else {
-        // inline value does not need block id or offset
-        make_view(substr.as_bytes(), 0, 0)
-    };
-    views_buffer.push(sub_view);
-    null_builder.append_non_null();
-}
-
 pub(crate) enum TrimType {
     Left,
     Right,
@@ -399,370 +363,6 @@ where
     }
 }
 
-#[derive(Debug)]
-pub(crate) enum ColumnarValueRef<'a> {
-    Scalar(&'a [u8]),
-    NullableArray(&'a StringArray),
-    NonNullableArray(&'a StringArray),
-    NullableLargeStringArray(&'a LargeStringArray),
-    NonNullableLargeStringArray(&'a LargeStringArray),
-    NullableStringViewArray(&'a StringViewArray),
-    NonNullableStringViewArray(&'a StringViewArray),
-}
-
-impl<'a> ColumnarValueRef<'a> {
-    #[inline]
-    pub fn is_valid(&self, i: usize) -> bool {
-        match &self {
-            Self::Scalar(_)
-            | Self::NonNullableArray(_)
-            | Self::NonNullableLargeStringArray(_)
-            | Self::NonNullableStringViewArray(_) => true,
-            Self::NullableArray(array) => array.is_valid(i),
-            Self::NullableStringViewArray(array) => array.is_valid(i),
-            Self::NullableLargeStringArray(array) => array.is_valid(i),
-        }
-    }
-
-    #[inline]
-    pub fn nulls(&self) -> Option<NullBuffer> {
-        match &self {
-            Self::Scalar(_)
-            | Self::NonNullableArray(_)
-            | Self::NonNullableStringViewArray(_)
-            | Self::NonNullableLargeStringArray(_) => None,
-            Self::NullableArray(array) => array.nulls().cloned(),
-            Self::NullableStringViewArray(array) => array.nulls().cloned(),
-            Self::NullableLargeStringArray(array) => array.nulls().cloned(),
-        }
-    }
-}
-
-/// Abstracts iteration over different types of string arrays.
-///
-/// The [`StringArrayType`] trait helps write generic code for string functions that can work with
-/// different types of string arrays.
-///
-/// Currently three types are supported:
-/// - [`StringArray`]
-/// - [`LargeStringArray`]
-/// - [`StringViewArray`]
-///
-/// It is inspired / copied from [arrow-rs].
-///
-/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/bf0ea9129e617e4a3cf915a900b747cc5485315f/arrow-string/src/like.rs#L151-L157
-///
-/// # Examples
-/// Generic function that works for [`StringArray`], [`LargeStringArray`]
-/// and [`StringViewArray`]:
-/// ```
-/// # use arrow::array::{StringArray, LargeStringArray, StringViewArray};
-/// # use datafusion_functions::string::common::StringArrayType;
-///
-/// /// Combines string values for any StringArrayType type. It can be invoked on
-/// /// and combination of `StringArray`, `LargeStringArray` or `StringViewArray`
-/// fn combine_values<'a, S1, S2>(array1: S1, array2: S2) -> Vec<String>
-///   where S1: StringArrayType<'a>, S2: StringArrayType<'a>
-/// {
-///   // iterate over the elements of the 2 arrays in parallel
-///   array1
-///   .iter()
-///   .zip(array2.iter())
-///   .map(|(s1, s2)| {
-///      // if both values are non null, combine them
-///      if let (Some(s1), Some(s2)) = (s1, s2) {
-///        format!("{s1}{s2}")
-///      } else {
-///        "None".to_string()
-///     }
-///    })
-///   .collect()
-/// }
-///
-/// let string_array = StringArray::from(vec!["foo", "bar"]);
-/// let large_string_array = LargeStringArray::from(vec!["foo2", "bar2"]);
-/// let string_view_array = StringViewArray::from(vec!["foo3", "bar3"]);
-///
-/// // can invoke this function a string array and large string array
-/// assert_eq!(
-///   combine_values(&string_array, &large_string_array),
-///   vec![String::from("foofoo2"), String::from("barbar2")]
-/// );
-///
-/// // Can call the same function with string array and string view array
-/// assert_eq!(
-///   combine_values(&string_array, &string_view_array),
-///   vec![String::from("foofoo3"), String::from("barbar3")]
-/// );
-/// ```
-///
-/// [`LargeStringArray`]: arrow::array::LargeStringArray
-pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
-    /// Return an [`ArrayIter`]  over the values of the array.
-    ///
-    /// This iterator iterates returns `Option<&str>` for each item in the array.
-    fn iter(&self) -> ArrayIter<Self>;
-
-    /// Check if the array is ASCII only.
-    fn is_ascii(&self) -> bool;
-}
-
-impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
-    fn iter(&self) -> ArrayIter<Self> {
-        GenericStringArray::<T>::iter(self)
-    }
-
-    fn is_ascii(&self) -> bool {
-        GenericStringArray::<T>::is_ascii(self)
-    }
-}
-
-impl<'a> StringArrayType<'a> for &'a StringViewArray {
-    fn iter(&self) -> ArrayIter<Self> {
-        StringViewArray::iter(self)
-    }
-
-    fn is_ascii(&self) -> bool {
-        StringViewArray::is_ascii(self)
-    }
-}
-
-/// Optimized version of the StringBuilder in Arrow that:
-/// 1. Precalculating the expected length of the result, avoiding reallocations.
-/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
-pub(crate) struct StringArrayBuilder {
-    offsets_buffer: MutableBuffer,
-    value_buffer: MutableBuffer,
-}
-
-impl StringArrayBuilder {
-    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let mut offsets_buffer = MutableBuffer::with_capacity(
-            (item_capacity + 1) * std::mem::size_of::<i32>(),
-        );
-        // SAFETY: the first offset value is definitely not going to exceed the bounds.
-        unsafe { offsets_buffer.push_unchecked(0_i32) };
-        Self {
-            offsets_buffer,
-            value_buffer: MutableBuffer::with_capacity(data_capacity),
-        }
-    }
-
-    pub fn write<const CHECK_VALID: bool>(
-        &mut self,
-        column: &ColumnarValueRef,
-        i: usize,
-    ) {
-        match column {
-            ColumnarValueRef::Scalar(s) => {
-                self.value_buffer.extend_from_slice(s);
-            }
-            ColumnarValueRef::NullableArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NullableLargeStringArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NullableStringViewArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NonNullableArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-            ColumnarValueRef::NonNullableLargeStringArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-            ColumnarValueRef::NonNullableStringViewArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-        }
-    }
-
-    pub fn append_offset(&mut self) {
-        let next_offset: i32 = self
-            .value_buffer
-            .len()
-            .try_into()
-            .expect("byte array offset overflow");
-        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
-    }
-
-    pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
-        let array_builder = ArrayDataBuilder::new(DataType::Utf8)
-            .len(self.offsets_buffer.len() / std::mem::size_of::<i32>() - 1)
-            .add_buffer(self.offsets_buffer.into())
-            .add_buffer(self.value_buffer.into())
-            .nulls(null_buffer);
-        // SAFETY: all data that was appended was valid UTF8 and the values
-        // and offsets were created correctly
-        let array_data = unsafe { array_builder.build_unchecked() };
-        StringArray::from(array_data)
-    }
-}
-
-pub(crate) struct StringViewArrayBuilder {
-    builder: StringViewBuilder,
-    block: String,
-}
-
-impl StringViewArrayBuilder {
-    pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
-        let builder = StringViewBuilder::with_capacity(data_capacity);
-        Self {
-            builder,
-            block: String::new(),
-        }
-    }
-
-    pub fn write<const CHECK_VALID: bool>(
-        &mut self,
-        column: &ColumnarValueRef,
-        i: usize,
-    ) {
-        match column {
-            ColumnarValueRef::Scalar(s) => {
-                self.block.push_str(std::str::from_utf8(s).unwrap());
-            }
-            ColumnarValueRef::NullableArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
-                }
-            }
-            ColumnarValueRef::NullableLargeStringArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
-                }
-            }
-            ColumnarValueRef::NullableStringViewArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
-                }
-            }
-            ColumnarValueRef::NonNullableArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
-            }
-            ColumnarValueRef::NonNullableLargeStringArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
-            }
-            ColumnarValueRef::NonNullableStringViewArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
-            }
-        }
-    }
-
-    pub fn append_offset(&mut self) {
-        self.builder.append_value(&self.block);
-        self.block = String::new();
-    }
-
-    pub fn finish(mut self) -> StringViewArray {
-        self.builder.finish()
-    }
-}
-
-pub(crate) struct LargeStringArrayBuilder {
-    offsets_buffer: MutableBuffer,
-    value_buffer: MutableBuffer,
-}
-
-impl LargeStringArrayBuilder {
-    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let mut offsets_buffer = MutableBuffer::with_capacity(
-            (item_capacity + 1) * std::mem::size_of::<i64>(),
-        );
-        // SAFETY: the first offset value is definitely not going to exceed the bounds.
-        unsafe { offsets_buffer.push_unchecked(0_i64) };
-        Self {
-            offsets_buffer,
-            value_buffer: MutableBuffer::with_capacity(data_capacity),
-        }
-    }
-
-    pub fn write<const CHECK_VALID: bool>(
-        &mut self,
-        column: &ColumnarValueRef,
-        i: usize,
-    ) {
-        match column {
-            ColumnarValueRef::Scalar(s) => {
-                self.value_buffer.extend_from_slice(s);
-            }
-            ColumnarValueRef::NullableArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NullableLargeStringArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NullableStringViewArray(array) => {
-                if !CHECK_VALID || array.is_valid(i) {
-                    self.value_buffer
-                        .extend_from_slice(array.value(i).as_bytes());
-                }
-            }
-            ColumnarValueRef::NonNullableArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-            ColumnarValueRef::NonNullableLargeStringArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-            ColumnarValueRef::NonNullableStringViewArray(array) => {
-                self.value_buffer
-                    .extend_from_slice(array.value(i).as_bytes());
-            }
-        }
-    }
-
-    pub fn append_offset(&mut self) {
-        let next_offset: i64 = self
-            .value_buffer
-            .len()
-            .try_into()
-            .expect("byte array offset overflow");
-        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
-    }
-
-    pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
-        let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
-            .len(self.offsets_buffer.len() / std::mem::size_of::<i64>() - 1)
-            .add_buffer(self.offsets_buffer.into())
-            .add_buffer(self.value_buffer.into())
-            .nulls(null_buffer);
-        // SAFETY: all data that was appended was valid Large UTF8 and the values
-        // and offsets were created correctly
-        let array_data = unsafe { array_builder.build_unchecked() };
-        LargeStringArray::from(array_data)
-    }
-}
-
 fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> Result<ArrayRef>
 where
     O: OffsetSizeTrait,
diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs
index 228fcd460c..33a926863a 100644
--- a/datafusion/functions/src/string/concat.rs
+++ b/datafusion/functions/src/string/concat.rs
@@ -20,8 +20,10 @@ use arrow::datatypes::DataType;
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use crate::string::common::*;
 use crate::string::concat;
+use crate::strings::{
+    ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
+};
 use datafusion_common::cast::{as_string_array, as_string_view_array};
 use datafusion_common::{internal_err, plan_err, Result, ScalarValue};
 use datafusion_expr::expr::ScalarFunction;
diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs
index a20cbf1a16..17361b0733 100644
--- a/datafusion/functions/src/string/concat_ws.rs
+++ b/datafusion/functions/src/string/concat_ws.rs
@@ -21,9 +21,9 @@ use std::sync::{Arc, OnceLock};
 
 use arrow::datatypes::DataType;
 
-use crate::string::common::*;
 use crate::string::concat::simplify_concat;
 use crate::string::concat_ws;
+use crate::strings::{ColumnarValueRef, StringArrayBuilder};
 use datafusion_common::cast::{as_string_array, as_string_view_array};
 use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue};
 use datafusion_expr::expr::ScalarFunction;
diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs
index fda9c7a13d..7364c7d36f 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -18,7 +18,7 @@
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use arrow::array::{
     ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs
index 2441798c38..cea3b0890f 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::strings::StringArrayType;
 use crate::utils::utf8_to_str_type;
 use arrow::array::{
     ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, StringViewArray,
@@ -30,8 +31,6 @@ use datafusion_expr::{ScalarUDFImpl, Signature};
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use super::common::StringArrayType;
-
 #[derive(Debug)]
 pub struct SplitPartFunc {
     signature: Signature,
diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
new file mode 100644
index 0000000000..2e0e2c4839
--- /dev/null
+++ b/datafusion/functions/src/strings.rs
@@ -0,0 +1,424 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    make_view, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ByteView,
+    GenericStringArray, LargeStringArray, OffsetSizeTrait, StringArray, StringViewArray,
+    StringViewBuilder,
+};
+use arrow::datatypes::DataType;
+use arrow_buffer::{MutableBuffer, NullBuffer, NullBufferBuilder};
+
+/// Abstracts iteration over different types of string arrays.
+///
+/// The [`StringArrayType`] trait helps write generic code for string functions that can work with
+/// different types of string arrays.
+///
+/// Currently three types are supported:
+/// - [`StringArray`]
+/// - [`LargeStringArray`]
+/// - [`StringViewArray`]
+///
+/// It is inspired / copied from [arrow-rs].
+///
+/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/bf0ea9129e617e4a3cf915a900b747cc5485315f/arrow-string/src/like.rs#L151-L157
+///
+/// # Examples
+/// Generic function that works for [`StringArray`], [`LargeStringArray`]
+/// and [`StringViewArray`]:
+/// ```
+/// # use arrow::array::{StringArray, LargeStringArray, StringViewArray};
+/// # use datafusion_functions::strings::StringArrayType;
+///
+/// /// Combines string values for any StringArrayType type. It can be invoked on
+/// /// and combination of `StringArray`, `LargeStringArray` or `StringViewArray`
+/// fn combine_values<'a, S1, S2>(array1: S1, array2: S2) -> Vec<String>
+///   where S1: StringArrayType<'a>, S2: StringArrayType<'a>
+/// {
+///   // iterate over the elements of the 2 arrays in parallel
+///   array1
+///   .iter()
+///   .zip(array2.iter())
+///   .map(|(s1, s2)| {
+///      // if both values are non null, combine them
+///      if let (Some(s1), Some(s2)) = (s1, s2) {
+///        format!("{s1}{s2}")
+///      } else {
+///        "None".to_string()
+///     }
+///    })
+///   .collect()
+/// }
+///
+/// let string_array = StringArray::from(vec!["foo", "bar"]);
+/// let large_string_array = LargeStringArray::from(vec!["foo2", "bar2"]);
+/// let string_view_array = StringViewArray::from(vec!["foo3", "bar3"]);
+///
+/// // can invoke this function a string array and large string array
+/// assert_eq!(
+///   combine_values(&string_array, &large_string_array),
+///   vec![String::from("foofoo2"), String::from("barbar2")]
+/// );
+///
+/// // Can call the same function with string array and string view array
+/// assert_eq!(
+///   combine_values(&string_array, &string_view_array),
+///   vec![String::from("foofoo3"), String::from("barbar3")]
+/// );
+/// ```
+///
+/// [`LargeStringArray`]: arrow::array::LargeStringArray
+pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+    /// Return an [`ArrayIter`]  over the values of the array.
+    ///
+    /// This iterator iterates returns `Option<&str>` for each item in the array.
+    fn iter(&self) -> ArrayIter<Self>;
+
+    /// Check if the array is ASCII only.
+    fn is_ascii(&self) -> bool;
+}
+
+impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
+    fn iter(&self) -> ArrayIter<Self> {
+        GenericStringArray::<T>::iter(self)
+    }
+
+    fn is_ascii(&self) -> bool {
+        GenericStringArray::<T>::is_ascii(self)
+    }
+}
+
+impl<'a> StringArrayType<'a> for &'a StringViewArray {
+    fn iter(&self) -> ArrayIter<Self> {
+        StringViewArray::iter(self)
+    }
+
+    fn is_ascii(&self) -> bool {
+        StringViewArray::is_ascii(self)
+    }
+}
+
+/// Optimized version of the StringBuilder in Arrow that:
+/// 1. Precalculating the expected length of the result, avoiding reallocations.
+/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
+pub struct StringArrayBuilder {
+    offsets_buffer: MutableBuffer,
+    value_buffer: MutableBuffer,
+}
+
+impl StringArrayBuilder {
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let mut offsets_buffer = MutableBuffer::with_capacity(
+            (item_capacity + 1) * std::mem::size_of::<i32>(),
+        );
+        // SAFETY: the first offset value is definitely not going to exceed the bounds.
+        unsafe { offsets_buffer.push_unchecked(0_i32) };
+        Self {
+            offsets_buffer,
+            value_buffer: MutableBuffer::with_capacity(data_capacity),
+        }
+    }
+
+    pub fn write<const CHECK_VALID: bool>(
+        &mut self,
+        column: &ColumnarValueRef,
+        i: usize,
+    ) {
+        match column {
+            ColumnarValueRef::Scalar(s) => {
+                self.value_buffer.extend_from_slice(s);
+            }
+            ColumnarValueRef::NullableArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NullableLargeStringArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NullableStringViewArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NonNullableArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+            ColumnarValueRef::NonNullableLargeStringArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+            ColumnarValueRef::NonNullableStringViewArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+        }
+    }
+
+    pub fn append_offset(&mut self) {
+        let next_offset: i32 = self
+            .value_buffer
+            .len()
+            .try_into()
+            .expect("byte array offset overflow");
+        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+    }
+
+    pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
+        let array_builder = ArrayDataBuilder::new(DataType::Utf8)
+            .len(self.offsets_buffer.len() / std::mem::size_of::<i32>() - 1)
+            .add_buffer(self.offsets_buffer.into())
+            .add_buffer(self.value_buffer.into())
+            .nulls(null_buffer);
+        // SAFETY: all data that was appended was valid UTF8 and the values
+        // and offsets were created correctly
+        let array_data = unsafe { array_builder.build_unchecked() };
+        StringArray::from(array_data)
+    }
+}
+
+pub struct StringViewArrayBuilder {
+    builder: StringViewBuilder,
+    block: String,
+}
+
+impl StringViewArrayBuilder {
+    pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
+        let builder = StringViewBuilder::with_capacity(data_capacity);
+        Self {
+            builder,
+            block: String::new(),
+        }
+    }
+
+    pub fn write<const CHECK_VALID: bool>(
+        &mut self,
+        column: &ColumnarValueRef,
+        i: usize,
+    ) {
+        match column {
+            ColumnarValueRef::Scalar(s) => {
+                self.block.push_str(std::str::from_utf8(s).unwrap());
+            }
+            ColumnarValueRef::NullableArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.block.push_str(
+                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+                    );
+                }
+            }
+            ColumnarValueRef::NullableLargeStringArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.block.push_str(
+                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+                    );
+                }
+            }
+            ColumnarValueRef::NullableStringViewArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.block.push_str(
+                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
+                    );
+                }
+            }
+            ColumnarValueRef::NonNullableArray(array) => {
+                self.block
+                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+            }
+            ColumnarValueRef::NonNullableLargeStringArray(array) => {
+                self.block
+                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+            }
+            ColumnarValueRef::NonNullableStringViewArray(array) => {
+                self.block
+                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+            }
+        }
+    }
+
+    pub fn append_offset(&mut self) {
+        self.builder.append_value(&self.block);
+        self.block = String::new();
+    }
+
+    pub fn finish(mut self) -> StringViewArray {
+        self.builder.finish()
+    }
+}
+
+pub struct LargeStringArrayBuilder {
+    offsets_buffer: MutableBuffer,
+    value_buffer: MutableBuffer,
+}
+
+impl LargeStringArrayBuilder {
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let mut offsets_buffer = MutableBuffer::with_capacity(
+            (item_capacity + 1) * std::mem::size_of::<i64>(),
+        );
+        // SAFETY: the first offset value is definitely not going to exceed the bounds.
+        unsafe { offsets_buffer.push_unchecked(0_i64) };
+        Self {
+            offsets_buffer,
+            value_buffer: MutableBuffer::with_capacity(data_capacity),
+        }
+    }
+
+    pub fn write<const CHECK_VALID: bool>(
+        &mut self,
+        column: &ColumnarValueRef,
+        i: usize,
+    ) {
+        match column {
+            ColumnarValueRef::Scalar(s) => {
+                self.value_buffer.extend_from_slice(s);
+            }
+            ColumnarValueRef::NullableArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NullableLargeStringArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NullableStringViewArray(array) => {
+                if !CHECK_VALID || array.is_valid(i) {
+                    self.value_buffer
+                        .extend_from_slice(array.value(i).as_bytes());
+                }
+            }
+            ColumnarValueRef::NonNullableArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+            ColumnarValueRef::NonNullableLargeStringArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+            ColumnarValueRef::NonNullableStringViewArray(array) => {
+                self.value_buffer
+                    .extend_from_slice(array.value(i).as_bytes());
+            }
+        }
+    }
+
+    pub fn append_offset(&mut self) {
+        let next_offset: i64 = self
+            .value_buffer
+            .len()
+            .try_into()
+            .expect("byte array offset overflow");
+        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+    }
+
+    pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
+        let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
+            .len(self.offsets_buffer.len() / std::mem::size_of::<i64>() - 1)
+            .add_buffer(self.offsets_buffer.into())
+            .add_buffer(self.value_buffer.into())
+            .nulls(null_buffer);
+        // SAFETY: all data that was appended was valid Large UTF8 and the values
+        // and offsets were created correctly
+        let array_data = unsafe { array_builder.build_unchecked() };
+        LargeStringArray::from(array_data)
+    }
+}
+
+/// Append a new view to the views buffer with the given substr
+///
+/// # Safety
+///
+/// original_view must be a valid view (the format described on
+/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray).
+///
+/// # Arguments
+/// - views_buffer: The buffer to append the new view to
+/// - null_builder: The buffer to append the null value to
+/// - original_view: The original view value
+/// - substr: The substring to append. Must be a valid substring of the original view
+/// - start_offset: The start offset of the substring in the view
+pub fn make_and_append_view(
+    views_buffer: &mut Vec<u128>,
+    null_builder: &mut NullBufferBuilder,
+    original_view: &u128,
+    substr: &str,
+    start_offset: u32,
+) {
+    let substr_len = substr.len();
+    let sub_view = if substr_len > 12 {
+        let view = ByteView::from(*original_view);
+        make_view(
+            substr.as_bytes(),
+            view.buffer_index,
+            view.offset + start_offset,
+        )
+    } else {
+        // inline value does not need block id or offset
+        make_view(substr.as_bytes(), 0, 0)
+    };
+    views_buffer.push(sub_view);
+    null_builder.append_non_null();
+}
+
+#[derive(Debug)]
+pub enum ColumnarValueRef<'a> {
+    Scalar(&'a [u8]),
+    NullableArray(&'a StringArray),
+    NonNullableArray(&'a StringArray),
+    NullableLargeStringArray(&'a LargeStringArray),
+    NonNullableLargeStringArray(&'a LargeStringArray),
+    NullableStringViewArray(&'a StringViewArray),
+    NonNullableStringViewArray(&'a StringViewArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+    #[inline]
+    pub fn is_valid(&self, i: usize) -> bool {
+        match &self {
+            Self::Scalar(_)
+            | Self::NonNullableArray(_)
+            | Self::NonNullableLargeStringArray(_)
+            | Self::NonNullableStringViewArray(_) => true,
+            Self::NullableArray(array) => array.is_valid(i),
+            Self::NullableStringViewArray(array) => array.is_valid(i),
+            Self::NullableLargeStringArray(array) => array.is_valid(i),
+        }
+    }
+
+    #[inline]
+    pub fn nulls(&self) -> Option<NullBuffer> {
+        match &self {
+            Self::Scalar(_)
+            | Self::NonNullableArray(_)
+            | Self::NonNullableStringViewArray(_)
+            | Self::NonNullableLargeStringArray(_) => None,
+            Self::NullableArray(array) => array.nulls().cloned(),
+            Self::NullableStringViewArray(array) => array.nulls().cloned(),
+            Self::NullableLargeStringArray(array) => array.nulls().cloned(),
+        }
+    }
+}
diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs
index bfb60bfbe2..6e74135b60 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
 use crate::utils::{make_scalar_function, utf8_to_int_type};
 use arrow::array::{
     Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index 48bd583720..948afd050c 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -27,7 +27,7 @@ use arrow::datatypes::DataType;
 use unicode_segmentation::UnicodeSegmentation;
 use DataType::{LargeUtf8, Utf8, Utf8View};
 
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::{exec_err, Result};
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 9ca65e229c..fd4c1ee6fe 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use arrow::array::{
     ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs
index 660adc7578..e4696e4e5c 100644
--- a/datafusion/functions/src/unicode/strpos.rs
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -18,7 +18,7 @@
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use crate::string::common::StringArrayType;
+use crate::strings::StringArrayType;
 use crate::utils::{make_scalar_function, utf8_to_int_type};
 use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray};
 use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs
index 969969ef2f..4e0c293577 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -18,7 +18,7 @@
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use crate::string::common::{make_and_append_view, StringArrayType};
+use crate::strings::{make_and_append_view, StringArrayType};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use arrow::array::{
     Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to