This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 9f893a4241 perf: Optimize `split_part`, support `Utf8View` (#21119)
9f893a4241 is described below
commit 9f893a4241155f3d65f859450ccd86cca7a68d16
Author: Neil Conway <[email protected]>
AuthorDate: Fri Mar 27 13:47:21 2026 -0400
perf: Optimize `split_part`, support `Utf8View` (#21119)
## Which issue does this PR close?
- Closes #21117.
- Closes #21118.
## Rationale for this change
`split_part` currently accepts `Utf8View` but always returns `Utf8`.
When given `Utf8View` input, it should instead return `Utf8View` output.
While we're at it, optimize `split_part` for single-character delimiters
(the common case): `str::split(&str)` is significantly slower than
`str::split(char)` for single-character ASCII delimiters, because the
former uses a general string matching algorithm but the latter uses
`memchr::memchr`.
Benchmark results (M4 Max):
- `utf8_single_char/pos_first`: 142 µs → 104 µs (-26%)
- `utf8_single_char/pos_middle`: 389 µs → 365 µs (-6%)
- `utf8_single_char/pos_negative`: 154 µs → 109 µs (-29%)
- `utf8_multi_char/pos_middle`: 356 µs → 361 µs (~0%, noise)
- `utf8view_single_char/pos_first`: 143 µs → 111 µs (-22%)
- `utf8_long_strings/pos_middle`: 2568 µs → 1984 µs (-23%)
- `utf8view_long_parts/pos_middle`: 998 µs → 470 µs (-53%)
## What changes are included in this PR?
* Revise `split_part` benchmarks to reduce redundancy and improve
`Utf8View` coverage
* Support `Utf8View` -> `Utf8View` in `split_part`
* Refactor `split_part` to clean up some redundant code
* Optimize `split_part` for single-character delimiters
* Add SLT test coverage for `split_part` with `Utf8View` input
## Are these changes tested?
Yes. New tests and benchmarks added.
## Are there any user-facing changes?
No.
---
datafusion/functions/benches/split_part.rs | 393 +++++++--------------
datafusion/functions/src/string/split_part.rs | 242 +++++++------
.../sqllogictest/test_files/string/string_view.slt | 46 +++
3 files changed, 297 insertions(+), 384 deletions(-)
diff --git a/datafusion/functions/benches/split_part.rs
b/datafusion/functions/benches/split_part.rs
index 7ef84a0589..72ca6f66a0 100644
--- a/datafusion/functions/benches/split_part.rs
+++ b/datafusion/functions/benches/split_part.rs
@@ -19,7 +19,7 @@ use arrow::array::{ArrayRef, Int64Array, StringArray,
StringViewArray};
use arrow::datatypes::{DataType, Field};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use datafusion_common::config::ConfigOptions;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
use datafusion_functions::string::split_part;
use rand::distr::Alphanumeric;
use rand::prelude::StdRng;
@@ -29,14 +29,14 @@ use std::sync::Arc;
const N_ROWS: usize = 8192;
-/// Generate test data for split_part benchmarks
-/// Creates strings with multiple parts separated by the delimiter
+/// Creates strings with `num_parts` random alphanumeric segments of `part_len`
+/// bytes each, joined by `delimiter`.
fn gen_split_part_data(
n_rows: usize,
- num_parts: usize, // number of parts in each string (separated by
delimiter)
- part_len: usize, // length of each part
- delimiter: &str, // the delimiter to use
- use_string_view: bool, // false -> StringArray, true -> StringViewArray
+ num_parts: usize,
+ part_len: usize,
+ delimiter: &str,
+ use_string_view: bool,
) -> (ColumnarValue, ColumnarValue) {
let mut rng = StdRng::seed_from_u64(42);
@@ -73,303 +73,154 @@ fn gen_split_part_data(
}
}
-fn gen_positions(n_rows: usize, position: i64) -> ColumnarValue {
- let positions: Vec<i64> = vec![position; n_rows];
- ColumnarValue::Array(Arc::new(Int64Array::from(positions)) as ArrayRef)
+#[expect(clippy::too_many_arguments)]
+fn bench_split_part(
+ group: &mut criterion::BenchmarkGroup<'_,
criterion::measurement::WallTime>,
+ func: &ScalarUDF,
+ config_options: &Arc<ConfigOptions>,
+ name: &str,
+ tag: &str,
+ strings: ColumnarValue,
+ delimiters: ColumnarValue,
+ position: i64,
+) {
+ let positions: ColumnarValue =
+ ColumnarValue::Array(Arc::new(Int64Array::from(vec![position;
N_ROWS])));
+ let args = vec![strings, delimiters, positions];
+ let arg_fields: Vec<_> = args
+ .iter()
+ .enumerate()
+ .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(),
true).into())
+ .collect();
+ let return_type = match args[0].data_type() {
+ DataType::Utf8View => DataType::Utf8View,
+ _ => DataType::Utf8,
+ };
+ let return_field = Field::new("f", return_type, true).into();
+
+ group.bench_function(BenchmarkId::new(name, tag), |b| {
+ b.iter(|| {
+ black_box(
+ func.invoke_with_args(ScalarFunctionArgs {
+ args: args.clone(),
+ arg_fields: arg_fields.clone(),
+ number_rows: N_ROWS,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::clone(config_options),
+ })
+ .expect("split_part should work"),
+ )
+ })
+ });
}
fn criterion_benchmark(c: &mut Criterion) {
let split_part_func = split_part();
let config_options = Arc::new(ConfigOptions::default());
-
let mut group = c.benchmark_group("split_part");
- // Test different scenarios
- // Scenario 1: Single-char delimiter, first position (should be fastest
with optimization)
- {
- let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".",
false);
- let positions = gen_positions(N_ROWS, 1);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("single_char_delim",
"pos_first"), |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
- }
-
- // Scenario 2: Single-char delimiter, middle position
+ // Utf8, single-char delimiter, first position
{
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".",
false);
- let positions = gen_positions(N_ROWS, 5);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("single_char_delim",
"pos_middle"), |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8_single_char",
+ "pos_first",
+ strings,
+ delimiters,
+ 1,
+ );
}
- // Scenario 3: Single-char delimiter, last position
+ // Utf8, single-char delimiter, middle position
{
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".",
false);
- let positions = gen_positions(N_ROWS, 10);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("single_char_delim",
"pos_last"), |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8_single_char",
+ "pos_middle",
+ strings,
+ delimiters,
+ 5,
+ );
}
- // Scenario 4: Single-char delimiter, negative position (last element)
+ // Utf8, single-char delimiter, negative position
{
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".",
false);
- let positions = gen_positions(N_ROWS, -1);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(
- BenchmarkId::new("single_char_delim", "pos_negative"),
- |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- },
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8_single_char",
+ "pos_negative",
+ strings,
+ delimiters,
+ -1,
);
}
- // Scenario 5: Multi-char delimiter, first position
+ // Utf8, multi-char delimiter, middle position
{
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~",
false);
- let positions = gen_positions(N_ROWS, 1);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("multi_char_delim",
"pos_first"), |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
- }
-
- // Scenario 6: Multi-char delimiter, middle position
- {
- let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~",
false);
- let positions = gen_positions(N_ROWS, 5);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("multi_char_delim",
"pos_middle"), |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8_multi_char",
+ "pos_middle",
+ strings,
+ delimiters,
+ 5,
+ );
}
- // Scenario 7: StringViewArray, single-char delimiter, first position
+ // Utf8View, single-char delimiter, first position
{
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".",
true);
- let positions = gen_positions(N_ROWS, 1);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(
- BenchmarkId::new("string_view_single_char", "pos_first"),
- |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- },
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8view_single_char",
+ "pos_first",
+ strings,
+ delimiters,
+ 1,
);
}
- // Scenario 8: Many parts (20), position near end - shows benefit of early
termination
+ // Utf8, single-char delimiter, many long parts
{
- let (strings, delimiters) = gen_split_part_data(N_ROWS, 20, 8, ".",
false);
- let positions = gen_positions(N_ROWS, 2);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(BenchmarkId::new("many_parts_20", "pos_second"),
|b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- });
+ let (strings, delimiters) = gen_split_part_data(N_ROWS, 50, 16, ".",
false);
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8_long_strings",
+ "pos_middle",
+ strings,
+ delimiters,
+ 25,
+ );
}
- // Scenario 9: Long strings with many parts - worst case for old
implementation
+ // Utf8View, single-char delimiter, middle position, long parts
{
- let (strings, delimiters) = gen_split_part_data(N_ROWS, 50, 16, "/",
false);
- let positions = gen_positions(N_ROWS, 1);
- let args = vec![strings, delimiters, positions];
- let arg_fields: Vec<_> = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect();
- let return_field = Field::new("f", DataType::Utf8, true).into();
-
- group.bench_function(
- BenchmarkId::new("long_strings_50_parts", "pos_first"),
- |b| {
- b.iter(|| {
- black_box(
- split_part_func
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: N_ROWS,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("split_part should work"),
- )
- })
- },
+ let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 32, ".",
true);
+ bench_split_part(
+ &mut group,
+ &split_part_func,
+ &config_options,
+ "utf8view_long_parts",
+ "pos_middle",
+ strings,
+ delimiters,
+ 5,
);
}
diff --git a/datafusion/functions/src/string/split_part.rs
b/datafusion/functions/src/string/split_part.rs
index 2f36c767f0..87beacabe8 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -17,15 +17,14 @@
use crate::utils::utf8_to_str_type;
use arrow::array::{
- ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, StringArrayType,
- StringViewArray,
+ ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType,
+ StringLikeArrayBuilder, StringViewBuilder,
};
-use arrow::array::{AsArray, GenericStringBuilder};
use arrow::datatypes::DataType;
use datafusion_common::ScalarValue;
use datafusion_common::cast::as_int64_array;
use datafusion_common::types::{NativeType, logical_int64, logical_string};
-use datafusion_common::{DataFusionError, Result, exec_datafusion_err,
exec_err};
+use datafusion_common::{Result, exec_datafusion_err, exec_err};
use datafusion_expr::{
Coercion, ColumnarValue, Documentation, TypeSignatureClass, Volatility,
};
@@ -92,7 +91,11 @@ impl ScalarUDFImpl for SplitPartFunc {
}
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
- utf8_to_str_type(&arg_types[0], "split_part")
+ if arg_types[0] == DataType::Utf8View {
+ Ok(DataType::Utf8View)
+ } else {
+ utf8_to_str_type(&arg_types[0], "split_part")
+ }
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
@@ -118,71 +121,62 @@ impl ScalarUDFImpl for SplitPartFunc {
// Unpack the ArrayRefs from the arguments
let n_array = as_int64_array(&args[2])?;
- let result = match (args[0].data_type(), args[1].data_type()) {
- (DataType::Utf8View, DataType::Utf8View) => {
- split_part_impl::<&StringViewArray, &StringViewArray, i32>(
- &args[0].as_string_view(),
- &args[1].as_string_view(),
- n_array,
- )
- }
- (DataType::Utf8View, DataType::Utf8) => {
- split_part_impl::<&StringViewArray, &GenericStringArray<i32>,
i32>(
- &args[0].as_string_view(),
- &args[1].as_string::<i32>(),
- n_array,
- )
- }
- (DataType::Utf8View, DataType::LargeUtf8) => {
- split_part_impl::<&StringViewArray, &GenericStringArray<i64>,
i32>(
- &args[0].as_string_view(),
- &args[1].as_string::<i64>(),
- n_array,
- )
- }
- (DataType::Utf8, DataType::Utf8View) => {
- split_part_impl::<&GenericStringArray<i32>, &StringViewArray,
i32>(
- &args[0].as_string::<i32>(),
- &args[1].as_string_view(),
- n_array,
- )
- }
- (DataType::LargeUtf8, DataType::Utf8View) => {
- split_part_impl::<&GenericStringArray<i64>, &StringViewArray,
i64>(
- &args[0].as_string::<i64>(),
- &args[1].as_string_view(),
- n_array,
- )
- }
- (DataType::Utf8, DataType::Utf8) => {
- split_part_impl::<&GenericStringArray<i32>,
&GenericStringArray<i32>, i32>(
- &args[0].as_string::<i32>(),
- &args[1].as_string::<i32>(),
- n_array,
- )
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- split_part_impl::<&GenericStringArray<i64>,
&GenericStringArray<i64>, i64>(
- &args[0].as_string::<i64>(),
- &args[1].as_string::<i64>(),
- n_array,
- )
- }
- (DataType::Utf8, DataType::LargeUtf8) => {
- split_part_impl::<&GenericStringArray<i32>,
&GenericStringArray<i64>, i32>(
- &args[0].as_string::<i32>(),
- &args[1].as_string::<i64>(),
- n_array,
+
+ // Dispatch on delimiter type for a given string array and builder.
+ macro_rules! split_part_for_delimiter_type {
+ ($str_arr:expr, $builder:expr) => {
+ match args[1].data_type() {
+ DataType::Utf8View => split_part_impl(
+ $str_arr,
+ &args[1].as_string_view(),
+ n_array,
+ $builder,
+ ),
+ DataType::Utf8 => split_part_impl(
+ $str_arr,
+ &args[1].as_string::<i32>(),
+ n_array,
+ $builder,
+ ),
+ DataType::LargeUtf8 => split_part_impl(
+ $str_arr,
+ &args[1].as_string::<i64>(),
+ n_array,
+ $builder,
+ ),
+ other => {
+ exec_err!("Unsupported delimiter type {other:?} for
split_part")
+ }
+ }
+ };
+ }
+
+ let result = match args[0].data_type() {
+ DataType::Utf8View => split_part_for_delimiter_type!(
+ &args[0].as_string_view(),
+ StringViewBuilder::with_capacity(inferred_length)
+ ),
+ DataType::Utf8 => {
+ let str_arr = &args[0].as_string::<i32>();
+ split_part_for_delimiter_type!(
+ str_arr,
+ GenericStringBuilder::<i32>::with_capacity(
+ inferred_length,
+ str_arr.value_data().len(),
+ )
)
}
- (DataType::LargeUtf8, DataType::Utf8) => {
- split_part_impl::<&GenericStringArray<i64>,
&GenericStringArray<i32>, i64>(
- &args[0].as_string::<i64>(),
- &args[1].as_string::<i32>(),
- n_array,
+ DataType::LargeUtf8 => {
+ let str_arr = &args[0].as_string::<i64>();
+ split_part_for_delimiter_type!(
+ str_arr,
+ GenericStringBuilder::<i64>::with_capacity(
+ inferred_length,
+ str_arr.value_data().len(),
+ )
)
}
- _ => exec_err!("Unsupported combination of argument types for
split_part"),
+ other => exec_err!("Unsupported string type {other:?} for
split_part"),
};
if is_scalar {
// If all inputs are scalar, keep the output as scalar
@@ -198,71 +192,93 @@ impl ScalarUDFImpl for SplitPartFunc {
}
}
-fn split_part_impl<'a, StringArrType, DelimiterArrType, StringArrayLen>(
+/// Finds the nth split part of `string` by `delimiter`.
+#[inline]
+fn split_nth<'a>(string: &'a str, delimiter: &str, n: usize) -> Option<&'a
str> {
+ if delimiter.len() == 1 {
+ // A single-byte UTF-8 string is always ASCII, so we can safely cast
+ // just the first byte to a character. `str::split(char)` internally
+ // uses memchr::memchr and is notably faster than `str::split(&str)`,
+ // even for a single character string.
+ string.split(delimiter.as_bytes()[0] as char).nth(n)
+ } else {
+ string.split(delimiter).nth(n)
+ }
+}
+
+/// Like `split_nth` but splits from the right.
+#[inline]
+fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: usize) -> Option<&'a
str> {
+ if delimiter.len() == 1 {
+ // A single-byte UTF-8 string is always ASCII, so we can safely cast
+ // just the first byte to a character. `str::rsplit(char)` internally
+ // uses memchr::memrchr and is notably faster than `str::rsplit(&str)`,
+ // even for a single character string.
+ string.rsplit(delimiter.as_bytes()[0] as char).nth(n)
+ } else {
+ string.rsplit(delimiter).nth(n)
+ }
+}
+
+fn split_part_impl<'a, StringArrType, DelimiterArrType, B>(
string_array: &StringArrType,
delimiter_array: &DelimiterArrType,
n_array: &Int64Array,
+ mut builder: B,
) -> Result<ArrayRef>
where
StringArrType: StringArrayType<'a>,
DelimiterArrType: StringArrayType<'a>,
- StringArrayLen: OffsetSizeTrait,
+ B: StringLikeArrayBuilder,
{
- let mut builder: GenericStringBuilder<StringArrayLen> =
GenericStringBuilder::new();
-
- string_array
+ for ((string, delimiter), n) in string_array
.iter()
.zip(delimiter_array.iter())
.zip(n_array.iter())
- .try_for_each(|((string, delimiter), n)| -> Result<(),
DataFusionError> {
- match (string, delimiter, n) {
- (Some(string), Some(delimiter), Some(n)) => {
- let result = match n.cmp(&0) {
- std::cmp::Ordering::Greater => {
- // Positive index: use nth() to avoid collecting
all parts
- // This stops iteration as soon as we find the nth
element
- let idx: usize = (n - 1).try_into().map_err(|_| {
- exec_datafusion_err!(
- "split_part index {n} exceeds maximum
supported value"
- )
- })?;
-
- if delimiter.is_empty() {
- // Match PostgreSQL split_part behavior for
empty delimiter:
- // treat the input as a single field ("ab" ->
["ab"]),
- // rather than Rust's split("") result (["",
"a", "b", ""]).
- (n == 1).then_some(string)
- } else {
- string.split(delimiter).nth(idx)
- }
+ {
+ match (string, delimiter, n) {
+ (Some(string), Some(delimiter), Some(n)) => {
+ let result = match n.cmp(&0) {
+ std::cmp::Ordering::Greater => {
+ let idx: usize = (n - 1).try_into().map_err(|_| {
+ exec_datafusion_err!(
+ "split_part index {n} exceeds maximum
supported value"
+ )
+ })?;
+ if delimiter.is_empty() {
+ // Match PostgreSQL's behavior: empty delimiter
+ // treats input as a single field, so only position
+ // 1 returns data.
+ (n == 1).then_some(string)
+ } else {
+ split_nth(string, delimiter, idx)
}
- std::cmp::Ordering::Less => {
- // Negative index: use rsplit().nth() to
efficiently get from the end
- // rsplit iterates in reverse, so -1 means first
from rsplit (index 0)
- let idx: usize = (n.unsigned_abs() -
1).try_into().map_err(|_| {
+ }
+ std::cmp::Ordering::Less => {
+ let idx: usize =
+ (n.unsigned_abs() - 1).try_into().map_err(|_| {
exec_datafusion_err!(
"split_part index {n} exceeds minimum
supported value"
)
})?;
- if delimiter.is_empty() {
- // Match PostgreSQL split_part behavior for
empty delimiter:
- // treat the input as a single field ("ab" ->
["ab"]),
- // rather than Rust's split("") result (["",
"a", "b", ""]).
- (n == -1).then_some(string)
- } else {
- string.rsplit(delimiter).nth(idx)
- }
+ if delimiter.is_empty() {
+ // Match PostgreSQL's behavior: empty delimiter
+ // treats input as a single field, so only position
+ // -1 returns data.
+ (n == -1).then_some(string)
+ } else {
+ rsplit_nth(string, delimiter, idx)
}
- std::cmp::Ordering::Equal => {
- return exec_err!("field position must not be
zero");
- }
- };
- builder.append_value(result.unwrap_or(""));
- }
- _ => builder.append_null(),
+ }
+ std::cmp::Ordering::Equal => {
+ return exec_err!("field position must not be zero");
+ }
+ };
+ builder.append_value(result.unwrap_or(""));
}
- Ok(())
- })?;
+ _ => builder.append_null(),
+ }
+ }
Ok(Arc::new(builder.finish()) as ArrayRef)
}
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt
b/datafusion/sqllogictest/test_files/string/string_view.slt
index 5c7236f576..086f37d6c3 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -908,6 +908,52 @@ logical_plan
01)Projection: split_part(test.column1_utf8view, Utf8("f"), Int64(1)) AS c1,
split_part(Utf8("testtesttest"), test.column1_utf8view, Int64(1)) AS c2
02)--TableScan: test projection=[column1_utf8view]
+# SPLIT_PART with Utf8View
+query T
+SELECT split_part(arrow_cast('abc~@~def~@~ghi', 'Utf8View'), '~@~', 2);
+----
+def
+
+query T
+SELECT split_part(arrow_cast('abc~@~def~@~ghi', 'Utf8View'), '~@~', 20);
+----
+(empty)
+
+query T
+SELECT split_part(arrow_cast('abc~@~def~@~ghi', 'Utf8View'), '~@~', -1);
+----
+ghi
+
+statement error DataFusion error: Execution error: field position must not be
zero
+SELECT split_part(arrow_cast('abc~@~def~@~ghi', 'Utf8View'), '~@~', 0);
+
+query T
+SELECT split_part(arrow_cast('a,b', 'Utf8View'), '', 1);
+----
+a,b
+
+query T
+SELECT split_part(arrow_cast('a,b', 'Utf8View'), '', 2);
+----
+(empty)
+
+query T
+SELECT split_part(arrow_cast('a,b', 'Utf8View'), '', -1);
+----
+a,b
+
+# Single-char delimiter
+query T
+SELECT split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2);
+----
+b
+
+# Verify Utf8View input produces Utf8View output
+query T
+SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2));
+----
+Utf8View
+
## Ensure no casts for STRPOS
query TT
EXPLAIN SELECT
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]