This is an automated email from the ASF dual-hosted git repository.

blaginin pushed a commit to branch annarose/dict-coercion
in repository https://gitbox.apache.org/repos/asf/datafusion-sandbox.git

commit eb3314123ebe4496e7f63030641a4bf95915b547
Author: theirix <[email protected]>
AuthorDate: Thu Feb 5 01:12:53 2026 +0000

    feat: unify left and right functions and benches (#20114)
    
    ## Which issue does this PR close?
    
    - Closes #20103
    
    ## Rationale for this change
    
    A refactoring PR for performance improvement PRs for left #19749 and
    right #20068.
    
    ## What changes are included in this PR?
    
    1. Removed a lot of code duplication by extracting a common stringarray
    / stringview implementation. Now left and right UDFs entry points are
    leaner. Differences are only in slicing - from the left or from the
    right - which is implemented in a generic trait parameter, following the
    design of trim.
    
    2. Switched `left` to use `make_view` to avoid buffer tinkering in
    datafusion code.
    
    4. Combine left and right benches together
    
    ## Are these changes tested?
    
    - Existing unit tests
    - Existing SLTs passed
    - Benches show the same performance improvement of 60-85%
    
    Bench results against pre-optimisation commit
    458b49109af58e678520edebbad9fb3edfd26992:
    <details>
    left size=1024/string_array positive n/1024
                            time:   [34.150 µs 34.694 µs 35.251 µs]
    change: [−71.694% −70.722% −69.818%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 1 outliers among 100 measurements (1.00%)
      1 (1.00%) high mild
    left size=1024/string_array negative n/1024
                            time:   [30.860 µs 31.396 µs 31.998 µs]
    change: [−85.846% −85.294% −84.759%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      2 (2.00%) low mild
      4 (4.00%) high mild
      2 (2.00%) high severe
    
    left size=4096/string_array positive n/4096
                            time:   [112.19 µs 114.28 µs 116.98 µs]
    change: [−71.673% −70.934% −70.107%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 3 outliers among 100 measurements (3.00%)
      2 (2.00%) high mild
      1 (1.00%) high severe
    left size=4096/string_array negative n/4096
                            time:   [126.71 µs 129.06 µs 131.26 µs]
    change: [−84.204% −83.809% −83.455%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 5 outliers among 100 measurements (5.00%)
      3 (3.00%) low mild
      2 (2.00%) high mild
    
    left size=1024/string_view_array positive n/1024
                            time:   [30.249 µs 30.887 µs 31.461 µs]
    change: [−75.288% −74.499% −73.743%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 4 outliers among 100 measurements (4.00%)
      3 (3.00%) low mild
      1 (1.00%) high mild
    left size=1024/string_view_array negative n/1024
                            time:   [48.404 µs 49.007 µs 49.608 µs]
    change: [−66.827% −65.727% −64.652%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 3 outliers among 100 measurements (3.00%)
      1 (1.00%) low mild
      1 (1.00%) high mild
      1 (1.00%) high severe
    
    left size=4096/string_view_array positive n/4096
                            time:   [145.25 µs 148.47 µs 151.85 µs]
    change: [−68.913% −67.836% −66.770%] (p = 0.00 < 0.05)
                            Performance has improved.
    left size=4096/string_view_array negative n/4096
                            time:   [203.11 µs 206.31 µs 209.98 µs]
    change: [−57.411% −56.773% −56.142%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 15 outliers among 100 measurements (15.00%)
      1 (1.00%) low mild
      13 (13.00%) high mild
      1 (1.00%) high severe
    
    right size=1024/string_array positive n/1024
                            time:   [30.820 µs 31.674 µs 32.627 µs]
    change: [−84.230% −83.842% −83.402%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 5 outliers among 100 measurements (5.00%)
      5 (5.00%) high mild
    right size=1024/string_array negative n/1024
                            time:   [32.434 µs 33.170 µs 33.846 µs]
    change: [−88.796% −88.460% −88.164%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 3 outliers among 100 measurements (3.00%)
      3 (3.00%) high mild
    
    right size=4096/string_array positive n/4096
                            time:   [124.71 µs 126.54 µs 128.27 µs]
    change: [−83.321% −82.902% −82.537%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 2 outliers among 100 measurements (2.00%)
      2 (2.00%) high mild
    right size=4096/string_array negative n/4096
                            time:   [125.05 µs 127.67 µs 130.35 µs]
    change: [−89.376% −89.193% −89.004%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 1 outliers among 100 measurements (1.00%)
      1 (1.00%) high mild
    
    right size=1024/string_view_array positive n/1024
                            time:   [29.110 µs 29.608 µs 30.141 µs]
    change: [−79.807% −79.330% −78.683%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      6 (6.00%) high mild
      2 (2.00%) high severe
    right size=1024/string_view_array negative n/1024
                            time:   [44.883 µs 45.656 µs 46.511 µs]
    change: [−71.157% −70.546% −69.874%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 6 outliers among 100 measurements (6.00%)
      5 (5.00%) high mild
      1 (1.00%) high severe
    
    right size=4096/string_view_array positive n/4096
                            time:   [139.57 µs 142.18 µs 144.96 µs]
    change: [−75.610% −75.088% −74.549%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 2 outliers among 100 measurements (2.00%)
      2 (2.00%) high severe
    right size=4096/string_view_array negative n/4096
                            time:   [221.47 µs 224.47 µs 227.72 µs]
    change: [−64.625% −64.047% −63.504%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 3 outliers among 100 measurements (3.00%)
      3 (3.00%) high mild
    
    </details>
    
    
    ## Are there any user-facing changes?
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    -->
    
    <!--
    If there are any breaking changes to public APIs, please add the `api
    change` label.
    -->
---
 datafusion/functions/Cargo.toml            |   7 +-
 datafusion/functions/benches/left.rs       | 140 ----------------------
 datafusion/functions/benches/left_right.rs | 130 ++++++++++++++++++++
 datafusion/functions/benches/right.rs      | 150 -----------------------
 datafusion/functions/src/unicode/common.rs | 183 +++++++++++++++++++++++++++++
 datafusion/functions/src/unicode/left.rs   | 156 +-----------------------
 datafusion/functions/src/unicode/mod.rs    |   1 +
 datafusion/functions/src/unicode/right.rs  | 140 +---------------------
 8 files changed, 327 insertions(+), 580 deletions(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index a8c41121b..5af901a6b 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -308,12 +308,7 @@ required-features = ["string_expressions"]
 
 [[bench]]
 harness = false
-name = "left"
-required-features = ["unicode_expressions"]
-
-[[bench]]
-harness = false
-name = "right"
+name = "left_right"
 required-features = ["unicode_expressions"]
 
 [[bench]]
diff --git a/datafusion/functions/benches/left.rs 
b/datafusion/functions/benches/left.rs
deleted file mode 100644
index d208e7d4f..000000000
--- a/datafusion/functions/benches/left.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-extern crate criterion;
-
-use std::hint::black_box;
-use std::sync::Arc;
-
-use arrow::array::{ArrayRef, Int64Array};
-use arrow::datatypes::{DataType, Field};
-use arrow::util::bench_util::{
-    create_string_array_with_len, create_string_view_array_with_len,
-};
-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use datafusion_common::config::ConfigOptions;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
-use datafusion_functions::unicode::left;
-
-fn create_args(
-    size: usize,
-    str_len: usize,
-    use_negative: bool,
-    is_string_view: bool,
-) -> Vec<ColumnarValue> {
-    let string_arg = if is_string_view {
-        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
-            size, 0.1, str_len, true,
-        )))
-    } else {
-        ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
-            size, 0.1, str_len,
-        )))
-    };
-
-    // For negative n, we want to trigger the double-iteration code path
-    let n_values: Vec<i64> = if use_negative {
-        (0..size).map(|i| -((i % 10 + 1) as i64)).collect()
-    } else {
-        (0..size).map(|i| (i % 10 + 1) as i64).collect()
-    };
-    let n_array = Arc::new(Int64Array::from(n_values));
-
-    vec![
-        string_arg,
-        ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
-    ]
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    for is_string_view in [false, true] {
-        for size in [1024, 4096] {
-            let mut group = c.benchmark_group(format!("left size={size}"));
-
-            // Benchmark with positive n (no optimization needed)
-            let mut function_name = if is_string_view {
-                "string_view_array positive n"
-            } else {
-                "string_array positive n"
-            };
-            let args = create_args(size, 32, false, is_string_view);
-            group.bench_function(BenchmarkId::new(function_name, size), |b| {
-                let arg_fields = args
-                    .iter()
-                    .enumerate()
-                    .map(|(idx, arg)| {
-                        Field::new(format!("arg_{idx}"), arg.data_type(), 
true).into()
-                    })
-                    .collect::<Vec<_>>();
-                let config_options = Arc::new(ConfigOptions::default());
-
-                b.iter(|| {
-                    black_box(
-                        left()
-                            .invoke_with_args(ScalarFunctionArgs {
-                                args: args.clone(),
-                                arg_fields: arg_fields.clone(),
-                                number_rows: size,
-                                return_field: Field::new("f", DataType::Utf8, 
true)
-                                    .into(),
-                                config_options: Arc::clone(&config_options),
-                            })
-                            .expect("left should work"),
-                    )
-                })
-            });
-
-            // Benchmark with negative n (triggers optimization)
-            function_name = if is_string_view {
-                "string_view_array negative n"
-            } else {
-                "string_array negative n"
-            };
-            let args = create_args(size, 32, true, is_string_view);
-            group.bench_function(BenchmarkId::new(function_name, size), |b| {
-                let arg_fields = args
-                    .iter()
-                    .enumerate()
-                    .map(|(idx, arg)| {
-                        Field::new(format!("arg_{idx}"), arg.data_type(), 
true).into()
-                    })
-                    .collect::<Vec<_>>();
-                let config_options = Arc::new(ConfigOptions::default());
-
-                b.iter(|| {
-                    black_box(
-                        left()
-                            .invoke_with_args(ScalarFunctionArgs {
-                                args: args.clone(),
-                                arg_fields: arg_fields.clone(),
-                                number_rows: size,
-                                return_field: Field::new("f", DataType::Utf8, 
true)
-                                    .into(),
-                                config_options: Arc::clone(&config_options),
-                            })
-                            .expect("left should work"),
-                    )
-                })
-            });
-
-            group.finish();
-        }
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
diff --git a/datafusion/functions/benches/left_right.rs 
b/datafusion/functions/benches/left_right.rs
new file mode 100644
index 000000000..913a2194f
--- /dev/null
+++ b/datafusion/functions/benches/left_right.rs
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use std::hint::black_box;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array};
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::{
+    create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::unicode::{left, right};
+
+fn create_args(
+    size: usize,
+    str_len: usize,
+    use_negative: bool,
+    is_string_view: bool,
+) -> Vec<ColumnarValue> {
+    let string_arg = if is_string_view {
+        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
+            size, 0.1, str_len, true,
+        )))
+    } else {
+        ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
+            size, 0.1, str_len,
+        )))
+    };
+
+    // For negative n, we want to trigger the double-iteration code path
+    let n_values: Vec<i64> = if use_negative {
+        (0..size).map(|i| -((i % 10 + 1) as i64)).collect()
+    } else {
+        (0..size).map(|i| (i % 10 + 1) as i64).collect()
+    };
+    let n_array = Arc::new(Int64Array::from(n_values));
+
+    vec![
+        string_arg,
+        ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
+    ]
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let left_function = left();
+    let right_function = right();
+
+    for function in [left_function, right_function] {
+        for is_string_view in [false, true] {
+            for is_negative in [false, true] {
+                for size in [1024, 4096] {
+                    let function_name = function.name();
+                    let mut group =
+                        c.benchmark_group(format!("{function_name} 
size={size}"));
+
+                    let bench_name = format!(
+                        "{} {} n",
+                        if is_string_view {
+                            "string_view_array"
+                        } else {
+                            "string_array"
+                        },
+                        if is_negative { "negative" } else { "positive" },
+                    );
+                    let return_type = if is_string_view {
+                        DataType::Utf8View
+                    } else {
+                        DataType::Utf8
+                    };
+
+                    let args = create_args(size, 32, is_negative, 
is_string_view);
+                    group.bench_function(BenchmarkId::new(bench_name, size), 
|b| {
+                        let arg_fields = args
+                            .iter()
+                            .enumerate()
+                            .map(|(idx, arg)| {
+                                Field::new(format!("arg_{idx}"), 
arg.data_type(), true)
+                                    .into()
+                            })
+                            .collect::<Vec<_>>();
+                        let config_options = 
Arc::new(ConfigOptions::default());
+
+                        b.iter(|| {
+                            black_box(
+                                function
+                                    .invoke_with_args(ScalarFunctionArgs {
+                                        args: args.clone(),
+                                        arg_fields: arg_fields.clone(),
+                                        number_rows: size,
+                                        return_field: Field::new(
+                                            "f",
+                                            return_type.clone(),
+                                            true,
+                                        )
+                                        .into(),
+                                        config_options: 
Arc::clone(&config_options),
+                                    })
+                                    .expect("should work"),
+                            )
+                        })
+                    });
+
+                    group.finish();
+                }
+            }
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/right.rs 
b/datafusion/functions/benches/right.rs
deleted file mode 100644
index 80294ecc4..000000000
--- a/datafusion/functions/benches/right.rs
+++ /dev/null
@@ -1,150 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-extern crate criterion;
-
-use std::hint::black_box;
-use std::sync::Arc;
-
-use arrow::array::{ArrayRef, Int64Array};
-use arrow::datatypes::{DataType, Field};
-use arrow::util::bench_util::{
-    create_string_array_with_len, create_string_view_array_with_len,
-};
-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use datafusion_common::config::ConfigOptions;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
-use datafusion_functions::unicode::right;
-
-fn create_args(
-    size: usize,
-    str_len: usize,
-    use_negative: bool,
-    is_string_view: bool,
-) -> Vec<ColumnarValue> {
-    let string_arg = if is_string_view {
-        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
-            size, 0.1, str_len, true,
-        )))
-    } else {
-        ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
-            size, 0.1, str_len,
-        )))
-    };
-
-    // For negative n, we want to trigger the double-iteration code path
-    let n_values: Vec<i64> = if use_negative {
-        (0..size).map(|i| -((i % 10 + 1) as i64)).collect()
-    } else {
-        (0..size).map(|i| (i % 10 + 1) as i64).collect()
-    };
-    let n_array = Arc::new(Int64Array::from(n_values));
-
-    vec![
-        string_arg,
-        ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
-    ]
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    for is_string_view in [false, true] {
-        for size in [1024, 4096] {
-            let mut group = c.benchmark_group(format!("right size={size}"));
-
-            // Benchmark with positive n (no optimization needed)
-            let mut function_name = if is_string_view {
-                "string_view_array positive n"
-            } else {
-                "string_array positive n"
-            };
-            let args = create_args(size, 32, false, is_string_view);
-            group.bench_function(BenchmarkId::new(function_name, size), |b| {
-                let arg_fields = args
-                    .iter()
-                    .enumerate()
-                    .map(|(idx, arg)| {
-                        Field::new(format!("arg_{idx}"), arg.data_type(), 
true).into()
-                    })
-                    .collect::<Vec<_>>();
-                let return_type = if is_string_view {
-                    DataType::Utf8View
-                } else {
-                    DataType::Utf8
-                };
-                let config_options = Arc::new(ConfigOptions::default());
-
-                b.iter(|| {
-                    black_box(
-                        right()
-                            .invoke_with_args(ScalarFunctionArgs {
-                                args: args.clone(),
-                                arg_fields: arg_fields.clone(),
-                                number_rows: size,
-                                return_field: Field::new("f", 
return_type.clone(), true)
-                                    .into(),
-                                config_options: Arc::clone(&config_options),
-                            })
-                            .expect("right should work"),
-                    )
-                })
-            });
-
-            // Benchmark with negative n (triggers optimization)
-            function_name = if is_string_view {
-                "string_view_array negative n"
-            } else {
-                "string_array negative n"
-            };
-            let args = create_args(size, 32, true, is_string_view);
-            group.bench_function(BenchmarkId::new(function_name, size), |b| {
-                let arg_fields = args
-                    .iter()
-                    .enumerate()
-                    .map(|(idx, arg)| {
-                        Field::new(format!("arg_{idx}"), arg.data_type(), 
true).into()
-                    })
-                    .collect::<Vec<_>>();
-                let return_type = if is_string_view {
-                    DataType::Utf8View
-                } else {
-                    DataType::Utf8
-                };
-                let config_options = Arc::new(ConfigOptions::default());
-
-                b.iter(|| {
-                    black_box(
-                        right()
-                            .invoke_with_args(ScalarFunctionArgs {
-                                args: args.clone(),
-                                arg_fields: arg_fields.clone(),
-                                number_rows: size,
-                                return_field: Field::new("f", 
return_type.clone(), true)
-                                    .into(),
-                                config_options: Arc::clone(&config_options),
-                            })
-                            .expect("right should work"),
-                    )
-                })
-            });
-
-            group.finish();
-        }
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
diff --git a/datafusion/functions/src/unicode/common.rs 
b/datafusion/functions/src/unicode/common.rs
new file mode 100644
index 000000000..93f0c7900
--- /dev/null
+++ b/datafusion/functions/src/unicode/common.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Common utilities for implementing unicode functions
+
+use arrow::array::{
+    Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, 
Int64Array,
+    OffsetSizeTrait, StringViewArray, make_view,
+};
+use arrow::datatypes::DataType;
+use arrow_buffer::{NullBuffer, ScalarBuffer};
+use datafusion_common::cast::{
+    as_generic_string_array, as_int64_array, as_string_view_array,
+};
+use datafusion_common::exec_err;
+use std::cmp::Ordering;
+use std::ops::Range;
+use std::sync::Arc;
+
+/// A trait for `left` and `right` byte slicing operations
+pub(crate) trait LeftRightSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize>;
+}
+
+pub(crate) struct LeftSlicer {}
+
+impl LeftRightSlicer for LeftSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize> {
+        0..left_right_byte_length(string, n)
+    }
+}
+
+pub(crate) struct RightSlicer {}
+
+impl LeftRightSlicer for RightSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize> {
+        if n == 0 {
+            // Return nothing for `n=0`
+            0..0
+        } else if n == i64::MIN {
+            // Special case for i64::MIN overflow
+            0..0
+        } else {
+            left_right_byte_length(string, -n)..string.len()
+        }
+    }
+}
+
+/// Calculate the byte length of the substring of `n` chars from string 
`string`
+#[inline]
+fn left_right_byte_length(string: &str, n: i64) -> usize {
+    match n.cmp(&0) {
+        Ordering::Less => string
+            .char_indices()
+            .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
+            .map(|(index, _)| index)
+            .unwrap_or(0),
+        Ordering::Equal => 0,
+        Ordering::Greater => string
+            .char_indices()
+            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
+            .map(|(index, _)| index)
+            .unwrap_or(string.len()),
+    }
+}
+
+/// General implementation for `left` and `right` functions
+pub(crate) fn general_left_right<F: LeftRightSlicer>(
+    args: &[ArrayRef],
+) -> datafusion_common::Result<ArrayRef> {
+    let n_array = as_int64_array(&args[1])?;
+
+    match args[0].data_type() {
+        DataType::Utf8 => {
+            let string_array = as_generic_string_array::<i32>(&args[0])?;
+            general_left_right_array::<i32, _, F>(string_array, n_array)
+        }
+        DataType::LargeUtf8 => {
+            let string_array = as_generic_string_array::<i64>(&args[0])?;
+            general_left_right_array::<i64, _, F>(string_array, n_array)
+        }
+        DataType::Utf8View => {
+            let string_view_array = as_string_view_array(&args[0])?;
+            general_left_right_view::<F>(string_view_array, n_array)
+        }
+        _ => exec_err!("Not supported"),
+    }
+}
+
+/// `general_left_right` implementation for strings
+fn general_left_right_array<
+    'a,
+    T: OffsetSizeTrait,
+    V: ArrayAccessor<Item = &'a str>,
+    F: LeftRightSlicer,
+>(
+    string_array: V,
+    n_array: &Int64Array,
+) -> datafusion_common::Result<ArrayRef> {
+    let iter = ArrayIter::new(string_array);
+    let result = iter
+        .zip(n_array.iter())
+        .map(|(string, n)| match (string, n) {
+            (Some(string), Some(n)) => {
+                let range = F::slice(string, n);
+                // Extract a given range from a byte-indexed slice
+                Some(&string[range])
+            }
+            _ => None,
+        })
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// `general_left_right` implementation for StringViewArray
+fn general_left_right_view<F: LeftRightSlicer>(
+    string_view_array: &StringViewArray,
+    n_array: &Int64Array,
+) -> datafusion_common::Result<ArrayRef> {
+    let len = n_array.len();
+
+    let views = string_view_array.views();
+    // Every string in StringViewArray has one corresponding view in `views`
+    debug_assert!(views.len() == string_view_array.len());
+
+    // Compose null buffer at once
+    let string_nulls = string_view_array.nulls();
+    let n_nulls = n_array.nulls();
+    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
+
+    let new_views = (0..len)
+        .map(|idx| {
+            let view = views[idx];
+
+            let is_valid = match &new_nulls {
+                Some(nulls_buf) => nulls_buf.is_valid(idx),
+                None => true,
+            };
+
+            if is_valid {
+                let string: &str = string_view_array.value(idx);
+                let n = n_array.value(idx);
+
+                // Input string comes from StringViewArray, so it should fit 
in 32-bit length
+                let range = F::slice(string, n);
+                let result_bytes = &string.as_bytes()[range.clone()];
+
+                let byte_view = ByteView::from(view);
+                // New offset starts at 0 for left, and at `range.start` for 
right,
+                // which is encoded in the given range
+                let new_offset = byte_view.offset + (range.start as u32);
+                // Reuse buffer
+                make_view(result_bytes, byte_view.buffer_index, new_offset)
+            } else {
+                // For nulls, keep the original view
+                view
+            }
+        })
+        .collect::<Vec<u128>>();
+
+    // Buffers are unchanged
+    let result = StringViewArray::try_new(
+        ScalarBuffer::from(new_views),
+        Vec::from(string_view_array.data_buffers()),
+        new_nulls,
+    )?;
+    Ok(Arc::new(result) as ArrayRef)
+}
diff --git a/datafusion/functions/src/unicode/left.rs 
b/datafusion/functions/src/unicode/left.rs
index 54f204993..76873e7f5 100644
--- a/datafusion/functions/src/unicode/left.rs
+++ b/datafusion/functions/src/unicode/left.rs
@@ -16,20 +16,11 @@
 // under the License.
 
 use std::any::Any;
-use std::cmp::Ordering;
-use std::sync::Arc;
 
+use crate::unicode::common::{LeftSlicer, general_left_right};
 use crate::utils::make_scalar_function;
-use arrow::array::{
-    Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, 
Int64Array,
-    OffsetSizeTrait, StringViewArray,
-};
 use arrow::datatypes::DataType;
-use arrow_buffer::{NullBuffer, ScalarBuffer};
 use datafusion_common::Result;
-use datafusion_common::cast::{
-    as_generic_string_array, as_int64_array, as_string_view_array,
-};
 use datafusion_common::exec_err;
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
@@ -97,6 +88,10 @@ impl ScalarUDFImpl for LeftFunc {
         Ok(arg_types[0].clone())
     }
 
+    /// Returns first n characters in the string, or when n is negative, 
returns all but last |n| characters.
+    /// left('abcde', 2) = 'ab'
+    /// left('abcde', -2) = 'abc'
+    /// The implementation uses UTF-8 code points as characters
     fn invoke_with_args(
         &self,
         args: datafusion_expr::ScalarFunctionArgs,
@@ -104,7 +99,7 @@ impl ScalarUDFImpl for LeftFunc {
         let args = &args.args;
         match args[0].data_type() {
             DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => {
-                make_scalar_function(left, vec![])(args)
+                make_scalar_function(general_left_right::<LeftSlicer>, 
vec![])(args)
             }
             other => exec_err!(
                 "Unsupported data type {other:?} for function {},\
@@ -119,145 +114,6 @@ impl ScalarUDFImpl for LeftFunc {
     }
 }
 
-/// Returns first n characters in the string, or when n is negative, returns 
all but last |n| characters.
-/// left('abcde', 2) = 'ab'
-/// left('abcde', -2) = 'ab'
-/// The implementation uses UTF-8 code points as characters
-fn left(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let n_array = as_int64_array(&args[1])?;
-
-    match args[0].data_type() {
-        DataType::Utf8 => {
-            let string_array = as_generic_string_array::<i32>(&args[0])?;
-            left_impl::<i32, _>(string_array, n_array)
-        }
-        DataType::LargeUtf8 => {
-            let string_array = as_generic_string_array::<i64>(&args[0])?;
-            left_impl::<i64, _>(string_array, n_array)
-        }
-        DataType::Utf8View => {
-            let string_view_array = as_string_view_array(&args[0])?;
-            left_impl_view(string_view_array, n_array)
-        }
-        _ => exec_err!("Not supported"),
-    }
-}
-
-/// `left` implementation for strings
-fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
-    string_array: V,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let iter = ArrayIter::new(string_array);
-    let result = iter
-        .zip(n_array.iter())
-        .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => {
-                let byte_length = left_byte_length(string, n);
-                // Extract first `byte_length` bytes from a byte-indexed slice
-                Some(&string[0..byte_length])
-            }
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// `left` implementation for StringViewArray
-fn left_impl_view(
-    string_view_array: &StringViewArray,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let len = n_array.len();
-
-    let views = string_view_array.views();
-    // Every string in StringViewArray has one corresponding view in `views`
-    debug_assert!(views.len() == string_view_array.len());
-
-    // Compose null buffer at once
-    let string_nulls = string_view_array.nulls();
-    let n_nulls = n_array.nulls();
-    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
-
-    let new_views = (0..len)
-        .map(|idx| {
-            let view = views[idx];
-
-            let is_valid = match &new_nulls {
-                Some(nulls_buf) => nulls_buf.is_valid(idx),
-                None => true,
-            };
-
-            if is_valid {
-                let string: &str = string_view_array.value(idx);
-                let n = n_array.value(idx);
-
-                // Input string comes from StringViewArray, so it should fit 
in 32-bit length
-                let new_length: u32 = left_byte_length(string, n) as u32;
-                let byte_view = ByteView::from(view);
-                // Construct a new view
-                shrink_string_view_array_view(string, new_length, byte_view)
-            } else {
-                // For nulls, keep the original view
-                view
-            }
-        })
-        .collect::<Vec<u128>>();
-
-    // Buffers are unchanged
-    let result = StringViewArray::try_new(
-        ScalarBuffer::from(new_views),
-        Vec::from(string_view_array.data_buffers()),
-        new_nulls,
-    )?;
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Calculate the byte length of the substring of `n` chars from string 
`string`
-fn left_byte_length(string: &str, n: i64) -> usize {
-    match n.cmp(&0) {
-        Ordering::Less => string
-            .char_indices()
-            .nth_back(n.unsigned_abs() as usize - 1)
-            .map(|(index, _)| index)
-            .unwrap_or(0),
-        Ordering::Equal => 0,
-        Ordering::Greater => string
-            .char_indices()
-            .nth(n.unsigned_abs() as usize)
-            .map(|(index, _)| index)
-            .unwrap_or(string.len()),
-    }
-}
-
-/// Construct a new StringViewArray view from existing view `byte_view` and 
new length `len`.
-/// Prefix is taken from the original string `string`.
-/// Handles both inline and non-inline views, referencing the same buffers.
-fn shrink_string_view_array_view(string: &str, len: u32, byte_view: ByteView) 
-> u128 {
-    debug_assert!(len <= byte_view.length);
-    // Acquire bytes view to string (no allocations)
-    let bytes = string.as_bytes();
-
-    if len <= 12 {
-        // Inline view
-        // Construct manually since ByteView cannot work with inline views
-        let mut view_buffer = [0u8; 16];
-        // 4 bytes: length
-        view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
-        // 12 bytes: the whole zero-padded string
-        view_buffer[4..4 + len as usize].copy_from_slice(&bytes[..len as 
usize]);
-        u128::from_le_bytes(view_buffer)
-    } else {
-        // Non-inline view.
-        // Use ByteView constructor to reference existing buffers
-        let new_byte_view = ByteView::new(len, &bytes[..4])
-            .with_buffer_index(byte_view.buffer_index)
-            .with_offset(byte_view.offset);
-        new_byte_view.as_u128()
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use arrow::array::{Array, StringArray, StringViewArray};
diff --git a/datafusion/functions/src/unicode/mod.rs 
b/datafusion/functions/src/unicode/mod.rs
index 4a0dd21d7..7250b3915 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -22,6 +22,7 @@ use std::sync::Arc;
 use datafusion_expr::ScalarUDF;
 
 pub mod character_length;
+pub mod common;
 pub mod find_in_set;
 pub mod initcap;
 pub mod left;
diff --git a/datafusion/functions/src/unicode/right.rs 
b/datafusion/functions/src/unicode/right.rs
index 569f20d32..a97e242b7 100644
--- a/datafusion/functions/src/unicode/right.rs
+++ b/datafusion/functions/src/unicode/right.rs
@@ -16,20 +16,11 @@
 // under the License.
 
 use std::any::Any;
-use std::cmp::Ordering;
-use std::sync::Arc;
 
+use crate::unicode::common::{RightSlicer, general_left_right};
 use crate::utils::make_scalar_function;
-use arrow::array::{
-    Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, 
Int64Array,
-    OffsetSizeTrait, StringViewArray, make_view,
-};
 use arrow::datatypes::DataType;
-use arrow_buffer::{NullBuffer, ScalarBuffer};
 use datafusion_common::Result;
-use datafusion_common::cast::{
-    as_generic_string_array, as_int64_array, as_string_view_array,
-};
 use datafusion_common::exec_err;
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
@@ -97,6 +88,10 @@ impl ScalarUDFImpl for RightFunc {
         Ok(arg_types[0].clone())
     }
 
+    /// Returns right n characters in the string, or when n is negative, 
returns all but first |n| characters.
+    /// right('abcde', 2) = 'de'
+    /// right('abcde', -2) = 'cde'
+    /// The implementation uses UTF-8 code points as characters
     fn invoke_with_args(
         &self,
         args: datafusion_expr::ScalarFunctionArgs,
@@ -104,7 +99,7 @@ impl ScalarUDFImpl for RightFunc {
         let args = &args.args;
         match args[0].data_type() {
             DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => {
-                make_scalar_function(right, vec![])(args)
+                make_scalar_function(general_left_right::<RightSlicer>, 
vec![])(args)
             }
             other => exec_err!(
                 "Unsupported data type {other:?} for function {},\
@@ -119,129 +114,6 @@ impl ScalarUDFImpl for RightFunc {
     }
 }
 
-/// Returns right n characters in the string, or when n is negative, returns 
all but first |n| characters.
-/// right('abcde', 2) = 'de'
-/// right('abcde', -2) = 'cde'
-/// The implementation uses UTF-8 code points as characters
-fn right(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let n_array = as_int64_array(&args[1])?;
-
-    match args[0].data_type() {
-        DataType::Utf8 => {
-            let string_array = as_generic_string_array::<i32>(&args[0])?;
-            right_impl::<i32, _>(string_array, n_array)
-        }
-        DataType::LargeUtf8 => {
-            let string_array = as_generic_string_array::<i64>(&args[0])?;
-            right_impl::<i64, _>(string_array, n_array)
-        }
-        DataType::Utf8View => {
-            let string_view_array = as_string_view_array(&args[0])?;
-            right_impl_view(string_view_array, n_array)
-        }
-        _ => exec_err!("Not supported"),
-    }
-}
-
-/// `right` implementation for strings
-fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
-    string_array: V,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let iter = ArrayIter::new(string_array);
-    let result = iter
-        .zip(n_array.iter())
-        .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => {
-                let byte_length = right_byte_length(string, n);
-                // Extract starting from `byte_length` bytes from a 
byte-indexed slice
-                Some(&string[byte_length..])
-            }
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// `right` implementation for StringViewArray
-fn right_impl_view(
-    string_view_array: &StringViewArray,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let len = n_array.len();
-
-    let views = string_view_array.views();
-    // Every string in StringViewArray has one corresponding view in `views`
-    debug_assert!(views.len() == string_view_array.len());
-
-    // Compose null buffer at once
-    let string_nulls = string_view_array.nulls();
-    let n_nulls = n_array.nulls();
-    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
-
-    let new_views = (0..len)
-        .map(|idx| {
-            let view = views[idx];
-
-            let is_valid = match &new_nulls {
-                Some(nulls_buf) => nulls_buf.is_valid(idx),
-                None => true,
-            };
-
-            if is_valid {
-                let string: &str = string_view_array.value(idx);
-                let n = n_array.value(idx);
-
-                let new_offset = right_byte_length(string, n);
-                let result_bytes = &string.as_bytes()[new_offset..];
-
-                if result_bytes.len() > 12 {
-                    let byte_view = ByteView::from(view);
-                    // Reuse buffer, but adjust offset and length
-                    make_view(
-                        result_bytes,
-                        byte_view.buffer_index,
-                        byte_view.offset + new_offset as u32,
-                    )
-                } else {
-                    // inline value does not need block id or offset
-                    make_view(result_bytes, 0, 0)
-                }
-            } else {
-                // For nulls, keep the original view
-                view
-            }
-        })
-        .collect::<Vec<u128>>();
-
-    // Buffers are unchanged
-    let result = StringViewArray::try_new(
-        ScalarBuffer::from(new_views),
-        Vec::from(string_view_array.data_buffers()),
-        new_nulls,
-    )?;
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Calculate the byte length of the substring of last `n` chars from string 
`string`
-/// (or all but first `|n|` chars if n is negative)
-fn right_byte_length(string: &str, n: i64) -> usize {
-    match n.cmp(&0) {
-        Ordering::Less => string
-            .char_indices()
-            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
-            .map(|(index, _)| index)
-            .unwrap_or(string.len()),
-        Ordering::Equal => string.len(),
-        Ordering::Greater => string
-            .char_indices()
-            .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
-            .map(|(index, _)| index)
-            .unwrap_or(0),
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use arrow::array::{Array, StringArray, StringViewArray};


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to