This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 54b848c077 feat(spark): implement substring function (#19805)
54b848c077 is described below
commit 54b848c077c092ebe6462d9480ef1925d66ef9d7
Author: cht42 <[email protected]>
AuthorDate: Fri Jan 16 02:52:40 2026 +0400
feat(spark): implement substring function (#19805)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Part of #15914
- Closes #19803.
## Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
## What changes are included in this PR?
Implementation of the Spark-compatible `substring` function.
Spark implementation:
-
https://github.com/apache/spark/blob/6831481fd7a2d30dfa16b4b70c8e6296b4deeb8c/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L663
-
https://github.com/apache/spark/blob/6831481fd7a2d30dfa16b4b70c8e6296b4deeb8c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala#L2300
## Are these changes tested?
Yes — sqllogictest cases (`test_files/spark/string/substring.slt`) and Criterion benchmarks (`benches/substring.rs`) are included.
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/functions/src/unicode/substr.rs | 4 +-
datafusion/spark/Cargo.toml | 5 +
datafusion/spark/benches/substring.rs | 207 +++++++++++++++++
datafusion/spark/src/function/string/mod.rs | 8 +
datafusion/spark/src/function/string/substring.rs | 258 +++++++++++++++++++++
datafusion/spark/src/lib.rs | 1 +
datafusion/spark/src/planner.rs | 34 +++
datafusion/sqllogictest/src/test_context.rs | 16 +-
.../test_files/spark/string/substr.slt | 37 ---
.../test_files/spark/string/substring.slt | 206 ++++++++++++++--
10 files changed, 713 insertions(+), 63 deletions(-)
diff --git a/datafusion/functions/src/unicode/substr.rs
b/datafusion/functions/src/unicode/substr.rs
index cc1d53b3aa..9517aac8e0 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -176,7 +176,7 @@ fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
// `get_true_start_end('Hi🌏', 1, None) -> (0, 6)`
// `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)`
// `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)`
-fn get_true_start_end(
+pub fn get_true_start_end(
input: &str,
start: i64,
count: Option<u64>,
@@ -235,7 +235,7 @@ fn get_true_start_end(
// string, such as `substr(long_str_with_1k_chars, 1, 32)`.
// In such case the overhead of ASCII-validation may not be worth it, so
// skip the validation for short prefix for now.
-fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
+pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
string_array: &V,
start: &Int64Array,
count: Option<&Int64Array>,
diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml
index dc5875e5a1..ad2620a532 100644
--- a/datafusion/spark/Cargo.toml
+++ b/datafusion/spark/Cargo.toml
@@ -57,6 +57,7 @@ sha1 = "0.10"
url = { workspace = true }
[dev-dependencies]
+arrow = { workspace = true, features = ["test_utils"] }
criterion = { workspace = true }
[[bench]]
@@ -74,3 +75,7 @@ name = "hex"
[[bench]]
harness = false
name = "slice"
+
+[[bench]]
+harness = false
+name = "substring"
diff --git a/datafusion/spark/benches/substring.rs
b/datafusion/spark/benches/substring.rs
new file mode 100644
index 0000000000..286758f43d
--- /dev/null
+++ b/datafusion/spark/benches/substring.rs
@@ -0,0 +1,207 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::{
+ create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_spark::function::string::substring;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn create_args_without_count<O: OffsetSizeTrait>(
+ size: usize,
+ str_len: usize,
+ start_half_way: bool,
+ force_view_types: bool,
+) -> Vec<ColumnarValue> {
+ let start_array = Arc::new(Int64Array::from(
+ (0..size)
+ .map(|_| {
+ if start_half_way {
+ (str_len / 2) as i64
+ } else {
+ 1i64
+ }
+ })
+ .collect::<Vec<_>>(),
+ ));
+
+ if force_view_types {
+ let string_array =
+ Arc::new(create_string_view_array_with_len(size, 0.1, str_len,
false));
+ vec![
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Array(start_array),
+ ]
+ } else {
+ let string_array =
+ Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+ vec![
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
+ ]
+ }
+}
+
+fn create_args_with_count<O: OffsetSizeTrait>(
+ size: usize,
+ str_len: usize,
+ count_max: usize,
+ force_view_types: bool,
+) -> Vec<ColumnarValue> {
+ let start_array =
+ Arc::new(Int64Array::from((0..size).map(|_| 1).collect::<Vec<_>>()));
+ let count = count_max.min(str_len) as i64;
+ let count_array = Arc::new(Int64Array::from(
+ (0..size).map(|_| count).collect::<Vec<_>>(),
+ ));
+
+ if force_view_types {
+ let string_array =
+ Arc::new(create_string_view_array_with_len(size, 0.1, str_len,
false));
+ vec![
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Array(start_array),
+ ColumnarValue::Array(count_array),
+ ]
+ } else {
+ let string_array =
+ Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+ vec![
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
+ ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef),
+ ]
+ }
+}
+
+#[expect(clippy::needless_pass_by_value)]
+fn invoke_substr_with_args(
+ args: Vec<ColumnarValue>,
+ number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+ let arg_fields = args
+ .iter()
+ .enumerate()
+ .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(),
true).into())
+ .collect::<Vec<_>>();
+ let config_options = Arc::new(ConfigOptions::default());
+
+ substring().invoke_with_args(ScalarFunctionArgs {
+ args: args.clone(),
+ arg_fields,
+ number_rows,
+ return_field: Field::new("f", DataType::Utf8View, true).into(),
+ config_options: Arc::clone(&config_options),
+ })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ for size in [1024, 4096] {
+ // string_len = 12, substring_len=6 (see `create_args_without_count`)
+ let len = 12;
+ let mut group = c.benchmark_group("SHORTER THAN 12");
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+
+ let args = create_args_without_count::<i32>(size, len, true, true);
+ group.bench_function(
+ format!("substr_string_view [size={size}, strlen={len}]"),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ let args = create_args_without_count::<i32>(size, len, false, false);
+ group.bench_function(format!("substr_string [size={size},
strlen={len}]"), |b| {
+ b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
+ });
+
+ let args = create_args_without_count::<i64>(size, len, true, false);
+ group.bench_function(
+ format!("substr_large_string [size={size}, strlen={len}]"),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ group.finish();
+
+ // string_len = 128, start=1, count=64, substring_len=64
+ let len = 128;
+ let count = 64;
+ let mut group = c.benchmark_group("LONGER THAN 12");
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+
+ let args = create_args_with_count::<i32>(size, len, count, true);
+ group.bench_function(
+ format!("substr_string_view [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ let args = create_args_with_count::<i32>(size, len, count, false);
+ group.bench_function(
+ format!("substr_string [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ let args = create_args_with_count::<i64>(size, len, count, false);
+ group.bench_function(
+ format!("substr_large_string [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ group.finish();
+
+ // string_len = 128, start=1, count=6, substring_len=6
+ let len = 128;
+ let count = 6;
+ let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12");
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+
+ let args = create_args_with_count::<i32>(size, len, count, true);
+ group.bench_function(
+ format!("substr_string_view [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ let args = create_args_with_count::<i32>(size, len, count, false);
+ group.bench_function(
+ format!("substr_string [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ let args = create_args_with_count::<i64>(size, len, count, false);
+ group.bench_function(
+ format!("substr_large_string [size={size}, count={count},
strlen={len}]",),
+ |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(),
size))),
+ );
+
+ group.finish();
+ }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/src/function/string/mod.rs
b/datafusion/spark/src/function/string/mod.rs
index 369d381a9c..1f0108cf50 100644
--- a/datafusion/spark/src/function/string/mod.rs
+++ b/datafusion/spark/src/function/string/mod.rs
@@ -25,6 +25,7 @@ pub mod length;
pub mod like;
pub mod luhn_check;
pub mod space;
+pub mod substring;
use datafusion_expr::ScalarUDF;
use datafusion_functions::make_udf_function;
@@ -40,6 +41,7 @@ make_udf_function!(like::SparkLike, like);
make_udf_function!(luhn_check::SparkLuhnCheck, luhn_check);
make_udf_function!(format_string::FormatStringFunc, format_string);
make_udf_function!(space::SparkSpace, space);
+make_udf_function!(substring::SparkSubstring, substring);
pub mod expr_fn {
use datafusion_functions::export_functions;
@@ -90,6 +92,11 @@ pub mod expr_fn {
strfmt args
));
export_functions!((space, "Returns a string consisting of n spaces.",
arg1));
+ export_functions!((
+ substring,
+ "Returns the substring from string `str` starting at position `pos`
with length `length.",
+ str pos length
+ ));
}
pub fn functions() -> Vec<Arc<ScalarUDF>> {
@@ -104,5 +111,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
luhn_check(),
format_string(),
space(),
+ substring(),
]
}
diff --git a/datafusion/spark/src/function/string/substring.rs
b/datafusion/spark/src/function/string/substring.rs
new file mode 100644
index 0000000000..524262b12f
--- /dev/null
+++ b/datafusion/spark/src/function/string/substring.rs
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+ Array, ArrayBuilder, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
+ OffsetSizeTrait, StringArrayType, StringViewBuilder,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::arrow::datatypes::{Field, FieldRef};
+use datafusion_common::cast::as_int64_array;
+use datafusion_common::types::{
+ NativeType, logical_int32, logical_int64, logical_string,
+};
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{Coercion, ReturnFieldArgs, TypeSignatureClass};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+ Volatility,
+};
+use datafusion_functions::unicode::substr::{enable_ascii_fast_path,
get_true_start_end};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `substring` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#substring>
+///
+/// Returns the substring from string starting at position pos with length len.
+/// Position is 1-indexed. If pos is negative, it counts from the end of the
string.
+/// Returns NULL if any input is NULL.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSubstring {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl Default for SparkSubstring {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl SparkSubstring {
+ pub fn new() -> Self {
+ let string =
Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
+ let int64 = Coercion::new_implicit(
+ TypeSignatureClass::Native(logical_int64()),
+ vec![TypeSignatureClass::Native(logical_int32())],
+ NativeType::Int64,
+ );
+ Self {
+ signature: Signature::one_of(
+ vec![
+ TypeSignature::Coercible(vec![string.clone(),
int64.clone()]),
+ TypeSignature::Coercible(vec![
+ string.clone(),
+ int64.clone(),
+ int64.clone(),
+ ]),
+ ],
+ Volatility::Immutable,
+ )
+ .with_parameter_names(vec![
+ "str".to_string(),
+ "pos".to_string(),
+ "length".to_string(),
+ ])
+ .expect("valid parameter names"),
+ aliases: vec![String::from("substr")],
+ }
+ }
+}
+
+impl ScalarUDFImpl for SparkSubstring {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "substring"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn aliases(&self) -> &[String] {
+ &self.aliases
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(spark_substring, vec![])(&args.args)
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ datafusion_common::internal_err!(
+ "return_type should not be called for Spark substring"
+ )
+ }
+
+ fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) ->
Result<FieldRef> {
+ // Spark semantics: substring returns NULL if ANY input is NULL
+ let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+ Ok(Arc::new(Field::new(
+ "substring",
+ args.arg_fields[0].data_type().clone(),
+ nullable,
+ )))
+ }
+}
+
+fn spark_substring(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let start_array = as_int64_array(&args[1])?;
+ let length_array = if args.len() > 2 {
+ Some(as_int64_array(&args[2])?)
+ } else {
+ None
+ };
+
+ match args[0].data_type() {
+ DataType::Utf8 => spark_substring_impl(
+ &args[0].as_string::<i32>(),
+ start_array,
+ length_array,
+ GenericStringBuilder::<i32>::new(),
+ ),
+ DataType::LargeUtf8 => spark_substring_impl(
+ &args[0].as_string::<i64>(),
+ start_array,
+ length_array,
+ GenericStringBuilder::<i64>::new(),
+ ),
+ DataType::Utf8View => spark_substring_impl(
+ &args[0].as_string_view(),
+ start_array,
+ length_array,
+ StringViewBuilder::new(),
+ ),
+ other => exec_err!(
+ "Unsupported data type {other:?} for function spark_substring,
expected Utf8View, Utf8 or LargeUtf8."
+ ),
+ }
+}
+
+/// Convert Spark's start position to DataFusion's 1-based start position.
+///
+/// Spark semantics:
+/// - Positive start: 1-based index from beginning
+/// - Zero start: treated as 1
+/// - Negative start: counts from end of string
+///
+/// Returns the converted 1-based start position for use with
`get_true_start_end`.
+#[inline]
+fn spark_start_to_datafusion_start(start: i64, len: usize) -> i64 {
+ if start >= 0 {
+ start.max(1)
+ } else {
+ let len_i64 = i64::try_from(len).unwrap_or(i64::MAX);
+ let start = start.saturating_add(len_i64).saturating_add(1);
+ start.max(1)
+ }
+}
+
+trait StringArrayBuilder: ArrayBuilder {
+ fn append_value(&mut self, val: &str);
+ fn append_null(&mut self);
+}
+
+impl<O: OffsetSizeTrait> StringArrayBuilder for GenericStringBuilder<O> {
+ fn append_value(&mut self, val: &str) {
+ GenericStringBuilder::append_value(self, val);
+ }
+ fn append_null(&mut self) {
+ GenericStringBuilder::append_null(self);
+ }
+}
+
+impl StringArrayBuilder for StringViewBuilder {
+ fn append_value(&mut self, val: &str) {
+ StringViewBuilder::append_value(self, val);
+ }
+ fn append_null(&mut self) {
+ StringViewBuilder::append_null(self);
+ }
+}
+
+fn spark_substring_impl<'a, V, B>(
+ string_array: &V,
+ start_array: &Int64Array,
+ length_array: Option<&Int64Array>,
+ mut builder: B,
+) -> Result<ArrayRef>
+where
+ V: StringArrayType<'a>,
+ B: StringArrayBuilder,
+{
+ let is_ascii = enable_ascii_fast_path(string_array, start_array,
length_array);
+
+ for i in 0..string_array.len() {
+ if string_array.is_null(i) || start_array.is_null(i) {
+ builder.append_null();
+ continue;
+ }
+
+ if let Some(len_arr) = length_array
+ && len_arr.is_null(i)
+ {
+ builder.append_null();
+ continue;
+ }
+
+ let string = string_array.value(i);
+ let start = start_array.value(i);
+ let len_opt = length_array.map(|arr| arr.value(i));
+
+ // Spark: negative length returns empty string
+ if let Some(len) = len_opt
+ && len < 0
+ {
+ builder.append_value("");
+ continue;
+ }
+
+ let string_len = if is_ascii {
+ string.len()
+ } else {
+ string.chars().count()
+ };
+
+ let adjusted_start = spark_start_to_datafusion_start(start,
string_len);
+
+ let (byte_start, byte_end) = get_true_start_end(
+ string,
+ adjusted_start,
+ len_opt.map(|l| l as u64),
+ is_ascii,
+ );
+ let substr = &string[byte_start..byte_end];
+ builder.append_value(substr);
+ }
+
+ Ok(builder.finish())
+}
diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs
index aad3ceed68..f67367734c 100644
--- a/datafusion/spark/src/lib.rs
+++ b/datafusion/spark/src/lib.rs
@@ -95,6 +95,7 @@
//![`Expr`]: datafusion_expr::Expr
pub mod function;
+pub mod planner;
use datafusion_catalog::TableFunction;
use datafusion_common::Result;
diff --git a/datafusion/spark/src/planner.rs b/datafusion/spark/src/planner.rs
new file mode 100644
index 0000000000..8b68617828
--- /dev/null
+++ b/datafusion/spark/src/planner.rs
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_expr::Expr;
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::planner::{ExprPlanner, PlannerResult};
+
+#[derive(Default, Debug)]
+pub struct SparkFunctionPlanner;
+
+impl ExprPlanner for SparkFunctionPlanner {
+ fn plan_substring(
+ &self,
+ args: Vec<Expr>,
+ ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+ Ok(PlannerResult::Planned(Expr::ScalarFunction(
+ ScalarFunction::new_udf(crate::function::string::substring(),
args),
+ )))
+ }
+}
diff --git a/datafusion/sqllogictest/src/test_context.rs
b/datafusion/sqllogictest/src/test_context.rs
index d416dc1bcf..19ec3e7613 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -21,6 +21,7 @@ use std::fs::File;
use std::io::Write;
use std::path::Path;
use std::sync::Arc;
+use std::vec;
use arrow::array::{
Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray,
@@ -80,11 +81,18 @@ impl TestContext {
// hardcode target partitions so plans are deterministic
.with_target_partitions(4);
let runtime = Arc::new(RuntimeEnv::default());
- let mut state = SessionStateBuilder::new()
+
+ let mut state_builder = SessionStateBuilder::new()
.with_config(config)
- .with_runtime_env(runtime)
- .with_default_features()
- .build();
+ .with_runtime_env(runtime);
+
+ if is_spark_path(relative_path) {
+ state_builder = state_builder.with_expr_planners(vec![Arc::new(
+ datafusion_spark::planner::SparkFunctionPlanner,
+ )]);
+ }
+
+ let mut state = state_builder.with_default_features().build();
if is_spark_path(relative_path) {
info!("Registering Spark functions");
diff --git a/datafusion/sqllogictest/test_files/spark/string/substr.slt
b/datafusion/sqllogictest/test_files/spark/string/substr.slt
deleted file mode 100644
index 0942bdd86a..0000000000
--- a/datafusion/sqllogictest/test_files/spark/string/substr.slt
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# This file was originally created by a porting script from:
-#
https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function
library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT substr('Spark SQL', -3);
-## PySpark 3.5.5 Result: {'substr(Spark SQL, -3, 2147483647)': 'SQL',
'typeof(substr(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)':
'string', 'typeof(-3)': 'int'}
-#query
-#SELECT substr('Spark SQL'::string, -3::int);
-
-## Original Query: SELECT substr('Spark SQL', 5);
-## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 2147483647)': 'k SQL',
'typeof(substr(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)':
'string', 'typeof(5)': 'int'}
-#query
-#SELECT substr('Spark SQL'::string, 5::int);
-
-## Original Query: SELECT substr('Spark SQL', 5, 1);
-## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 1)': 'k', 'typeof(substr(Spark
SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int',
'typeof(1)': 'int'}
-#query
-#SELECT substr('Spark SQL'::string, 5::int, 1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/substring.slt
b/datafusion/sqllogictest/test_files/spark/string/substring.slt
index 847ce4b6d4..5bf2fdf2fb 100644
--- a/datafusion/sqllogictest/test_files/spark/string/substring.slt
+++ b/datafusion/sqllogictest/test_files/spark/string/substring.slt
@@ -15,23 +15,189 @@
# specific language governing permissions and limitations
# under the License.
-# This file was originally created by a porting script from:
-#
https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function
library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT substring('Spark SQL', -3);
-## PySpark 3.5.5 Result: {'substring(Spark SQL, -3, 2147483647)': 'SQL',
'typeof(substring(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)':
'string', 'typeof(-3)': 'int'}
-#query
-#SELECT substring('Spark SQL'::string, -3::int);
-
-## Original Query: SELECT substring('Spark SQL', 5);
-## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 2147483647)': 'k SQL',
'typeof(substring(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)':
'string', 'typeof(5)': 'int'}
-#query
-#SELECT substring('Spark SQL'::string, 5::int);
-
-## Original Query: SELECT substring('Spark SQL', 5, 1);
-## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 1)': 'k',
'typeof(substring(Spark SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string',
'typeof(5)': 'int', 'typeof(1)': 'int'}
-#query
-#SELECT substring('Spark SQL'::string, 5::int, 1::int);
+
+query T
+SELECT substring('Spark SQL'::string, 0::int);
+----
+Spark SQL
+
+query T
+SELECT substring('Spark SQL'::string, 5::int);
+----
+k SQL
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, 1::int);
+----
+a
+
+# Test negative start
+query T
+SELECT substring('Spark SQL'::string, -3::int);
+----
+SQL
+
+query T
+SELECT substring('Spark SQL'::string, -3::int, 2::int);
+----
+SQ
+
+# Test length exceeding string length
+query T
+SELECT substring('Spark SQL'::string, 2::int, 700::int);
+----
+park SQL
+
+# Test start position beyond string length
+query T
+SELECT substring('Spark SQL'::string, 30::int);
+----
+(empty)
+
+query T
+SELECT substring('Spark SQL'::string, -30::int);
+----
+Spark SQL
+
+# Test negative length
+query T
+SELECT substring('Spark SQL'::string, 3::int, -1::int);
+----
+(empty)
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, 0::int);
+----
+(empty)
+
+# Test unicode strings
+query T
+SELECT substring('joséésoj'::string, 5::int);
+----
+ésoj
+
+query T
+SELECT substring('joséésoj'::string, 5::int, 2::int);
+----
+és
+
+# NULL handling
+query T
+SELECT substring('Spark SQL'::string, NULL::int);
+----
+NULL
+
+query T
+SELECT substring(NULL::string, 5::int);
+----
+NULL
+
+query T
+SELECT substring(NULL::string, 3::int, 1::int);
+----
+NULL
+
+query T
+SELECT substring('Spark SQL'::string, NULL::int, 1::int);
+----
+NULL
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, NULL::int);
+----
+NULL
+
+query T
+SELECT substring(column1, column2)
+FROM VALUES
+('Spark SQL'::string, 0::int),
+('Spark SQL'::string, 5::int),
+('Spark SQL'::string, -3::int),
+('Spark SQL'::string, 500::int),
+('Spark SQL'::string, -300::int),
+(NULL::string, 5::int),
+('Spark SQL'::string, NULL::int);
+----
+Spark SQL
+k SQL
+SQL
+(empty)
+Spark SQL
+NULL
+NULL
+
+query T
+SELECT substring(column1, column2, column3)
+FROM VALUES
+('Spark SQL'::string, -3::int, 2::int),
+('Spark SQL'::string, 3::int, 1::int),
+('Spark SQL'::string, 3::int, 700::int),
+('Spark SQL'::string, 3::int, -1::int),
+('Spark SQL'::string, 3::int, 0::int),
+('Spark SQL'::string, 300::int, 3::int),
+('Spark SQL'::string, -300::int, 3::int),
+(NULL::string, 3::int, 1::int),
+('Spark SQL'::string, NULL::int, 1::int),
+('Spark SQL'::string, 3::int, NULL::int);
+----
+SQ
+a
+ark SQL
+(empty)
+(empty)
+(empty)
+Spa
+NULL
+NULL
+NULL
+
+# alias substr
+
+query T
+SELECT substr('Spark SQL'::string, 0::int);
+----
+Spark SQL
+
+query T
+SELECT substr(column1, column2)
+FROM VALUES
+('Spark SQL'::string, 0::int),
+('Spark SQL'::string, 5::int),
+('Spark SQL'::string, -3::int),
+('Spark SQL'::string, 500::int),
+('Spark SQL'::string, -300::int),
+(NULL::string, 5::int),
+('Spark SQL'::string, NULL::int);
+----
+Spark SQL
+k SQL
+SQL
+(empty)
+Spark SQL
+NULL
+NULL
+
+query T
+SELECT substr(column1, column2, column3)
+FROM VALUES
+('Spark SQL'::string, -3::int, 2::int),
+('Spark SQL'::string, 3::int, 1::int),
+('Spark SQL'::string, 3::int, 700::int),
+('Spark SQL'::string, 3::int, -1::int),
+('Spark SQL'::string, 3::int, 0::int),
+('Spark SQL'::string, 300::int, 3::int),
+('Spark SQL'::string, -300::int, 3::int),
+(NULL::string, 3::int, 1::int),
+('Spark SQL'::string, NULL::int, 1::int),
+('Spark SQL'::string, 3::int, NULL::int);
+----
+SQ
+a
+ark SQL
+(empty)
+(empty)
+(empty)
+Spa
+NULL
+NULL
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]