Re: [PR] perf: Optimize `split_part` for scalar args [datafusion]

via GitHub Thu, 02 Apr 2026 13:34:05 -0700


martin-g commented on code in PR #21238:
URL: https://github.com/apache/datafusion/pull/21238#discussion_r3030035579



##########
datafusion/sqllogictest/test_files/expr.slt:
##########
@@ -724,6 +724,72 @@ SELECT split_part('a,b', '', -2)
 statement error DataFusion error: Execution error: field position must not be 
zero
 SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
 
+# Position 0 with column input errors even for empty/null inputs
+statement error DataFusion error: Execution error: field position must not be 
zero
+SELECT split_part(column1, '.', 0) FROM (VALUES (NULL::text)) AS t(column1)
+
+# split_part with column input (exercises the scalar-delimiter fast path)
+query TTT
+SELECT
+  split_part(column1, '.', 1),
+  split_part(column1, '.', 2),
+  split_part(column1, '.', 3)
+FROM (VALUES ('a.b.c'), ('d.e.f'), ('x.y')) AS t(column1)
+----
+a b c
+d e f
+x y (empty)
+
+# Multi-char delimiter with column input
+query TT
+SELECT
+  split_part(column1, '~@~', 2),
+  split_part(column1, '~@~', 3)
+FROM (VALUES ('abc~@~def~@~ghi'), ('one~@~two')) AS t(column1)
+----
+def ghi
+two (empty)
+
+# Negative position with column input
+query TT
+SELECT
+  split_part(column1, '.', -1),
+  split_part(column1, '.', -2)
+FROM (VALUES ('a.b.c'), ('x.y')) AS t(column1)
+----
+c b
+y x
+
+# Empty delimiter with column input
+query TT
+SELECT
+  split_part(column1, '', 1),
+  split_part(column1, '', 2)
+FROM (VALUES ('abc'), ('xyz')) AS t(column1)
+----
+abc (empty)
+xyz (empty)
+
+# NULL column values with scalar delimiter
+query T
+SELECT split_part(column1, '.', 2)
+FROM (VALUES ('a.b'), (NULL), ('c.d')) AS t(column1)
+----
+b
+NULL
+d
+
+# Utf8View column with scalar delimiter
+query TT
+SELECT
+  split_part(column1, '.', 1),
+  split_part(column1, '.', 2)
+FROM (SELECT arrow_cast(column1, 'Utf8View') AS column1
+      FROM (VALUES ('a.b.c'), ('x.y.z')) AS t(column1))
+----
+a b
+x y
+

Review Comment:
   For completeness and code coverage: a test for LargeUtf8 too:
   ```sql
   # LargeUtf8 column with scalar delimiter
   query T
   SELECT split_part(arrow_cast(column1, 'LargeUtf8'), '.', 2)
   FROM (VALUES ('a.b.c'), NULL) AS t(column1)
   ----
   b
   NULL
   ```



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -220,6 +231,190 @@ fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: 
usize) -> Option<&'a str>
     }
 }
 
+/// Fast path for `split_part(array, scalar_delimiter, scalar_position)`.
+fn split_part_scalar(
+    string_array: &ArrayRef,
+    delim_scalar: &ScalarValue,
+    pos_scalar: &ScalarValue,
+) -> Result<ColumnarValue> {
+    let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
+        exec_datafusion_err!(
+            "Unsupported delimiter type {:?} for split_part",
+            delim_scalar.data_type()
+        )
+    })?;
+
+    let position = match pos_scalar {
+        ScalarValue::Int64(v) => *v,
+        other => {
+            return exec_err!(
+                "Unsupported position type {:?} for split_part",
+                other.data_type()
+            );
+        }
+    };
+
+    if position == Some(0) {
+        return exec_err!("field position must not be zero");
+    }
+
+    // Null delimiter or position → every row is null.
+    let (Some(delimiter), Some(position)) = (delimiter, position) else {
+        return Ok(ColumnarValue::Array(new_null_array(
+            string_array.data_type(),
+            string_array.len(),
+        )));
+    };
+
+    let result = match string_array.data_type() {
+        DataType::Utf8View => split_part_scalar_impl(
+            string_array.as_string_view(),
+            delimiter,
+            position,
+            StringViewBuilder::with_capacity(string_array.len()),
+        ),
+        DataType::Utf8 => {
+            let arr = string_array.as_string::<i32>();
+            split_part_scalar_impl(
+                arr,
+                delimiter,
+                position,
+                GenericStringBuilder::<i32>::with_capacity(
+                    arr.len(),
+                    arr.value_data().len(),
+                ),
+            )
+        }
+        DataType::LargeUtf8 => {
+            let arr = string_array.as_string::<i64>();
+            split_part_scalar_impl(
+                arr,
+                delimiter,
+                position,
+                GenericStringBuilder::<i64>::with_capacity(
+                    arr.len(),
+                    arr.value_data().len(),

Review Comment:
   As above: may over-allocate for sliced array.



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -220,6 +231,190 @@ fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: 
usize) -> Option<&'a str>
     }
 }
 
+/// Fast path for `split_part(array, scalar_delimiter, scalar_position)`.
+fn split_part_scalar(
+    string_array: &ArrayRef,
+    delim_scalar: &ScalarValue,
+    pos_scalar: &ScalarValue,
+) -> Result<ColumnarValue> {
+    let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
+        exec_datafusion_err!(
+            "Unsupported delimiter type {:?} for split_part",
+            delim_scalar.data_type()
+        )
+    })?;
+
+    let position = match pos_scalar {
+        ScalarValue::Int64(v) => *v,
+        other => {
+            return exec_err!(
+                "Unsupported position type {:?} for split_part",
+                other.data_type()
+            );
+        }
+    };
+

Review Comment:
   For the slow path (Array) it zips the arrays. So, an empty string_arr leads 
to an empty result.
   To preserve the same behavior here before checking for `position=0` we 
should add something like:
   ```suggestion
       if string_array.len() == 0 {
           return Ok(ColumnarValue::Array(new_null_array(
               string_array.data_type(),
               0,
           )));
       }
   ```



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -220,6 +231,190 @@ fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: 
usize) -> Option<&'a str>
     }
 }
 
+/// Fast path for `split_part(array, scalar_delimiter, scalar_position)`.
+fn split_part_scalar(
+    string_array: &ArrayRef,
+    delim_scalar: &ScalarValue,
+    pos_scalar: &ScalarValue,
+) -> Result<ColumnarValue> {
+    let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
+        exec_datafusion_err!(
+            "Unsupported delimiter type {:?} for split_part",
+            delim_scalar.data_type()
+        )
+    })?;
+
+    let position = match pos_scalar {
+        ScalarValue::Int64(v) => *v,
+        other => {
+            return exec_err!(
+                "Unsupported position type {:?} for split_part",
+                other.data_type()
+            );
+        }
+    };
+
+    if position == Some(0) {

Review Comment:
   Here if the delimiter is NULL and the position=0 it will fail.
   But for Array (slow path) it fails only if all three args are non-NULL.
   To keep both paths behaving the same I think this `if` should be moved at 
line 268



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -220,6 +231,190 @@ fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: 
usize) -> Option<&'a str>
     }
 }
 
+/// Fast path for `split_part(array, scalar_delimiter, scalar_position)`.
+fn split_part_scalar(
+    string_array: &ArrayRef,
+    delim_scalar: &ScalarValue,
+    pos_scalar: &ScalarValue,
+) -> Result<ColumnarValue> {
+    let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
+        exec_datafusion_err!(
+            "Unsupported delimiter type {:?} for split_part",
+            delim_scalar.data_type()
+        )
+    })?;
+
+    let position = match pos_scalar {
+        ScalarValue::Int64(v) => *v,
+        other => {
+            return exec_err!(
+                "Unsupported position type {:?} for split_part",
+                other.data_type()
+            );
+        }
+    };
+
+    if position == Some(0) {
+        return exec_err!("field position must not be zero");
+    }
+
+    // Null delimiter or position → every row is null.
+    let (Some(delimiter), Some(position)) = (delimiter, position) else {
+        return Ok(ColumnarValue::Array(new_null_array(
+            string_array.data_type(),
+            string_array.len(),
+        )));
+    };
+
+    let result = match string_array.data_type() {
+        DataType::Utf8View => split_part_scalar_impl(
+            string_array.as_string_view(),
+            delimiter,
+            position,
+            StringViewBuilder::with_capacity(string_array.len()),
+        ),
+        DataType::Utf8 => {
+            let arr = string_array.as_string::<i32>();
+            split_part_scalar_impl(
+                arr,
+                delimiter,
+                position,
+                GenericStringBuilder::<i32>::with_capacity(
+                    arr.len(),
+                    arr.value_data().len(),

Review Comment:
   This may over-allocate if this is a sliced array.
   Can we use the offsets to calculate the actual length ?
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] perf: Optimize `split_part` for scalar args [datafusion]

Reply via email to