(arrow-datafusion) branch main updated: Port `StringToArray` to `function-arrays` subcrate (#9543)

jayzhan Sun, 10 Mar 2024 23:59:38 -0700

This is an automated email from the ASF dual-hosted git repository.

jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 4cd3c43300 Port `StringToArray` to `function-arrays` subcrate (#9543)
4cd3c43300 is described below

commit 4cd3c433004a7a6825643d6b3911db720efe5f76
Author: Eren Avsarogullari <[email protected]>
AuthorDate: Sun Mar 10 23:59:17 2024 -0700

    Port `StringToArray` to `function-arrays` subcrate (#9543)
    
    * Issue-9497 - Port StringToArray to function-arrays
    
    * Issue-9497 - Fix formatting issues
    
    * Issue-9497 - Format expressions.md documentation
---
 datafusion/expr/src/built_in_function.rs           |  18 ----
 datafusion/expr/src/expr_fn.rs                     |   2 -
 datafusion/functions-array/src/kernels.rs          | 106 +++++++++++++++++++--
 datafusion/functions-array/src/lib.rs              |   2 +
 datafusion/functions-array/src/udf.rs              |  76 +++++++++++++++
 datafusion/physical-expr/src/array_expressions.rs  |  93 +-----------------
 datafusion/physical-expr/src/functions.rs          |  15 ---
 datafusion/proto/proto/datafusion.proto            |   2 +-
 datafusion/proto/src/generated/pbjson.rs           |   3 -
 datafusion/proto/src/generated/prost.rs            |   4 +-
 datafusion/proto/src/logical_plan/from_proto.rs    |  15 +--
 datafusion/proto/src/logical_plan/to_proto.rs      |   1 -
 .../proto/tests/cases/roundtrip_logical_plan.rs    |   1 +
 docs/source/user-guide/expressions.md              |  73 +++++++-------
 docs/source/user-guide/sql/scalar_functions.md     |   1 +
 15 files changed, 224 insertions(+), 188 deletions(-)

diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs
index 6351e877df..9c2b6683e8 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -210,8 +210,6 @@ pub enum BuiltinScalarFunction {
     SHA512,
     /// split_part
     SplitPart,
-    /// string_to_array
-    StringToArray,
     /// starts_with
     StartsWith,
     /// strpos
@@ -383,7 +381,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::SHA512 => Volatility::Immutable,
             BuiltinScalarFunction::Digest => Volatility::Immutable,
             BuiltinScalarFunction::SplitPart => Volatility::Immutable,
-            BuiltinScalarFunction::StringToArray => Volatility::Immutable,
             BuiltinScalarFunction::StartsWith => Volatility::Immutable,
             BuiltinScalarFunction::Strpos => Volatility::Immutable,
             BuiltinScalarFunction::Substr => Volatility::Immutable,
@@ -556,11 +553,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::SplitPart => {
                 utf8_to_str_type(&input_expr_types[0], "split_part")
             }
-            BuiltinScalarFunction::StringToArray => 
Ok(List(Arc::new(Field::new(
-                "item",
-                input_expr_types[0].clone(),
-                true,
-            )))),
             BuiltinScalarFunction::StartsWith => Ok(Boolean),
             BuiltinScalarFunction::EndsWith => Ok(Boolean),
             BuiltinScalarFunction::Strpos => {
@@ -833,13 +825,6 @@ impl BuiltinScalarFunction {
                 ],
                 self.volatility(),
             ),
-            BuiltinScalarFunction::StringToArray => Signature::one_of(
-                vec![
-                    TypeSignature::Uniform(2, vec![Utf8, LargeUtf8]),
-                    TypeSignature::Uniform(3, vec![Utf8, LargeUtf8]),
-                ],
-                self.volatility(),
-            ),
 
             BuiltinScalarFunction::EndsWith
             | BuiltinScalarFunction::Strpos
@@ -1087,9 +1072,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Rpad => &["rpad"],
             BuiltinScalarFunction::Rtrim => &["rtrim"],
             BuiltinScalarFunction::SplitPart => &["split_part"],
-            BuiltinScalarFunction::StringToArray => {
-                &["string_to_array", "string_to_list"]
-            }
             BuiltinScalarFunction::StartsWith => &["starts_with"],
             BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
             BuiltinScalarFunction::Substr => &["substr"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index d1ae06d68f..5239c67b52 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -754,7 +754,6 @@ scalar_expr!(SHA256, sha256, string, "SHA-256 hash");
 scalar_expr!(SHA384, sha384, string, "SHA-384 hash");
 scalar_expr!(SHA512, sha512, string, "SHA-512 hash");
 scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string 
based on a delimiter and picks out the desired field based on the index.");
-scalar_expr!(StringToArray, string_to_array, string delimiter null_string, 
"splits a `string` based on a `delimiter` and returns an array of parts. Any 
parts matching the optional `null_string` will be replaced with `NULL`");
 scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string` 
starts with the `prefix`");
 scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends 
with the `suffix`");
 scalar_expr!(Strpos, strpos, string substring, "finds the position from where 
the `substring` matches the `string`");
@@ -1275,7 +1274,6 @@ mod test {
         test_scalar_expr!(SHA384, sha384, string);
         test_scalar_expr!(SHA512, sha512, string);
         test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
-        test_scalar_expr!(StringToArray, string_to_array, expr, delimiter, 
null_value);
         test_scalar_expr!(StartsWith, starts_with, string, characters);
         test_scalar_expr!(EndsWith, ends_with, string, characters);
         test_scalar_expr!(Strpos, strpos, string, substring);
diff --git a/datafusion/functions-array/src/kernels.rs b/datafusion/functions-array/src/kernels.rs
index bb5c4ef53e..6d843aa4bb 100644
--- a/datafusion/functions-array/src/kernels.rs
+++ b/datafusion/functions-array/src/kernels.rs
@@ -19,20 +19,20 @@
 
 use arrow::array::{
     Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array,
-    GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, 
LargeStringArray,
-    OffsetSizeTrait, StringArray, UInt16Array, UInt32Array, UInt64Array, 
UInt8Array,
+    GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeListArray,
+    LargeStringArray, ListArray, ListBuilder, OffsetSizeTrait, StringArray,
+    StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
-use arrow::array::{LargeListArray, ListArray};
 use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::Field;
 use arrow::datatypes::UInt64Type;
 use arrow::datatypes::{DataType, Date32Type, IntervalMonthDayNanoType};
 use datafusion_common::cast::{
-    as_date32_array, as_generic_list_array, as_int64_array, 
as_interval_mdn_array,
-    as_large_list_array, as_list_array, as_null_array, as_string_array,
+    as_date32_array, as_generic_list_array, as_generic_string_array, as_int64_array,
+    as_interval_mdn_array, as_large_list_array, as_list_array, as_null_array,
+    as_string_array,
 };
-use datafusion_common::DataFusionError;
-use datafusion_common::{exec_err, not_impl_datafusion_err, Result};
+use datafusion_common::{exec_err, not_impl_datafusion_err, DataFusionError, Result};
 use std::any::type_name;
 use std::sync::Arc;
 
@@ -261,6 +261,98 @@ pub(super) fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> {
     Ok(Arc::new(string_arr))
 }
 
+/// Splits string at occurrences of delimiter and returns an array of parts
+/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
+pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() < 2 || args.len() > 3 {
+        return exec_err!("string_to_array expects two or three arguments");
+    }
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+    let delimiter_array = as_generic_string_array::<T>(&args[1])?;
+
+    let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
+        string_array.len(),
+        string_array.get_buffer_memory_size(),
+    ));
+
+    match args.len() {
+        2 => {
+            string_array.iter().zip(delimiter_array.iter()).for_each(
+                |(string, delimiter)| {
+                    match (string, delimiter) {
+                        (Some(string), Some("")) => {
+                            list_builder.values().append_value(string);
+                            list_builder.append(true);
+                        }
+                        (Some(string), Some(delimiter)) => {
+                            string.split(delimiter).for_each(|s| {
+                                list_builder.values().append_value(s);
+                            });
+                            list_builder.append(true);
+                        }
+                        (Some(string), None) => {
+                            string.chars().map(|c| c.to_string()).for_each(|c| {
+                                list_builder.values().append_value(c);
+                            });
+                            list_builder.append(true);
+                        }
+                        _ => list_builder.append(false), // null value
+                    }
+                },
+            );
+        }
+
+        3 => {
+            let null_value_array = as_generic_string_array::<T>(&args[2])?;
+            string_array
+                .iter()
+                .zip(delimiter_array.iter())
+                .zip(null_value_array.iter())
+                .for_each(|((string, delimiter), null_value)| {
+                    match (string, delimiter) {
+                        (Some(string), Some("")) => {
+                            if Some(string) == null_value {
+                                list_builder.values().append_null();
+                            } else {
+                                list_builder.values().append_value(string);
+                            }
+                            list_builder.append(true);
+                        }
+                        (Some(string), Some(delimiter)) => {
+                            string.split(delimiter).for_each(|s| {
+                                if Some(s) == null_value {
+                                    list_builder.values().append_null();
+                                } else {
+                                    list_builder.values().append_value(s);
+                                }
+                            });
+                            list_builder.append(true);
+                        }
+                        (Some(string), None) => {
+                            string.chars().map(|c| c.to_string()).for_each(|c| {
+                                if Some(c.as_str()) == null_value {
+                                    list_builder.values().append_null();
+                                } else {
+                                    list_builder.values().append_value(c);
+                                }
+                            });
+                            list_builder.append(true);
+                        }
+                        _ => list_builder.append(false), // null value
+                    }
+                });
+        }
+        _ => {
+            return exec_err!(
+                "Expect string_to_array function to take two or three parameters"
+            )
+        }
+    }
+
+    let list_array = list_builder.finish();
+    Ok(Arc::new(list_array) as ArrayRef)
+}
+
 /// Generates an array of integers from start to stop with a given step.
 ///
 /// This function takes 1 to 3 ArrayRefs as arguments, representing start, 
stop, and step values.
diff --git a/datafusion/functions-array/src/lib.rs b/datafusion/functions-array/src/lib.rs
index cf1e35d608..0f395f2270 100644
--- a/datafusion/functions-array/src/lib.rs
+++ b/datafusion/functions-array/src/lib.rs
@@ -59,12 +59,14 @@ pub mod expr_fn {
     pub use super::udf::flatten;
     pub use super::udf::gen_series;
     pub use super::udf::range;
+    pub use super::udf::string_to_array;
 }
 
 /// Registers all enabled packages with a [`FunctionRegistry`]
 pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
     let functions: Vec<Arc<ScalarUDF>> = vec![
         udf::array_to_string_udf(),
+        udf::string_to_array_udf(),
         udf::range_udf(),
         udf::gen_series_udf(),
         udf::array_dims_udf(),
diff --git a/datafusion/functions-array/src/udf.rs b/datafusion/functions-array/src/udf.rs
index 854535c237..fc1cc281bc 100644
--- a/datafusion/functions-array/src/udf.rs
+++ b/datafusion/functions-array/src/udf.rs
@@ -17,6 +17,7 @@
 
 //! [`ScalarUDFImpl`] definitions for array functions.
 
+use arrow::array::{NullArray, StringArray};
 use arrow::datatypes::DataType;
 use arrow::datatypes::Field;
 use arrow::datatypes::IntervalUnit::MonthDayNano;
@@ -89,6 +90,81 @@ impl ScalarUDFImpl for ArrayToString {
     }
 }
 
+make_udf_function!(StringToArray,
+    string_to_array,
+    string delimiter null_string, // arg name
+    "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", // doc
+    string_to_array_udf // internal function name
+);
+#[derive(Debug)]
+pub(super) struct StringToArray {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl StringToArray {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec![
+                String::from("string_to_array"),
+                String::from("string_to_list"),
+            ],
+        }
+    }
+}
+
+impl ScalarUDFImpl for StringToArray {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "string_to_array"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            Utf8 | LargeUtf8 => {
+                List(Arc::new(Field::new("item", arg_types[0].clone(), true)))
+            }
+            _ => {
+                return plan_err!(
+                    "The string_to_array function can only accept Utf8 or LargeUtf8."
+                );
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> datafusion_common::Result<ColumnarValue> {
+        let mut args = ColumnarValue::values_to_arrays(args)?;
+        // Case: delimiter is NULL, needs to be handled as well.
+        if args[1].as_any().is::<NullArray>() {
+            args[1] = Arc::new(StringArray::new_null(args[1].len()));
+        };
+
+        match args[0].data_type() {
+            arrow::datatypes::DataType::Utf8 => {
+                crate::kernels::string_to_array::<i32>(&args).map(ColumnarValue::Array)
+            }
+            arrow::datatypes::DataType::LargeUtf8 => {
+                crate::kernels::string_to_array::<i64>(&args).map(ColumnarValue::Array)
+            }
+            other => {
+                exec_err!("unsupported type for string_to_array function as {other}")
+            }
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
 make_udf_function!(
     Range,
     range,
diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs
index 3f7ea57df2..ed656660ab 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -30,8 +30,8 @@ use arrow_buffer::{ArrowNativeType, NullBuffer};
 
 use arrow_schema::{FieldRef, SortOptions};
 use datafusion_common::cast::{
-    as_generic_list_array, as_generic_string_array, as_int64_array, 
as_large_list_array,
-    as_list_array, as_string_array,
+    as_generic_list_array, as_int64_array, as_large_list_array, as_list_array,
+    as_string_array,
 };
 use datafusion_common::utils::array_into_list_array;
 use datafusion_common::{
@@ -1587,95 +1587,6 @@ pub fn array_intersect(args: &[ArrayRef]) -> Result<ArrayRef> {
     general_set_op(array1, array2, SetOp::Intersect)
 }
 
-/// Splits string at occurrences of delimiter and returns an array of parts
-/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
-pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> 
Result<ArrayRef> {
-    let string_array = as_generic_string_array::<T>(&args[0])?;
-    let delimiter_array = as_generic_string_array::<T>(&args[1])?;
-
-    let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
-        string_array.len(),
-        string_array.get_buffer_memory_size(),
-    ));
-
-    match args.len() {
-        2 => {
-            string_array.iter().zip(delimiter_array.iter()).for_each(
-                |(string, delimiter)| {
-                    match (string, delimiter) {
-                        (Some(string), Some("")) => {
-                            list_builder.values().append_value(string);
-                            list_builder.append(true);
-                        }
-                        (Some(string), Some(delimiter)) => {
-                            string.split(delimiter).for_each(|s| {
-                                list_builder.values().append_value(s);
-                            });
-                            list_builder.append(true);
-                        }
-                        (Some(string), None) => {
-                            string.chars().map(|c| c.to_string()).for_each(|c| 
{
-                                list_builder.values().append_value(c);
-                            });
-                            list_builder.append(true);
-                        }
-                        _ => list_builder.append(false), // null value
-                    }
-                },
-            );
-        }
-
-        3 => {
-            let null_value_array = as_generic_string_array::<T>(&args[2])?;
-            string_array
-                .iter()
-                .zip(delimiter_array.iter())
-                .zip(null_value_array.iter())
-                .for_each(|((string, delimiter), null_value)| {
-                    match (string, delimiter) {
-                        (Some(string), Some("")) => {
-                            if Some(string) == null_value {
-                                list_builder.values().append_null();
-                            } else {
-                                list_builder.values().append_value(string);
-                            }
-                            list_builder.append(true);
-                        }
-                        (Some(string), Some(delimiter)) => {
-                            string.split(delimiter).for_each(|s| {
-                                if Some(s) == null_value {
-                                    list_builder.values().append_null();
-                                } else {
-                                    list_builder.values().append_value(s);
-                                }
-                            });
-                            list_builder.append(true);
-                        }
-                        (Some(string), None) => {
-                            string.chars().map(|c| c.to_string()).for_each(|c| 
{
-                                if Some(c.as_str()) == null_value {
-                                    list_builder.values().append_null();
-                                } else {
-                                    list_builder.values().append_value(c);
-                                }
-                            });
-                            list_builder.append(true);
-                        }
-                        _ => list_builder.append(false), // null value
-                    }
-                });
-        }
-        _ => {
-            return exec_err!(
-                "Expect string_to_array function to take two or three 
parameters"
-            )
-        }
-    }
-
-    let list_array = list_builder.finish();
-    Ok(Arc::new(list_array) as ArrayRef)
-}
-
 pub fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
     array: &GenericListArray<OffsetSize>,
     field: &FieldRef,
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index e9ac9bd2d6..db38e358db 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -602,21 +602,6 @@ pub fn create_physical_fun(
                 exec_err!("Unsupported data type {other:?} for function 
split_part")
             }
         }),
-        BuiltinScalarFunction::StringToArray => {
-            Arc::new(|args| match args[0].data_type() {
-                DataType::Utf8 => make_scalar_function_inner(
-                    array_expressions::string_to_array::<i32>,
-                )(args),
-                DataType::LargeUtf8 => make_scalar_function_inner(
-                    array_expressions::string_to_array::<i64>,
-                )(args),
-                other => {
-                    exec_err!(
-                        "Unsupported data type {other:?} for function 
string_to_array"
-                    )
-                }
-            })
-        }
         BuiltinScalarFunction::StartsWith => Arc::new(|args| match 
args[0].data_type() {
             DataType::Utf8 => {
                 
make_scalar_function_inner(string_expressions::starts_with::<i32>)(args)
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index c2a36af2e7..8d8ae3691a 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -664,7 +664,7 @@ enum ScalarFunction {
   Iszero = 114;
   // 115 was ArrayEmpty
   ArrayPopBack = 116;
-  StringToArray = 117;
+  // 117 was StringToArray
   // 118 was ToTimestampNanos
   ArrayIntersect = 119;
   ArrayUnion = 120;
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 0ec6de8f40..961a626463 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22185,7 +22185,6 @@ impl serde::Serialize for ScalarFunction {
             Self::Nanvl => "Nanvl",
             Self::Iszero => "Iszero",
             Self::ArrayPopBack => "ArrayPopBack",
-            Self::StringToArray => "StringToArray",
             Self::ArrayIntersect => "ArrayIntersect",
             Self::ArrayUnion => "ArrayUnion",
             Self::OverLay => "OverLay",
@@ -22300,7 +22299,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
             "Nanvl",
             "Iszero",
             "ArrayPopBack",
-            "StringToArray",
             "ArrayIntersect",
             "ArrayUnion",
             "OverLay",
@@ -22444,7 +22442,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
                     "Nanvl" => Ok(ScalarFunction::Nanvl),
                     "Iszero" => Ok(ScalarFunction::Iszero),
                     "ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack),
-                    "StringToArray" => Ok(ScalarFunction::StringToArray),
                     "ArrayIntersect" => Ok(ScalarFunction::ArrayIntersect),
                     "ArrayUnion" => Ok(ScalarFunction::ArrayUnion),
                     "OverLay" => Ok(ScalarFunction::OverLay),
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 9b34b084c9..deaa977faa 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2736,7 +2736,7 @@ pub enum ScalarFunction {
     Iszero = 114,
     /// 115 was ArrayEmpty
     ArrayPopBack = 116,
-    StringToArray = 117,
+    /// 117 was StringToArray
     /// 118 was ToTimestampNanos
     ArrayIntersect = 119,
     ArrayUnion = 120,
@@ -2855,7 +2855,6 @@ impl ScalarFunction {
             ScalarFunction::Nanvl => "Nanvl",
             ScalarFunction::Iszero => "Iszero",
             ScalarFunction::ArrayPopBack => "ArrayPopBack",
-            ScalarFunction::StringToArray => "StringToArray",
             ScalarFunction::ArrayIntersect => "ArrayIntersect",
             ScalarFunction::ArrayUnion => "ArrayUnion",
             ScalarFunction::OverLay => "OverLay",
@@ -2964,7 +2963,6 @@ impl ScalarFunction {
             "Nanvl" => Some(Self::Nanvl),
             "Iszero" => Some(Self::Iszero),
             "ArrayPopBack" => Some(Self::ArrayPopBack),
-            "StringToArray" => Some(Self::StringToArray),
             "ArrayIntersect" => Some(Self::ArrayIntersect),
             "ArrayUnion" => Some(Self::ArrayUnion),
             "OverLay" => Some(Self::OverLay),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 8dba553b48..739503a942 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -60,11 +60,10 @@ use datafusion_expr::{
     logical_plan::{PlanType, StringifiedPlan},
     lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, 
radians,
     random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, 
sha256, sha384,
-    sha512, signum, sin, sinh, split_part, sqrt, starts_with, string_to_array, 
strpos,
-    struct_fun, substr, substr_index, substring, tan, tanh, to_hex, translate, 
trim,
-    trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, 
BuiltInWindowFunction,
-    BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField,
-    GroupingSet,
+    sha512, signum, sin, sinh, split_part, sqrt, starts_with, strpos, struct_fun, substr,
+    substr_index, substring, tan, tanh, to_hex, translate, trim, trunc, upper, uuid,
+    AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction,
+    Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet,
     GroupingSet::GroupingSets,
     JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, 
WindowFrameBound,
     WindowFrameUnits,
@@ -521,7 +520,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
             ScalarFunction::Right => Self::Right,
             ScalarFunction::Rpad => Self::Rpad,
             ScalarFunction::SplitPart => Self::SplitPart,
-            ScalarFunction::StringToArray => Self::StringToArray,
             ScalarFunction::StartsWith => Self::StartsWith,
             ScalarFunction::Strpos => Self::Strpos,
             ScalarFunction::Substr => Self::Substr,
@@ -1742,11 +1740,6 @@ pub fn parse_expr(
                 ScalarFunction::ArrowTypeof => {
                     Ok(arrow_typeof(parse_expr(&args[0], registry, codec)?))
                 }
-                ScalarFunction::StringToArray => Ok(string_to_array(
-                    parse_expr(&args[0], registry, codec)?,
-                    parse_expr(&args[1], registry, codec)?,
-                    parse_expr(&args[2], registry, codec)?,
-                )),
                 ScalarFunction::OverLay => Ok(overlay(
                     args.to_owned()
                         .iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index 393cc78267..a9867b8f03 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1502,7 +1502,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
             BuiltinScalarFunction::Right => Self::Right,
             BuiltinScalarFunction::Rpad => Self::Rpad,
             BuiltinScalarFunction::SplitPart => Self::SplitPart,
-            BuiltinScalarFunction::StringToArray => Self::StringToArray,
             BuiltinScalarFunction::StartsWith => Self::StartsWith,
             BuiltinScalarFunction::Strpos => Self::Strpos,
             BuiltinScalarFunction::Substr => Self::Substr,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 76402604ac..3899f64a37 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -586,6 +586,7 @@ async fn roundtrip_expr_api() -> Result<()> {
         array_dims(make_array(vec![lit(1), lit(2), lit(3)])),
         array_ndims(make_array(vec![lit(1), lit(2), lit(3)])),
         cardinality(make_array(vec![lit(1), lit(2), lit(3)])),
+        string_to_array(lit("abc#def#ghl"), lit("#"), lit(",")),
         range(lit(1), lit(10), lit(2)),
         gen_series(lit(1), lit(10), lit(2)),
         array_append(make_array(vec![lit(1), lit(2), lit(3)]), lit(4)),
diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md
index dcb599b9b3..17da8c3fc2 100644
--- a/docs/source/user-guide/expressions.md
+++ b/docs/source/user-guide/expressions.md
@@ -207,42 +207,43 @@ select log(-1), log(0), sqrt(-1);
 
 ## Array Expressions
 
-| Syntax                                 | Description                         
                                                                                
                                                     |
-| -------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
-| array_append(array, element)           | Appends an element to the end of an 
array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]`                             
                                                     |
-| array_concat(array[, ..., array_n])    | Concatenates arrays. 
`array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]`                      
                                                                    |
-| array_has(array, element)              | Returns true if the array contains 
the element `array_has([1,2,3], 1) -> true`                                     
                                                      |
-| array_has_all(array, sub-array)        | Returns true if all elements of 
sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true`                
                                                         |
-| array_has_any(array, sub-array)        | Returns true if any elements exist 
in both arrays `array_has_any([1,2,3], [1,4]) -> true`                          
                                                      |
-| array_dims(array)                      | Returns an array of the array's 
dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]`                      
                                                         |
-| array_distinct(array)                  | Returns distinct values from the 
array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2, 4]) -> [1, 
2, 3, 4]`                                                |
-| array_element(array, index)            | Extracts the element with the index 
n from the array `array_element([1, 2, 3, 4], 3) -> 3`                          
                                                     |
-| flatten(array)                         | Converts an array of arrays to a 
flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]`            
                                                        |
-| array_length(array, dimension)         | Returns the length of the array 
dimension. `array_length([1, 2, 3, 4, 5]) -> 5`                                 
                                                         |
-| array_ndims(array)                     | Returns the number of dimensions of 
the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2`                           
                                                     |
-| array_pop_front(array)                 | Returns the array without the first 
element. `array_pop_front([1, 2, 3]) -> [2, 3]`                                 
                                                     |
-| array_pop_back(array)                  | Returns the array without the last 
element. `array_pop_back([1, 2, 3]) -> [1, 2]`                                  
                                                      |
-| array_position(array, element)         | Searches for an element in the 
array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2`      
                                                          |
-| array_positions(array, element)        | Searches for an element in the 
array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2, 3]` 
                                                          |
-| array_prepend(array, element)          | Prepends an element to the 
beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]`            
                                                              |
-| array_repeat(element, count)           | Returns an array containing element 
`count` times. `array_repeat(1, 3) -> [1, 1, 1]`                                
                                                     |
-| array_remove(array, element)           | Removes the first element from the 
array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 
2, 3, 2, 1, 4]`                                        |
-| array_remove_n(array, element, max)    | Removes the first `max` elements 
from the array equal to the given value. `array_remove_n([1, 2, 2, 3, 2, 1, 4], 
2, 2) -> [1, 3, 2, 1, 4]`                               |
-| array_remove_all(array, element)       | Removes all elements from the array 
equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 3, 
1, 4]`                                               |
-| array_replace(array, from, to)         | Replaces the first occurrence of 
the specified element with another specified element. `array_replace([1, 2, 2, 
3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]`             |
-| array_replace_n(array, from, to, max)  | Replaces the first `max` 
occurrences of the specified element with another specified element. 
`array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]` |
-| array_replace_all(array, from, to)     | Replaces all occurrences of the 
specified element with another specified element. `array_replace_all([1, 2, 2, 
3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]`              |
-| array_slice(array, begin,end)          | Returns a slice of the array. 
`array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]`                   
                                                           |
-| array_slice(array, begin, end, stride) | Returns a slice of the array with 
added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6, 2) -> [3, 5, 
6]`                                                    |
-| array_to_string(array, delimiter)      | Converts each element to its text 
representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4`                 
                                                       |
-| array_intersect(array1, array2)        | Returns an array of the elements in 
the intersection of array1 and array2. `array_intersect([1, 2, 3, 4], [5, 6, 3, 
4]) -> [3, 4]`                                       |
-| array_union(array1, array2)            | Returns an array of the elements in 
the union of array1 and array2 without duplicates. `array_union([1, 2, 3, 4], 
[5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]`                   |
-| array_except(array1, array2)           | Returns an array of the elements 
that appear in the first array but not in the second. `array_except([1, 2, 3, 
4], [5, 6, 3, 4]) -> [3, 4]`                              |
-| array_resize(array, size, value)       | Resizes the list to contain size 
elements. Initializes new elements with value or empty if value is not set. 
`array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 4, 5, 6]`       |
-| cardinality(array)                     | Returns the total number of 
elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6`               
                                                             |
-| make_array(value1, [value2 [, ...]])   | Returns an Arrow array using the 
specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]`                 
                                                        |
-| range(start [, stop, step])            | Returns an Arrow array between 
start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]`                 
                                                          |
-| trim_array(array, n)                   | Deprecated                          
                                                                                
                                                     |
+| Syntax                                         | Description                 
                                                                                
                                                                                
                            |
+| ---------------------------------------------- | 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
+| array_append(array, element)                   | Appends an element to the 
end of an array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]`                   
                                                                                
                              |
+| array_concat(array[, ..., array_n])            | Concatenates arrays. 
`array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]`                      
                                                                                
                                   |
+| array_has(array, element)                      | Returns true if the array 
contains the element `array_has([1,2,3], 1) -> true`                            
                                                                                
                              |
+| array_has_all(array, sub-array)                | Returns true if all 
elements of sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true`    
                                                                                
                                    |
+| array_has_any(array, sub-array)                | Returns true if any 
elements exist in both arrays `array_has_any([1,2,3], [1,4]) -> true`           
                                                                                
                                    |
+| array_dims(array)                              | Returns an array of the 
array's dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]`              
                                                                                
                                |
+| array_distinct(array)                          | Returns distinct values 
from the array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2, 
4]) -> [1, 2, 3, 4]`                                                            
                                   |
+| array_element(array, index)                    | Extracts the element with 
the index n from the array `array_element([1, 2, 3, 4], 3) -> 3`                
                                                                                
                              |
+| flatten(array)                                 | Converts an array of arrays 
to a flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]`       
                                                                                
                            |
+| array_length(array, dimension)                 | Returns the length of the 
array dimension. `array_length([1, 2, 3, 4, 5]) -> 5`                           
                                                                                
                              |
+| array_ndims(array)                             | Returns the number of 
dimensions of the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2`             
                                                                                
                                  |
+| array_pop_front(array)                         | Returns the array without 
the first element. `array_pop_front([1, 2, 3]) -> [2, 3]`                       
                                                                                
                              |
+| array_pop_back(array)                          | Returns the array without 
the last element. `array_pop_back([1, 2, 3]) -> [1, 2]`                         
                                                                                
                              |
+| array_position(array, element)                 | Searches for an element in 
the array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2`  
                                                                                
                             |
+| array_positions(array, element)                | Searches for an element in 
the array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2, 
3]`                                                                             
                             |
+| array_prepend(element, array)                  | Prepends an element to the 
beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]`            
                                                                                
                             |
+| array_repeat(element, count)                   | Returns an array containing 
element `count` times. `array_repeat(1, 3) -> [1, 1, 1]`                        
                                                                                
                            |
+| array_remove(array, element)                   | Removes the first element 
from the array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4], 
2) -> [1, 2, 3, 2, 1, 4]`                                                       
                                |
+| array_remove_n(array, element, max)            | Removes the first `max` 
elements from the array equal to the given value. `array_remove_n([1, 2, 2, 3, 
2, 1, 4], 2, 2) -> [1, 3, 2, 1, 4]`                                             
                                 |
+| array_remove_all(array, element)               | Removes all elements from 
the array equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2) 
-> [1, 3, 1, 4]`                                                                
                              |
+| array_replace(array, from, to)                 | Replaces the first 
occurrence of the specified element with another specified element. 
`array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]`           
                                                 |
+| array_replace_n(array, from, to, max)          | Replaces the first `max` 
occurrences of the specified element with another specified element. 
`array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]`      
                                          |
+| array_replace_all(array, from, to)             | Replaces all occurrences of 
the specified element with another specified element. `array_replace_all([1, 2, 
2, 3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]`                                 
                            |
+| array_slice(array, begin,end)                  | Returns a slice of the 
array. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]`            
                                                                                
                                 |
+| array_slice(array, begin, end, stride)         | Returns a slice of the 
array with added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6, 
2) -> [3, 5]`                                                                   
                                   |
+| array_to_string(array, delimiter)              | Converts each element to 
its text representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4`        
                                                                                
                               |
+| array_intersect(array1, array2)                | Returns an array of the 
elements in the intersection of array1 and array2. `array_intersect([1, 2, 3, 
4], [5, 6, 3, 4]) -> [3, 4]`                                                    
                                  |
+| array_union(array1, array2)                    | Returns an array of the 
elements in the union of array1 and array2 without duplicates. `array_union([1, 
2, 3, 4], [5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]`                                  
                                |
+| array_except(array1, array2)                   | Returns an array of the 
elements that appear in the first array but not in the second. 
`array_except([1, 2, 3, 4], [5, 6, 3, 4]) -> [1, 2]`                            
                                                 |
+| array_resize(array, size, value)               | Resizes the list to contain 
size elements. Initializes new elements with value or empty if value is not 
set. `array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 0, 0]`                         
                                |
+| cardinality(array)                             | Returns the total number of 
elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6`               
                                                                                
                            |
+| make_array(value1, [value2 [, ...]])           | Returns an Arrow array 
using the specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]`       
                                                                                
                                 |
+| range(start [, stop, step])                    | Returns an Arrow array 
between start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]`         
                                                                                
                                 |
+| string_to_array(str, delimiter, null_string)   | Splits a `string` based on 
a `delimiter` and returns an array of parts. Any parts matching the optional 
`null_string` will be replaced with `NULL`. `string_to_array('abc#def#ghi', 
'#', ' ') -> ['abc', 'def', 'ghi']` |
+| trim_array(array, n)                           | Deprecated                  
                                                                                
                                                                                
                            |
 
 ## Regular Expressions
 
diff --git a/docs/source/user-guide/sql/scalar_functions.md 
b/docs/source/user-guide/sql/scalar_functions.md
index b0385b4923..7496039116 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -3113,6 +3113,7 @@ _Alias of [make_array](#make_array)._
 ### `string_to_array`
 
 Splits a string into an array of substrings based on a delimiter. Any 
substrings matching the optional `null_str` argument are replaced with NULL.
+`SELECT string_to_array('abc##def', '##')` or `SELECT string_to_array('abc 
def', ' ', 'def')`
 
 ```
 string_to_array(str, delimiter[, null_str])

(arrow-datafusion) branch main updated: Port `StringToArray` to `function-arrays` subcrate (#9543)

Reply via email to