This is an automated email from the ASF dual-hosted git repository.
jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4cd3c43300 Port `StringToArray` to `function-arrays` subcrate (#9543)
4cd3c43300 is described below
commit 4cd3c433004a7a6825643d6b3911db720efe5f76
Author: Eren Avsarogullari <[email protected]>
AuthorDate: Sun Mar 10 23:59:17 2024 -0700
Port `StringToArray` to `function-arrays` subcrate (#9543)
* Issue-9497 - Port StringToArray to function-arrays
* Issue-9497 - Fix formatting issues
* Issue-9497 - Format expressions.md documentation
---
datafusion/expr/src/built_in_function.rs | 18 ----
datafusion/expr/src/expr_fn.rs | 2 -
datafusion/functions-array/src/kernels.rs | 106 +++++++++++++++++++--
datafusion/functions-array/src/lib.rs | 2 +
datafusion/functions-array/src/udf.rs | 76 +++++++++++++++
datafusion/physical-expr/src/array_expressions.rs | 93 +-----------------
datafusion/physical-expr/src/functions.rs | 15 ---
datafusion/proto/proto/datafusion.proto | 2 +-
datafusion/proto/src/generated/pbjson.rs | 3 -
datafusion/proto/src/generated/prost.rs | 4 +-
datafusion/proto/src/logical_plan/from_proto.rs | 15 +--
datafusion/proto/src/logical_plan/to_proto.rs | 1 -
.../proto/tests/cases/roundtrip_logical_plan.rs | 1 +
docs/source/user-guide/expressions.md | 73 +++++++-------
docs/source/user-guide/sql/scalar_functions.md | 1 +
15 files changed, 224 insertions(+), 188 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index 6351e877df..9c2b6683e8 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -210,8 +210,6 @@ pub enum BuiltinScalarFunction {
SHA512,
/// split_part
SplitPart,
- /// string_to_array
- StringToArray,
/// starts_with
StartsWith,
/// strpos
@@ -383,7 +381,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SHA512 => Volatility::Immutable,
BuiltinScalarFunction::Digest => Volatility::Immutable,
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
- BuiltinScalarFunction::StringToArray => Volatility::Immutable,
BuiltinScalarFunction::StartsWith => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
@@ -556,11 +553,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SplitPart => {
utf8_to_str_type(&input_expr_types[0], "split_part")
}
- BuiltinScalarFunction::StringToArray =>
Ok(List(Arc::new(Field::new(
- "item",
- input_expr_types[0].clone(),
- true,
- )))),
BuiltinScalarFunction::StartsWith => Ok(Boolean),
BuiltinScalarFunction::EndsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
@@ -833,13 +825,6 @@ impl BuiltinScalarFunction {
],
self.volatility(),
),
- BuiltinScalarFunction::StringToArray => Signature::one_of(
- vec![
- TypeSignature::Uniform(2, vec![Utf8, LargeUtf8]),
- TypeSignature::Uniform(3, vec![Utf8, LargeUtf8]),
- ],
- self.volatility(),
- ),
BuiltinScalarFunction::EndsWith
| BuiltinScalarFunction::Strpos
@@ -1087,9 +1072,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Rpad => &["rpad"],
BuiltinScalarFunction::Rtrim => &["rtrim"],
BuiltinScalarFunction::SplitPart => &["split_part"],
- BuiltinScalarFunction::StringToArray => {
- &["string_to_array", "string_to_list"]
- }
BuiltinScalarFunction::StartsWith => &["starts_with"],
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
BuiltinScalarFunction::Substr => &["substr"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index d1ae06d68f..5239c67b52 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -754,7 +754,6 @@ scalar_expr!(SHA256, sha256, string, "SHA-256 hash");
scalar_expr!(SHA384, sha384, string, "SHA-384 hash");
scalar_expr!(SHA512, sha512, string, "SHA-512 hash");
scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string
based on a delimiter and picks out the desired field based on the index.");
-scalar_expr!(StringToArray, string_to_array, string delimiter null_string,
"splits a `string` based on a `delimiter` and returns an array of parts. Any
parts matching the optional `null_string` will be replaced with `NULL`");
scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string`
starts with the `prefix`");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends
with the `suffix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where
the `substring` matches the `string`");
@@ -1275,7 +1274,6 @@ mod test {
test_scalar_expr!(SHA384, sha384, string);
test_scalar_expr!(SHA512, sha512, string);
test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
- test_scalar_expr!(StringToArray, string_to_array, expr, delimiter,
null_value);
test_scalar_expr!(StartsWith, starts_with, string, characters);
test_scalar_expr!(EndsWith, ends_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
diff --git a/datafusion/functions-array/src/kernels.rs
b/datafusion/functions-array/src/kernels.rs
index bb5c4ef53e..6d843aa4bb 100644
--- a/datafusion/functions-array/src/kernels.rs
+++ b/datafusion/functions-array/src/kernels.rs
@@ -19,20 +19,20 @@
use arrow::array::{
Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array,
- GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array,
LargeStringArray,
- OffsetSizeTrait, StringArray, UInt16Array, UInt32Array, UInt64Array,
UInt8Array,
+ GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array,
LargeListArray,
+ LargeStringArray, ListArray, ListBuilder, OffsetSizeTrait, StringArray,
+ StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
-use arrow::array::{LargeListArray, ListArray};
use arrow::buffer::OffsetBuffer;
use arrow::datatypes::Field;
use arrow::datatypes::UInt64Type;
use arrow::datatypes::{DataType, Date32Type, IntervalMonthDayNanoType};
use datafusion_common::cast::{
- as_date32_array, as_generic_list_array, as_int64_array,
as_interval_mdn_array,
- as_large_list_array, as_list_array, as_null_array, as_string_array,
+ as_date32_array, as_generic_list_array, as_generic_string_array,
as_int64_array,
+ as_interval_mdn_array, as_large_list_array, as_list_array, as_null_array,
+ as_string_array,
};
-use datafusion_common::DataFusionError;
-use datafusion_common::{exec_err, not_impl_datafusion_err, Result};
+use datafusion_common::{exec_err, not_impl_datafusion_err, DataFusionError,
Result};
use std::any::type_name;
use std::sync::Arc;
@@ -261,6 +261,98 @@ pub(super) fn array_to_string(args: &[ArrayRef]) ->
Result<ArrayRef> {
Ok(Arc::new(string_arr))
}
+/// Splits string at occurrences of delimiter and returns an array of parts
+/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
+pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
+ if args.len() < 2 || args.len() > 3 {
+ return exec_err!("string_to_array expects two or three arguments");
+ }
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let delimiter_array = as_generic_string_array::<T>(&args[1])?;
+
+ let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
+ string_array.len(),
+ string_array.get_buffer_memory_size(),
+ ));
+
+ match args.len() {
+ 2 => {
+ string_array.iter().zip(delimiter_array.iter()).for_each(
+ |(string, delimiter)| {
+ match (string, delimiter) {
+ (Some(string), Some("")) => {
+ list_builder.values().append_value(string);
+ list_builder.append(true);
+ }
+ (Some(string), Some(delimiter)) => {
+ string.split(delimiter).for_each(|s| {
+ list_builder.values().append_value(s);
+ });
+ list_builder.append(true);
+ }
+ (Some(string), None) => {
+ string.chars().map(|c| c.to_string()).for_each(|c|
{
+ list_builder.values().append_value(c);
+ });
+ list_builder.append(true);
+ }
+ _ => list_builder.append(false), // null value
+ }
+ },
+ );
+ }
+
+ 3 => {
+ let null_value_array = as_generic_string_array::<T>(&args[2])?;
+ string_array
+ .iter()
+ .zip(delimiter_array.iter())
+ .zip(null_value_array.iter())
+ .for_each(|((string, delimiter), null_value)| {
+ match (string, delimiter) {
+ (Some(string), Some("")) => {
+ if Some(string) == null_value {
+ list_builder.values().append_null();
+ } else {
+ list_builder.values().append_value(string);
+ }
+ list_builder.append(true);
+ }
+ (Some(string), Some(delimiter)) => {
+ string.split(delimiter).for_each(|s| {
+ if Some(s) == null_value {
+ list_builder.values().append_null();
+ } else {
+ list_builder.values().append_value(s);
+ }
+ });
+ list_builder.append(true);
+ }
+ (Some(string), None) => {
+ string.chars().map(|c| c.to_string()).for_each(|c|
{
+ if Some(c.as_str()) == null_value {
+ list_builder.values().append_null();
+ } else {
+ list_builder.values().append_value(c);
+ }
+ });
+ list_builder.append(true);
+ }
+ _ => list_builder.append(false), // null value
+ }
+ });
+ }
+ _ => {
+ return exec_err!(
+ "Expect string_to_array function to take two or three
parameters"
+ )
+ }
+ }
+
+ let list_array = list_builder.finish();
+ Ok(Arc::new(list_array) as ArrayRef)
+}
+
/// Generates an array of integers from start to stop with a given step.
///
/// This function takes 1 to 3 ArrayRefs as arguments, representing start,
stop, and step values.
diff --git a/datafusion/functions-array/src/lib.rs
b/datafusion/functions-array/src/lib.rs
index cf1e35d608..0f395f2270 100644
--- a/datafusion/functions-array/src/lib.rs
+++ b/datafusion/functions-array/src/lib.rs
@@ -59,12 +59,14 @@ pub mod expr_fn {
pub use super::udf::flatten;
pub use super::udf::gen_series;
pub use super::udf::range;
+ pub use super::udf::string_to_array;
}
/// Registers all enabled packages with a [`FunctionRegistry`]
pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
let functions: Vec<Arc<ScalarUDF>> = vec![
udf::array_to_string_udf(),
+ udf::string_to_array_udf(),
udf::range_udf(),
udf::gen_series_udf(),
udf::array_dims_udf(),
diff --git a/datafusion/functions-array/src/udf.rs
b/datafusion/functions-array/src/udf.rs
index 854535c237..fc1cc281bc 100644
--- a/datafusion/functions-array/src/udf.rs
+++ b/datafusion/functions-array/src/udf.rs
@@ -17,6 +17,7 @@
//! [`ScalarUDFImpl`] definitions for array functions.
+use arrow::array::{NullArray, StringArray};
use arrow::datatypes::DataType;
use arrow::datatypes::Field;
use arrow::datatypes::IntervalUnit::MonthDayNano;
@@ -89,6 +90,81 @@ impl ScalarUDFImpl for ArrayToString {
}
}
+make_udf_function!(StringToArray,
+ string_to_array,
+ string delimiter null_string, // arg name
+ "splits a `string` based on a `delimiter` and returns an array of parts.
Any parts matching the optional `null_string` will be replaced with `NULL`", //
doc
+ string_to_array_udf // internal function name
+);
+#[derive(Debug)]
+pub(super) struct StringToArray {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl StringToArray {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::variadic_any(Volatility::Immutable),
+ aliases: vec![
+ String::from("string_to_array"),
+ String::from("string_to_list"),
+ ],
+ }
+ }
+}
+
+impl ScalarUDFImpl for StringToArray {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+ fn name(&self) -> &str {
+ "string_to_array"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) ->
datafusion_common::Result<DataType> {
+ use DataType::*;
+ Ok(match arg_types[0] {
+ Utf8 | LargeUtf8 => {
+ List(Arc::new(Field::new("item", arg_types[0].clone(), true)))
+ }
+ _ => {
+ return plan_err!(
+ "The string_to_array function can only accept Utf8 or
LargeUtf8."
+ );
+ }
+ })
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) ->
datafusion_common::Result<ColumnarValue> {
+ let mut args = ColumnarValue::values_to_arrays(args)?;
+ // Case: delimiter is NULL, needs to be handled as well.
+ if args[1].as_any().is::<NullArray>() {
+ args[1] = Arc::new(StringArray::new_null(args[1].len()));
+ };
+
+ match args[0].data_type() {
+ arrow::datatypes::DataType::Utf8 => {
+
crate::kernels::string_to_array::<i32>(&args).map(ColumnarValue::Array)
+ }
+ arrow::datatypes::DataType::LargeUtf8 => {
+
crate::kernels::string_to_array::<i64>(&args).map(ColumnarValue::Array)
+ }
+ other => {
+ exec_err!("unsupported type for string_to_array function as
{other}")
+ }
+ }
+ }
+
+ fn aliases(&self) -> &[String] {
+ &self.aliases
+ }
+}
+
make_udf_function!(
Range,
range,
diff --git a/datafusion/physical-expr/src/array_expressions.rs
b/datafusion/physical-expr/src/array_expressions.rs
index 3f7ea57df2..ed656660ab 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -30,8 +30,8 @@ use arrow_buffer::{ArrowNativeType, NullBuffer};
use arrow_schema::{FieldRef, SortOptions};
use datafusion_common::cast::{
- as_generic_list_array, as_generic_string_array, as_int64_array,
as_large_list_array,
- as_list_array, as_string_array,
+ as_generic_list_array, as_int64_array, as_large_list_array, as_list_array,
+ as_string_array,
};
use datafusion_common::utils::array_into_list_array;
use datafusion_common::{
@@ -1587,95 +1587,6 @@ pub fn array_intersect(args: &[ArrayRef]) ->
Result<ArrayRef> {
general_set_op(array1, array2, SetOp::Intersect)
}
-/// Splits string at occurrences of delimiter and returns an array of parts
-/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
-pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let delimiter_array = as_generic_string_array::<T>(&args[1])?;
-
- let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
- string_array.len(),
- string_array.get_buffer_memory_size(),
- ));
-
- match args.len() {
- 2 => {
- string_array.iter().zip(delimiter_array.iter()).for_each(
- |(string, delimiter)| {
- match (string, delimiter) {
- (Some(string), Some("")) => {
- list_builder.values().append_value(string);
- list_builder.append(true);
- }
- (Some(string), Some(delimiter)) => {
- string.split(delimiter).for_each(|s| {
- list_builder.values().append_value(s);
- });
- list_builder.append(true);
- }
- (Some(string), None) => {
- string.chars().map(|c| c.to_string()).for_each(|c|
{
- list_builder.values().append_value(c);
- });
- list_builder.append(true);
- }
- _ => list_builder.append(false), // null value
- }
- },
- );
- }
-
- 3 => {
- let null_value_array = as_generic_string_array::<T>(&args[2])?;
- string_array
- .iter()
- .zip(delimiter_array.iter())
- .zip(null_value_array.iter())
- .for_each(|((string, delimiter), null_value)| {
- match (string, delimiter) {
- (Some(string), Some("")) => {
- if Some(string) == null_value {
- list_builder.values().append_null();
- } else {
- list_builder.values().append_value(string);
- }
- list_builder.append(true);
- }
- (Some(string), Some(delimiter)) => {
- string.split(delimiter).for_each(|s| {
- if Some(s) == null_value {
- list_builder.values().append_null();
- } else {
- list_builder.values().append_value(s);
- }
- });
- list_builder.append(true);
- }
- (Some(string), None) => {
- string.chars().map(|c| c.to_string()).for_each(|c|
{
- if Some(c.as_str()) == null_value {
- list_builder.values().append_null();
- } else {
- list_builder.values().append_value(c);
- }
- });
- list_builder.append(true);
- }
- _ => list_builder.append(false), // null value
- }
- });
- }
- _ => {
- return exec_err!(
- "Expect string_to_array function to take two or three
parameters"
- )
- }
- }
-
- let list_array = list_builder.finish();
- Ok(Arc::new(list_array) as ArrayRef)
-}
-
pub fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
array: &GenericListArray<OffsetSize>,
field: &FieldRef,
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index e9ac9bd2d6..db38e358db 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -602,21 +602,6 @@ pub fn create_physical_fun(
exec_err!("Unsupported data type {other:?} for function
split_part")
}
}),
- BuiltinScalarFunction::StringToArray => {
- Arc::new(|args| match args[0].data_type() {
- DataType::Utf8 => make_scalar_function_inner(
- array_expressions::string_to_array::<i32>,
- )(args),
- DataType::LargeUtf8 => make_scalar_function_inner(
- array_expressions::string_to_array::<i64>,
- )(args),
- other => {
- exec_err!(
- "Unsupported data type {other:?} for function
string_to_array"
- )
- }
- })
- }
BuiltinScalarFunction::StartsWith => Arc::new(|args| match
args[0].data_type() {
DataType::Utf8 => {
make_scalar_function_inner(string_expressions::starts_with::<i32>)(args)
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index c2a36af2e7..8d8ae3691a 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -664,7 +664,7 @@ enum ScalarFunction {
Iszero = 114;
// 115 was ArrayEmpty
ArrayPopBack = 116;
- StringToArray = 117;
+ // 117 was StringToArray
// 118 was ToTimestampNanos
ArrayIntersect = 119;
ArrayUnion = 120;
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index 0ec6de8f40..961a626463 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22185,7 +22185,6 @@ impl serde::Serialize for ScalarFunction {
Self::Nanvl => "Nanvl",
Self::Iszero => "Iszero",
Self::ArrayPopBack => "ArrayPopBack",
- Self::StringToArray => "StringToArray",
Self::ArrayIntersect => "ArrayIntersect",
Self::ArrayUnion => "ArrayUnion",
Self::OverLay => "OverLay",
@@ -22300,7 +22299,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Nanvl",
"Iszero",
"ArrayPopBack",
- "StringToArray",
"ArrayIntersect",
"ArrayUnion",
"OverLay",
@@ -22444,7 +22442,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Nanvl" => Ok(ScalarFunction::Nanvl),
"Iszero" => Ok(ScalarFunction::Iszero),
"ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack),
- "StringToArray" => Ok(ScalarFunction::StringToArray),
"ArrayIntersect" => Ok(ScalarFunction::ArrayIntersect),
"ArrayUnion" => Ok(ScalarFunction::ArrayUnion),
"OverLay" => Ok(ScalarFunction::OverLay),
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index 9b34b084c9..deaa977faa 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2736,7 +2736,7 @@ pub enum ScalarFunction {
Iszero = 114,
/// 115 was ArrayEmpty
ArrayPopBack = 116,
- StringToArray = 117,
+ /// 117 was StringToArray
/// 118 was ToTimestampNanos
ArrayIntersect = 119,
ArrayUnion = 120,
@@ -2855,7 +2855,6 @@ impl ScalarFunction {
ScalarFunction::Nanvl => "Nanvl",
ScalarFunction::Iszero => "Iszero",
ScalarFunction::ArrayPopBack => "ArrayPopBack",
- ScalarFunction::StringToArray => "StringToArray",
ScalarFunction::ArrayIntersect => "ArrayIntersect",
ScalarFunction::ArrayUnion => "ArrayUnion",
ScalarFunction::OverLay => "OverLay",
@@ -2964,7 +2963,6 @@ impl ScalarFunction {
"Nanvl" => Some(Self::Nanvl),
"Iszero" => Some(Self::Iszero),
"ArrayPopBack" => Some(Self::ArrayPopBack),
- "StringToArray" => Some(Self::StringToArray),
"ArrayIntersect" => Some(Self::ArrayIntersect),
"ArrayUnion" => Some(Self::ArrayUnion),
"OverLay" => Some(Self::OverLay),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs
b/datafusion/proto/src/logical_plan/from_proto.rs
index 8dba553b48..739503a942 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -60,11 +60,10 @@ use datafusion_expr::{
logical_plan::{PlanType, StringifiedPlan},
lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power,
radians,
random, repeat, replace, reverse, right, round, rpad, rtrim, sha224,
sha256, sha384,
- sha512, signum, sin, sinh, split_part, sqrt, starts_with, string_to_array,
strpos,
- struct_fun, substr, substr_index, substring, tan, tanh, to_hex, translate,
trim,
- trunc, upper, uuid, AggregateFunction, Between, BinaryExpr,
BuiltInWindowFunction,
- BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField,
- GroupingSet,
+ sha512, signum, sin, sinh, split_part, sqrt, starts_with, strpos,
struct_fun, substr,
+ substr_index, substring, tan, tanh, to_hex, translate, trim, trunc, upper,
uuid,
+ AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction,
BuiltinScalarFunction,
+ Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet,
GroupingSet::GroupingSets,
JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame,
WindowFrameBound,
WindowFrameUnits,
@@ -521,7 +520,6 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::Right => Self::Right,
ScalarFunction::Rpad => Self::Rpad,
ScalarFunction::SplitPart => Self::SplitPart,
- ScalarFunction::StringToArray => Self::StringToArray,
ScalarFunction::StartsWith => Self::StartsWith,
ScalarFunction::Strpos => Self::Strpos,
ScalarFunction::Substr => Self::Substr,
@@ -1742,11 +1740,6 @@ pub fn parse_expr(
ScalarFunction::ArrowTypeof => {
Ok(arrow_typeof(parse_expr(&args[0], registry, codec)?))
}
- ScalarFunction::StringToArray => Ok(string_to_array(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- parse_expr(&args[2], registry, codec)?,
- )),
ScalarFunction::OverLay => Ok(overlay(
args.to_owned()
.iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs
b/datafusion/proto/src/logical_plan/to_proto.rs
index 393cc78267..a9867b8f03 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1502,7 +1502,6 @@ impl TryFrom<&BuiltinScalarFunction> for
protobuf::ScalarFunction {
BuiltinScalarFunction::Right => Self::Right,
BuiltinScalarFunction::Rpad => Self::Rpad,
BuiltinScalarFunction::SplitPart => Self::SplitPart,
- BuiltinScalarFunction::StringToArray => Self::StringToArray,
BuiltinScalarFunction::StartsWith => Self::StartsWith,
BuiltinScalarFunction::Strpos => Self::Strpos,
BuiltinScalarFunction::Substr => Self::Substr,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 76402604ac..3899f64a37 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -586,6 +586,7 @@ async fn roundtrip_expr_api() -> Result<()> {
array_dims(make_array(vec![lit(1), lit(2), lit(3)])),
array_ndims(make_array(vec![lit(1), lit(2), lit(3)])),
cardinality(make_array(vec![lit(1), lit(2), lit(3)])),
+ string_to_array(lit("abc#def#ghl"), lit("#"), lit(",")),
range(lit(1), lit(10), lit(2)),
gen_series(lit(1), lit(10), lit(2)),
array_append(make_array(vec![lit(1), lit(2), lit(3)]), lit(4)),
diff --git a/docs/source/user-guide/expressions.md
b/docs/source/user-guide/expressions.md
index dcb599b9b3..17da8c3fc2 100644
--- a/docs/source/user-guide/expressions.md
+++ b/docs/source/user-guide/expressions.md
@@ -207,42 +207,43 @@ select log(-1), log(0), sqrt(-1);
## Array Expressions
-| Syntax | Description
|
-| -------------------------------------- |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
-| array_append(array, element) | Appends an element to the end of an
array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]`
|
-| array_concat(array[, ..., array_n]) | Concatenates arrays.
`array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]`
|
-| array_has(array, element) | Returns true if the array contains
the element `array_has([1,2,3], 1) -> true`
|
-| array_has_all(array, sub-array) | Returns true if all elements of
sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true`
|
-| array_has_any(array, sub-array) | Returns true if any elements exist
in both arrays `array_has_any([1,2,3], [1,4]) -> true`
|
-| array_dims(array) | Returns an array of the array's
dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]`
|
-| array_distinct(array) | Returns distinct values from the
array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2, 4]) -> [1,
2, 3, 4]` |
-| array_element(array, index) | Extracts the element with the index
n from the array `array_element([1, 2, 3, 4], 3) -> 3`
|
-| flatten(array) | Converts an array of arrays to a
flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]`
|
-| array_length(array, dimension) | Returns the length of the array
dimension. `array_length([1, 2, 3, 4, 5]) -> 5`
|
-| array_ndims(array) | Returns the number of dimensions of
the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2`
|
-| array_pop_front(array) | Returns the array without the first
element. `array_pop_front([1, 2, 3]) -> [2, 3]`
|
-| array_pop_back(array) | Returns the array without the last
element. `array_pop_back([1, 2, 3]) -> [1, 2]`
|
-| array_position(array, element) | Searches for an element in the
array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2`
|
-| array_positions(array, element) | Searches for an element in the
array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2, 3]`
|
-| array_prepend(array, element) | Prepends an element to the
beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]`
|
-| array_repeat(element, count) | Returns an array containing element
`count` times. `array_repeat(1, 3) -> [1, 1, 1]`
|
-| array_remove(array, element) | Removes the first element from the
array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4], 2) -> [1,
2, 3, 2, 1, 4]` |
-| array_remove_n(array, element, max) | Removes the first `max` elements
from the array equal to the given value. `array_remove_n([1, 2, 2, 3, 2, 1, 4],
2, 2) -> [1, 3, 2, 1, 4]` |
-| array_remove_all(array, element) | Removes all elements from the array
equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2) -> [1, 3,
1, 4]` |
-| array_replace(array, from, to) | Replaces the first occurrence of
the specified element with another specified element. `array_replace([1, 2, 2,
3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]` |
-| array_replace_n(array, from, to, max) | Replaces the first `max`
occurrences of the specified element with another specified element.
`array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]` |
-| array_replace_all(array, from, to) | Replaces all occurrences of the
specified element with another specified element. `array_replace_all([1, 2, 2,
3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]` |
-| array_slice(array, begin,end) | Returns a slice of the array.
`array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]`
|
-| array_slice(array, begin, end, stride) | Returns a slice of the array with
added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6, 2) -> [3, 5,
6]` |
-| array_to_string(array, delimiter) | Converts each element to its text
representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4`
|
-| array_intersect(array1, array2) | Returns an array of the elements in
the intersection of array1 and array2. `array_intersect([1, 2, 3, 4], [5, 6, 3,
4]) -> [3, 4]` |
-| array_union(array1, array2) | Returns an array of the elements in
the union of array1 and array2 without duplicates. `array_union([1, 2, 3, 4],
[5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]` |
-| array_except(array1, array2) | Returns an array of the elements
that appear in the first array but not in the second. `array_except([1, 2, 3,
4], [5, 6, 3, 4]) -> [3, 4]` |
-| array_resize(array, size, value) | Resizes the list to contain size
elements. Initializes new elements with value or empty if value is not set.
`array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 4, 5, 6]` |
-| cardinality(array) | Returns the total number of
elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6`
|
-| make_array(value1, [value2 [, ...]]) | Returns an Arrow array using the
specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]`
|
-| range(start [, stop, step]) | Returns an Arrow array between
start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]`
|
-| trim_array(array, n) | Deprecated
|
+| Syntax | Description
|
+| ---------------------------------------------- |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
+| array_append(array, element) | Appends an element to the
end of an array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]`
|
+| array_concat(array[, ..., array_n]) | Concatenates arrays.
`array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]`
|
+| array_has(array, element) | Returns true if the array
contains the element `array_has([1,2,3], 1) -> true`
|
+| array_has_all(array, sub-array) | Returns true if all
elements of sub-array exist in array `array_has_all([1,2,3], [1,3]) -> true`
|
+| array_has_any(array, sub-array) | Returns true if any
elements exist in both arrays `array_has_any([1,2,3], [1,4]) -> true`
|
+| array_dims(array) | Returns an array of the
array's dimensions. `array_dims([[1, 2, 3], [4, 5, 6]]) -> [2, 3]`
|
+| array_distinct(array) | Returns distinct values
from the array after removing duplicates. `array_distinct([1, 3, 2, 3, 1, 2,
4]) -> [1, 2, 3, 4]`
|
+| array_element(array, index) | Extracts the element with
the index n from the array `array_element([1, 2, 3, 4], 3) -> 3`
|
+| flatten(array) | Converts an array of arrays
to a flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]`
|
+| array_length(array, dimension) | Returns the length of the
array dimension. `array_length([1, 2, 3, 4, 5]) -> 5`
|
+| array_ndims(array) | Returns the number of
dimensions of the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2`
|
+| array_pop_front(array) | Returns the array without
the first element. `array_pop_front([1, 2, 3]) -> [2, 3]`
|
+| array_pop_back(array) | Returns the array without
the last element. `array_pop_back([1, 2, 3]) -> [1, 2]`
|
+| array_position(array, element) | Searches for an element in
the array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2`
|
+| array_positions(array, element) | Searches for an element in
the array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2,
3]`
|
+| array_prepend(array, element) | Prepends an element to the
beginning of an array. `array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]`
|
+| array_repeat(element, count) | Returns an array containing
element `count` times. `array_repeat(1, 3) -> [1, 1, 1]`
|
+| array_remove(array, element) | Removes the first element
from the array equal to the given value. `array_remove([1, 2, 2, 3, 2, 1, 4],
2) -> [1, 2, 3, 2, 1, 4]`
|
+| array_remove_n(array, element, max) | Removes the first `max`
elements from the array equal to the given value. `array_remove_n([1, 2, 2, 3,
2, 1, 4], 2, 2) -> [1, 3, 2, 1, 4]`
|
+| array_remove_all(array, element) | Removes all elements from
the array equal to the given value. `array_remove_all([1, 2, 2, 3, 2, 1, 4], 2)
-> [1, 3, 1, 4]`
|
+| array_replace(array, from, to) | Replaces the first
occurrence of the specified element with another specified element.
`array_replace([1, 2, 2, 3, 2, 1, 4], 2, 5) -> [1, 5, 2, 3, 2, 1, 4]`
|
+| array_replace_n(array, from, to, max) | Replaces the first `max`
occurrences of the specified element with another specified element.
`array_replace_n([1, 2, 2, 3, 2, 1, 4], 2, 5, 2) -> [1, 5, 5, 3, 2, 1, 4]`
|
+| array_replace_all(array, from, to) | Replaces all occurrences of
the specified element with another specified element. `array_replace_all([1, 2,
2, 3, 2, 1, 4], 2, 5) -> [1, 5, 5, 3, 5, 1, 4]`
|
+| array_slice(array, begin,end) | Returns a slice of the
array. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6) -> [3, 4, 5, 6]`
|
+| array_slice(array, begin, end, stride) | Returns a slice of the
array with added stride feature. `array_slice([1, 2, 3, 4, 5, 6, 7, 8], 3, 6,
2) -> [3, 5, 6]`
|
+| array_to_string(array, delimiter) | Converts each element to
its text representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4`
|
+| array_intersect(array1, array2) | Returns an array of the
elements in the intersection of array1 and array2. `array_intersect([1, 2, 3,
4], [5, 6, 3, 4]) -> [3, 4]`
|
+| array_union(array1, array2) | Returns an array of the
elements in the union of array1 and array2 without duplicates. `array_union([1,
2, 3, 4], [5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]`
|
+| array_except(array1, array2) | Returns an array of the
elements that appear in the first array but not in the second.
`array_except([1, 2, 3, 4], [5, 6, 3, 4]) -> [3, 4]`
|
+| array_resize(array, size, value) | Resizes the list to contain
size elements. Initializes new elements with value or empty if value is not
set. `array_resize([1, 2, 3], 5, 0) -> [1, 2, 3, 4, 5, 6]`
|
+| cardinality(array) | Returns the total number of
elements in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6`
|
+| make_array(value1, [value2 [, ...]]) | Returns an Arrow array
using the specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]`
|
+| range(start [, stop, step]) | Returns an Arrow array
between start and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]`
|
+| string_to_array(string, delimiter, null_string) | Splits a `string` based on
a `delimiter` and returns an array of parts. Any parts matching the optional
`null_string` will be replaced with `NULL`. `string_to_array('abc#def#ghi',
'#', ' ') -> ['abc', 'def', 'ghi']` |
+| trim_array(array, n) | Deprecated
|
## Regular Expressions
diff --git a/docs/source/user-guide/sql/scalar_functions.md
b/docs/source/user-guide/sql/scalar_functions.md
index b0385b4923..7496039116 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -3113,6 +3113,7 @@ _Alias of [make_array](#make_array)._
### `string_to_array`
Splits a string in to an array of substrings based on a delimiter. Any
substrings matching the optional `null_str` argument are replaced with NULL.
+`SELECT string_to_array('abc##def', '##')` or `SELECT string_to_array('abc
def', ' ', 'def')`
```
starts_with(str, delimiter[, null_str])