This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 1e4ddb6d86 Move repeat, replace, split_part to datafusion_functions
(#9784)
1e4ddb6d86 is described below
commit 1e4ddb6d86328cb6596bb50da9ccc654f19a83ea
Author: Bruce Ritchie <[email protected]>
AuthorDate: Sun Mar 24 14:28:53 2024 -0400
Move repeat, replace, split_part to datafusion_functions (#9784)
* Fix to_timestamp benchmark
* Remove reference to simd and nightly build as simd is no longer an
available feature in DataFusion and building with nightly may not be a good
recommendation when getting started.
* Fixed missing trim() function.
* Move repeat, replace, split_part to datafusion_functions
---
datafusion/expr/src/built_in_function.rs | 44 +-----
datafusion/expr/src/expr_fn.rs | 6 -
datafusion/functions/src/string/mod.rs | 24 +++
datafusion/functions/src/string/repeat.rs | 144 +++++++++++++++++
datafusion/functions/src/string/replace.rs | 97 ++++++++++++
datafusion/functions/src/string/split_part.rs | 170 +++++++++++++++++++++
datafusion/physical-expr/src/functions.rs | 122 +++------------
datafusion/physical-expr/src/string_expressions.rs | 67 --------
datafusion/proto/proto/datafusion.proto | 6 +-
datafusion/proto/src/generated/pbjson.rs | 9 --
datafusion/proto/src/generated/prost.rs | 12 +-
datafusion/proto/src/logical_plan/from_proto.rs | 26 +---
datafusion/proto/src/logical_plan/to_proto.rs | 3 -
13 files changed, 469 insertions(+), 261 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index 1904d58cfc..b3f17ae3c2 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -123,18 +123,12 @@ pub enum BuiltinScalarFunction {
Lpad,
/// random
Random,
- /// repeat
- Repeat,
- /// replace
- Replace,
/// reverse
Reverse,
/// right
Right,
/// rpad
Rpad,
- /// split_part
- SplitPart,
/// strpos
Strpos,
/// substr
@@ -238,12 +232,9 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Left => Volatility::Immutable,
BuiltinScalarFunction::Lpad => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
- BuiltinScalarFunction::Repeat => Volatility::Immutable,
- BuiltinScalarFunction::Replace => Volatility::Immutable,
BuiltinScalarFunction::Reverse => Volatility::Immutable,
BuiltinScalarFunction::Right => Volatility::Immutable,
BuiltinScalarFunction::Rpad => Volatility::Immutable,
- BuiltinScalarFunction::SplitPart => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
BuiltinScalarFunction::Translate => Volatility::Immutable,
@@ -293,12 +284,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Lpad =>
utf8_to_str_type(&input_expr_types[0], "lpad"),
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
- BuiltinScalarFunction::Repeat => {
- utf8_to_str_type(&input_expr_types[0], "repeat")
- }
- BuiltinScalarFunction::Replace => {
- utf8_to_str_type(&input_expr_types[0], "replace")
- }
BuiltinScalarFunction::Reverse => {
utf8_to_str_type(&input_expr_types[0], "reverse")
}
@@ -306,9 +291,6 @@ impl BuiltinScalarFunction {
utf8_to_str_type(&input_expr_types[0], "right")
}
BuiltinScalarFunction::Rpad =>
utf8_to_str_type(&input_expr_types[0], "rpad"),
- BuiltinScalarFunction::SplitPart => {
- utf8_to_str_type(&input_expr_types[0], "split_part")
- }
BuiltinScalarFunction::EndsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
@@ -417,21 +399,12 @@ impl BuiltinScalarFunction {
self.volatility(),
)
}
- BuiltinScalarFunction::Left
- | BuiltinScalarFunction::Repeat
- | BuiltinScalarFunction::Right => Signature::one_of(
- vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
- self.volatility(),
- ),
- BuiltinScalarFunction::SplitPart => Signature::one_of(
- vec![
- Exact(vec![Utf8, Utf8, Int64]),
- Exact(vec![LargeUtf8, Utf8, Int64]),
- Exact(vec![Utf8, LargeUtf8, Int64]),
- Exact(vec![LargeUtf8, LargeUtf8, Int64]),
- ],
- self.volatility(),
- ),
+ BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => {
+ Signature::one_of(
+ vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8,
Int64])],
+ self.volatility(),
+ )
+ }
BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos =>
{
Signature::one_of(
@@ -467,7 +440,7 @@ impl BuiltinScalarFunction {
self.volatility(),
),
- BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate
=> {
+ BuiltinScalarFunction::Translate => {
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])],
self.volatility())
}
BuiltinScalarFunction::Pi => Signature::exact(vec![],
self.volatility()),
@@ -637,12 +610,9 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::InitCap => &["initcap"],
BuiltinScalarFunction::Left => &["left"],
BuiltinScalarFunction::Lpad => &["lpad"],
- BuiltinScalarFunction::Repeat => &["repeat"],
- BuiltinScalarFunction::Replace => &["replace"],
BuiltinScalarFunction::Reverse => &["reverse"],
BuiltinScalarFunction::Right => &["right"],
BuiltinScalarFunction::Rpad => &["rpad"],
- BuiltinScalarFunction::SplitPart => &["split_part"],
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
BuiltinScalarFunction::Substr => &["substr"],
BuiltinScalarFunction::Translate => &["translate"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 60db21e5f5..f75d886967 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -598,11 +598,8 @@ scalar_expr!(
);
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word
in `string` in uppercase and the remaining characters in lowercase");
scalar_expr!(Left, left, string n, "returns the first `n` characters in the
`string`");
-scalar_expr!(Replace, replace, string from to, "replaces all occurrences of
`from` with `to` in the `string`");
-scalar_expr!(Repeat, repeat, string n, "repeats the `string` to `n` times");
scalar_expr!(Reverse, reverse, string, "reverses the `string`");
scalar_expr!(Right, right, string n, "returns the last `n` characters in the
`string`");
-scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string
based on a delimiter and picks out the desired field based on the index.");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends
with the `suffix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where
the `substring` matches the `string`");
scalar_expr!(Substr, substr, string position, "substring from the `position`
to the end");
@@ -1056,13 +1053,10 @@ mod test {
test_scalar_expr!(Left, left, string, count);
test_nary_scalar_expr!(Lpad, lpad, string, count);
test_nary_scalar_expr!(Lpad, lpad, string, count, characters);
- test_scalar_expr!(Replace, replace, string, from, to);
- test_scalar_expr!(Repeat, repeat, string, count);
test_scalar_expr!(Reverse, reverse, string);
test_scalar_expr!(Right, right, string, count);
test_nary_scalar_expr!(Rpad, rpad, string, count);
test_nary_scalar_expr!(Rpad, rpad, string, count, characters);
- test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
test_scalar_expr!(EndsWith, ends_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
test_scalar_expr!(Substr, substr, string, position);
diff --git a/datafusion/functions/src/string/mod.rs
b/datafusion/functions/src/string/mod.rs
index 165a7c6604..d2b9fb2da8 100644
--- a/datafusion/functions/src/string/mod.rs
+++ b/datafusion/functions/src/string/mod.rs
@@ -29,7 +29,10 @@ mod lower;
mod ltrim;
mod octet_length;
mod overlay;
+mod repeat;
+mod replace;
mod rtrim;
+mod split_part;
mod starts_with;
mod to_hex;
mod upper;
@@ -43,8 +46,11 @@ make_udf_function!(ltrim::LtrimFunc, LTRIM, ltrim);
make_udf_function!(lower::LowerFunc, LOWER, lower);
make_udf_function!(octet_length::OctetLengthFunc, OCTET_LENGTH, octet_length);
make_udf_function!(overlay::OverlayFunc, OVERLAY, overlay);
+make_udf_function!(repeat::RepeatFunc, REPEAT, repeat);
+make_udf_function!(replace::ReplaceFunc, REPLACE, replace);
make_udf_function!(rtrim::RtrimFunc, RTRIM, rtrim);
make_udf_function!(starts_with::StartsWithFunc, STARTS_WITH, starts_with);
+make_udf_function!(split_part::SplitPartFunc, SPLIT_PART, split_part);
make_udf_function!(to_hex::ToHexFunc, TO_HEX, to_hex);
make_udf_function!(upper::UpperFunc, UPPER, upper);
make_udf_function!(uuid::UuidFunc, UUID, uuid);
@@ -87,11 +93,26 @@ pub mod expr_fn {
super::overlay().call(args)
}
+ #[doc = "Repeats the `string` to `n` times"]
+ pub fn repeat(string: Expr, n: Expr) -> Expr {
+ super::repeat().call(vec![string, n])
+ }
+
+ #[doc = "Replaces all occurrences of `from` with `to` in the `string`"]
+ pub fn replace(string: Expr, from: Expr, to: Expr) -> Expr {
+ super::replace().call(vec![string, from, to])
+ }
+
#[doc = "Removes all characters, spaces by default, from the end of a
string"]
pub fn rtrim(args: Vec<Expr>) -> Expr {
super::rtrim().call(args)
}
+ #[doc = "Splits a string based on a delimiter and picks out the desired
field based on the index."]
+ pub fn split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr {
+ super::split_part().call(vec![string, delimiter, index])
+ }
+
#[doc = "Returns true if string starts with prefix."]
pub fn starts_with(arg1: Expr, arg2: Expr) -> Expr {
super::starts_with().call(vec![arg1, arg2])
@@ -128,7 +149,10 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
ltrim(),
octet_length(),
overlay(),
+ repeat(),
+ replace(),
rtrim(),
+ split_part(),
starts_with(),
to_hex(),
upper(),
diff --git a/datafusion/functions/src/string/repeat.rs
b/datafusion/functions/src/string/repeat.rs
new file mode 100644
index 0000000000..83bc929cb9
--- /dev/null
+++ b/datafusion/functions/src/string/repeat.rs
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::*;
+use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ScalarUDFImpl, Signature};
+
+use crate::string::common::*;
+
+/// Scalar UDF implementation of `repeat`; `signature` stores the argument
+/// types the function accepts.
+#[derive(Debug)]
+pub(super) struct RepeatFunc {
+ signature: Signature,
+}
+
+impl RepeatFunc {
+    /// Creates the UDF. It accepts a `Utf8` or `LargeUtf8` string followed by
+    /// an `Int64` count and is declared with immutable volatility.
+    pub fn new() -> Self {
+        use DataType::*;
+        let accepted = vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])];
+        let signature = Signature::one_of(accepted, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for RepeatFunc {
+    /// Returns `self` as `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// The function's name.
+    fn name(&self) -> &str {
+        "repeat"
+    }
+
+    /// The accepted argument types.
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The return type is derived from the type of the first (string) argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let string_type = &arg_types[0];
+        utf8_to_str_type(string_type, "repeat")
+    }
+
+    /// Picks the `i32`- or `i64`-offset kernel from the type of the first
+    /// argument; any other type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let string_type = args[0].data_type();
+        match string_type {
+            DataType::Utf8 => make_scalar_function(repeat::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(repeat::<i64>, vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function repeat"),
+        }
+    }
+}
+
+/// Repeats string the specified number of times.
+/// repeat('Pg', 4) = 'PgPgPgPg'
+///
+/// `args[0]` is the string array and `args[1]` the `Int64` array of counts.
+/// A null in either input yields a null row; a zero or negative count yields
+/// an empty string.
+fn repeat<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+    let number_array = as_int64_array(&args[1])?;
+
+    let result = string_array
+        .iter()
+        .zip(number_array.iter())
+        .map(|(string, number)| match (string, number) {
+            (Some(string), Some(number)) => {
+                // A bare `number as usize` maps a negative count to a huge
+                // value and makes `str::repeat` panic on capacity overflow.
+                // Negative counts are clamped to zero instead; a count that
+                // does not fit in `usize` saturates, so oversized requests
+                // still fail exactly as before.
+                let count = usize::try_from(number.max(0)).unwrap_or(usize::MAX);
+                Some(string.repeat(count))
+            }
+            _ => None,
+        })
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::Result;
+ use datafusion_common::ScalarValue;
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::string::common::test::test_function;
+ use crate::string::repeat::RepeatFunc;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ // Non-null string and count: 'Pg' repeated 4 times.
+ test_function!(
+ RepeatFunc::new(),
+ &[
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(4))),
+ ],
+ Ok(Some("PgPgPgPg")),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ // A null string yields a null result.
+ test_function!(
+ RepeatFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(4))),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // A null count yields a null result.
+ test_function!(
+ RepeatFunc::new(),
+ &[
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/functions/src/string/replace.rs
b/datafusion/functions/src/string/replace.rs
new file mode 100644
index 0000000000..e352442960
--- /dev/null
+++ b/datafusion/functions/src/string/replace.rs
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::*;
+use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ScalarUDFImpl, Signature};
+
+use crate::string::common::*;
+
+/// Scalar UDF implementation of `replace`; `signature` stores the argument
+/// types the function accepts.
+#[derive(Debug)]
+pub(super) struct ReplaceFunc {
+ signature: Signature,
+}
+
+impl ReplaceFunc {
+    /// Creates the UDF. The only declared argument list is three `Utf8`
+    /// values (string, from, to), with immutable volatility.
+    pub fn new() -> Self {
+        use DataType::*;
+        let accepted = vec![Exact(vec![Utf8, Utf8, Utf8])];
+        let signature = Signature::one_of(accepted, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for ReplaceFunc {
+    /// Returns `self` as `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// The function's name.
+    fn name(&self) -> &str {
+        "replace"
+    }
+
+    /// The accepted argument types.
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The return type is derived from the type of the first (string) argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let string_type = &arg_types[0];
+        utf8_to_str_type(string_type, "replace")
+    }
+
+    /// Picks the `i32`- or `i64`-offset kernel from the type of the first
+    /// argument; any other type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let string_type = args[0].data_type();
+        match string_type {
+            DataType::Utf8 => make_scalar_function(replace::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(replace::<i64>, vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function replace"),
+        }
+    }
+}
+
+/// Substitutes every occurrence of `from` inside `string` with `to`, row by row.
+/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef'
+///
+/// `args` holds the string, `from` and `to` arrays in that order. A row with a
+/// null in any of the three inputs produces a null result.
+fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let haystacks = as_generic_string_array::<T>(&args[0])?;
+    let patterns = as_generic_string_array::<T>(&args[1])?;
+    let substitutes = as_generic_string_array::<T>(&args[2])?;
+
+    let replaced: GenericStringArray<T> = haystacks
+        .iter()
+        .zip(patterns.iter())
+        .zip(substitutes.iter())
+        .map(|((haystack, pattern), substitute)| {
+            // `?` turns the row into `None` (null) as soon as one input is null.
+            Some(haystack?.replace(pattern?, substitute?))
+        })
+        .collect();
+
+    Ok(Arc::new(replaced) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::Result;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::string::common::test::test_function;
+    use crate::string::replace::ReplaceFunc;
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        // Every occurrence of `from` is replaced by `to`.
+        test_function!(
+            ReplaceFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("abcdefabcdef")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("cd")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("XX")))),
+            ],
+            Ok(Some("abXXefabXXef")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // A null string yields a null result.
+        test_function!(
+            ReplaceFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("cd")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("XX")))),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions/src/string/split_part.rs
b/datafusion/functions/src/string/split_part.rs
new file mode 100644
index 0000000000..af201e90fc
--- /dev/null
+++ b/datafusion/functions/src/string/split_part.rs
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::*;
+use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ScalarUDFImpl, Signature};
+
+use crate::string::common::*;
+
+/// Scalar UDF implementation of `split_part`; `signature` stores the argument
+/// types the function accepts.
+#[derive(Debug)]
+pub(super) struct SplitPartFunc {
+ signature: Signature,
+}
+
+impl SplitPartFunc {
+    /// Creates the UDF. It declares every `Utf8`/`LargeUtf8` combination for
+    /// the string and delimiter, followed by an `Int64` field index, with
+    /// immutable volatility.
+    pub fn new() -> Self {
+        use DataType::*;
+        let accepted = vec![
+            Exact(vec![Utf8, Utf8, Int64]),
+            Exact(vec![LargeUtf8, Utf8, Int64]),
+            Exact(vec![Utf8, LargeUtf8, Int64]),
+            Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+        ];
+        let signature = Signature::one_of(accepted, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for SplitPartFunc {
+    /// Returns `self` as `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// The function's name.
+    fn name(&self) -> &str {
+        "split_part"
+    }
+
+    /// The accepted argument types.
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The return type is derived from the type of the first (string) argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let string_type = &arg_types[0];
+        utf8_to_str_type(string_type, "split_part")
+    }
+
+    /// Picks the `i32`- or `i64`-offset kernel from the type of the first
+    /// argument; any other type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let string_type = args[0].data_type();
+        match string_type {
+            DataType::Utf8 => make_scalar_function(split_part::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(split_part::<i64>, vec![])(args),
+            other => exec_err!("Unsupported data type {other:?} for function split_part"),
+        }
+    }
+}
+
+/// Splits string at occurrences of delimiter and returns the n'th field (counting from one).
+/// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def'
+///
+/// `args` holds the string, delimiter and `Int64` field-index arrays in that
+/// order. A null in any input yields a null row, and an index past the last
+/// field yields an empty string.
+///
+/// # Errors
+///
+/// Returns an execution error ("field position must be greater than zero")
+/// when an index is zero or negative.
+fn split_part<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    // NOTE(review): string and delimiter are both downcast with the same
+    // offset type `T`, while the signature built in `SplitPartFunc::new` also
+    // lists mixed Utf8/LargeUtf8 pairs - confirm how those reach this kernel.
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+    let delimiter_array = as_generic_string_array::<T>(&args[1])?;
+    let n_array = as_int64_array(&args[2])?;
+    let result = string_array
+        .iter()
+        .zip(delimiter_array.iter())
+        .zip(n_array.iter())
+        .map(|((string, delimiter), n)| match (string, delimiter, n) {
+            (Some(string), Some(delimiter), Some(n)) => {
+                if n <= 0 {
+                    exec_err!("field position must be greater than zero")
+                } else {
+                    // Convert the 1-based position to a 0-based index; a value
+                    // that does not fit in `usize` is past the end anyway.
+                    let index = usize::try_from(n - 1).unwrap_or(usize::MAX);
+                    // Walk the split lazily instead of collecting every field
+                    // into a `Vec` just to read one of them.
+                    Ok(Some(string.split(delimiter).nth(index).unwrap_or("")))
+                }
+            }
+            _ => Ok(None),
+        })
+        .collect::<Result<GenericStringArray<T>>>()?;
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::ScalarValue;
+ use datafusion_common::{exec_err, Result};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::string::common::test::test_function;
+ use crate::string::split_part::SplitPartFunc;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ // Field 2 of three '~@~'-separated fields is 'def'.
+ test_function!(
+ SplitPartFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(
+ "abc~@~def~@~ghi"
+ )))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+ ],
+ Ok(Some("def")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // An index past the last field yields an empty string.
+ test_function!(
+ SplitPartFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(
+ "abc~@~def~@~ghi"
+ )))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(20))),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // A negative index is rejected with an execution error.
+ test_function!(
+ SplitPartFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(
+ "abc~@~def~@~ghi"
+ )))),
+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))),
+ ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+ ],
+ exec_err!("field position must be greater than zero"),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index 8759adc89b..163598c2df 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -30,17 +30,16 @@
//! an argument i32 is passed to a function that supports f64, the
//! argument is automatically is coerced to f64.
-use crate::sort_properties::SortProperties;
-use crate::{
- conditional_expressions, math_expressions, string_expressions,
PhysicalExpr,
- ScalarFunctionExpr,
-};
+use std::ops::Neg;
+use std::sync::Arc;
+
use arrow::{
array::ArrayRef,
compute::kernels::length::bit_length,
datatypes::{DataType, Int32Type, Int64Type, Schema},
};
use arrow_array::Array;
+
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::execution_props::ExecutionProps;
pub use datafusion_expr::FuncMonotonicity;
@@ -49,8 +48,12 @@ use datafusion_expr::{
type_coercion::functions::data_types, BuiltinScalarFunction, ColumnarValue,
ScalarFunctionImplementation,
};
-use std::ops::Neg;
-use std::sync::Arc;
+
+use crate::sort_properties::SortProperties;
+use crate::{
+ conditional_expressions, math_expressions, string_expressions,
PhysicalExpr,
+ ScalarFunctionExpr,
+};
/// Create a physical (function) expression.
/// This function errors when `args`' can't be coerced to a valid argument
type of the function.
@@ -328,26 +331,6 @@ pub fn create_physical_fun(
}
other => exec_err!("Unsupported data type {other:?} for function
lpad"),
}),
- BuiltinScalarFunction::Repeat => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
-
make_scalar_function_inner(string_expressions::repeat::<i32>)(args)
- }
- DataType::LargeUtf8 => {
-
make_scalar_function_inner(string_expressions::repeat::<i64>)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
repeat"),
- }),
- BuiltinScalarFunction::Replace => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
-
make_scalar_function_inner(string_expressions::replace::<i32>)(args)
- }
- DataType::LargeUtf8 => {
-
make_scalar_function_inner(string_expressions::replace::<i64>)(args)
- }
- other => {
- exec_err!("Unsupported data type {other:?} for function
replace")
- }
- }),
BuiltinScalarFunction::Reverse => Arc::new(|args| match
args[0].data_type() {
DataType::Utf8 => {
let func =
@@ -387,17 +370,6 @@ pub fn create_physical_fun(
}
other => exec_err!("Unsupported data type {other:?} for function
rpad"),
}),
- BuiltinScalarFunction::SplitPart => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
-
make_scalar_function_inner(string_expressions::split_part::<i32>)(args)
- }
- DataType::LargeUtf8 => {
-
make_scalar_function_inner(string_expressions::split_part::<i64>)(args)
- }
- other => {
- exec_err!("Unsupported data type {other:?} for function
split_part")
- }
- }),
BuiltinScalarFunction::EndsWith => Arc::new(|args| match
args[0].data_type() {
DataType::Utf8 => {
make_scalar_function_inner(string_expressions::ends_with::<i32>)(args)
@@ -568,9 +540,6 @@ fn func_order_in_one_dimension(
#[cfg(test)]
mod tests {
- use super::*;
- use crate::expressions::lit;
- use crate::expressions::try_cast;
use arrow::{
array::{
Array, ArrayRef, BooleanArray, Float32Array, Float64Array,
Int32Array,
@@ -579,12 +548,18 @@ mod tests {
datatypes::Field,
record_batch::RecordBatch,
};
+
use datafusion_common::cast::as_uint64_array;
use datafusion_common::{exec_err, internal_err, plan_err};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::type_coercion::functions::data_types;
use datafusion_expr::Signature;
+ use crate::expressions::lit;
+ use crate::expressions::try_cast;
+
+ use super::*;
+
/// $FUNC function to test
/// $ARGS arguments (vec) to pass to function
/// $EXPECTED a Result<Option<$EXPECTED_TYPE>> where Result allows testing
errors and Option allows testing Null
@@ -1124,33 +1099,6 @@ mod tests {
Utf8,
StringArray
);
- test_function!(
- Repeat,
- &[lit("Pg"), lit(ScalarValue::Int64(Some(4))),],
- Ok(Some("PgPgPgPg")),
- &str,
- Utf8,
- StringArray
- );
- test_function!(
- Repeat,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(4))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- test_function!(
- Repeat,
- &[lit("Pg"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
#[cfg(feature = "unicode_expressions")]
test_function!(
Reverse,
@@ -1447,42 +1395,6 @@ mod tests {
Utf8,
StringArray
);
- test_function!(
- SplitPart,
- &[
- lit("abc~@~def~@~ghi"),
- lit("~@~"),
- lit(ScalarValue::Int64(Some(2))),
- ],
- Ok(Some("def")),
- &str,
- Utf8,
- StringArray
- );
- test_function!(
- SplitPart,
- &[
- lit("abc~@~def~@~ghi"),
- lit("~@~"),
- lit(ScalarValue::Int64(Some(20))),
- ],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- test_function!(
- SplitPart,
- &[
- lit("abc~@~def~@~ghi"),
- lit("~@~"),
- lit(ScalarValue::Int64(Some(-1))),
- ],
- exec_err!("field position must be greater than zero"),
- &str,
- Utf8,
- StringArray
- );
test_function!(
EndsWith,
&[lit("alphabet"), lit("alph"),],
@@ -1812,7 +1724,7 @@ mod tests {
let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);
// pick some arbitrary functions to test
- let funs = [BuiltinScalarFunction::Concat,
BuiltinScalarFunction::Repeat];
+ let funs = [BuiltinScalarFunction::Concat];
for fun in funs.iter() {
let expr = create_physical_expr_with_type_coercion(
diff --git a/datafusion/physical-expr/src/string_expressions.rs
b/datafusion/physical-expr/src/string_expressions.rs
index 766e167a94..812b746354 100644
--- a/datafusion/physical-expr/src/string_expressions.rs
+++ b/datafusion/physical-expr/src/string_expressions.rs
@@ -242,73 +242,6 @@ pub fn instr<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
}
}
-/// Repeats string the specified number of times.
-/// repeat('Pg', 4) = 'PgPgPgPg'
-pub fn repeat<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let number_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(number_array.iter())
- .map(|(string, number)| match (string, number) {
- (Some(string), Some(number)) => Some(string.repeat(number as
usize)),
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Replaces all occurrences in string of substring from with substring to.
-/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef'
-pub fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let from_array = as_generic_string_array::<T>(&args[1])?;
- let to_array = as_generic_string_array::<T>(&args[2])?;
-
- let result = string_array
- .iter()
- .zip(from_array.iter())
- .zip(to_array.iter())
- .map(|((string, from), to)| match (string, from, to) {
- (Some(string), Some(from), Some(to)) => Some(string.replace(from,
to)),
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Splits string at occurrences of delimiter and returns the n'th field
(counting from one).
-/// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def'
-pub fn split_part<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let delimiter_array = as_generic_string_array::<T>(&args[1])?;
- let n_array = as_int64_array(&args[2])?;
- let result = string_array
- .iter()
- .zip(delimiter_array.iter())
- .zip(n_array.iter())
- .map(|((string, delimiter), n)| match (string, delimiter, n) {
- (Some(string), Some(delimiter), Some(n)) => {
- if n <= 0 {
- exec_err!("field position must be greater than zero")
- } else {
- let split_string: Vec<&str> =
string.split(delimiter).collect();
- match split_string.get(n as usize - 1) {
- Some(s) => Ok(Some(*s)),
- None => Ok(Some("")),
- }
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
/// Returns true if string starts with prefix.
/// starts_with('alphabet', 'alph') = 't'
pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index 795995ce2c..297e355dd7 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -581,8 +581,8 @@ enum ScalarFunction {
// 37 was OctetLength
Random = 38;
// 39 was RegexpReplace
- Repeat = 40;
- Replace = 41;
+ // 40 was Repeat
+ // 41 was Replace
Reverse = 42;
Right = 43;
Rpad = 44;
@@ -591,7 +591,7 @@ enum ScalarFunction {
// 47 was SHA256
// 48 was SHA384
// 49 was SHA512
- SplitPart = 50;
+ // 50 was SplitPart
// StartsWith = 51;
Strpos = 52;
Substr = 53;
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index 3941171e4f..dce815f0f2 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22937,12 +22937,9 @@ impl serde::Serialize for ScalarFunction {
Self::Left => "Left",
Self::Lpad => "Lpad",
Self::Random => "Random",
- Self::Repeat => "Repeat",
- Self::Replace => "Replace",
Self::Reverse => "Reverse",
Self::Right => "Right",
Self::Rpad => "Rpad",
- Self::SplitPart => "SplitPart",
Self::Strpos => "Strpos",
Self::Substr => "Substr",
Self::Translate => "Translate",
@@ -23002,12 +22999,9 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Left",
"Lpad",
"Random",
- "Repeat",
- "Replace",
"Reverse",
"Right",
"Rpad",
- "SplitPart",
"Strpos",
"Substr",
"Translate",
@@ -23096,12 +23090,9 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Left" => Ok(ScalarFunction::Left),
"Lpad" => Ok(ScalarFunction::Lpad),
"Random" => Ok(ScalarFunction::Random),
- "Repeat" => Ok(ScalarFunction::Repeat),
- "Replace" => Ok(ScalarFunction::Replace),
"Reverse" => Ok(ScalarFunction::Reverse),
"Right" => Ok(ScalarFunction::Right),
"Rpad" => Ok(ScalarFunction::Rpad),
- "SplitPart" => Ok(ScalarFunction::SplitPart),
"Strpos" => Ok(ScalarFunction::Strpos),
"Substr" => Ok(ScalarFunction::Substr),
"Translate" => Ok(ScalarFunction::Translate),
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index 58fda7fcb5..2292687b45 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2880,8 +2880,8 @@ pub enum ScalarFunction {
/// 37 was OctetLength
Random = 38,
/// 39 was RegexpReplace
- Repeat = 40,
- Replace = 41,
+ /// 40 was Repeat
+ /// 41 was Replace
Reverse = 42,
Right = 43,
Rpad = 44,
@@ -2890,7 +2890,7 @@ pub enum ScalarFunction {
/// 47 was SHA256
/// 48 was SHA384
/// 49 was SHA512
- SplitPart = 50,
+ /// 50 was SplitPart
/// StartsWith = 51;
Strpos = 52,
Substr = 53,
@@ -3010,12 +3010,9 @@ impl ScalarFunction {
ScalarFunction::Left => "Left",
ScalarFunction::Lpad => "Lpad",
ScalarFunction::Random => "Random",
- ScalarFunction::Repeat => "Repeat",
- ScalarFunction::Replace => "Replace",
ScalarFunction::Reverse => "Reverse",
ScalarFunction::Right => "Right",
ScalarFunction::Rpad => "Rpad",
- ScalarFunction::SplitPart => "SplitPart",
ScalarFunction::Strpos => "Strpos",
ScalarFunction::Substr => "Substr",
ScalarFunction::Translate => "Translate",
@@ -3069,12 +3066,9 @@ impl ScalarFunction {
"Left" => Some(Self::Left),
"Lpad" => Some(Self::Lpad),
"Random" => Some(Self::Random),
- "Repeat" => Some(Self::Repeat),
- "Replace" => Some(Self::Replace),
"Reverse" => Some(Self::Reverse),
"Right" => Some(Self::Right),
"Rpad" => Some(Self::Rpad),
- "SplitPart" => Some(Self::SplitPart),
"Strpos" => Some(Self::Strpos),
"Substr" => Some(Self::Substr),
"Translate" => Some(Self::Translate),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 3b44c1cb27..b78e3ae6dc 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -53,11 +53,10 @@ use datafusion_expr::{
expr::{self, InList, Sort, WindowFunction},
factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2,
logical_plan::{PlanType, StringifiedPlan},
- lpad, nanvl, pi, power, radians, random, repeat, replace, reverse, right, round,
- rpad, signum, sin, sinh, split_part, sqrt, strpos, substr, substr_index, substring,
- translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction,
- BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField,
- GroupingSet,
+ lpad, nanvl, pi, power, radians, random, reverse, right, round, rpad, signum, sin,
+ sinh, sqrt, strpos, substr, substr_index, substring, translate, trunc,
+ AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction,
+ Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet,
GroupingSet::GroupingSets,
JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound,
WindowFrameUnits,
@@ -468,12 +467,9 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
ScalarFunction::Left => Self::Left,
ScalarFunction::Lpad => Self::Lpad,
ScalarFunction::Random => Self::Random,
- ScalarFunction::Repeat => Self::Repeat,
- ScalarFunction::Replace => Self::Replace,
ScalarFunction::Reverse => Self::Reverse,
ScalarFunction::Right => Self::Right,
ScalarFunction::Rpad => Self::Rpad,
- ScalarFunction::SplitPart => Self::SplitPart,
ScalarFunction::Strpos => Self::Strpos,
ScalarFunction::Substr => Self::Substr,
ScalarFunction::Translate => Self::Translate,
@@ -1445,15 +1441,6 @@ pub fn parse_expr(
parse_expr(&args[1], registry, codec)?,
)),
ScalarFunction::Random => Ok(random()),
- ScalarFunction::Repeat => Ok(repeat(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- )),
- ScalarFunction::Replace => Ok(replace(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- parse_expr(&args[2], registry, codec)?,
- )),
ScalarFunction::Reverse => {
Ok(reverse(parse_expr(&args[0], registry, codec)?))
}
@@ -1485,11 +1472,6 @@ pub fn parse_expr(
.map(|expr| parse_expr(expr, registry, codec))
.collect::<Result<Vec<_>, _>>()?,
)),
- ScalarFunction::SplitPart => Ok(split_part(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- parse_expr(&args[2], registry, codec)?,
- )),
ScalarFunction::EndsWith => Ok(ends_with(
parse_expr(&args[0], registry, codec)?,
parse_expr(&args[1], registry, codec)?,
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index 446a91a39a..0c0f0c6e0a 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1490,12 +1490,9 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
BuiltinScalarFunction::Left => Self::Left,
BuiltinScalarFunction::Lpad => Self::Lpad,
BuiltinScalarFunction::Random => Self::Random,
- BuiltinScalarFunction::Repeat => Self::Repeat,
- BuiltinScalarFunction::Replace => Self::Replace,
BuiltinScalarFunction::Reverse => Self::Reverse,
BuiltinScalarFunction::Right => Self::Right,
BuiltinScalarFunction::Rpad => Self::Rpad,
- BuiltinScalarFunction::SplitPart => Self::SplitPart,
BuiltinScalarFunction::Strpos => Self::Strpos,
BuiltinScalarFunction::Substr => Self::Substr,
BuiltinScalarFunction::Translate => Self::Translate,