This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new c2879f5105 move strpos, substr functions to datafusion_functions
(#9849)
c2879f5105 is described below
commit c2879f510533a01bc04ef75da4f1416d0ddb99f6
Author: Bruce Ritchie <[email protected]>
AuthorDate: Fri Mar 29 09:53:03 2024 -0400
move strpos, substr functions to datafusion_functions (#9849)
* Fix to_timestamp benchmark
* Remove reference to simd and nightly build as simd is no longer an
available feature in DataFusion and building with nightly may not be a good
recommendation when getting started.
* Fixed missing trim() function.
* Create unicode module in datafusion/functions/src/unicode and
unicode_expressions feature flag, move char_length function
* move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions
* move strpos, substr functions to datafusion_functions
* Cleanup tests
---
datafusion/expr/src/built_in_function.rs | 36 +-
datafusion/expr/src/expr_fn.rs | 6 -
datafusion/functions/src/unicode/mod.rs | 31 ++
datafusion/functions/src/unicode/strpos.rs | 121 +++++++
datafusion/functions/src/unicode/substr.rs | 392 +++++++++++++++++++++
datafusion/physical-expr/src/functions.rs | 258 +-------------
.../physical-expr/src/unicode_expressions.rs | 95 -----
datafusion/proto/Cargo.toml | 1 +
datafusion/proto/proto/datafusion.proto | 4 +-
datafusion/proto/src/generated/pbjson.rs | 6 -
datafusion/proto/src/generated/prost.rs | 8 +-
datafusion/proto/src/logical_plan/from_proto.rs | 29 +-
datafusion/proto/src/logical_plan/to_proto.rs | 2 -
.../proto/tests/cases/roundtrip_logical_plan.rs | 29 +-
datafusion/proto/tests/cases/serialize.rs | 5 +-
datafusion/sql/src/expr/mod.rs | 9 +-
datafusion/sql/src/expr/substring.rs | 16 +-
datafusion/sqllogictest/test_files/scalar.slt | 2 +-
18 files changed, 598 insertions(+), 452 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index 196d278dc7..423fc11c1d 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -113,10 +113,6 @@ pub enum BuiltinScalarFunction {
InitCap,
/// random
Random,
- /// strpos
- Strpos,
- /// substr
- Substr,
/// translate
Translate,
/// substr_index
@@ -211,8 +207,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
BuiltinScalarFunction::InitCap => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
- BuiltinScalarFunction::Strpos => Volatility::Immutable,
- BuiltinScalarFunction::Substr => Volatility::Immutable,
BuiltinScalarFunction::Translate => Volatility::Immutable,
BuiltinScalarFunction::SubstrIndex => Volatility::Immutable,
BuiltinScalarFunction::FindInSet => Volatility::Immutable,
@@ -252,12 +246,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
BuiltinScalarFunction::EndsWith => Ok(Boolean),
- BuiltinScalarFunction::Strpos => {
- utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
- }
- BuiltinScalarFunction::Substr => {
- utf8_to_str_type(&input_expr_types[0], "substr")
- }
BuiltinScalarFunction::SubstrIndex => {
utf8_to_str_type(&input_expr_types[0], "substr_index")
}
@@ -341,24 +329,12 @@ impl BuiltinScalarFunction {
Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
}
- BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos =>
{
- Signature::one_of(
- vec![
- Exact(vec![Utf8, Utf8]),
- Exact(vec![Utf8, LargeUtf8]),
- Exact(vec![LargeUtf8, Utf8]),
- Exact(vec![LargeUtf8, LargeUtf8]),
- ],
- self.volatility(),
- )
- }
-
- BuiltinScalarFunction::Substr => Signature::one_of(
+ BuiltinScalarFunction::EndsWith => Signature::one_of(
vec![
- Exact(vec![Utf8, Int64]),
- Exact(vec![LargeUtf8, Int64]),
- Exact(vec![Utf8, Int64, Int64]),
- Exact(vec![LargeUtf8, Int64, Int64]),
+ Exact(vec![Utf8, Utf8]),
+ Exact(vec![Utf8, LargeUtf8]),
+ Exact(vec![LargeUtf8, Utf8]),
+ Exact(vec![LargeUtf8, LargeUtf8]),
],
self.volatility(),
),
@@ -537,8 +513,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
BuiltinScalarFunction::EndsWith => &["ends_with"],
BuiltinScalarFunction::InitCap => &["initcap"],
- BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
- BuiltinScalarFunction::Substr => &["substr"],
BuiltinScalarFunction::Translate => &["translate"],
BuiltinScalarFunction::SubstrIndex => &["substr_index",
"substring_index"],
BuiltinScalarFunction::FindInSet => &["find_in_set"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 21dab72855..09170ae639 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -579,9 +579,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a
particular `base`");
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word
in `string` in uppercase and the remaining characters in lowercase");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends
with the `suffix`");
-scalar_expr!(Strpos, strpos, string substring, "finds the position from where
the `substring` matches the `string`");
-scalar_expr!(Substr, substr, string position, "substring from the `position`
to the end");
-scalar_expr!(Substr, substring, string position length, "substring from the
`position` with `length` characters");
scalar_expr!(Translate, translate, string from to, "replaces the characters in
`from` with the counterpart in `to`");
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which
evaluates to the value of the first [Expr] which is not NULL");
//there is a func concat_ws before, so use concat_ws_expr as name.c
@@ -1015,9 +1012,6 @@ mod test {
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
test_scalar_expr!(InitCap, initcap, string);
test_scalar_expr!(EndsWith, ends_with, string, characters);
- test_scalar_expr!(Strpos, strpos, string, substring);
- test_scalar_expr!(Substr, substr, string, position);
- test_scalar_expr!(Substr, substring, string, position, count);
test_scalar_expr!(Translate, translate, string, from, to);
test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count);
test_scalar_expr!(FindInSet, find_in_set, string, stringlist);
diff --git a/datafusion/functions/src/unicode/mod.rs
b/datafusion/functions/src/unicode/mod.rs
index ea4e70a921..ddab0d1e27 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -27,6 +27,8 @@ mod lpad;
mod reverse;
mod right;
mod rpad;
+mod strpos;
+mod substr;
// create UDFs
make_udf_function!(
@@ -39,6 +41,8 @@ make_udf_function!(lpad::LPadFunc, LPAD, lpad);
make_udf_function!(right::RightFunc, RIGHT, right);
make_udf_function!(reverse::ReverseFunc, REVERSE, reverse);
make_udf_function!(rpad::RPadFunc, RPAD, rpad);
+make_udf_function!(strpos::StrposFunc, STRPOS, strpos);
+make_udf_function!(substr::SubstrFunc, SUBSTR, substr);
pub mod expr_fn {
use datafusion_expr::Expr;
@@ -53,6 +57,11 @@ pub mod expr_fn {
super::character_length().call(vec![string])
}
+ #[doc = "finds the position from where the `substring` matches the
`string`"]
+ pub fn instr(string: Expr, substring: Expr) -> Expr {
+ strpos(string, substring)
+ }
+
#[doc = "the number of characters in the `string`"]
pub fn length(string: Expr) -> Expr {
character_length(string)
@@ -68,6 +77,11 @@ pub mod expr_fn {
super::lpad().call(args)
}
+ #[doc = "finds the position from where the `substring` matches the
`string`"]
+ pub fn position(string: Expr, substring: Expr) -> Expr {
+ strpos(string, substring)
+ }
+
#[doc = "reverses the `string`"]
pub fn reverse(string: Expr) -> Expr {
super::reverse().call(vec![string])
@@ -82,6 +96,21 @@ pub mod expr_fn {
pub fn rpad(args: Vec<Expr>) -> Expr {
super::rpad().call(args)
}
+
+ #[doc = "finds the position from where the `substring` matches the
`string`"]
+ pub fn strpos(string: Expr, substring: Expr) -> Expr {
+ super::strpos().call(vec![string, substring])
+ }
+
+ #[doc = "substring from the `position` to the end"]
+ pub fn substr(string: Expr, position: Expr) -> Expr {
+ super::substr().call(vec![string, position])
+ }
+
+ #[doc = "substring from the `position` with `length` characters"]
+ pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr {
+ super::substr().call(vec![string, position, length])
+ }
}
/// Return a list of all functions in this package
@@ -93,5 +122,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
reverse(),
right(),
rpad(),
+ strpos(),
+ substr(),
]
}
diff --git a/datafusion/functions/src/unicode/strpos.rs
b/datafusion/functions/src/unicode/strpos.rs
new file mode 100644
index 0000000000..1e8bfa37d4
--- /dev/null
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+ ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait,
PrimitiveArray,
+};
+use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
+
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_int_type};
+
+#[derive(Debug)]
+pub(super) struct StrposFunc {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl StrposFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::one_of(
+ vec![
+ Exact(vec![Utf8, Utf8]),
+ Exact(vec![Utf8, LargeUtf8]),
+ Exact(vec![LargeUtf8, Utf8]),
+ Exact(vec![LargeUtf8, LargeUtf8]),
+ ],
+ Volatility::Immutable,
+ ),
+ aliases: vec![String::from("instr"), String::from("position")],
+ }
+ }
+}
+
+impl ScalarUDFImpl for StrposFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "strpos"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_int_type(&arg_types[0], "strpos/instr/position")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => make_scalar_function(strpos::<Int32Type>,
vec![])(args),
+ DataType::LargeUtf8 => {
+ make_scalar_function(strpos::<Int64Type>, vec![])(args)
+ }
+ other => exec_err!("Unsupported data type {other:?} for function
strpos"),
+ }
+ }
+
+ fn aliases(&self) -> &[String] {
+ &self.aliases
+ }
+}
+
+/// Returns starting index of specified substring within string, or zero if
it's not present. (Same as position(substring in string), but note the reversed
argument order.)
+/// strpos('high', 'ig') = 2
+/// The implementation uses UTF-8 code points as characters
+fn strpos<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
+where
+ T::Native: OffsetSizeTrait,
+{
+ let string_array: &GenericStringArray<T::Native> =
+ as_generic_string_array::<T::Native>(&args[0])?;
+
+ let substring_array: &GenericStringArray<T::Native> =
+ as_generic_string_array::<T::Native>(&args[1])?;
+
+ let result = string_array
+ .iter()
+ .zip(substring_array.iter())
+ .map(|(string, substring)| match (string, substring) {
+ (Some(string), Some(substring)) => {
+ // the find method returns the byte index of the substring
+ // Next, we count the number of the chars until that byte
+ T::Native::from_usize(
+ string
+ .find(substring)
+ .map(|x| string[..x].chars().count() + 1)
+ .unwrap_or(0),
+ )
+ }
+ _ => None,
+ })
+ .collect::<PrimitiveArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
diff --git a/datafusion/functions/src/unicode/substr.rs
b/datafusion/functions/src/unicode/substr.rs
new file mode 100644
index 0000000000..403157e2a8
--- /dev/null
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -0,0 +1,392 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::cmp::max;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+#[derive(Debug)]
+pub(super) struct SubstrFunc {
+ signature: Signature,
+}
+
+impl SubstrFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::one_of(
+ vec![
+ Exact(vec![Utf8, Int64]),
+ Exact(vec![LargeUtf8, Int64]),
+ Exact(vec![Utf8, Int64, Int64]),
+ Exact(vec![LargeUtf8, Int64, Int64]),
+ ],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for SubstrFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "substr"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_str_type(&arg_types[0], "substr")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => make_scalar_function(substr::<i32>,
vec![])(args),
+ DataType::LargeUtf8 => make_scalar_function(substr::<i64>,
vec![])(args),
+ other => exec_err!("Unsupported data type {other:?} for function
substr"),
+ }
+ }
+}
+
+/// Extracts the substring of string starting at the start'th character, and
extending for count characters if that is specified. (Same as substring(string
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
+pub fn substr<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args.len() {
+ 2 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let start_array = as_int64_array(&args[1])?;
+
+ let result = string_array
+ .iter()
+ .zip(start_array.iter())
+ .map(|(string, start)| match (string, start) {
+ (Some(string), Some(start)) => {
+ if start <= 0 {
+ Some(string.to_string())
+ } else {
+ Some(string.chars().skip(start as usize -
1).collect())
+ }
+ }
+ _ => None,
+ })
+ .collect::<GenericStringArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ 3 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let start_array = as_int64_array(&args[1])?;
+ let count_array = as_int64_array(&args[2])?;
+
+ let result = string_array
+ .iter()
+ .zip(start_array.iter())
+ .zip(count_array.iter())
+ .map(|((string, start), count)| match (string, start, count) {
+ (Some(string), Some(start), Some(count)) => {
+ if count < 0 {
+ exec_err!(
+ "negative substring length not allowed:
substr(<str>, {start}, {count})"
+ )
+ } else {
+ let skip = max(0, start - 1);
+ let count = max(0, count + (if start < 1 {start -
1} else {0}));
+ Ok(Some(string.chars().skip(skip as
usize).take(count as usize).collect::<String>()))
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<T>>>()?;
+
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ other => {
+ exec_err!("substr was called with {other} arguments. It requires 2
or 3.")
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::{exec_err, Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::unicode::substr::SubstrFunc;
+ use crate::utils::test::test_function;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(0i64)),
+ ],
+ Ok(Some("alphabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some("ésoj")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+ ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+ ],
+ Ok(Some("joséésoj")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(1i64)),
+ ],
+ Ok(Some("alphabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(Some("lphabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(3i64)),
+ ],
+ Ok(Some("phabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+ ],
+ Ok(Some("alphabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(30i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(3i64)),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(Some("ph")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(3i64)),
+ ColumnarValue::Scalar(ScalarValue::from(20i64)),
+ ],
+ Ok(Some("phabet")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(0i64)),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some("alph")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // starting from 5 (10 + -5)
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+ ColumnarValue::Scalar(ScalarValue::from(10i64)),
+ ],
+ Ok(Some("alph")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // starting from -1 (4 + -5)
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+ ColumnarValue::Scalar(ScalarValue::from(4i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ // starting from 0 (5 + -5)
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(-5i64)),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ColumnarValue::Scalar(ScalarValue::from(20i64)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(3i64)),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(1i64)),
+ ColumnarValue::Scalar(ScalarValue::from(-1i64)),
+ ],
+ exec_err!("negative substring length not allowed: substr(<str>, 1,
-1)"),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(Some("és")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ #[cfg(not(feature = "unicode_expressions"))]
+ test_function!(
+ SubstrFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+ ColumnarValue::Scalar(ScalarValue::from(0i64)),
+ ],
+ internal_err!(
+ "function substr requires compilation with feature flag:
unicode_expressions."
+ ),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index c1b4900e39..513dd71d40 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -281,34 +281,6 @@ pub fn create_physical_fun(
exec_err!("Unsupported data type {other:?} for function
ends_with")
}
}),
- BuiltinScalarFunction::Strpos => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(
- strpos, Int32Type, "strpos"
- );
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(
- strpos, Int64Type, "strpos"
- );
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
strpos"),
- }),
- BuiltinScalarFunction::Substr => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(substr, i32,
"substr");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(substr, i64,
"substr");
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
substr"),
- }),
BuiltinScalarFunction::Translate => Arc::new(|args| match
args[0].data_type() {
DataType::Utf8 => {
let func = invoke_if_unicode_expressions_feature_flag!(
@@ -450,7 +422,7 @@ mod tests {
};
use datafusion_common::cast::as_uint64_array;
- use datafusion_common::{exec_err, internal_err, plan_err};
+ use datafusion_common::{internal_err, plan_err};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::type_coercion::functions::data_types;
use datafusion_expr::Signature;
@@ -663,234 +635,6 @@ mod tests {
BooleanArray
);
#[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(0))),],
- Ok(Some("alphabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some("ésoj")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-5))),],
- Ok(Some("joséésoj")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(1))),],
- Ok(Some("alphabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(2))),],
- Ok(Some("lphabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(3))),],
- Ok(Some("phabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(-3))),],
- Ok(Some("alphabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(Some(30))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[lit("alphabet"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(3))),
- lit(ScalarValue::Int64(Some(2))),
- ],
- Ok(Some("ph")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(3))),
- lit(ScalarValue::Int64(Some(20))),
- ],
- Ok(Some("phabet")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(0))),
- lit(ScalarValue::Int64(Some(5))),
- ],
- Ok(Some("alph")),
- &str,
- Utf8,
- StringArray
- );
- // starting from 5 (10 + -5)
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(-5))),
- lit(ScalarValue::Int64(Some(10))),
- ],
- Ok(Some("alph")),
- &str,
- Utf8,
- StringArray
- );
- // starting from -1 (4 + -5)
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(-5))),
- lit(ScalarValue::Int64(Some(4))),
- ],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- // starting from 0 (5 + -5)
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(-5))),
- lit(ScalarValue::Int64(Some(5))),
- ],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(None)),
- lit(ScalarValue::Int64(Some(20))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(3))),
- lit(ScalarValue::Int64(None)),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(1))),
- lit(ScalarValue::Int64(Some(-1))),
- ],
- exec_err!("negative substring length not allowed: substr(<str>, 1,
-1)"),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Substr,
- &[
- lit("joséésoj"),
- lit(ScalarValue::Int64(Some(5))),
- lit(ScalarValue::Int64(Some(2))),
- ],
- Ok(Some("és")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Substr,
- &[
- lit("alphabet"),
- lit(ScalarValue::Int64(Some(0))),
- ],
- internal_err!(
- "function substr requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
test_function!(
Translate,
&[lit("12345"), lit("143"), lit("ax"),],
diff --git a/datafusion/physical-expr/src/unicode_expressions.rs
b/datafusion/physical-expr/src/unicode_expressions.rs
index faff21111a..ecbd1ea320 100644
--- a/datafusion/physical-expr/src/unicode_expressions.rs
+++ b/datafusion/physical-expr/src/unicode_expressions.rs
@@ -21,7 +21,6 @@
//! Unicode expressions
-use std::cmp::max;
use std::sync::Arc;
use arrow::{
@@ -36,100 +35,6 @@ use datafusion_common::{
exec_err, Result,
};
-/// Returns starting index of specified substring within string, or zero if
it's not present. (Same as position(substring in string), but note the reversed
argument order.)
-/// strpos('high', 'ig') = 2
-/// The implementation uses UTF-8 code points as characters
-pub fn strpos<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
-where
- T::Native: OffsetSizeTrait,
-{
- let string_array: &GenericStringArray<T::Native> =
- as_generic_string_array::<T::Native>(&args[0])?;
-
- let substring_array: &GenericStringArray<T::Native> =
- as_generic_string_array::<T::Native>(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(substring_array.iter())
- .map(|(string, substring)| match (string, substring) {
- (Some(string), Some(substring)) => {
- // the find method returns the byte index of the substring
- // Next, we count the number of the chars until that byte
- T::Native::from_usize(
- string
- .find(substring)
- .map(|x| string[..x].chars().count() + 1)
- .unwrap_or(0),
- )
- }
- _ => None,
- })
- .collect::<PrimitiveArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Extracts the substring of string starting at the start'th character, and
extending for count characters if that is specified. (Same as substring(string
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-pub fn substr<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args.len() {
- 2 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let start_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(start_array.iter())
- .map(|(string, start)| match (string, start) {
- (Some(string), Some(start)) => {
- if start <= 0 {
- Some(string.to_string())
- } else {
- Some(string.chars().skip(start as usize -
1).collect())
- }
- }
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
- }
- 3 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let start_array = as_int64_array(&args[1])?;
- let count_array = as_int64_array(&args[2])?;
-
- let result = string_array
- .iter()
- .zip(start_array.iter())
- .zip(count_array.iter())
- .map(|((string, start), count)| match (string, start, count) {
- (Some(string), Some(start), Some(count)) => {
- if count < 0 {
- exec_err!(
- "negative substring length not allowed:
substr(<str>, {start}, {count})"
- )
- } else {
- let skip = max(0, start - 1);
- let count = max(0, count + (if start < 1 {start -
1} else {0}));
- Ok(Some(string.chars().skip(skip as
usize).take(count as usize).collect::<String>()))
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
- }
- other => {
- exec_err!("substr was called with {other} arguments. It requires 2
or 3.")
- }
- }
-}
-
/// Replaces each character in string that matches a character in the from set
with the corresponding character in the to set. If from is longer than to,
occurrences of the extra characters in from are deleted.
/// translate('12345', '143', 'ax') = 'a2x5'
pub fn translate<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index f5297aefcd..bec2b8c53a 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -54,6 +54,7 @@ serde = { version = "1.0", optional = true }
serde_json = { workspace = true, optional = true }
[dev-dependencies]
+datafusion-functions = { workspace = true, default-features = true }
doc-comment = { workspace = true }
strum = { version = "0.26.1", features = ["derive"] }
tokio = { workspace = true, features = ["rt-multi-thread"] }
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index 6319372d98..3a187eabe8 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -593,8 +593,8 @@ enum ScalarFunction {
// 49 was SHA512
// 50 was SplitPart
// StartsWith = 51;
- Strpos = 52;
- Substr = 53;
+ // 52 was Strpos
+ // 53 was Substr
// ToHex = 54;
// 55 was ToTimestamp
// 56 was ToTimestampMillis
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index 7281bc9dc2..07b91b26d6 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22932,8 +22932,6 @@ impl serde::Serialize for ScalarFunction {
Self::ConcatWithSeparator => "ConcatWithSeparator",
Self::InitCap => "InitCap",
Self::Random => "Random",
- Self::Strpos => "Strpos",
- Self::Substr => "Substr",
Self::Translate => "Translate",
Self::Coalesce => "Coalesce",
Self::Power => "Power",
@@ -22986,8 +22984,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"ConcatWithSeparator",
"InitCap",
"Random",
- "Strpos",
- "Substr",
"Translate",
"Coalesce",
"Power",
@@ -23069,8 +23065,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"ConcatWithSeparator" =>
Ok(ScalarFunction::ConcatWithSeparator),
"InitCap" => Ok(ScalarFunction::InitCap),
"Random" => Ok(ScalarFunction::Random),
- "Strpos" => Ok(ScalarFunction::Strpos),
- "Substr" => Ok(ScalarFunction::Substr),
"Translate" => Ok(ScalarFunction::Translate),
"Coalesce" => Ok(ScalarFunction::Coalesce),
"Power" => Ok(ScalarFunction::Power),
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index 2fe89efb9c..babeccec59 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2892,8 +2892,8 @@ pub enum ScalarFunction {
/// 49 was SHA512
/// 50 was SplitPart
/// StartsWith = 51;
- Strpos = 52,
- Substr = 53,
+ /// 52 was Strpos
+ /// 53 was Substr
/// ToHex = 54;
/// 55 was ToTimestamp
/// 56 was ToTimestampMillis
@@ -3005,8 +3005,6 @@ impl ScalarFunction {
ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator",
ScalarFunction::InitCap => "InitCap",
ScalarFunction::Random => "Random",
- ScalarFunction::Strpos => "Strpos",
- ScalarFunction::Substr => "Substr",
ScalarFunction::Translate => "Translate",
ScalarFunction::Coalesce => "Coalesce",
ScalarFunction::Power => "Power",
@@ -3053,8 +3051,6 @@ impl ScalarFunction {
"ConcatWithSeparator" => Some(Self::ConcatWithSeparator),
"InitCap" => Some(Self::InitCap),
"Random" => Some(Self::Random),
- "Strpos" => Some(Self::Strpos),
- "Substr" => Some(Self::Substr),
"Translate" => Some(Self::Translate),
"Coalesce" => Some(Self::Coalesce),
"Power" => Some(Self::Power),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs
b/datafusion/proto/src/logical_plan/from_proto.rs
index 2c6f2e479b..ff3d6773d5 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -42,10 +42,10 @@ use datafusion_expr::{
expr::{self, InList, Sort, WindowFunction},
factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10,
log2,
logical_plan::{PlanType, StringifiedPlan},
- nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos,
substr,
- substr_index, substring, translate, trunc, AggregateFunction, Between,
BinaryExpr,
- BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
GetFieldAccess,
- GetIndexedField, GroupingSet,
+ nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt,
substr_index,
+ translate, trunc, AggregateFunction, Between, BinaryExpr,
BuiltInWindowFunction,
+ BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField,
+ GroupingSet,
GroupingSet::GroupingSets,
JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame,
WindowFrameBound,
WindowFrameUnits,
@@ -455,8 +455,6 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::EndsWith => Self::EndsWith,
ScalarFunction::InitCap => Self::InitCap,
ScalarFunction::Random => Self::Random,
- ScalarFunction::Strpos => Self::Strpos,
- ScalarFunction::Substr => Self::Substr,
ScalarFunction::Translate => Self::Translate,
ScalarFunction::Coalesce => Self::Coalesce,
ScalarFunction::Pi => Self::Pi,
@@ -1389,25 +1387,6 @@ pub fn parse_expr(
parse_expr(&args[0], registry, codec)?,
parse_expr(&args[1], registry, codec)?,
)),
- ScalarFunction::Strpos => Ok(strpos(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- )),
- ScalarFunction::Substr => {
- if args.len() > 2 {
- assert_eq!(args.len(), 3);
- Ok(substring(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- parse_expr(&args[2], registry, codec)?,
- ))
- } else {
- Ok(substr(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- ))
- }
- }
ScalarFunction::Translate => Ok(translate(
parse_expr(&args[0], registry, codec)?,
parse_expr(&args[1], registry, codec)?,
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs
b/datafusion/proto/src/logical_plan/to_proto.rs
index ea682a5a22..89d49c5658 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1446,8 +1446,6 @@ impl TryFrom<&BuiltinScalarFunction> for
protobuf::ScalarFunction {
BuiltinScalarFunction::EndsWith => Self::EndsWith,
BuiltinScalarFunction::InitCap => Self::InitCap,
BuiltinScalarFunction::Random => Self::Random,
- BuiltinScalarFunction::Strpos => Self::Strpos,
- BuiltinScalarFunction::Substr => Self::Substr,
BuiltinScalarFunction::Translate => Self::Translate,
BuiltinScalarFunction::Coalesce => Self::Coalesce,
BuiltinScalarFunction::Pi => Self::Pi,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 3c43f10075..3a47f556c0 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -34,8 +34,8 @@ use datafusion::test_util::{TestTableFactory,
TestTableProvider};
use datafusion_common::config::{FormatOptions, TableOptions};
use datafusion_common::scalar::ScalarStructBuilder;
use datafusion_common::{
- internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef,
- DataFusionError, FileType, Result, ScalarValue,
+ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFField,
DFSchema,
+ DFSchemaRef, DataFusionError, FileType, Result, ScalarValue,
};
use datafusion_expr::dml::CopyTo;
use datafusion_expr::expr::{
@@ -44,8 +44,7 @@ use datafusion_expr::expr::{
};
use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore};
use datafusion_expr::{
- col, create_udaf, lit, Accumulator, AggregateFunction,
- BuiltinScalarFunction::{Sqrt, Substr},
+ col, create_udaf, lit, Accumulator, AggregateFunction,
BuiltinScalarFunction::Sqrt,
ColumnarValue, Expr, ExprSchemable, LogicalPlan, Operator,
PartitionEvaluator,
ScalarUDF, ScalarUDFImpl, Signature, TryCast, Volatility, WindowFrame,
WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, WindowUDF,
@@ -60,6 +59,7 @@ use datafusion_proto::logical_plan::LogicalExtensionCodec;
use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec};
use datafusion_proto::protobuf;
+use datafusion::execution::FunctionRegistry;
use prost::Message;
#[cfg(feature = "json")]
@@ -1863,17 +1863,28 @@ fn roundtrip_cube() {
#[test]
fn roundtrip_substr() {
+ let ctx = SessionContext::new();
+
+ let fun = ctx
+ .state()
+ .udf("substr")
+ .map_err(|e| {
+ internal_datafusion_err!("Unable to find expected 'substr'
function: {e:?}")
+ })
+ .unwrap();
+
// substr(string, position)
- let test_expr =
- Expr::ScalarFunction(ScalarFunction::new(Substr, vec![col("col"),
lit(1_i64)]));
+ let test_expr = Expr::ScalarFunction(ScalarFunction::new_udf(
+ fun.clone(),
+ vec![col("col"), lit(1_i64)],
+ ));
// substr(string, position, count)
- let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new(
- Substr,
+ let test_expr_with_count = Expr::ScalarFunction(ScalarFunction::new_udf(
+ fun,
vec![col("col"), lit(1_i64), lit(1_i64)],
));
- let ctx = SessionContext::new();
roundtrip_expr_test(test_expr, ctx.clone());
roundtrip_expr_test(test_expr_with_count, ctx);
}
diff --git a/datafusion/proto/tests/cases/serialize.rs
b/datafusion/proto/tests/cases/serialize.rs
index d4a1ab44a6..972382b841 100644
--- a/datafusion/proto/tests/cases/serialize.rs
+++ b/datafusion/proto/tests/cases/serialize.rs
@@ -260,10 +260,7 @@ fn test_expression_serialization_roundtrip() {
let lit = Expr::Literal(ScalarValue::Utf8(None));
for builtin_fun in BuiltinScalarFunction::iter() {
// default to 4 args (though some exprs like substr have error
checking)
- let num_args = match builtin_fun {
- BuiltinScalarFunction::Substr => 3,
- _ => 4,
- };
+ let num_args = 4;
let args: Vec<_> =
std::iter::repeat(&lit).take(num_args).cloned().collect();
let expr = Expr::ScalarFunction(ScalarFunction::new(builtin_fun,
args));
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index d1fc031949..43bf2d8715 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -823,12 +823,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
schema: &DFSchema,
planner_context: &mut PlannerContext,
) -> Result<Expr> {
- let fun = BuiltinScalarFunction::Strpos;
+ let fun = self
+ .context_provider
+ .get_function_meta("strpos")
+ .ok_or_else(|| {
+ internal_datafusion_err!("Unable to find expected 'strpos'
function")
+ })?;
let substr =
self.sql_expr_to_logical_expr(substr_expr, schema,
planner_context)?;
let fullstr = self.sql_expr_to_logical_expr(str_expr, schema,
planner_context)?;
let args = vec![fullstr, substr];
- Ok(Expr::ScalarFunction(ScalarFunction::new(fun, args)))
+ Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args)))
}
fn sql_agg_with_filter_to_expr(
&self,
diff --git a/datafusion/sql/src/expr/substring.rs
b/datafusion/sql/src/expr/substring.rs
index a5d1abf0f2..f58c6f3b94 100644
--- a/datafusion/sql/src/expr/substring.rs
+++ b/datafusion/sql/src/expr/substring.rs
@@ -16,10 +16,10 @@
// under the License.
use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::plan_err;
+use datafusion_common::{internal_datafusion_err, plan_err};
use datafusion_common::{DFSchema, Result, ScalarValue};
use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::{BuiltinScalarFunction, Expr};
+use datafusion_expr::Expr;
use sqlparser::ast::Expr as SQLExpr;
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
@@ -68,9 +68,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
}
};
- Ok(Expr::ScalarFunction(ScalarFunction::new(
- BuiltinScalarFunction::Substr,
- args,
- )))
+ let fun = self
+ .context_provider
+ .get_function_meta("substr")
+ .ok_or_else(|| {
+ internal_datafusion_err!("Unable to find expected 'substr'
function")
+ })?;
+
+ Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args)))
}
}
diff --git a/datafusion/sqllogictest/test_files/scalar.slt
b/datafusion/sqllogictest/test_files/scalar.slt
index a77a2bf405..20c8b3d25f 100644
--- a/datafusion/sqllogictest/test_files/scalar.slt
+++ b/datafusion/sqllogictest/test_files/scalar.slt
@@ -2087,7 +2087,7 @@ select position('' in '')
1
-query error DataFusion error: Error during planning: The STRPOS/INSTR/POSITION
function can only accept strings, but got Int64.
+query error DataFusion error: Execution error: The STRPOS/INSTR/POSITION
function can only accept strings, but got Int64.
select position(1 in 1)