This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 09f5a544d2 move Left, Lpad, Reverse, Right, Rpad functions to
datafusion_functions (#9841)
09f5a544d2 is described below
commit 09f5a544d25f36ff1d65cc377123aee9b0e8f538
Author: Bruce Ritchie <[email protected]>
AuthorDate: Thu Mar 28 22:56:15 2024 -0400
move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions
(#9841)
* Fix to_timestamp benchmark
* Remove reference to simd and nightly build as simd is no longer an
available feature in DataFusion and building with nightly may not be a good
recommendation when getting started.
* Fixed missing trim() function.
* Create unicode module in datafusion/functions/src/unicode and
unicode_expressions feature flag, move char_length function
* move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions
* Code cleanup from PR review.
---
datafusion/expr/src/built_in_function.rs | 50 +-
datafusion/expr/src/expr_fn.rs | 21 -
datafusion/functions/src/unicode/left.rs | 236 ++++++++
datafusion/functions/src/unicode/lpad.rs | 369 +++++++++++++
datafusion/functions/src/unicode/mod.rs | 44 +-
datafusion/functions/src/unicode/reverse.rs | 149 +++++
datafusion/functions/src/unicode/right.rs | 238 ++++++++
datafusion/functions/src/unicode/rpad.rs | 361 ++++++++++++
datafusion/physical-expr/src/functions.rs | 606 ---------------------
datafusion/physical-expr/src/planner.rs | 4 +-
.../physical-expr/src/unicode_expressions.rs | 263 +--------
datafusion/proto/proto/datafusion.proto | 10 +-
datafusion/proto/src/generated/pbjson.rs | 15 -
datafusion/proto/src/generated/prost.rs | 20 +-
datafusion/proto/src/logical_plan/from_proto.rs | 53 +-
datafusion/proto/src/logical_plan/to_proto.rs | 5 -
16 files changed, 1428 insertions(+), 1016 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index eefbc131a2..196d278dc7 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -111,18 +111,8 @@ pub enum BuiltinScalarFunction {
EndsWith,
/// initcap
InitCap,
- /// left
- Left,
- /// lpad
- Lpad,
/// random
Random,
- /// reverse
- Reverse,
- /// right
- Right,
- /// rpad
- Rpad,
/// strpos
Strpos,
/// substr
@@ -220,12 +210,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator =>
Volatility::Immutable,
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
BuiltinScalarFunction::InitCap => Volatility::Immutable,
- BuiltinScalarFunction::Left => Volatility::Immutable,
- BuiltinScalarFunction::Lpad => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
- BuiltinScalarFunction::Reverse => Volatility::Immutable,
- BuiltinScalarFunction::Right => Volatility::Immutable,
- BuiltinScalarFunction::Rpad => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
BuiltinScalarFunction::Translate => Volatility::Immutable,
@@ -264,17 +249,8 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::InitCap => {
utf8_to_str_type(&input_expr_types[0], "initcap")
}
- BuiltinScalarFunction::Left =>
utf8_to_str_type(&input_expr_types[0], "left"),
- BuiltinScalarFunction::Lpad =>
utf8_to_str_type(&input_expr_types[0], "lpad"),
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
- BuiltinScalarFunction::Reverse => {
- utf8_to_str_type(&input_expr_types[0], "reverse")
- }
- BuiltinScalarFunction::Right => {
- utf8_to_str_type(&input_expr_types[0], "right")
- }
- BuiltinScalarFunction::Rpad =>
utf8_to_str_type(&input_expr_types[0], "rpad"),
BuiltinScalarFunction::EndsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
@@ -361,28 +337,9 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Coalesce => {
Signature::variadic_equal(self.volatility())
}
- BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse =>
{
+ BuiltinScalarFunction::InitCap => {
Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
}
- BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => {
- Signature::one_of(
- vec![
- Exact(vec![Utf8, Int64]),
- Exact(vec![LargeUtf8, Int64]),
- Exact(vec![Utf8, Int64, Utf8]),
- Exact(vec![LargeUtf8, Int64, Utf8]),
- Exact(vec![Utf8, Int64, LargeUtf8]),
- Exact(vec![LargeUtf8, Int64, LargeUtf8]),
- ],
- self.volatility(),
- )
- }
- BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => {
- Signature::one_of(
- vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8,
Int64])],
- self.volatility(),
- )
- }
BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos =>
{
Signature::one_of(
@@ -580,11 +537,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
BuiltinScalarFunction::EndsWith => &["ends_with"],
BuiltinScalarFunction::InitCap => &["initcap"],
- BuiltinScalarFunction::Left => &["left"],
- BuiltinScalarFunction::Lpad => &["lpad"],
- BuiltinScalarFunction::Reverse => &["reverse"],
- BuiltinScalarFunction::Right => &["right"],
- BuiltinScalarFunction::Rpad => &["rpad"],
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
BuiltinScalarFunction::Substr => &["substr"],
BuiltinScalarFunction::Translate => &["translate"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 6544647986..21dab72855 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -578,25 +578,11 @@ scalar_expr!(Atan2, atan2, y x, "inverse tangent of a
division given in the argu
scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word
in `string` in uppercase and the remaining characters in lowercase");
-scalar_expr!(Left, left, string n, "returns the first `n` characters in the
`string`");
-scalar_expr!(Reverse, reverse, string, "reverses the `string`");
-scalar_expr!(Right, right, string n, "returns the last `n` characters in the
`string`");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends
with the `suffix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where
the `substring` matches the `string`");
scalar_expr!(Substr, substr, string position, "substring from the `position`
to the end");
scalar_expr!(Substr, substring, string position length, "substring from the
`position` with `length` characters");
scalar_expr!(Translate, translate, string from to, "replaces the characters in
`from` with the counterpart in `to`");
-//use vec as parameter
-nary_scalar_expr!(
- Lpad,
- lpad,
- "fill up a string to the length by prepending the characters"
-);
-nary_scalar_expr!(
- Rpad,
- rpad,
- "fill up a string to the length by appending the characters"
-);
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which
evaluates to the value of the first [Expr] which is not NULL");
//there is a func concat_ws before, so use concat_ws_expr as name.c
nary_scalar_expr!(
@@ -1028,13 +1014,6 @@ mod test {
test_scalar_expr!(Gcd, gcd, arg_1, arg_2);
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
test_scalar_expr!(InitCap, initcap, string);
- test_scalar_expr!(Left, left, string, count);
- test_nary_scalar_expr!(Lpad, lpad, string, count);
- test_nary_scalar_expr!(Lpad, lpad, string, count, characters);
- test_scalar_expr!(Reverse, reverse, string);
- test_scalar_expr!(Right, right, string, count);
- test_nary_scalar_expr!(Rpad, rpad, string, count);
- test_nary_scalar_expr!(Rpad, rpad, string, count, characters);
test_scalar_expr!(EndsWith, ends_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
test_scalar_expr!(Substr, substr, string, position);
diff --git a/datafusion/functions/src/unicode/left.rs
b/datafusion/functions/src/unicode/left.rs
new file mode 100644
index 0000000000..473589fdc8
--- /dev/null
+++ b/datafusion/functions/src/unicode/left.rs
@@ -0,0 +1,236 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::cmp::Ordering;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::exec_err;
+use datafusion_common::Result;
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+#[derive(Debug)]
+pub(super) struct LeftFunc {
+ signature: Signature,
+}
+
+impl LeftFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::one_of(
+ vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for LeftFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "left"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_str_type(&arg_types[0], "left")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => make_scalar_function(left::<i32>, vec![])(args),
+ DataType::LargeUtf8 => make_scalar_function(left::<i64>,
vec![])(args),
+ other => exec_err!("Unsupported data type {other:?} for function
left"),
+ }
+ }
+}
+
+/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
+/// left('abcde', 2) = 'ab'
+/// The implementation uses UTF-8 code points as characters
+pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let n_array = as_int64_array(&args[1])?;
+ let result = string_array
+ .iter()
+ .zip(n_array.iter())
+ .map(|(string, n)| match (string, n) {
+ (Some(string), Some(n)) => match n.cmp(&0) {
+ Ordering::Less => {
+ let len = string.chars().count() as i64;
+ Some(if n.abs() < len {
+ string.chars().take((len + n) as
usize).collect::<String>()
+ } else {
+ "".to_string()
+ })
+ }
+ Ordering::Equal => Some("".to_string()),
+ Ordering::Greater => {
+ Some(string.chars().take(n as usize).collect::<String>())
+ }
+ },
+ _ => None,
+ })
+ .collect::<GenericStringArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::unicode::left::LeftFunc;
+ use crate::utils::test::test_function;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(Some("ab")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(200i64)),
+ ],
+ Ok(Some("abcde")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(-2i64)),
+ ],
+ Ok(Some("abc")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(-200i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(0i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some("joséé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+ ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+ ],
+ Ok(Some("joséé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ #[cfg(not(feature = "unicode_expressions"))]
+ test_function!(
+ LeftFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("abcde")),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ internal_err!(
+ "function left requires compilation with feature flag:
unicode_expressions."
+ ),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/functions/src/unicode/lpad.rs
b/datafusion/functions/src/unicode/lpad.rs
new file mode 100644
index 0000000000..76a8e68cca
--- /dev/null
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use unicode_segmentation::UnicodeSegmentation;
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+#[derive(Debug)]
+pub(super) struct LPadFunc {
+ signature: Signature,
+}
+
+impl LPadFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::one_of(
+ vec![
+ Exact(vec![Utf8, Int64]),
+ Exact(vec![LargeUtf8, Int64]),
+ Exact(vec![Utf8, Int64, Utf8]),
+ Exact(vec![LargeUtf8, Int64, Utf8]),
+ Exact(vec![Utf8, Int64, LargeUtf8]),
+ Exact(vec![LargeUtf8, Int64, LargeUtf8]),
+ ],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for LPadFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "lpad"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_str_type(&arg_types[0], "lpad")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args),
+ DataType::LargeUtf8 => make_scalar_function(lpad::<i64>,
vec![])(args),
+ other => exec_err!("Unsupported data type {other:?} for function
lpad"),
+ }
+ }
+}
+
+/// Extends the string to length 'length' by prepending the characters fill (a
space by default). If the string is already longer than length then it is
truncated (on the right).
+/// lpad('hi', 5, 'xy') = 'xyxhi'
+pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args.len() {
+ 2 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let length_array = as_int64_array(&args[1])?;
+
+ let result = string_array
+ .iter()
+ .zip(length_array.iter())
+ .map(|(string, length)| match (string, length) {
+ (Some(string), Some(length)) => {
+ if length > i32::MAX as i64 {
+ return exec_err!(
+ "lpad requested length {length} too large"
+ );
+ }
+
+ let length = if length < 0 { 0 } else { length as
usize };
+ if length == 0 {
+ Ok(Some("".to_string()))
+ } else {
+ let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
+ if length < graphemes.len() {
+ Ok(Some(graphemes[..length].concat()))
+ } else {
+ let mut s: String = " ".repeat(length -
graphemes.len());
+ s.push_str(string);
+ Ok(Some(s))
+ }
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<T>>>()?;
+
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ 3 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let length_array = as_int64_array(&args[1])?;
+ let fill_array = as_generic_string_array::<T>(&args[2])?;
+
+ let result = string_array
+ .iter()
+ .zip(length_array.iter())
+ .zip(fill_array.iter())
+ .map(|((string, length), fill)| match (string, length, fill) {
+ (Some(string), Some(length), Some(fill)) => {
+ if length > i32::MAX as i64 {
+ return exec_err!(
+ "lpad requested length {length} too large"
+ );
+ }
+
+ let length = if length < 0 { 0 } else { length as
usize };
+ if length == 0 {
+ Ok(Some("".to_string()))
+ } else {
+ let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
+ let fill_chars =
fill.chars().collect::<Vec<char>>();
+
+ if length < graphemes.len() {
+ Ok(Some(graphemes[..length].concat()))
+ } else if fill_chars.is_empty() {
+ Ok(Some(string.to_string()))
+ } else {
+ let mut s = string.to_string();
+ let mut char_vector =
+ Vec::<char>::with_capacity(length -
graphemes.len());
+ for l in 0..length - graphemes.len() {
+ char_vector.push(
+ *fill_chars.get(l %
fill_chars.len()).unwrap(),
+ );
+ }
+ s.insert_str(
+ 0,
+
char_vector.iter().collect::<String>().as_str(),
+ );
+ Ok(Some(s))
+ }
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<T>>>()?;
+
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ other => exec_err!(
+ "lpad was called with {other} arguments. It requires at least 2
and at most 3."
+ ),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::unicode::lpad::LPadFunc;
+ use crate::utils::test::test_function;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("josé")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some(" josé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(Some("   hi")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(0i64)),
+ ],
+ Ok(Some("")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::from("xy")),
+ ],
+ Ok(Some("xyxhi")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(21i64)),
+ ColumnarValue::Scalar(ScalarValue::from("abcdef")),
+ ],
+ Ok(Some("abcdefabcdefabcdefahi")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::from(" ")),
+ ],
+ Ok(Some("   hi")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::from("")),
+ ],
+ Ok(Some("hi")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::from("xy")),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::Int64(None)),
+ ColumnarValue::Scalar(ScalarValue::from("xy")),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("hi")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+ ],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("josé")),
+ ColumnarValue::Scalar(ScalarValue::from(10i64)),
+ ColumnarValue::Scalar(ScalarValue::from("xy")),
+ ],
+ Ok(Some("xyxyxyjosé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("josé")),
+ ColumnarValue::Scalar(ScalarValue::from(10i64)),
+ ColumnarValue::Scalar(ScalarValue::from("éñ")),
+ ],
+ Ok(Some("éñéñéñjosé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ #[cfg(not(feature = "unicode_expressions"))]
+ test_function!(
+ LPadFunc::new(),
+ &[
+ ColumnarValue::Scalar(ScalarValue::from("josé")),
+ ColumnarValue::Scalar(ScalarValue::from(5i64)),
+ ],
+ internal_err!(
+ "function lpad requires compilation with feature flag:
unicode_expressions."
+ ),
+ &str,
+ Utf8,
+ StringArray
+ );
+ Ok(())
+ }
+}
diff --git a/datafusion/functions/src/unicode/mod.rs
b/datafusion/functions/src/unicode/mod.rs
index 291de38439..ea4e70a921 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -22,6 +22,11 @@ use std::sync::Arc;
use datafusion_expr::ScalarUDF;
mod character_length;
+mod left;
+mod lpad;
+mod reverse;
+mod right;
+mod rpad;
// create UDFs
make_udf_function!(
@@ -29,6 +34,11 @@ make_udf_function!(
CHARACTER_LENGTH,
character_length
);
+make_udf_function!(left::LeftFunc, LEFT, left);
+make_udf_function!(lpad::LPadFunc, LPAD, lpad);
+make_udf_function!(right::RightFunc, RIGHT, right);
+make_udf_function!(reverse::ReverseFunc, REVERSE, reverse);
+make_udf_function!(rpad::RPadFunc, RPAD, rpad);
pub mod expr_fn {
use datafusion_expr::Expr;
@@ -47,9 +57,41 @@ pub mod expr_fn {
pub fn length(string: Expr) -> Expr {
character_length(string)
}
+
+ #[doc = "returns the first `n` characters in the `string`"]
+ pub fn left(string: Expr, n: Expr) -> Expr {
+ super::left().call(vec![string, n])
+ }
+
+ #[doc = "fill up a string to the length by prepending the characters"]
+ pub fn lpad(args: Vec<Expr>) -> Expr {
+ super::lpad().call(args)
+ }
+
+ #[doc = "reverses the `string`"]
+ pub fn reverse(string: Expr) -> Expr {
+ super::reverse().call(vec![string])
+ }
+
+ #[doc = "returns the last `n` characters in the `string`"]
+ pub fn right(string: Expr, n: Expr) -> Expr {
+ super::right().call(vec![string, n])
+ }
+
+ #[doc = "fill up a string to the length by appending the characters"]
+ pub fn rpad(args: Vec<Expr>) -> Expr {
+ super::rpad().call(args)
+ }
}
/// Return a list of all functions in this package
pub fn functions() -> Vec<Arc<ScalarUDF>> {
- vec![character_length()]
+ vec![
+ character_length(),
+ left(),
+ lpad(),
+ reverse(),
+ right(),
+ rpad(),
+ ]
}
diff --git a/datafusion/functions/src/unicode/reverse.rs
b/datafusion/functions/src/unicode/reverse.rs
new file mode 100644
index 0000000000..42ca6e0d17
--- /dev/null
+++ b/datafusion/functions/src/unicode/reverse.rs
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+#[derive(Debug)]
+pub(super) struct ReverseFunc {
+ signature: Signature,
+}
+
+impl ReverseFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::uniform(
+ 1,
+ vec![Utf8, LargeUtf8],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for ReverseFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "reverse"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_str_type(&arg_types[0], "reverse")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => make_scalar_function(reverse::<i32>,
vec![])(args),
+ DataType::LargeUtf8 => make_scalar_function(reverse::<i64>,
vec![])(args),
+ other => {
+ exec_err!("Unsupported data type {other:?} for function
reverse")
+ }
+ }
+ }
+}
+
+/// Reverses the order of the characters in the string.
+/// reverse('abcde') = 'edcba'
+/// The implementation uses UTF-8 code points as characters
+pub fn reverse<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+
+ let result = string_array
+ .iter()
+ .map(|string| string.map(|string: &str|
string.chars().rev().collect::<String>()))
+ .collect::<GenericStringArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::{Array, StringArray};
+ use arrow::datatypes::DataType::Utf8;
+
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use crate::unicode::reverse::ReverseFunc;
+ use crate::utils::test::test_function;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ test_function!(
+ ReverseFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::from("abcde"))],
+ Ok(Some("edcba")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ ReverseFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))],
+ Ok(Some("sk̈wol")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ ReverseFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::from("loẅks"))],
+ Ok(Some("sk̈wol")),
+ &str,
+ Utf8,
+ StringArray
+ );
+ test_function!(
+ ReverseFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(None))],
+ Ok(None),
+ &str,
+ Utf8,
+ StringArray
+ );
+ #[cfg(not(feature = "unicode_expressions"))]
+ test_function!(
+ ReverseFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::from("abcde"))],
+ internal_err!(
+ "function reverse requires compilation with feature flag:
unicode_expressions."
+ ),
+ &str,
+ Utf8,
+ StringArray
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/functions/src/unicode/right.rs
b/datafusion/functions/src/unicode/right.rs
new file mode 100644
index 0000000000..d1bd976342
--- /dev/null
+++ b/datafusion/functions/src/unicode/right.rs
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::cmp::{max, Ordering};
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use datafusion_common::exec_err;
+use datafusion_common::Result;
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+
+/// Scalar UDF implementation registered under the name `right`.
+#[derive(Debug)]
+pub(super) struct RightFunc {
+    signature: Signature,
+}
+
+impl RightFunc {
+    /// Creates the function with its two accepted argument lists:
+    /// `(Utf8, Int64)` and `(LargeUtf8, Int64)`.
+    pub fn new() -> Self {
+        use DataType::*;
+        let accepted = vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])];
+        let signature = Signature::one_of(accepted, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for RightFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "right"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The result string type is derived from the type of the first argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_str_type(&arg_types[0], "right")
+    }
+
+    /// Dispatches to `right::<i32>` for `Utf8` input and `right::<i64>` for
+    /// `LargeUtf8` input; any other first-argument type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(right::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(right::<i64>, vec![])(args),
+            // Keep the message on one line: a break inside the literal would
+            // become part of the error text.
+            other => exec_err!("Unsupported data type {other:?} for function right"),
+        }
+    }
+}
+
+/// Returns last n characters in the string, or when n is negative, returns
+/// all but first |n| characters.
+/// right('abcde', 2) = 'de'
+/// The implementation uses UTF-8 code points as characters
+///
+/// `args[0]` is the string array (offset type `T`) and `args[1]` the Int64
+/// array of `n` values. A row is null when either input is null.
+pub fn right<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let string_array = as_generic_string_array::<T>(&args[0])?;
+    let n_array = as_int64_array(&args[1])?;
+
+    let result = string_array
+        .iter()
+        .zip(n_array.iter())
+        .map(|(string, n)| match (string, n) {
+            (Some(string), Some(n)) => {
+                // Number of leading characters to drop.
+                let skip = match n.cmp(&0) {
+                    // Saturate rather than truncate where `usize` is narrower
+                    // than 64 bits.
+                    Ordering::Less => {
+                        usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX)
+                    }
+                    Ordering::Equal => return Some(""),
+                    Ordering::Greater => {
+                        max(string.chars().count() as i64 - n, 0) as usize
+                    }
+                };
+                // Byte offset of the first kept character; the whole string is
+                // skipped when it has fewer than `skip` characters.
+                let start = string
+                    .char_indices()
+                    .nth(skip)
+                    .map_or(string.len(), |(offset, _)| offset);
+                // Borrow the suffix instead of building a new `String`.
+                Some(&string[start..])
+            }
+            _ => None,
+        })
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::right::RightFunc;
+    use crate::utils::test::test_function;
+
+    /// Exercises `RightFunc` on scalar inputs: positive, negative, zero and
+    /// oversized `n`, null arguments, and multi-byte characters.
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("de")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // `n` larger than the string keeps the whole string.
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("abcde")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Negative `n` drops the first |n| characters.
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(-2i64)),
+            ],
+            Ok(Some("cde")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(-200i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // A null in either argument yields null.
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Counting is per character, not per byte.
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("éésoj")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RightFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+            ],
+            Ok(Some("éésoj")),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions/src/unicode/rpad.rs
b/datafusion/functions/src/unicode/rpad.rs
new file mode 100644
index 0000000000..070278c90b
--- /dev/null
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -0,0 +1,361 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{as_generic_string_array, as_int64_array};
+use unicode_segmentation::UnicodeSegmentation;
+
+use crate::utils::{make_scalar_function, utf8_to_str_type};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+
+/// Scalar UDF implementation registered under the name `rpad`.
+#[derive(Debug)]
+pub(super) struct RPadFunc {
+    signature: Signature,
+}
+
+impl RPadFunc {
+    /// Creates the function. It accepts a `Utf8`/`LargeUtf8` string and an
+    /// `Int64` length, optionally followed by a `Utf8`/`LargeUtf8` fill string.
+    pub fn new() -> Self {
+        use DataType::*;
+        let accepted = vec![
+            Exact(vec![Utf8, Int64]),
+            Exact(vec![LargeUtf8, Int64]),
+            Exact(vec![Utf8, Int64, Utf8]),
+            Exact(vec![LargeUtf8, Int64, Utf8]),
+            Exact(vec![Utf8, Int64, LargeUtf8]),
+            Exact(vec![LargeUtf8, Int64, LargeUtf8]),
+        ];
+        let signature = Signature::one_of(accepted, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl ScalarUDFImpl for RPadFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "rpad"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The result string type is derived from the type of the first argument.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_str_type(&arg_types[0], "rpad")
+    }
+
+    /// Dispatches to `rpad::<i32>` for `Utf8` input and `rpad::<i64>` for
+    /// `LargeUtf8` input; any other first-argument type is an execution error.
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(rpad::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(rpad::<i64>, vec![])(args),
+            // Keep the message on one line: a break inside the literal would
+            // become part of the error text.
+            other => exec_err!("Unsupported data type {other:?} for function rpad"),
+        }
+    }
+}
+
+/// Extends the string to length 'length' by appending the characters fill (a
space by default). If the string is already longer than length then it is
truncated.
+/// rpad('hi', 5, 'xy') = 'hixyx'
+pub fn rpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args.len() {
+ 2 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let length_array = as_int64_array(&args[1])?;
+
+ let result = string_array
+ .iter()
+ .zip(length_array.iter())
+ .map(|(string, length)| match (string, length) {
+ (Some(string), Some(length)) => {
+ if length > i32::MAX as i64 {
+ return exec_err!(
+ "rpad requested length {length} too large"
+ );
+ }
+
+ let length = if length < 0 { 0 } else { length as
usize };
+ if length == 0 {
+ Ok(Some("".to_string()))
+ } else {
+ let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
+ if length < graphemes.len() {
+ Ok(Some(graphemes[..length].concat()))
+ } else {
+ let mut s = string.to_string();
+ s.push_str(" ".repeat(length -
graphemes.len()).as_str());
+ Ok(Some(s))
+ }
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<T>>>()?;
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ 3 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+ let length_array = as_int64_array(&args[1])?;
+ let fill_array = as_generic_string_array::<T>(&args[2])?;
+
+ let result = string_array
+ .iter()
+ .zip(length_array.iter())
+ .zip(fill_array.iter())
+ .map(|((string, length), fill)| match (string, length, fill) {
+ (Some(string), Some(length), Some(fill)) => {
+ if length > i32::MAX as i64 {
+ return exec_err!(
+ "rpad requested length {length} too large"
+ );
+ }
+
+ let length = if length < 0 { 0 } else { length as
usize };
+ let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
+ let fill_chars = fill.chars().collect::<Vec<char>>();
+
+ if length < graphemes.len() {
+ Ok(Some(graphemes[..length].concat()))
+ } else if fill_chars.is_empty() {
+ Ok(Some(string.to_string()))
+ } else {
+ let mut s = string.to_string();
+ let mut char_vector =
+ Vec::<char>::with_capacity(length -
graphemes.len());
+ for l in 0..length - graphemes.len() {
+ char_vector
+ .push(*fill_chars.get(l %
fill_chars.len()).unwrap());
+ }
+
s.push_str(char_vector.iter().collect::<String>().as_str());
+ Ok(Some(s))
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<T>>>()?;
+
+ Ok(Arc::new(result) as ArrayRef)
+ }
+ other => exec_err!(
+ "rpad was called with {other} arguments. It requires at least 2
and at most 3."
+ ),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::DataType::Utf8;
+
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use crate::unicode::rpad::RPadFunc;
+    use crate::utils::test::test_function;
+
+    /// Exercises `RPadFunc` on scalar inputs: default and explicit fill,
+    /// zero length, empty fill, null arguments, and multi-byte characters.
+    #[test]
+    fn test_functions() -> Result<()> {
+        // Two-argument form pads with spaces.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("josé")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("josé ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(Some("hi   ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(0i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // A null in either argument yields null.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Three-argument form repeats the fill characters in order.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::from("xy")),
+            ],
+            Ok(Some("hixyx")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(21i64)),
+                ColumnarValue::Scalar(ScalarValue::from("abcdef")),
+            ],
+            Ok(Some("hiabcdefabcdefabcdefa")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::from(" ")),
+            ],
+            Ok(Some("hi   ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // An empty fill leaves a short string unchanged.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::from("")),
+            ],
+            Ok(Some("hi")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // A null in any of the three arguments yields null.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::from("xy")),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::Int64(None)),
+                ColumnarValue::Scalar(ScalarValue::from("xy")),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(5i64)),
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+            ],
+            Ok(None),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Lengths are counted in characters, not bytes.
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("josé")),
+                ColumnarValue::Scalar(ScalarValue::from(10i64)),
+                ColumnarValue::Scalar(ScalarValue::from("xy")),
+            ],
+            Ok(Some("joséxyxyxy")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("josé")),
+                ColumnarValue::Scalar(ScalarValue::from(10i64)),
+                ColumnarValue::Scalar(ScalarValue::from("éñ")),
+            ],
+            Ok(Some("josééñéñéñ")),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index 9adc853634..c1b4900e39 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -270,67 +270,6 @@ pub fn create_physical_fun(
exec_err!("Unsupported data type {other:?} for function
initcap")
}
}),
- BuiltinScalarFunction::Left => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(left,
i32, "left");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(left,
i64, "left");
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
left"),
- }),
- BuiltinScalarFunction::Lpad => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(lpad,
i32, "lpad");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(lpad,
i64, "lpad");
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
lpad"),
- }),
- BuiltinScalarFunction::Reverse => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(reverse, i32,
"reverse");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(reverse, i64,
"reverse");
- make_scalar_function_inner(func)(args)
- }
- other => {
- exec_err!("Unsupported data type {other:?} for function
reverse")
- }
- }),
- BuiltinScalarFunction::Right => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(right, i32,
"right");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func =
- invoke_if_unicode_expressions_feature_flag!(right, i64,
"right");
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
right"),
- }),
- BuiltinScalarFunction::Rpad => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(rpad,
i32, "rpad");
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(rpad,
i64, "rpad");
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!("Unsupported data type {other:?} for function
rpad"),
- }),
BuiltinScalarFunction::EndsWith => Arc::new(|args| match
args[0].data_type() {
DataType::Utf8 => {
make_scalar_function_inner(string_expressions::ends_with::<i32>)(args)
@@ -691,551 +630,6 @@ mod tests {
Utf8,
StringArray
);
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),],
- Ok(Some("ab")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),],
- Ok(Some("abcde")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),],
- Ok(Some("abc")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(2))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("abcde"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some("joséé")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Left,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),],
- Ok(Some("joséé")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Left,
- &[
- lit("abcde"),
- lit(ScalarValue::Int8(Some(2))),
- ],
- internal_err!(
- "function left requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some(" josé")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some(" hi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(0))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(5))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),],
- Ok(Some("xyxhi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),],
- Ok(Some("abcdefabcdefabcdefahi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),],
- Ok(Some(" hi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),],
- Ok(Some("hi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(5))),
- lit("xy"),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("hi"), lit(ScalarValue::Int64(None)), lit("xy"),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[
- lit("hi"),
- lit(ScalarValue::Int64(Some(5))),
- lit(ScalarValue::Utf8(None)),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),],
- Ok(Some("xyxyxyjosé")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Lpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),],
- Ok(Some("éñéñéñjosé")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Lpad,
- &[
- lit("josé"),
- lit(ScalarValue::Int64(Some(5))),
- ],
- internal_err!(
- "function lpad requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Reverse,
- &[lit("abcde")],
- Ok(Some("edcba")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Reverse,
- &[lit("loẅks")],
- Ok(Some("sk̈wol")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Reverse,
- &[lit("loẅks")],
- Ok(Some("sk̈wol")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Reverse,
- &[lit(ScalarValue::Utf8(None))],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Reverse,
- &[lit("abcde")],
- internal_err!(
- "function reverse requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int8(Some(2))),],
- Ok(Some("de")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(200))),],
- Ok(Some("abcde")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(-2))),],
- Ok(Some("cde")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(-200))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int64(Some(0))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(2))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("abcde"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some("éésoj")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Right,
- &[lit("joséésoj"), lit(ScalarValue::Int64(Some(-3))),],
- Ok(Some("éésoj")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Right,
- &[
- lit("abcde"),
- lit(ScalarValue::Int8(Some(2))),
- ],
- internal_err!(
- "function right requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some("josé ")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))),],
- Ok(Some("hi ")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(0))),],
- Ok(Some("")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(None)),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(5))),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit("xy"),],
- Ok(Some("hixyx")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(21))), lit("abcdef"),],
- Ok(Some("hiabcdefabcdefabcdefa")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(" "),],
- Ok(Some("hi ")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(Some(5))), lit(""),],
- Ok(Some("hi")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[
- lit(ScalarValue::Utf8(None)),
- lit(ScalarValue::Int64(Some(5))),
- lit("xy"),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("hi"), lit(ScalarValue::Int64(None)), lit("xy"),],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[
- lit("hi"),
- lit(ScalarValue::Int64(Some(5))),
- lit(ScalarValue::Utf8(None)),
- ],
- Ok(None),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("xy"),],
- Ok(Some("joséxyxyxy")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- Rpad,
- &[lit("josé"), lit(ScalarValue::Int64(Some(10))), lit("éñ"),],
- Ok(Some("josééñéñéñ")),
- &str,
- Utf8,
- StringArray
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- Rpad,
- &[
- lit("josé"),
- lit(ScalarValue::Int64(Some(5))),
- ],
- internal_err!(
- "function rpad requires compilation with feature flag:
unicode_expressions."
- ),
- &str,
- Utf8,
- StringArray
- );
test_function!(
EndsWith,
&[lit("alphabet"), lit("alph"),],
diff --git a/datafusion/physical-expr/src/planner.rs
b/datafusion/physical-expr/src/planner.rs
index 319d9ca226..0dbea09ffb 100644
--- a/datafusion/physical-expr/src/planner.rs
+++ b/datafusion/physical-expr/src/planner.rs
@@ -335,11 +335,11 @@ mod tests {
use arrow_array::{ArrayRef, BooleanArray, RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};
- use datafusion_expr::{col, left, Literal};
+ use datafusion_expr::{col, lit};
#[test]
fn test_create_physical_expr_scalar_input_output() -> Result<()> {
- let expr = col("letter").eq(left("APACHE".lit(), 1i64.lit()));
+ let expr = col("letter").eq(lit("A"));
let schema = Schema::new(vec![Field::new("letter", DataType::Utf8,
false)]);
let df_schema = DFSchema::try_from_qualified_schema("data", &schema)?;
diff --git a/datafusion/physical-expr/src/unicode_expressions.rs
b/datafusion/physical-expr/src/unicode_expressions.rs
index c7e4b7d7c4..faff21111a 100644
--- a/datafusion/physical-expr/src/unicode_expressions.rs
+++ b/datafusion/physical-expr/src/unicode_expressions.rs
@@ -21,7 +21,7 @@
//! Unicode expressions
-use std::cmp::{max, Ordering};
+use std::cmp::max;
use std::sync::Arc;
use arrow::{
@@ -36,267 +36,6 @@ use datafusion_common::{
exec_err, Result,
};
-/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
-/// left('abcde', 2) = 'ab'
-/// The implementation uses UTF-8 code points as characters
-pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let n_array = as_int64_array(&args[1])?;
- let result = string_array
- .iter()
- .zip(n_array.iter())
- .map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- let len = string.chars().count() as i64;
- Some(if n.abs() < len {
- string.chars().take((len + n) as
usize).collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right).
-/// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args.len() {
- 2 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let length_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(length_array.iter())
- .map(|(string, length)| match (string, length) {
- (Some(string), Some(length)) => {
- if length > i32::MAX as i64 {
- return exec_err!(
- "lpad requested length {length} too large"
- );
- }
-
- let length = if length < 0 { 0 } else { length as
usize };
- if length == 0 {
- Ok(Some("".to_string()))
- } else {
- let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
- if length < graphemes.len() {
- Ok(Some(graphemes[..length].concat()))
- } else {
- let mut s: String = " ".repeat(length -
graphemes.len());
- s.push_str(string);
- Ok(Some(s))
- }
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
- }
- 3 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let length_array = as_int64_array(&args[1])?;
- let fill_array = as_generic_string_array::<T>(&args[2])?;
-
- let result = string_array
- .iter()
- .zip(length_array.iter())
- .zip(fill_array.iter())
- .map(|((string, length), fill)| match (string, length, fill) {
- (Some(string), Some(length), Some(fill)) => {
- if length > i32::MAX as i64 {
- return exec_err!(
- "lpad requested length {length} too large"
- );
- }
-
- let length = if length < 0 { 0 } else { length as
usize };
- if length == 0 {
- Ok(Some("".to_string()))
- } else {
- let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
- let fill_chars =
fill.chars().collect::<Vec<char>>();
-
- if length < graphemes.len() {
- Ok(Some(graphemes[..length].concat()))
- } else if fill_chars.is_empty() {
- Ok(Some(string.to_string()))
- } else {
- let mut s = string.to_string();
- let mut char_vector =
- Vec::<char>::with_capacity(length -
graphemes.len());
- for l in 0..length - graphemes.len() {
- char_vector.push(
- *fill_chars.get(l %
fill_chars.len()).unwrap(),
- );
- }
- s.insert_str(
- 0,
-
char_vector.iter().collect::<String>().as_str(),
- );
- Ok(Some(s))
- }
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
- }
- other => exec_err!(
- "lpad was called with {other} arguments. It requires at least 2
and at most 3."
- ),
- }
-}
-
-/// Reverses the order of the characters in the string.
-/// reverse('abcde') = 'edcba'
-/// The implementation uses UTF-8 code points as characters
-pub fn reverse<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
-
- let result = string_array
- .iter()
- .map(|string| string.map(|string: &str|
string.chars().rev().collect::<String>()))
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters.
-/// right('abcde', 2) = 'de'
-/// The implementation uses UTF-8 code points as characters
-pub fn right<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let n_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(n_array.iter())
- .map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => Some(
- string
- .chars()
- .skip(n.unsigned_abs() as usize)
- .collect::<String>(),
- ),
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => Some(
- string
- .chars()
- .skip(max(string.chars().count() as i64 - n, 0) as
usize)
- .collect::<String>(),
- ),
- },
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated.
-/// rpad('hi', 5, 'xy') = 'hixyx'
-pub fn rpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args.len() {
- 2 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let length_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(length_array.iter())
- .map(|(string, length)| match (string, length) {
- (Some(string), Some(length)) => {
- if length > i32::MAX as i64 {
- return exec_err!(
- "rpad requested length {length} too large"
- );
- }
-
- let length = if length < 0 { 0 } else { length as
usize };
- if length == 0 {
- Ok(Some("".to_string()))
- } else {
- let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
- if length < graphemes.len() {
- Ok(Some(graphemes[..length].concat()))
- } else {
- let mut s = string.to_string();
- s.push_str(" ".repeat(length -
graphemes.len()).as_str());
- Ok(Some(s))
- }
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
- Ok(Arc::new(result) as ArrayRef)
- }
- 3 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let length_array = as_int64_array(&args[1])?;
- let fill_array = as_generic_string_array::<T>(&args[2])?;
-
- let result = string_array
- .iter()
- .zip(length_array.iter())
- .zip(fill_array.iter())
- .map(|((string, length), fill)| match (string, length, fill) {
- (Some(string), Some(length), Some(fill)) => {
- if length > i32::MAX as i64 {
- return exec_err!(
- "rpad requested length {length} too large"
- );
- }
-
- let length = if length < 0 { 0 } else { length as
usize };
- let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
- let fill_chars = fill.chars().collect::<Vec<char>>();
-
- if length < graphemes.len() {
- Ok(Some(graphemes[..length].concat()))
- } else if fill_chars.is_empty() {
- Ok(Some(string.to_string()))
- } else {
- let mut s = string.to_string();
- let mut char_vector =
- Vec::<char>::with_capacity(length -
graphemes.len());
- for l in 0..length - graphemes.len() {
- char_vector
- .push(*fill_chars.get(l %
fill_chars.len()).unwrap());
- }
-
s.push_str(char_vector.iter().collect::<String>().as_str());
- Ok(Some(s))
- }
- }
- _ => Ok(None),
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
- }
- other => exec_err!(
- "rpad was called with {other} arguments. It requires at least 2
and at most 3."
- ),
- }
-}
-
/// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)
/// strpos('high', 'ig') = 2
/// The implementation uses UTF-8 code points as characters
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index 766ca6633e..6319372d98 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -572,8 +572,8 @@ enum ScalarFunction {
// 28 was DatePart
// 29 was DateTrunc
InitCap = 30;
- Left = 31;
- Lpad = 32;
+ // 31 was Left
+ // 32 was Lpad
// 33 was Lower
// 34 was Ltrim
// 35 was MD5
@@ -583,9 +583,9 @@ enum ScalarFunction {
// 39 was RegexpReplace
// 40 was Repeat
// 41 was Replace
- Reverse = 42;
- Right = 43;
- Rpad = 44;
+ // 42 was Reverse
+ // 43 was Right
+ // 44 was Rpad
// 45 was Rtrim
// 46 was SHA224
// 47 was SHA256
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index f2814956ef..7281bc9dc2 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22931,12 +22931,7 @@ impl serde::Serialize for ScalarFunction {
Self::Concat => "Concat",
Self::ConcatWithSeparator => "ConcatWithSeparator",
Self::InitCap => "InitCap",
- Self::Left => "Left",
- Self::Lpad => "Lpad",
Self::Random => "Random",
- Self::Reverse => "Reverse",
- Self::Right => "Right",
- Self::Rpad => "Rpad",
Self::Strpos => "Strpos",
Self::Substr => "Substr",
Self::Translate => "Translate",
@@ -22990,12 +22985,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Concat",
"ConcatWithSeparator",
"InitCap",
- "Left",
- "Lpad",
"Random",
- "Reverse",
- "Right",
- "Rpad",
"Strpos",
"Substr",
"Translate",
@@ -23078,12 +23068,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Concat" => Ok(ScalarFunction::Concat),
"ConcatWithSeparator" =>
Ok(ScalarFunction::ConcatWithSeparator),
"InitCap" => Ok(ScalarFunction::InitCap),
- "Left" => Ok(ScalarFunction::Left),
- "Lpad" => Ok(ScalarFunction::Lpad),
"Random" => Ok(ScalarFunction::Random),
- "Reverse" => Ok(ScalarFunction::Reverse),
- "Right" => Ok(ScalarFunction::Right),
- "Rpad" => Ok(ScalarFunction::Rpad),
"Strpos" => Ok(ScalarFunction::Strpos),
"Substr" => Ok(ScalarFunction::Substr),
"Translate" => Ok(ScalarFunction::Translate),
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index ecc94fcdaf..2fe89efb9c 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2871,8 +2871,8 @@ pub enum ScalarFunction {
/// 28 was DatePart
/// 29 was DateTrunc
InitCap = 30,
- Left = 31,
- Lpad = 32,
+ /// 31 was Left
+ /// 32 was Lpad
/// 33 was Lower
/// 34 was Ltrim
/// 35 was MD5
@@ -2882,9 +2882,9 @@ pub enum ScalarFunction {
/// 39 was RegexpReplace
/// 40 was Repeat
/// 41 was Replace
- Reverse = 42,
- Right = 43,
- Rpad = 44,
+ /// 42 was Reverse
+ /// 43 was Right
+ /// 44 was Rpad
/// 45 was Rtrim
/// 46 was SHA224
/// 47 was SHA256
@@ -3004,12 +3004,7 @@ impl ScalarFunction {
ScalarFunction::Concat => "Concat",
ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator",
ScalarFunction::InitCap => "InitCap",
- ScalarFunction::Left => "Left",
- ScalarFunction::Lpad => "Lpad",
ScalarFunction::Random => "Random",
- ScalarFunction::Reverse => "Reverse",
- ScalarFunction::Right => "Right",
- ScalarFunction::Rpad => "Rpad",
ScalarFunction::Strpos => "Strpos",
ScalarFunction::Substr => "Substr",
ScalarFunction::Translate => "Translate",
@@ -3057,12 +3052,7 @@ impl ScalarFunction {
"Concat" => Some(Self::Concat),
"ConcatWithSeparator" => Some(Self::ConcatWithSeparator),
"InitCap" => Some(Self::InitCap),
- "Left" => Some(Self::Left),
- "Lpad" => Some(Self::Lpad),
"Random" => Some(Self::Random),
- "Reverse" => Some(Self::Reverse),
- "Right" => Some(Self::Right),
- "Rpad" => Some(Self::Rpad),
"Strpos" => Some(Self::Strpos),
"Substr" => Some(Self::Substr),
"Translate" => Some(Self::Translate),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs
b/datafusion/proto/src/logical_plan/from_proto.rs
index 19edd71a3a..2c6f2e479b 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -17,18 +17,6 @@
use std::sync::Arc;
-use crate::protobuf::{
- self,
- plan_type::PlanTypeEnum::{
- AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan,
- FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan,
- InitialPhysicalPlan, InitialPhysicalPlanWithStats,
OptimizedLogicalPlan,
- OptimizedPhysicalPlan,
- },
- AnalyzedLogicalPlanType, CubeNode, GroupingSetNode,
OptimizedLogicalPlanType,
- OptimizedPhysicalPlanType, PlaceholderNode, RollupNode,
-};
-
use arrow::{
array::AsArray,
buffer::Buffer,
@@ -38,6 +26,7 @@ use arrow::{
},
ipc::{reader::read_record_batch, root_as_message},
};
+
use datafusion::execution::registry::FunctionRegistry;
use datafusion_common::{
arrow_datafusion_err, internal_err, plan_datafusion_err, Column,
Constraint,
@@ -51,17 +40,29 @@ use datafusion_expr::{
acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr,
concat_ws_expr,
cos, cosh, cot, degrees, ends_with, exp,
expr::{self, InList, Sort, WindowFunction},
- factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log,
log10, log2,
+ factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10,
log2,
logical_plan::{PlanType, StringifiedPlan},
- lpad, nanvl, pi, power, radians, random, reverse, right, round, rpad,
signum, sin,
- sinh, sqrt, strpos, substr, substr_index, substring, translate, trunc,
- AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction,
BuiltinScalarFunction,
- Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet,
+ nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, strpos,
substr,
+ substr_index, substring, translate, trunc, AggregateFunction, Between,
BinaryExpr,
+ BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
GetFieldAccess,
+ GetIndexedField, GroupingSet,
GroupingSet::GroupingSets,
JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame,
WindowFrameBound,
WindowFrameUnits,
};
+use crate::protobuf::{
+ self,
+ plan_type::PlanTypeEnum::{
+ AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan,
+ FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan,
+ InitialPhysicalPlan, InitialPhysicalPlanWithStats,
OptimizedLogicalPlan,
+ OptimizedPhysicalPlan,
+ },
+ AnalyzedLogicalPlanType, CubeNode, GroupingSetNode,
OptimizedLogicalPlanType,
+ OptimizedPhysicalPlanType, PlaceholderNode, RollupNode,
+};
+
use super::LogicalExtensionCodec;
#[derive(Debug)]
@@ -453,12 +454,7 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator,
ScalarFunction::EndsWith => Self::EndsWith,
ScalarFunction::InitCap => Self::InitCap,
- ScalarFunction::Left => Self::Left,
- ScalarFunction::Lpad => Self::Lpad,
ScalarFunction::Random => Self::Random,
- ScalarFunction::Reverse => Self::Reverse,
- ScalarFunction::Right => Self::Right,
- ScalarFunction::Rpad => Self::Rpad,
ScalarFunction::Strpos => Self::Strpos,
ScalarFunction::Substr => Self::Substr,
ScalarFunction::Translate => Self::Translate,
@@ -1382,26 +1378,13 @@ pub fn parse_expr(
parse_expr(&args[0], registry, codec)?,
parse_expr(&args[1], registry, codec)?,
)),
- ScalarFunction::Left => Ok(left(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- )),
ScalarFunction::Random => Ok(random()),
- ScalarFunction::Reverse => {
- Ok(reverse(parse_expr(&args[0], registry, codec)?))
- }
- ScalarFunction::Right => Ok(right(
- parse_expr(&args[0], registry, codec)?,
- parse_expr(&args[1], registry, codec)?,
- )),
ScalarFunction::Concat => {
Ok(concat_expr(parse_exprs(args, registry, codec)?))
}
ScalarFunction::ConcatWithSeparator => {
Ok(concat_ws_expr(parse_exprs(args, registry, codec)?))
}
- ScalarFunction::Lpad => Ok(lpad(parse_exprs(args, registry,
codec)?)),
- ScalarFunction::Rpad => Ok(rpad(parse_exprs(args, registry,
codec)?)),
ScalarFunction::EndsWith => Ok(ends_with(
parse_expr(&args[0], registry, codec)?,
parse_expr(&args[1], registry, codec)?,
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs
b/datafusion/proto/src/logical_plan/to_proto.rs
index 11fc7362c7..ea682a5a22 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1445,12 +1445,7 @@ impl TryFrom<&BuiltinScalarFunction> for
protobuf::ScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator =>
Self::ConcatWithSeparator,
BuiltinScalarFunction::EndsWith => Self::EndsWith,
BuiltinScalarFunction::InitCap => Self::InitCap,
- BuiltinScalarFunction::Left => Self::Left,
- BuiltinScalarFunction::Lpad => Self::Lpad,
BuiltinScalarFunction::Random => Self::Random,
- BuiltinScalarFunction::Reverse => Self::Reverse,
- BuiltinScalarFunction::Right => Self::Right,
- BuiltinScalarFunction::Rpad => Self::Rpad,
BuiltinScalarFunction::Strpos => Self::Strpos,
BuiltinScalarFunction::Substr => Self::Substr,
BuiltinScalarFunction::Translate => Self::Translate,