This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 2cca3710f3 Create unicode module in datafusion/functions/src/unicode 
and unicode_expressions feature flag, move char_length function (#9825)
2cca3710f3 is described below

commit 2cca3710f3b31148ffe99d9e225c768c921a748b
Author: Bruce Ritchie <[email protected]>
AuthorDate: Thu Mar 28 13:02:27 2024 -0400

    Create unicode module in datafusion/functions/src/unicode and 
unicode_expressions feature flag, move char_length function (#9825)
    
    * Fix to_timestamp benchmark
    
    * Remove references to simd and nightly builds: simd is no longer an 
available feature in DataFusion, and building with nightly may not be a good 
recommendation when getting started.
    
    * Fix missing trim() function.
    
    * Create unicode module in datafusion/functions/src/unicode and 
unicode_expressions feature flag, move char_length function
---
 datafusion-cli/Cargo.lock                          |   1 +
 datafusion/core/Cargo.toml                         |   1 +
 .../core/tests/dataframe/dataframe_functions.rs    |   1 +
 datafusion/expr/src/built_in_function.rs           |  14 +-
 datafusion/expr/src/expr_fn.rs                     |   8 -
 datafusion/functions/Cargo.toml                    |   4 +
 datafusion/functions/src/lib.rs                    |   9 ++
 datafusion/functions/src/string/ascii.rs           |   2 +-
 datafusion/functions/src/string/bit_length.rs      |   4 +-
 datafusion/functions/src/string/btrim.rs           |   1 +
 datafusion/functions/src/string/chr.rs             |   2 +-
 datafusion/functions/src/string/common.rs          | 158 +-----------------
 datafusion/functions/src/string/levenshtein.rs     |   3 +-
 datafusion/functions/src/string/lower.rs           |   8 +-
 datafusion/functions/src/string/ltrim.rs           |   3 +-
 datafusion/functions/src/string/octet_length.rs    |  13 +-
 datafusion/functions/src/string/overlay.rs         |   2 +-
 datafusion/functions/src/string/repeat.rs          |   4 +-
 datafusion/functions/src/string/replace.rs         |   2 +-
 datafusion/functions/src/string/rtrim.rs           |   1 +
 datafusion/functions/src/string/split_part.rs      |   4 +-
 datafusion/functions/src/string/starts_with.rs     |   9 +-
 datafusion/functions/src/string/to_hex.rs          |   9 +-
 datafusion/functions/src/string/upper.rs           |   3 +-
 .../functions/src/unicode/character_length.rs      | 176 +++++++++++++++++++++
 datafusion/functions/src/unicode/mod.rs            |  55 +++++++
 .../functions/src/{string/common.rs => utils.rs}   | 162 +------------------
 datafusion/physical-expr/src/functions.rs          |  70 --------
 .../physical-expr/src/unicode_expressions.rs       |  23 ---
 datafusion/proto/proto/datafusion.proto            |   2 +-
 datafusion/proto/src/generated/pbjson.rs           |   3 -
 datafusion/proto/src/generated/prost.rs            |   4 +-
 datafusion/proto/src/logical_plan/from_proto.rs    |   8 +-
 datafusion/proto/src/logical_plan/to_proto.rs      |   1 -
 datafusion/sql/Cargo.toml                          |   1 +
 datafusion/sql/tests/sql_integration.rs            |  15 +-
 36 files changed, 310 insertions(+), 476 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index b5535a47e9..ba60c04cea 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1273,6 +1273,7 @@ dependencies = [
  "md-5",
  "regex",
  "sha2",
+ "unicode-segmentation",
  "uuid",
 ]
 
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 1e5c0d748e..de03579975 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -70,6 +70,7 @@ unicode_expressions = [
     "datafusion-physical-expr/unicode_expressions",
     "datafusion-optimizer/unicode_expressions",
     "datafusion-sql/unicode_expressions",
+    "datafusion-functions/unicode_expressions",
 ]
 
 [dependencies]
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs 
b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 6ebd64c9b6..4371cce856 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -37,6 +37,7 @@ use datafusion::assert_batches_eq;
 use datafusion_common::DFSchema;
 use datafusion_expr::expr::Alias;
 use datafusion_expr::{approx_median, cast, ExprSchemable};
+use datafusion_functions::unicode::expr_fn::character_length;
 
 fn test_schema() -> SchemaRef {
     Arc::new(Schema::new(vec![
diff --git a/datafusion/expr/src/built_in_function.rs 
b/datafusion/expr/src/built_in_function.rs
index bb0f79f8ec..eefbc131a2 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -103,8 +103,6 @@ pub enum BuiltinScalarFunction {
     Cot,
 
     // string functions
-    /// character_length
-    CharacterLength,
     /// concat
     Concat,
     /// concat_ws
@@ -218,7 +216,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Cbrt => Volatility::Immutable,
             BuiltinScalarFunction::Cot => Volatility::Immutable,
             BuiltinScalarFunction::Trunc => Volatility::Immutable,
-            BuiltinScalarFunction::CharacterLength => Volatility::Immutable,
             BuiltinScalarFunction::Concat => Volatility::Immutable,
             BuiltinScalarFunction::ConcatWithSeparator => 
Volatility::Immutable,
             BuiltinScalarFunction::EndsWith => Volatility::Immutable,
@@ -257,9 +254,6 @@ impl BuiltinScalarFunction {
         // the return type of the built in function.
         // Some built-in functions' return type depends on the incoming type.
         match self {
-            BuiltinScalarFunction::CharacterLength => {
-                utf8_to_int_type(&input_expr_types[0], "character_length")
-            }
             BuiltinScalarFunction::Coalesce => {
                 // COALESCE has multiple args and they might get coerced, get 
a preview of this
                 let coerced_types = data_types(input_expr_types, 
&self.signature());
@@ -367,9 +361,7 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Coalesce => {
                 Signature::variadic_equal(self.volatility())
             }
-            BuiltinScalarFunction::CharacterLength
-            | BuiltinScalarFunction::InitCap
-            | BuiltinScalarFunction::Reverse => {
+            BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => 
{
                 Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
             }
             BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => {
@@ -584,10 +576,6 @@ impl BuiltinScalarFunction {
             // conditional functions
             BuiltinScalarFunction::Coalesce => &["coalesce"],
 
-            // string functions
-            BuiltinScalarFunction::CharacterLength => {
-                &["character_length", "char_length", "length"]
-            }
             BuiltinScalarFunction::Concat => &["concat"],
             BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
             BuiltinScalarFunction::EndsWith => &["ends_with"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 0ea946288e..6544647986 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -577,13 +577,6 @@ scalar_expr!(Power, power, base exponent, "`base` raised 
to the power of `expone
 scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the 
argument");
 scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
 
-// string functions
-scalar_expr!(
-    CharacterLength,
-    character_length,
-    string,
-    "the number of characters in the `string`"
-);
 scalar_expr!(InitCap, initcap, string, "converts the first letter of each word 
in `string` in uppercase and the remaining characters in lowercase");
 scalar_expr!(Left, left, string n, "returns the first `n` characters in the 
`string`");
 scalar_expr!(Reverse, reverse, string, "reverses the `string`");
@@ -1032,7 +1025,6 @@ mod test {
         test_scalar_expr!(Nanvl, nanvl, x, y);
         test_scalar_expr!(Iszero, iszero, input);
 
-        test_scalar_expr!(CharacterLength, character_length, string);
         test_scalar_expr!(Gcd, gcd, arg_1, arg_2);
         test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
         test_scalar_expr!(InitCap, initcap, string);
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 81050dfddf..0cab0276ff 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -43,6 +43,7 @@ default = [
     "regex_expressions",
     "crypto_expressions",
     "string_expressions",
+    "unicode_expressions",
 ]
 # enable encode/decode functions
 encoding_expressions = ["base64", "hex"]
@@ -52,6 +53,8 @@ math_expressions = []
 regex_expressions = ["regex"]
 # enable string functions
 string_expressions = []
+# enable unicode functions
+unicode_expressions = ["unicode-segmentation"]
 
 [lib]
 name = "datafusion_functions"
@@ -75,6 +78,7 @@ log = { workspace = true }
 md-5 = { version = "^0.10.0", optional = true }
 regex = { version = "1.8", optional = true }
 sha2 = { version = "^0.10.1", optional = true }
+unicode-segmentation = { version = "^1.7.1", optional = true }
 uuid = { version = "1.7", features = ["v4"] }
 
 [dev-dependencies]
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index f469b343e1..2a00839dc5 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -124,6 +124,12 @@ make_stub_package!(regex, "regex_expressions");
 pub mod crypto;
 make_stub_package!(crypto, "crypto_expressions");
 
+#[cfg(feature = "unicode_expressions")]
+pub mod unicode;
+make_stub_package!(unicode, "unicode_expressions");
+
+mod utils;
+
 /// Fluent-style API for creating `Expr`s
 pub mod expr_fn {
     #[cfg(feature = "core_expressions")]
@@ -140,6 +146,8 @@ pub mod expr_fn {
     pub use super::regex::expr_fn::*;
     #[cfg(feature = "string_expressions")]
     pub use super::string::expr_fn::*;
+    #[cfg(feature = "unicode_expressions")]
+    pub use super::unicode::expr_fn::*;
 }
 
 /// Registers all enabled packages with a [`FunctionRegistry`]
@@ -151,6 +159,7 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> 
Result<()> {
         .chain(math::functions())
         .chain(regex::functions())
         .chain(crypto::functions())
+        .chain(unicode::functions())
         .chain(string::functions());
 
     all_functions.try_for_each(|udf| {
diff --git a/datafusion/functions/src/string/ascii.rs 
b/datafusion/functions/src/string/ascii.rs
index 5bd77833a9..9a07f4c19c 100644
--- a/datafusion/functions/src/string/ascii.rs
+++ b/datafusion/functions/src/string/ascii.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::make_scalar_function;
+use crate::utils::make_scalar_function;
 use arrow::array::Int32Array;
 use arrow::array::{ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
diff --git a/datafusion/functions/src/string/bit_length.rs 
b/datafusion/functions/src/string/bit_length.rs
index 9f61275158..6a200471d4 100644
--- a/datafusion/functions/src/string/bit_length.rs
+++ b/datafusion/functions/src/string/bit_length.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::compute::kernels::length::bit_length;
 use std::any::Any;
 
+use arrow::compute::kernels::length::bit_length;
 use arrow::datatypes::DataType;
 
 use datafusion_common::{exec_err, Result, ScalarValue};
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::utf8_to_int_type;
 
 #[derive(Debug)]
 pub(super) struct BitLengthFunc {
diff --git a/datafusion/functions/src/string/btrim.rs 
b/datafusion/functions/src/string/btrim.rs
index de1c9cc69b..573a23d070 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
 use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 /// Returns the longest string with leading and trailing characters removed. 
If the characters are not specified, whitespace is removed.
 /// btrim('xyxtrimyyx', 'xyz') = 'trim'
diff --git a/datafusion/functions/src/string/chr.rs 
b/datafusion/functions/src/string/chr.rs
index df3b803ba6..d1f8dc398a 100644
--- a/datafusion/functions/src/string/chr.rs
+++ b/datafusion/functions/src/string/chr.rs
@@ -29,7 +29,7 @@ use datafusion_common::{exec_err, Result};
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::make_scalar_function;
 
 /// Returns the character with the given code. chr(0) is disallowed because 
text data types cannot store that character.
 /// chr(65) = 'A'
diff --git a/datafusion/functions/src/string/common.rs 
b/datafusion/functions/src/string/common.rs
index 339f4e6c1a..276aad121d 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -24,8 +24,7 @@ use arrow::datatypes::DataType;
 use datafusion_common::cast::as_generic_string_array;
 use datafusion_common::Result;
 use datafusion_common::{exec_err, ScalarValue};
-use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation};
-use datafusion_physical_expr::functions::Hint;
+use datafusion_expr::ColumnarValue;
 
 pub(crate) enum TrimType {
     Left,
@@ -98,52 +97,6 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     }
 }
 
-/// Creates a function to identify the optimal return type of a string 
function given
-/// the type of its first argument.
-///
-/// If the input type is `LargeUtf8` or `LargeBinary` the return type is
-/// `$largeUtf8Type`,
-///
-/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`,
-macro_rules! get_optimal_return_type {
-    ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => {
-        pub(crate) fn $FUNC(arg_type: &DataType, name: &str) -> 
Result<DataType> {
-            Ok(match arg_type {
-                // LargeBinary inputs are automatically coerced to Utf8
-                DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type,
-                // Binary inputs are automatically coerced to Utf8
-                DataType::Utf8 | DataType::Binary => $utf8Type,
-                DataType::Null => DataType::Null,
-                DataType::Dictionary(_, value_type) => match **value_type {
-                    DataType::LargeUtf8 | DataType::LargeBinary => 
$largeUtf8Type,
-                    DataType::Utf8 | DataType::Binary => $utf8Type,
-                    DataType::Null => DataType::Null,
-                    _ => {
-                        return datafusion_common::exec_err!(
-                            "The {} function can only accept strings, but got 
{:?}.",
-                            name.to_uppercase(),
-                            **value_type
-                        );
-                    }
-                },
-                data_type => {
-                    return datafusion_common::exec_err!(
-                        "The {} function can only accept strings, but got 
{:?}.",
-                        name.to_uppercase(),
-                        data_type
-                    );
-                }
-            })
-        }
-    };
-}
-
-// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input 
type size.
-get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, 
DataType::Utf8);
-
-// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type 
size.
-get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
-
 /// applies a unary expression to `args[0]` that is expected to be 
downcastable to
 /// a `GenericStringArray` and returns a `GenericStringArray` (which may have 
a different offset)
 /// # Errors
@@ -221,112 +174,3 @@ where
         },
     }
 }
-
-pub(super) fn make_scalar_function<F>(
-    inner: F,
-    hints: Vec<Hint>,
-) -> ScalarFunctionImplementation
-where
-    F: Fn(&[ArrayRef]) -> Result<ArrayRef> + Sync + Send + 'static,
-{
-    Arc::new(move |args: &[ColumnarValue]| {
-        // first, identify if any of the arguments is an Array. If yes, store 
its `len`,
-        // as any scalar will need to be converted to an array of len `len`.
-        let len = args
-            .iter()
-            .fold(Option::<usize>::None, |acc, arg| match arg {
-                ColumnarValue::Scalar(_) => acc,
-                ColumnarValue::Array(a) => Some(a.len()),
-            });
-
-        let is_scalar = len.is_none();
-
-        let inferred_length = len.unwrap_or(1);
-        let args = args
-            .iter()
-            .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad)))
-            .map(|(arg, hint)| {
-                // Decide on the length to expand this scalar to depending
-                // on the given hints.
-                let expansion_len = match hint {
-                    Hint::AcceptsSingular => 1,
-                    Hint::Pad => inferred_length,
-                };
-                arg.clone().into_array(expansion_len)
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        let result = (inner)(&args);
-        if is_scalar {
-            // If all inputs are scalar, keeps output as scalar
-            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
-            result.map(ColumnarValue::Scalar)
-        } else {
-            result.map(ColumnarValue::Array)
-        }
-    })
-}
-
-#[cfg(test)]
-pub mod test {
-    /// $FUNC ScalarUDFImpl to test
-    /// $ARGS arguments (vec) to pass to function
-    /// $EXPECTED a Result<ColumnarValue>
-    /// $EXPECTED_TYPE is the expected value type
-    /// $EXPECTED_DATA_TYPE is the expected result type
-    /// $ARRAY_TYPE is the column type after function applied
-    macro_rules! test_function {
-        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, 
$EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
-            let expected: Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
-            let func = $FUNC;
-
-            let type_array = $ARGS.iter().map(|arg| 
arg.data_type()).collect::<Vec<_>>();
-            let return_type = func.return_type(&type_array);
-
-            match expected {
-                Ok(expected) => {
-                    assert_eq!(return_type.is_ok(), true);
-                    assert_eq!(return_type.unwrap(), $EXPECTED_DATA_TYPE);
-
-                    let result = func.invoke($ARGS);
-                    assert_eq!(result.is_ok(), true);
-
-                    let len = $ARGS
-                        .iter()
-                        .fold(Option::<usize>::None, |acc, arg| match arg {
-                            ColumnarValue::Scalar(_) => acc,
-                            ColumnarValue::Array(a) => Some(a.len()),
-                        });
-                    let inferred_length = len.unwrap_or(1);
-                    let result = 
result.unwrap().clone().into_array(inferred_length).expect("Failed to convert 
to array");
-                    let result = 
result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to 
type");
-
-                    // value is correct
-                    match expected {
-                        Some(v) => assert_eq!(result.value(0), v),
-                        None => assert!(result.is_null(0)),
-                    };
-                }
-                Err(expected_error) => {
-                    if return_type.is_err() {
-                        match return_type {
-                            Ok(_) => assert!(false, "expected error"),
-                            Err(error) => { 
datafusion_common::assert_contains!(expected_error.strip_backtrace(), 
error.strip_backtrace()); }
-                        }
-                    }
-                    else {
-                        // invoke is expected error - cannot use .expect_err() 
due to Debug not being implemented
-                        match func.invoke($ARGS) {
-                            Ok(_) => assert!(false, "expected error"),
-                            Err(error) => {
-                                
assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace()));
-                            }
-                        }
-                    }
-                }
-            };
-        };
-    }
-
-    pub(crate) use test_function;
-}
diff --git a/datafusion/functions/src/string/levenshtein.rs 
b/datafusion/functions/src/string/levenshtein.rs
index b5de4b2894..8f497e73e3 100644
--- a/datafusion/functions/src/string/levenshtein.rs
+++ b/datafusion/functions/src/string/levenshtein.rs
@@ -21,6 +21,7 @@ use std::sync::Arc;
 use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait};
 use arrow::datatypes::DataType;
 
+use crate::utils::{make_scalar_function, utf8_to_int_type};
 use datafusion_common::cast::as_generic_string_array;
 use datafusion_common::utils::datafusion_strsim;
 use datafusion_common::{exec_err, Result};
@@ -28,8 +29,6 @@ use datafusion_expr::ColumnarValue;
 use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 
-use crate::string::common::{make_scalar_function, utf8_to_int_type};
-
 #[derive(Debug)]
 pub(super) struct LevenshteinFunc {
     signature: Signature,
diff --git a/datafusion/functions/src/string/lower.rs 
b/datafusion/functions/src/string/lower.rs
index 42bda04700..327772bd80 100644
--- a/datafusion/functions/src/string/lower.rs
+++ b/datafusion/functions/src/string/lower.rs
@@ -15,12 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::{handle, utf8_to_str_type};
+use std::any::Any;
+
 use arrow::datatypes::DataType;
+
 use datafusion_common::Result;
 use datafusion_expr::ColumnarValue;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
+
+use crate::string::common::handle;
+use crate::utils::utf8_to_str_type;
 
 #[derive(Debug)]
 pub(super) struct LowerFunc {
diff --git a/datafusion/functions/src/string/ltrim.rs 
b/datafusion/functions/src/string/ltrim.rs
index 535ffb14f5..e6926e5bd5 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{ArrayRef, OffsetSizeTrait};
 use std::any::Any;
 
+use arrow::array::{ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
 
 use datafusion_common::{exec_err, Result};
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
 use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 /// Returns the longest string  with leading characters removed. If the 
characters are not specified, whitespace is removed.
 /// ltrim('zzzytest', 'xyz') = 'test'
diff --git a/datafusion/functions/src/string/octet_length.rs 
b/datafusion/functions/src/string/octet_length.rs
index 36a62fbe4e..639bf6cb48 100644
--- a/datafusion/functions/src/string/octet_length.rs
+++ b/datafusion/functions/src/string/octet_length.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::compute::kernels::length::length;
 use std::any::Any;
 
+use arrow::compute::kernels::length::length;
 use arrow::datatypes::DataType;
 
 use datafusion_common::{exec_err, Result, ScalarValue};
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::utf8_to_int_type;
 
 #[derive(Debug)]
 pub(super) struct OctetLengthFunc {
@@ -86,14 +86,17 @@ impl ScalarUDFImpl for OctetLengthFunc {
 
 #[cfg(test)]
 mod tests {
-    use crate::string::common::test::test_function;
-    use crate::string::octet_length::OctetLengthFunc;
+    use std::sync::Arc;
+
     use arrow::array::{Array, Int32Array, StringArray};
     use arrow::datatypes::DataType::Int32;
+
     use datafusion_common::ScalarValue;
     use datafusion_common::{exec_err, Result};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
-    use std::sync::Arc;
+
+    use crate::string::octet_length::OctetLengthFunc;
+    use crate::utils::test::test_function;
 
     #[test]
     fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/overlay.rs 
b/datafusion/functions/src/string/overlay.rs
index d7cc0da806..8b9cc03afc 100644
--- a/datafusion/functions/src/string/overlay.rs
+++ b/datafusion/functions/src/string/overlay.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
 pub(super) struct OverlayFunc {
diff --git a/datafusion/functions/src/string/repeat.rs 
b/datafusion/functions/src/string/repeat.rs
index 83bc929cb9..f4319af0a5 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
 pub(super) struct RepeatFunc {
@@ -99,8 +99,8 @@ mod tests {
     use datafusion_common::ScalarValue;
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
-    use crate::string::common::test::test_function;
     use crate::string::repeat::RepeatFunc;
+    use crate::utils::test::test_function;
 
     #[test]
     fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/replace.rs 
b/datafusion/functions/src/string/replace.rs
index e352442960..e869ac2054 100644
--- a/datafusion/functions/src/string/replace.rs
+++ b/datafusion/functions/src/string/replace.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
 pub(super) struct ReplaceFunc {
diff --git a/datafusion/functions/src/string/rtrim.rs 
b/datafusion/functions/src/string/rtrim.rs
index 17d2f8234b..d04d15ce88 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
 use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 /// Returns the longest string  with trailing characters removed. If the 
characters are not specified, whitespace is removed.
 /// rtrim('testxxzx', 'xyz') = 'test'
diff --git a/datafusion/functions/src/string/split_part.rs 
b/datafusion/functions/src/string/split_part.rs
index af201e90fc..0aa968a1ef 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
 
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
 pub(super) struct SplitPartFunc {
@@ -117,8 +117,8 @@ mod tests {
     use datafusion_common::{exec_err, Result};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
-    use crate::string::common::test::test_function;
     use crate::string::split_part::SplitPartFunc;
+    use crate::utils::test::test_function;
 
     #[test]
     fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/starts_with.rs 
b/datafusion/functions/src/string/starts_with.rs
index 4450b9d332..f1b03907f8 100644
--- a/datafusion/functions/src/string/starts_with.rs
+++ b/datafusion/functions/src/string/starts_with.rs
@@ -15,15 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
+
 use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
 use datafusion_expr::ColumnarValue;
 use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
-use std::sync::Arc;
+
+use crate::utils::make_scalar_function;
 
 /// Returns true if string starts with prefix.
 /// starts_with('alphabet', 'alph') = 't'
diff --git a/datafusion/functions/src/string/to_hex.rs 
b/datafusion/functions/src/string/to_hex.rs
index 1bdece3f7a..ab320c68d4 100644
--- a/datafusion/functions/src/string/to_hex.rs
+++ b/datafusion/functions/src/string/to_hex.rs
@@ -15,18 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
 use arrow::datatypes::{
     ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type,
 };
+
 use datafusion_common::cast::as_primitive_array;
 use datafusion_common::Result;
 use datafusion_common::{exec_err, plan_err};
 use datafusion_expr::ColumnarValue;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
-use std::sync::Arc;
+
+use crate::utils::make_scalar_function;
 
 /// Converts the number to its equivalent hexadecimal representation.
 /// to_hex(2147483647) = '7fffffff'
diff --git a/datafusion/functions/src/string/upper.rs 
b/datafusion/functions/src/string/upper.rs
index a0c910ebb2..066174abf2 100644
--- a/datafusion/functions/src/string/upper.rs
+++ b/datafusion/functions/src/string/upper.rs
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::string::common::{handle, utf8_to_str_type};
+use crate::string::common::handle;
+use crate::utils::utf8_to_str_type;
 use arrow::datatypes::DataType;
 use datafusion_common::Result;
 use datafusion_expr::ColumnarValue;
diff --git a/datafusion/functions/src/unicode/character_length.rs 
b/datafusion/functions/src/unicode/character_length.rs
new file mode 100644
index 0000000000..51331bf9a5
--- /dev/null
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -0,0 +1,176 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::utils::{make_scalar_function, utf8_to_int_type};
+use arrow::array::{
+    ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
+};
+use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::exec_err;
+use datafusion_common::Result;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub(super) struct CharacterLengthFunc {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl CharacterLengthFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::uniform(
+                1,
+                vec![Utf8, LargeUtf8],
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("length"), String::from("char_length")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for CharacterLengthFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "character_length"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        utf8_to_int_type(&arg_types[0], "character_length")
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => {
+                make_scalar_function(character_length::<Int32Type>, vec![])(args)
+            }
+            DataType::LargeUtf8 => {
+                make_scalar_function(character_length::<Int64Type>, vec![])(args)
+            }
+            other => {
+                exec_err!("Unsupported data type {other:?} for function character_length")
+            }
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+/// Returns number of characters in the string.
+/// character_length('josé') = 4
+/// The implementation counts UTF-8 code points to count the number of characters
+fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
+where
+    T::Native: OffsetSizeTrait,
+{
+    let string_array: &GenericStringArray<T::Native> =
+        as_generic_string_array::<T::Native>(&args[0])?;
+
+    let result = string_array
+        .iter()
+        .map(|string| {
+            string.map(|string: &str| {
+                T::Native::from_usize(string.chars().count())
+                    .expect("should not fail as string.chars will always return integer")
+            })
+        })
+        .collect::<PrimitiveArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::unicode::character_length::CharacterLengthFunc;
+    use crate::utils::test::test_function;
+    use arrow::array::{Array, Int32Array};
+    use arrow::datatypes::DataType::Int32;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        #[cfg(feature = "unicode_expressions")]
+        test_function!(
+            CharacterLengthFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                String::from("chars")
+            )))],
+            Ok(Some(5)),
+            i32,
+            Int32,
+            Int32Array
+        );
+        #[cfg(feature = "unicode_expressions")]
+        test_function!(
+            CharacterLengthFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                String::from("josé")
+            )))],
+            Ok(Some(4)),
+            i32,
+            Int32,
+            Int32Array
+        );
+        #[cfg(feature = "unicode_expressions")]
+        test_function!(
+            CharacterLengthFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                String::from("")
+            )))],
+            Ok(Some(0)),
+            i32,
+            Int32,
+            Int32Array
+        );
+        #[cfg(feature = "unicode_expressions")]
+        test_function!(
+            CharacterLengthFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(None))],
+            Ok(None),
+            i32,
+            Int32,
+            Int32Array
+        );
+        #[cfg(not(feature = "unicode_expressions"))]
+        test_function!(
+            CharacterLengthFunc::new(),
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))],
+            internal_err!(
+                "function character_length requires compilation with feature flag: unicode_expressions."
+            ),
+            i32,
+            Int32,
+            Int32Array
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs
new file mode 100644
index 0000000000..291de38439
--- /dev/null
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! "unicode" DataFusion functions
+
+use std::sync::Arc;
+
+use datafusion_expr::ScalarUDF;
+
+mod character_length;
+
+// create UDFs
+make_udf_function!(
+    character_length::CharacterLengthFunc,
+    CHARACTER_LENGTH,
+    character_length
+);
+
+pub mod expr_fn {
+    use datafusion_expr::Expr;
+
+    #[doc = "the number of characters in the `string`"]
+    pub fn char_length(string: Expr) -> Expr {
+        character_length(string)
+    }
+
+    #[doc = "the number of characters in the `string`"]
+    pub fn character_length(string: Expr) -> Expr {
+        super::character_length().call(vec![string])
+    }
+
+    #[doc = "the number of characters in the `string`"]
+    pub fn length(string: Expr) -> Expr {
+        character_length(string)
+    }
+}
+
+///   Return a list of all functions in this package
+pub fn functions() -> Vec<Arc<ScalarUDF>> {
+    vec![character_length()]
+}
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/utils.rs
similarity index 59%
copy from datafusion/functions/src/string/common.rs
copy to datafusion/functions/src/utils.rs
index 339f4e6c1a..f45deafdb3 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/utils.rs
@@ -15,88 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::fmt::{Display, Formatter};
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::ArrayRef;
 use arrow::datatypes::DataType;
-
-use datafusion_common::cast::as_generic_string_array;
-use datafusion_common::Result;
-use datafusion_common::{exec_err, ScalarValue};
+use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation};
 use datafusion_physical_expr::functions::Hint;
-
-pub(crate) enum TrimType {
-    Left,
-    Right,
-    Both,
-}
-
-impl Display for TrimType {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            TrimType::Left => write!(f, "ltrim"),
-            TrimType::Right => write!(f, "rtrim"),
-            TrimType::Both => write!(f, "btrim"),
-        }
-    }
-}
-
-pub(crate) fn general_trim<T: OffsetSizeTrait>(
-    args: &[ArrayRef],
-    trim_type: TrimType,
-) -> Result<ArrayRef> {
-    let func = match trim_type {
-        TrimType::Left => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            str::trim_start_matches::<&[char]>(input, pattern.as_ref())
-        },
-        TrimType::Right => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            str::trim_end_matches::<&[char]>(input, pattern.as_ref())
-        },
-        TrimType::Both => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            str::trim_end_matches::<&[char]>(
-                str::trim_start_matches::<&[char]>(input, pattern.as_ref()),
-                pattern.as_ref(),
-            )
-        },
-    };
-
-    let string_array = as_generic_string_array::<T>(&args[0])?;
-
-    match args.len() {
-        1 => {
-            let result = string_array
-                .iter()
-                .map(|string| string.map(|string: &str| func(string, " ")))
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
-        }
-        2 => {
-            let characters_array = as_generic_string_array::<T>(&args[1])?;
-
-            let result = string_array
-                .iter()
-                .zip(characters_array.iter())
-                .map(|(string, characters)| match (string, characters) {
-                    (Some(string), Some(characters)) => Some(func(string, characters)),
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
-        }
-        other => {
-            exec_err!(
-            "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
-        )
-        }
-    }
-}
+use std::sync::Arc;
 
 /// Creates a function to identify the optimal return type of a string function given
 /// the type of its first argument.
@@ -144,84 +68,6 @@ get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
 // `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size.
 get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
 
-/// applies a unary expression to `args[0]` that is expected to be downcastable to
-/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset)
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-pub(crate) fn unary_string_function<'a, T, O, F, R>(
-    args: &[&'a dyn Array],
-    op: F,
-    name: &str,
-) -> Result<GenericStringArray<O>>
-where
-    R: AsRef<str>,
-    O: OffsetSizeTrait,
-    T: OffsetSizeTrait,
-    F: Fn(&'a str) -> R,
-{
-    if args.len() != 1 {
-        return exec_err!(
-            "{:?} args were supplied but {} takes exactly one argument",
-            args.len(),
-            name
-        );
-    }
-
-    let string_array = as_generic_string_array::<T>(args[0])?;
-
-    // first map is the iterator, second is for the `Option<_>`
-    Ok(string_array.iter().map(|string| string.map(&op)).collect())
-}
-
-pub(crate) fn handle<'a, F, R>(
-    args: &'a [ColumnarValue],
-    op: F,
-    name: &str,
-) -> Result<ColumnarValue>
-where
-    R: AsRef<str>,
-    F: Fn(&'a str) -> R,
-{
-    match &args[0] {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
-                    i32,
-                    i32,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
-            DataType::LargeUtf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
-                    i64,
-                    i64,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
-            other => exec_err!("Unsupported data type {other:?} for function {name}"),
-        },
-        ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
-                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
-            }
-            ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
-                Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
-            }
-            other => exec_err!("Unsupported data type {other:?} for function {name}"),
-        },
-    }
-}
-
 pub(super) fn make_scalar_function<F>(
     inner: F,
     hints: Vec<Hint>,
@@ -254,7 +100,7 @@ where
                 };
                 arg.clone().into_array(expansion_len)
             })
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<datafusion_common::Result<Vec<_>>>()?;
 
         let result = (inner)(&args);
         if is_scalar {
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index cd9bba63d6..9adc853634 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -254,29 +254,6 @@ pub fn create_physical_fun(
         Arc::new(|args| make_scalar_function_inner(math_expressions::cot)(args))
         }
         // string functions
-        BuiltinScalarFunction::CharacterLength => {
-            Arc::new(|args| match args[0].data_type() {
-                DataType::Utf8 => {
-                    let func = invoke_if_unicode_expressions_feature_flag!(
-                        character_length,
-                        Int32Type,
-                        "character_length"
-                    );
-                    make_scalar_function_inner(func)(args)
-                }
-                DataType::LargeUtf8 => {
-                    let func = invoke_if_unicode_expressions_feature_flag!(
-                        character_length,
-                        Int64Type,
-                        "character_length"
-                    );
-                    make_scalar_function_inner(func)(args)
-                }
-                other => exec_err!(
-                    "Unsupported data type {other:?} for function character_length"
-                ),
-            })
-        }
         BuiltinScalarFunction::Coalesce => Arc::new(conditional_expressions::coalesce),
         BuiltinScalarFunction::Concat => Arc::new(string_expressions::concat),
         BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| {
@@ -595,53 +572,6 @@ mod tests {
 
     #[test]
     fn test_functions() -> Result<()> {
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            CharacterLength,
-            &[lit("chars")],
-            Ok(Some(5)),
-            i32,
-            Int32,
-            Int32Array
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            CharacterLength,
-            &[lit("josé")],
-            Ok(Some(4)),
-            i32,
-            Int32,
-            Int32Array
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            CharacterLength,
-            &[lit("")],
-            Ok(Some(0)),
-            i32,
-            Int32,
-            Int32Array
-        );
-        #[cfg(feature = "unicode_expressions")]
-        test_function!(
-            CharacterLength,
-            &[lit(ScalarValue::Utf8(None))],
-            Ok(None),
-            i32,
-            Int32,
-            Int32Array
-        );
-        #[cfg(not(feature = "unicode_expressions"))]
-        test_function!(
-            CharacterLength,
-            &[lit("josé")],
-            internal_err!(
-                "function character_length requires compilation with feature flag: unicode_expressions."
-            ),
-            i32,
-            Int32,
-            Int32Array
-        );
         test_function!(
             Concat,
             &[lit("aa"), lit("bb"), lit("cc"),],
diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs
index 8ec9e062d9..c7e4b7d7c4 100644
--- a/datafusion/physical-expr/src/unicode_expressions.rs
+++ b/datafusion/physical-expr/src/unicode_expressions.rs
@@ -36,29 +36,6 @@ use datafusion_common::{
     exec_err, Result,
 };
 
-/// Returns number of characters in the string.
-/// character_length('josé') = 4
-/// The implementation counts UTF-8 code points to count the number of characters
-pub fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
-where
-    T::Native: OffsetSizeTrait,
-{
-    let string_array: &GenericStringArray<T::Native> =
-        as_generic_string_array::<T::Native>(&args[0])?;
-
-    let result = string_array
-        .iter()
-        .map(|string| {
-            string.map(|string: &str| {
-                T::Native::from_usize(string.chars().count())
-                    .expect("should not fail as string.chars will always return integer")
-            })
-        })
-        .collect::<PrimitiveArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
 /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
 /// left('abcde', 2) = 'ab'
 /// The implementation uses UTF-8 code points as characters
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index f405ecf976..766ca6633e 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -565,7 +565,7 @@ enum ScalarFunction {
   // RegexpMatch = 21;
   // 22 was BitLength
   // 23 was Btrim
-  CharacterLength = 24;
+  // 24 was CharacterLength
   // 25 was Chr
   Concat = 26;
   ConcatWithSeparator = 27;
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 0d22ba5db7..f2814956ef 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22928,7 +22928,6 @@ impl serde::Serialize for ScalarFunction {
             Self::Sin => "Sin",
             Self::Sqrt => "Sqrt",
             Self::Trunc => "Trunc",
-            Self::CharacterLength => "CharacterLength",
             Self::Concat => "Concat",
             Self::ConcatWithSeparator => "ConcatWithSeparator",
             Self::InitCap => "InitCap",
@@ -22988,7 +22987,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
             "Sin",
             "Sqrt",
             "Trunc",
-            "CharacterLength",
             "Concat",
             "ConcatWithSeparator",
             "InitCap",
@@ -23077,7 +23075,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
                     "Sin" => Ok(ScalarFunction::Sin),
                     "Sqrt" => Ok(ScalarFunction::Sqrt),
                     "Trunc" => Ok(ScalarFunction::Trunc),
-                    "CharacterLength" => Ok(ScalarFunction::CharacterLength),
                     "Concat" => Ok(ScalarFunction::Concat),
                     "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator),
                     "InitCap" => Ok(ScalarFunction::InitCap),
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 07c3fad153..ecc94fcdaf 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2864,7 +2864,7 @@ pub enum ScalarFunction {
     /// RegexpMatch = 21;
     /// 22 was BitLength
     /// 23 was Btrim
-    CharacterLength = 24,
+    /// 24 was CharacterLength
     /// 25 was Chr
     Concat = 26,
     ConcatWithSeparator = 27,
@@ -3001,7 +3001,6 @@ impl ScalarFunction {
             ScalarFunction::Sin => "Sin",
             ScalarFunction::Sqrt => "Sqrt",
             ScalarFunction::Trunc => "Trunc",
-            ScalarFunction::CharacterLength => "CharacterLength",
             ScalarFunction::Concat => "Concat",
             ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator",
             ScalarFunction::InitCap => "InitCap",
@@ -3055,7 +3054,6 @@ impl ScalarFunction {
             "Sin" => Some(Self::Sin),
             "Sqrt" => Some(Self::Sqrt),
             "Trunc" => Some(Self::Trunc),
-            "CharacterLength" => Some(Self::CharacterLength),
             "Concat" => Some(Self::Concat),
             "ConcatWithSeparator" => Some(Self::ConcatWithSeparator),
             "InitCap" => Some(Self::InitCap),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 4b9874bf8f..19edd71a3a 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -48,8 +48,8 @@ use datafusion_expr::expr::Unnest;
 use datafusion_expr::expr::{Alias, Placeholder};
 use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by};
 use datafusion_expr::{
-    acosh, asinh, atan, atan2, atanh, cbrt, ceil, character_length, coalesce,
-    concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp,
+    acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr,
+    cos, cosh, cot, degrees, ends_with, exp,
     expr::{self, InList, Sort, WindowFunction},
     factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2,
     logical_plan::{PlanType, StringifiedPlan},
@@ -450,7 +450,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
             ScalarFunction::Concat => Self::Concat,
             ScalarFunction::Log2 => Self::Log2,
             ScalarFunction::Signum => Self::Signum,
-            ScalarFunction::CharacterLength => Self::CharacterLength,
             ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator,
             ScalarFunction::EndsWith => Self::EndsWith,
             ScalarFunction::InitCap => Self::InitCap,
@@ -1372,9 +1371,6 @@ pub fn parse_expr(
                 ScalarFunction::Signum => {
                     Ok(signum(parse_expr(&args[0], registry, codec)?))
                 }
-                ScalarFunction::CharacterLength => {
-                    Ok(character_length(parse_expr(&args[0], registry, codec)?))
-                }
                 ScalarFunction::InitCap => {
                     Ok(initcap(parse_expr(&args[0], registry, codec)?))
                 }
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index 1335d511a0..11fc7362c7 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1442,7 +1442,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
             BuiltinScalarFunction::Concat => Self::Concat,
             BuiltinScalarFunction::Log2 => Self::Log2,
             BuiltinScalarFunction::Signum => Self::Signum,
-            BuiltinScalarFunction::CharacterLength => Self::CharacterLength,
             BuiltinScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator,
             BuiltinScalarFunction::EndsWith => Self::EndsWith,
             BuiltinScalarFunction::InitCap => Self::InitCap,
diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml
index ca2c1a240c..b9f6dc259e 100644
--- a/datafusion/sql/Cargo.toml
+++ b/datafusion/sql/Cargo.toml
@@ -49,6 +49,7 @@ strum = { version = "0.26.1", features = ["derive"] }
 
 [dev-dependencies]
 ctor = { workspace = true }
+datafusion-functions = { workspace = true, default-features = true }
 env_logger = { workspace = true }
 paste = "^1.0"
 rstest = { workspace = true }
diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs
index 448a9c5420..101c31039c 100644
--- a/datafusion/sql/tests/sql_integration.rs
+++ b/datafusion/sql/tests/sql_integration.rs
@@ -38,6 +38,7 @@ use datafusion_sql::{
     planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel},
 };
 
+use datafusion_functions::unicode;
 use rstest::rstest;
 use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect};
 use sqlparser::parser::Parser;
@@ -88,7 +89,7 @@ fn parse_decimals() {
 fn parse_ident_normalization() {
     let test_data = [
         (
-            "SELECT LENGTH('str')",
+            "SELECT CHARACTER_LENGTH('str')",
             "Ok(Projection: character_length(Utf8(\"str\"))\n  EmptyRelation)",
             false,
         ),
@@ -2688,6 +2689,7 @@ fn logical_plan_with_dialect_and_options(
     options: ParserOptions,
 ) -> Result<LogicalPlan> {
     let context = MockContextProvider::default()
+        .with_udf(unicode::character_length().as_ref().clone())
         .with_udf(make_udf(
             "nullif",
             vec![DataType::Int32, DataType::Int32],
@@ -4508,26 +4510,27 @@ fn test_field_not_found_window_function() {
 
 #[test]
 fn test_parse_escaped_string_literal_value() {
-    let sql = r"SELECT length('\r\n') AS len";
+    let sql = r"SELECT character_length('\r\n') AS len";
     let expected = "Projection: character_length(Utf8(\"\\r\\n\")) AS len\
     \n  EmptyRelation";
     quick_test(sql, expected);
 
-    let sql = r"SELECT length(E'\r\n') AS len";
+    let sql = r"SELECT character_length(E'\r\n') AS len";
     let expected = "Projection: character_length(Utf8(\"\r\n\")) AS len\
     \n  EmptyRelation";
     quick_test(sql, expected);
 
-    let sql = r"SELECT length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode";
+    let sql =
+        r"SELECT character_length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode";
     let expected =
         "Projection: character_length(Utf8(\"%\")) AS len, Utf8(\"\u{004b}\") AS hex, Utf8(\"\u{0001}\") AS unicode\
     \n  EmptyRelation";
     quick_test(sql, expected);
 
-    let sql = r"SELECT length(E'\000') AS len";
+    let sql = r"SELECT character_length(E'\000') AS len";
     assert_eq!(
         logical_plan(sql).unwrap_err().strip_backtrace(),
-        "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 15\")"
+        "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 25\")"
     )
 }
 

Reply via email to