This is an automated email from the ASF dual-hosted git repository.

goldmedal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a267784bc6 Support unicode character for `initcap` function (#13752)
a267784bc6 is described below

commit a267784bc60910bfdf558b4f8e600c1890ad6245
Author: Tai Le Manh <[email protected]>
AuthorDate: Sun Dec 22 18:10:35 2024 +0700

    Support unicode character for `initcap` function (#13752)
    
    * Support unicode character for 'initcap' function
    
    Signed-off-by: Tai Le Manh <[email protected]>
    
    * Update unit tests
    
    * Fix clippy warning
    
    * Update sqllogictests - initcap
    
    * Update scalar_functions.md docs
    
    * Add suggestions change
    
    Signed-off-by: Tai Le Manh <[email protected]>
    
    ---------
    
    Signed-off-by: Tai Le Manh <[email protected]>
---
 datafusion/functions/Cargo.toml                    |   2 +-
 datafusion/functions/benches/initcap.rs            |   4 +-
 datafusion/functions/src/string/mod.rs             |   7 --
 .../functions/src/{string => unicode}/initcap.rs   | 114 +++++++++++++++------
 datafusion/functions/src/unicode/mod.rs            |   7 ++
 .../test_files/string/string_query.slt.part        |   2 +-
 docs/source/user-guide/sql/scalar_functions.md     |   4 +-
 7 files changed, 93 insertions(+), 47 deletions(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index de72c7ee94..fd986c4be4 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -212,4 +212,4 @@ required-features = ["math_expressions"]
 [[bench]]
 harness = false
 name = "initcap"
-required-features = ["string_expressions"]
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs
index c88b6b5139..97c76831b3 100644
--- a/datafusion/functions/benches/initcap.rs
+++ b/datafusion/functions/benches/initcap.rs
@@ -24,7 +24,7 @@ use arrow::util::bench_util::{
 };
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
-use datafusion_functions::string;
+use datafusion_functions::unicode;
 use std::sync::Arc;
 
 fn create_args<O: OffsetSizeTrait>(
@@ -46,7 +46,7 @@ fn create_args<O: OffsetSizeTrait>(
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
-    let initcap = string::initcap();
+    let initcap = unicode::initcap();
     for size in [1024, 4096] {
         let args = create_args::<i32>(size, 8, true);
         c.bench_function(
diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs
index f156f070d9..c43aaeccbe 100644
--- a/datafusion/functions/src/string/mod.rs
+++ b/datafusion/functions/src/string/mod.rs
@@ -30,7 +30,6 @@ pub mod concat;
 pub mod concat_ws;
 pub mod contains;
 pub mod ends_with;
-pub mod initcap;
 pub mod levenshtein;
 pub mod lower;
 pub mod ltrim;
@@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr);
 make_udf_function!(concat::ConcatFunc, concat);
 make_udf_function!(concat_ws::ConcatWsFunc, concat_ws);
 make_udf_function!(ends_with::EndsWithFunc, ends_with);
-make_udf_function!(initcap::InitcapFunc, initcap);
 make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
 make_udf_function!(ltrim::LtrimFunc, ltrim);
 make_udf_function!(lower::LowerFunc, lower);
@@ -94,10 +92,6 @@ pub mod expr_fn {
         ends_with,
         "Returns true if the `string` ends with the `suffix`, false otherwise.",
         string suffix
-    ),(
-        initcap,
-        "Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
-        string
     ),(
         levenshtein,
         "Returns the Levenshtein distance between the two given strings",
@@ -177,7 +171,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         concat(),
         concat_ws(),
         ends_with(),
-        initcap(),
         levenshtein(),
         lower(),
         ltrim(),
diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/unicode/initcap.rs
similarity index 68%
rename from datafusion/functions/src/string/initcap.rs
rename to datafusion/functions/src/unicode/initcap.rs
index 2780dcaeeb..e9f966b958 100644
--- a/datafusion/functions/src/string/initcap.rs
+++ b/datafusion/functions/src/unicode/initcap.rs
@@ -18,7 +18,9 @@
 use std::any::Any;
 use std::sync::{Arc, OnceLock};
 
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
+use arrow::array::{
+    Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
+};
 use arrow::datatypes::DataType;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
             DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
             DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
             other => {
-                exec_err!("Unsupported data type {other:?} for function initcap")
+                exec_err!("Unsupported data type {other:?} for function `initcap`")
             }
         }
     }
@@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
     DOCUMENTATION.get_or_init(|| {
         Documentation::builder(
             DOC_SECTION_STRING,
-            "Capitalizes the first character in each word in the ASCII input string. \
-            Words are delimited by non-alphanumeric characters.\n\n\
-            Note this function does not support UTF-8 characters.",
+            "Capitalizes the first character in each word in the input string. \
+            Words are delimited by non-alphanumeric characters.",
             "initcap(str)",
         )
         .with_sql_example(
@@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
 fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_array = as_generic_string_array::<T>(&args[0])?;
 
-    // first map is the iterator, second is for the `Option<_>`
-    let result = string_array
-        .iter()
-        .map(initcap_string)
-        .collect::<GenericStringArray<T>>();
+    let mut builder = GenericStringBuilder::<T>::with_capacity(
+        string_array.len(),
+        string_array.value_data().len(),
+    );
 
-    Ok(Arc::new(result) as ArrayRef)
+    string_array.iter().for_each(|str| match str {
+        Some(s) => {
+            let initcap_str = initcap_string(s);
+            builder.append_value(initcap_str);
+        }
+        None => builder.append_null(),
+    });
+
+    Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
 fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_view_array = as_string_view_array(&args[0])?;
 
-    let result = string_view_array
-        .iter()
-        .map(initcap_string)
-        .collect::<StringArray>();
+    let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
+
+    string_view_array.iter().for_each(|str| match str {
+        Some(s) => {
+            let initcap_str = initcap_string(s);
+            builder.append_value(initcap_str);
+        }
+        None => builder.append_null(),
+    });
 
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
-fn initcap_string(input: Option<&str>) -> Option<String> {
-    input.map(|s| {
-        let mut result = String::with_capacity(s.len());
-        let mut prev_is_alphanumeric = false;
+fn initcap_string(input: &str) -> String {
+    let mut result = String::with_capacity(input.len());
+    let mut prev_is_alphanumeric = false;
 
-        for c in s.chars() {
-            let transformed = if prev_is_alphanumeric {
-                c.to_ascii_lowercase()
+    if input.is_ascii() {
+        for c in input.chars() {
+            if prev_is_alphanumeric {
+                result.push(c.to_ascii_lowercase());
             } else {
-                c.to_ascii_uppercase()
+                result.push(c.to_ascii_uppercase());
             };
-            result.push(transformed);
             prev_is_alphanumeric = c.is_ascii_alphanumeric();
         }
+    } else {
+        for c in input.chars() {
+            if prev_is_alphanumeric {
+                result.extend(c.to_lowercase());
+            } else {
+                result.extend(c.to_uppercase());
+            }
+            prev_is_alphanumeric = c.is_alphanumeric();
+        }
+    }
 
-        result
-    })
+    result
 }
 
 #[cfg(test)]
 mod tests {
-    use crate::string::initcap::InitcapFunc;
+    use crate::unicode::initcap::InitcapFunc;
     use crate::utils::test::test_function;
-    use arrow::array::{Array, StringArray};
+    use arrow::array::{Array, StringArray, StringViewArray};
     use arrow::datatypes::DataType::Utf8;
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -181,6 +202,19 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            InitcapFunc::new(),
+            vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
+                    .to_string()
+            )))],
+            Ok(Some(
+                "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
+            )),
+            &str,
+            Utf8,
+            StringArray
+        );
         test_function!(
             InitcapFunc::new(),
             vec![ColumnarValue::Scalar(ScalarValue::from(""))],
@@ -205,6 +239,7 @@ mod tests {
             Utf8,
             StringArray
         );
+
         test_function!(
             InitcapFunc::new(),
             vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
@@ -213,7 +248,7 @@ mod tests {
             Ok(Some("Hi Thomas")),
             &str,
             Utf8,
-            StringArray
+            StringViewArray
         );
         test_function!(
             InitcapFunc::new(),
@@ -223,7 +258,20 @@ mod tests {
             Ok(Some("Hi Thomas With M0re Than 12 Chars")),
             &str,
             Utf8,
-            StringArray
+            StringViewArray
+        );
+        test_function!(
+            InitcapFunc::new(),
+            vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
+                    .to_string()
+            )))],
+            Ok(Some(
+                "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
+            )),
+            &str,
+            Utf8,
+            StringViewArray
         );
         test_function!(
             InitcapFunc::new(),
@@ -233,7 +281,7 @@ mod tests {
             Ok(Some("")),
             &str,
             Utf8,
-            StringArray
+            StringViewArray
         );
         test_function!(
             InitcapFunc::new(),
@@ -241,7 +289,7 @@ mod tests {
             Ok(None),
             &str,
             Utf8,
-            StringArray
+            StringViewArray
         );
 
         Ok(())
diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs
index f31ece9196..e8e3eb3f4e 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF;
 
 pub mod character_length;
 pub mod find_in_set;
+pub mod initcap;
 pub mod left;
 pub mod lpad;
 pub mod reverse;
@@ -36,6 +37,7 @@ pub mod translate;
 // create UDFs
 make_udf_function!(character_length::CharacterLengthFunc, character_length);
 make_udf_function!(find_in_set::FindInSetFunc, find_in_set);
+make_udf_function!(initcap::InitcapFunc, initcap);
 make_udf_function!(left::LeftFunc, left);
 make_udf_function!(lpad::LPadFunc, lpad);
 make_udf_function!(right::RightFunc, right);
@@ -94,6 +96,10 @@ pub mod expr_fn {
         left,
         "returns the first `n` characters in the `string`",
         string n
+    ),(
+        initcap,
+        "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
+        string
     ),(
         find_in_set,
         "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings",
@@ -126,6 +132,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
     vec![
         character_length(),
         find_in_set(),
+        initcap(),
         left(),
         lpad(),
         reverse(),
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index 80fcc01028..2414e5864c 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -460,7 +460,7 @@ Andrew Datafusion📊🔥
 Xiangpeng Datafusion数据融合
 Raphael Datafusionдатафусион
 Under_Score Un Iść Core
-Percent Pan Tadeusz Ma Iść W KąT
+Percent Pan Tadeusz Ma Iść W Kąt
 (empty) (empty)
 (empty) (empty)
 % (empty)
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 2e4147f96e..be4f5e56b3 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1070,9 +1070,7 @@ find_in_set(str, strlist)
 
 ### `initcap`
 
-Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters.
-
-Note this function does not support UTF-8 characters.
+Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.
 
 ```
 initcap(str)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to