This is an automated email from the ASF dual-hosted git repository.
goldmedal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a267784bc6 Support unicode character for `initcap` function (#13752)
a267784bc6 is described below
commit a267784bc60910bfdf558b4f8e600c1890ad6245
Author: Tai Le Manh <[email protected]>
AuthorDate: Sun Dec 22 18:10:35 2024 +0700
Support unicode character for `initcap` function (#13752)
* Support unicode character for 'initcap' function
Signed-off-by: Tai Le Manh <[email protected]>
* Update unit tests
* Fix clippy warning
* Update sqllogictests - initcap
* Update scalar_functions.md docs
* Add suggestions change
Signed-off-by: Tai Le Manh <[email protected]>
---------
Signed-off-by: Tai Le Manh <[email protected]>
---
datafusion/functions/Cargo.toml | 2 +-
datafusion/functions/benches/initcap.rs | 4 +-
datafusion/functions/src/string/mod.rs | 7 --
.../functions/src/{string => unicode}/initcap.rs | 114 +++++++++++++++------
datafusion/functions/src/unicode/mod.rs | 7 ++
.../test_files/string/string_query.slt.part | 2 +-
docs/source/user-guide/sql/scalar_functions.md | 4 +-
7 files changed, 93 insertions(+), 47 deletions(-)
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index de72c7ee94..fd986c4be4 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -212,4 +212,4 @@ required-features = ["math_expressions"]
[[bench]]
harness = false
name = "initcap"
-required-features = ["string_expressions"]
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs
index c88b6b5139..97c76831b3 100644
--- a/datafusion/functions/benches/initcap.rs
+++ b/datafusion/functions/benches/initcap.rs
@@ -24,7 +24,7 @@ use arrow::util::bench_util::{
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
-use datafusion_functions::string;
+use datafusion_functions::unicode;
use std::sync::Arc;
fn create_args<O: OffsetSizeTrait>(
@@ -46,7 +46,7 @@ fn create_args<O: OffsetSizeTrait>(
}
fn criterion_benchmark(c: &mut Criterion) {
- let initcap = string::initcap();
+ let initcap = unicode::initcap();
for size in [1024, 4096] {
let args = create_args::<i32>(size, 8, true);
c.bench_function(
diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs
index f156f070d9..c43aaeccbe 100644
--- a/datafusion/functions/src/string/mod.rs
+++ b/datafusion/functions/src/string/mod.rs
@@ -30,7 +30,6 @@ pub mod concat;
pub mod concat_ws;
pub mod contains;
pub mod ends_with;
-pub mod initcap;
pub mod levenshtein;
pub mod lower;
pub mod ltrim;
@@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr);
make_udf_function!(concat::ConcatFunc, concat);
make_udf_function!(concat_ws::ConcatWsFunc, concat_ws);
make_udf_function!(ends_with::EndsWithFunc, ends_with);
-make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
make_udf_function!(ltrim::LtrimFunc, ltrim);
make_udf_function!(lower::LowerFunc, lower);
@@ -94,10 +92,6 @@ pub mod expr_fn {
ends_with,
"Returns true if the `string` ends with the `suffix`, false otherwise.",
string suffix
- ),(
- initcap,
- "Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
- string
),(
levenshtein,
"Returns the Levenshtein distance between the two given strings",
@@ -177,7 +171,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
concat(),
concat_ws(),
ends_with(),
- initcap(),
levenshtein(),
lower(),
ltrim(),
diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/unicode/initcap.rs
similarity index 68%
rename from datafusion/functions/src/string/initcap.rs
rename to datafusion/functions/src/unicode/initcap.rs
index 2780dcaeeb..e9f966b958 100644
--- a/datafusion/functions/src/string/initcap.rs
+++ b/datafusion/functions/src/unicode/initcap.rs
@@ -18,7 +18,9 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
+use arrow::array::{
+ Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
+};
use arrow::datatypes::DataType;
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
other => {
- exec_err!("Unsupported data type {other:?} for function initcap")
+ exec_err!("Unsupported data type {other:?} for function `initcap`")
}
}
}
@@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_STRING,
- "Capitalizes the first character in each word in the ASCII input string. \
- Words are delimited by non-alphanumeric characters.\n\n\
- Note this function does not support UTF-8 characters.",
+ "Capitalizes the first character in each word in the input string. \
+ Words are delimited by non-alphanumeric characters.",
"initcap(str)",
)
.with_sql_example(
@@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;
- // first map is the iterator, second is for the `Option<_>`
- let result = string_array
- .iter()
- .map(initcap_string)
- .collect::<GenericStringArray<T>>();
+ let mut builder = GenericStringBuilder::<T>::with_capacity(
+ string_array.len(),
+ string_array.value_data().len(),
+ );
- Ok(Arc::new(result) as ArrayRef)
+ string_array.iter().for_each(|str| match str {
+ Some(s) => {
+ let initcap_str = initcap_string(s);
+ builder.append_value(initcap_str);
+ }
+ None => builder.append_null(),
+ });
+
+ Ok(Arc::new(builder.finish()) as ArrayRef)
}
fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;
- let result = string_view_array
- .iter()
- .map(initcap_string)
- .collect::<StringArray>();
+ let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
+
+ string_view_array.iter().for_each(|str| match str {
+ Some(s) => {
+ let initcap_str = initcap_string(s);
+ builder.append_value(initcap_str);
+ }
+ None => builder.append_null(),
+ });
- Ok(Arc::new(result) as ArrayRef)
+ Ok(Arc::new(builder.finish()) as ArrayRef)
}
-fn initcap_string(input: Option<&str>) -> Option<String> {
- input.map(|s| {
- let mut result = String::with_capacity(s.len());
- let mut prev_is_alphanumeric = false;
+fn initcap_string(input: &str) -> String {
+ let mut result = String::with_capacity(input.len());
+ let mut prev_is_alphanumeric = false;
- for c in s.chars() {
- let transformed = if prev_is_alphanumeric {
- c.to_ascii_lowercase()
+ if input.is_ascii() {
+ for c in input.chars() {
+ if prev_is_alphanumeric {
+ result.push(c.to_ascii_lowercase());
} else {
- c.to_ascii_uppercase()
+ result.push(c.to_ascii_uppercase());
};
- result.push(transformed);
prev_is_alphanumeric = c.is_ascii_alphanumeric();
}
+ } else {
+ for c in input.chars() {
+ if prev_is_alphanumeric {
+ result.extend(c.to_lowercase());
+ } else {
+ result.extend(c.to_uppercase());
+ }
+ prev_is_alphanumeric = c.is_alphanumeric();
+ }
+ }
- result
- })
+ result
}
#[cfg(test)]
mod tests {
- use crate::string::initcap::InitcapFunc;
+ use crate::unicode::initcap::InitcapFunc;
use crate::utils::test::test_function;
- use arrow::array::{Array, StringArray};
+ use arrow::array::{Array, StringArray, StringViewArray};
use arrow::datatypes::DataType::Utf8;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -181,6 +202,19 @@ mod tests {
Utf8,
StringArray
);
+ test_function!(
+ InitcapFunc::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+ "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
+ .to_string()
+ )))],
+ Ok(Some(
+ "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
+ )),
+ &str,
+ Utf8,
+ StringArray
+ );
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::from(""))],
@@ -205,6 +239,7 @@ mod tests {
Utf8,
StringArray
);
+
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
@@ -213,7 +248,7 @@ mod tests {
Ok(Some("Hi Thomas")),
&str,
Utf8,
- StringArray
+ StringViewArray
);
test_function!(
InitcapFunc::new(),
@@ -223,7 +258,20 @@ mod tests {
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
&str,
Utf8,
- StringArray
+ StringViewArray
+ );
+ test_function!(
+ InitcapFunc::new(),
+ vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+ "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
+ .to_string()
+ )))],
+ Ok(Some(
+ "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
+ )),
+ &str,
+ Utf8,
+ StringViewArray
);
test_function!(
InitcapFunc::new(),
@@ -233,7 +281,7 @@ mod tests {
Ok(Some("")),
&str,
Utf8,
- StringArray
+ StringViewArray
);
test_function!(
InitcapFunc::new(),
@@ -241,7 +289,7 @@ mod tests {
Ok(None),
&str,
Utf8,
- StringArray
+ StringViewArray
);
Ok(())
diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs
index f31ece9196..e8e3eb3f4e 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF;
pub mod character_length;
pub mod find_in_set;
+pub mod initcap;
pub mod left;
pub mod lpad;
pub mod reverse;
@@ -36,6 +37,7 @@ pub mod translate;
// create UDFs
make_udf_function!(character_length::CharacterLengthFunc, character_length);
make_udf_function!(find_in_set::FindInSetFunc, find_in_set);
+make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(left::LeftFunc, left);
make_udf_function!(lpad::LPadFunc, lpad);
make_udf_function!(right::RightFunc, right);
@@ -94,6 +96,10 @@ pub mod expr_fn {
left,
"returns the first `n` characters in the `string`",
string n
+ ),(
+ initcap,
+ "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
+ string
),(
find_in_set,
"Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings",
@@ -126,6 +132,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![
character_length(),
find_in_set(),
+ initcap(),
left(),
lpad(),
reverse(),
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index 80fcc01028..2414e5864c 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -460,7 +460,7 @@ Andrew Datafusion📊🔥
Xiangpeng Datafusion数据融合
Raphael Datafusionдатафусион
Under_Score Un Iść Core
-Percent Pan Tadeusz Ma Iść W KąT
+Percent Pan Tadeusz Ma Iść W Kąt
(empty) (empty)
(empty) (empty)
% (empty)
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 2e4147f96e..be4f5e56b3 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1070,9 +1070,7 @@ find_in_set(str, strlist)
### `initcap`
-Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters.
-
-Note this function does not support UTF-8 characters.
+Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.
```
initcap(str)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]