This is an automated email from the ASF dual-hosted git repository.
ytyou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 91c0975c8a Always use `StringViewArray` as output of `substr` (#14498)
91c0975c8a is described below
commit 91c0975c8a1382dff97474bfc45e504bc62ef016
Author: Kaifeng Zheng <[email protected]>
AuthorDate: Sat Feb 8 18:15:19 2025 +0800
Always use `StringViewArray` as output of `substr` (#14498)
* change substr return type to utf8view (bin/sqllogicaltests.rs to fix)
* fix sqllogictest/string_view
* fix clippy
* fix tcph benchmark result
---
datafusion/functions/src/unicode/substr.rs | 118 ++++++++++-----------
.../sqllogictest/test_files/string/string_view.slt | 2 +-
.../test_files/tpch/plans/q22.slt.part | 12 +--
3 files changed, 64 insertions(+), 68 deletions(-)
diff --git a/datafusion/functions/src/unicode/substr.rs
b/datafusion/functions/src/unicode/substr.rs
index 3767166cab..20d5f6e3ab 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -19,10 +19,10 @@ use std::any::Any;
use std::sync::Arc;
use crate::strings::make_and_append_view;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::make_scalar_function;
use arrow::array::{
- Array, ArrayIter, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
- NullBufferBuilder, OffsetSizeTrait, StringArrayType, StringViewArray,
+ Array, ArrayIter, ArrayRef, AsArray, Int64Array, NullBufferBuilder,
StringArrayType,
+ StringViewArray, StringViewBuilder,
};
use arrow::buffer::ScalarBuffer;
use arrow::datatypes::DataType;
@@ -90,12 +90,9 @@ impl ScalarUDFImpl for SubstrFunc {
&self.signature
}
- fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
- if arg_types[0] == DataType::Utf8View {
- Ok(DataType::Utf8View)
- } else {
- utf8_to_str_type(&arg_types[0], "substr")
- }
+ // `SubstrFunc` always generates `Utf8View` output for its efficiency.
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ Ok(DataType::Utf8View)
}
fn invoke_batch(
@@ -189,11 +186,11 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => {
let string_array = args[0].as_string::<i32>();
- string_substr::<_, i32>(string_array, &args[1..])
+ string_substr::<_>(string_array, &args[1..])
}
DataType::LargeUtf8 => {
let string_array = args[0].as_string::<i64>();
- string_substr::<_, i64>(string_array, &args[1..])
+ string_substr::<_>(string_array, &args[1..])
}
DataType::Utf8View => {
let string_array = args[0].as_string_view();
@@ -429,10 +426,9 @@ fn string_view_substr(
}
}
-fn string_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) ->
Result<ArrayRef>
+fn string_substr<'a, V>(string_array: V, args: &[ArrayRef]) -> Result<ArrayRef>
where
V: StringArrayType<'a>,
- T: OffsetSizeTrait,
{
let start_array = as_int64_array(&args[0])?;
let count_array_opt = if args.len() == 2 {
@@ -447,7 +443,7 @@ where
match args.len() {
1 => {
let iter = ArrayIter::new(string_array);
- let mut result_builder = GenericStringBuilder::<T>::new();
+ let mut result_builder = StringViewBuilder::new();
for (string, start) in iter.zip(start_array.iter()) {
match (string, start) {
(Some(string), Some(start)) => {
@@ -470,7 +466,7 @@ where
2 => {
let iter = ArrayIter::new(string_array);
let count_array = count_array_opt.unwrap();
- let mut result_builder = GenericStringBuilder::<T>::new();
+ let mut result_builder = StringViewBuilder::new();
for ((string, start), count) in
iter.zip(start_array.iter()).zip(count_array.iter())
@@ -512,8 +508,8 @@ where
#[cfg(test)]
mod tests {
- use arrow::array::{Array, StringArray, StringViewArray};
- use arrow::datatypes::DataType::{Utf8, Utf8View};
+ use arrow::array::{Array, StringViewArray};
+ use arrow::datatypes::DataType::Utf8View;
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -623,8 +619,8 @@ mod tests {
],
Ok(Some("alphabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -634,8 +630,8 @@ mod tests {
],
Ok(Some("ésoj")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -645,8 +641,8 @@ mod tests {
],
Ok(Some("joséésoj")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -656,8 +652,8 @@ mod tests {
],
Ok(Some("alphabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -667,8 +663,8 @@ mod tests {
],
Ok(Some("lphabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -678,8 +674,8 @@ mod tests {
],
Ok(Some("phabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -689,8 +685,8 @@ mod tests {
],
Ok(Some("alphabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -700,8 +696,8 @@ mod tests {
],
Ok(Some("")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -711,8 +707,8 @@ mod tests {
],
Ok(None),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -723,8 +719,8 @@ mod tests {
],
Ok(Some("ph")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -735,8 +731,8 @@ mod tests {
],
Ok(Some("phabet")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -747,8 +743,8 @@ mod tests {
],
Ok(Some("alph")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
// starting from 5 (10 + -5)
test_function!(
@@ -760,8 +756,8 @@ mod tests {
],
Ok(Some("alph")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
// starting from -1 (4 + -5)
test_function!(
@@ -773,8 +769,8 @@ mod tests {
],
Ok(Some("")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
// starting from 0 (5 + -5)
test_function!(
@@ -786,8 +782,8 @@ mod tests {
],
Ok(Some("")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -798,8 +794,8 @@ mod tests {
],
Ok(None),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -810,8 +806,8 @@ mod tests {
],
Ok(None),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -822,8 +818,8 @@ mod tests {
],
exec_err!("negative substring length not allowed: substr(<str>, 1,
-1)"),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -834,8 +830,8 @@ mod tests {
],
Ok(Some("és")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
#[cfg(not(feature = "unicode_expressions"))]
test_function!(
@@ -848,8 +844,8 @@ mod tests {
"function substr requires compilation with feature flag:
unicode_expressions."
),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -859,8 +855,8 @@ mod tests {
],
Ok(Some("abc")),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
test_function!(
SubstrFunc::new(),
@@ -871,8 +867,8 @@ mod tests {
],
exec_err!("negative overflow when calculating skip value"),
&str,
- Utf8,
- StringArray
+ Utf8View,
+ StringViewArray
);
Ok(())
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt
b/datafusion/sqllogictest/test_files/string/string_view.slt
index c54e2aa700..a04cea7cd8 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -382,7 +382,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8,
Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8,
substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2,
starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1),
Int64(2))) AS c3
+01)Projection: starts_with(CAST(test.column1_utf8 AS Utf8View),
substr(test.column1_utf8, Int64(1), Int64(2))) AS c1,
starts_with(CAST(test.column1_large_utf8 AS Utf8View),
substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2,
starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1),
Int64(2))) AS c3
02)--TableScan: test projection=[column1_utf8, column1_large_utf8,
column1_utf8view]
query BBB
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
index e7b1f0a598..9ad9936125 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
@@ -64,15 +64,15 @@ logical_plan
06)----------Inner Join: Filter: CAST(customer.c_acctbal AS Decimal128(19,
6)) > __scalar_sq_2.avg(customer.c_acctbal)
07)------------Projection: customer.c_phone, customer.c_acctbal
08)--------------LeftAnti Join: customer.c_custkey =
__correlated_sq_1.o_custkey
-09)----------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN
([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"),
Utf8("17")])
-10)------------------TableScan: customer projection=[c_custkey, c_phone,
c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN
([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"),
Utf8("17")])]
+09)----------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN
([Utf8View("13"), Utf8View("31"), Utf8View("23"), Utf8View("29"),
Utf8View("30"), Utf8View("18"), Utf8View("17")])
+10)------------------TableScan: customer projection=[c_custkey, c_phone,
c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN
([Utf8View("13"), Utf8View("31"), Utf8View("23"), Utf8View("29"),
Utf8View("30"), Utf8View("18"), Utf8View("17")])]
11)----------------SubqueryAlias: __correlated_sq_1
12)------------------TableScan: orders projection=[o_custkey]
13)------------SubqueryAlias: __scalar_sq_2
14)--------------Aggregate: groupBy=[[]], aggr=[[avg(customer.c_acctbal)]]
15)----------------Projection: customer.c_acctbal
-16)------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"),
Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])
-17)--------------------TableScan: customer projection=[c_phone, c_acctbal],
partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2),
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"),
Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])]
+16)------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("13"),
Utf8View("31"), Utf8View("23"), Utf8View("29"), Utf8View("30"), Utf8View("18"),
Utf8View("17")])
+17)--------------------TableScan: customer projection=[c_phone, c_acctbal],
partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2),
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("13"),
Utf8View("31"), Utf8View("23"), Utf8View("29"), Utf8View("30"), Utf8View("18"),
Utf8View("17")])]
physical_plan
01)SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST]
02)--SortExec: expr=[cntrycode@0 ASC NULLS LAST], preserve_partitioning=[true]
@@ -90,7 +90,7 @@ physical_plan
14)--------------------------CoalesceBatchesExec: target_batch_size=8192
15)----------------------------RepartitionExec:
partitioning=Hash([c_custkey@0], 4), input_partitions=4
16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------FilterExec: Use substr(c_phone@1, 1, 2) IN
(SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal {
value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30")
}, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }])
+17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN
([Literal { value: Utf8View("13") }, Literal { value: Utf8View("31") }, Literal
{ value: Utf8View("23") }, Literal { value: Utf8View("29") }, Literal { value:
Utf8View("30") }, Literal { value: Utf8View("18") }, Literal { value:
Utf8View("17") }])
18)----------------------------------RepartitionExec:
partitioning=RoundRobinBatch(4), input_partitions=1
19)------------------------------------DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]},
projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false
20)--------------------------CoalesceBatchesExec: target_batch_size=8192
@@ -100,6 +100,6 @@ physical_plan
24)----------------------CoalescePartitionsExec
25)------------------------AggregateExec: mode=Partial, gby=[],
aggr=[avg(customer.c_acctbal)]
26)--------------------------CoalesceBatchesExec: target_batch_size=8192
-27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND Use
substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal {
value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29")
}, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal {
value: Utf8("17") }]), projection=[c_acctbal@1]
+27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND
substr(c_phone@0, 1, 2) IN ([Literal { value: Utf8View("13") }, Literal {
value: Utf8View("31") }, Literal { value: Utf8View("23") }, Literal { value:
Utf8View("29") }, Literal { value: Utf8View("30") }, Literal { value:
Utf8View("18") }, Literal { value: Utf8View("17") }]), projection=[c_acctbal@1]
28)------------------------------RepartitionExec:
partitioning=RoundRobinBatch(4), input_partitions=1
29)--------------------------------DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]},
projection=[c_phone, c_acctbal], file_type=csv, has_header=false
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]