This is an automated email from the ASF dual-hosted git repository.

ytyou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 91c0975c8a Always use `StringViewArray` as output of `substr` (#14498)
91c0975c8a is described below

commit 91c0975c8a1382dff97474bfc45e504bc62ef016
Author: Kaifeng Zheng <[email protected]>
AuthorDate: Sat Feb 8 18:15:19 2025 +0800

    Always use `StringViewArray` as output of `substr` (#14498)
    
    * change substr return type to utf8view (bin/sqllogicaltests.rs to fix)
    
    * fix sqllogictest/string_view
    
    * fix clippy
    
    * fix tcph benchmark result
---
 datafusion/functions/src/unicode/substr.rs         | 118 ++++++++++-----------
 .../sqllogictest/test_files/string/string_view.slt |   2 +-
 .../test_files/tpch/plans/q22.slt.part             |  12 +--
 3 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/datafusion/functions/src/unicode/substr.rs 
b/datafusion/functions/src/unicode/substr.rs
index 3767166cab..20d5f6e3ab 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -19,10 +19,10 @@ use std::any::Any;
 use std::sync::Arc;
 
 use crate::strings::make_and_append_view;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::make_scalar_function;
 use arrow::array::{
-    Array, ArrayIter, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
-    NullBufferBuilder, OffsetSizeTrait, StringArrayType, StringViewArray,
+    Array, ArrayIter, ArrayRef, AsArray, Int64Array, NullBufferBuilder, 
StringArrayType,
+    StringViewArray, StringViewBuilder,
 };
 use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::DataType;
@@ -90,12 +90,9 @@ impl ScalarUDFImpl for SubstrFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types[0] == DataType::Utf8View {
-            Ok(DataType::Utf8View)
-        } else {
-            utf8_to_str_type(&arg_types[0], "substr")
-        }
+    // `SubstrFunc` always generates `Utf8View` output for its efficiency.
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8View)
     }
 
     fn invoke_batch(
@@ -189,11 +186,11 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            string_substr::<_, i32>(string_array, &args[1..])
+            string_substr::<_>(string_array, &args[1..])
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            string_substr::<_, i64>(string_array, &args[1..])
+            string_substr::<_>(string_array, &args[1..])
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
@@ -429,10 +426,9 @@ fn string_view_substr(
     }
 }
 
-fn string_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> 
Result<ArrayRef>
+fn string_substr<'a, V>(string_array: V, args: &[ArrayRef]) -> Result<ArrayRef>
 where
     V: StringArrayType<'a>,
-    T: OffsetSizeTrait,
 {
     let start_array = as_int64_array(&args[0])?;
     let count_array_opt = if args.len() == 2 {
@@ -447,7 +443,7 @@ where
     match args.len() {
         1 => {
             let iter = ArrayIter::new(string_array);
-            let mut result_builder = GenericStringBuilder::<T>::new();
+            let mut result_builder = StringViewBuilder::new();
             for (string, start) in iter.zip(start_array.iter()) {
                 match (string, start) {
                     (Some(string), Some(start)) => {
@@ -470,7 +466,7 @@ where
         2 => {
             let iter = ArrayIter::new(string_array);
             let count_array = count_array_opt.unwrap();
-            let mut result_builder = GenericStringBuilder::<T>::new();
+            let mut result_builder = StringViewBuilder::new();
 
             for ((string, start), count) in
                 iter.zip(start_array.iter()).zip(count_array.iter())
@@ -512,8 +508,8 @@ where
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, StringArray, StringViewArray};
-    use arrow::datatypes::DataType::{Utf8, Utf8View};
+    use arrow::array::{Array, StringViewArray};
+    use arrow::datatypes::DataType::Utf8View;
 
     use datafusion_common::{exec_err, Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -623,8 +619,8 @@ mod tests {
             ],
             Ok(Some("alphabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -634,8 +630,8 @@ mod tests {
             ],
             Ok(Some("ésoj")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -645,8 +641,8 @@ mod tests {
             ],
             Ok(Some("joséésoj")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -656,8 +652,8 @@ mod tests {
             ],
             Ok(Some("alphabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -667,8 +663,8 @@ mod tests {
             ],
             Ok(Some("lphabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -678,8 +674,8 @@ mod tests {
             ],
             Ok(Some("phabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -689,8 +685,8 @@ mod tests {
             ],
             Ok(Some("alphabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -700,8 +696,8 @@ mod tests {
             ],
             Ok(Some("")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -711,8 +707,8 @@ mod tests {
             ],
             Ok(None),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -723,8 +719,8 @@ mod tests {
             ],
             Ok(Some("ph")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -735,8 +731,8 @@ mod tests {
             ],
             Ok(Some("phabet")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -747,8 +743,8 @@ mod tests {
             ],
             Ok(Some("alph")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         // starting from 5 (10 + -5)
         test_function!(
@@ -760,8 +756,8 @@ mod tests {
             ],
             Ok(Some("alph")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         // starting from -1 (4 + -5)
         test_function!(
@@ -773,8 +769,8 @@ mod tests {
             ],
             Ok(Some("")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         // starting from 0 (5 + -5)
         test_function!(
@@ -786,8 +782,8 @@ mod tests {
             ],
             Ok(Some("")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -798,8 +794,8 @@ mod tests {
             ],
             Ok(None),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -810,8 +806,8 @@ mod tests {
             ],
             Ok(None),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -822,8 +818,8 @@ mod tests {
             ],
             exec_err!("negative substring length not allowed: substr(<str>, 1, 
-1)"),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -834,8 +830,8 @@ mod tests {
             ],
             Ok(Some("és")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         #[cfg(not(feature = "unicode_expressions"))]
         test_function!(
@@ -848,8 +844,8 @@ mod tests {
                 "function substr requires compilation with feature flag: 
unicode_expressions."
             ),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -859,8 +855,8 @@ mod tests {
             ],
             Ok(Some("abc")),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
         test_function!(
             SubstrFunc::new(),
@@ -871,8 +867,8 @@ mod tests {
             ],
             exec_err!("negative overflow when calculating skip value"),
             &str,
-            Utf8,
-            StringArray
+            Utf8View,
+            StringViewArray
         );
 
         Ok(())
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt 
b/datafusion/sqllogictest/test_files/string/string_view.slt
index c54e2aa700..a04cea7cd8 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -382,7 +382,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, 
Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, 
substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, 
starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), 
Int64(2))) AS c3
+01)Projection: starts_with(CAST(test.column1_utf8 AS Utf8View), 
substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, 
starts_with(CAST(test.column1_large_utf8 AS Utf8View), 
substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, 
starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), 
Int64(2))) AS c3
 02)--TableScan: test projection=[column1_utf8, column1_large_utf8, 
column1_utf8view]
 
 query BBB
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part 
b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
index e7b1f0a598..9ad9936125 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
@@ -64,15 +64,15 @@ logical_plan
 06)----------Inner Join:  Filter: CAST(customer.c_acctbal AS Decimal128(19, 
6)) > __scalar_sq_2.avg(customer.c_acctbal)
 07)------------Projection: customer.c_phone, customer.c_acctbal
 08)--------------LeftAnti Join: customer.c_custkey = 
__correlated_sq_1.o_custkey
-09)----------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN 
([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), 
Utf8("17")])
-10)------------------TableScan: customer projection=[c_custkey, c_phone, 
c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN 
([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), 
Utf8("17")])]
+09)----------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN 
([Utf8View("13"), Utf8View("31"), Utf8View("23"), Utf8View("29"), 
Utf8View("30"), Utf8View("18"), Utf8View("17")])
+10)------------------TableScan: customer projection=[c_custkey, c_phone, 
c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN 
([Utf8View("13"), Utf8View("31"), Utf8View("23"), Utf8View("29"), 
Utf8View("30"), Utf8View("18"), Utf8View("17")])]
 11)----------------SubqueryAlias: __correlated_sq_1
 12)------------------TableScan: orders projection=[o_custkey]
 13)------------SubqueryAlias: __scalar_sq_2
 14)--------------Aggregate: groupBy=[[]], aggr=[[avg(customer.c_acctbal)]]
 15)----------------Projection: customer.c_acctbal
-16)------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND 
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), 
Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])
-17)--------------------TableScan: customer projection=[c_phone, c_acctbal], 
partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2), 
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), 
Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])]
+16)------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND 
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("13"), 
Utf8View("31"), Utf8View("23"), Utf8View("29"), Utf8View("30"), Utf8View("18"), 
Utf8View("17")])
+17)--------------------TableScan: customer projection=[c_phone, c_acctbal], 
partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2), 
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("13"), 
Utf8View("31"), Utf8View("23"), Utf8View("29"), Utf8View("30"), Utf8View("18"), 
Utf8View("17")])]
 physical_plan
 01)SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST]
 02)--SortExec: expr=[cntrycode@0 ASC NULLS LAST], preserve_partitioning=[true]
@@ -90,7 +90,7 @@ physical_plan
 14)--------------------------CoalesceBatchesExec: target_batch_size=8192
 15)----------------------------RepartitionExec: 
partitioning=Hash([c_custkey@0], 4), input_partitions=4
 16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------FilterExec: Use substr(c_phone@1, 1, 2) IN 
(SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { 
value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") 
}, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }])
+17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN 
([Literal { value: Utf8View("13") }, Literal { value: Utf8View("31") }, Literal 
{ value: Utf8View("23") }, Literal { value: Utf8View("29") }, Literal { value: 
Utf8View("30") }, Literal { value: Utf8View("18") }, Literal { value: 
Utf8View("17") }])
 18)----------------------------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1
 19)------------------------------------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, 
projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false
 20)--------------------------CoalesceBatchesExec: target_batch_size=8192
@@ -100,6 +100,6 @@ physical_plan
 24)----------------------CoalescePartitionsExec
 25)------------------------AggregateExec: mode=Partial, gby=[], 
aggr=[avg(customer.c_acctbal)]
 26)--------------------------CoalesceBatchesExec: target_batch_size=8192
-27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND Use 
substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { 
value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") 
}, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { 
value: Utf8("17") }]), projection=[c_acctbal@1]
+27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND 
substr(c_phone@0, 1, 2) IN ([Literal { value: Utf8View("13") }, Literal { 
value: Utf8View("31") }, Literal { value: Utf8View("23") }, Literal { value: 
Utf8View("29") }, Literal { value: Utf8View("30") }, Literal { value: 
Utf8View("18") }, Literal { value: Utf8View("17") }]), projection=[c_acctbal@1]
 28)------------------------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1
 29)--------------------------------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, 
projection=[c_phone, c_acctbal], file_type=csv, has_header=false


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to