This is an automated email from the ASF dual-hosted git repository.
weijun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d7c70e4788 Support utf8view datatype for window (#15257)
d7c70e4788 is described below
commit d7c70e4788183f741370fee41f7e7db6637e1815
Author: Qi Zhu <[email protected]>
AuthorDate: Sun Mar 16 20:57:37 2025 +0800
Support utf8view datatype for window (#15257)
* Support utf8view datatype for window
* Fix table name
* Also support subquery testing
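For context, a minimal sketch of the kind of query this change enables, modeled on the new sqllogictest cases in the diff below (the table name here is illustrative only):

    -- Illustrative sketch, not part of the commit: build a Utf8View column with
    -- arrow_cast, then run a window function ordered by it, as the new tests do.
    CREATE TABLE t_with_utf8view AS
    SELECT arrow_cast(c1, 'Utf8View') AS c1, c9 FROM aggregate_test_100;

    -- The type_coercion change below lets Utf8View be treated like Utf8/LargeUtf8
    -- when coercing the bounds of the default RANGE window frame.
    SELECT c9, RANK() OVER (ORDER BY c1) AS r
    FROM t_with_utf8view
    ORDER BY c9
    LIMIT 5;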
---
datafusion/expr/src/type_coercion/mod.rs | 9 ++--
datafusion/optimizer/src/analyzer/type_coercion.rs | 4 +-
.../sqllogictest/test_files/subquery_sort.slt | 29 +++++++++++
datafusion/sqllogictest/test_files/window.slt | 59 ++++++++++++++++++++++
4 files changed, 96 insertions(+), 5 deletions(-)
diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs
index 3a5c65fb46..4fc150ef29 100644
--- a/datafusion/expr/src/type_coercion/mod.rs
+++ b/datafusion/expr/src/type_coercion/mod.rs
@@ -79,9 +79,12 @@ pub fn is_datetime(dt: &DataType) -> bool {
)
}
-/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`.
-pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool {
- matches!(dt, DataType::Utf8 | DataType::LargeUtf8)
+/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`.
+pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool {
+ matches!(
+ dt,
+ DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
+ )
}
/// Determine whether the given data type `dt` is a `Decimal`.
diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs
index 5d6b226ff1..07eb795462 100644
--- a/datafusion/optimizer/src/analyzer/type_coercion.rs
+++ b/datafusion/optimizer/src/analyzer/type_coercion.rs
@@ -46,7 +46,7 @@ use datafusion_expr::type_coercion::functions::{
use datafusion_expr::type_coercion::other::{
get_coerce_type_for_case_expression, get_coerce_type_for_list,
};
-use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8};
+use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8};
use datafusion_expr::utils::merge_schema;
use datafusion_expr::{
is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not,
@@ -716,7 +716,7 @@ fn coerce_frame_bound(
fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> {
if col_type.is_numeric()
- || is_utf8_or_large_utf8(col_type)
+ || is_utf8_or_utf8view_or_large_utf8(col_type)
|| matches!(col_type, DataType::Null)
|| matches!(col_type, DataType::Boolean)
{
diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt
index 4ca19c0b6a..5d22bf92e7 100644
--- a/datafusion/sqllogictest/test_files/subquery_sort.slt
+++ b/datafusion/sqllogictest/test_files/subquery_sort.slt
@@ -104,6 +104,35 @@ physical_plan
05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false]
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true
+#Test with utf8view for window function
+statement ok
+CREATE TABLE sink_table_with_utf8view AS
+SELECT arrow_cast(c1, 'Utf8View') AS c1, c2, c3, c9
+FROM sink_table;
+
+
+query TT
+EXPLAIN SELECT t2.c1, t2.r FROM (SELECT c1, RANK() OVER (ORDER BY c1 DESC) AS r, c3, c9 FROM sink_table_with_utf8view ORDER BY c1, c3 LIMIT 2) AS t2 ORDER BY t2.c1, t2.c3, t2.c9;
+----
+logical_plan
+01)Projection: t2.c1, t2.r
+02)--Sort: t2.c1 ASC NULLS LAST, t2.c3 ASC NULLS LAST, t2.c9 ASC NULLS LAST
+03)----SubqueryAlias: t2
+04)------Sort: sink_table_with_utf8view.c1 ASC NULLS LAST, sink_table_with_utf8view.c3 ASC NULLS LAST, fetch=2
+05)--------Projection: sink_table_with_utf8view.c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r, sink_table_with_utf8view.c3, sink_table_with_utf8view.c9
+06)----------WindowAggr: windowExpr=[[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+07)------------TableScan: sink_table_with_utf8view projection=[c1, c3, c9]
+physical_plan
+01)ProjectionExec: expr=[c1@0 as c1, r@1 as r]
+02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false]
+03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9]
+04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], m [...]
+05)--------SortPreservingMergeExec: [c1@0 DESC]
+06)----------SortExec: expr=[c1@0 DESC], preserve_partitioning=[true]
+07)------------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
+
+statement ok
+DROP TABLE sink_table_with_utf8view;
query TT
EXPLAIN SELECT c1, c2 FROM (SELECT DISTINCT ON (c1) c1, c2, c3, c9 FROM sink_table ORDER BY c1, c3 DESC, c9) AS t2 ORDER BY t2.c1, t2.c3 DESC, t2.c9
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index 1a9acc0f53..fd623b67fe 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -5536,3 +5536,62 @@ physical_plan
01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5]
02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true
+
+# Testing Utf8View with window
+statement ok
+CREATE TABLE aggregate_test_100_utf8view AS SELECT
+ arrow_cast(c1, 'Utf8View') as c1,
+ c9,
+ c13
+FROM aggregate_test_100;
+
+
+#fn window_frame_ranges_string_check
+query II
+SELECT
+SUM(LENGTH(c13)) OVER(ORDER BY c13),
+SUM(LENGTH(c1)) OVER(ORDER BY c1)
+FROM aggregate_test_100_utf8view
+ORDER BY c9
+LIMIT 5
+----
+2100 100
+510 79
+1440 21
+1830 61
+2010 21
+
+
+#fn test_window_rank
+query IIIIIRR
+SELECT
+ c9,
+ RANK() OVER(ORDER BY c1) AS rank1,
+ RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2,
+ DENSE_RANK() OVER(ORDER BY c1) as dense_rank1,
+ DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2,
+ PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1,
+ PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2
+ FROM aggregate_test_100_utf8view
+ ORDER BY c9
+ LIMIT 5
+----
+28774375 80 80 5 5 0.79797979798 0.79797979798
+63044568 62 62 4 4 0.616161616162 0.616161616162
+141047417 1 1 1 1 0 0
+141680161 41 41 3 3 0.40404040404 0.40404040404
+145294611 1 1 1 1 0 0
+
+
+# CTAS with NTILE function
+statement ok
+CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggregate_test_100_utf8view;
+
+statement ok
+DROP TABLE new_table;
+
+statement ok
+DROP TABLE aggregate_test_100_utf8view;
+
+statement ok
+DROP TABLE aggregate_test_100
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]