This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0bcf4627d6 Update code comment for the cases of regularized RANGE
frame and add tests for ORDER BY cases with RANGE frame (#8410)
0bcf4627d6 is described below
commit 0bcf4627d6ab9faddbfbe11817c55b7a0e0686eb
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Dec 4 15:24:18 2023 -0800
Update code comment for the cases of regularized RANGE frame and add tests
for ORDER BY cases with RANGE frame (#8410)
* fix: RANGE frame can be regularized to ROWS frame only if empty ORDER BY
clause
* Fix flaky test
* Update test comment
* Add code comment
* Update
---
datafusion/expr/src/window_frame.rs | 12 +++++-
datafusion/sqllogictest/test_files/window.slt | 58 +++++++++++++++++++++++++++
2 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/datafusion/expr/src/window_frame.rs
b/datafusion/expr/src/window_frame.rs
index 5f161b85dd..2a64f21b85 100644
--- a/datafusion/expr/src/window_frame.rs
+++ b/datafusion/expr/src/window_frame.rs
@@ -148,12 +148,22 @@ impl WindowFrame {
pub fn regularize(mut frame: WindowFrame, order_bys: usize) ->
Result<WindowFrame> {
if frame.units == WindowFrameUnits::Range && order_bys != 1 {
// Normally, RANGE frames require an ORDER BY clause with exactly one
- // column. However, an ORDER BY clause may be absent in two edge cases.
+ // column. However, an ORDER BY clause may be absent or present but
with
+ // more than one column in two edge cases:
+ // 1. start bound is UNBOUNDED or CURRENT ROW
+ // 2. end bound is CURRENT ROW or UNBOUNDED.
+ // In these cases, we regularize the RANGE frame to be equivalent to a
ROWS
+ // frame with the UNBOUNDED bounds.
+ // Note that this follows Postgres behavior.
if (frame.start_bound.is_unbounded()
|| frame.start_bound == WindowFrameBound::CurrentRow)
&& (frame.end_bound == WindowFrameBound::CurrentRow
|| frame.end_bound.is_unbounded())
{
+ // If an ORDER BY clause is absent, the frame is equivalent to a
ROWS
+ // frame with the UNBOUNDED bounds.
+ // If an ORDER BY clause is present but has more than one column,
the
+ // frame is unchanged.
if order_bys == 0 {
frame.units = WindowFrameUnits::Rows;
frame.start_bound =
diff --git a/datafusion/sqllogictest/test_files/window.slt
b/datafusion/sqllogictest/test_files/window.slt
index bb6ca11948..c0dcd4ae1e 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -3727,3 +3727,61 @@ FROM score_board s
statement ok
DROP TABLE score_board;
+
+# Regularize RANGE frame
+query error DataFusion error: Error during planning: RANGE requires exactly
one ORDER BY column
+select a,
+ rank() over (order by a, a + 1 RANGE BETWEEN UNBOUNDED PRECEDING AND 1
FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY a
+
+query II
+select a,
+ rank() over (order by a RANGE BETWEEN UNBOUNDED PRECEDING AND 1
FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+query error DataFusion error: Error during planning: RANGE requires exactly
one ORDER BY column
+select a,
+ rank() over (RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY a
+
+query II
+select a,
+ rank() over (order by a, a + 1 RANGE BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+query II
+select a,
+ rank() over (order by a RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+# TODO: this is different to Postgres which returns [1, 1] for `rnk`.
+# Comment it because it is flaky now as it depends on the order of the `a`
column.
+# query II
+# select a,
+# rank() over (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING) rnk
+# from (select 1 a union select 2 a) q ORDER BY rnk
+# ----
+# 1 1
+# 2 2
+
+# TODO: this works in Postgres which returns [1, 1].
+query error DataFusion error: Arrow error: Invalid argument error: must either
specify a row count or at least one column
+select rank() over (RANGE between UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
rnk
+ from (select 1 a union select 2 a) q;
+
+# TODO: this is different to Postgres which returns [1, 1] for `rnk`.
+query I
+select rank() over (order by 1 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING) rnk
+ from (select 1 a union select 2 a) q ORDER BY rnk
+----
+1
+2