This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 0bcf4627d6 Update code comment for the cases of regularized RANGE 
frame and add tests for ORDER BY cases with RANGE frame (#8410)
0bcf4627d6 is described below

commit 0bcf4627d6ab9faddbfbe11817c55b7a0e0686eb
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Dec 4 15:24:18 2023 -0800

    Update code comment for the cases of regularized RANGE frame and add tests 
for ORDER BY cases with RANGE frame (#8410)
    
    * fix: RANGE frame can be regularized to ROWS frame only if empty ORDER BY 
clause
    
    * Fix flaky test
    
    * Update test comment
    
    * Add code comment
    
    * Update
---
 datafusion/expr/src/window_frame.rs           | 12 +++++-
 datafusion/sqllogictest/test_files/window.slt | 58 +++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/datafusion/expr/src/window_frame.rs 
b/datafusion/expr/src/window_frame.rs
index 5f161b85dd..2a64f21b85 100644
--- a/datafusion/expr/src/window_frame.rs
+++ b/datafusion/expr/src/window_frame.rs
@@ -148,12 +148,22 @@ impl WindowFrame {
 pub fn regularize(mut frame: WindowFrame, order_bys: usize) -> 
Result<WindowFrame> {
     if frame.units == WindowFrameUnits::Range && order_bys != 1 {
         // Normally, RANGE frames require an ORDER BY clause with exactly one
-        // column. However, an ORDER BY clause may be absent in two edge cases.
+        // column. However, an ORDER BY clause may be absent or present but 
with
+        // more than one column in two edge cases:
+        // 1. start bound is UNBOUNDED or CURRENT ROW
+        // 2. end bound is CURRENT ROW or UNBOUNDED.
+        // In these cases, we regularize the RANGE frame to be equivalent to a 
ROWS
+        // frame with the UNBOUNDED bounds.
+        // Note that this follows Postgres behavior.
         if (frame.start_bound.is_unbounded()
             || frame.start_bound == WindowFrameBound::CurrentRow)
             && (frame.end_bound == WindowFrameBound::CurrentRow
                 || frame.end_bound.is_unbounded())
         {
+            // If an ORDER BY clause is absent, the frame is equivalent to a 
ROWS
+            // frame with the UNBOUNDED bounds.
+            // If an ORDER BY clause is present but has more than one column, 
the
+            // frame is unchanged.
             if order_bys == 0 {
                 frame.units = WindowFrameUnits::Rows;
                 frame.start_bound =
diff --git a/datafusion/sqllogictest/test_files/window.slt 
b/datafusion/sqllogictest/test_files/window.slt
index bb6ca11948..c0dcd4ae1e 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -3727,3 +3727,61 @@ FROM score_board s
 
 statement ok
 DROP TABLE score_board;
+
+# Regularize RANGE frame
+query error DataFusion error: Error during planning: RANGE requires exactly 
one ORDER BY column
+select a,
+       rank() over (order by a, a + 1 RANGE BETWEEN UNBOUNDED PRECEDING AND 1 
FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY a
+
+query II
+select a,
+       rank() over (order by a RANGE BETWEEN UNBOUNDED PRECEDING AND 1 
FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+query error DataFusion error: Error during planning: RANGE requires exactly 
one ORDER BY column
+select a,
+       rank() over (RANGE BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY a
+
+query II
+select a,
+       rank() over (order by a, a + 1 RANGE BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+query II
+select a,
+       rank() over (order by a RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY a
+----
+1 1
+2 2
+
+# TODO: this is different to Postgres which returns [1, 1] for `rnk`.
+# Comment it because it is flaky now as it depends on the order of the `a` 
column.
+# query II
+# select a,
+#       rank() over (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING) rnk
+#       from (select 1 a union select 2 a) q ORDER BY rnk
+# ----
+# 1 1
+# 2 2
+
+# TODO: this works in Postgres which returns [1, 1].
+query error DataFusion error: Arrow error: Invalid argument error: must either 
specify a row count or at least one column
+select rank() over (RANGE between UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) 
rnk
+       from (select 1 a union select 2 a) q;
+
+# TODO: this is different to Postgres which returns [1, 1] for `rnk`.
+query I
+select rank() over (order by 1 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING) rnk
+       from (select 1 a union select 2 a) q ORDER BY rnk
+----
+1
+2

Reply via email to