[doris] branch master updated: [bug](node)fix dense_rank function in partition sort node return wrong rows (#24727)

yiguolei Thu, 21 Sep 2023 04:13:48 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 7630fe7b7b [bug](node)fix dense_rank function in partition sort node return wrong rows (#24727)
7630fe7b7b is described below

commit 7630fe7b7b583ff240b4abeaf73388ba9a2ba9af
Author: zhangstar333 <[email protected]>
AuthorDate: Thu Sep 21 19:13:30 2023 +0800

    [bug](node)fix dense_rank function in partition sort node return wrong rows (#24727)
---
 be/src/vec/common/sort/partition_sorter.cpp        | 39 +++++++------
 be/src/vec/common/sort/partition_sorter.h          |  1 +
 .../test_select_stddev_variance_window.out         | 68 ++++++++++++++++++++++
 .../test_select_stddev_variance_window.groovy      | 24 ++++++++
 4 files changed, 114 insertions(+), 18 deletions(-)

diff --git a/be/src/vec/common/sort/partition_sorter.cpp 
b/be/src/vec/common/sort/partition_sorter.cpp
index 1bffb5ed76..083c676ba8 100644
--- a/be/src/vec/common/sort/partition_sorter.cpp
+++ b/be/src/vec/common/sort/partition_sorter.cpp
@@ -101,12 +101,10 @@ Status PartitionSorter::partition_sort_read(Block* 
output_block, bool* eos, int
     auto& priority_queue = _state->get_priority_queue();
 
     bool get_enough_data = false;
-    bool first_compare_row = false;
     while (!priority_queue.empty()) {
         auto current = priority_queue.top();
         priority_queue.pop();
         if (UNLIKELY(_previous_row->impl == nullptr)) {
-            first_compare_row = true;
             *_previous_row = current;
         }
 
@@ -125,34 +123,39 @@ Status PartitionSorter::partition_sort_read(Block* 
output_block, bool* eos, int
             break;
         }
         case TopNAlgorithm::DENSE_RANK: {
+            //  dense_rank(): 1,1,1,2,2,2,2,.......,2,3,3,3, if SQL: where rk < 3, need output all 1 and 2
             //3 dense_rank() maybe need distinct rows of partition_inner_limit
-            if ((current_output_rows + _output_total_rows) < 
_partition_inner_limit) {
-                for (size_t i = 0; i < num_columns; ++i) {
-                    merged_columns[i]->insert_from(*current->all_columns[i], 
current->pos);
-                }
-            } else {
+            //3.1 _has_global_limit = true, so check (current_output_rows + _output_total_rows) >= _partition_inner_limit)
+            //3.2 _has_global_limit = false. so check have output distinct rows, not _output_total_rows
+            if (_has_global_limit &&
+                (current_output_rows + _output_total_rows) >= 
_partition_inner_limit) {
                 get_enough_data = true;
+                break;
             }
             if (_has_global_limit) {
                 current_output_rows++;
             } else {
-                //when it's first comes, the rows are same no need compare
-                if (first_compare_row) {
-                    current_output_rows++;
-                    first_compare_row = false;
-                } else {
-                    // not the first comes, so need compare those, when is distinct row
-                    // so could current_output_rows++
-                    bool cmp_res = _previous_row->compare_two_rows(current);
-                    if (cmp_res == false) { // distinct row
-                        current_output_rows++;
-                        *_previous_row = current;
+                bool cmp_res = _previous_row->compare_two_rows(current);
+                //get a distinct row
+                if (cmp_res == false) {
+                    _output_distinct_rows++; //need rows++ firstly
+                    if (_output_distinct_rows >= _partition_inner_limit) {
+                        get_enough_data = true;
+                        break;
                     }
+                    *_previous_row = current;
                 }
             }
+            for (size_t i = 0; i < num_columns; ++i) {
+                merged_columns[i]->insert_from(*current->all_columns[i], 
current->pos);
+            }
             break;
         }
         case TopNAlgorithm::RANK: {
+            //  rank(): 1,1,1,4,5,6,6,6.....,6,100,101. if SQL where rk < 7, need output all 1,1,1,4,5,6,6,....6
+            //2 rank() maybe need check when have get a distinct row
+            //2.1 _has_global_limit = true: (current_output_rows + _output_total_rows) >= _partition_inner_limit)
+            //2.2 _has_global_limit = false: so when the cmp_res is get a distinct row, need check have output all rows num
             if (_has_global_limit &&
                 (current_output_rows + _output_total_rows) >= 
_partition_inner_limit) {
                 get_enough_data = true;
diff --git a/be/src/vec/common/sort/partition_sorter.h 
b/be/src/vec/common/sort/partition_sorter.h
index ff17ac2115..ca0cd5c493 100644
--- a/be/src/vec/common/sort/partition_sorter.h
+++ b/be/src/vec/common/sort/partition_sorter.h
@@ -99,6 +99,7 @@ private:
     std::unique_ptr<MergeSorterState> _state;
     const RowDescriptor& _row_desc;
     int64 _output_total_rows = 0;
+    int64 _output_distinct_rows = 0;
     bool _has_global_limit = false;
     int _partition_inner_limit = 0;
     TopNAlgorithm::type _top_n_algorithm = TopNAlgorithm::type::ROW_NUMBER;
diff --git 
a/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out
 
b/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out
index cb3e3e4d64..d8542dca56 100644
--- 
a/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out
+++ 
b/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out
@@ -1019,3 +1019,71 @@
 14     1987.5
 15     1989.0
 
+-- !sql_row_number_1 --
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1992    true
+1      32767   false
+
+-- !sql_rank_1 --
+1      -32767  false
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1991    false
+1      1992    true
+1      32767   false
+
+-- !sql_dense_rank_1 --
+1      -32767  false
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1991    false
+1      1992    true
+1      32767   false
+
+-- !sql_row_number --
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1992    true
+1      32767   false
+
+-- !sql_rank --
+1      -32767  false
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1991    false
+1      1992    true
+1      32767   false
+
+-- !sql_dense_rank --
+1      -32767  false
+1      -32767  false
+1      255     false
+1      1985    true
+1      1986    false
+1      1989    false
+1      1991    false
+1      1991    false
+1      1992    true
+1      32767   false
+
diff --git 
a/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy
 
b/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy
index 7cbad40a61..7ec02d90ae 100644
--- 
a/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy
@@ -147,6 +147,30 @@ suite("test_select_stddev_variance_window") {
     qt_select_default  "select k1, percentile_approx(k2,0.5,4096) over 
(partition by k6 order by k1 rows between current row and current row) from  
${tableName} order by k1;"
     qt_select_default  "select k1, percentile_approx(k2,0.5,4096) over 
(partition by k6 order by k1 rows between current row and unbounded following) 
from ${tableName} order by k1;"
     qt_select_default  "select k1, percentile_approx(k2,0.5,4096) over 
(partition by k6 order by k1) from ${tableName} order by k1;"
+
+    sql "set experimental_enable_nereids_planner = false;"
+
+    qt_sql_row_number_1 """
+        select * from (select row_number() over(partition by k2 order by k6) 
as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
+    qt_sql_rank_1 """
+        select * from (select rank() over(partition by k2 order by k6) as 
rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
+    qt_sql_dense_rank_1 """
+        select * from (select dense_rank() over(partition by k2 order by k6) 
as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
+
+    sql "set experimental_enable_nereids_planner = true;"
+
+    qt_sql_row_number """
+        select * from (select row_number() over(partition by k2 order by k6) 
as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
+    qt_sql_rank """
+        select * from (select rank() over(partition by k2 order by k6) as 
rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
+    qt_sql_dense_rank """
+        select * from (select dense_rank() over(partition by k2 order by k6) 
as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
+    """
 }
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [bug](node)fix dense_rank function in partition sort node return wrong rows (#24727)

Reply via email to