(cloudberry) 01/12: Fix gp_dqa test to explicitly analyze tables (#14643)

avamingli Mon, 30 Dec 2024 19:10:52 -0800

This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit 49ae63ad03324c6f055ab24410e02098fbc543e0
Author: Chris Hajas <[email protected]>
AuthorDate: Tue Dec 13 16:25:53 2022 -0800

    Fix gp_dqa test to explicitly analyze tables (#14643)
    
    Commit 0f4af19b71f2f31acd28c39d0d0a865cbf1d3bab added this test, but the
    `analyze` statement exposed an unrelated issue and caused CI to fail. We
    should explicitly analyze tables to reduce runtime in tests anyway.
---
 src/test/regress/expected/gp_dqa.out           | 64 +++++++++++++++++++++-
 src/test/regress/expected/gp_dqa_optimizer.out | 74 ++++++++++++++++++++++++++
 src/test/regress/sql/gp_dqa.sql                | 18 +++++++
 3 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/src/test/regress/expected/gp_dqa.out 
b/src/test/regress/expected/gp_dqa.out
index 9090a1547a..30505978ab 100644
--- a/src/test/regress/expected/gp_dqa.out
+++ b/src/test/regress/expected/gp_dqa.out
@@ -2367,7 +2367,7 @@ select count(distinct a) from t_issue_659;
    ->  Gather Motion 3:1  (slice1; segments: 3)
          ->  Partial Aggregate
                ->  Seq Scan on t_issue_659
- Optimizer: Postgres query optimizer
+ Optimizer: Pivotal Optimizer (GPORCA)
 (5 rows)
 
 select count(distinct a) from t_issue_659;
@@ -2402,6 +2402,68 @@ select count(distinct a) from t_issue_659;
 
 reset gp_eager_distinct_dedup;
 reset optimizer_force_three_stage_scalar_dqa;
+-- fix dqa bug when optimizer_force_multistage_agg is on
+set optimizer_force_multistage_agg = on;
+create table multiagg1(a int, b bigint, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table multiagg2(a int, b bigint, c numeric(8, 4));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into multiagg1 values(generate_series(1, 10), generate_series(1, 10), 
generate_series(1, 10));
+insert into multiagg2 values(generate_series(1, 10), generate_series(1, 10), 
555.55);
+analyze multiagg1;
+analyze multiagg2;
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg1;
+                                                QUERY PLAN                     
                            
+-----------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(DISTINCT b), sum(c)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(DISTINCT b)), (PARTIAL sum(c))
+         ->  Partial Aggregate
+               Output: PARTIAL count(DISTINCT b), PARTIAL sum(c)
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: b, c
+                     Hash Key: b
+                     ->  Seq Scan on public.multiagg1
+                           Output: b, c
+ Settings: enable_groupagg = 'off', enable_hashagg = 'on', 
gp_motion_cost_per_row = '2', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select count(distinct b), sum(c) from multiagg1;
+ count | sum 
+-------+-----
+    10 |  55
+(1 row)
+
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg2;
+                                                QUERY PLAN                     
                            
+-----------------------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(DISTINCT b), sum(c)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(DISTINCT b)), (PARTIAL sum(c))
+         ->  Partial Aggregate
+               Output: PARTIAL count(DISTINCT b), PARTIAL sum(c)
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: b, c
+                     Hash Key: b
+                     ->  Seq Scan on public.multiagg2
+                           Output: b, c
+ Settings: enable_groupagg = 'off', enable_hashagg = 'on', 
gp_motion_cost_per_row = '2', optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select count(distinct b), sum(c) from multiagg2;
+ count |    sum    
+-------+-----------
+    10 | 5555.5000
+(1 row)
+
+drop table multiagg1;
+drop table multiagg2;
 reset optimizer_force_multistage_agg;
 reset optimizer_enable_use_distribution_in_dqa;
 drop table t_issue_659;
diff --git a/src/test/regress/expected/gp_dqa_optimizer.out 
b/src/test/regress/expected/gp_dqa_optimizer.out
index 3f1c7f540e..a99bf5a3ee 100644
--- a/src/test/regress/expected/gp_dqa_optimizer.out
+++ b/src/test/regress/expected/gp_dqa_optimizer.out
@@ -2550,6 +2550,80 @@ select count(distinct a) from t_issue_659;
 
 reset gp_eager_distinct_dedup;
 reset optimizer_force_three_stage_scalar_dqa;
+-- fix dqa bug when optimizer_force_multistage_agg is on
+set optimizer_force_multistage_agg = on;
+create table multiagg1(a int, b bigint, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table multiagg2(a int, b bigint, c numeric(8, 4));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into multiagg1 values(generate_series(1, 10), generate_series(1, 10), 
generate_series(1, 10));
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  Feature not supported: Unexpected target list entries in ProjectSet 
node
+insert into multiagg2 values(generate_series(1, 10), generate_series(1, 10), 
555.55);
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  Feature not supported: Unexpected target list entries in ProjectSet 
node
+analyze multiagg1;
+analyze multiagg2;
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg1;
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  GPDB Expression type: GPDB_96_MERGE_FIXME: Intermediate aggregate 
stage not implemented not supported in DXL
+                                       QUERY PLAN                              
         
+----------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(DISTINCT b), sum(c)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(DISTINCT b)), (PARTIAL sum(c))
+         ->  Partial Aggregate
+               Output: PARTIAL count(DISTINCT b), PARTIAL sum(c)
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: b, c
+                     Hash Key: b
+                     ->  Seq Scan on public.multiagg1
+                           Output: b, c
+ Settings: enable_groupagg = 'off', enable_hashagg = 'on', 
gp_motion_cost_per_row = '2'
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select count(distinct b), sum(c) from multiagg1;
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  GPDB Expression type: GPDB_96_MERGE_FIXME: Intermediate aggregate 
stage not implemented not supported in DXL
+ count | sum 
+-------+-----
+    10 |  55
+(1 row)
+
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg2;
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  GPDB Expression type: GPDB_96_MERGE_FIXME: Intermediate aggregate 
stage not implemented not supported in DXL
+                                       QUERY PLAN                              
         
+----------------------------------------------------------------------------------------
+ Finalize Aggregate
+   Output: count(DISTINCT b), sum(c)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL count(DISTINCT b)), (PARTIAL sum(c))
+         ->  Partial Aggregate
+               Output: PARTIAL count(DISTINCT b), PARTIAL sum(c)
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: b, c
+                     Hash Key: b
+                     ->  Seq Scan on public.multiagg2
+                           Output: b, c
+ Settings: enable_groupagg = 'off', enable_hashagg = 'on', 
gp_motion_cost_per_row = '2'
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+select count(distinct b), sum(c) from multiagg2;
+INFO:  GPORCA failed to produce a plan, falling back to planner
+DETAIL:  GPDB Expression type: GPDB_96_MERGE_FIXME: Intermediate aggregate 
stage not implemented not supported in DXL
+ count |    sum    
+-------+-----------
+    10 | 5555.5000
+(1 row)
+
+drop table multiagg1;
+drop table multiagg2;
 reset optimizer_force_multistage_agg;
 reset optimizer_enable_use_distribution_in_dqa;
 drop table t_issue_659;
diff --git a/src/test/regress/sql/gp_dqa.sql b/src/test/regress/sql/gp_dqa.sql
index 65f52cd743..9bbdf48c19 100644
--- a/src/test/regress/sql/gp_dqa.sql
+++ b/src/test/regress/sql/gp_dqa.sql
@@ -428,6 +428,24 @@ select count(distinct a) from t_issue_659;
 select count(distinct a) from t_issue_659;
 reset gp_eager_distinct_dedup;
 reset optimizer_force_three_stage_scalar_dqa;
+
+
+-- fix dqa bug when optimizer_force_multistage_agg is on
+set optimizer_force_multistage_agg = on;
+create table multiagg1(a int, b bigint, c int);
+create table multiagg2(a int, b bigint, c numeric(8, 4));
+insert into multiagg1 values(generate_series(1, 10), generate_series(1, 10), 
generate_series(1, 10));
+insert into multiagg2 values(generate_series(1, 10), generate_series(1, 10), 
555.55);
+analyze multiagg1;
+analyze multiagg2;
+
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg1;
+select count(distinct b), sum(c) from multiagg1;
+
+explain (verbose, costs off) select count(distinct b), sum(c) from multiagg2;
+select count(distinct b), sum(c) from multiagg2;
+drop table multiagg1;
+drop table multiagg2;
 reset optimizer_force_multistage_agg;
 reset optimizer_enable_use_distribution_in_dqa;
 drop table t_issue_659;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 01/12: Fix gp_dqa test to explicitly analyze tables (#14643)

Reply via email to