This is an automated email from the ASF dual-hosted git repository.
wangwn pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new cf9a6ea57a2 Fix Segment NDV number underflow issue (#1387)
cf9a6ea57a2 is described below
commit cf9a6ea57a2a7a69cbde99d0b881475b6945bd19
Author: Weinan WANG <[email protected]>
AuthorDate: Mon Oct 20 18:49:20 2025 +0800
Fix Segment NDV number underflow issue (#1387)
For the ratio of NDV, the function `abs` accepts an int value as a
parameter, so the double value `-0.x` is cast to `int(0)`, which leads
to an inefficient plan being picked.
Fix: `abs` -> `fabs`
---
.../regress/expected/bfv_aggregate_optimizer.out | 2 +-
.../src/test/regress/expected/gp_dqa_optimizer.out | 60 +++++++++-----------
.../regress/expected/groupingsets_optimizer.out | 14 ++---
.../src/test/regress/expected/limit_optimizer.out | 31 +++++-----
src/backend/commands/analyze.c | 5 +-
.../regress/expected/bfv_aggregate_optimizer.out | 2 +-
src/test/regress/expected/gp_dqa_optimizer.out | 66 ++++++++++------------
.../regress/expected/groupingsets_optimizer.out | 14 ++---
src/test/regress/expected/limit_optimizer.out | 33 ++++++-----
9 files changed, 105 insertions(+), 122 deletions(-)
diff --git
a/contrib/pax_storage/src/test/regress/expected/bfv_aggregate_optimizer.out
b/contrib/pax_storage/src/test/regress/expected/bfv_aggregate_optimizer.out
index f8508db62b0..72f0a78988c 100644
--- a/contrib/pax_storage/src/test/regress/expected/bfv_aggregate_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/bfv_aggregate_optimizer.out
@@ -188,7 +188,7 @@ set optimizer_force_multistage_agg = on;
select count_operator('select count(*) from multi_stage_test group by
b;','GroupAggregate');
count_operator
----------------
- 1
+ 2
(1 row)
set optimizer_force_multistage_agg = off;
diff --git a/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out
b/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out
index 250ad329a5e..db4d467570b 100644
--- a/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out
@@ -65,20 +65,18 @@ select count(distinct d) from dqa_t1 group by i;
(12 rows)
explain (costs off) select count(distinct d) from dqa_t1 group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+(9 rows)
select count(distinct d), sum(distinct d) from dqa_t1 group by i;
count | sum
@@ -98,20 +96,18 @@ select count(distinct d), sum(distinct d) from dqa_t1 group
by i;
(12 rows)
explain (costs off) select count(distinct d), sum(distinct d) from dqa_t1
group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+(9 rows)
select count(distinct d), count(distinct dt) from dqa_t1;
count | count
@@ -1909,20 +1905,18 @@ select count(distinct d) from dqa_t1 group by i;
(12 rows)
explain (costs off) select count(distinct d) from dqa_t1 group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+(9 rows)
select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1;
count | count | count
diff --git
a/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out
b/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out
index b2ac9ce9ddc..b3da68b1f9d 100644
--- a/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out
@@ -1891,12 +1891,11 @@ explain (costs off)
-> Streaming Partial HashAggregate
Group Key: share0_ref5.hundred
-> Shared Scan (share slice:id 5:0)
- -> Finalize HashAggregate
+ -> HashAggregate
Group Key: share0_ref6.thousand
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: share0_ref6.thousand
- -> Streaming Partial HashAggregate
- Group Key: share0_ref6.thousand
+ -> Result
-> Shared Scan (share slice:id 6:0)
-> HashAggregate
Group Key: share0_ref7.twothousand
@@ -1908,7 +1907,7 @@ explain (costs off)
Group Key: share0_ref8.unique1
-> Shared Scan (share slice:id 1:0)
Optimizer: Pivotal Optimizer (GPORCA)
-(50 rows)
+(49 rows)
explain (costs off)
select unique1,
@@ -1999,12 +1998,11 @@ explain (costs off)
-> Streaming Partial HashAggregate
Group Key: share0_ref5.hundred
-> Shared Scan (share slice:id 5:0)
- -> Finalize HashAggregate
+ -> HashAggregate
Group Key: share0_ref6.thousand
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: share0_ref6.thousand
- -> Streaming Partial HashAggregate
- Group Key: share0_ref6.thousand
+ -> Result
-> Shared Scan (share slice:id 6:0)
-> HashAggregate
Group Key: share0_ref7.twothousand
@@ -2016,7 +2014,7 @@ explain (costs off)
Group Key: share0_ref8.unique1
-> Shared Scan (share slice:id 1:0)
Optimizer: Pivotal Optimizer (GPORCA)
-(50 rows)
+(49 rows)
reset hash_mem_multiplier;
-- check collation-sensitive matching between grouping expressions
diff --git a/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out
b/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out
index b8562703a6c..dea74df2b5d 100644
--- a/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out
+++ b/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out
@@ -358,29 +358,28 @@ order by s2 desc;
explain (verbose, costs off)
select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2
from tenk1 group by thousand order by thousand limit 3;
- QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------
+ QUERY PLAN
+-----------------------------------------------------------------------------------------------------------------------------------------
Result
- Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() *
'0'::double precision)))
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random()
* '0'::double precision)))
-> Limit
- Output: (sum(tenthous)), (((sum(tenthous))::double precision +
(random() * '0'::double precision))), thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision +
(random() * '0'::double precision))), thousand
-> Gather Motion 3:1 (slice1; segments: 3)
- Output: (sum(tenthous)), (((sum(tenthous))::double precision +
(random() * '0'::double precision))), thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision
+ (random() * '0'::double precision))), thousand
Merge Key: thousand
-> Limit
- Output: (sum(tenthous)), (((sum(tenthous))::double
precision + (random() * '0'::double precision))), thousand
- -> Finalize GroupAggregate
- Output: sum(tenthous), ((sum(tenthous))::double
precision + (random() * '0'::double precision)), thousand
- Group Key: tenk1.thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double
precision + (random() * '0'::double precision))), thousand
+ -> Result
+ Output: (sum(tenthous)), (((sum(tenthous)))::double
precision + (random() * '0'::double precision)), thousand
-> Sort
- Output: thousand, (PARTIAL sum(tenthous)),
(PARTIAL sum(tenthous))
+ Output: (sum(tenthous)), (sum(tenthous)),
thousand
Sort Key: tenk1.thousand
- -> Redistribute Motion 3:3 (slice2;
segments: 3)
- Output: thousand, (PARTIAL
sum(tenthous)), (PARTIAL sum(tenthous))
- Hash Key: thousand
- -> Streaming Partial HashAggregate
- Output: thousand, PARTIAL
sum(tenthous), PARTIAL sum(tenthous)
- Group Key: tenk1.thousand
+ -> HashAggregate
+ Output: sum(tenthous), sum(tenthous),
thousand
+ Group Key: tenk1.thousand
+ -> Redistribute Motion 3:3 (slice2;
segments: 3)
+ Output: thousand, tenthous
+ Hash Key: thousand
-> Seq Scan on public.tenk1
Output: thousand, tenthous
Optimizer: Pivotal Optimizer (GPORCA)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index ce642dc84e7..6f5c4c5c42c 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2871,12 +2871,13 @@ process_sample_rows(Portal portal,
arrayVal = DatumGetArrayTypeP(funcRetValues[3]);
deconstruct_array(arrayVal, FLOAT8OID, 8, true,
'd',
&colndv,
&nulls, &numelems);
- for (i = 0; i < relDesc->natts; i++)
+ Assert(numelems == relDesc->natts);
+ for (i = 0; i < numelems; i++)
{
double this_colndv =
DatumGetFloat8(colndv[i]);
if (this_colndv < 0) {
Assert(this_colndv >= -1);
- colNDVBySeg[i] +=
abs(this_colndv) * this_totalrows;
+ colNDVBySeg[i] +=
fabs(this_colndv) * this_totalrows;
} else {
/* if current segment have any
data, then ndv won't be 0.
* if current segment have no
rows, ndv is 0.
diff --git a/src/test/regress/expected/bfv_aggregate_optimizer.out
b/src/test/regress/expected/bfv_aggregate_optimizer.out
index 51fd134f587..47ae7048795 100644
--- a/src/test/regress/expected/bfv_aggregate_optimizer.out
+++ b/src/test/regress/expected/bfv_aggregate_optimizer.out
@@ -188,7 +188,7 @@ set optimizer_force_multistage_agg = on;
select count_operator('select count(*) from multi_stage_test group by
b;','GroupAggregate');
count_operator
----------------
- 1
+ 2
(1 row)
set optimizer_force_multistage_agg = off;
diff --git a/src/test/regress/expected/gp_dqa_optimizer.out
b/src/test/regress/expected/gp_dqa_optimizer.out
index acbd713866d..278b719e933 100644
--- a/src/test/regress/expected/gp_dqa_optimizer.out
+++ b/src/test/regress/expected/gp_dqa_optimizer.out
@@ -66,20 +66,18 @@ select count(distinct d) from dqa_t1 group by i;
(12 rows)
explain (costs off) select count(distinct d) from dqa_t1 group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
- Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
+ Optimizer: GPORCA
+(9 rows)
select count(distinct d), sum(distinct d) from dqa_t1 group by i;
count | sum
@@ -99,20 +97,18 @@ select count(distinct d), sum(distinct d) from dqa_t1 group
by i;
(12 rows)
explain (costs off) select count(distinct d), sum(distinct d) from dqa_t1
group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
- Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
+ Optimizer: GPORCA
+(9 rows)
select count(distinct d), count(distinct dt) from dqa_t1;
INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner
@@ -2022,20 +2018,18 @@ select count(distinct d) from dqa_t1 group by i;
(12 rows)
explain (costs off) select count(distinct d) from dqa_t1 group by i;
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
- -> Finalize HashAggregate
+ -> GroupAggregate
Group Key: i
- -> Redistribute Motion 3:3 (slice2; segments: 3)
- Hash Key: i
- -> Partial GroupAggregate
- Group Key: i
- -> Sort
- Sort Key: i, d
- -> Seq Scan on dqa_t1
- Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0
-(11 rows)
+ -> Sort
+ Sort Key: i
+ -> Redistribute Motion 3:3 (slice2; segments: 3)
+ Hash Key: i
+ -> Seq Scan on dqa_t1
+ Optimizer: GPORCA
+(9 rows)
select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1;
INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner
diff --git a/src/test/regress/expected/groupingsets_optimizer.out
b/src/test/regress/expected/groupingsets_optimizer.out
index 39fad5a1c8a..08ef4c1a68c 100644
--- a/src/test/regress/expected/groupingsets_optimizer.out
+++ b/src/test/regress/expected/groupingsets_optimizer.out
@@ -1968,12 +1968,11 @@ explain (costs off)
-> Streaming Partial HashAggregate
Group Key: share0_ref5.hundred
-> Shared Scan (share slice:id 5:0)
- -> Finalize HashAggregate
+ -> HashAggregate
Group Key: share0_ref6.thousand
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: share0_ref6.thousand
- -> Streaming Partial HashAggregate
- Group Key: share0_ref6.thousand
+ -> Result
-> Shared Scan (share slice:id 6:0)
-> HashAggregate
Group Key: share0_ref7.twothousand
@@ -1985,7 +1984,7 @@ explain (costs off)
Group Key: share0_ref8.unique1
-> Shared Scan (share slice:id 1:0)
Optimizer: Pivotal Optimizer (GPORCA)
-(50 rows)
+(49 rows)
explain (costs off)
select unique1,
@@ -2076,12 +2075,11 @@ explain (costs off)
-> Streaming Partial HashAggregate
Group Key: share0_ref5.hundred
-> Shared Scan (share slice:id 5:0)
- -> Finalize HashAggregate
+ -> HashAggregate
Group Key: share0_ref6.thousand
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: share0_ref6.thousand
- -> Streaming Partial HashAggregate
- Group Key: share0_ref6.thousand
+ -> Result
-> Shared Scan (share slice:id 6:0)
-> HashAggregate
Group Key: share0_ref7.twothousand
@@ -2093,7 +2091,7 @@ explain (costs off)
Group Key: share0_ref8.unique1
-> Shared Scan (share slice:id 1:0)
Optimizer: Pivotal Optimizer (GPORCA)
-(50 rows)
+(49 rows)
reset hash_mem_multiplier;
-- check collation-sensitive matching between grouping expressions
diff --git a/src/test/regress/expected/limit_optimizer.out
b/src/test/regress/expected/limit_optimizer.out
index 6508365fc81..e8a00fe3a40 100644
--- a/src/test/regress/expected/limit_optimizer.out
+++ b/src/test/regress/expected/limit_optimizer.out
@@ -358,32 +358,31 @@ order by s2 desc;
explain (verbose, costs off)
select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2
from tenk1 group by thousand order by thousand limit 3;
- QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------
+ QUERY PLAN
+-----------------------------------------------------------------------------------------------------------------------------------------
Result
- Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() *
'0'::double precision)))
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random()
* '0'::double precision)))
-> Limit
- Output: (sum(tenthous)), (((sum(tenthous))::double precision +
(random() * '0'::double precision))), thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision +
(random() * '0'::double precision))), thousand
-> Gather Motion 3:1 (slice1; segments: 3)
- Output: (sum(tenthous)), (((sum(tenthous))::double precision +
(random() * '0'::double precision))), thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double precision
+ (random() * '0'::double precision))), thousand
Merge Key: thousand
-> Limit
- Output: (sum(tenthous)), (((sum(tenthous))::double
precision + (random() * '0'::double precision))), thousand
- -> Finalize GroupAggregate
- Output: sum(tenthous), ((sum(tenthous))::double
precision + (random() * '0'::double precision)), thousand
- Group Key: tenk1.thousand
+ Output: (sum(tenthous)), ((((sum(tenthous)))::double
precision + (random() * '0'::double precision))), thousand
+ -> Result
+ Output: (sum(tenthous)), (((sum(tenthous)))::double
precision + (random() * '0'::double precision)), thousand
-> Sort
- Output: thousand, (PARTIAL sum(tenthous)),
(PARTIAL sum(tenthous))
+ Output: (sum(tenthous)), (sum(tenthous)),
thousand
Sort Key: tenk1.thousand
- -> Redistribute Motion 3:3 (slice2;
segments: 3)
- Output: thousand, (PARTIAL
sum(tenthous)), (PARTIAL sum(tenthous))
- Hash Key: thousand
- -> Streaming Partial HashAggregate
- Output: thousand, PARTIAL
sum(tenthous), PARTIAL sum(tenthous)
- Group Key: tenk1.thousand
+ -> HashAggregate
+ Output: sum(tenthous), sum(tenthous),
thousand
+ Group Key: tenk1.thousand
+ -> Redistribute Motion 3:3 (slice2;
segments: 3)
+ Output: thousand, tenthous
+ Hash Key: thousand
-> Seq Scan on public.tenk1
Output: thousand, tenthous
- Optimizer: Pivotal Optimizer (GPORCA)
+ Optimizer: GPORCA
(24 rows)
select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]