This is an automated email from the ASF dual-hosted git repository.

jiaqizho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new 2018fed2ab2 Add explain analyze detailed info of hash agg (#15917)
2018fed2ab2 is described below

commit 2018fed2ab24c67af69907e950a2990cfe4d3f90
Author: Yao Wang <[email protected]>
AuthorDate: Thu Aug 24 15:36:41 2023 +0800

    Add explain analyze detailed info of hash agg (#15917)
    
    After upgrading to GPDB7, we lost the detailed cdb executor instruments of
    hashagg since the code base has been changed in a large size. The patch is
    to re-implement explain analyze related code of hashagg to provide more
    critical info for troubleshooting issues.
---
 src/backend/commands/explain.c                     |   2 +-
 src/backend/executor/nodeAgg.c                     | 145 +++++++++++++++++++++
 src/include/executor/nodeAgg.h                     |  23 ++++
 src/include/lib/simplehash.h                       |  81 ++++++++++++
 src/test/regress/expected/explain_format.out       |  54 +++++++-
 .../regress/expected/explain_format_optimizer.out  |  52 +++++++-
 src/test/regress/sql/explain_format.sql            |  35 ++++-
 7 files changed, 388 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index bfd5f98219c..c89d088c4a2 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -4266,7 +4266,7 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
        }
 
        /* Display stats for each parallel worker */
-       if (es->analyze && aggstate->shared_info != NULL)
+       if (aggstate->shared_info != NULL)
        {
                for (int n = 0; n < aggstate->shared_info->num_workers; n++)
                {
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 881d09e5375..1c49e426d2d 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -481,6 +481,8 @@ static void build_pertrans_for_aggref(AggStatePerTrans 
pertrans,
 
 static void ExecEagerFreeAgg(AggState *node);
 
+void agg_hash_explain_extra_message(AggState *aggstate);
+
 /*
  * Select the current grouping set; affects current_set and
  * curaggcontext.
@@ -1540,6 +1542,16 @@ build_hash_tables(AggState *aggstate)
                                                                                
   memory);
 
                build_hash_table(aggstate, setno, nbuckets);
+
+               /* initialize some statistic info of hash table */
+               perhash->num_output_groups = 0;
+               perhash->num_spill_parts = 0;
+               perhash->num_expansions = 0;
+               perhash->bucket_total = 0;
+               perhash->bucket_used = 0;
+               perhash->chain_count = 0;
+               perhash->chain_length_total = 0;
+               perhash->chain_length_max = 0;
        }
 
        aggstate->hash_ngroups_current = 0;
@@ -1966,6 +1978,7 @@ hash_agg_enter_spill_mode(AggState *aggstate)
                        hashagg_spill_init(aggstate, spill, 
aggstate->hash_tapeinfo, 0,
                                                           
perhash->aggnode->numGroups,
                                                           
aggstate->hashentrysize);
+                       perhash->num_spill_parts += spill->npartitions;
                }
 
                if (aggstate->ss.ps.instrument)
@@ -2019,6 +2032,12 @@ hash_agg_update_metrics(AggState *aggstate, bool 
from_tape, int npartitions)
        }
 
        /* update hashentrysize estimate based on contents */
+       /*
+        * Greenplum doesn't use hashentrysize in the instrumentation; instead
+        * it calculates the hash table chain length to get an accurate number.
+        *
+        * See the following code to collect hash table statistic info.
+        */
        if (aggstate->hash_ngroups_current > 0)
        {
                aggstate->hashentrysize =
@@ -2031,6 +2050,47 @@ hash_agg_update_metrics(AggState *aggstate, bool 
from_tape, int npartitions)
                Instrumentation    *instrument = aggstate->ss.ps.instrument;
 
                instrument->workmemused = aggstate->hash_mem_peak;
+
+
+               /*
+                * workmemwanted is how much memory would be needed to avoid
+                * scratch I/O, i.e. to load everything into the hash table at
+                * once:
+                *
+                * 1. add meta_mem only when from_tape is false, because when
+                *    we are reading from a tape/spilled file, we can reuse the
+                *    existing hash table's meta.
+                * 2. add hash_mem every time.
+                * 3. don't add buffer_mem since it's unnecessary when we can
+                *    load everything into memory at once.
+                */
+               if (!from_tape)
+                       instrument->workmemwanted += meta_mem;
+               instrument->workmemwanted += hashkey_mem;
+
+               /* Scan all perhashs and collect hash table statistic info */
+               for (int setno = 0; setno < aggstate->num_hashes; setno++)
+               {
+                       AggStatePerHash perhash = &aggstate->perhash[setno];
+                       tuplehash_hash *hashtab = perhash->hashtable->hashtab;
+
+                       Assert(hashtab);
+
+                       perhash->num_expansions += hashtab->num_expansions;
+                       perhash->bucket_total += hashtab->size;
+                       perhash->bucket_used += hashtab->members;
+                       if (hashtab->members > 0)
+                       {
+                               uint32  perht_chain_length_total = 0;
+                               uint32  perht_chain_count = 0;
+
+                               /* collect statistic info of chain length per 
hash table */
+                               tuplehash_coll_stat(hashtab,
+                                                                       
&(perhash->chain_length_max),
+                                                                       
&perht_chain_length_total,
+                                                                       
&perht_chain_count);
+                               perhash->chain_count += perht_chain_count;
+                               perhash->chain_length_total += 
perht_chain_length_total;
+                       }
+               }
        }
 }
 
@@ -2225,6 +2285,7 @@ lookup_hash_entries(AggState *aggstate)
                                hashagg_spill_init(aggstate, spill, 
aggstate->hash_tapeinfo, 0,
                                                                   
perhash->aggnode->numGroups,
                                                                   
aggstate->hashentrysize);
+                       perhash->num_spill_parts += spill->npartitions;
 
                        hashagg_spill_tuple(aggstate, spill, slot, hash);
                        pergroup[setno] = NULL;
@@ -2277,6 +2338,13 @@ ExecAgg(PlanState *pstate)
                        return result;
        }
 
+       /* Save statistics into the cdbexplainbuf for EXPLAIN ANALYZE */
+       if (node->ss.ps.instrument &&
+                       (node->ss.ps.instrument)->need_cdb &&
+                       (node->phase->aggstrategy == AGG_HASHED ||
+                       node->phase->aggstrategy == AGG_MIXED))
+               agg_hash_explain_extra_message(node);
+
        return NULL;
 }
 
@@ -2807,6 +2875,7 @@ agg_refill_hash_table(AggState *aggstate)
                                spill_initialized = true;
                                hashagg_spill_init(aggstate, &spill, tapeinfo, 
batch->used_bits,
                                                                   
batch->input_card, aggstate->hashentrysize);
+                               
aggstate->perhash[aggstate->current_set].num_spill_parts += spill.npartitions;
                        }
                        /* no memory for a new group, spill */
                        hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
@@ -2981,6 +3050,8 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
                        }
                }
 
+               perhash->num_output_groups++;
+
                /*
                 * Clear the per-output-tuple context for each group
                 *
@@ -5139,3 +5210,77 @@ ReuseHashTable(AggState *node)
                        !node->streaming &&
                        !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams));
 }
+
+/*
+ * Save statistics into the cdbexplainbuf for EXPLAIN ANALYZE
+ */
+void
+agg_hash_explain_extra_message(AggState *aggstate)
+{
+       /*
+        * Check cdbexplain_depositStatsToNode(), Greenplum only saves extra
+        * message text for the most interesting winning qExecs.
+        */
+       StringInfo hbuf = aggstate->ss.ps.cdbexplainbuf;
+       uint64  sum_num_expansions = 0;
+       uint64  sum_output_groups = 0;
+       uint64  sum_spill_parts = 0;
+       uint64  sum_chain_length_total = 0;
+       uint64  sum_chain_count = 0;
+       uint32  chain_length_max = 0;
+       uint64  sum_bucket_used = 0;
+       uint64  sum_bucket_total = 0;
+
+       Assert(hbuf);
+
+       appendStringInfo(hbuf, "hash table(s): %d", aggstate->num_hashes);
+
+       /* Scan all perhashs and collect statistic info */
+       for (int setno = 0; setno < aggstate->num_hashes; setno++)
+       {
+               AggStatePerHash perhash = &aggstate->perhash[setno];
+
+               /* spill statistic info */
+               if (aggstate->hash_ever_spilled)
+               {
+                       sum_output_groups += perhash->num_output_groups;
+                       sum_spill_parts += perhash->num_spill_parts;
+               }
+
+               /* inner hash table statistic info */
+               if (perhash->chain_count > 0)
+               {
+                       sum_chain_length_total += perhash->chain_length_total;
+                       sum_chain_count += perhash->chain_count;
+                       if (perhash->chain_length_max > chain_length_max)
+                               chain_length_max = perhash->chain_length_max;
+                       sum_bucket_used = perhash->bucket_used;
+                       sum_bucket_total = perhash->bucket_total;
+                       sum_num_expansions += perhash->num_expansions;
+               }
+       }
+
+       if (aggstate->hash_ever_spilled)
+       {
+               appendStringInfo(hbuf,
+                       "; " UINT64_FORMAT " groups total in %d batches, " 
UINT64_FORMAT
+                       " spill partitions; disk usage: " INT64_FORMAT "KB",
+                       sum_output_groups,
+                       aggstate->hash_batches_used,
+                       sum_spill_parts,
+                       aggstate->hash_disk_used);
+       }
+
+       if (sum_chain_count > 0)
+       {
+               appendStringInfo(hbuf,
+                               "; chain length %.1f avg, %d max;"
+                               " using " INT64_FORMAT " of " INT64_FORMAT " 
buckets;"
+                               " total " INT64_FORMAT " expansions.\n",
+                               (double)sum_chain_length_total / 
sum_chain_count,
+                               chain_length_max,
+                               sum_bucket_used,
+                               sum_bucket_total,
+                               sum_num_expansions);
+       }
+}
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index 60fd9f960f8..5e4e21f250a 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -311,6 +311,29 @@ typedef struct AggStatePerHashData
        AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
        AttrNumber *hashGrpColIdxHash;  /* indices in hash table tuples */
        Agg                *aggnode;            /* original Agg node, for 
numGroups etc. */
+
+       /*
+        * Some statistic info of hash table, used for EXPLAIN ANALYZE.
+        * Note that they are accumulated info and will not be reset even
+        * after the hash table is reset.
+        */
+
+       /* number of groups/entries output by the iterator */
+       uint64          num_output_groups;
+       /* number of spilled partitions */
+       uint64          num_spill_parts;
+       /* number of hash table expansions */
+       uint32          num_expansions;
+       /* total number of buckets */
+       uint64      bucket_total;
+       /* number of used buckets */
+       uint64      bucket_used;
+       /* number of all chains */
+       uint64      chain_count;
+       /* total length of all chains */
+       uint64      chain_length_total;
+       /* max chain length */
+       uint32      chain_length_max;
 }                      AggStatePerHashData;
 
 
diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index a4a25c57f21..8b4af795544 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -126,6 +126,7 @@
 #define SH_ALLOCATE SH_MAKE_NAME(allocate)
 #define SH_FREE SH_MAKE_NAME(free)
 #define SH_STAT SH_MAKE_NAME(stat)
+#define SH_COLL_STAT SH_MAKE_NAME(coll_stat)
 
 /* internal helper functions (no externally visible prototypes) */
 #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
@@ -169,6 +170,14 @@ typedef struct SH_TYPE
 
        /* user defined data, useful for callbacks */
        void       *private_data;
+
+       /*
+        * number of times hash table is expanded
+        *
+        * Since the max size of hash table is UINT32_MAX, uint32 is good
+        * enough for the number of expanded times.
+        */
+       uint32 num_expansions;
 }                      SH_TYPE;
 
 typedef enum SH_STATUS
@@ -244,6 +253,12 @@ SH_SCOPE   SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, 
SH_ITERATOR * iter);
 /* void <prefix>_stat(<prefix>_hash *tb */
 SH_SCOPE void SH_STAT(SH_TYPE * tb);
 
+SH_SCOPE void
+SH_COLL_STAT(SH_TYPE * tb,
+                        uint32 * max_chain_length,
+                        uint32 * total_chain_length,
+                        uint32 * chain_count);
+
 #endif                                                 /* SH_DECLARE */
 
 
@@ -450,6 +465,7 @@ SH_CREATE(MemoryContext ctx, uint32 nelements, void 
*private_data)
        SH_COMPUTE_PARAMETERS(tb, size);
 
        tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
+       tb->num_expansions = 0;
 
        return tb;
 }
@@ -468,6 +484,7 @@ SH_RESET(SH_TYPE * tb)
 {
        memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
        tb->members = 0;
+       tb->num_expansions = 0;
 }
 
 /*
@@ -580,6 +597,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize)
                }
        }
 
+       (tb->num_expansions)++;
        SH_FREE(tb, olddata);
 }
 
@@ -1120,6 +1138,69 @@ SH_STAT(SH_TYPE * tb)
                   total_collisions, max_collisions, avg_collisions);
 }
 
+/*
+ * Greenplum specific
+ *
+ * Collect some statistics about the state of the hashtable. Major code was
+ * copied from SH_STAT() with some modifications to keep consistent with GPDB6.
+ */
+SH_SCOPE void
+SH_COLL_STAT(SH_TYPE * tb,
+                        uint32 * max_chain_length,
+                        uint32 * total_chain_length,
+                        uint32 * chain_count)
+{
+       *total_chain_length = 0;
+       *chain_count = 0;
+       uint32 last_dist = 0;
+
+       for (int i = 0; i < tb->size; i++)
+       {
+               uint32          hash;
+               uint32          optimal;
+               uint32          dist;
+               SH_ELEMENT_TYPE *elem;
+
+               elem = &tb->data[i];
+
+               if (elem->status != SH_STATUS_IN_USE)
+                       continue;
+
+               hash = SH_ENTRY_HASH(tb, elem);
+               optimal = SH_INITIAL_BUCKET(tb, hash);
+               dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
+
+               /*
+                * Different from SH_STAT(), always calculate chain length from 
1 but
+                * not 0, e.g. when there is only one element in bucket, the 
length
+                * is 1.
+                */
+               dist++;
+
+               /*
+                * In same chain, dist must be always increasing. If dist < 
last_dist,
+                * we must hit a new chain; take the length of old chain into 
account.
+                */
+               if (dist < last_dist)
+               {
+                       if (last_dist > *max_chain_length)
+                               *max_chain_length = last_dist;
+                       *total_chain_length += last_dist;
+                       (*chain_count)++;
+               }
+               last_dist = dist;
+       }
+
+       /* Count the last chain. */
+       if (last_dist != 0)
+       {
+               if (last_dist > *max_chain_length)
+                       *max_chain_length = last_dist;
+               *total_chain_length += last_dist;
+               (*chain_count)++;
+       }
+}
+
 #endif                                                 /* SH_DEFINE */
 
 
diff --git a/src/test/regress/expected/explain_format.out 
b/src/test/regress/expected/explain_format.out
index 1675c8a2153..c8c452cc781 100644
--- a/src/test/regress/expected/explain_format.out
+++ b/src/test/regress/expected/explain_format.out
@@ -17,6 +17,8 @@
 -- s/Memory used:  \d+\w?B/Memory used: ###B/
 -- m/Memory Usage: \d+\w?B/
 -- s/Memory Usage: \d+\w?B/Memory Usage: ###B/
+-- m/Memory wanted:  \d+\w?kB/
+-- s/Memory wanted:  \d+\w?kB/Memory wanted: ###kB/
 -- m/Peak Memory Usage: \d+/
 -- s/Peak Memory Usage: \d+/Peak Memory Usage: ###/
 -- m/Buckets: \d+/
@@ -583,9 +585,59 @@ QUERY PLAN
   }
 ]
 (1 row)
--- explain_processing_on
+-- start_matchsubs
+-- m/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./
+-- s/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./Extra Text: (seg0)   hash 
table(s): ###; ### groups total in ### batches, ### spill partitions; disk 
usage: ###KB; chain length ###.## avg, ### max; using ## of ### buckets; total 
### expansions./
+-- m/Work_mem: \d+K bytes max, \d+K bytes wanted/
+-- s/Work_mem: \d+K bytes max, \d+K bytes wanted/Work_mem: ###K bytes max, 
###K bytes wanted/
+-- end_matchsubs
+-- Greenplum hash table extra message
+CREATE TABLE test_src_tbl AS
+SELECT i % 10000 AS a, i % 10000 + 1 AS b FROM generate_series(1, 50000) i 
DISTRIBUTED BY (a);
+ANALYZE test_src_tbl;
+-- Enable optimizer_enable_hashagg, and set statement_mem to a small value to 
force spilling
+set optimizer_enable_hashagg = on;
+SET statement_mem = '1000kB';
+-- Hashagg with spilling
+CREATE TABLE test_hashagg_spill AS
+SELECT a, COUNT(DISTINCT b) AS b FROM test_src_tbl GROUP BY a;
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+EXPLAIN (analyze, costs off) SELECT a, COUNT(DISTINCT b) AS b FROM 
test_src_tbl GROUP BY a;
+QUERY PLAN
+Gather Motion 3:1  (slice1; segments: 3) (actual time=8.948..21.656 rows=10000 
loops=1)
+  ->  HashAggregate (actual time=13.950..15.183 rows=3386 loops=1)
+        Group Key: a
+        Extra Text: (seg0)   hash table(s): 1; 3368 groups total in 4 batches, 
2576 spill partitions; disk usage: 1024KB; chain length 2.9 avg, 9 max; using 
3368 of 20480 buckets; total 0 expansions.
+
+        ->  HashAggregate (actual time=12.113..12.658 rows=3386 loops=1)
+              Group Key: a, b
+              Extra Text: (seg0)   hash table(s): 1; chain length 2.3 avg, 5 
max; using 3368 of 8192 buckets; total 1 expansions.
+
+              ->  Seq Scan on test_src_tbl (actual time=0.015..1.748 
rows=16930 loops=1)
+Planning Time: 0.167 ms
+  (slice0)    Executor memory: 236K bytes.
+* (slice1)    Executor memory: 657K bytes avg x 3 workers, 722K bytes max 
(seg0).  Work_mem: 753K bytes max, 721K bytes wanted.
+Memory used:  1000kB
+Memory wanted:  1640kB
+Optimizer: Postgres query optimizer
+Execution Time: 22.346 ms
+(17 rows)
+-- Hashagg with grouping sets
+CREATE TABLE test_hashagg_groupingsets AS
+SELECT a, avg(b) AS b FROM test_src_tbl GROUP BY grouping sets ((a), (b));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 
'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+-- The planner generates multiple hash tables but ORCA uses Shared Scan.
+-- flaky test
+-- EXPLAIN (analyze, costs off) SELECT a, avg(b) AS b FROM test_src_tbl GROUP 
BY grouping sets ((a), (b));
+RESET optimizer_enable_hashagg;
+RESET statement_mem;
 -- Cleanup
 DROP TABLE boxes;
 DROP TABLE apples;
 DROP TABLE box_locations;
 DROP TABLE jsonexplaintest;
+DROP TABLE test_src_tbl;
+DROP TABLE test_hashagg_spill;
+DROP TABLE test_hashagg_groupingsets;
diff --git a/src/test/regress/expected/explain_format_optimizer.out 
b/src/test/regress/expected/explain_format_optimizer.out
index 74a0d03e09d..459a466c993 100644
--- a/src/test/regress/expected/explain_format_optimizer.out
+++ b/src/test/regress/expected/explain_format_optimizer.out
@@ -17,6 +17,8 @@
 -- s/Memory used:  \d+\w?B/Memory used: ###B/
 -- m/Memory Usage: \d+\w?B/
 -- s/Memory Usage: \d+\w?B/Memory Usage: ###B/
+-- m/Memory wanted:  \d+\w?kB/
+-- s/Memory wanted:  \d+\w?kB/Memory wanted: ###kB/
 -- m/Peak Memory Usage: \d+/
 -- s/Peak Memory Usage: \d+/Peak Memory Usage: ###/
 -- m/Buckets: \d+/
@@ -530,9 +532,57 @@ QUERY PLAN
   }
 ]
 (1 row)
--- explain_processing_on
+-- start_matchsubs
+-- m/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./
+-- s/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./Extra Text: (seg0)   hash 
table(s): ###; ### groups total in ### batches, ### spill partitions; disk 
usage: ###KB; chain length ###.## avg, ### max; using ## of ### buckets; total 
### expansions./
+-- m/Work_mem: \d+K bytes max, \d+K bytes wanted/
+-- s/Work_mem: \d+K bytes max, \d+K bytes wanted/Work_mem: ###K bytes max, 
###K bytes wanted/
+-- end_matchsubs
+-- Greenplum hash table extra message
+CREATE TABLE test_src_tbl AS
+SELECT i % 10000 AS a, i % 10000 + 1 AS b FROM generate_series(1, 50000) i 
DISTRIBUTED BY (a);
+ANALYZE test_src_tbl;
+-- Enable optimizer_enable_hashagg, and set statement_mem to a small value to 
force spilling
+set optimizer_enable_hashagg = on;
+SET statement_mem = '1000kB';
+-- Hashagg with spilling
+CREATE TABLE test_hashagg_spill AS
+SELECT a, COUNT(DISTINCT b) AS b FROM test_src_tbl GROUP BY a;
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
+EXPLAIN (analyze, costs off) SELECT a, COUNT(DISTINCT b) AS b FROM 
test_src_tbl GROUP BY a;
+QUERY PLAN
+Gather Motion 3:1  (slice1; segments: 3) (actual time=9.551..21.820 rows=10000 
loops=1)
+  ->  HashAggregate (actual time=9.028..10.280 rows=3386 loops=1)
+        Group Key: a
+        Extra Text: (seg0)   hash table(s): 1; 3368 groups total in 4 batches, 
2576 spill partitions; disk usage: 1024KB; chain length 2.9 avg, 9 max; using 
3368 of 20480 buckets; total 0 expansions.
+
+        ->  HashAggregate (actual time=7.066..7.628 rows=3386 loops=1)
+              Group Key: a, b
+              Extra Text: (seg0)   hash table(s): 1; 3368 groups total in 4 
batches, 1264 spill partitions; disk usage: 1024KB; chain length 2.3 avg, 5 
max; using 3368 of 40960 buckets; total 1 expansions.
+
+              ->  Seq Scan on test_src_tbl (actual time=0.020..1.349 
rows=16930 loops=1)
+Planning Time: 5.518 ms
+  (slice0)    Executor memory: 236K bytes.
+* (slice1)    Executor memory: 709K bytes avg x 3 workers, 876K bytes max 
(seg0).  Work_mem: 753K bytes max, 721K bytes wanted.
+Memory used:  1000kB
+Memory wanted:  1640kB
+Optimizer: GPORCA
+Execution Time: 22.843 ms
+(17 rows)
+-- Hashagg with grouping sets
+CREATE TABLE test_hashagg_groupingsets AS
+SELECT a, avg(b) AS b FROM test_src_tbl GROUP BY grouping sets ((a), (b));
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy 
entry.
+-- The planner generates multiple hash tables but ORCA uses Shared Scan.
+-- flaky test
+-- EXPLAIN (analyze, costs off) SELECT a, avg(b) AS b FROM test_src_tbl GROUP 
BY grouping sets ((a), (b));
+RESET optimizer_enable_hashagg;
+RESET statement_mem;
 -- Cleanup
 DROP TABLE boxes;
 DROP TABLE apples;
 DROP TABLE box_locations;
 DROP TABLE jsonexplaintest;
+DROP TABLE test_src_tbl;
+DROP TABLE test_hashagg_spill;
+DROP TABLE test_hashagg_groupingsets;
diff --git a/src/test/regress/sql/explain_format.sql 
b/src/test/regress/sql/explain_format.sql
index 602891698fd..e04cba90947 100644
--- a/src/test/regress/sql/explain_format.sql
+++ b/src/test/regress/sql/explain_format.sql
@@ -17,6 +17,8 @@
 -- s/Memory used:  \d+\w?B/Memory used: ###B/
 -- m/Memory Usage: \d+\w?B/
 -- s/Memory Usage: \d+\w?B/Memory Usage: ###B/
+-- m/Memory wanted:  \d+\w?kB/
+-- s/Memory wanted:  \d+\w?kB/Memory wanted: ###kB/
 -- m/Peak Memory Usage: \d+/
 -- s/Peak Memory Usage: \d+/Peak Memory Usage: ###/
 -- m/Buckets: \d+/
@@ -110,10 +112,41 @@ EXPLAIN (FORMAT XML, COSTS OFF) SELECT * FROM 
generate_series(1, 10);
 CREATE TABLE jsonexplaintest (i int4) PARTITION BY RANGE (i) (START(1) END(3) 
EVERY(1));
 EXPLAIN (FORMAT JSON, COSTS OFF) SELECT * FROM jsonexplaintest WHERE i = 2;
 
--- explain_processing_on
+-- start_matchsubs
+-- m/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./
+-- s/Extra Text: \(seg\d+\)   hash table\(s\): \d+; \d+ groups total in \d+ 
batches, \d+ spill partitions; disk usage: \d+KB; chain length \d+.\d+ avg, \d+ 
max; using \d+ of \d+ buckets; total \d+ expansions./Extra Text: (seg0)   hash 
table(s): ###; ### groups total in ### batches, ### spill partitions; disk 
usage: ###KB; chain length ###.## avg, ### max; using ## of ### buckets; total 
### expansions./
+-- m/Work_mem: \d+K bytes max, \d+K bytes wanted/
+-- s/Work_mem: \d+K bytes max, \d+K bytes wanted/Work_mem: ###K bytes max, 
###K bytes wanted/
+-- end_matchsubs
+-- Greenplum hash table extra message
+CREATE TABLE test_src_tbl AS
+SELECT i % 10000 AS a, i % 10000 + 1 AS b FROM generate_series(1, 50000) i 
DISTRIBUTED BY (a);
+ANALYZE test_src_tbl;
+
+-- Enable optimizer_enable_hashagg, and set statement_mem to a small value to 
force spilling
+set optimizer_enable_hashagg = on;
+SET statement_mem = '1000kB';
+
+-- Hashagg with spilling
+CREATE TABLE test_hashagg_spill AS
+SELECT a, COUNT(DISTINCT b) AS b FROM test_src_tbl GROUP BY a;
+EXPLAIN (analyze, costs off) SELECT a, COUNT(DISTINCT b) AS b FROM 
test_src_tbl GROUP BY a;
+
+-- Hashagg with grouping sets
+CREATE TABLE test_hashagg_groupingsets AS
+SELECT a, avg(b) AS b FROM test_src_tbl GROUP BY grouping sets ((a), (b));
+-- The planner generates multiple hash tables but ORCA uses Shared Scan.
+-- flaky test
+-- EXPLAIN (analyze, costs off) SELECT a, avg(b) AS b FROM test_src_tbl GROUP 
BY grouping sets ((a), (b));
+
+RESET optimizer_enable_hashagg;
+RESET statement_mem;
 
 -- Cleanup
 DROP TABLE boxes;
 DROP TABLE apples;
 DROP TABLE box_locations;
 DROP TABLE jsonexplaintest;
+DROP TABLE test_src_tbl;
+DROP TABLE test_hashagg_spill;
+DROP TABLE test_hashagg_groupingsets;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to