Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-04-08 Thread Alena Rybakina

Hi! Thank you for your work on this issue!

Your patch required a little revision. I did this and attached the patch.

Also, I think you should add some clarification to the comments about 
printing 'exact' and 'lossy' pages in the show_tidbitmap_info function: 
the values now come from planstate->stats, whereas previously they were 
read directly from planstate. Perhaps it is enough to mention this in 
the commit message.


I mean this place:

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 926d70afaf..02251994c6 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3467,26 +3467,57 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 
 static void
 show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
 {
+	Assert(es->analyze);
+
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 		ExplainPropertyInteger("Exact Heap Blocks", NULL,
-							   planstate->exact_pages, es);
+							   planstate->stats.exact_pages, es);
 		ExplainPropertyInteger("Lossy Heap Blocks", NULL,
-							   planstate->lossy_pages, es);
+							   planstate->stats.lossy_pages, es);
 	}
 	else
 	{
-		if (planstate->exact_pages > 0 || planstate->lossy_pages > 0)
+		if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "Heap Blocks:");
-			if (planstate->exact_pages > 0)
-				appendStringInfo(es->str, " exact=%ld", planstate->exact_pages);
-			if (planstate->lossy_pages > 0)
-				appendStringInfo(es->str, " lossy=%ld", planstate->lossy_pages);
+			if (planstate->stats.exact_pages > 0)
+				appendStringInfo(es->str, " exact=%ld", planstate->stats.exact_pages);
+			if (planstate->stats.lossy_pages > 0)
+				appendStringInfo(es->str, " lossy=%ld", planstate->stats.lossy_pages);
 			appendStringInfoChar(es->str, '\n');
 		}
 	}
+
+	if (planstate->pstate != NULL)
+	{
+		for (int n = 0; n < planstate->sinstrument->num_workers; n++)
+		{
+			BitmapHeapScanInstrumentation *si = &planstate->sinstrument->sinstrument[n];
+
+			if (si->exact_pages == 0 && si->lossy_pages == 0)
+				continue;
+
+			if (es->workers_state)
+				ExplainOpenWorker(n, es);
+
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+			{
+				ExplainIndentText(es);
+				appendStringInfo(es->str, "Heap Blocks: exact=%ld lossy=%ld\n",
+								 si->exact_pages, si->lossy_pages);
+			}
+			else
+			{
+				ExplainPropertyInteger("Exact Heap Blocks", NULL, si->exact_pages, es);
+				ExplainPropertyInteger("Lossy Heap Blocks", NULL, si->lossy_pages, es);
+			}
+
+			if (es->workers_state)
+				ExplainCloseWorker(n, es);
+		}
+	}
 }

I also suggest some code refactoring (see the attached diff.diff.no-cfbot 
file) that should improve the code.


--
Regards,
Alena Rybakina
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 973bf56022d..ca8d94ba09a 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -634,8 +634,9 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 	 */
 	if (node->sinstrument != NULL && IsParallelWorker())
 	{
+		BitmapHeapScanInstrumentation *si;
 		Assert(ParallelWorkerNumber <= node->sinstrument->num_workers);
-		BitmapHeapScanInstrumentation *si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
+		si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
 		si->exact_pages += node->stats.exact_pages;
 		si->lossy_pages += node->stats.lossy_pages;
 	}
@@ -864,7 +865,7 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
 
 	ptr = shm_toc_allocate(pcxt->toc, size);
 	pstate = (ParallelBitmapHeapState *) ptr;
-	ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
+	ptr += size;
 	if (node->ss.ps.instrument && pcxt->nworkers > 0)
 		sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
 
From 8dc939749a3f0cea9ba337c020b9ccd474e7c8e3 Mon Sep 17 00:00:00 2001
From: Alena Rybakina 
Date: Mon, 8 Apr 2024 23:41:41 +0300
Subject: [PATCH] Parallel Bitmap Heap Scan reports per-worker stats.

Similarly to other nodes (e.g. hash join, sort, memoize),
Bitmap Heap Scan now reports per-worker stats in the EXPLAIN
ANALYZE output. Previously only the heap blocks stats for the
leader were reported which was incomplete in parallel scans.

Discussion: 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-27 Thread Donghang Lin
On Mon, Mar 25, 2024 at 2:11 PM Melanie Plageman wrote:
> As for whether or not per-worker stats should be displayed by default
> or only with VERBOSE, it sounds like there are two different
> precedents. I don't have a strong feeling one way or the other.
> Whichever is most consistent.
> Donghang, could you list again which plan nodes and explain options
> always print per-worker stats and which only do with the VERBOSE
> option?

I took a look at explain.c where the worker info is printed out.

These rules apply to every parallel-aware node:
Buffers stats are printed for workers with VERBOSE and BUFFERS
WAL stats are printed for workers with VERBOSE and WAL
JIT stats are printed for workers with VERBOSE and COSTS
Timing is printed for workers with VERBOSE and TIMING
Rows and loops are printed for workers with VERBOSE

Some specific nodes behave differently: the Sort / Incremental Sort /
Hash / HashAggregate / Memoize and Bitmap Heap Scan (this patch) nodes
always print their specific stats for workers.

> I did some logging and I do see workers with
> counts of lossy/exact not making it into the final count. I haven't
> had time to debug more, but it is worth looking into.

Indeed, a rescan overrides the previous scan's stats in the workers.
Attached is v5, which is v4 plus the fix to aggregate the counts.

Regards,
Donghang Lin
(ServiceNow)


v5-0001-Parallel-Bitmap-Heap-Scan-reports-per-worker-stat.patch
Description: Binary data


Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-25 Thread Melanie Plageman
On Mon, Mar 25, 2024 at 2:29 AM Donghang Lin  wrote:
>
>
> > On Sat, Feb 17, 2024 at 2:31 PM Tomas Vondra  wrote:
> > 2) Leader vs. worker counters
> >
> > It seems to me this does nothing to add the per-worker values from "Heap
> > Blocks" into the leader, which means we get stuff like this:
> >
> > Heap Blocks: exact=102 lossy=10995
> > Worker 0:  actual time=50.559..209.773 rows=215253 loops=1
> >Heap Blocks: exact=207 lossy=19354
> > Worker 1:  actual time=50.543..211.387 rows=162934 loops=1
> >Heap Blocks: exact=161 lossy=14636
> >
> > I think this is wrong / confusing, and inconsistent with what we do for
> > other nodes. It's also inconsistent with how we deal e.g. with BUFFERS,
> > where we *do* add the values to the leader:
> >
> > Heap Blocks: exact=125 lossy=10789
> > Buffers: shared hit=11 read=45420
> > Worker 0:  actual time=51.419..221.904 rows=150437 loops=1
> >   Heap Blocks: exact=136 lossy=13541
> >   Buffers: shared hit=4 read=13541
> > Worker 1:  actual time=56.610..222.469 rows=229738 loops=1
> >   Heap Blocks: exact=209 lossy=20655
> >   Buffers: shared hit=4 read=20655
> >
> > Here it's not entirely obvious, because leader participates in the
> > execution, but once we disable leader participation, it's clearer:
> >
> > Buffers: shared hit=7 read=45421
> > Worker 0:  actual time=28.540..247.683 rows=309112 loops=1
> >   Heap Blocks: exact=282 lossy=27806
> >   Buffers: shared hit=4 read=28241
> > Worker 1:  actual time=24.290..251.993 rows=190815 loops=1
> >   Heap Blocks: exact=188 lossy=17179
> >   Buffers: shared hit=3 read=17180
> >
> > Not only is "Buffers" clearly a sum of per-worker stats, but the "Heap
> > Blocks" simply disappeared because the leader does nothing and we don't
> > print zeros.
>
> Heap Blocks is specific to Bitmap Heap Scan. It seems that node-specific stats
> do not aggregate workers' stats into leaders for some existing nodes. For
> example, the Memoize node for Hits, Misses, etc.
>
>    ->  Nested Loop (actual rows=17 loops=3)
>          ->  Parallel Seq Scan on t (actual rows=3 loops=3)
>          ->  Memoize (actual rows=5 loops=10)
>                Cache Key: t.j
>                Cache Mode: logical
>                Hits: 32991  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
>                Worker 0:  Hits: 33551  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
>                Worker 1:  Hits: 33443  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
>                ->  Index Scan using uj on u (actual rows=5 loops=15)
>                      Index Cond: (j = t.j)
>
> Sort, HashAggregate also do the same stuff.
>
> > 3) I'm not sure the handling of various EXPLAIN flags is entirely
> > correct. Consider this:
> >
> > EXPLAIN (ANALYZE):
> >
> >->  Parallel Bitmap Heap Scan on t  (...)
> >  Recheck Cond: (a < 5000)
> >  Rows Removed by Index Recheck: 246882
> >  Worker 0:  Heap Blocks: exact=168 lossy=15648
> >  Worker 1:  Heap Blocks: exact=302 lossy=29337
> >
> > EXPLAIN (ANALYZE, VERBOSE):
> >
> >->  Parallel Bitmap Heap Scan on public.t  (...)
> >  Recheck Cond: (t.a < 5000)
> >  Rows Removed by Index Recheck: 246882
> >  Worker 0:  actual time=35.067..300.882 rows=282108 loops=1
> >Heap Blocks: exact=257 lossy=25358
> >  Worker 1:  actual time=32.827..302.224 rows=217819 loops=1
> >Heap Blocks: exact=213 lossy=19627
> >
> > EXPLAIN (ANALYZE, BUFFERS):
> >
> >->  Parallel Bitmap Heap Scan on t  (...)
> >  Recheck Cond: (a < 5000)
> >  Rows Removed by Index Recheck: 246882
> >  Buffers: shared hit=7 read=45421
> >  Worker 0:  Heap Blocks: exact=236 lossy=21870
> >  Worker 1:  Heap Blocks: exact=234 lossy=23115
> >
> > EXPLAIN (ANALYZE, VERBOSE, BUFFERS):
> >
> >->  Parallel Bitmap Heap Scan on public.t  (...)
> >  Recheck Cond: (t.a < 5000)
> >  Rows Removed by Index Recheck: 246882
> >  Buffers: shared hit=7 read=45421
> >  Worker 0:  actual time=28.265..260.381 rows=261264 loops=1
> >Heap Blocks: exact=260 lossy=23477
> >Buffers: shared hit=3 read=23478
> >  Worker 1:  actual time=28.224..261.627 rows=238663 loops=1
> >Heap Blocks: exact=210 lossy=21508
> >Buffers: shared hit=4 read=21943
> >
> > Why should the per-worker buffer info be shown when combined with the
> > VERBOSE flag, and not just with BUFFERS, when the patch shows the
> > per-worker info always?
> >
>
> It seems that the general EXPLAIN print framework requires VERBOSE mode to
> show per-worker stats; that is how buffer hits and JIT stats are printed,
> for example. Some specific nodes that involve parallelism, on the other
> hand, always show worker blocks. This is why we see some worker blocks
> that don't have buffers
> 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-25 Thread Donghang Lin
Hi Tomas,
>
> On Sat, Feb 17, 2024 at 2:31 PM Tomas Vondra <tomas.von...@enterprisedb.com> wrote:
> Hi David,
>
> Do you plan to continue working on this patch? I did take a look,
> and on the whole it looks reasonable - it modifies the right places etc.
>
> I think there are two things that may need an improvement:
>
> 1) Storing variable-length data in ParallelBitmapHeapState
>
> I agree with Robert the snapshot_and_stats name is not great. I see
> Dmitry mentioned phs_snapshot_off as used by ParallelTableScanDescData -
> the reasons are somewhat different (phs_snapshot_off exists because we
> don't know which exact struct will be allocated), while here we simply
> need to allocate two variable-length pieces of memory. But it seems like
> it would work nicely for this. That is, we could try adding an offset
> for each of those pieces of memory:
>
>  - snapshot_off
>  - stats_off
>
> I don't like the GetSharedSnapshotData name very much, it seems very
> close to GetSnapshotData - quite confusing, I think.
>
> Dmitry also suggested we might add a separate piece of shared memory. I
> don't quite see how that would work for ParallelBitmapHeapState, but I
> doubt it'd be simpler than having two offsets. I don't think the extra
> complexity (paid by everyone) would be worth it just to make EXPLAIN
> ANALYZE work.

This issue is now gone after Heikki's fix.

> 2) Leader vs. worker counters
>
> It seems to me this does nothing to add the per-worker values from "Heap
> Blocks" into the leader, which means we get stuff like this:
>
> Heap Blocks: exact=102 lossy=10995
> Worker 0:  actual time=50.559..209.773 rows=215253 loops=1
>Heap Blocks: exact=207 lossy=19354
> Worker 1:  actual time=50.543..211.387 rows=162934 loops=1
>Heap Blocks: exact=161 lossy=14636
>
> I think this is wrong / confusing, and inconsistent with what we do for
> other nodes. It's also inconsistent with how we deal e.g. with BUFFERS,
> where we *do* add the values to the leader:
>
> Heap Blocks: exact=125 lossy=10789
> Buffers: shared hit=11 read=45420
> Worker 0:  actual time=51.419..221.904 rows=150437 loops=1
>   Heap Blocks: exact=136 lossy=13541
>   Buffers: shared hit=4 read=13541
> Worker 1:  actual time=56.610..222.469 rows=229738 loops=1
>   Heap Blocks: exact=209 lossy=20655
>   Buffers: shared hit=4 read=20655
>
> Here it's not entirely obvious, because leader participates in the
> execution, but once we disable leader participation, it's clearer:
>
> Buffers: shared hit=7 read=45421
> Worker 0:  actual time=28.540..247.683 rows=309112 loops=1
>   Heap Blocks: exact=282 lossy=27806
>   Buffers: shared hit=4 read=28241
> Worker 1:  actual time=24.290..251.993 rows=190815 loops=1
>   Heap Blocks: exact=188 lossy=17179
>   Buffers: shared hit=3 read=17180
>
> Not only is "Buffers" clearly a sum of per-worker stats, but the "Heap
> Blocks" simply disappeared because the leader does nothing and we don't
> print zeros.

Heap Blocks is specific to Bitmap Heap Scan. It seems that node-specific
stats do not aggregate workers' stats into leaders for some existing
nodes. For example, the Memoize node for Hits, Misses, etc.

   ->  Nested Loop (actual rows=17 loops=3)
         ->  Parallel Seq Scan on t (actual rows=3 loops=3)
         ->  Memoize (actual rows=5 loops=10)
               Cache Key: t.j
               Cache Mode: logical
               Hits: 32991  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
               Worker 0:  Hits: 33551  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
               Worker 1:  Hits: 33443  Misses: 5  Evictions: 0  Overflows: 0  Memory Usage: 2kB
               ->  Index Scan using uj on u (actual rows=5 loops=15)
                     Index Cond: (j = t.j)

Sort, HashAggregate also do the same stuff.

> 3) I'm not sure the handling of various EXPLAIN flags is entirely
> correct. Consider this:
>
> EXPLAIN (ANALYZE):
>
>->  Parallel Bitmap Heap Scan on t  (...)
>  Recheck Cond: (a < 5000)
>  Rows Removed by Index Recheck: 246882
>  Worker 0:  Heap Blocks: exact=168 lossy=15648
>  Worker 1:  Heap Blocks: exact=302 lossy=29337
>
> EXPLAIN (ANALYZE, VERBOSE):
>
>->  Parallel Bitmap Heap Scan on public.t  (...)
>  Recheck Cond: (t.a < 5000)
>  Rows Removed by Index Recheck: 246882
>  Worker 0:  actual time=35.067..300.882 rows=282108 loops=1
>Heap Blocks: exact=257 lossy=25358
>  Worker 1:  actual time=32.827..302.224 rows=217819 loops=1
>Heap Blocks: exact=213 lossy=19627
>
> EXPLAIN (ANALYZE, BUFFERS):
>
>->  Parallel Bitmap Heap Scan on t  (...)
>  Recheck Cond: (a < 5000)
>  Rows Removed by Index Recheck: 246882
>  Buffers: shared hit=7 read=45421
>  Worker 0:  Heap Blocks: exact=236 lossy=21870
>  Worker 1:  Heap Blocks: exact=234 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-14 Thread Heikki Linnakangas

On 14/03/2024 22:00, Melanie Plageman wrote:

On Thu, Mar 14, 2024 at 05:30:30PM +0200, Heikki Linnakangas wrote:

typedef struct SharedBitmapHeapInstrumentation
{
	int			num_workers;
	BitmapHeapScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedBitmapHeapInstrumentation;

typedef struct BitmapHeapScanState
{
	ScanState	ss;				/* its first field is NodeTag */
	...
	SharedBitmapHeapInstrumentation sinstrument;
} BitmapHeapScanState;

that compiles, at least with my compiler, but I find it weird to have a
variable-length inner struct embedded in an outer struct like that.


In the attached patch, BitmapHeapScanState->sinstrument is a pointer,
though. Or are you proposing the above as an alternative that you
decided not to go with?


Right, the above is what I contemplated at first but decided it was a 
bad idea.


--
Heikki Linnakangas
Neon (https://neon.tech)





Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-14 Thread Melanie Plageman
On Thu, Mar 14, 2024 at 05:30:30PM +0200, Heikki Linnakangas wrote:
> On 18/02/2024 00:31, Tomas Vondra wrote:
> > Do you plan to continue working on this patch? I did take a look,
> > and on the whole it looks reasonable - it modifies the right places etc.
> 
> +1
> 
> > I think there are two things that may need an improvement:
> > 
> > 1) Storing variable-length data in ParallelBitmapHeapState
> > 
> > I agree with Robert the snapshot_and_stats name is not great. I see
> > Dmitry mentioned phs_snapshot_off as used by ParallelTableScanDescData -
> > the reasons are somewhat different (phs_snapshot_off exists because we
> > don't know which exact struct will be allocated), while here we simply
> > need to allocate two variable-length pieces of memory. But it seems like
> > it would work nicely for this. That is, we could try adding an offset
> > for each of those pieces of memory:
> > 
> >   - snapshot_off
> >   - stats_off
> > 
> > I don't like the GetSharedSnapshotData name very much, it seems very
> > close to GetSnapshotData - quite confusing, I think.
> > 
> > Dmitry also suggested we might add a separate piece of shared memory. I
> > don't quite see how that would work for ParallelBitmapHeapState, but I
> > doubt it'd be simpler than having two offsets. I don't think the extra
> > complexity (paid by everyone) would be worth it just to make EXPLAIN
> > ANALYZE work.
> 
> I just removed phs_snapshot_data in commit 84c18acaf6. I thought that would
> make this moot, but now that I rebased this, there are still some aesthetic
> questions on how best to represent this.
> 
> In all the other node types that use shared instrumentation like this, the
> pattern is as follows: (using Memoize here as an example, but it's similar
> for Sort, IncrementalSort, Agg and Hash)
> 
> /* ----------------
>  * Shared memory container for per-worker memoize information
>  * ----------------
>  */
> typedef struct SharedMemoizeInfo
> {
>   int num_workers;
>   MemoizeInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
> } SharedMemoizeInfo;
> 
> /* this struct is backend-private */
> typedef struct MemoizeState
> {
>   ScanState   ss; /* its first field is NodeTag */
>   ...
>   MemoizeInstrumentation stats;   /* execution statistics */
>   SharedMemoizeInfo *shared_info; /* statistics for parallel workers */
> } MemoizeState;
> 
> While the scan is running, the node updates its private data in
> MemoizeState->stats. At the end of a parallel scan, the worker process
> copies the MemoizeState->stats to MemoizeState->shared_info->stats, which
> lives in shared memory. The leader process copies
> MemoizeState->shared_info->stats to its own backend-private copy, which it
> then stores in its MemoizeState->shared_info, replacing the pointer to the
> shared memory with a pointer to the private copy. That happens in
> ExecMemoizeRetrieveInstrumentation().
> 
> This is a little different for parallel bitmap heap scans, because a bitmap
> heap scan keeps some other data in shared memory too, not just
> instrumentation data. Also, the naming is inconsistent: the equivalent of
> SharedMemoizeInfo is actually called ParallelBitmapHeapState. I think we
> should rename it to SharedBitmapHeapInfo, to make it clear that it lives in
> shared memory, but I digress.

FWIW, if we merge a BHS streaming read user like the one I propose in
[1] (not as a pre-condition to this but just as something to make you
more comfortable with these names), the ParallelBitmapHeapState will
basically only contain the shared iterator and the coordination state
for accessing it and could be named as such.

Then if you really wanted to be consistent with Memoize, you could name
the instrumentation SharedBitmapHeapInfo. But, personally I prefer the
name you gave it: SharedBitmapHeapInstrumentation. I think that would
have been a better name for SharedMemoizeInfo since num_workers is
really just used as the length of the array of instrumentation info.

> We could now put the new stats at the end of ParallelBitmapHeapState as a
> varlen field. But I'm not sure that's a good idea. In
> ExecBitmapHeapRetrieveInstrumentation(), would we make a backend-private
> copy of the whole ParallelBitmapHeapState struct, even though the other
> fields don't make sense after the shared memory is released? Sounds
> confusing. Or we could introduce a separate struct for the stats, and copy
> just that:
> 
> typedef struct SharedBitmapHeapInstrumentation
> {
>   int num_workers;
>   BitmapHeapScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
> } SharedBitmapHeapInstrumentation;
> 
> typedef struct BitmapHeapScanState
> {
>   ScanState   ss; /* its first field is NodeTag */
>   ...
>   SharedBitmapHeapInstrumentation sinstrument;
> } BitmapHeapScanState;
> 
> that compiles, at least with my compiler, but I find it 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-14 Thread Heikki Linnakangas

On 18/02/2024 00:31, Tomas Vondra wrote:

Do you plan to continue working on this patch? I did take a look,
and on the whole it looks reasonable - it modifies the right places etc.


+1


I think there are two things that may need an improvement:

1) Storing variable-length data in ParallelBitmapHeapState

I agree with Robert the snapshot_and_stats name is not great. I see
Dmitry mentioned phs_snapshot_off as used by ParallelTableScanDescData -
the reasons are somewhat different (phs_snapshot_off exists because we
don't know which exact struct will be allocated), while here we simply
need to allocate two variable-length pieces of memory. But it seems like
it would work nicely for this. That is, we could try adding an offset
for each of those pieces of memory:

  - snapshot_off
  - stats_off

I don't like the GetSharedSnapshotData name very much, it seems very
close to GetSnapshotData - quite confusing, I think.

Dmitry also suggested we might add a separate piece of shared memory. I
don't quite see how that would work for ParallelBitmapHeapState, but I
doubt it'd be simpler than having two offsets. I don't think the extra
complexity (paid by everyone) would be worth it just to make EXPLAIN
ANALYZE work.


I just removed phs_snapshot_data in commit 84c18acaf6. I thought that 
would make this moot, but now that I rebased this, there are still some 
aesthetic questions on how best to represent this.


In all the other node types that use shared instrumentation like this, 
the pattern is as follows: (using Memoize here as an example, but it's 
similar for Sort, IncrementalSort, Agg and Hash)


/* ----------------
 *	 Shared memory container for per-worker memoize information
 * ----------------
 */
typedef struct SharedMemoizeInfo
{
	int			num_workers;
	MemoizeInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedMemoizeInfo;

/* this struct is backend-private */
typedef struct MemoizeState
{
	ScanState	ss;				/* its first field is NodeTag */
	...
	MemoizeInstrumentation stats;	/* execution statistics */
	SharedMemoizeInfo *shared_info; /* statistics for parallel workers */
} MemoizeState;

While the scan is running, the node updates its private data in 
MemoizeState->stats. At the end of a parallel scan, the worker process 
copies the MemoizeState->stats to MemoizeState->shared_info->stats, 
which lives in shared memory. The leader process copies 
MemoizeState->shared_info->stats to its own backend-private copy, which 
it then stores in its MemoizeState->shared_info, replacing the pointer 
to the shared memory with a pointer to the private copy. That happens in 
ExecMemoizeRetrieveInstrumentation().


This is a little different for parallel bitmap heap scans, because a 
bitmap heap scan keeps some other data in shared memory too, not just 
instrumentation data. Also, the naming is inconsistent: the equivalent 
of SharedMemoizeInfo is actually called ParallelBitmapHeapState. I think 
we should rename it to SharedBitmapHeapInfo, to make it clear that it 
lives in shared memory, but I digress.


We could now put the new stats at the end of ParallelBitmapHeapState as 
a varlen field. But I'm not sure that's a good idea. In 
ExecBitmapHeapRetrieveInstrumentation(), would we make a backend-private 
copy of the whole ParallelBitmapHeapState struct, even though the other 
fields don't make sense after the shared memory is released? Sounds 
confusing. Or we could introduce a separate struct for the stats, and 
copy just that:


typedef struct SharedBitmapHeapInstrumentation
{
	int			num_workers;
	BitmapHeapScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedBitmapHeapInstrumentation;

typedef struct BitmapHeapScanState
{
	ScanState	ss;				/* its first field is NodeTag */
	...
	SharedBitmapHeapInstrumentation sinstrument;
} BitmapHeapScanState;

that compiles, at least with my compiler, but I find it weird to have a 
variable-length inner struct embedded in an outer struct like that.


Long story short, I think it's still better to store 
SharedBitmapHeapInstrumentation separately in the DSM chunk, not 
as part of ParallelBitmapHeapState. The attached patch does that, 
rebased over current master.
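
To make the data flow concrete, the leader-side retrieval step following 
the Memoize pattern could look roughly like this. This is a sketch only, 
assuming the sinstrument naming discussed here; the attached patch is 
authoritative:

/*
 * Sketch of the leader-side retrieval step, following the Memoize
 * pattern described above. Struct and field names are assumed from
 * the patch under discussion.
 */
static void
ExecBitmapHeapRetrieveInstrumentation(BitmapHeapScanState *node)
{
	SharedBitmapHeapInstrumentation *sinstrument = node->sinstrument;
	Size		size;

	if (sinstrument == NULL)
		return;

	/* struct header plus one slot per worker */
	size = offsetof(SharedBitmapHeapInstrumentation, sinstrument) +
		sinstrument->num_workers * sizeof(BitmapHeapScanInstrumentation);

	/* replace the DSM pointer with a backend-local copy */
	node->sinstrument = palloc(size);
	memcpy(node->sinstrument, sinstrument, size);
}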



I didn't address any of the other things that you, Tomas, pointed out, 
but I think they're valid concerns.


--
Heikki Linnakangas
Neon (https://neon.tech)
From 04f11a37ee04b282c51e9dae68ead7c9c3d5fb3d Mon Sep 17 00:00:00 2001
From: David Geier 
Date: Tue, 8 Nov 2022 19:40:31 +0100
Subject: [PATCH v3 1/1] Parallel Bitmap Heap Scan reports per-worker stats

Similarly to other nodes (e.g. hash join, sort, memoize),
Bitmap Heap Scan now reports per-worker stats in the EXPLAIN
ANALYZE output. Previously only the heap blocks stats for the
leader were reported which was incomplete in parallel scans.

Discussion: 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-03-01 Thread Melanie Plageman
On Sat, Feb 17, 2024 at 5:31 PM Tomas Vondra
 wrote:
>
> Hi David,
>
> Do you plan to continue working on this patch? I did take a look,
> and on the whole it looks reasonable - it modifies the right places etc.

I haven't started reviewing this patch yet, but I just ran into the
behavior that it fixes and was very outraged. +1 to fixing this.

- Melanie




Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2024-02-17 Thread Tomas Vondra
Hi David,

Do you plan to continue working on this patch? I did take a look,
and on the whole it looks reasonable - it modifies the right places etc.

I think there are two things that may need an improvement:

1) Storing variable-length data in ParallelBitmapHeapState

I agree with Robert the snapshot_and_stats name is not great. I see
Dmitry mentioned phs_snapshot_off as used by ParallelTableScanDescData -
the reasons are somewhat different (phs_snapshot_off exists because we
don't know which exact struct will be allocated), while here we simply
need to allocate two variable-length pieces of memory. But it seems like
it would work nicely for this. That is, we could try adding an offset
for each of those pieces of memory:

 - snapshot_off
 - stats_off

I don't like the GetSharedSnapshotData name very much, it seems very
close to GetSnapshotData - quite confusing, I think.

Dmitry also suggested we might add a separate piece of shared memory. I
don't quite see how that would work for ParallelBitmapHeapState, but I
doubt it'd be simpler than having two offsets. I don't think the extra
complexity (paid by everyone) would be worth it just to make EXPLAIN
ANALYZE work.
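
To illustrate, the two-offset layout could look something like this; the 
offset field names come from the list above, but the accessor macros are 
hypothetical, just to sketch the idea:

/*
 * Hypothetical sketch of the two-offset layout; the accessors are
 * illustrative and not from any patch version. Both variable-length
 * pieces are appended after the fixed part of the struct, each found
 * via its own MAXALIGN'd offset from the struct start.
 */
typedef struct ParallelBitmapHeapState
{
	...
	Size		snapshot_off;	/* offset of the serialized snapshot */
	Size		stats_off;		/* offset of the per-worker stats array */
} ParallelBitmapHeapState;

#define ParallelBitmapHeapSnapshot(pstate) \
	((char *) (pstate) + (pstate)->snapshot_off)
#define ParallelBitmapHeapStats(pstate) \
	((BitmapHeapScanInstrumentation *) \
	 ((char *) (pstate) + (pstate)->stats_off))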


2) Leader vs. worker counters

It seems to me this does nothing to add the per-worker values from "Heap
Blocks" into the leader, which means we get stuff like this:

Heap Blocks: exact=102 lossy=10995
Worker 0:  actual time=50.559..209.773 rows=215253 loops=1
   Heap Blocks: exact=207 lossy=19354
Worker 1:  actual time=50.543..211.387 rows=162934 loops=1
   Heap Blocks: exact=161 lossy=14636

I think this is wrong / confusing, and inconsistent with what we do for
other nodes. It's also inconsistent with how we deal e.g. with BUFFERS,
where we *do* add the values to the leader:

Heap Blocks: exact=125 lossy=10789
Buffers: shared hit=11 read=45420
Worker 0:  actual time=51.419..221.904 rows=150437 loops=1
  Heap Blocks: exact=136 lossy=13541
  Buffers: shared hit=4 read=13541
Worker 1:  actual time=56.610..222.469 rows=229738 loops=1
  Heap Blocks: exact=209 lossy=20655
  Buffers: shared hit=4 read=20655

Here it's not entirely obvious, because leader participates in the
execution, but once we disable leader participation, it's clearer:

Buffers: shared hit=7 read=45421
Worker 0:  actual time=28.540..247.683 rows=309112 loops=1
  Heap Blocks: exact=282 lossy=27806
  Buffers: shared hit=4 read=28241
Worker 1:  actual time=24.290..251.993 rows=190815 loops=1
  Heap Blocks: exact=188 lossy=17179
  Buffers: shared hit=3 read=17180

Not only is "Buffers" clearly a sum of per-worker stats, but the "Heap
Blocks" simply disappeared because the leader does nothing and we don't
print zeros.


3) I'm not sure the handling of various EXPLAIN flags is entirely
correct. Consider this:

EXPLAIN (ANALYZE):

   ->  Parallel Bitmap Heap Scan on t  (...)
 Recheck Cond: (a < 5000)
 Rows Removed by Index Recheck: 246882
 Worker 0:  Heap Blocks: exact=168 lossy=15648
 Worker 1:  Heap Blocks: exact=302 lossy=29337

EXPLAIN (ANALYZE, VERBOSE):

   ->  Parallel Bitmap Heap Scan on public.t  (...)
 Recheck Cond: (t.a < 5000)
 Rows Removed by Index Recheck: 246882
 Worker 0:  actual time=35.067..300.882 rows=282108 loops=1
   Heap Blocks: exact=257 lossy=25358
 Worker 1:  actual time=32.827..302.224 rows=217819 loops=1
   Heap Blocks: exact=213 lossy=19627

EXPLAIN (ANALYZE, BUFFERS):

   ->  Parallel Bitmap Heap Scan on t  (...)
 Recheck Cond: (a < 5000)
 Rows Removed by Index Recheck: 246882
 Buffers: shared hit=7 read=45421
 Worker 0:  Heap Blocks: exact=236 lossy=21870
 Worker 1:  Heap Blocks: exact=234 lossy=23115

EXPLAIN (ANALYZE, VERBOSE, BUFFERS):

   ->  Parallel Bitmap Heap Scan on public.t  (...)
 Recheck Cond: (t.a < 5000)
 Rows Removed by Index Recheck: 246882
 Buffers: shared hit=7 read=45421
 Worker 0:  actual time=28.265..260.381 rows=261264 loops=1
   Heap Blocks: exact=260 lossy=23477
   Buffers: shared hit=3 read=23478
 Worker 1:  actual time=28.224..261.627 rows=238663 loops=1
   Heap Blocks: exact=210 lossy=21508
   Buffers: shared hit=4 read=21943

Why should the per-worker buffer info be shown when combined with the
VERBOSE flag, and not just with BUFFERS, when the patch shows the
per-worker info always?


4) Now that I think about this, isn't the *main* problem really that we
don't display the sum of the per-worker stats (which I think is wrong)?
I mean, we can already get the worker details with VERBOSE, right? So the
only reason to display that by default seems to be that the values in
"Heap Blocks" are from the leader only.

BTW doesn't this also suggest some of the code added to explain.c may
not be quite necessary? Wouldn't it be enough to just "extend" the
existing 

Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-11-04 Thread Dilip Kumar
On Fri, Jan 20, 2023 at 2:04 PM David Geier  wrote:
>
> Hi hackers,
>
> EXPLAIN ANALYZE for parallel Bitmap Heap Scans currently only reports
> the number of heap blocks processed by the leader. It's missing the
> per-worker stats. The attached patch adds that functionality in the
> spirit of e.g. Sort or Memoize. Here is a simple test case and the
> EXPLAIN ANALYZE output with and without the patch:
>

> With the patch:
>
>   Gather (actual rows=99501 loops=1)
> Workers Planned: 2
> Workers Launched: 2
> ->  Parallel Bitmap Heap Scan on foo (actual rows=33167 loops=3)
>   Recheck Cond: ((col0 > 900) OR (col1 = 1))
>   Heap Blocks: exact=98
>   Worker 0:  Heap Blocks: exact=171 lossy=0
>   Worker 1:  Heap Blocks: exact=172 lossy=0


 	else
 	{
+		if (planstate->stats.exact_pages > 0)
+			appendStringInfo(es->str, " exact=%ld", planstate->stats.exact_pages);
+		if (planstate->stats.lossy_pages > 0)
+			appendStringInfo(es->str, " lossy=%ld", planstate->stats.lossy_pages);
 		appendStringInfoChar(es->str, '\n');
 	}
 }

+	for (int n = 0; n < planstate->shared_info->num_workers; n++)
+	{

+		"Heap Blocks: exact=" UINT64_FORMAT " lossy=" INT64_FORMAT "\n",
+		si->exact_pages, si->lossy_pages);

Shouldn't we use the same format for reporting exact and lossy pages
for the actual backend and the worker? I mean here for the backend you
are showing lossy pages only if it is > 0 whereas for workers we are
showing 0 lossy pages as well.

-- 
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com




Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-10-17 Thread Robert Haas
On Mon, Oct 16, 2023 at 12:31 PM Michael Christofides
 wrote:
> According to the docs[1]: "In a parallel bitmap heap scan, one process is 
> chosen as the leader. That process performs a scan of one or more indexes and 
> builds a bitmap indicating which table blocks need to be visited. These 
> blocks are then divided among the cooperating processes as in a parallel 
> sequential scan."
>
> My understanding is that the "Heap Blocks" statistic is only reporting blocks 
> for the bitmap (i.e. not the subsequent scan). As such, I think it is correct 
> that the workers do not report additional exact heap blocks.

I think you're wrong about that. The bitmap index scans are what scan
the indexes and build the bitmap. The bitmap heap scan node is what
scans the heap i.e. the table, and that is what is divided across the
workers.

On the patch itself, snapshot_and_stats doesn't strike me as a great
name. If we added three more variable-length things would we call the
member snapshot_and_stats_and_pink_and_orange_and_blue? Probably
better to pick a name that is somehow more generic.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-10-16 Thread Michael Christofides
> EXPLAIN ANALYZE for parallel Bitmap Heap Scans currently only reports
> the number of heap blocks processed by the leader. It's missing the
> per-worker stats.


Hi David,

According to the docs[1]: "In a parallel bitmap heap scan, one process is
chosen as the leader. That process performs a scan of one or more indexes
and builds a bitmap indicating which table blocks need to be visited. These
blocks are then divided among the cooperating processes as in a parallel
sequential scan."

My understanding is that the "Heap Blocks" statistic is only reporting
blocks for the bitmap (i.e. not the subsequent scan). As such, I think it
is correct that the workers do not report additional exact heap blocks.


> explain (analyze, costs off, timing off) select * from foo where col0 >
> 900 or col1 = 1;
>

In your example, if you add the buffers and verbose parameters, do the
worker reported buffers numbers report what you are looking for?

i.e. explain (analyze, buffers, verbose, costs off, timing off) select *
from foo where col0 > 900 or col1 = 1;

—
Michael Christofides
Founder, pgMustard 

[1]:
https://www.postgresql.org/docs/current/parallel-plans.html#PARALLEL-SCANS


Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-10-06 Thread Dmitry Dolgov
> On Wed, Sep 20, 2023 at 03:42:43PM +0200, David Geier wrote:
> Another, not yet discussed, option I can see work is:
>
> 4. Allocate a fixed amount of memory for the instrumentation stats based on
> MAX_PARALLEL_WORKER_LIMIT: MAX_PARALLEL_WORKER_LIMIT is 1024 and used as the
> limit of the max_parallel_workers GUC. This way MAX_PARALLEL_WORKER_LIMIT *
> sizeof(BitmapHeapScanInstrumentation) = 1024 * 8 = 8192 bytes would be
> allocated. To cut this down in half we could additionally change the type of
> lossy_pages and exact_pages from long to uint32. Only the memory actually
> needed would have to be initialized; the remaining unused memory would
> remain untouched so as not to waste cycles.

I'm not sure that it would be acceptable -- if I understand correctly it
would be 8192 bytes per parallel bitmap heap scan node, and could be
noticeable in the worst case scenario with too many connections and too
many such nodes in every query.

I find the original approach with an offset not that bad; after all,
there is something similar going on in other places, e.g. parallel heap
scan also has phs_snapshot_off (although the rest is fixed size). My
commentary above in the thread was mostly about the cosmetic side.
Give snapshot_and_stats a decent name and maybe even ditch the access
functions, using the offset field directly in the couple of places that
need it, and it would look better to me.




Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-09-20 Thread David Geier

Hi Dmitry,

Thanks for looking at the patch and sorry for the long line.

On 3/17/23 21:14, Dmitry Dolgov wrote:

The idea sounds reasonable to me, but I have to admit the snapshot_and_stats
implementation looks awkward. Maybe it would be better to have a
separate structure field for both stats and snapshot, which will be set
to point to a corresponding place in the shared FAM e.g. when the worker
is getting initialized? shm_toc_allocate mentions BUFFERALIGN to handle
possibility of some atomic operations needing it, so I guess that would
have to be an alignment in this case as well.

Probably another option would be to allocate two separate pieces of
shared memory, which resolves questions like proper alignment, but
annoyingly will require an extra lookup and a new key.


I considered the other options and it seems to me none of them is 
particularly superior. All of them have pros and cons with the cons 
mostly outweighing the pros. Let me quickly elaborate:


1. Use multiple shm_toc entries: The shared state is split into multiple 
pieces, and extra pointers in BitmapHeapScanState are needed to point at 
the split-out data. BitmapHeapScanState already has a shared_info member, 
which is not a pointer to the shared memory but a pointer to leader-local 
data allocated to store the instrumentation data from the workers. This 
is confusing but at least consistent with how it's done in other places 
(e.g. execSort.c, nodeHash.c, nodeIncrementalSort.c). Having another 
pointer there which points to the shared memory makes it even more 
confusing. If we go this way we would have e.g. shared_info_copy and 
shared_info members in BitmapHeapScanState.


2. Store two extra pointers to the shared FAM entries in 
BitmapHeapScanState: IMHO, that is the better alternative to (1) as it 
doesn't need an extra TOC entry, but it comes with the same confusion of 
multiple pointers to SharedBitmapHeapScanInfo in BitmapHeapScanState. 
But maybe that's not too bad?


3. Solution in initial patch (use two functions to obtain pointers 
where/when needed): Avoids the need for another pointer in 
BitmapHeapScanState at the cost / ugliness of having to call the helper 
functions.


Another, not yet discussed, option I can see work is:

4. Allocate a fixed amount of memory for the instrumentation stats based 
on MAX_PARALLEL_WORKER_LIMIT: MAX_PARALLEL_WORKER_LIMIT is 1024 and used 
as the limit of the max_parallel_workers GUC. This way 
MAX_PARALLEL_WORKER_LIMIT * sizeof(BitmapHeapScanInstrumentation) = 1024 
* 8 = 8192 bytes would be allocated. To cut this down in half we could 
additionally change the type of lossy_pages and exact_pages from long to 
uint32. Only the memory actually needed would have to be initialized; 
the remaining unused memory would remain untouched so as not to waste 
cycles.
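
For illustration, option 4 could look roughly like the sketch below; the 
struct shapes are assumptions based on the description above, not code 
from the patch:

/*
 * Illustrative sketch of option (4) only. The stats array is sized for
 * the compile-time worker limit, so no offset math or accessor
 * functions are needed; only the slots of launched workers would ever
 * be initialized or read.
 */
typedef struct BitmapHeapScanInstrumentation
{
	uint32		exact_pages;	/* narrowed from long to halve the size */
	uint32		lossy_pages;
} BitmapHeapScanInstrumentation;

typedef struct SharedBitmapHeapInstrumentation
{
	int			num_workers;
	BitmapHeapScanInstrumentation sinstrument[MAX_PARALLEL_WORKER_LIMIT];
} SharedBitmapHeapInstrumentation;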


My first preference is the new option (4). My second preference is 
option (1). What's your take?


--
David Geier
(ServiceNow)





Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-03-17 Thread Dmitry Dolgov
> On Tue, Feb 21, 2023 at 01:02:35PM +0100, David Geier wrote:
> Hi,
>
> On 1/20/23 09:34, David Geier wrote:
> > EXPLAIN ANALYZE for parallel Bitmap Heap Scans currently only reports
> > the number of heap blocks processed by the leader. It's missing the
> > per-worker stats. The attached patch adds that functionality in the
> > spirit of e.g. Sort or Memoize. Here is a simple test case and the
> > EXPLAIN ANALYZE output with and without the patch:
>
> Attached is a rebased version of the patch. I would appreciate someone
> taking a look.
>
> As background: the change doesn't come out of thin air. We repeatedly drew
> wrong conclusions in our query analysis because we assumed that the reported
> block counts include the workers.
>
> If no one objects I would also register the patch at the commit fest. The
> patch is passing cleanly on CI.

Thanks for the patch.

The idea sounds reasonable to me, but I have to admit the snapshot_and_stats
implementation looks awkward. Maybe it would be better to have a
separate structure field for both stats and snapshot, which will be set
to point to a corresponding place in the shared FAM e.g. when the worker
is getting initialized? shm_toc_allocate mentions BUFFERALIGN to handle
possibility of some atomic operations needing it, so I guess that would
have to be an alignment in this case as well.

Probably another option would be to allocate two separate pieces of
shared memory, which resolves questions like proper alignment, but
annoyingly will require an extra lookup and a new key.




Re: Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-02-21 Thread David Geier

Hi,

On 1/20/23 09:34, David Geier wrote:
EXPLAIN ANALYZE for parallel Bitmap Heap Scans currently only reports 
the number of heap blocks processed by the leader. It's missing the 
per-worker stats. The attached patch adds that functionality in the 
spirit of e.g. Sort or Memoize. Here is a simple test case and the 
EXPLAIN ANALYZE output with and without the patch:


Attached is a rebased version of the patch. I would appreciate someone 
taking a look.


As background: the change doesn't come out of thin air. We repeatedly 
drew wrong conclusions in our query analysis because we assumed that the 
reported block counts include the workers.


If no one objects I would also register the patch at the commit fest. 
The patch is passing cleanly on CI.


--
David Geier
(ServiceNow)
From de03dfb101810f051b883804947707ecddaffceb Mon Sep 17 00:00:00 2001
From: David Geier 
Date: Tue, 8 Nov 2022 19:40:31 +0100
Subject: [PATCH v2] Parallel Bitmap Heap Scan reports per-worker stats

Similarly to other nodes (e.g. hash join, sort, memoize),
Bitmap Heap Scan now reports per-worker stats in the EXPLAIN
ANALYZE output. Previously only the heap blocks stats for the
leader were reported which was incomplete in parallel scans.

	Discussion: https://www.postgresql.org/message-id/flat/b3d80961-c2e5-38cc-6a32-61886cdf766d%40gmail.com
---
 src/backend/commands/explain.c| 45 ++--
 src/backend/executor/execParallel.c   |  3 +
 src/backend/executor/nodeBitmapHeapscan.c | 88 +++
 src/include/executor/nodeBitmapHeapscan.h |  1 +
 src/include/nodes/execnodes.h | 23 +-
 5 files changed, 137 insertions(+), 23 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e57bda7b62..b5b31a763c 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3396,26 +3396,57 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 static void
 show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
 {
+	Assert(es->analyze);
+
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 		ExplainPropertyInteger("Exact Heap Blocks", NULL,
-			   planstate->exact_pages, es);
+			   planstate->stats.exact_pages, es);
 		ExplainPropertyInteger("Lossy Heap Blocks", NULL,
-			   planstate->lossy_pages, es);
+			   planstate->stats.lossy_pages, es);
 	}
 	else
 	{
-		if (planstate->exact_pages > 0 || planstate->lossy_pages > 0)
+		if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "Heap Blocks:");
-			if (planstate->exact_pages > 0)
-				appendStringInfo(es->str, " exact=%ld", planstate->exact_pages);
-			if (planstate->lossy_pages > 0)
-				appendStringInfo(es->str, " lossy=%ld", planstate->lossy_pages);
+			if (planstate->stats.exact_pages > 0)
+				appendStringInfo(es->str, " exact=%ld", planstate->stats.exact_pages);
+			if (planstate->stats.lossy_pages > 0)
+				appendStringInfo(es->str, " lossy=%ld", planstate->stats.lossy_pages);
 			appendStringInfoChar(es->str, '\n');
 		}
 	}
+
+	if (planstate->shared_info != NULL)
+	{
+		for (int n = 0; n < planstate->shared_info->num_workers; n++)
+		{
+			BitmapHeapScanInstrumentation *si = &planstate->shared_info->sinstrument[n];
+
+			if (si->exact_pages == 0 && si->lossy_pages == 0)
+				continue;
+
+			if (es->workers_state)
+				ExplainOpenWorker(n, es);
+
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+			{
+				ExplainIndentText(es);
+				appendStringInfo(es->str, "Heap Blocks: exact=%ld lossy=%ld\n",
+								 si->exact_pages, si->lossy_pages);
+			}
+			else
+			{
+				ExplainPropertyInteger("Exact Heap Blocks", NULL, si->exact_pages, es);
+				ExplainPropertyInteger("Lossy Heap Blocks", NULL, si->lossy_pages, es);
+			}
+
+			if (es->workers_state)
+				ExplainCloseWorker(n, es);
+		}
+	}
 }
 
 /*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index aa3f283453..73ef4f0c0f 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -1072,6 +1072,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_MemoizeState:
 			ExecMemoizeRetrieveInstrumentation((MemoizeState *) planstate);
 			break;
+		case T_BitmapHeapScanState:
+			ExecBitmapHeapRetrieveInstrumentation((BitmapHeapScanState *) planstate);
+			break;
 		default:
 			break;
 	}
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index f35df0b8bf..6d36e7922b 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -239,9 +239,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			}
 
 			if (tbmres->ntuples >= 0)
-				node->exact_pages++;
+				node->stats.exact_pages++;
 			else
-				node->lossy_pages++;
+				node->stats.lossy_pages++;
 
 			/* Adjust the prefetch target */
 			BitmapAdjustPrefetchTarget(node);
@@ -641,6 +641,22 @@ 

Parallel Bitmap Heap Scan reports per-worker stats in EXPLAIN ANALYZE

2023-01-20 Thread David Geier

Hi hackers,

EXPLAIN ANALYZE for parallel Bitmap Heap Scans currently only reports 
the number of heap blocks processed by the leader. It's missing the 
per-worker stats. The attached patch adds that functionality in the 
spirit of e.g. Sort or Memoize. Here is a simple test case and the 
EXPLAIN ANALYZE output with and without the patch:


create table foo(col0 int, col1 int);
insert into foo select generate_series(1, 1000, 0.001), generate_series(1000, 2000, 0.001);

create index idx0 on foo(col0);
create index idx1 on foo(col1);
set parallel_tuple_cost = 0;
set parallel_setup_cost = 0;
explain (analyze, costs off, timing off) select * from foo where col0 > 900 or col1 = 1;


With the patch:

 Gather (actual rows=99501 loops=1)
   Workers Planned: 2
   Workers Launched: 2
   ->  Parallel Bitmap Heap Scan on foo (actual rows=33167 loops=3)
 Recheck Cond: ((col0 > 900) OR (col1 = 1))
 Heap Blocks: exact=98
 Worker 0:  Heap Blocks: exact=171 lossy=0
 Worker 1:  Heap Blocks: exact=172 lossy=0
 ->  BitmapOr (actual rows=0 loops=1)
   ->  Bitmap Index Scan on idx0 (actual rows=99501 loops=1)
 Index Cond: (col0 > 900)
   ->  Bitmap Index Scan on idx1 (actual rows=0 loops=1)
 Index Cond: (col1 = 1)

Without the patch:

 Gather (actual rows=99501 loops=1)
   Workers Planned: 2
   Workers Launched: 2
   ->  Parallel Bitmap Heap Scan on foo (actual rows=33167 loops=3)
 Recheck Cond: ((col0 > 900) OR (col1 = 1))
 Heap Blocks: exact=91
 ->  BitmapOr (actual rows=0 loops=1)
   ->  Bitmap Index Scan on idx0 (actual rows=99501 loops=1)
 Index Cond: (col0 > 900)
   ->  Bitmap Index Scan on idx1 (actual rows=0 loops=1)
 Index Cond: (col1 = 1)

So in total the parallel Bitmap Heap Scan actually processed 441 heap 
blocks instead of just 91.


Now two variable-length arrays (VLAs) would be needed, one for the 
snapshot and one for the stats. As this obviously doesn't work, I now 
use a single, big VLA and added functions to retrieve pointers to the 
respective fields. I'm using MAXALIGN() to make sure the latter field is 
aligned properly. Am I doing this correctly? I'm not entirely sure about 
alignment conventions and the requirements of other platforms.
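
For reference, the layout and accessor I mean look roughly like this 
(simplified; the names may differ from the attached patch):

/*
 * Simplified sketch of the single-VLA layout; names may differ from the
 * attached patch. The serialized snapshot is stored first and the stats
 * array starts at the next MAXALIGN'd boundary, which is what the
 * MAXALIGN() call is for.
 */
typedef struct ParallelBitmapHeapState
{
	...
	char		snapshot_and_stats[FLEXIBLE_ARRAY_MEMBER];
} ParallelBitmapHeapState;

static inline BitmapHeapScanInstrumentation *
GetSharedStats(ParallelBitmapHeapState *pstate, Size snapshot_size)
{
	return (BitmapHeapScanInstrumentation *)
		(pstate->snapshot_and_stats + MAXALIGN(snapshot_size));
}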


I couldn't find existing tests that exercise the EXPLAIN ANALYZE output 
of specific nodes. I could only find a few basic smoke tests for EXPLAIN 
ANALYZE with parallel nodes in parallel_select.sql. Do we want tests for 
the changed functionality? If so I could right away also add tests for 
EXPLAIN ANALYZE including other parallel nodes.


Thank you for your feedback.

--
David Geier
(ServiceNow)
From b2c84fb16e9521d6cfadb0c069e27a213e8e8471 Mon Sep 17 00:00:00 2001
From: David Geier 
Date: Tue, 8 Nov 2022 19:40:31 +0100
Subject: [PATCH v1] Parallel Bitmap Heap Scan reports per-worker stats

Similarly to other nodes (e.g. hash join, sort, memoize),
Bitmap Heap Scan now reports per-worker stats in the EXPLAIN
ANALYZE output. Previously only the heap blocks stats for the
leader were reported which was incomplete in parallel scans.
---
 src/backend/commands/explain.c| 46 ++--
 src/backend/executor/execParallel.c   |  3 +
 src/backend/executor/nodeBitmapHeapscan.c | 88 +++
 src/include/executor/nodeBitmapHeapscan.h |  1 +
 src/include/nodes/execnodes.h | 23 +-
 5 files changed, 138 insertions(+), 23 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 5212a64b1e..d532fc4a87 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3396,26 +3396,58 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 static void
 show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
 {
+	Assert(es->analyze);
+
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 		ExplainPropertyInteger("Exact Heap Blocks", NULL,
-			   planstate->exact_pages, es);
+			   planstate->stats.exact_pages, es);
 		ExplainPropertyInteger("Lossy Heap Blocks", NULL,
-			   planstate->lossy_pages, es);
+			   planstate->stats.lossy_pages, es);
 	}
 	else
 	{
-		if (planstate->exact_pages > 0 || planstate->lossy_pages > 0)
+		if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "Heap Blocks:");
-			if (planstate->exact_pages > 0)
-				appendStringInfo(es->str, " exact=%ld", planstate->exact_pages);
-			if (planstate->lossy_pages > 0)
-				appendStringInfo(es->str, " lossy=%ld", planstate->lossy_pages);
+			if (planstate->stats.exact_pages > 0)
+				appendStringInfo(es->str, " exact=%ld", planstate->stats.exact_pages);
+			if (planstate->stats.lossy_pages > 0)
+				appendStringInfo(es->str, " lossy=%ld", planstate->stats.lossy_pages);
 			appendStringInfoChar(es->str, '\n');
 		}