On Mon, Apr 24, 2023 at 6:13 PM Andres Freund <[email protected]> wrote:
> Hi,
>
> On 2023-04-24 17:37:48 -0400, Melanie Plageman wrote:
> > On Mon, Apr 24, 2023 at 02:14:32PM -0700, Andres Freund wrote:
> > > It starts blocking once "enough" IO is in flight. For things like an
> immediate
> > > checkpoint, that can happen fairly quickly, unless you have a very
> fast IO
> > > subsystem. So often it'll not matter whether we track smgrwriteback(),
> but
> > > when it matter, it can matter a lot.
> >
> > I see. So, it sounds like this is most likely to happen for checkpointer
> > and not likely to happen for other backends who call
> > ScheduleBufferTagForWriteback().
>
> It's more likely, but once the IO subsystem is busy, it'll also happen for
> other users ScheduleBufferTagForWriteback().
>
>
> > Also, it seems like this (given the current code) is only reachable for
> > permanent relations (i.e. not for IO object temp relation). If other
> backend
> > types than checkpointer may call smgrwriteback(), we likely have to
> consider
> > the IO context.
>
> I think we should take it into account - it'd e.g. interesting to see a
> COPY
> is bottlenecked on smgrwriteback() rather than just writing the data.
>
With the quick and dirty attached patch and using your example but with a
pgbench -T200 on my rather fast local NVMe SSD, you can still see quite
a difference.
This is with a stats reset before the checkpoint.
unpatched:
backend_type | object | context | writes | write_time |
fsyncs | fsync_time
---------------------+---------------+-----------+---------+------------+---------+------------
background writer | relation | normal | 443 | 1.408 |
0 | 0
checkpointer | relation | normal | 187804 | 396.829 |
47 | 254.226
patched:
backend_type | object | context | writes | write_time
| fsyncs | fsync_time
---------------------+---------------+-----------+---------+--------------------+--------+------------
background writer | relation | normal | 917 |
4.4670000000000005 | 0 | 0
checkpointer | relation | normal | 375798 |
977.354 | 48 | 202.514
I did compare client backend stats before and after pgbench and it made
basically no difference. I'll do a COPY example like you mentioned.
Patch needs cleanup/comments and a bit more work, but I could do with
a sanity check review on the approach.
- Melanie
From 6c5661ee5f0efbda9d246184f40cd799c21b5d2a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <[email protected]>
Date: Mon, 24 Apr 2023 18:21:54 -0400
Subject: [PATCH v1] Add writeback to pg_stat_io writes.
---
src/backend/postmaster/bgwriter.c | 4 ++--
src/backend/storage/buffer/buf_init.c | 2 +-
src/backend/storage/buffer/bufmgr.c | 12 ++++++++++--
src/include/storage/buf_internals.h | 4 +++-
4 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index caad642ec9..4c3540d6e1 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -129,7 +129,7 @@ BackgroundWriterMain(void)
ALLOCSET_DEFAULT_SIZES);
MemoryContextSwitchTo(bgwriter_context);
- WritebackContextInit(&wb_context, &bgwriter_flush_after);
+ WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &bgwriter_flush_after);
/*
* If an exception is encountered, processing resumes here.
@@ -185,7 +185,7 @@ BackgroundWriterMain(void)
MemoryContextResetAndDeleteChildren(bgwriter_context);
/* re-initialize to avoid repeated errors causing problems */
- WritebackContextInit(&wb_context, &bgwriter_flush_after);
+ WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &bgwriter_flush_after);
/* Now we can allow interrupts again */
RESUME_INTERRUPTS();
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 0057443f0c..1d80532f3b 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -146,7 +146,7 @@ InitBufferPool(void)
StrategyInitialize(!foundDescs);
/* Initialize per-backend file flush context */
- WritebackContextInit(&BackendWritebackContext,
+ WritebackContextInit(&BackendWritebackContext, IOCONTEXT_NORMAL,
&backend_flush_after);
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1fa689052e..774dbadd08 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1685,6 +1685,7 @@ again:
FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
LWLockRelease(content_lock);
+ BackendWritebackContext.io_context = io_context;
ScheduleBufferTagForWriteback(&BackendWritebackContext,
&buf_hdr->tag);
}
@@ -2555,7 +2556,7 @@ BufferSync(int flags)
if (num_to_scan == 0)
return; /* nothing to do */
- WritebackContextInit(&wb_context, &checkpoint_flush_after);
+ WritebackContextInit(&wb_context, IOCONTEXT_NORMAL, &checkpoint_flush_after);
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
@@ -5433,10 +5434,11 @@ ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
* writeback control will be performed.
*/
void
-WritebackContextInit(WritebackContext *context, int *max_pending)
+WritebackContextInit(WritebackContext *context, IOContext io_context, int *max_pending)
{
Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+ context->io_context = io_context;
context->max_pending = max_pending;
context->nr_pending = 0;
}
@@ -5491,6 +5493,7 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
void
IssuePendingWritebacks(WritebackContext *context)
{
+ instr_time io_start;
int i;
if (context->nr_pending == 0)
@@ -5502,6 +5505,8 @@ IssuePendingWritebacks(WritebackContext *context)
*/
sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
+ io_start = pgstat_prepare_io_time();
+
/*
* Coalesce neighbouring writes, but nothing else. For that we iterate
* through the, now sorted, array of pending flushes, and look forward to
@@ -5555,6 +5560,9 @@ IssuePendingWritebacks(WritebackContext *context)
smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
}
+ pgstat_count_io_op_time(IOOBJECT_RELATION, context->io_context, IOOP_WRITE,
+ io_start, context->nr_pending);
+
context->nr_pending = 0;
}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 136cf8fbaf..0d6e93ffad 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -295,6 +295,8 @@ typedef struct PendingWriteback
/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
+ IOContext io_context;
+
/* pointer to the max number of writeback requests to coalesce */
int *max_pending;
@@ -387,7 +389,7 @@ extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
* Internal buffer management routines
*/
/* bufmgr.c */
-extern void WritebackContextInit(WritebackContext *context, int *max_pending);
+extern void WritebackContextInit(WritebackContext *context, IOContext io_context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
--
2.37.2