Good day,
27.03.2026 12:04, Yura Sokolov wrote:
> 26.03.2026 23:29, Andres Freund wrote:
>> Hi,
>>
>> On 2026-03-25 17:51:30 +0300, Yura Sokolov wrote:
>>> UnlockBufHdrExt does:
>>>
>>> buf_state |= set_bits;
>>> buf_state &= ~unset_bits;
>>> buf_state &= ~BM_LOCKED;
>>>
>>> TerminateBufferIO unconditionally does:
>>>
>>> unset_flag_bits |= BM_IO_ERROR;
>>>
>>> Due to this, AbortBufferIO and buffer_readv_complete_one are failed
>>> to set BM_IO_ERROR with call to TerminateBufferIO.
>>>
>>> It was found with proprietary code that was triggered on
>>> PGAIO_RS_ERROR and made assertion on BM_IO_ERROR presence.
>>
>> That's clearly not right. Care to write a patch? I think we should add a
>> test for this in src/test/modules/test_aio too. As we don't rely on things
>> like BM_IO_ERROR this is quite easy to not notice.
> I thought, fix is too small to go the way of patches.
> I don't mind if you just push it.
>
> But if I can save your time on writing test, I'll try.
>
> ...
>
> I believe, proper way is to add assertion in UnlockBufHdrExt:
>
> Assert(!(set_bits & unset_bits));
>
> And make TerminateBufferIO to exclude BM_IO_ERROR if it is among
> set_flag_bits.
>
> Is it ok?
Here patchset is.
I've tried to modify 001_aio.pl to test presence BM_IO_ERROR after read
failure.
I've also added FlushBuffer failure test in second patch (v1-002) using
injection point. I don't know if 001-aio.pl is a good place for since write
is not async yet.
v1-003 just mades DebugPrintBufferRefcount prettier.
--
regards
Yura Sokolov aka funny-falcon
From ef62af742c30f7c3e0c0d0dad0504d18a13596f9 Mon Sep 17 00:00:00 2001
From: Yura Sokolov <[email protected]>
Date: Fri, 27 Mar 2026 20:11:06 +0300
Subject: [PATCH v1 1/3] bufmgr: Fix possibility to set BM_IO_ERROR
Previously it couldn't be set because TerminateBufferIO added BM_IO_ERROR
to unset_flag_bits unconditionally, and UnlockBufHdrExt applied unset_bits
after set_bits.
Fix by not setting BM_IO_ERROR into unset_flag_bits if it is present in
set_flag_bits.
Also protect from possible similar errors by adding assertion to
UnlockBufHdrExt unset_bits and set_bits have no bits in common.
Modify src/test/modules/test_aio/t/001_aio.pl test_io_error to check
presence of BM_IO_ERROR.
---
src/backend/storage/buffer/bufmgr.c | 4 +--
src/include/storage/buf_internals.h | 1 +
src/test/modules/test_aio/t/001_aio.pl | 20 ++++++++++++++
src/test/modules/test_aio/test_aio--1.0.sql | 4 +++
src/test/modules/test_aio/test_aio.c | 30 +++++++++++++++++++++
5 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e212f6110f2..16b043cd681 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -7172,8 +7172,8 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
Assert(buf_state & BM_IO_IN_PROGRESS);
unset_flag_bits |= BM_IO_IN_PROGRESS;
- /* Clear earlier errors, if this IO failed, it'll be marked again */
- unset_flag_bits |= BM_IO_ERROR;
+ /* Clear earlier errors, unless this IO failed as well */
+ unset_flag_bits |= BM_IO_ERROR & ~set_flag_bits;
if (clear_dirty)
unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 8d1e16b5d51..14a36cfe624 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -475,6 +475,7 @@ UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state,
uint64 set_bits, uint64 unset_bits,
int refcount_change)
{
+ Assert(!(set_bits & unset_bits));
for (;;)
{
uint64 buf_state = old_buf_state;
diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl
index e18b2a2b8ae..c352dab5a41 100644
--- a/src/test/modules/test_aio/t/001_aio.pl
+++ b/src/test/modules/test_aio/t/001_aio.pl
@@ -328,6 +328,8 @@ SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
$tblname eq 'tbl_corr'
? qr/invalid page in block 1 of relation "base\/\d+\/\d+/
: qr/invalid page in block 1 of relation "base\/\d+\/t\d+_\d+/;
+ # BM_IO_ERROR and BM_TAG_VALID should be set
+ my $debug_print_re = qr/blockNum=1, flags=0x.?a000000, refcount=0/;
# verify the error is reported in custom C code
psql_like(
@@ -338,6 +340,12 @@ SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
qr/^$/,
$invalid_page_re);
+ psql_like(
+ $io_method, $psql,
+ "validate flags of $tblname page after read_rel_block_ll()",
+ qq(SELECT debug_print_rel_block('$tblname', 1)),
+ $debug_print_re, qr/^$/);
+
# verify the error is reported for bufmgr reads, seq scan
psql_like(
$io_method, $psql,
@@ -345,6 +353,12 @@ SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
qq(SELECT count(*) FROM $tblname),
qr/^$/, $invalid_page_re);
+ psql_like(
+ $io_method, $psql,
+ "validate flags of $tblname page after read_rel_block_ll()",
+ qq(SELECT debug_print_rel_block('$tblname', 1)),
+ $debug_print_re, qr/^$/);
+
# verify the error is reported for bufmgr reads, tid scan
psql_like(
$io_method,
@@ -353,6 +367,12 @@ SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'),
qr/^$/,
$invalid_page_re);
+
+ psql_like(
+ $io_method, $psql,
+ "validate flags of $tblname page after read_rel_block_ll()",
+ qq(SELECT debug_print_rel_block('$tblname', 1)),
+ $debug_print_re, qr/^$/);
}
$psql->quit();
diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql
index e495481c41e..478593907c8 100644
--- a/src/test/modules/test_aio/test_aio--1.0.sql
+++ b/src/test/modules/test_aio/test_aio--1.0.sql
@@ -33,6 +33,10 @@ CREATE FUNCTION read_rel_block_ll(
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;
+CREATE FUNCTION debug_print_rel_block(rel regclass, blockno int)
+RETURNS pg_catalog.text STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int)
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c
index b1aa8af9ec0..f616653ed02 100644
--- a/src/test/modules/test_aio/test_aio.c
+++ b/src/test/modules/test_aio/test_aio.c
@@ -511,6 +511,36 @@ invalidate_rel_block(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+PG_FUNCTION_INFO_V1(debug_print_rel_block);
+Datum
+debug_print_rel_block(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ BlockNumber blkno = PG_GETARG_UINT32(1);
+ Relation rel;
+ PrefetchBufferResult pr;
+ Buffer buf;
+ char *desc = NULL;
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ /*
+ * This is a gross hack, but there's no other API exposed that allows to
+ * get a buffer ID without actually reading the block in.
+ */
+ pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
+ buf = pr.recent_buffer;
+
+ if (BufferIsValid(buf))
+ desc = DebugPrintBufferRefcount(buf);
+
+ relation_close(rel, AccessExclusiveLock);
+
+ if (desc == NULL)
+ PG_RETURN_NULL();
+ PG_RETURN_TEXT_P(cstring_to_text(desc));
+}
+
PG_FUNCTION_INFO_V1(buffer_create_toy);
Datum
buffer_create_toy(PG_FUNCTION_ARGS)
--
2.51.0
From 7ca65ea99b000558a770fbba8863ac75c7f4921d Mon Sep 17 00:00:00 2001
From: Yura Sokolov <[email protected]>
Date: Fri, 27 Mar 2026 20:55:24 +0300
Subject: [PATCH v1 2/3] bufmgr: Add test for BM_IO_ERROR presence after write
error in FlushBuffer
smgrwrite should raise error on mdwrite error. Lets immitate it with
injection point.
Check BM_IO_ERROR is set and second attempt leads to "Multiple failures"
warning.
---
src/backend/storage/buffer/bufmgr.c | 2 +
src/test/modules/test_aio/t/001_aio.pl | 27 +++++++++++
src/test/modules/test_aio/test_aio--1.0.sql | 11 ++++-
src/test/modules/test_aio/test_aio.c | 52 +++++++++++++++++++++
4 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 16b043cd681..7d0ae0e4593 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -64,6 +64,7 @@
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
+#include "utils/injection_point.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
@@ -4521,6 +4522,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
bufToWrite,
false);
+ INJECTION_POINT("flush-buffer-after-smgr-write", buf);
/*
* When a strategy is in use, only flushes of dirty buffers already in the
* strategy ring are counted as strategy writes (IOCONTEXT
diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl
index c352dab5a41..680f13e43ec 100644
--- a/src/test/modules/test_aio/t/001_aio.pl
+++ b/src/test/modules/test_aio/t/001_aio.pl
@@ -800,6 +800,33 @@ SELECT invalidate_rel_block('tbl_ok', 2);
);
$psql->query_safe(qq(SELECT inj_io_short_read_detach()));
+ # Now test write error
+ $psql->query_safe("SELECT count(*) FROM tbl_ok");
+ $psql->query_safe("SELECT inj_io_error_flush_buffer_attach();");
+ psql_like(
+ $io_method, $psql,
+ "flush error is reported",
+ qq(SELECT invalidate_rel_block('tbl_ok', 2, force_flush=>true)),
+ qr/^$/,
+ qr/ERROR:.*injection point triggering failure to flush buffer/
+ );
+ # BM_IO_ERROR, BM_VALID and BM_TAG_VALID should be set
+ psql_like(
+ $io_method, $psql,
+ "validate flags of 'tbl_ok' page after read_rel_block_ll()",
+ qq(SELECT debug_print_rel_block('tbl_ok', 2)),
+ qr/blockNum=2, flags=0x.?b.00000, refcount=0/, qr/^$/);
+ # Validate second attempt leads to "Multiple failures" message
+ psql_like(
+ $io_method, $psql,
+ "flush error is reported",
+ qq(SELECT invalidate_rel_block('tbl_ok', 2, force_flush=>true)),
+ qr/^$/,
+ qr/Multiple failures --- write error might be permanent/
+ );
+
+ $psql->query_safe(qq(SELECT inj_io_error_flush_buffer_detach()));
+
$psql->quit();
}
diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql
index 478593907c8..caa210b0fd7 100644
--- a/src/test/modules/test_aio/test_aio--1.0.sql
+++ b/src/test/modules/test_aio/test_aio--1.0.sql
@@ -37,7 +37,8 @@ CREATE FUNCTION debug_print_rel_block(rel regclass, blockno int)
RETURNS pg_catalog.text STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;
-CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int)
+CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int,
+ force_flush bool DEFAULT false)
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;
@@ -110,3 +111,11 @@ AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE FUNCTION inj_io_reopen_detach()
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION inj_io_error_flush_buffer_attach()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION inj_io_error_flush_buffer_detach()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c
index f616653ed02..30a9833869b 100644
--- a/src/test/modules/test_aio/test_aio.c
+++ b/src/test/modules/test_aio/test_aio.c
@@ -39,6 +39,7 @@ typedef struct InjIoErrorState
{
bool enabled_short_read;
bool enabled_reopen;
+ bool enabled_error_flush_buffer;
bool short_read_result_set;
int short_read_result;
@@ -100,6 +101,12 @@ test_aio_shmem_startup(void)
0);
InjectionPointLoad("aio-worker-after-reopen");
+ InjectionPointAttach("flush-buffer-after-smgr-write",
+ "test_aio",
+ "inj_io_error_flush_buffer",
+ NULL,
+ 0);
+ InjectionPointLoad("flush-buffer-after-smgrwrite");
#endif
}
else
@@ -111,6 +118,7 @@ test_aio_shmem_startup(void)
#ifdef USE_INJECTION_POINTS
InjectionPointLoad("aio-process-completion-before-shared");
InjectionPointLoad("aio-worker-after-reopen");
+ InjectionPointLoad("flush-buffer-after-smgr-write");
elog(LOG, "injection point loaded");
#endif
}
@@ -464,6 +472,7 @@ invalidate_rel_block(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
BlockNumber blkno = PG_GETARG_UINT32(1);
+ bool force_flush = PG_GETARG_BOOL(2);
Relation rel;
PrefetchBufferResult pr;
Buffer buf;
@@ -489,6 +498,9 @@ invalidate_rel_block(PG_FUNCTION_ARGS)
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ if (force_flush)
+ MarkBufferDirty(buf);
+
if (pg_atomic_read_u64(&buf_hdr->state) & BM_DIRTY)
{
if (BufferIsLocal(buf))
@@ -716,6 +728,9 @@ extern PGDLLEXPORT void inj_io_short_read(const char *name,
extern PGDLLEXPORT void inj_io_reopen(const char *name,
const void *private_data,
void *arg);
+extern PGDLLEXPORT void inj_io_error_flush_buffer(const char *name,
+ const void *private_data,
+ void *arg);
void
inj_io_short_read(const char *name, const void *private_data, void *arg)
@@ -790,6 +805,18 @@ inj_io_reopen(const char *name, const void *private_data, void *arg)
if (inj_io_error_state->enabled_reopen)
elog(ERROR, "injection point triggering failure to reopen ");
}
+
+void
+inj_io_error_flush_buffer(const char *name, const void *private_data, void *arg)
+{
+ ereport(LOG,
+ errmsg("error_flush_buffer injection point called, is enabled: %d",
+ inj_io_error_state->enabled_error_flush_buffer),
+ errhidestmt(true), errhidecontext(true));
+
+ if (inj_io_error_state->enabled_error_flush_buffer)
+ elog(ERROR, "injection point triggering failure to flush buffer ");
+}
#endif
PG_FUNCTION_INFO_V1(inj_io_short_read_attach);
@@ -844,3 +871,28 @@ inj_io_reopen_detach(PG_FUNCTION_ARGS)
#endif
PG_RETURN_VOID();
}
+
+PG_FUNCTION_INFO_V1(inj_io_error_flush_buffer_attach);
+Datum
+inj_io_error_flush_buffer_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_error_flush_buffer = true;
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_error_flush_buffer_detach);
+Datum
+inj_io_error_flush_buffer_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_error_flush_buffer = false;
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+ PG_RETURN_VOID();
+}
--
2.51.0
From 915c9fe80cfe2b6ba5bc42e0397471ee2034c1e1 Mon Sep 17 00:00:00 2001
From: Yura Sokolov <[email protected]>
Date: Fri, 27 Mar 2026 21:19:41 +0300
Subject: [PATCH v1 3/3] bufmgr: print flag names in DebugPrintBufferRefcount
---
src/backend/storage/buffer/bufmgr.c | 42 +++++++++++++++++++++-----
src/test/modules/test_aio/t/001_aio.pl | 6 ++--
2 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7d0ae0e4593..33f4a8f541c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -48,6 +48,7 @@
#include "common/hashfn.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
+#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
@@ -4324,9 +4325,10 @@ DebugPrintBufferRefcount(Buffer buffer)
{
BufferDesc *buf;
int32 loccount;
- char *result;
ProcNumber backend;
uint64 buf_state;
+ StringInfoData str;
+ char flag_prefix = '=';
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
@@ -4342,16 +4344,39 @@ DebugPrintBufferRefcount(Buffer buffer)
backend = INVALID_PROC_NUMBER;
}
+ initStringInfoExt(&str, 256);
+
/* theoretically we should lock the bufHdr here */
buf_state = pg_atomic_read_u64(&buf->state);
- result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
- buffer,
- relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
- BufTagGetForkNum(&buf->tag)).str,
- buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
- BUF_STATE_GET_REFCOUNT(buf_state), loccount);
- return result;
+ appendStringInfo(&str, "[%03d] (rel=%s, blockNum=%u, flags",
+ buffer,
+ relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
+ BufTagGetForkNum(&buf->tag)).str,
+ buf->tag.blockNum);
+#define appendFlag(flag) do { \
+ if (buf_state & flag) { \
+ appendStringInfoChar(&str, flag_prefix); \
+ appendStringInfoString(&str, #flag); \
+ flag_prefix = '|'; \
+ } \
+} while(0)
+ appendFlag(BM_LOCKED);
+ appendFlag(BM_DIRTY);
+ appendFlag(BM_VALID);
+ appendFlag(BM_TAG_VALID);
+ appendFlag(BM_IO_IN_PROGRESS);
+ appendFlag(BM_IO_ERROR);
+ appendFlag(BM_PIN_COUNT_WAITER);
+ appendFlag(BM_CHECKPOINT_NEEDED);
+ appendFlag(BM_PERMANENT);
+ appendFlag(BM_LOCK_HAS_WAITERS);
+ appendFlag(BM_LOCK_WAKE_IN_PROGRESS);
+#undef appendFlag
+
+ appendStringInfo(&str, ", refcount=%u %d)",
+ BUF_STATE_GET_REFCOUNT(buf_state), loccount);
+ return str.data;
}
/*
@@ -4523,6 +4548,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
false);
INJECTION_POINT("flush-buffer-after-smgr-write", buf);
+
/*
* When a strategy is in use, only flushes of dirty buffers already in the
* strategy ring are counted as strategy writes (IOCONTEXT
diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl
index 680f13e43ec..babab7cdf2f 100644
--- a/src/test/modules/test_aio/t/001_aio.pl
+++ b/src/test/modules/test_aio/t/001_aio.pl
@@ -328,8 +328,7 @@ SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
$tblname eq 'tbl_corr'
? qr/invalid page in block 1 of relation "base\/\d+\/\d+/
: qr/invalid page in block 1 of relation "base\/\d+\/t\d+_\d+/;
- # BM_IO_ERROR and BM_TAG_VALID should be set
- my $debug_print_re = qr/blockNum=1, flags=0x.?a000000, refcount=0/;
+ my $debug_print_re = qr/blockNum=1, flags=BM_TAG_VALID\|BM_IO_ERROR\S*, refcount=0/;
# verify the error is reported in custom C code
psql_like(
@@ -810,12 +809,11 @@ SELECT invalidate_rel_block('tbl_ok', 2);
qr/^$/,
qr/ERROR:.*injection point triggering failure to flush buffer/
);
- # BM_IO_ERROR, BM_VALID and BM_TAG_VALID should be set
psql_like(
$io_method, $psql,
"validate flags of 'tbl_ok' page after read_rel_block_ll()",
qq(SELECT debug_print_rel_block('tbl_ok', 2)),
- qr/blockNum=2, flags=0x.?b.00000, refcount=0/, qr/^$/);
+ qr/blockNum=2, flags=BM_DIRTY\|BM_VALID\|BM_TAG_VALID\|BM_IO_ERROR\S*, refcount=0/, qr/^$/);
# Validate second attempt leads to "Multiple failures" message
psql_like(
$io_method, $psql,
--
2.51.0