Hi Michael, thanks for your answer.
Test 041_checkpoint_at_promote.pl is really good example
of using injection points, but anyway, I still don't see
how injection points can help us here. In failed test case
we started postgres, immediately open psql connection and commit
prepared transaction. Postgres, on start, before accepting
connections, fork checkpointer and checkpointer initialize
shared parameter sync_standbys_defined in CheckpointerMain,
before process loop. So, in most attempts, we can't call
injection_points_attach in psql before
macro INJECTION_POINT() code will be executed in
checkpointer(I mean INJECTION_POINT() code instead sleep()),
bacause checkpointer process is forking before database
system is ready to accept connections.
The manual execution of checkpoint can't help us too.
P.S.
sorry for style errors, I am new in postgres, so can miss
some rules. Anyway, to avoid any community rules mistakes,
attached patch with correct naming.
Thanks.
Best regards,
Maksim Melnikov
From 8804867a70c9c1c946683b9b9c455da2d1504aa0 Mon Sep 17 00:00:00 2001
From: Maksim Melnikov <m.melni...@postgrespro.ru>
Date: Wed, 26 Mar 2025 16:44:09 +0300
Subject: [PATCH v4] Fix sync_standbys_defined race on startup.
WalSndCtl->sync_standbys_defined, can be updated
by checkpointer process only and in backends it
can be read. So, on postgres startup, we have race
between checkpointer process and backends, that check
sync_standbys_defined for sync replications.
To avoid this, now we are working with sync_standbys_defined
not as bool field, but as bitmask, where 7th bit check
standbys_defined or not, 8th bit check
WalSndCtlData->sync_standbys_defined field has been
initialized by checkpointer. So if in synchronous replication functions
was detected that sync_standbys_defined is not initialized,
check SyncStandbysDefined() value.
---
src/backend/replication/syncrep.c | 32 +++++++++++++++++----
src/backend/replication/walsender.c | 2 +-
src/include/replication/walsender_private.h | 18 +++++++++++-
3 files changed, 44 insertions(+), 8 deletions(-)
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index d75e3968035..c049e82286a 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -85,6 +85,7 @@
#include "tcop/tcopprot.h"
#include "utils/guc_hooks.h"
#include "utils/ps_status.h"
+#include "utils/injection_point.h"
/* User-settable parameters for sync rep */
char *SyncRepStandbyNames;
@@ -148,6 +149,7 @@ void
SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
{
int mode;
+ bool sync_standbys_defined;
/*
* This should be called while holding interrupts during a transaction
@@ -170,7 +172,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
* it's false, the lock is not necessary because we don't touch the queue.
*/
if (!SyncRepRequested() ||
- !((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
+ IS_SYNCSTANDBYS_INITIALIZED_ONLY(((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined))
return;
/* Cap the level for anything other than commit to remote flush only. */
@@ -184,6 +186,8 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);
+ sync_standbys_defined = WalSndCtl->sync_standbys_defined & SYNCSTANDBYSINITIALIZED
+ ? (WalSndCtl->sync_standbys_defined & SYNCSTANDBYSDEFINED) : SyncStandbysDefined();
/*
* We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
@@ -193,7 +197,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
* condition but we'll be fetching that cache line anyway so it's likely
* to be a low cost check.
*/
- if (!WalSndCtl->sync_standbys_defined ||
+ if (!sync_standbys_defined ||
lsn <= WalSndCtl->lsn[mode])
{
LWLockRelease(SyncRepLock);
@@ -920,9 +924,10 @@ SyncRepWakeQueue(bool all, int mode)
void
SyncRepUpdateSyncStandbysDefined(void)
{
- bool sync_standbys_defined = SyncStandbysDefined();
+ bool new_sync_standbys_defined = SyncStandbysDefined();
+ bool old_sync_standbys_defined = (WalSndCtl->sync_standbys_defined & SYNCSTANDBYSDEFINED);
- if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
+ if (new_sync_standbys_defined != old_sync_standbys_defined)
{
LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
@@ -931,7 +936,7 @@ SyncRepUpdateSyncStandbysDefined(void)
* for backends to continue waiting. Since the user no longer wants
* synchronous replication, we'd better wake them up.
*/
- if (!sync_standbys_defined)
+ if (!new_sync_standbys_defined)
{
int i;
@@ -946,8 +951,23 @@ SyncRepUpdateSyncStandbysDefined(void)
* backend that hasn't yet reloaded its config might go to sleep on
* the queue (and never wake up). This prevents that.
*/
- WalSndCtl->sync_standbys_defined = sync_standbys_defined;
+ WalSndCtl->sync_standbys_defined = new_sync_standbys_defined
+ ? SET_SYNCSTANDBYS_DEFINED(WalSndCtl->sync_standbys_defined)
+ : RESET_SYNCSTANDBYS_DEFINED(WalSndCtl->sync_standbys_defined);
+
+ LWLockRelease(SyncRepLock);
+ }
+ else if (!(WalSndCtl->sync_standbys_defined & SYNCSTANDBYSINITIALIZED))
+ {
+ /*
+ * Even if new_sync_standbys_defined == old_sync_standbys_defined, we
+ * should check WalSndCtl->sync_standbys_defined is initialized and
+ * set its initialize bit, if it reset. Anyway this bit should be set
+ * only on start and then it will be always setted.
+ */
+ LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
+ WalSndCtl->sync_standbys_defined = SET_SYNCSTANDBYS_INITIALIZED(WalSndCtl->sync_standbys_defined);
LWLockRelease(SyncRepLock);
}
}
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 1028919aecb..c181c7ee977 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1676,7 +1676,7 @@ WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId
*/
if (skipped_xact &&
SyncRepRequested() &&
- ((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
+ (((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined & SYNCSTANDBYSDEFINED))
{
WalSndKeepalive(false, lsn);
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 0fc77f1b4af..dd931b81e4f 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -100,7 +100,7 @@ typedef struct
* config file safely, so checkpointer updates this value as needed.
* Protected by SyncRepLock.
*/
- bool sync_standbys_defined;
+ uint8 sync_standbys_defined;
/* used as a registry of physical / logical walsenders to wake */
ConditionVariable wal_flush_cv;
@@ -116,6 +116,22 @@ typedef struct
WalSnd walsnds[FLEXIBLE_ARRAY_MEMBER];
} WalSndCtlData;
+/*
+ * WalSndCtlData->sync_standbys_defined bit flags. 7th bit check standbys_defined or not,
+ * 8th bit check WalSndCtlData->sync_standbys_defined field has been initialized.
+ */
+#define SYNCSTANDBYSINITIALIZED (1 << 0)
+#define SYNCSTANDBYSDEFINED (1 << 1)
+
+#define SET_SYNCSTANDBYS_INITIALIZED(sync_standbys) \
+ (sync_standbys | SYNCSTANDBYSINITIALIZED)
+#define SET_SYNCSTANDBYS_DEFINED(sync_standbys) \
+ (sync_standbys | SYNCSTANDBYSINITIALIZED | SYNCSTANDBYSDEFINED)
+#define RESET_SYNCSTANDBYS_DEFINED(sync_standbys) \
+ (SET_SYNCSTANDBYS_INITIALIZED(sync_standbys) & (~SYNCSTANDBYSDEFINED))
+#define IS_SYNCSTANDBYS_INITIALIZED_ONLY(sync_standbys) \
+ ((sync_standbys & (SYNCSTANDBYSINITIALIZED | SYNCSTANDBYSDEFINED)) == SYNCSTANDBYSINITIALIZED)
+
extern PGDLLIMPORT WalSndCtlData *WalSndCtl;
--
2.43.0