Hi,
After an off-list discussion with Fujii-san, I'm now trying to modify
the following message, which is output when a client attempts to connect,
instead of changing the log level as in the original proposal:
$ psql: error: connection to server at "localhost" (::1), port 5433
failed: FATAL: the database system is not yet accepting connections
DETAIL: Consistent recovery state has not been yet reached.
I now have two candidate patches for this.
The 1st one
(v1-0001-Change-log-message-when-hot-standby-is-not-access.patch) is
a simple update to the existing log messages, explicitly mentioning that
snapshot overflow is a possible cause.
The 2nd one
(v1-0001-Make-it-clear-when-hot-standby-is-inaccessible-du.patch)
introduces new states for pmState and CAC_state (which manages
whether connections can be accepted) to represent waiting for a
non-overflowed snapshot.
The advantage of the 2nd one is that it makes it clear whether the
connection failure is due to not reaching a consistent recovery state or
a snapshot overflow. However, I haven't found other significant
benefits, and I feel it might be overkill.
Personally, I feel the 1st patch may be sufficient, but I would appreciate
any feedback.
--
Atsushi Torikoshi
Seconded from NTT DATA GROUP CORPORATION to SRA OSS K.K.
From 38a9ec23af2dc43ad24d939bb015d28d550d71fd Mon Sep 17 00:00:00 2001
From: Atsushi Torikoshi <toriko...@sraoss.co.jp>
Date: Wed, 12 Mar 2025 21:47:22 +0900
Subject: [PATCH v1] Make it clear when hot standby is inaccessible due to
subtransaction overflow
Previously, the log message only assumed that the recovery process had
not yet reached a consistent point. However, even after reaching the
consistent point, if there is a transaction with an overflowed
subtransaction, hot standby becomes inaccessible.
Since there was no log message indicating this reason, it was
difficult to identify the cause.
This patch explicitly handles such cases, making the cause clearer in
the logs.
---
src/backend/postmaster/postmaster.c | 29 ++++++++++++++++++++++-------
src/backend/storage/ipc/procarray.c | 17 +++++++++++++++++
src/backend/tcop/backend_startup.c | 13 +++++++++++++
src/include/storage/pmsignal.h | 2 ++
src/include/tcop/backend_startup.h | 1 +
5 files changed, 55 insertions(+), 7 deletions(-)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index d2a7a7add6..5c3de3f97d 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -333,6 +333,8 @@ typedef enum
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
PM_RECOVERY, /* in archive recovery mode */
+ PM_SNAPSHOT_PENDING, /* in snapshot pending because of an
+ * overflowed subtransaction */
PM_HOT_STANDBY, /* in hot standby mode */
PM_RUN, /* normal "database is alive" state */
PM_STOP_BACKENDS, /* need to stop remaining backends */
@@ -1814,6 +1816,9 @@ canAcceptConnections(BackendType backend_type)
else if (!FatalError && pmState == PM_RECOVERY)
return CAC_NOTCONSISTENT; /* not yet at consistent recovery
* state */
+ else if (!FatalError && pmState == PM_SNAPSHOT_PENDING)
+ return CAC_SNAPSHOT_PENDING; /* waiting for non-overflowed
+ * snapshot */
else
return CAC_RECOVERY; /* else must be crash recovery */
}
@@ -2111,7 +2116,7 @@ process_pm_shutdown_request(void)
*/
if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
connsAllowed = false;
- else if (pmState == PM_STARTUP || pmState == PM_RECOVERY)
+ else if (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING)
{
/* There should be no clients, so proceed to stop children */
UpdatePMState(PM_STOP_BACKENDS);
@@ -2145,7 +2150,7 @@ process_pm_shutdown_request(void)
sd_notify(0, "STOPPING=1");
#endif
- if (pmState == PM_STARTUP || pmState == PM_RECOVERY)
+ if (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING)
{
/* Just shut down background processes silently */
UpdatePMState(PM_STOP_BACKENDS);
@@ -2711,6 +2716,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
/* wait for children to die */
case PM_RECOVERY:
+ case PM_SNAPSHOT_PENDING:
case PM_HOT_STANDBY:
case PM_RUN:
case PM_STOP_BACKENDS:
@@ -3193,6 +3199,7 @@ pmstate_name(PMState state)
PM_TOSTR_CASE(PM_INIT);
PM_TOSTR_CASE(PM_STARTUP);
PM_TOSTR_CASE(PM_RECOVERY);
+ PM_TOSTR_CASE(PM_SNAPSHOT_PENDING);
PM_TOSTR_CASE(PM_HOT_STANDBY);
PM_TOSTR_CASE(PM_RUN);
PM_TOSTR_CASE(PM_STOP_BACKENDS);
@@ -3245,7 +3252,7 @@ LaunchMissingBackgroundProcesses(void)
* the shutdown checkpoint. That's done in PostmasterStateMachine(), not
* here.)
*/
- if (pmState == PM_RUN || pmState == PM_RECOVERY ||
+ if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING ||
pmState == PM_HOT_STANDBY || pmState == PM_STARTUP)
{
if (CheckpointerPMChild == NULL)
@@ -3281,7 +3288,7 @@ LaunchMissingBackgroundProcesses(void)
*/
if (PgArchPMChild == NULL &&
((XLogArchivingActive() && pmState == PM_RUN) ||
- (XLogArchivingAlways() && (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) &&
+ (XLogArchivingAlways() && (pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING || pmState == PM_HOT_STANDBY))) &&
PgArchCanRestart())
PgArchPMChild = StartChildProcess(B_ARCHIVER);
@@ -3313,7 +3320,7 @@ LaunchMissingBackgroundProcesses(void)
if (WalReceiverRequested)
{
if (WalReceiverPMChild == NULL &&
- (pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING ||
pmState == PM_HOT_STANDBY) &&
Shutdown <= SmartShutdown)
{
@@ -3663,8 +3670,15 @@ process_pm_pmsignal(void)
UpdatePMState(PM_RECOVERY);
}
- if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
+ if (CheckPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING) &&
pmState == PM_RECOVERY && Shutdown == NoShutdown)
+ {
+ UpdatePMState(PM_SNAPSHOT_PENDING);
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
+ (pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING) &&
+ Shutdown == NoShutdown)
{
ereport(LOG,
(errmsg("database system is ready to accept read-only connections")));
@@ -3806,7 +3820,7 @@ process_pm_pmsignal(void)
}
if (StartupPMChild != NULL &&
- (pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_SNAPSHOT_PENDING ||
pmState == PM_HOT_STANDBY) &&
CheckPromoteSignal())
{
@@ -4130,6 +4144,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
/* fall through */
case PM_RECOVERY:
+ case PM_SNAPSHOT_PENDING:
case PM_STARTUP:
case PM_INIT:
if (start_time == BgWorkerStart_PostmasterStart)
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 2e54c11f88..bb37ad2fc2 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -58,6 +58,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
+#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/acl.h"
@@ -1125,11 +1126,19 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
"recovery snapshots are now enabled");
}
else
+ {
+ /*
+ * Inform postmaster that we are waiting for a non-overflowed
+ * snapshot, so it can notify clients why the connection is
+ * not yet acceptable.
+ */
+ SendPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING);
elog(DEBUG1,
"recovery snapshot waiting for non-overflowed snapshot or "
"until oldest active xid on standby is at least %u (now %u)",
standbySnapshotPendingXmin,
running->oldestRunningXid);
+ }
return;
}
}
@@ -1303,11 +1312,19 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
if (standbyState == STANDBY_SNAPSHOT_READY)
elog(DEBUG1, "recovery snapshots are now enabled");
else
+ {
+ /*
+ * Inform postmaster that we are waiting for a non-overflowed
+ * snapshot, so it can notify clients why the connection is not yet
+ * acceptable.
+ */
+ SendPostmasterSignal(PMSIGNAL_SNAPSHOT_PENDING);
elog(DEBUG1,
"recovery snapshot waiting for non-overflowed snapshot or "
"until oldest active xid on standby is at least %u (now %u)",
standbySnapshotPendingXmin,
running->oldestRunningXid);
+ }
}
/*
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c
index c70746fa56..17e9708136 100644
--- a/src/backend/tcop/backend_startup.c
+++ b/src/backend/tcop/backend_startup.c
@@ -303,6 +303,19 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac)
errmsg("the database system is not accepting connections"),
errdetail("Hot standby mode is disabled.")));
break;
+ case CAC_SNAPSHOT_PENDING:
+ if (EnableHotStandby)
+ ereport(FATAL,
+ (errcode(ERRCODE_CANNOT_CONNECT_NOW),
+ errmsg("the database system is not yet accepting connections"),
+ errdetail("Snapshot is pending because subtransaction is overflowed."),
+ errhint("Find and close a transaction with more than %d subtransactions", PGPROC_MAX_CACHED_SUBXIDS)));
+ else
+ ereport(FATAL,
+ (errcode(ERRCODE_CANNOT_CONNECT_NOW),
+ errmsg("the database system is not accepting connections"),
+ errdetail("Hot standby mode is disabled.")));
+ break;
case CAC_SHUTDOWN:
ereport(FATAL,
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index d84a383047..a67813a15b 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -33,6 +33,8 @@
typedef enum
{
PMSIGNAL_RECOVERY_STARTED, /* recovery has started */
+ PMSIGNAL_SNAPSHOT_PENDING, /* snapshot is pending because of an
+ * overflowed subtransaction */
PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */
diff --git a/src/include/tcop/backend_startup.h b/src/include/tcop/backend_startup.h
index 7328561120..866a3b7cd2 100644
--- a/src/include/tcop/backend_startup.h
+++ b/src/include/tcop/backend_startup.h
@@ -30,6 +30,7 @@ typedef enum CAC_state
CAC_SHUTDOWN,
CAC_RECOVERY,
CAC_NOTCONSISTENT,
+ CAC_SNAPSHOT_PENDING,
CAC_TOOMANY,
} CAC_state;
--
2.43.0
From 96c95cbf855419c909b0f79b79be47c2220d2c51 Mon Sep 17 00:00:00 2001
From: Atsushi Torikoshi <toriko...@sraoss.co.jp>
Date: Wed, 12 Mar 2025 21:45:43 +0900
Subject: [PATCH v1] Change log message when hot standby is not accessible
Previously, the log message only assumed that the recovery process had not yet
reached a consistent point. However, even when we have reached the consistent
point, if there is a transaction whose subtransactions have overflowed, the hot
standby is not accessible, and it was difficult to identify the cause since
no log message indicated the reason.
This change improves clarity by explicitly mentioning that case.
---
src/backend/postmaster/postmaster.c | 6 ++++--
src/backend/tcop/backend_startup.c | 5 +++--
src/include/tcop/backend_startup.h | 2 +-
3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index d2a7a7add6..aafd238a7c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1812,8 +1812,10 @@ canAcceptConnections(BackendType backend_type)
else if (!FatalError && pmState == PM_STARTUP)
return CAC_STARTUP; /* normal startup */
else if (!FatalError && pmState == PM_RECOVERY)
- return CAC_NOTCONSISTENT; /* not yet at consistent recovery
- * state */
+ return CAC_NOTCONSISTENT_OR_OVERFLOWED; /* not yet at consistent
+ * recovery state or
+ * subtransaction is
+ * overflowed */
else
return CAC_RECOVERY; /* else must be crash recovery */
}
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c
index c70746fa56..46b1709e67 100644
--- a/src/backend/tcop/backend_startup.c
+++ b/src/backend/tcop/backend_startup.c
@@ -291,12 +291,13 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac)
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the database system is starting up")));
break;
- case CAC_NOTCONSISTENT:
+ case CAC_NOTCONSISTENT_OR_OVERFLOWED:
if (EnableHotStandby)
ereport(FATAL,
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the database system is not yet accepting connections"),
- errdetail("Consistent recovery state has not been yet reached.")));
+ errdetail("Consistent recovery state has not been yet reached, or snappshot is pending because subtransaction is overflowed."),
+ errhint("In the latter case, find and close the transaction with more than %d subtransactions", PGPROC_MAX_CACHED_SUBXIDS)));
else
ereport(FATAL,
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
diff --git a/src/include/tcop/backend_startup.h b/src/include/tcop/backend_startup.h
index 7328561120..6580efcfbd 100644
--- a/src/include/tcop/backend_startup.h
+++ b/src/include/tcop/backend_startup.h
@@ -29,7 +29,7 @@ typedef enum CAC_state
CAC_STARTUP,
CAC_SHUTDOWN,
CAC_RECOVERY,
- CAC_NOTCONSISTENT,
+ CAC_NOTCONSISTENT_OR_OVERFLOWED,
CAC_TOOMANY,
} CAC_state;
--
2.43.0