Hello. We've noticed that when walreceiver is waiting for a connection to complete, standby does not immediately respond to promotion requests. In PG14, upon receiving a promotion request, walreceiver terminates instantly, but in PG16, it waits for connection timeout. This behavior is attributed to commit 728f86fec65, where a part of libpqrcv_connect was simply replaced with a call to libpqsrc_connect_params. This behavior can be verified by simply dropping packets from the standby to the primary.
By a simple thought, in walreceiver, libpqsrv_connect_internal could just call ProcessWalRcvInterrupts() instead of CHECK_FOR_INTERRUPTS(), but this approach is quite ugly. Since ProcessWalRcvInterrupts() originally calls CHECK_FOR_INTERRUPTS() and there are no standalone calls to CHECK_FOR_INTERRUPTS() within walreceiver, I think it might be better to use ProcDiePending instead of ShutdownRequestPending. I added a subset function of die() as the SIGTERM handler in walsender in a crude patch attached. What do you think about the issue, and the approach? If there are no issues or objections with this method, I will continue to refine this patch. For now, I plan to register it for the upcoming commitfest. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 693b3669ba..e503799bd8 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -738,7 +738,7 @@ libpqrcv_PQgetResult(PGconn *streamConn) if (rc & WL_LATCH_SET) { ResetLatch(MyLatch); - ProcessWalRcvInterrupts(); + CHECK_FOR_INTERRUPTS(); } /* Consume whatever data is available from the socket */ @@ -1042,7 +1042,7 @@ libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, { char *cstrs[MaxTupleAttributeNumber]; - ProcessWalRcvInterrupts(); + CHECK_FOR_INTERRUPTS(); /* Do the allocations in temporary context. */ oldcontext = MemoryContextSwitchTo(rowcontext); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 26ded928a7..c53a8e6c89 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -147,39 +147,34 @@ static void XLogWalRcvSendReply(bool force, bool requestReply); static void XLogWalRcvSendHSFeedback(bool immed); static void ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime); static void WalRcvComputeNextWakeup(WalRcvWakeupReason reason, TimestampTz now); +static void WalRcvShutdownSignalHandler(SIGNAL_ARGS); -/* - * Process any interrupts the walreceiver process may have received. - * This should be called any time the process's latch has become set. - * - * Currently, only SIGTERM is of interest. We can't just exit(1) within the - * SIGTERM signal handler, because the signal might arrive in the middle of - * some critical operation, like while we're holding a spinlock. Instead, the - * signal handler sets a flag variable as well as setting the process's latch. - * We must check the flag (by calling ProcessWalRcvInterrupts) anytime the - * latch has become set. Operations that could block for a long time, such as - * reading from a remote server, must pay attention to the latch too; see - * libpqrcv_PQgetResult for example. - */ void -ProcessWalRcvInterrupts(void) +WalRcvShutdownSignalHandler(SIGNAL_ARGS) { - /* - * Although walreceiver interrupt handling doesn't use the same scheme as - * regular backends, call CHECK_FOR_INTERRUPTS() to make sure we receive - * any incoming signals on Win32, and also to make sure we process any - * barrier events. - */ - CHECK_FOR_INTERRUPTS(); + int save_errno = errno; - if (ShutdownRequestPending) + /* Don't joggle the elbow of proc_exit */ + if (!proc_exit_inprogress) { - ereport(FATAL, - (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating walreceiver process due to administrator command"))); + InterruptPending = true; + ProcDiePending = true; } + + SetLatch(MyLatch); + + errno = save_errno; + } +/* + * Is current process a wal receiver? + */ +bool +IsWalReceiver(void) +{ + return WalRcv != NULL; +} /* Main entry point for walreceiver process */ void @@ -277,7 +272,7 @@ WalReceiverMain(void) pqsignal(SIGHUP, SignalHandlerForConfigReload); /* set flag to read config * file */ pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SignalHandlerForShutdownRequest); /* request shutdown */ + pqsignal(SIGTERM, WalRcvShutdownSignalHandler); /* request shutdown */ /* SIGQUIT handler was already set up by InitPostmasterChild */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); @@ -456,7 +451,7 @@ WalReceiverMain(void) errmsg("cannot continue WAL streaming, recovery has already ended"))); /* Process any requests or signals received recently */ - ProcessWalRcvInterrupts(); + CHECK_FOR_INTERRUPTS(); if (ConfigReloadPending) { @@ -552,7 +547,7 @@ WalReceiverMain(void) if (rc & WL_LATCH_SET) { ResetLatch(MyLatch); - ProcessWalRcvInterrupts(); + CHECK_FOR_INTERRUPTS(); if (walrcv->force_reply) { @@ -691,7 +686,7 @@ WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) { ResetLatch(MyLatch); - ProcessWalRcvInterrupts(); + CHECK_FOR_INTERRUPTS(); SpinLockAcquire(&walrcv->mutex); Assert(walrcv->walRcvState == WALRCV_RESTARTING || diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7298a187d1..04cab7dafa 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -59,6 +59,7 @@ #include "replication/logicallauncher.h" #include "replication/logicalworker.h" #include "replication/slot.h" +#include "replication/walreceiver.h" #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" @@ -3286,6 +3287,10 @@ ProcessInterrupts(void) */ proc_exit(1); } + else if (IsWalReceiver()) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating walreceiver process due to administrator command"))); else if (IsBackgroundWorker) ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN), diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index 949e874f21..c69c8daa6a 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -456,8 +456,8 @@ walrcv_clear_result(WalRcvExecResult *walres) } /* prototypes for functions in walreceiver.c */ +extern bool IsWalReceiver(void); extern void WalReceiverMain(void) pg_attribute_noreturn(); -extern void ProcessWalRcvInterrupts(void); extern void WalRcvForceReply(void); /* prototypes for functions in walreceiverfuncs.c */