From 6e5406b82d4a20c88df5bff8aeb2dbe903b40e7b Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter_e@gmx.net>
Date: Mon, 1 May 2017 15:09:06 -0400
Subject: [PATCH] Prevent panic during shutdown checkpoint

When the checkpointer writes the shutdown checkpoint, it checks
afterwards whether any WAL has been written since it started and throws
a PANIC if so.  At that point, only walsenders are still active, so one
might think this could not happen, but walsenders can also generate WAL,
for instance in BASE_BACKUP and certain variants of
CREATE_REPLICATION_SLOT.  So they can trigger this panic if such a
command is run while the shutdown checkpoint is being written.

To fix this, divide the walsender shutdown into two phases.  First, the
postmaster sends a SIGUSR2 signal to all walsenders.  The walsenders
then put themselves into the "stopping" state.  In this state, they
reject any new commands.  (For simplicity, we reject all new commands,
so that in the future we do not have to track meticulously which
commands might generate WAL.)  The checkpointer waits for all walsenders
to reach this state before proceeding with the shutdown checkpoint.
After the shutdown checkpoint is done, the postmaster sends
SIGINT (previously unused) to the walsenders.  This triggers the
existing shutdown behavior of sending out the shutdown checkpoint record
and then terminating.

Author: Michael Paquier <michael.paquier@gmail.com>
Reported-by: Fujii Masao <masao.fujii@gmail.com>
---
 src/backend/access/transam/xlog.c           |   6 ++
 src/backend/postmaster/postmaster.c         |   9 ++-
 src/backend/replication/basebackup.c        |   2 +-
 src/backend/replication/walsender.c         | 121 ++++++++++++++++++++++++----
 src/include/replication/walsender.h         |   1 +
 src/include/replication/walsender_private.h |   3 +-
 6 files changed, 122 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8c7a239160..82bc216a84 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7936,6 +7936,12 @@ ShutdownXLOG(int code, Datum arg)
 	ereport(LOG,
 			(errmsg("shutting down")));
 
+	/*
+	 * Wait for WAL senders to be in stopping state.  This prevents commands
+	 * from writing new WAL.
+	 */
+	WalSndWaitStopping();
+
 	if (RecoveryInProgress())
 		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	else
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 95b2ce3c6b..148d1deeb2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2516,7 +2516,7 @@ reaper(SIGNAL_ARGS)
 				ereport(LOG,
 						(errmsg("terminating all walsender processes to force cascaded "
 							"standby(s) to update timeline and reconnect")));
-				SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
+				SignalSomeChildren(SIGINT, BACKEND_TYPE_WALSND);
 			}
 
 			/*
@@ -2595,7 +2595,7 @@ reaper(SIGNAL_ARGS)
 				 * Waken walsenders for the last time. No regular backends
 				 * should be around anymore.
 				 */
-				SignalChildren(SIGUSR2);
+				SignalChildren(SIGINT);
 
 				pmState = PM_SHUTDOWN_2;
 
@@ -3138,7 +3138,9 @@ PostmasterStateMachine(void)
 				/*
 				 * If we get here, we are proceeding with normal shutdown. All
 				 * the regular children are gone, and it's time to tell the
-				 * checkpointer to do a shutdown checkpoint.
+				 * checkpointer to do a shutdown checkpoint. All WAL senders
+				 * are told to switch to a stopping state so that the shutdown
+				 * checkpoint can go ahead.
 				 */
 				Assert(Shutdown > NoShutdown);
 				/* Start the checkpointer if not running */
@@ -3147,6 +3149,7 @@ PostmasterStateMachine(void)
 				/* And tell it to shut down */
 				if (CheckpointerPID != 0)
 				{
+					SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
 					signal_child(CheckpointerPID, SIGUSR2);
 					pmState = PM_SHUTDOWN;
 				}
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index f9a48d9cca..caa07ebeb7 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -444,7 +444,7 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 		 * file is required for recovery, and even that only if there happens
 		 * to be a timeline switch in the first WAL segment that contains the
 		 * checkpoint record, or if we're taking a base backup from a standby
-		 * server and the target timeline changes while the backup is taken. 
+		 * server and the target timeline changes while the backup is taken.
 		 * But they are small and highly useful for debugging purposes, so
 		 * better include them all, always.
 		 */
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 94da622f05..bf5d0bcb04 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -19,11 +19,14 @@
  * are treated as not a crash but approximately normal termination;
  * the walsender will exit quickly without sending any more XLOG records.
  *
- * If the server is shut down, postmaster sends us SIGUSR2 after all
- * regular backends have exited and the shutdown checkpoint has been written.
- * This instruct walsender to send any outstanding WAL, including the
- * shutdown checkpoint record, wait for it to be replicated to the standby,
- * and then exit.
+ * If the server is shut down, postmaster sends us SIGUSR2 after all regular
+ * backends have exited. This causes the walsender to switch to the "stopping"
+ * state. In this state, the walsender will reject any replication command
+ * that may generate WAL activity. The checkpointer begins the shutdown
+ * checkpoint once all walsenders are confirmed as stopping. When the shutdown
+ * checkpoint finishes, the postmaster sends us SIGINT. This instructs
+ * walsender to send any outstanding WAL, including the shutdown checkpoint
+ * record, wait for it to be replicated to the standby, and then exit.
  *
  *
  * Portions Copyright (c) 2010-2012, PostgreSQL Global Development Group
@@ -110,6 +113,7 @@ static TimestampTz last_reply_timestamp;
 
 /* Flags set by signal handlers for later service in main loop */
 static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t got_SIGUSR2 = false;
 volatile sig_atomic_t walsender_shutdown_requested = false;
 volatile sig_atomic_t walsender_ready_to_stop = false;
 
@@ -118,6 +122,7 @@ static void WalSndSigHupHandler(SIGNAL_ARGS);
 static void WalSndShutdownHandler(SIGNAL_ARGS);
 static void WalSndQuickDieHandler(SIGNAL_ARGS);
 static void WalSndXLogSendHandler(SIGNAL_ARGS);
+static void WalSndSwitchStopping(SIGNAL_ARGS);
 static void WalSndLastCycleHandler(SIGNAL_ARGS);
 
 /* Prototypes for private functions */
@@ -372,15 +377,15 @@ StartReplication(StartReplicationCmd *cmd)
 	SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);
 
 	/*
-	 * When promoting a cascading standby, postmaster sends SIGUSR2 to any
+	 * When promoting a cascading standby, postmaster sends SIGINT to any
 	 * cascading walsenders to kill them. But there is a corner-case where
-	 * such walsender fails to receive SIGUSR2 and survives a standby
-	 * promotion unexpectedly. This happens when postmaster sends SIGUSR2
+	 * such walsender fails to receive SIGINT and survives a standby
+	 * promotion unexpectedly. This happens when postmaster sends SIGINT
 	 * before the walsender marks itself as a WAL sender, because postmaster
-	 * sends SIGUSR2 to only the processes marked as a WAL sender.
+	 * sends SIGINT to only the processes marked as a WAL sender.
 	 *
 	 * To avoid this corner-case, if recovery is NOT in progress even though
-	 * the walsender is cascading one, we do the same thing as SIGUSR2 signal
+	 * the walsender is cascading one, we do the same thing as SIGINT signal
 	 * handler does, i.e., set walsender_ready_to_stop to true. Which causes
 	 * the walsender to end later.
 	 *
@@ -445,6 +450,22 @@ HandleReplicationCommand(const char *cmd_string)
 	MemoryContext cmd_context;
 	MemoryContext old_context;
 
+	/*
+	 * If WAL sender has been told that shutdown is getting close, switch its
+	 * status accordingly to handle the next replication commands correctly.
+	 */
+	if (got_SIGUSR2)
+		WalSndSetState(WALSNDSTATE_STOPPING);
+
+	/*
+	 * Throw error if in stopping mode.  We need prevent commands that could
+	 * generate WAL while the shutdown checkpoint is being written.  To be
+	 * safe, we just prohibit all new commands.
+	 */
+	if (MyWalSnd->state == WALSNDSTATE_STOPPING)
+		ereport(ERROR,
+				(errmsg("cannot execute new commands while WAL sender is in stopping mode")));
+
 	elog(DEBUG1, "received replication command: %s", cmd_string);
 
 	cmd_context = AllocSetContextCreate(CurrentMemoryContext,
@@ -800,7 +821,14 @@ WalSndLoop(void)
 			}
 
 			/*
-			 * When SIGUSR2 arrives, we send any outstanding logs up to the
+			 * At the reception of SIGUSR2, switch the WAL sender to the stopping
+			 * state.
+			 */
+			if (got_SIGUSR2)
+				WalSndSetState(WALSNDSTATE_STOPPING);
+
+			/*
+			 * When SIGINT arrives, we send any outstanding logs up to the
 			 * shutdown checkpoint record (i.e., the latest record), wait
 			 * for them to be replicated to the standby, and exit.
 			 * This may be a normal termination at shutdown, or a promotion,
@@ -1378,7 +1406,24 @@ WalSndXLogSendHandler(SIGNAL_ARGS)
 	errno = save_errno;
 }
 
-/* SIGUSR2: set flag to do a last cycle and shut down afterwards */
+/* SIGUSR2: set flag to switch to stopping state */
+static void
+WalSndSwitchStopping(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGUSR2 = true;
+	if (MyWalSnd)
+		SetLatch(&MyWalSnd->latch);
+
+	errno = save_errno;
+}
+
+/*
+ * SIGINT: set flag to do a last cycle and shut down afterwards. The WAL
+ * sender should already have been switched to WALSNDSTATE_STOPPING at
+ * this point.
+ */
 static void
 WalSndLastCycleHandler(SIGNAL_ARGS)
 {
@@ -1398,14 +1443,14 @@ WalSndSignals(void)
 	/* Set up signal handlers */
 	pqsignal(SIGHUP, WalSndSigHupHandler);		/* set flag to read config
 												 * file */
-	pqsignal(SIGINT, SIG_IGN);	/* not used */
+	pqsignal(SIGINT, WalSndLastCycleHandler);	/* request a last cycle and
+												 * shutdown */
 	pqsignal(SIGTERM, WalSndShutdownHandler);	/* request shutdown */
 	pqsignal(SIGQUIT, WalSndQuickDieHandler);	/* hard crash time */
 	pqsignal(SIGALRM, handle_sig_alarm);
 	pqsignal(SIGPIPE, SIG_IGN);
 	pqsignal(SIGUSR1, WalSndXLogSendHandler);	/* request WAL sending */
-	pqsignal(SIGUSR2, WalSndLastCycleHandler);	/* request a last cycle and
-												 * shutdown */
+	pqsignal(SIGUSR2, WalSndSwitchStopping);	/* switch to stopping state */
 
 	/* Reset some signals that are accepted by postmaster but not here */
 	pqsignal(SIGCHLD, SIG_DFL);
@@ -1465,6 +1510,50 @@ WalSndWakeup(void)
 		SetLatch(&WalSndCtl->walsnds[i].latch);
 }
 
+/*
+ * Wait that all the WAL senders have reached the stopping state. This is
+ * used by the checkpointer to control when shutdown checkpoints can
+ * safely begin.
+ */
+void
+WalSndWaitStopping(void)
+{
+	for (;;)
+	{
+		int			i;
+		bool		all_stopped = true;
+
+		for (i = 0; i < max_wal_senders; i++)
+		{
+			WalSndState state;
+			WalSnd	   *walsnd = &WalSndCtl->walsnds[i];
+
+			SpinLockAcquire(&walsnd->mutex);
+
+			if (walsnd->pid == 0)
+			{
+				SpinLockRelease(&walsnd->mutex);
+				continue;
+			}
+
+			state = walsnd->state;
+			SpinLockRelease(&walsnd->mutex);
+
+			if (state != WALSNDSTATE_STOPPING)
+			{
+				all_stopped = false;
+				break;
+			}
+		}
+
+		/* safe to leave if confirmation is done for all WAL senders */
+		if (all_stopped)
+			return;
+
+		pg_usleep(10000L);	/* wait for 10 msec */
+	}
+}
+
 /* Set state for current walsender (only called in walsender) */
 void
 WalSndSetState(WalSndState state)
@@ -1499,6 +1588,8 @@ WalSndGetStateString(WalSndState state)
 			return "catchup";
 		case WALSNDSTATE_STREAMING:
 			return "streaming";
+		case WALSNDSTATE_STOPPING:
+			return "stopping";
 	}
 	return "UNKNOWN";
 }
diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h
index 128d2dbf59..de9d1a4597 100644
--- a/src/include/replication/walsender.h
+++ b/src/include/replication/walsender.h
@@ -31,6 +31,7 @@ extern void WalSndSignals(void);
 extern Size WalSndShmemSize(void);
 extern void WalSndShmemInit(void);
 extern void WalSndWakeup(void);
+extern void WalSndWaitStopping(void);
 extern void WalSndRqstFileReload(void);
 
 extern Datum pg_stat_get_wal_senders(PG_FUNCTION_ARGS);
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 45cd7444cd..212a73d8f9 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -24,7 +24,8 @@ typedef enum WalSndState
 	WALSNDSTATE_STARTUP = 0,
 	WALSNDSTATE_BACKUP,
 	WALSNDSTATE_CATCHUP,
-	WALSNDSTATE_STREAMING
+	WALSNDSTATE_STREAMING,
+	WALSNDSTATE_STOPPING
 } WalSndState;
 
 /*
-- 
2.13.0

