I wrote:
> So this looks like a pretty obvious race condition in the postmaster,
> which should be resolved by having it set a flag on receipt of
> PMSIGNAL_START_WALRECEIVER that's cleared only when it does start a
> new walreceiver.

Concretely, I propose the attached patch.  Together with reducing
wal_retrieve_retry_interval to 500ms, which I propose having
PostgresNode::init do in its standard postgresql.conf adjustments,
this takes the runtime of the recovery TAP tests down from 2m50s
(after the patches I posted yesterday) to 1m30s.

I think there's still gold to be mined, because "top" is still
showing pretty low CPU load over most of the run, but this is
a lot better than 4m30s.

                        regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 83e99b7..6c79c36 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** static volatile sig_atomic_t start_autov
*** 357,362 ****
--- 357,365 ----
  /* the launcher needs to be signalled to communicate some condition */
  static volatile bool avlauncher_needs_signal = false;
  
+ /* received START_WALRECEIVER signal */
+ static volatile sig_atomic_t WalReceiverRequested = false;
+ 
  /* set when there's a worker that needs to be started up */
  static volatile bool StartWorkerNeeded = true;
  static volatile bool HaveCrashedWorker = false;
*************** static void maybe_start_bgworkers(void);
*** 426,431 ****
--- 429,435 ----
  static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
  static pid_t StartChildProcess(AuxProcType type);
  static void StartAutovacuumWorker(void);
+ static void MaybeStartWalReceiver(void);
  static void InitPostmasterDeathWatchHandle(void);
  
  /*
*************** ServerLoop(void)
*** 1810,1815 ****
--- 1814,1823 ----
  				kill(AutoVacPID, SIGUSR2);
  		}
  
+ 		/* If we need to start a WAL receiver, try to do that now */
+ 		if (WalReceiverRequested)
+ 			MaybeStartWalReceiver();
+ 
  		/* Get other worker processes running, if needed */
  		if (StartWorkerNeeded || HaveCrashedWorker)
  			maybe_start_bgworkers();
*************** reaper(SIGNAL_ARGS)
*** 2958,2964 ****
  		/*
  		 * Was it the wal receiver?  If exit status is zero (normal) or one
  		 * (FATAL exit), we assume everything is all right just like normal
! 		 * backends.
  		 */
  		if (pid == WalReceiverPID)
  		{
--- 2966,2973 ----
  		/*
  		 * Was it the wal receiver?  If exit status is zero (normal) or one
  		 * (FATAL exit), we assume everything is all right just like normal
! 		 * backends.  (If we need a new wal receiver, we'll start one at the
! 		 * next iteration of the postmaster's main loop.)
  		 */
  		if (pid == WalReceiverPID)
  		{
*************** sigusr1_handler(SIGNAL_ARGS)
*** 5066,5079 ****
  		StartAutovacuumWorker();
  	}
  
! 	if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER) &&
! 		WalReceiverPID == 0 &&
! 		(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
! 		 pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) &&
! 		Shutdown == NoShutdown)
  	{
  		/* Startup Process wants us to start the walreceiver process. */
! 		WalReceiverPID = StartWalReceiver();
  	}
  
  	if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE) &&
--- 5075,5086 ----
  		StartAutovacuumWorker();
  	}
  
! 	if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER))
  	{
  		/* Startup Process wants us to start the walreceiver process. */
! 		/* Start immediately if possible, else remember request for later. */
! 		WalReceiverRequested = true;
! 		MaybeStartWalReceiver();
  	}
  
  	if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE) &&
*************** StartAutovacuumWorker(void)
*** 5410,5415 ****
--- 5417,5440 ----
  }
  
  /*
+  * MaybeStartWalReceiver
+  *		Start the WAL receiver process, if requested and our state allows.
+  */
+ static void
+ MaybeStartWalReceiver(void)
+ {
+ 	if (WalReceiverPID == 0 &&
+ 		(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ 		 pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) &&
+ 		Shutdown == NoShutdown)
+ 	{
+ 		WalReceiverPID = StartWalReceiver();
+ 		WalReceiverRequested = false;
+ 	}
+ }
+ 
+ 
+ /*
   * Create the opts file
   */
  static bool
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to