Some performance problems have been reported on HS from two users: Erik
and Stefan.

The characteristics of those issues have been that performance is
* sporadically reduced, though mostly runs at full speed
* context switch storms reported as being associated

So we're looking for something that doesn't always happen, but when it
does it involves lots of processes and context switching.

Unfortunately neither test reporter has been able to re-run tests,
leaving me not much to go on. Though since I know the code well, I can
focus in on likely suspects fairly easily; in this case I think I have a
root cause.

Earlier this year I added deadlock detection into Startup process when
it waits for a buffer pin. The deadlock detection was simplified since
it doesn't wait for deadlock_timeout before acting, it just immediately
sends a signal to all active processes to resolve the deadlock, even if
the buffer pin is released very soon afterwards. Heikki questioned this
implementation at the time, though I said it was easier to start simple
and add more code if problems arose and time allowed. It's clear that
with 100+ connections and reasonably frequent buffer pin waits, as would
occur when accessing same data blocks on both primary and standby, that
the current too-simple coding would cause performance issues, as Heikki
implied. Certainly actual deadlocks are much rarer than buffer pin
waits, so the current coding is wasteful.

The following patch adds some simple logic to make the Startup process
wait for deadlock_timeout before it sends the deadlock resolution
signals. It does that by refactoring the API to
enable_standby_sigalrm(), though doesn't change other behaviour or add
new features.

Viewpoints?

-- 
 Simon Riggs           www.2ndQuadrant.com
*** a/src/backend/storage/ipc/standby.c
--- b/src/backend/storage/ipc/standby.c
***************
*** 388,399 **** ResolveRecoveryConflictWithBufferPin(void)
  	}
  	else if (MaxStandbyDelay < 0)
  	{
  		/*
! 		 * Send out a request to check for buffer pin deadlocks before we
! 		 * wait. This is fairly cheap, so no need to wait for deadlock timeout
! 		 * before trying to send it out.
  		 */
! 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
  	}
  	else
  	{
--- 388,402 ----
  	}
  	else if (MaxStandbyDelay < 0)
  	{
+ 		TimestampTz now = GetCurrentTimestamp();
+ 
  		/*
! 		 * Set timeout for deadlock check (only)
  		 */
! 		if (enable_standby_sig_alarm(now, now, true))
! 			sig_alarm_enabled = true;
! 		else
! 			elog(FATAL, "could not set timer for process wakeup");
  	}
  	else
  	{
***************
*** 410,443 **** ResolveRecoveryConflictWithBufferPin(void)
  		}
  		else
  		{
! 			TimestampTz fin_time;		/* Expected wake-up time by timer */
! 			long		timer_delay_secs;		/* Amount of time we set timer
! 												 * for */
! 			int			timer_delay_usecs;
! 
! 			/*
! 			 * Send out a request to check for buffer pin deadlocks before we
! 			 * wait. This is fairly cheap, so no need to wait for deadlock
! 			 * timeout before trying to send it out.
! 			 */
! 			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
  
  			/*
! 			 * How much longer we should wait?
  			 */
! 			fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay);
! 
! 			TimestampDifference(now, fin_time,
! 								&timer_delay_secs, &timer_delay_usecs);
  
  			/*
! 			 * It's possible that the difference is less than a microsecond;
! 			 * ensure we don't cancel, rather than set, the interrupt.
  			 */
! 			if (timer_delay_secs == 0 && timer_delay_usecs == 0)
! 				timer_delay_usecs = 1;
! 
! 			if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time))
  				sig_alarm_enabled = true;
  			else
  				elog(FATAL, "could not set timer for process wakeup");
--- 413,431 ----
  		}
  		else
  		{
! 			TimestampTz max_standby_time;
  
  			/*
! 			 * At what point in the future do we hit MaxStandbyDelay?
  			 */
! 			max_standby_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay);
! 			Assert(max_standby_time > now);
  
  			/*
! 			 * Wake up at MaxStandby delay, and check for deadlocks as well
! 			 * if we will be waiting longer than deadlock_timeout
  			 */
! 			if (enable_standby_sig_alarm(now, max_standby_time, false))
  				sig_alarm_enabled = true;
  			else
  				elog(FATAL, "could not set timer for process wakeup");
*** a/src/backend/storage/lmgr/proc.c
--- b/src/backend/storage/lmgr/proc.c
***************
*** 85,90 **** static TimestampTz timeout_start_time;
--- 85,91 ----
  
  /* statement_fin_time is valid only if statement_timeout_active is true */
  static TimestampTz statement_fin_time;
+ static TimestampTz statement_fin_time2; /* valid only in recovery */
  
  
  static void RemoveProcFromArray(int code, Datum arg);
***************
*** 1619,1641 **** handle_sig_alarm(SIGNAL_ARGS)
   * To avoid various edge cases, we must be careful to do nothing
   * when there is nothing to be done.  We also need to be able to
   * reschedule the timer interrupt if called before end of statement.
   */
  bool
! enable_standby_sig_alarm(long delay_s, int delay_us, TimestampTz fin_time)
  {
! 	struct itimerval timeval;
! 
! 	Assert(delay_s >= 0 && delay_us >= 0);
  
! 	statement_fin_time = fin_time;
  
! 	standby_timeout_active = true;
  
- 	MemSet(&timeval, 0, sizeof(struct itimerval));
- 	timeval.it_value.tv_sec = delay_s;
- 	timeval.it_value.tv_usec = delay_us;
- 	if (setitimer(ITIMER_REAL, &timeval, NULL))
- 		return false;
  	return true;
  }
  
--- 1620,1680 ----
   * To avoid various edge cases, we must be careful to do nothing
   * when there is nothing to be done.  We also need to be able to
   * reschedule the timer interrupt if called before end of statement.
+  *
+  * We set either deadlock_timeout_active or statement_timeout_active
+  * or both. Interrupts are enabled if standby_timeout_active.
   */
  bool
! enable_standby_sig_alarm(TimestampTz now, TimestampTz fin_time, bool deadlock_only)
  {
! 	TimestampTz deadlock_time = TimestampTzPlusMilliseconds(now, DeadlockTimeout);
  
! 	if (deadlock_only)
! 	{
! 		/*
! 		 * Wake up at DeadlockTimeout only, then wait forever
! 		 */
! 		statement_fin_time = deadlock_time;
! 		deadlock_timeout_active = true;
! 		statement_timeout_active = false;
! 	}
! 	else if (fin_time > deadlock_time)
! 	{
! 		/*
! 		 * Wake up at DeadlockTimeout, then again at MaxStandbyDelay
! 		 */
! 		statement_fin_time = deadlock_time;
! 		statement_fin_time2 = fin_time;
! 		deadlock_timeout_active = true;
! 		statement_timeout_active = true;
! 	}
! 	else
! 	{
! 		/*
! 		 * Wake only at MaxStandbyDelay because its fairly soon
! 		 */
! 		statement_fin_time = fin_time;
! 		deadlock_timeout_active = false;
! 		statement_timeout_active = true;
! 	}
  
! 	if (deadlock_timeout_active || statement_timeout_active)
! 	{
! 		long		secs;
! 		int			usecs;
! 		struct itimerval timeval;
! 		TimestampDifference(now, statement_fin_time,
! 							&secs, &usecs);
! 		if (secs == 0 && usecs == 0)
! 			usecs = 1;
! 		MemSet(&timeval, 0, sizeof(struct itimerval));
! 		timeval.it_value.tv_sec = secs;
! 		timeval.it_value.tv_usec = usecs;
! 		if (setitimer(ITIMER_REAL, &timeval, NULL))
! 			return false;
! 		standby_timeout_active = true;
! 	}
  
  	return true;
  }
  
***************
*** 1675,1711 **** static bool
  CheckStandbyTimeout(void)
  {
  	TimestampTz now;
  
  	standby_timeout_active = false;
  
  	now = GetCurrentTimestamp();
  
  	if (now >= statement_fin_time)
! 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
  	else
  	{
- 		/* Not time yet, so (re)schedule the interrupt */
  		long		secs;
  		int			usecs;
  		struct itimerval timeval;
- 
  		TimestampDifference(now, statement_fin_time,
  							&secs, &usecs);
- 
- 		/*
- 		 * It's possible that the difference is less than a microsecond;
- 		 * ensure we don't cancel, rather than set, the interrupt.
- 		 */
  		if (secs == 0 && usecs == 0)
  			usecs = 1;
- 
- 		standby_timeout_active = true;
- 
  		MemSet(&timeval, 0, sizeof(struct itimerval));
  		timeval.it_value.tv_sec = secs;
  		timeval.it_value.tv_usec = usecs;
  		if (setitimer(ITIMER_REAL, &timeval, NULL))
  			return false;
  	}
  
  	return true;
--- 1714,1777 ----
  CheckStandbyTimeout(void)
  {
  	TimestampTz now;
+ 	bool reschedule = false;
  
  	standby_timeout_active = false;
  
  	now = GetCurrentTimestamp();
  
+ 	/*
+ 	 * Reschedule the timer if its not time to wake yet, or if we
+ 	 * have both timers set and the first one has just been reached.
+ 	 */
  	if (now >= statement_fin_time)
! 	{
! 		if (deadlock_timeout_active)
! 		{
! 			/*
! 			 * We're still waiting when we reach DeadlockTimeout, so send out a request
! 			 * to have other backends check themselves for deadlock. Then continue
! 			 * waiting until MaxStandbyDelay.
! 			 */
! 			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
! 			deadlock_timeout_active = false;
! 
! 			/*
! 			 * Begin second waiting period to MaxStandbyDelay if required.
! 			 */
! 			if (statement_timeout_active)
! 			{
! 				reschedule = true;
! 				statement_fin_time = statement_fin_time2;
! 			}
! 		}
! 		else
! 		{
! 			/*
! 			 * We've now reached MaxStandbyDelay, so ask all conflicts to leave, cos
! 			 * its time for us to press ahead with applying changes in recovery.
! 			 */
! 			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
! 		}
! 	}
  	else
+ 		reschedule = true;
+ 
+ 	if (reschedule)
  	{
  		long		secs;
  		int			usecs;
  		struct itimerval timeval;
  		TimestampDifference(now, statement_fin_time,
  							&secs, &usecs);
  		if (secs == 0 && usecs == 0)
  			usecs = 1;
  		MemSet(&timeval, 0, sizeof(struct itimerval));
  		timeval.it_value.tv_sec = secs;
  		timeval.it_value.tv_usec = usecs;
  		if (setitimer(ITIMER_REAL, &timeval, NULL))
  			return false;
+ 		standby_timeout_active = true;
  	}
  
  	return true;
*** a/src/include/storage/proc.h
--- b/src/include/storage/proc.h
***************
*** 199,205 **** extern bool enable_sig_alarm(int delayms, bool is_statement_timeout);
  extern bool disable_sig_alarm(bool is_statement_timeout);
  extern void handle_sig_alarm(SIGNAL_ARGS);
  
! extern bool enable_standby_sig_alarm(long delay_s, int delay_us, TimestampTz fin_time);
  extern bool disable_standby_sig_alarm(void);
  extern void handle_standby_sig_alarm(SIGNAL_ARGS);
  
--- 199,206 ----
  extern bool disable_sig_alarm(bool is_statement_timeout);
  extern void handle_sig_alarm(SIGNAL_ARGS);
  
! extern bool enable_standby_sig_alarm(TimestampTz now,
! 									 TimestampTz fin_time, bool deadlock_only);
  extern bool disable_standby_sig_alarm(void);
  extern void handle_standby_sig_alarm(SIGNAL_ARGS);
  
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to