Patch to reduce the contention on SInvalLock, as discussed here:
http://archives.postgresql.org/pgsql-hackers/2007-09/msg00501.php
and
http://archives.postgresql.org/pgsql-performance/2008-01/msg00023.php

For discussion.

-- 
  Simon Riggs
  2ndQuadrant  http://www.2ndQuadrant.com
Index: src/backend/storage/ipc/sinval.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/ipc/sinval.c,v
retrieving revision 1.83
diff -c -r1.83 sinval.c
*** src/backend/storage/ipc/sinval.c	1 Jan 2008 19:45:51 -0000	1.83
--- src/backend/storage/ipc/sinval.c	25 Jan 2008 23:19:24 -0000
***************
*** 113,119 ****
  {
  	SharedInvalidationMessage data;
  	int			getResult;
! 	bool		gotMessage = false;
  
  	for (;;)
  	{
--- 113,120 ----
  {
  	SharedInvalidationMessage data;
  	int			getResult;
! 	bool		delMessages = false;
! 	int			numMsgs = 0;
  
  	for (;;)
  	{
***************
*** 137,145 ****
  		getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data);
  		LWLockRelease(SInvalLock);
  
! 		if (getResult == 0)
  			break;				/* nothing more to do */
! 		if (getResult < 0)
  		{
  			/* got a reset message */
  			elog(DEBUG4, "cache state reset");
--- 138,147 ----
  		getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data);
  		LWLockRelease(SInvalLock);
  
! 		if (getResult == SI_GET_NO_MSG)
  			break;				/* nothing more to do */
! 
! 		if (getResult == SI_GET_NEED_RESET)
  		{
  			/* got a reset message */
  			elog(DEBUG4, "cache state reset");
***************
*** 147,160 ****
  		}
  		else
  		{
  			/* got a normal data message */
  			invalFunction(&data);
  		}
! 		gotMessage = true;
  	}
  
! 	/* If we got any messages, try to release dead messages */
! 	if (gotMessage)
  	{
  		LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
  		SIDelExpiredDataEntries(shmInvalBuffer);
--- 149,183 ----
  		}
  		else
  		{
+ 			Assert(getResult == SI_GET_MSG && getResult == SI_GET_MIN_MSG);
+ 
  			/* got a normal data message */
  			invalFunction(&data);
+ 
+ 			/*
+ 			 * SI_GET_MIN_MSG means the message we just read was the
+ 			 * lowest-numbered message still in the queue, i.e. we are one
+ 			 * of the backends holding up the queue. In that case it's a
+ 			 * fair bet that we will be the last to finish reading, so by
+ 			 * then we should be able to grab the exclusive lock and delete
+ 			 * messages fairly easily. The backends at the lowest message
+ 			 * number might all go away before deleting anything, which could
+ 			 * lead to queue growth, so we also delete at insert time when
+ 			 * the queue is long.
+ 			if (getResult == SI_GET_MIN_MSG)
+ 				delMessages = true;
  		}
! 		numMsgs++;
  	}
  
! 	/*
! 	 * If we were at the lowest message number and have just read more
! 	 * than a quarter of the queue's capacity, assume it was because of a
! 	 * pm signal, and attempt to delete expired messages. Don't worry about
! 	 * this normally; just let the queue grow until it gets pruned during
! 	 * SendSharedInvalidMessage(), so the invalidator pays the cost for
! 	 * dirtying the cache.
! 	if (delMessages && (numMsgs > (MAXNUMMESSAGES / 4)))
  	{
  		LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
  		SIDelExpiredDataEntries(shmInvalBuffer);
Index: src/backend/storage/ipc/sinvaladt.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/ipc/sinvaladt.c,v
retrieving revision 1.66
diff -c -r1.66 sinvaladt.c
*** src/backend/storage/ipc/sinvaladt.c	1 Jan 2008 19:45:51 -0000	1.66
--- src/backend/storage/ipc/sinvaladt.c	25 Jan 2008 23:18:07 -0000
***************
*** 78,83 ****
--- 78,85 ----
  	segP->maxBackends = MaxBackends;
  	segP->freeBackends = MaxBackends;
  
+ 	segP->allow_pm_signals = true;
+ 
  	/* The buffer[] array is initially all unused, so we need not fill it */
  
  	/* Mark all backends inactive, and initialize nextLXID */
***************
*** 233,238 ****
--- 235,256 ----
  	}
  
  	/*
+ 	 * Try to prevent table overflow.  When the table is >50% full,
+ 	 * delete expired entries once every MAXNUMMESSAGES / 16 (~6%) message
+ 	 * insertions, to avoid hitting the higher limit where we wake everybody.
+ 	 */
+ 	if ((numMsgs > (MAXNUMMESSAGES / 2)) && (segP->maxMsgNum % (MAXNUMMESSAGES / 16) == 0))
+ 		SIDelExpiredDataEntries(segP);
+ 
+ 	/*
+ 	 * Re-arm the ability to send pm signals, if queue length has
+ 	 * now reduced since the last time we sent a signal
+ 	 */
+ 	if (!segP->allow_pm_signals && 
+ 		numMsgs < (MAXNUMMESSAGES * 30 / 100))
+ 		segP->allow_pm_signals = true;
+ 
+ 	/*
  	 * Try to prevent table overflow.  When the table is 70% full send a
  	 * WAKEN_CHILDREN request to the postmaster.  The postmaster will send a
  	 * SIGUSR1 signal to all the backends, which will cause sinval.c to read
***************
*** 242,252 ****
  	 * queries, but if a backend is sitting idle then it won't be starting
  	 * transactions and so won't be reading SI entries.
  	 */
! 	if (numMsgs == (MAXNUMMESSAGES * 70 / 100) &&
  		IsUnderPostmaster)
  	{
  		elog(DEBUG4, "SI table is 70%% full, signaling postmaster");
  		SendPostmasterSignal(PMSIGNAL_WAKEN_CHILDREN);
  	}
  
  	/*
--- 260,285 ----
  	 * queries, but if a backend is sitting idle then it won't be starting
  	 * transactions and so won't be reading SI entries.
  	 */
! 	if (segP->allow_pm_signals && 
! 		numMsgs == (MAXNUMMESSAGES * 70 / 100) &&
  		IsUnderPostmaster)
  	{
  		elog(DEBUG4, "SI table is 70%% full, signaling postmaster");
  		SendPostmasterSignal(PMSIGNAL_WAKEN_CHILDREN);
+ 
+ 		/*
+ 		 * Disarm the ability to send pm signals until we
+ 		 * have reduced the queue size sufficiently to re-arm it.
+ 		 * One signal is sufficient to reduce the queue length to 
+ 		 * a minimum, though that will take some time. 
+ 		 *
+ 		 * While that is happening, we want to avoid re-triggering 
+ 		 * the pm signal, to reduce contention as much as possible.
+ 		 * Otherwise a steady flow of incoming messages may cause
+ 		 * the queue length to fluctuate around the 70% mark,
+ 		 * causing repeated retriggering of pm signals.
+ 		 */
+ 		segP->allow_pm_signals = false;
  	}
  
  	/*
***************
*** 272,277 ****
--- 305,312 ----
  	segP->minMsgNum = 0;
  	segP->maxMsgNum = 0;
  
+ 	segP->allow_pm_signals = true;
+ 
  	for (i = 0; i < segP->lastBackend; i++)
  	{
  		if (segP->procState[i].nextMsgNum >= 0) /* active backend? */
***************
*** 287,296 ****
   *		get next SI message for specified backend, if there is one
   *
   * Possible return values:
!  *	0: no SI message available
!  *	1: next SI message has been extracted into *data
!  *		(there may be more messages available after this one!)
!  * -1: SI reset message extracted
   *
   * NB: this can run in parallel with other instances of SIGetDataEntry
   * executing on behalf of other backends.  See comments in sinval.c in
--- 322,333 ----
   *		get next SI message for specified backend, if there is one
   *
   * Possible return values:
!  * SI_GET_NO_MSG: 		no SI message available
!  * SI_GET_MSG:			next SI message has been extracted into *data
!  *						there may be more messages available after this one!
!  * SI_GET_MIN_MSG:		next SI message extracted, but we are the ones
!  *						holding up the queue (possibly multiple backends)
!  * SI_GET_NEED_RESET:	SI reset message extracted
   *
   * NB: this can run in parallel with other instances of SIGetDataEntry
   * executing on behalf of other backends.  See comments in sinval.c in
***************
*** 301,306 ****
--- 338,344 ----
  			   SharedInvalidationMessage *data)
  {
  	ProcState  *stateP = &segP->procState[backendId - 1];
+ 	bool	min_message = false;
  
  	if (stateP->resetState)
  	{
***************
*** 310,320 ****
  		 */
  		stateP->resetState = false;
  		stateP->nextMsgNum = segP->maxMsgNum;
! 		return -1;
  	}
  
  	if (stateP->nextMsgNum >= segP->maxMsgNum)
! 		return 0;				/* nothing to read */
  
  	/*
  	 * Retrieve message and advance my counter.
--- 348,361 ----
  		 */
  		stateP->resetState = false;
  		stateP->nextMsgNum = segP->maxMsgNum;
! 		return SI_GET_NEED_RESET;
  	}
  
  	if (stateP->nextMsgNum >= segP->maxMsgNum)
! 		return SI_GET_NO_MSG;				/* nothing to read */
! 
! 	if (stateP->nextMsgNum == segP->minMsgNum)
! 		min_message = true;
  
  	/*
  	 * Retrieve message and advance my counter.
***************
*** 327,333 ****
  	 * delete it here. SIDelExpiredDataEntries() should be called to remove
  	 * dead messages.
  	 */
! 	return 1;					/* got a message */
  }
  
  /*
--- 368,374 ----
  	 * delete it here. SIDelExpiredDataEntries() should be called to remove
  	 * dead messages.
  	 */
! 	return (min_message ? SI_GET_MIN_MSG : SI_GET_MSG);	/* got a message */
  }
  
  /*
Index: src/include/storage/sinvaladt.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/sinvaladt.h,v
retrieving revision 1.45
diff -c -r1.45 sinvaladt.h
*** src/include/storage/sinvaladt.h	1 Jan 2008 19:45:59 -0000	1.45
--- src/include/storage/sinvaladt.h	25 Jan 2008 23:22:00 -0000
***************
*** 85,90 ****
--- 85,92 ----
  	int			maxBackends;	/* size of procState array */
  	int			freeBackends;	/* number of empty procState slots */
  
+ 	bool		allow_pm_signals;
+ 
  	/*
  	 * Next LocalTransactionId to use for each idle backend slot.  We keep
  	 * this here because it is indexed by BackendId and it is convenient to
***************
*** 119,124 ****
--- 121,134 ----
  extern bool SIInsertDataEntry(SISeg *segP, SharedInvalidationMessage *data);
  extern int SIGetDataEntry(SISeg *segP, int backendId,
  			   SharedInvalidationMessage *data);
+ 
+ /* Return values from SIGetDataEntry() */
+ #define	SI_GET_NO_MSG		0
+ #define	SI_GET_MSG  		1
+ #define	SI_GET_MIN_MSG		2
+ #define	SI_GET_NEED_RESET	3
+ 
+ 
  extern void SIDelExpiredDataEntries(SISeg *segP);
  
  extern LocalTransactionId GetNextLocalTransactionId(void);
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to