This patch reduces contention on SInvalLock, as discussed in these two threads: http://archives.postgresql.org/pgsql-hackers/2007-09/msg00501.php and http://archives.postgresql.org/pgsql-performance/2008-01/msg00023.php
For discussion. -- Simon Riggs 2ndQuadrant http://www.2ndQuadrant.com
Index: src/backend/storage/ipc/sinval.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/ipc/sinval.c,v retrieving revision 1.83 diff -c -r1.83 sinval.c *** src/backend/storage/ipc/sinval.c 1 Jan 2008 19:45:51 -0000 1.83 --- src/backend/storage/ipc/sinval.c 25 Jan 2008 23:19:24 -0000 *************** *** 113,119 **** { SharedInvalidationMessage data; int getResult; ! bool gotMessage = false; for (;;) { --- 113,120 ---- { SharedInvalidationMessage data; int getResult; ! bool delMessages = false; ! int numMsgs = 0; for (;;) { *************** *** 137,145 **** getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data); LWLockRelease(SInvalLock); ! if (getResult == 0) break; /* nothing more to do */ ! if (getResult < 0) { /* got a reset message */ elog(DEBUG4, "cache state reset"); --- 138,147 ---- getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data); LWLockRelease(SInvalLock); ! if (getResult == SI_GET_NO_MSG) break; /* nothing more to do */ ! ! if (getResult == SI_GET_NEED_RESET) { /* got a reset message */ elog(DEBUG4, "cache state reset"); *************** *** 147,160 **** } else { /* got a normal data message */ invalFunction(&data); } ! gotMessage = true; } ! /* If we got any messages, try to release dead messages */ ! if (gotMessage) { LWLockAcquire(SInvalLock, LW_EXCLUSIVE); SIDelExpiredDataEntries(shmInvalBuffer); --- 149,183 ---- } else { + Assert(getResult == SI_GET_MSG && getResult == SI_GET_MIN_MSG); + /* got a normal data message */ invalFunction(&data); + + /* + * If getResult is positive, then it is the value of the + * lowest message number. In that case it's a fair bet that we + * will finish last reading our messages, so by the time we do + * that we should be able to grab ExclusiveLock and delete + * messages fairly easily. 
Doing this might mean the lowest + * numbered backends all go away before deleting the queue + * so it could lead to queue growth, so we delete the + * queue at insert time also, when queue is long. + */ + if (getResult == SI_GET_MIN_MSG) + delMessages = true; } ! numMsgs++; } ! /* ! * If we were the lowest message number and we've just cleared out ! * a long queue, assume it was because of a pm signal. So attempt ! * to delete the queue. Don't worry about this normally, just let ! * the queue grow until it gets pruned during SendSharedInvalidMessage() ! * so the invalidator pays the cost for dirtying the cache. ! */ ! if (delMessages && (numMsgs > (MAXNUMMESSAGES / 4))) { LWLockAcquire(SInvalLock, LW_EXCLUSIVE); SIDelExpiredDataEntries(shmInvalBuffer); Index: src/backend/storage/ipc/sinvaladt.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/ipc/sinvaladt.c,v retrieving revision 1.66 diff -c -r1.66 sinvaladt.c *** src/backend/storage/ipc/sinvaladt.c 1 Jan 2008 19:45:51 -0000 1.66 --- src/backend/storage/ipc/sinvaladt.c 25 Jan 2008 23:18:07 -0000 *************** *** 78,83 **** --- 78,85 ---- segP->maxBackends = MaxBackends; segP->freeBackends = MaxBackends; + segP->allow_pm_signals = true; + /* The buffer[] array is initially all unused, so we need not fill it */ /* Mark all backends inactive, and initialize nextLXID */ *************** *** 233,238 **** --- 235,256 ---- } /* + * Try to prevent table overflow. When the table is >50% full + * issue a delete every ~6% of messages insertions. We do this + * to avoid hitting the next higher limit where we wake everybody. 
+ */ + if ((numMsgs > (MAXNUMMESSAGES / 2)) && (segP->maxMsgNum % (MAXNUMMESSAGES / 16) == 0)) + SIDelExpiredDataEntries(segP); + + /* + * Re-arm the ability to send pm signals, if queue length has + * now reduced since the last time we sent a signal + */ + if (!segP->allow_pm_signals && + numMsgs < (MAXNUMMESSAGES * 30 / 100)) + segP->allow_pm_signals = true; + + /* * Try to prevent table overflow. When the table is 70% full send a * WAKEN_CHILDREN request to the postmaster. The postmaster will send a * SIGUSR1 signal to all the backends, which will cause sinval.c to read *************** *** 242,252 **** * queries, but if a backend is sitting idle then it won't be starting * transactions and so won't be reading SI entries. */ ! if (numMsgs == (MAXNUMMESSAGES * 70 / 100) && IsUnderPostmaster) { elog(DEBUG4, "SI table is 70%% full, signaling postmaster"); SendPostmasterSignal(PMSIGNAL_WAKEN_CHILDREN); } /* --- 260,285 ---- * queries, but if a backend is sitting idle then it won't be starting * transactions and so won't be reading SI entries. */ ! if (segP->allow_pm_signals && ! numMsgs == (MAXNUMMESSAGES * 70 / 100) && IsUnderPostmaster) { elog(DEBUG4, "SI table is 70%% full, signaling postmaster"); SendPostmasterSignal(PMSIGNAL_WAKEN_CHILDREN); + + /* + * Disarm the ability to send pm signals until we + * have reduced the queue size sufficiently to re-arm it. + * One signal is sufficient to reduce the queue length to + * a minimum, though that will take some time. + * + * While that is happening, we want to avoid re-triggering + * the pm signal, to reduce contention as much as possible. + * Otherwise a steady flow of incoming messages may cause + * the queue to fluctuate around the 70th percentile, + * causing repeated retriggering of pm signals. 
+ */ + segP->allow_pm_signals = false; } /* *************** *** 272,277 **** --- 305,312 ---- segP->minMsgNum = 0; segP->maxMsgNum = 0; + segP->allow_pm_signals = true; + for (i = 0; i < segP->lastBackend; i++) { if (segP->procState[i].nextMsgNum >= 0) /* active backend? */ *************** *** 287,296 **** * get next SI message for specified backend, if there is one * * Possible return values: ! * 0: no SI message available ! * 1: next SI message has been extracted into *data ! * (there may be more messages available after this one!) ! * -1: SI reset message extracted * * NB: this can run in parallel with other instances of SIGetDataEntry * executing on behalf of other backends. See comments in sinval.c in --- 322,333 ---- * get next SI message for specified backend, if there is one * * Possible return values: ! * SI_GET_NO_MSG: no SI message available ! * SI_GET_MSG: next SI message has been extracted into *data ! * there may be more messages available after this one! ! * SI_GET_MIN_MSG: next SI message extracted, but we are the ones ! * holding up the queue (possibly multiple backends) ! * SI_GET_NEED_RESET: SI reset message extracted * * NB: this can run in parallel with other instances of SIGetDataEntry * executing on behalf of other backends. See comments in sinval.c in *************** *** 301,306 **** --- 338,344 ---- SharedInvalidationMessage *data) { ProcState *stateP = &segP->procState[backendId - 1]; + bool min_message = false; if (stateP->resetState) { *************** *** 310,320 **** */ stateP->resetState = false; stateP->nextMsgNum = segP->maxMsgNum; ! return -1; } if (stateP->nextMsgNum >= segP->maxMsgNum) ! return 0; /* nothing to read */ /* * Retrieve message and advance my counter. --- 348,361 ---- */ stateP->resetState = false; stateP->nextMsgNum = segP->maxMsgNum; ! return SI_GET_NEED_RESET; } if (stateP->nextMsgNum >= segP->maxMsgNum) ! return SI_GET_NO_MSG; /* nothing to read */ ! ! if (stateP->nextMsgNum == segP->minMsgNum) ! 
min_message = true; /* * Retrieve message and advance my counter. *************** *** 327,333 **** * delete it here. SIDelExpiredDataEntries() should be called to remove * dead messages. */ ! return 1; /* got a message */ } /* --- 368,374 ---- * delete it here. SIDelExpiredDataEntries() should be called to remove * dead messages. */ ! return (min_message ? SI_GET_MIN_MSG : SI_GET_MSG); /* got a message */ } /* Index: src/include/storage/sinvaladt.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/sinvaladt.h,v retrieving revision 1.45 diff -c -r1.45 sinvaladt.h *** src/include/storage/sinvaladt.h 1 Jan 2008 19:45:59 -0000 1.45 --- src/include/storage/sinvaladt.h 25 Jan 2008 23:22:00 -0000 *************** *** 85,90 **** --- 85,92 ---- int maxBackends; /* size of procState array */ int freeBackends; /* number of empty procState slots */ + bool allow_pm_signals; + /* * Next LocalTransactionId to use for each idle backend slot. We keep * this here because it is indexed by BackendId and it is convenient to *************** *** 119,124 **** --- 121,134 ---- extern bool SIInsertDataEntry(SISeg *segP, SharedInvalidationMessage *data); extern int SIGetDataEntry(SISeg *segP, int backendId, SharedInvalidationMessage *data); + + /* Return values from SIGetDataEntry() */ + #define SI_GET_NO_MSG 0 + #define SI_GET_MSG 1 + #define SI_GET_MIN_MSG 2 + #define SI_GET_NEED_RESET 3 + + extern void SIDelExpiredDataEntries(SISeg *segP); extern LocalTransactionId GetNextLocalTransactionId(void);
---------------------------(end of broadcast)--------------------------- TIP 6: explain analyze is your friend