On a busy system, checkpoint could be starved while queuing for the CheckpointStartLock. To avoid that, get rid of CheckpointStartLock and instead set a flag in PGPROC struct when a commit starts. After computing the REDO ptr, checkpoint waits for all backends that had that flag set to finish their commits. This eliminates the same race condition the CheckpointStartLock was there for, without the risk of starvation.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/twophase.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/twophase.c,v
retrieving revision 1.28
diff -c -r1.28 twophase.c
*** src/backend/access/transam/twophase.c	13 Feb 2007 19:39:42 -0000	1.28
--- src/backend/access/transam/twophase.c	3 Apr 2007 09:20:04 -0000
***************
*** 279,284 ****
--- 279,285 ----
  	gxact->proc.pid = 0;
  	gxact->proc.databaseId = databaseid;
  	gxact->proc.roleId = owner;
+ 	gxact->proc.inCommit = false;
  	gxact->proc.inVacuum = false;
  	gxact->proc.isAutovacuum = false;
  	gxact->proc.lwWaiting = false;
***************
*** 853,858 ****
--- 854,860 ----
  	pg_crc32	statefile_crc;
  	pg_crc32	bogus_crc;
  	int			fd;
+ 	volatile PGPROC *myproc = MyProc;
  
  	/* Add the end sentinel to the list of 2PC records */
  	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
***************
*** 938,951 ****
  	 * starting immediately after the WAL record is inserted could complete
  	 * without fsync'ing our state file.  (This is essentially the same kind
  	 * of race condition as the COMMIT-to-clog-write case that
! 	 * RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
  	 *
  	 * We save the PREPARE record's location in the gxact for later use by
  	 * CheckPointTwoPhase.
  	 */
  	START_CRIT_SECTION();
  
! 	LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
  									records.head);
--- 940,953 ----
  	 * starting immediately after the WAL record is inserted could complete
  	 * without fsync'ing our state file.  (This is essentially the same kind
  	 * of race condition as the COMMIT-to-clog-write case that
! 	 * RecordTransactionCommit uses inCommit flag for; see notes there.)
  	 *
  	 * We save the PREPARE record's location in the gxact for later use by
  	 * CheckPointTwoPhase.
  	 */
  	START_CRIT_SECTION();
  
! 	myproc->inCommit = true;
  
  	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
  									records.head);
***************
*** 981,992 ****
  	 */
  	MarkAsPrepared(gxact);
  
- 	/*
- 	 * Now we can release the checkpoint start lock: a checkpoint starting
- 	 * after this will certainly see the gxact as a candidate for fsyncing.
- 	 */
- 	LWLockRelease(CheckpointStartLock);
- 
  	END_CRIT_SECTION();
  
  	records.tail = records.head = NULL;
--- 983,988 ----
***************
*** 1649,1655 ****
   *	RecordTransactionCommitPrepared
   *
   * This is basically the same as RecordTransactionCommit: in particular,
!  * we must take the CheckpointStartLock to avoid a race condition.
   *
   * We know the transaction made at least one XLOG entry (its PREPARE),
   * so it is never possible to optimize out the commit record.
--- 1645,1651 ----
   *	RecordTransactionCommitPrepared
   *
   * This is basically the same as RecordTransactionCommit: in particular,
!  * we must set the inCommit flag to avoid a race condition.
   *
   * We know the transaction made at least one XLOG entry (its PREPARE),
   * so it is never possible to optimize out the commit record.
***************
*** 1665,1675 ****
  	int			lastrdata = 0;
  	xl_xact_commit_prepared xlrec;
  	XLogRecPtr	recptr;
  
  	START_CRIT_SECTION();
  
  	/* See notes in RecordTransactionCommit */
! 	LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  	/* Emit the XLOG commit record */
  	xlrec.xid = xid;
--- 1661,1672 ----
  	int			lastrdata = 0;
  	xl_xact_commit_prepared xlrec;
  	XLogRecPtr	recptr;
+ 	volatile PGPROC *myproc = MyProc;
  
  	START_CRIT_SECTION();
  
  	/* See notes in RecordTransactionCommit */
! 	myproc->inCommit = true;
  
  	/* Emit the XLOG commit record */
  	xlrec.xid = xid;
***************
*** 1713,1721 ****
  	/* to avoid race conditions, the parent must commit first */
  	TransactionIdCommitTree(nchildren, children);
  
- 	/* Checkpoint is allowed again */
- 	LWLockRelease(CheckpointStartLock);
- 
  	END_CRIT_SECTION();
  }
  
--- 1710,1715 ----
Index: src/backend/access/transam/xact.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xact.c,v
retrieving revision 1.238
diff -c -r1.238 xact.c
*** src/backend/access/transam/xact.c	22 Mar 2007 19:55:04 -0000	1.238
--- src/backend/access/transam/xact.c	3 Apr 2007 12:30:32 -0000
***************
*** 718,739 ****
  
  		/*
  		 * If our transaction made any transaction-controlled XLOG entries, we
! 		 * need to lock out checkpoint start between writing our XLOG record
! 		 * and updating pg_clog.  Otherwise it is possible for the checkpoint
! 		 * to set REDO after the XLOG record but fail to flush the pg_clog
! 		 * update to disk, leading to loss of the transaction commit if we
! 		 * crash a little later.  Slightly klugy fix for problem discovered
! 		 * 2004-08-10.
  		 *
  		 * (If it made no transaction-controlled XLOG entries, its XID appears
  		 * nowhere in permanent storage, so no one else will ever care if it
  		 * committed; so it doesn't matter if we lose the commit flag.)
- 		 *
- 		 * Note we only need a shared lock.
  		 */
  		madeTCentries = (MyLastRecPtr.xrecoff != 0);
  		if (madeTCentries)
! 			LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  		/*
  		 * We only need to log the commit in XLOG if the transaction made any
--- 718,750 ----
  
  		/*
  		 * If our transaction made any transaction-controlled XLOG entries, we
! 		 * need to tell checkpoint to wait until we've updated pg_clog.
! 		 * Otherwise it is possible for the checkpoint to set REDO after the
! 		 * XLOG record but fail to flush the pg_clog update to disk, leading
! 		 * to loss of the transaction commit if we crash a little later.
  		 *
  		 * (If it made no transaction-controlled XLOG entries, its XID appears
  		 * nowhere in permanent storage, so no one else will ever care if it
  		 * committed; so it doesn't matter if we lose the commit flag.)
  		 */
  		madeTCentries = (MyLastRecPtr.xrecoff != 0);
  		if (madeTCentries)
! 		{
! 			/* Use volatile pointer to prevent code rearrangement.  We want
! 			 * to make sure that other backends (actually bgwriter) to see
! 			 * that we're committing before we insert a commit record.
! 			 *
! 			 * It's safe to set the inCommit flag of my own backend without
! 			 * holding the ProcArrayLock, we're the only one modifying it.
! 			 * Checkpoint will hold ProcArrayLock when reading the value,
! 			 * because it needs to atomically make note of the xid 
! 			 * associated with the flag.
! 			 *
! 			 * The flag is cleared later together with MyProc->xid.
! 			 */
! 			volatile PGPROC *myproc = MyProc;
! 			myproc->inCommit = true;
! 		}
  
  		/*
  		 * We only need to log the commit in XLOG if the transaction made any
***************
*** 825,834 ****
  			TransactionIdCommitTree(nchildren, children);
  		}
  
- 		/* Unlock checkpoint lock if we acquired it */
- 		if (madeTCentries)
- 			LWLockRelease(CheckpointStartLock);
- 
  		END_CRIT_SECTION();
  	}
  
--- 836,841 ----
***************
*** 1556,1561 ****
--- 1563,1569 ----
  		MyProc->xid = InvalidTransactionId;
  		MyProc->xmin = InvalidTransactionId;
  		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
+ 		MyProc->inCommit = false;
  
  		/* Clear the subtransaction-XID cache too while holding the lock */
  		MyProc->subxids.nxids = 0;
***************
*** 1795,1800 ****
--- 1803,1809 ----
  	MyProc->xid = InvalidTransactionId;
  	MyProc->xmin = InvalidTransactionId;
  	MyProc->inVacuum = false;	/* must be cleared with xid/xmin */
+ 	MyProc->inCommit = false;
  
  	/* Clear the subtransaction-XID cache too while holding the lock */
  	MyProc->subxids.nxids = 0;
***************
*** 1961,1966 ****
--- 1970,1976 ----
  		MyProc->xid = InvalidTransactionId;
  		MyProc->xmin = InvalidTransactionId;
  		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
+ 		MyProc->inCommit = false;
  
  		/* Clear the subtransaction-XID cache too while holding the lock */
  		MyProc->subxids.nxids = 0;
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.265
diff -c -r1.265 xlog.c
*** src/backend/access/transam/xlog.c	3 Mar 2007 20:02:26 -0000	1.265
--- src/backend/access/transam/xlog.c	3 Apr 2007 12:34:43 -0000
***************
*** 5356,5361 ****
--- 5356,5363 ----
  	int			nsegsadded = 0;
  	int			nsegsremoved = 0;
  	int			nsegsrecycled = 0;
+ 	TransactionId *inCommitXids;
+ 	int			nInCommit;
  
  	/*
  	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
***************
*** 5381,5394 ****
  	checkPoint.ThisTimeLineID = ThisTimeLineID;
  	checkPoint.time = time(NULL);
  
- 	/*
- 	 * We must hold CheckpointStartLock while determining the checkpoint REDO
- 	 * pointer.  This ensures that any concurrent transaction commits will be
- 	 * either not yet logged, or logged and recorded in pg_clog. See notes in
- 	 * RecordTransactionCommit().
- 	 */
- 	LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
- 
  	/* And we need WALInsertLock too */
  	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
  
--- 5383,5388 ----
***************
*** 5421,5433 ****
  			ControlFile->checkPointCopy.redo.xrecoff)
  		{
  			LWLockRelease(WALInsertLock);
- 			LWLockRelease(CheckpointStartLock);
  			LWLockRelease(CheckpointLock);
  			END_CRIT_SECTION();
  			return;
  		}
  	}
  
  	/*
  	 * Compute new REDO record ptr = location of next XLOG record.
  	 *
--- 5415,5441 ----
  			ControlFile->checkPointCopy.redo.xrecoff)
  		{
  			LWLockRelease(WALInsertLock);
  			LWLockRelease(CheckpointLock);
  			END_CRIT_SECTION();
  			return;
  		}
  	}
  
+ 	/* Make note of all transactions that have started but not yet finished
+ 	 * committing while we compute the new REDO ptr.
+ 	 *
+ 	 * NB: GetInCommitTransactions acquires ProcArrayLock lock while we're 
+ 	 * holding WALInsertLock.  That's not currently prone to deadlocks because
+ 	 * we always acquire them in the same order, but keep that in mind if you
+ 	 * fiddle with the locking.  It's important that we hold WALInsertLock,
+ 	 * otherwise a transaction might start committing and insert a commit 
+ 	 * record right after we call GetInCommitTransactions.  Holding the lock
+ 	 * ensures that even if someone starts committing after our 
+ 	 * GetInCommitTransactions call, he can't insert his commit record earlier
+ 	 * than our REDO pointer.
+ 	 */
+ 	inCommitXids = GetInCommitTransactions(&nInCommit);
+ 
  	/*
  	 * Compute new REDO record ptr = location of next XLOG record.
  	 *
***************
*** 5471,5478 ****
  	 */
  	LWLockRelease(WALInsertLock);
  
- 	LWLockRelease(CheckpointStartLock);
- 
  	if (!shutdown)
  		ereport(DEBUG2,
  				(errmsg("checkpoint starting")));
--- 5479,5484 ----
***************
*** 5509,5514 ****
--- 5515,5530 ----
  	 */
  	END_CRIT_SECTION();
  
+ 	/*
+ 	 * We must wait until all transactions that were committing when we 
+ 	 * determined the checkpoint REDO pointer are finished.  This ensures 
+ 	 * that any concurrent transaction commits will be either not yet logged,
+ 	 * or logged and recorded in pg_clog.  See notes in
+ 	 * RecordTransactionCommit().
+ 	 */
+ 	WaitForCommitsToFinish(inCommitXids, nInCommit);
+ 	pfree(inCommitXids);
+ 
  	CheckPointGuts(checkPoint.redo);
  
  	START_CRIT_SECTION();
Index: src/backend/storage/ipc/procarray.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/ipc/procarray.c,v
retrieving revision 1.23
diff -c -r1.23 procarray.c
*** src/backend/storage/ipc/procarray.c	25 Mar 2007 19:45:14 -0000	1.23
--- src/backend/storage/ipc/procarray.c	3 Apr 2007 12:31:28 -0000
***************
*** 740,745 ****
--- 740,841 ----
  }
  
  /*
+  * GetInCommitTransactions -- Returns an array of xids that are committing
+  *
+  * Gets a list of transactions that have started but not yet finished 
+  * committing.  Transactions in the middle of a PREPARE TRANSACTION are
+  * also returned.
+  *
+  * The meaning of commit-in-progress is that the transaction has inserted
+  * or is about to insert the commit record, but all on-disk changes have
+  * not been written yet.
+  *
+  * The returned array is palloc'd and the number of xids in it is stored
+  * in *nxids_ptr.
+  */
+ TransactionId *
+ GetInCommitTransactions(int *nxids_ptr)
+ {
+ 	ProcArrayStruct *arrayP = procArray;
+ 	TransactionId *result;
+ 	int	nxids;
+ 	int	index;
+ 
+ 	result = palloc(sizeof(TransactionId) * MaxBackends);
+ 	nxids = 0;
+ 
+ 	LWLockAcquire(ProcArrayLock, LW_SHARED);
+ 
+ 	for (index = 0; index < arrayP->numProcs; index++)
+ 	{
+ 		PGPROC	   *proc = arrayP->procs[index];
+ 
+ 		if (proc->inCommit)
+ 			result[nxids++] = proc->xid;
+ 	}
+ 
+ 	LWLockRelease(ProcArrayLock);
+ 
+ 	*nxids_ptr = nxids;
+ 	return result;
+ }
+ 
+ /*
+  * WaitForCommitsToFinish -- wait for given xids to finish committing
+  *
+  * NOTE: The contents of the input array are destroyed.
+  *
+  * NOTE: The current implementation is very inefficient.  Waiting is 
+  * implemented by sleeping and polling the proc array, and matching the
+  * in-progress commits with the ones to wait for has O(n^2) behavior.
+  * This is currently only used in checkpoints, so we can live with it.
+  */
+ void
+ WaitForCommitsToFinish(TransactionId *xids, int nxids)
+ {
+ 	TransactionId *currentlyInCommit;
+ 	int nCurrentlyInCommit;
+ 	int i, j;
+ 
+ 	for(;;)
+ 	{
+ 		currentlyInCommit = GetInCommitTransactions(&nCurrentlyInCommit);
+ 
+ 		/* Remove all entries from xids-array that are no longer 
+ 		 * commit-in-progress */
+ 		for(i = 0; i < nxids; i++)
+ 		{
+ 			bool found = false;
+ 			for(j = 0; j < nCurrentlyInCommit; j++)
+ 			{
+ 				if(TransactionIdEquals(currentlyInCommit[j], xids[i]))
+ 				{
+ 					found = true;
+ 					break;
+ 				}
+ 			}
+ 			if (!found)
+ 			{
+ 				/* This commit is no longer in progress.  Remove from array */
+ 				xids[i] = xids[nxids - 1];
+ 				nxids--;
+ 				i--;
+ 			}
+ 		}
+ 
+ 		pfree(currentlyInCommit);
+ 		
+ 		if(nxids == 0)
+ 		{
+ 			/* We're done, none of the original xids are inCommit anymore. */
+ 			break;
+ 		}
+ 
+ 		pg_usleep(100000L); /* sleep for 100 ms and try again */
+ 	}
+ }
+ 
+ /*
   * BackendPidGetProc -- get a backend's PGPROC given its PID
   *
   * Returns NULL if not found.  Note that it is up to the caller to be
Index: src/backend/storage/lmgr/proc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/lmgr/proc.c,v
retrieving revision 1.186
diff -c -r1.186 proc.c
*** src/backend/storage/lmgr/proc.c	7 Mar 2007 13:35:03 -0000	1.186
--- src/backend/storage/lmgr/proc.c	3 Apr 2007 09:08:05 -0000
***************
*** 259,264 ****
--- 259,265 ----
  	/* databaseId and roleId will be filled in later */
  	MyProc->databaseId = InvalidOid;
  	MyProc->roleId = InvalidOid;
+ 	MyProc->inCommit = false;
  	MyProc->inVacuum = false;
  	MyProc->isAutovacuum = IsAutoVacuumWorkerProcess();
  	MyProc->lwWaiting = false;
***************
*** 392,397 ****
--- 393,399 ----
  	MyProc->xmin = InvalidTransactionId;
  	MyProc->databaseId = InvalidOid;
  	MyProc->roleId = InvalidOid;
+ 	MyProc->inCommit = false;
  	MyProc->inVacuum = false;
  	MyProc->isAutovacuum = IsAutoVacuumLauncherProcess(); /* is this needed? */
  	MyProc->lwWaiting = false;
Index: src/include/storage/lwlock.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/lwlock.h,v
retrieving revision 1.34
diff -c -r1.34 lwlock.h
*** src/include/storage/lwlock.h	15 Feb 2007 23:23:23 -0000	1.34
--- src/include/storage/lwlock.h	3 Apr 2007 12:34:15 -0000
***************
*** 49,55 ****
  	WALWriteLock,
  	ControlFileLock,
  	CheckpointLock,
- 	CheckpointStartLock,
  	CLogControlLock,
  	SubtransControlLock,
  	MultiXactGenLock,
--- 49,54 ----
Index: src/include/storage/proc.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/proc.h,v
retrieving revision 1.96
diff -c -r1.96 proc.h
*** src/include/storage/proc.h	7 Mar 2007 13:35:03 -0000	1.96
--- src/include/storage/proc.h	3 Apr 2007 08:56:57 -0000
***************
*** 74,79 ****
--- 74,80 ----
  	Oid			databaseId;		/* OID of database this backend is using */
  	Oid			roleId;			/* OID of role using this backend */
  
+ 	bool		inCommit;		/* true if commit is in progress */
  	bool		inVacuum;		/* true if current xact is a LAZY VACUUM */
  	bool		isAutovacuum;	/* true if it's autovacuum */
  
Index: src/include/storage/procarray.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/procarray.h,v
retrieving revision 1.12
diff -c -r1.12 procarray.h
*** src/include/storage/procarray.h	16 Jan 2007 13:28:57 -0000	1.12
--- src/include/storage/procarray.h	3 Apr 2007 11:06:11 -0000
***************
*** 25,30 ****
--- 25,32 ----
  extern bool TransactionIdIsInProgress(TransactionId xid);
  extern bool TransactionIdIsActive(TransactionId xid);
  extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+ extern TransactionId *GetInCommitTransactions(int *nxids_ptr);
+ extern void WaitForCommitsToFinish(TransactionId *xids, int nxids);
  
  extern PGPROC *BackendPidGetProc(int pid);
  extern int	BackendXidGetPid(TransactionId xid);
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Reply via email to