*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1869,1888 **** SET ENABLE_SEQSCAN TO OFF;
          When the commit data for a transaction is flushed to disk, any
          additional commits ready at that time are also flushed out.
          <varname>commit_delay</varname> adds a time delay, set in
!         microseconds, before a transaction attempts to
!         flush the WAL buffer out to disk.  A nonzero delay can allow more
!         transactions to be committed with only one flush operation, if
!         system load is high enough that additional transactions become
!         ready to commit within the given interval. But the delay is
!         just wasted if no other transactions become ready to
!         commit. Therefore, the delay is only performed if at least
!         <varname>commit_siblings</varname> other transactions are
!         active at the instant that a server process has written its
!         commit record.
          The default <varname>commit_delay</> is zero (no delay).
-         Since all pending commit data will be written at every flush
-         regardless of this setting, it is rare that adding delay
-         by increasing this parameter will actually improve performance.
         </para>
        </listitem>
       </varlistentry>
--- 1869,1888 ----
          When the commit data for a transaction is flushed to disk, any
          additional commits ready at that time are also flushed out.
          <varname>commit_delay</varname> adds a time delay, set in
!         microseconds, before a leading transaction participating in
!         group commit attempts to flush the WAL buffer out to disk.
!         This can add latency of up to
!         <varname>commit_delay</varname> microseconds for each transaction.
!         A nonzero delay can allow more transactions to be committed with
!         only one flush operation, if system load is high enough that
!         additional transactions become ready to commit within the
!         given interval. However, the delay is just wasted if no other
!         transactions become ready to commit. Therefore, the delay
!         is only performed if at least <varname>commit_siblings</varname>
!         other transactions are active immediately before the leader
!         backend participating in group commit proceeds with flushing
!         WAL.
          The default <varname>commit_delay</> is zero (no delay).
         </para>
        </listitem>
       </varlistentry>
*** a/doc/src/sgml/wal.sgml
--- b/doc/src/sgml/wal.sgml
***************
*** 376,384 ****
     <acronym>WAL</acronym> to disk, in the hope that a single flush
     executed by one such transaction can also serve other transactions
     committing at about the same time.  Setting <varname>commit_delay</varname>
!    can only help when there are many concurrently committing transactions,
!    and it is difficult to tune it to a value that actually helps rather
!    than hurt throughput.
    </para>
  
   </sect1>
--- 376,382 ----
     <acronym>WAL</acronym> to disk, in the hope that a single flush
     executed by one such transaction can also serve other transactions
     committing at about the same time.  Setting <varname>commit_delay</varname>
!    can only help when there are many concurrently committing transactions.
    </para>
  
   </sect1>
*** a/src/backend/access/transam/xact.c
--- b/src/backend/access/transam/xact.c
***************
*** 68,76 **** bool		XactDeferrable;
  
  int			synchronous_commit = SYNCHRONOUS_COMMIT_ON;
  
- int			CommitDelay = 0;	/* precommit delay in microseconds */
- int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
- 
  /*
   * MyXactAccessedTempRel is set when a temporary relation is accessed.
   * We don't allow PREPARE TRANSACTION in that case.  (This is global
--- 68,73 ----
***************
*** 1123,1144 **** RecordTransactionCommit(void)
  	if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
  		forceSyncCommit || nrels > 0)
  	{
- 		/*
- 		 * Synchronous commit case:
- 		 *
- 		 * Sleep before flush! So we can flush more than one commit records
- 		 * per single fsync.  (The idea is some other backend may do the
- 		 * XLogFlush while we're sleeping.  This needs work still, because on
- 		 * most Unixen, the minimum select() delay is 10msec or more, which is
- 		 * way too long.)
- 		 *
- 		 * We do not sleep if enableFsync is not turned on, nor if there are
- 		 * fewer than CommitSiblings other backends with active transactions.
- 		 */
- 		if (CommitDelay > 0 && enableFsync &&
- 			MinimumActiveBackends(CommitSiblings))
- 			pg_usleep(CommitDelay);
- 
  		XLogFlush(XactLastRecEnd);
  
  		/*
--- 1120,1125 ----
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 80,85 **** bool		fullPageWrites = true;
--- 80,87 ----
  bool		log_checkpoints = false;
  int			sync_method = DEFAULT_SYNC_METHOD;
  int			wal_level = WAL_LEVEL_MINIMAL;
+ int			CommitDelay = 0;	/* precommit delay in microseconds */
+ int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  
  #ifdef WAL_DEBUG
  bool		XLOG_DEBUG = false;
***************
*** 2085,2118 **** XLogFlush(XLogRecPtr record)
  			 */
  			continue;
  		}
! 		/* Got the lock */
  		LogwrtResult = XLogCtl->LogwrtResult;
! 		if (!XLByteLE(record, LogwrtResult.Flush))
  		{
! 			/* try to write/flush later additions to XLOG as well */
! 			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
! 			{
! 				XLogCtlInsert *Insert = &XLogCtl->Insert;
! 				uint32		freespace = INSERT_FREESPACE(Insert);
  
! 				if (freespace == 0)		/* buffer is full */
! 					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
! 				else
! 				{
! 					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
! 					WriteRqstPtr -= freespace;
! 				}
! 				LWLockRelease(WALInsertLock);
! 				WriteRqst.Write = WriteRqstPtr;
! 				WriteRqst.Flush = WriteRqstPtr;
! 			}
  			else
  			{
! 				WriteRqst.Write = WriteRqstPtr;
! 				WriteRqst.Flush = record;
  			}
! 			XLogWrite(WriteRqst, false, false);
  		}
  		LWLockRelease(WALWriteLock);
  		/* done */
  		break;
--- 2087,2138 ----
  			 */
  			continue;
  		}
! 
! 		/* Got the lock; recheck whether request is satisfied */
  		LogwrtResult = XLogCtl->LogwrtResult;
! 		if (XLByteLE(record, LogwrtResult.Flush))
! 		{
! 			/*
! 			 * Someone else already flushed past our record.  We still hold
! 			 * WALWriteLock here, so release it before leaving the loop.
! 			 */
! 			LWLockRelease(WALWriteLock);
! 			break;
! 		}
! 
! 		/*
! 		 * Sleep before flush! By adding a delay here, we may give further
! 		 * backends the opportunity to join the backlog of group commit
! 		 * followers; this can significantly improve transaction throughput, at
! 		 * the risk of increasing transaction latency.
! 		 *
! 		 * We do not sleep if enableFsync is not turned on, nor if there are
! 		 * fewer than CommitSiblings other backends with active transactions.
! 		 */
! 		if (CommitDelay > 0 && enableFsync &&
! 			MinimumActiveBackends(CommitSiblings))
! 			pg_usleep(CommitDelay);
! 
! 		/* try to write/flush later additions to XLOG as well */
! 		if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
  		{
! 			XLogCtlInsert *Insert = &XLogCtl->Insert;
! 			uint32		freespace = INSERT_FREESPACE(Insert);
  
! 			if (freespace == 0)		/* buffer is full */
! 				WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
  			else
  			{
! 				WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
! 				WriteRqstPtr -= freespace;
  			}
! 			LWLockRelease(WALInsertLock);
! 			WriteRqst.Write = WriteRqstPtr;
! 			WriteRqst.Flush = WriteRqstPtr;
  		}
+ 		else
+ 		{
+ 			WriteRqst.Write = WriteRqstPtr;
+ 			WriteRqst.Flush = record;
+ 		}
+ 		XLogWrite(WriteRqst, false, false);
+ 
  		LWLockRelease(WALWriteLock);
  		/* done */
  		break;
