diff -cpr HEAD/doc/src/sgml/config.sgml LDC_V41/doc/src/sgml/config.sgml
*** HEAD/doc/src/sgml/config.sgml	Fri Apr 20 11:37:37 2007
--- LDC_V41/doc/src/sgml/config.sgml	Wed Apr 25 15:07:11 2007
*************** SET ENABLE_SEQSCAN TO OFF;
*** 1565,1570 ****
--- 1565,1619 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-checkpoint-write-percent" xreflabel="checkpoint_write_percent">
+       <term><varname>checkpoint_write_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_write_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         To spread out the work of a checkpoint, the writing of all dirty
+         buffers in the shared buffer pool is paced over the specified time.
+         The default value is 50.0 (50% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
+      <varlistentry id="guc-checkpoint-nap-percent" xreflabel="checkpoint_nap_percent">
+       <term><varname>checkpoint_nap_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_nap_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the delay between writing out all dirty buffers and flushing
+         all modified files. This gives the kernel's disk writer time to flush
+         dirty buffers, reducing the work left for the following flush phase.
+         The default value is 10.0 (10% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
+      <varlistentry id="guc-checkpoint-sync-percent" xreflabel="checkpoint_sync_percent">
+       <term><varname>checkpoint_sync_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_sync_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         To spread out the work of a checkpoint, the flushing of all modified
+         files to disk is paced over the specified time.
+         The default value is 20.0 (20% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
        <term><varname>checkpoint_warning</varname> (<type>integer</type>)</term>
        <indexterm>
diff -cpr HEAD/src/backend/access/transam/xlog.c LDC_V41/src/backend/access/transam/xlog.c
*** HEAD/src/backend/access/transam/xlog.c	Wed Apr  4 01:34:35 2007
--- LDC_V41/src/backend/access/transam/xlog.c	Wed Apr 25 15:07:11 2007
*************** static void readRecoveryCommandFile(void
*** 399,405 ****
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
--- 399,405 ----
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
*************** GetRedoRecPtr(void)
*** 5274,5279 ****
--- 5274,5296 ----
  }
  
  /*
+  * GetInsertRecPtr -- Returns the approximate current insert position.
+  */
+ XLogRecPtr
+ GetInsertRecPtr(void)
+ {
+ 	volatile XLogCtlData *xlogctl = XLogCtl;
+ 	XLogRecPtr		recptr;
+ 
+ 	/* info_lck guards LogwrtRqst, not Insert; result may lag slightly */
+ 	SpinLockAcquire(&xlogctl->info_lck);
+ 	recptr = xlogctl->LogwrtRqst.Write;
+ 	SpinLockRelease(&xlogctl->info_lck);
+ 
+ 	return recptr;
+ }
+ 
+ /*
   * Get the time of the last xlog segment switch
   */
  time_t
*************** CreateCheckPoint(bool shutdown, bool for
*** 5546,5552 ****
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo);
  
  	START_CRIT_SECTION();
  
--- 5563,5569 ----
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo, force);
  
  	START_CRIT_SECTION();
  
*************** CreateCheckPoint(bool shutdown, bool for
*** 5652,5663 ****
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool();			/* performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
--- 5669,5680 ----
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool(immediate);		/* performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
*************** RecoveryRestartPoint(const CheckPoint *c
*** 5706,5712 ****
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
--- 5723,5729 ----
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, true);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
diff -cpr HEAD/src/backend/commands/dbcommands.c LDC_V41/src/backend/commands/dbcommands.c
*** HEAD/src/backend/commands/dbcommands.c	Fri Apr 13 00:04:35 2007
--- LDC_V41/src/backend/commands/dbcommands.c	Wed Apr 25 15:07:11 2007
*************** createdb(const CreatedbStmt *stmt)
*** 400,406 ****
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync();
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
--- 400,406 ----
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync(true);
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
*************** dbase_redo(XLogRecPtr lsn, XLogRecord *r
*** 1417,1423 ****
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync();
  
  		/*
  		 * Copy this subdirectory to the new location
--- 1417,1423 ----
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync(true);
  
  		/*
  		 * Copy this subdirectory to the new location
diff -cpr HEAD/src/backend/postmaster/bgwriter.c LDC_V41/src/backend/postmaster/bgwriter.c
*** HEAD/src/backend/postmaster/bgwriter.c	Sat Mar 31 03:34:55 2007
--- LDC_V41/src/backend/postmaster/bgwriter.c	Wed Apr 25 15:07:11 2007
***************
*** 44,49 ****
--- 44,50 ----
  #include "postgres.h"
  
  #include <signal.h>
+ #include <sys/time.h>
  #include <time.h>
  #include <unistd.h>
  
*************** typedef struct
*** 117,122 ****
--- 118,124 ----
  	sig_atomic_t ckpt_failed;	/* advances when checkpoint fails */
  
  	sig_atomic_t ckpt_time_warn;	/* warn if too soon since last ckpt? */
+ 	sig_atomic_t ckpt_force;		/* any waiter for the checkpoint? */
  
  	int			num_requests;	/* current # of requests */
  	int			max_requests;	/* allocated array size */
*************** PgStat_MsgBgWriter BgWriterStats;
*** 138,143 ****
--- 140,148 ----
  int			BgWriterDelay = 200;
  int			CheckPointTimeout = 300;
  int			CheckPointWarning = 30;
+ double		checkpoint_write_percent = 50.0;
+ double		checkpoint_nap_percent = 10.0;
+ double		checkpoint_sync_percent = 20.0;
  
  /*
   * Flags set by interrupt handlers for later service in the main loop.
*************** static bool am_bg_writer = false;
*** 153,162 ****
--- 158,175 ----
  
  static bool ckpt_active = false;
  
+ static time_t		ckpt_start_time;
+ static XLogRecPtr	ckpt_start_recptr;
+ static double		ckpt_progress_at_sync_start;
+ 
  static time_t last_checkpoint_time;
  static time_t last_xlog_switch_time;
  
  
+ static void CheckArchiveTimeout(void);
+ static void BgWriterNap(long msec);
+ static bool NextCheckpointRequested(void);
+ static double GetCheckpointTargetProgress(void);
  static void bg_quickdie(SIGNAL_ARGS);
  static void BgSigHupHandler(SIGNAL_ARGS);
  static void ReqCheckpointHandler(SIGNAL_ARGS);
*************** BackgroundWriterMain(void)
*** 343,349 ****
  		bool		force_checkpoint = false;
  		time_t		now;
  		int			elapsed_secs;
- 		long		udelay;
  
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
--- 356,361 ----
*************** BackgroundWriterMain(void)
*** 362,374 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			force_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
  		if (shutdown_requested)
  		{
  			/*
--- 374,379 ----
*************** BackgroundWriterMain(void)
*** 389,399 ****
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
  			do_checkpoint = true;
! 			if (!force_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
--- 394,410 ----
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (checkpoint_requested)
  		{
+ 			checkpoint_requested = false;
+ 			force_checkpoint = BgWriterShmem->ckpt_force;
  			do_checkpoint = true;
! 			BgWriterStats.m_requested_checkpoints++;
! 		}
! 		else if (elapsed_secs >= CheckPointTimeout)
! 		{
! 			do_checkpoint = true;
! 			BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
*************** BackgroundWriterMain(void)
*** 416,428 ****
--- 427,444 ----
  								elapsed_secs),
  						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  			BgWriterShmem->ckpt_time_warn = false;
+ 			BgWriterShmem->ckpt_force = false;
  
  			/*
  			 * Indicate checkpoint start to any waiting backends.
  			 */
  			ckpt_active = true;
+ 			elog(DEBUG1, "CHECKPOINT: start");
  			BgWriterShmem->ckpt_started++;
  
+ 			ckpt_start_time = now;
+ 			ckpt_start_recptr = GetInsertRecPtr();
+ 			ckpt_progress_at_sync_start = 0;
  			CreateCheckPoint(false, force_checkpoint);
  
  			/*
*************** BackgroundWriterMain(void)
*** 435,440 ****
--- 451,457 ----
  			 * Indicate checkpoint completion to any waiting backends.
  			 */
  			BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+ 			elog(DEBUG1, "CHECKPOINT: end");
  			ckpt_active = false;
  
  			/*
*************** BackgroundWriterMain(void)
*** 451,458 ****
  		 * Check for archive_timeout, if so, switch xlog files.  First we do a
  		 * quick check using possibly-stale local state.
  		 */
! 		if (XLogArchiveTimeout > 0 &&
! 			(int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
--- 468,495 ----
  		 * Check for archive_timeout, if so, switch xlog files.  First we do a
  		 * quick check using possibly-stale local state.
  		 */
! 		CheckArchiveTimeout();
! 
! 		/* Nap for the configured time. */
! 		BgWriterNap(0);
! 	}
! }
! 
! /*
!  * CheckArchiveTimeout -- check for archive_timeout
!  */
! static void
! CheckArchiveTimeout(void)
! {
! 	time_t		now;
! 
! 	if (XLogArchiveTimeout <= 0)
! 		return;
! 
! 	now = time(NULL);
! 	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
! 		return;
! 
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
*************** BackgroundWriterMain(void)
*** 462,471 ****
  
  			last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
  
- 			/* if we did a checkpoint, 'now' might be stale too */
- 			if (do_checkpoint)
- 				now = time(NULL);
- 
  			/* Now we can do the real check */
  			if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  			{
--- 499,504 ----
*************** BackgroundWriterMain(void)
*** 490,495 ****
--- 523,540 ----
  				last_xlog_switch_time = now;
  			}
  		}
+ }
+ 
+ /*
+  * BgWriterNap -- short nap in bgwriter
+  *
+  * Nap for the shorter of the normal bgwriter nap time and mdelay (in msec);
+  * an mdelay of zero imposes no upper bound. There is no return value.
+  */
+ static void
+ BgWriterNap(long mdelay)
+ {
+ 	long		udelay;
  
  		/*
  		 * Send off activity statistics to the stats collector
*************** BackgroundWriterMain(void)
*** 515,520 ****
--- 560,569 ----
  		else
  			udelay = 10000000L; /* Ten seconds */
  
+ 		/* Clamp the delay to the upper bound. */
+ 		if (mdelay > 0)
+ 			udelay = Min(udelay, mdelay * 1000L);
+ 
  		while (udelay > 999999L)
  		{
  			if (got_SIGHUP || checkpoint_requested || shutdown_requested)
*************** BackgroundWriterMain(void)
*** 526,534 ****
--- 575,745 ----
  
  		if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
  			pg_usleep(udelay);
+ }
+ 
+ /*
+  * CheckpointWriteDelay -- nap once if write progress (0..1) is ahead of target
+  */
+ void
+ CheckpointWriteDelay(double progress)
+ {
+ 	if (!ckpt_active || checkpoint_write_percent <= 0)
+ 		return;
+ 
+ 	elog(DEBUG1, "CheckpointWriteDelay: progress=%.3f", progress);
+ 
+ 	if (!NextCheckpointRequested() &&	/* also services SIGHUP */
+ 		progress * checkpoint_write_percent > GetCheckpointTargetProgress())
+ 	{
+ 		AbsorbFsyncRequests();
+ 		BgLruBufferSync();
+ 		BgWriterNap(0);		/* 0: nap for the normal bgwriter delay */
  	}
  }
  
+ /*
+  * CheckpointNapDelay -- nap between the write and sync phases (percent: 0-100)
+  */
+ void
+ CheckpointNapDelay(double percent)
+ {
+ 	if (!ckpt_active)
+ 		return;
+ 
+ 	if (percent > 0)
+ 	{
+ 		double	ckpt_progress_at_nap_start = GetCheckpointTargetProgress();
+ 		double	remain;
+ 
+ 		/*
+ 		 * If the write phase finished in less than its configured share,
+ 		 * shorten this nap phase by the same ratio.
+ 		 */
+ 		if (ckpt_progress_at_nap_start < checkpoint_write_percent)
+ 			percent *= ckpt_progress_at_nap_start / checkpoint_write_percent;
+ 
+ 		/*
+ 		 * If the write phase overran its deadline, clamp the nap so that
+ 		 * the share reserved for the sync phase is still left over.
+ 		 */
+ 		percent = Min(percent,
+ 			100.0 - ckpt_progress_at_nap_start - checkpoint_sync_percent);
+ 
+ 		if (percent > 0)
+ 		{
+ 			elog(DEBUG1, "CheckpointNapDelay: %f%%", percent);
+ 
+ 			while (!NextCheckpointRequested() &&
+ 				(remain = ckpt_progress_at_nap_start + percent
+ 					- GetCheckpointTargetProgress()) > 0)
+ 			{
+ 				long	msec = (long)
+ 					(CheckPointTimeout * 1000.0 * remain / 100.0);
+ 
+ 				AbsorbFsyncRequests();
+ 				BgLruBufferSync();
+ 				BgWriterNap(Max(msec, 1L));	/* 0 would mean "no limit" */
+ 			}
+ 		}
+ 	}
+ 
+ 	ckpt_progress_at_sync_start = GetCheckpointTargetProgress();	/* sync baseline */
+ }
+ 
+ /*
+  * CheckpointSyncDelay -- pace the sync phase; progress is a 0..1 fraction
+  */
+ void
+ CheckpointSyncDelay(double progress, double percent)
+ {
+ 	double	remain;
+ 
+ 	if (!ckpt_active || percent <= 0)
+ 		return;
+ 
+ 	elog(DEBUG1, "CheckpointSyncDelay: progress=%.3f", progress);
+ 
+ 	while (!NextCheckpointRequested() &&
+ 		(remain = ckpt_progress_at_sync_start + progress * percent
+ 			- GetCheckpointTargetProgress()) > 0)
+ 	{
+ 		long	msec = (long) (CheckPointTimeout * 1000.0 * remain / 100.0);
+ 
+ 		AbsorbFsyncRequests();
+ 		BgLruBufferSync();
+ 		BgWriterNap(Max(msec, 1L));	/* 0 would mean "no limit" */
+ 	}
+ }
+ 
+ /*
+  * NextCheckpointRequested -- true if the current checkpoint should stop napping
+  *
+  *	As a side effect, handles SIGHUP and checks archive_timeout.
+  */
+ static bool
+ NextCheckpointRequested(void)
+ {
+ 	if (!am_bg_writer || !ckpt_active)
+ 		return true;		/* nothing to pace here, so never nap */
+ 
+ 	/* Stop napping on a request, shutdown or timeout, and flag a request. */
+ 	if (checkpoint_requested || shutdown_requested ||
+ 		(time(NULL) - ckpt_start_time >= CheckPointTimeout))
+ 	{
+ 		elog(DEBUG1, "NextCheckpointRequested");
+ 		checkpoint_requested = true;
+ 		return true;
+ 	}
+ 
+ 	/* Process reload signals. */
+ 	if (got_SIGHUP)
+ 	{
+ 		got_SIGHUP = false;
+ 		ProcessConfigFile(PGC_SIGHUP);
+ 	}
+ 
+ 	/* Check for archive_timeout; napping is left to the caller. */
+ 	CheckArchiveTimeout();
+ 
+ 	return false;
+ }
+ 
+ /*
+  * GetCheckpointTargetProgress -- how far along (0-100%) the checkpoint should be
+  */
+ static double
+ GetCheckpointTargetProgress(void)
+ {
+ 	struct timeval	now;
+ 	XLogRecPtr		recptr;
+ 	double			progress_in_time,
+ 					progress_in_xlog;
+ 	double			percent;
+ 
+ 	Assert(ckpt_active);
+ 
+ 	/* fraction of checkpoint_timeout elapsed since the checkpoint started */
+ 	gettimeofday(&now, NULL);
+ 	progress_in_time = ((double) (now.tv_sec - ckpt_start_time) +
+ 		now.tv_usec / 1000000.0) / CheckPointTimeout;
+ 
+ 	/* WAL inserted since checkpoint start, as a fraction of checkpoint_segments */
+ 	recptr = GetInsertRecPtr();
+ 	progress_in_xlog =
+ 		(((double) recptr.xlogid - (double) ckpt_start_recptr.xlogid) * XLogSegsPerFile +
+ 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+ 		CheckPointSegments;
+ 
+ 	percent = 100.0 * Max(progress_in_time, progress_in_xlog);	/* larger one governs */
+ 	if (percent > 100.0)
+ 		percent = 100.0;	/* only the upper bound is clamped */
+ 
+ 	elog(DEBUG2, "GetCheckpointTargetProgress: time=%.3f, xlog=%.3f",
+ 		progress_in_time, progress_in_xlog);
+ 
+ 	return percent;
+ }
+ 
  
  /* --------------------------------
   *		signal handler routines
*************** RequestCheckpoint(bool waitforit, bool w
*** 668,673 ****
--- 879,886 ----
  	/* Set warning request flag if appropriate */
  	if (warnontime)
  		bgs->ckpt_time_warn = true;
+ 	if (waitforit)
+ 		bgs->ckpt_force = true;
  
  	/*
  	 * Send signal to request checkpoint.  When waitforit is false, we
diff -cpr HEAD/src/backend/storage/buffer/bufmgr.c LDC_V41/src/backend/storage/buffer/bufmgr.c
*** HEAD/src/backend/storage/buffer/bufmgr.c	Sat Mar 31 03:34:55 2007
--- LDC_V41/src/backend/storage/buffer/bufmgr.c	Wed Apr 25 15:07:11 2007
*************** UnpinBuffer(volatile BufferDesc *buf, bo
*** 947,957 ****
   * This is called at checkpoint time to write out all dirty shared buffers.
   */
  void
! BufferSync(void)
  {
  	int			buf_id;
  	int			num_to_scan;
  	int			absorb_counter;
  
  	/*
  	 * Find out where to start the circular scan.
--- 947,960 ----
   * This is called at checkpoint time to write out all dirty shared buffers.
   */
  void
! BufferSync(bool immediate)
  {
  	int			buf_id;
  	int			num_to_scan;
+ 	int			num_written;
  	int			absorb_counter;
+ 	int			writes_per_nap = (bgwriter_all_maxpages > 0 ?
+ 					bgwriter_all_maxpages : WRITES_PER_ABSORB);
  
  	/*
  	 * Find out where to start the circular scan.
*************** BufferSync(void)
*** 965,970 ****
--- 968,974 ----
  	 * Loop over all buffers.
  	 */
  	num_to_scan = NBuffers;
+ 	num_written = 0;
  	absorb_counter = WRITES_PER_ABSORB;
  	while (num_to_scan-- > 0)
  	{
*************** BufferSync(void)
*** 972,977 ****
--- 976,988 ----
  		{
  			BgWriterStats.m_buf_written_checkpoints++;
  
+ 			if (!immediate && ++num_written >= writes_per_nap)
+ 			{
+ 				num_written = 0;
+ 				CheckpointWriteDelay(
+ 					(double) (NBuffers - num_to_scan) / NBuffers);
+ 			}
+ 
  			/*
  			 * If in bgwriter, absorb pending fsync requests after each
  			 * WRITES_PER_ABSORB write operations, to prevent overflow of the
*************** void
*** 998,1004 ****
  BgBufferSync(void)
  {
  	static int	buf_id1 = 0;
- 	int			buf_id2;
  	int			num_to_scan;
  	int			num_written;
  
--- 1009,1014 ----
*************** BgBufferSync(void)
*** 1044,1049 ****
--- 1054,1072 ----
  		BgWriterStats.m_buf_written_all += num_written;
  	}
  
+ 	BgLruBufferSync();
+ }
+ 
+ /*
+  * BgLruBufferSync -- Write out some lru dirty buffers in the pool.
+  */
+ void
+ BgLruBufferSync(void)
+ {
+ 	int			buf_id2;
+ 	int			num_to_scan;
+ 	int			num_written;
+ 
  	/*
  	 * This loop considers only unpinned buffers close to the clock sweep
  	 * point.
*************** PrintBufferLeakWarning(Buffer buffer)
*** 1286,1295 ****
   * flushed.
   */
  void
! FlushBufferPool(void)
  {
! 	BufferSync();
! 	smgrsync();
  }
  
  
--- 1309,1324 ----
   * flushed.
   */
  void
! FlushBufferPool(bool immediate)
  {
! 	elog(DEBUG1, "CHECKPOINT: write phase");
! 	BufferSync(immediate || checkpoint_write_percent <= 0);	/* 0% => unpaced */
! 
! 	elog(DEBUG1, "CHECKPOINT: nap phase");
! 	CheckpointNapDelay(immediate ? 0 : checkpoint_nap_percent);	/* 0 => no nap */
! 
! 	elog(DEBUG1, "CHECKPOINT: sync phase");
! 	smgrsync(immediate || checkpoint_sync_percent <= 0);	/* 0% => unpaced */
  }
  
  
diff -cpr HEAD/src/backend/storage/smgr/md.c LDC_V41/src/backend/storage/smgr/md.c
*** HEAD/src/backend/storage/smgr/md.c	Fri Apr 13 02:10:55 2007
--- LDC_V41/src/backend/storage/smgr/md.c	Wed Apr 25 15:07:11 2007
*************** mdimmedsync(SMgrRelation reln)
*** 863,875 ****
   *	mdsync() -- Sync previous writes to stable storage.
   */
  void
! mdsync(void)
  {
  	static bool mdsync_in_progress = false;
  
  	HASH_SEQ_STATUS hstat;
  	PendingOperationEntry *entry;
  	int			absorb_counter;
  
  	/*
  	 * This is only called during checkpoints, and checkpoints should only
--- 863,878 ----
   *	mdsync() -- Sync previous writes to stable storage.
   */
  void
! mdsync(bool immediate)
  {
  	static bool mdsync_in_progress = false;
  
  	HASH_SEQ_STATUS hstat;
  	PendingOperationEntry *entry;
  	int			absorb_counter;
+ 	double		sync_percent = checkpoint_sync_percent;
+ 	double		progress = 0;	/* progress in bytes */
+ 	double		total = 0;		/* total filesize to fsync in bytes */
  
  	/*
  	 * This is only called during checkpoints, and checkpoints should only
*************** mdsync(void)
*** 910,922 ****
  	 * From a performance point of view it doesn't matter anyway, as this
  	 * path will never be taken in a system that's functioning normally.
  	 */
! 	if (mdsync_in_progress)
  	{
  		/* prior try failed, so update any stale cycle_ctr values */
  		hash_seq_init(&hstat, pendingOpsTable);
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
! 			entry->cycle_ctr = mdsync_cycle_ctr;
  		}
  	}
  
--- 913,958 ----
  	 * From a performance point of view it doesn't matter anyway, as this
  	 * path will never be taken in a system that's functioning normally.
  	 */
! 	if (mdsync_in_progress || (enableFsync && !immediate))
  	{
  		/* prior try failed, so update any stale cycle_ctr values */
  		hash_seq_init(&hstat, pendingOpsTable);
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
! 			if (mdsync_in_progress)
! 				entry->cycle_ctr = mdsync_cycle_ctr;
! 			else if (enableFsync && !immediate && !entry->canceled)
! 			{
! 				SMgrRelation	reln;
! 				MdfdVec		   *seg;
! 				long			len;
! 
! 				/* Sum up lengths of the files on non-immediate case. */
! 				reln = smgropen(entry->tag.rnode);
! 				seg = _mdfd_getseg(reln,
! 						entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
! 						true, EXTENSION_RETURN_NULL);
! 				if (seg != NULL &&
! 					(len = FileSeek(seg->mdfd_vfd, 0, SEEK_END)) >= 0)
! 					total += len;
! 			}
! 		}
! 
! 		/*
! 		 * Cut down the duration of the phase if there is only small area
! 		 * to be fsync-ed.
! 		 */
! 		if (total > 0 && bgwriter_all_maxpages > 0)
! 		{
! 			double	flush_per_delay = bgwriter_all_maxpages * 10;
! 
! 			sync_percent = total / BLCKSZ * BgWriterDelay / flush_per_delay
! 				/ 1000.0 / CheckPointTimeout * 100.0;
! 
! 			if (sync_percent > checkpoint_sync_percent)
! 				sync_percent = checkpoint_sync_percent;
! 
! 			elog(DEBUG1, "CHECKPOINT: adjust checkpoint_sync_percent to %f", sync_percent);
  		}
  	}
  
*************** mdsync(void)
*** 931,936 ****
--- 967,974 ----
  	hash_seq_init(&hstat, pendingOpsTable);
  	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  	{
+ 		long			seglen = -1;
+ 
  		/*
  		 * If the entry is new then don't process it this time.  Note that
  		 * "continue" bypasses the hash-remove call at the bottom of the loop.
*************** mdsync(void)
*** 1010,1016 ****
--- 1048,1058 ----
  								   false, EXTENSION_RETURN_NULL);
  				if (seg != NULL &&
  					FileSync(seg->mdfd_vfd) >= 0)
+ 				{
+ 					if (total > 0)
+ 						seglen = FileSeek(seg->mdfd_vfd, 0, SEEK_END);
  					break;		/* success; break out of retry loop */
+ 				}
  
  				/*
  				 * XXX is there any point in allowing more than one retry?
*************** mdsync(void)
*** 1054,1059 ****
--- 1096,1110 ----
  		if (hash_search(pendingOpsTable, &entry->tag,
  						HASH_REMOVE, NULL) == NULL)
  			elog(ERROR, "pendingOpsTable corrupted");
+ 
+ 		/*
+ 		 * Nap in proportion to the size of the segment just synced.
+ 		 */
+ 		if (seglen > 0 && total > 0)
+ 		{
+ 			progress += seglen;
+ 			CheckpointSyncDelay(progress / total, sync_percent);
+ 		}
  	}	/* end loop over hashtable entries */
  
  	/* Flag successful completion of mdsync */
diff -cpr HEAD/src/backend/storage/smgr/smgr.c LDC_V41/src/backend/storage/smgr/smgr.c
*** HEAD/src/backend/storage/smgr/smgr.c	Sat Jan  6 07:19:39 2007
--- LDC_V41/src/backend/storage/smgr/smgr.c	Wed Apr 25 15:07:11 2007
***************
*** 21,26 ****
--- 21,27 ----
  #include "access/xlogutils.h"
  #include "commands/tablespace.h"
  #include "pgstat.h"
+ #include "postmaster/bgwriter.h"
  #include "storage/bufmgr.h"
  #include "storage/freespace.h"
  #include "storage/ipc.h"
*************** typedef struct f_smgr
*** 57,63 ****
  	void		(*smgr_immedsync) (SMgrRelation reln);
  	void		(*smgr_commit) (void);	/* may be NULL */
  	void		(*smgr_abort) (void);	/* may be NULL */
! 	void		(*smgr_sync) (void);	/* may be NULL */
  } f_smgr;
  
  
--- 58,64 ----
  	void		(*smgr_immedsync) (SMgrRelation reln);
  	void		(*smgr_commit) (void);	/* may be NULL */
  	void		(*smgr_abort) (void);	/* may be NULL */
! 	void		(*smgr_sync) (bool immediate);	/* may be NULL */
  } f_smgr;
  
  
*************** smgrabort(void)
*** 781,794 ****
   *	smgrsync() -- Sync files to disk at checkpoint time.
   */
  void
! smgrsync(void)
  {
  	int			i;
  
  	for (i = 0; i < NSmgr; i++)
  	{
  		if (smgrsw[i].smgr_sync)
! 			(*(smgrsw[i].smgr_sync)) ();
  	}
  }
  
--- 782,795 ----
   *	smgrsync() -- Sync files to disk at checkpoint time.
   */
  void
! smgrsync(bool immediate)
  {
  	int			i;
  
  	for (i = 0; i < NSmgr; i++)
  	{
  		if (smgrsw[i].smgr_sync)
! 			(*(smgrsw[i].smgr_sync))(immediate);
  	}
  }
  
diff -cpr HEAD/src/backend/utils/misc/guc.c LDC_V41/src/backend/utils/misc/guc.c
*** HEAD/src/backend/utils/misc/guc.c	Sun Apr 22 12:52:40 2007
--- LDC_V41/src/backend/utils/misc/guc.c	Wed Apr 25 15:07:11 2007
*************** static struct config_real ConfigureNames
*** 1832,1837 ****
--- 1832,1864 ----
  		0.1, 0.0, 100.0, NULL, NULL
  	},
  
+ 	{
+ 		{"checkpoint_write_percent", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the duration percentage of write phase in checkpoints."),
+ 			NULL
+ 		},
+ 		&checkpoint_write_percent,
+ 		50.0, 0.0, 100.0, NULL, NULL
+ 	},
+ 
+ 	{
+ 		{"checkpoint_nap_percent", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the duration percentage between write and sync phases in checkpoints."),
+ 			NULL
+ 		},
+ 		&checkpoint_nap_percent,
+ 		10.0, 0.0, 100.0, NULL, NULL
+ 	},
+ 
+ 	{
+ 		{"checkpoint_sync_percent", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the duration percentage of sync phase in checkpoints."),
+ 			NULL
+ 		},
+ 		&checkpoint_sync_percent,
+ 		20.0, 0.0, 100.0, NULL, NULL
+ 	},
+ 
  	/* End-of-list marker */
  	{
  		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL
diff -cpr HEAD/src/backend/utils/misc/postgresql.conf.sample LDC_V41/src/backend/utils/misc/postgresql.conf.sample
*** HEAD/src/backend/utils/misc/postgresql.conf.sample	Thu Apr 19 01:44:18 2007
--- LDC_V41/src/backend/utils/misc/postgresql.conf.sample	Wed Apr 25 15:07:11 2007
***************
*** 168,173 ****
--- 168,176 ----
  
  #checkpoint_segments = 3		# in logfile segments, min 1, 16MB each
  #checkpoint_timeout = 5min		# range 30s-1h
+ #checkpoint_write_percent = 50.0		# duration percentage in write phase
+ #checkpoint_nap_percent = 10.0		# duration percentage between write and sync phases
+ #checkpoint_sync_percent = 20.0		# duration percentage in sync phase
  #checkpoint_warning = 30s		# 0 is off
  
  # - Archiving -
diff -cpr HEAD/src/include/access/xlog.h LDC_V41/src/include/access/xlog.h
*** HEAD/src/include/access/xlog.h	Sat Jan  6 07:19:51 2007
--- LDC_V41/src/include/access/xlog.h	Wed Apr 25 15:07:11 2007
*************** extern void InitXLOGAccess(void);
*** 165,170 ****
--- 165,171 ----
  extern void CreateCheckPoint(bool shutdown, bool force);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
+ extern XLogRecPtr GetInsertRecPtr(void);
  extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
  
  #endif   /* XLOG_H */
diff -cpr HEAD/src/include/postmaster/bgwriter.h LDC_V41/src/include/postmaster/bgwriter.h
*** HEAD/src/include/postmaster/bgwriter.h	Sat Jan  6 07:19:57 2007
--- LDC_V41/src/include/postmaster/bgwriter.h	Wed Apr 25 15:07:11 2007
***************
*** 20,29 ****
--- 20,35 ----
  extern int	BgWriterDelay;
  extern int	CheckPointTimeout;
  extern int	CheckPointWarning;
+ extern double	checkpoint_write_percent;
+ extern double	checkpoint_nap_percent;
+ extern double	checkpoint_sync_percent;
  
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(bool waitforit, bool warnontime);
+ extern void CheckpointWriteDelay(double progress);
+ extern void CheckpointNapDelay(double percent);
+ extern void CheckpointSyncDelay(double progress, double percent);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void AbsorbFsyncRequests(void);
diff -cpr HEAD/src/include/storage/bufmgr.h LDC_V41/src/include/storage/bufmgr.h
*** HEAD/src/include/storage/bufmgr.h	Sat Jan  6 07:19:57 2007
--- LDC_V41/src/include/storage/bufmgr.h	Wed Apr 25 15:07:11 2007
*************** extern char *ShowBufferUsage(void);
*** 125,131 ****
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(void);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
--- 125,131 ----
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(bool immediate);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
*************** extern void LockBufferForCleanup(Buffer 
*** 150,157 ****
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(void);
  extern void BgBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
--- 150,158 ----
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(bool immediate);
  extern void BgBufferSync(void);
+ extern void BgLruBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
diff -cpr HEAD/src/include/storage/smgr.h LDC_V41/src/include/storage/smgr.h
*** HEAD/src/include/storage/smgr.h	Thu Jan 18 01:25:01 2007
--- LDC_V41/src/include/storage/smgr.h	Wed Apr 25 15:07:11 2007
*************** extern void AtSubAbort_smgr(void);
*** 82,88 ****
  extern void PostPrepare_smgr(void);
  extern void smgrcommit(void);
  extern void smgrabort(void);
! extern void smgrsync(void);
  
  extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec);
--- 82,88 ----
  extern void PostPrepare_smgr(void);
  extern void smgrcommit(void);
  extern void smgrabort(void);
! extern void smgrsync(bool immediate);
  
  extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec);
*************** extern void mdwrite(SMgrRelation reln, B
*** 103,109 ****
  extern BlockNumber mdnblocks(SMgrRelation reln);
  extern void mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp);
  extern void mdimmedsync(SMgrRelation reln);
! extern void mdsync(void);
  
  extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
--- 103,109 ----
  extern BlockNumber mdnblocks(SMgrRelation reln);
  extern void mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp);
  extern void mdimmedsync(SMgrRelation reln);
! extern void mdsync(bool immediate);
  
  extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
