On Thu, 2008-08-07 at 12:44 +0100, Simon Riggs wrote:
> I would like to propose some changes to the infrastructure for recovery.
> These changes are beneficial in themselves, but also form the basis for
> other work we might later contemplate.
> 
> Currently 
> * the startup process performs restartpoints during recovery
> * the death of the startup process is tied directly to the change of
> state in the postmaster following recovery
> 
> I propose to
> * have startup process signal postmaster when it starts Redo phase (if
> it starts it)

> Decoupling things in this way allows us to
> 1. arrange for the bgwriter to start during Redo, so it can:
> i) clean dirty blocks for the startup process
> ii) perform restartpoints in background
>   Both of these aspects will increase performance of recovery

Taking into account comments from Tom and Alvaro

Included patch with the following changes:

* new postmaster mode known as consistent recovery, entered only when
recovery passes safe/consistent point. InRedo is now set in all
processes when started/connected in consistent recovery mode.

* bgwriter and stats process start in consistent recovery mode.
bgwriter changes mode when startup process completes.

* bgwriter now performs restartpoints and also cleans shared_buffers
while the startup process performs redo apply

* recovery.conf parameter log_restartpoints is now deprecated, since
its function overlaps too much with log_checkpoints. I've kept the
distinction between restartpoints and checkpoints in code, to avoid
convoluted code. Minor change, not critical.

[Replying to one of Alvaro's other comments: Startup process still uses
XLogReadBuffer. I'm not planning on changing that either, at least not
in this patch.]

Patch doesn't conflict with rmgr plugin patch.

Passes make check, but that's easy.
Various other tests all seem to be working.

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c	11 Aug 2008 11:05:10 -0000	1.317
--- src/backend/access/transam/xlog.c	31 Aug 2008 19:07:40 -0000
***************
*** 131,137 ****
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
--- 131,136 ----
***************
*** 386,392 ****
  static XLogRecord *nextRecord = NULL;
  static TimeLineID lastPageTLI = 0;
  
! static bool InRedo = false;
  
  
  static void XLogArchiveNotify(const char *xlog);
--- 385,391 ----
  static XLogRecord *nextRecord = NULL;
  static TimeLineID lastPageTLI = 0;
  
! bool InRedo = false;
  
  
  static void XLogArchiveNotify(const char *xlog);
***************
*** 480,485 ****
--- 479,488 ----
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
  
+ 	/* cross-check on whether we should be here or not */
+ 	if (InRedo)
+ 		elog(FATAL, "cannot write new WAL data during recovery mode");
+ 
  	/* info's high bits are reserved for use by me */
  	if (info & XLR_INFO_MASK)
  		elog(PANIC, "invalid xlog info mask %02X", info);
***************
*** 2051,2057 ****
  		unlink(tmppath);
  	}
  
! 	elog(DEBUG2, "done creating and filling new WAL file");
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
--- 2054,2061 ----
  		unlink(tmppath);
  	}
  
! 	XLogFileName(tmppath, ThisTimeLineID, log, seg);
! 	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
***************
*** 4532,4546 ****
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
- 			/*
- 			 * does nothing if a recovery_target is not also set
- 			 */
- 			if (!parse_bool(tok2, &recoveryLogRestartpoints))
- 				  ereport(ERROR,
- 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
  			ereport(LOG,
! 					(errmsg("log_restartpoints = %s", tok2)));
  		}
  		else
  			ereport(FATAL,
--- 4536,4544 ----
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
  			ereport(LOG,
! 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					  errmsg("parameter \"log_restartpoints\" has been deprecated")));
  		}
  		else
  			ereport(FATAL,
***************
*** 4811,4816 ****
--- 4809,4815 ----
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
+ 	bool		reachedSafeStopPoint = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
***************
*** 5039,5044 ****
--- 5038,5048 ----
  		UpdateControlFile();
  
  		/*
+ 		 * Reset pgstat data, because it may be invalid after recovery.
+ 		 */
+ 		pgstat_reset_all();
+ 
+ 		/*
  		 * If there was a backup label file, it's done its job and the info
  		 * has now been propagated into pg_control.  We must get rid of the
  		 * label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5152,5172 ----
  
  				LastRec = ReadRecPtr;
  
+ 				/*
+ 				 * Have we reached our safe stopping point? If so, we can
+ 				 * signal Postmaster to enter consistent recovery mode
+ 				 */
+ 				if (!reachedSafeStopPoint && 
+ 					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ 				{
+ 					reachedSafeStopPoint = true;
+ 					ereport(LOG,
+ 						(errmsg("consistent recovery state reached at %X/%X",
+ 							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ 					if (IsUnderPostmaster)
+ 						SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ 				}
+ 
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
***************
*** 5169,5174 ****
--- 5188,5194 ----
  			/* there are no WAL records following the checkpoint */
  			ereport(LOG,
  					(errmsg("redo is not required")));
+ 			reachedSafeStopPoint = true;
  		}
  	}
  
***************
*** 5184,5190 ****
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
--- 5204,5210 ----
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (InRecovery && !reachedSafeStopPoint)
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
***************
*** 5305,5314 ****
  		 */
  		XLogCheckInvalidPages();
  
! 		/*
! 		 * Reset pgstat data, because it may be invalid after recovery.
! 		 */
! 		pgstat_reset_all();
  
  		/*
  		 * Perform a checkpoint to update all our recovery activity to disk.
--- 5325,5332 ----
  		 */
  		XLogCheckInvalidPages();
  
! 		if (IsUnderPostmaster)
! 			BgWriterCompleteRestartPointImmediately();
  
  		/*
  		 * Perform a checkpoint to update all our recovery activity to disk.
***************
*** 5318,5323 ****
--- 5336,5344 ----
  		 * assigning a new TLI, using a shutdown checkpoint allows us to have
  		 * the rule that TLI only changes in shutdown checkpoints, which
  		 * allows some extra error checking in xlog_redo.
+ 		 *
+ 		 * Note that this will wait behind any restartpoint that the bgwriter
+ 		 * is currently performing, though will be much faster as a result.
  		 */
  		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
  	}
***************
*** 5372,5377 ****
--- 5393,5401 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 
+ 	if (IsUnderPostmaster)
+ 		BgWriterRecoveryComplete();
  }
  
  /*
***************
*** 5642,5648 ****
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(void)
  {
  	long		write_secs,
  				sync_secs,
--- 5666,5672 ----
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(bool checkpoint)
  {
  	long		write_secs,
  				sync_secs,
***************
*** 5665,5673 ****
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
  		 "%d transaction log file(s) added, %d removed, %d recycled; "
  		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
  		 CheckpointStats.ckpt_bufs_written,
  		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  		 CheckpointStats.ckpt_segs_added,
--- 5689,5698 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
  		 "%d transaction log file(s) added, %d removed, %d recycled; "
  		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ 		 (checkpoint ? "  checkpoint" : "restartpoint"),
  		 CheckpointStats.ckpt_bufs_written,
  		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  		 CheckpointStats.ckpt_segs_added,
***************
*** 6002,6008 ****
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd();
  
  	LWLockRelease(CheckpointLock);
  }
--- 6027,6033 ----
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd(true);
  
  	LWLockRelease(CheckpointLock);
  }
***************
*** 6071,6099 ****
  			}
  	}
  
  	/*
! 	 * OK, force data out to disk
! 	 */
! 	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
! 
! 	/*
! 	 * Update pg_control so that any subsequent crash will restart from this
! 	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
! 	 * record itself.
  	 */
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ReadRecPtr;
  	ControlFile->checkPointCopy = *checkPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
  
! 	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
  					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 	if (recoveryLastXTime)
! 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
  }
  
  /*
--- 6096,6164 ----
  			}
  	}
  
+ 	if (recoveryLastXTime)
+ 		ereport((log_checkpoints ? LOG : DEBUG2),
+ 				(errmsg("last completed transaction was at log time %s",
+ 						timestamptz_to_str(recoveryLastXTime))));
  	/*
! 	 * Update ControlFile data in shared memory. 
! 	 * Note: ReadRecPtr gives the XLOG address of the checkpoint record itself.
  	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ReadRecPtr;
  	ControlFile->checkPointCopy = *checkPoint;
+ 	RequestRestartPoint();
+ 	LWLockRelease(ControlFileLock);
+ }
+ 
+ /*
+  * Perform a restartpoint (run by bgwriter, or inline when standalone).
+  */
+ void
+ CreateRestartPoint(void)
+ {
+ 	CheckPoint *checkPoint;
+ 
+ 	if (log_checkpoints)
+ 	{
+ 		/*
+ 		 * Prepare to accumulate statistics.
+ 		 */
+ 
+ 		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ 		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ 
+ 		elog(LOG, "restartpoint starting:");
+ 	}
+ 
+ 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+ 
+ 	checkPoint = &ControlFile->checkPointCopy;
+ 
+ 	/*
+ 	 * Flush dirty buffers; flags = 0, i.e. CHECKPOINT_IMMEDIATE is not set.
+ 	 */
+ 	CheckPointGuts(checkPoint->redo, 0);
+ 
+ 	/*
+ 	 * Update pg_control, using current time
+ 	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
+ 	LWLockRelease(ControlFileLock);
+ 
+ 	/* Log as a restartpoint (false), before releasing CheckpointLock. */
+ 	if (log_checkpoints)
+ 		LogCheckpointEnd(false);
  
! 	ereport((log_checkpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
  					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 
! 	LWLockRelease(CheckpointLock);
! 
  }
  
  /*
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c	11 Aug 2008 11:05:11 -0000	1.51
--- src/backend/postmaster/bgwriter.c	31 Aug 2008 19:45:39 -0000
***************
*** 122,127 ****
--- 122,129 ----
  {
  	pid_t		bgwriter_pid;	/* PID of bgwriter (0 if not started) */
  
+ 	bool		InRedo;
+ 
  	slock_t		ckpt_lck;		/* protects all the ckpt_* fields */
  
  	int			ckpt_started;	/* advances when checkpoint starts */
***************
*** 166,172 ****
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
--- 168,174 ----
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;	/* not used if InRedo */
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
***************
*** 186,191 ****
--- 188,208 ----
  static void ReqCheckpointHandler(SIGNAL_ARGS);
  static void ReqShutdownHandler(SIGNAL_ARGS);
  
+ /* Clear the shared InRedo flag; the bgwriter polls it and then leaves recovery mode */
+ void
+ BgWriterRecoveryComplete(void)
+ {
+ 	BgWriterShmem->InRedo = false;
+ 	elog(DEBUG1, "recovery complete"); 
+ }
+ 
+ /* ask bgwriter to finish any restartpoint at once; NOTE(review): overwrites ckpt_flags without holding ckpt_lck */
+ void
+ BgWriterCompleteRestartPointImmediately(void)
+ {
+ 	BgWriterShmem->ckpt_flags = CHECKPOINT_IMMEDIATE;
+ 	elog(DEBUG2, "asking bgwriter to complete any restartpoint with zero delay"); 
+ }
  
  /*
   * Main entry point for bgwriter process
***************
*** 202,207 ****
--- 219,230 ----
  	BgWriterShmem->bgwriter_pid = MyProcPid;
  	am_bg_writer = true;
  
+ 	/* 
+ 	 * Follow the postmaster's current mode at startup. If we are InRedo then
+ 	 * the startup process will later tell us when it is complete.
+ 	 */
+ 	BgWriterShmem->InRedo = InRedo;
+ 
  	/*
  	 * If possible, make this process a group leader, so that the postmaster
  	 * can signal any child processes too.	(bgwriter probably never has any
***************
*** 356,371 ****
  	 */
  	PG_SETMASK(&UnBlockSig);
  
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
- 		bool		do_checkpoint = false;
- 		int			flags = 0;
- 		pg_time_t	now;
- 		int			elapsed_secs;
- 
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
--- 379,393 ----
  	 */
  	PG_SETMASK(&UnBlockSig);
  
+ 	if (InRedo)
+ 		elog(DEBUG1, "bgwriter starting in recovery mode, pid = %u", 
+ 			BgWriterShmem->bgwriter_pid);
+ 
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
- 		if (shutdown_requested)
- 		{
- 			/*
- 			 * From here on, elog(ERROR) should end with exit(1), not send
- 			 * control back to the sigsetjmp block above
- 			 */
- 			ExitOnAnyError = true;
- 			/* Close down the database */
- 			ShutdownXLOG(0, 0);
- 			DumpFreeSpaceMap(0, 0);
- 			/* Normal exit from the bgwriter is here */
- 			proc_exit(0);		/* done */
- 		}
- 
- 		/*
- 		 * Force a checkpoint if too much time has elapsed since the last one.
- 		 * Note that we count a timed checkpoint in stats only when this
- 		 * occurs without an external request, but we set the CAUSE_TIME flag
- 		 * bit even if there is also an external request.
- 		 */
- 		now = (pg_time_t) time(NULL);
- 		elapsed_secs = now - last_checkpoint_time;
- 		if (elapsed_secs >= CheckPointTimeout)
- 		{
- 			if (!do_checkpoint)
- 				BgWriterStats.m_timed_checkpoints++;
- 			do_checkpoint = true;
- 			flags |= CHECKPOINT_CAUSE_TIME;
- 		}
  
! 		/*
! 		 * Do a checkpoint if requested, otherwise do one cycle of
! 		 * dirty-buffer writing.
! 		 */
! 		if (do_checkpoint)
  		{
- 			/* use volatile pointer to prevent code rearrangement */
- 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
- 
  			/*
! 			 * Atomically fetch the request flags to figure out what kind of a
! 			 * checkpoint we should perform, and increase the started-counter
! 			 * to acknowledge that we've started a new checkpoint.
  			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			flags |= bgs->ckpt_flags;
! 			bgs->ckpt_flags = 0;
! 			bgs->ckpt_started++;
! 			SpinLockRelease(&bgs->ckpt_lck);
  
! 			/*
! 			 * We will warn if (a) too soon since last checkpoint (whatever
! 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 			 * since the last checkpoint start.  Note in particular that this
! 			 * implementation will not generate warnings caused by
! 			 * CheckPointTimeout < CheckPointWarning.
! 			 */
! 			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 				elapsed_secs < CheckPointWarning)
! 				ereport(LOG,
! 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 								elapsed_secs),
! 						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  
! 			/*
! 			 * Initialize bgwriter-private variables used during checkpoint.
! 			 */
! 			ckpt_active = true;
! 			ckpt_start_recptr = GetInsertRecPtr();
! 			ckpt_start_time = now;
! 			ckpt_cached_elapsed = 0;
  
! 			/*
! 			 * Do the checkpoint.
! 			 */
! 			CreateCheckPoint(flags);
  
  			/*
! 			 * After any checkpoint, close all smgr files.	This is so we
! 			 * won't hang onto smgr references to deleted files indefinitely.
  			 */
! 			smgrcloseall();
  
  			/*
! 			 * Indicate checkpoint completion to any waiting backends.
  			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			bgs->ckpt_done = bgs->ckpt_started;
! 			SpinLockRelease(&bgs->ckpt_lck);
! 
! 			ckpt_active = false;
  
  			/*
! 			 * Note we record the checkpoint start time not end time as
! 			 * last_checkpoint_time.  This is so that time-driven checkpoints
! 			 * happen at a predictable spacing.
  			 */
! 			last_checkpoint_time = now;
! 		}
! 		else
! 			BgBufferSync();
  
! 		/* Check for archive_timeout and switch xlog files if necessary. */
! 		CheckArchiveTimeout();
  
! 		/* Nap for the configured time. */
! 		BgWriterNap();
  	}
  }
  
--- 405,610 ----
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
  
! 		if (InRedo)
  		{
  			/*
! 			 * Check to see whether startup process has completed redo.
! 			 * If so, we can permanently change out of recovery mode.
  			 */
! 			if (BgWriterShmem->InRedo == false)
! 			{
  
! 				elog(LOG, "changing to InRedo = false");
  
! 				InitXLOGAccess();
! 				InRedo = false;
  
! 				/*
! 				 * Start time-driven events from now
! 				 */
! 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! 			}
! 
! 			if (checkpoint_requested) 
! 			{
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_time = (pg_time_t) time(NULL);
! 				ckpt_cached_elapsed = 0;
! 
! 				CreateRestartPoint();
! 
! 				ckpt_active = false;
! 				checkpoint_requested = false;
! 				/* 
! 				 * Reset any flags if we requested immediate completion part 
! 				 * way through the restart point
! 				 */
! 				BgWriterShmem->ckpt_flags = 0;
! 			}
! 			else
! 			{
! 				/* Clean buffers dirtied by recovery */
! 				BgBufferSync();
! 
! 				/* Nap for the configured time. */
! 				BgWriterNap();
! 			}
! 
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Check to see whether startup process has completed redo.
! 			 * If so, we can permanently change out of recovery mode.
  			 */
! 			if (BgWriterShmem->InRedo == false)
! 			{
! 				elog(DEBUG2, "changing to InRedo = false");
! 
! 				InitXLOGAccess();
! 				InRedo = false;
! 
! 				/*
! 				 * Start time-driven events from now
! 				 */
! 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! 			}
! 		}
! 		else	/* Normal processing */
! 		{
! 			bool		do_checkpoint = false;
! 			int			flags = 0;
! 			pg_time_t	now;
! 			int			elapsed_secs;
! 
! 			Assert(!InRedo);
! 
! 			if (checkpoint_requested) 
! 			{
! 				checkpoint_requested = false;
! 				do_checkpoint = true;
! 				BgWriterStats.m_requested_checkpoints++;
! 			}
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Close down the database */
! 				ShutdownXLOG(0, 0);
! 				DumpFreeSpaceMap(0, 0);
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Force a checkpoint if too much time has elapsed since the last one.
! 			 * Note that we count a timed checkpoint in stats only when this
! 			 * occurs without an external request, but we set the CAUSE_TIME flag
! 			 * bit even if there is also an external request.
  			 */
! 			now = (pg_time_t) time(NULL);
! 			elapsed_secs = now - last_checkpoint_time;
! 			if (elapsed_secs >= CheckPointTimeout)
! 			{
! 				if (!do_checkpoint)
! 					BgWriterStats.m_timed_checkpoints++;
! 				do_checkpoint = true;
! 				flags |= CHECKPOINT_CAUSE_TIME;
! 			}
  
  			/*
! 			 * Do a checkpoint if requested, otherwise do one cycle of
! 			 * dirty-buffer writing.
  			 */
! 			if (do_checkpoint)
! 			{
! 				/* use volatile pointer to prevent code rearrangement */
! 				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 				/*
! 				 * Atomically fetch the request flags to figure out what kind of a
! 				 * checkpoint we should perform, and increase the started-counter
! 				 * to acknowledge that we've started a new checkpoint.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				flags |= bgs->ckpt_flags;
! 				bgs->ckpt_flags = 0;
! 				bgs->ckpt_started++;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				/*
! 				 * We will warn if (a) too soon since last checkpoint (whatever
! 				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 				 * since the last checkpoint start.  Note in particular that this
! 				 * implementation will not generate warnings caused by
! 				 * CheckPointTimeout < CheckPointWarning.
! 				 */
! 				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 					elapsed_secs < CheckPointWarning)
! 					ereport(LOG,
! 							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 									elapsed_secs),
! 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_recptr = GetInsertRecPtr();
! 				ckpt_start_time = now;
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Do the checkpoint.
! 				 */
! 				CreateCheckPoint(flags);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				/*
! 				 * Indicate checkpoint completion to any waiting backends.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				bgs->ckpt_done = bgs->ckpt_started;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * Note we record the checkpoint start time not end time as
! 				 * last_checkpoint_time.  This is so that time-driven checkpoints
! 				 * happen at a predictable spacing.
! 				 */
! 				last_checkpoint_time = now;
! 			}
! 			else
! 				BgBufferSync();
  
! 			/* Check for archive_timeout and switch xlog files if necessary. */
! 			CheckArchiveTimeout();
  
! 			/* Nap for the configured time. */
! 			BgWriterNap();
! 		}
  	}
  }
  
***************
*** 588,594 ****
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
--- 697,704 ----
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		if (!InRedo)
! 			AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
***************
*** 642,647 ****
--- 752,770 ----
  	if (!am_bg_writer)
  		return;
  
+ 	/* Perform minimal duties during recovery and skip wait if requested */
+ 	if (InRedo)
+ 	{
+ 		BgBufferSync();
+ 
+ 		if (!ImmediateCheckpointRequested() &&
+ 			!shutdown_requested &&
+ 			IsCheckpointOnSchedule(progress))
+ 			BgWriterNap();
+ 
+ 		return;
+ 	}
+ 
  	/*
  	 * Perform the usual bgwriter duties and take a nap, unless we're behind
  	 * schedule, in which case we just try to catch up as quickly as possible.
***************
*** 716,731 ****
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	recptr = GetInsertRecPtr();
! 	elapsed_xlogs =
! 		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 		CheckPointSegments;
! 
! 	if (progress < elapsed_xlogs)
  	{
! 		ckpt_cached_elapsed = elapsed_xlogs;
! 		return false;
  	}
  
  	/*
--- 839,857 ----
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	if (!InRedo)
  	{
! 		recptr = GetInsertRecPtr();
! 		elapsed_xlogs =
! 			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 			CheckPointSegments;
! 
! 		if (progress < elapsed_xlogs)
! 		{
! 			ckpt_cached_elapsed = elapsed_xlogs;
! 			return false;
! 		}
  	}
  
  	/*
***************
*** 966,971 ****
--- 1092,1118 ----
  	}
  }
  
+ void
+ RequestRestartPoint(void)
+ {
+ 	/*
+ 	 * Request a restartpoint.  Standalone backend: no bgwriter, so do it here.
+ 	 */
+ 	if (!IsPostmasterEnvironment)
+ 	{
+ 		CreateRestartPoint();	/* NOTE(review): caller holds ControlFileLock, retaken inside -- confirm */
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Signal bgwriter with SIGINT; never kill() pid 0 (= whole process group).
+ 	 */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 	else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");
+ }
+ 
  /*
   * ForwardFsyncRequest
   *		Forward a file-fsync request from a backend to the bgwriter
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c	26 Jun 2008 02:47:19 -0000	1.561
--- src/backend/postmaster/postmaster.c	31 Aug 2008 17:10:31 -0000
***************
*** 254,259 ****
--- 254,264 ----
  {
  	PM_INIT,					/* postmaster starting */
  	PM_STARTUP,					/* waiting for startup subprocess */
+ 	PM_RECOVERY,				/* consistent recovery mode; state only
+ 								 * entered for archive and streaming recovery,
+ 								 * and only after the point where
+ 								 * all data is in a consistent state.
+ 								 */
  	PM_RUN,						/* normal "database is alive" state */
  	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
  	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
***************
*** 2104,2110 ****
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
--- 2109,2115 ----
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
***************
*** 2136,2141 ****
--- 2141,2147 ----
  			 * Otherwise, commence normal operations.
  			 */
  			pmState = PM_RUN;
+ 			InRedo = false;
  
  			/*
  			 * Load the flat authorization file into postmaster's cache. The
***************
*** 2148,2155 ****
  			 * Crank up the background writer.	It doesn't matter if this
  			 * fails, we'll just try again later.
  			 */
! 			Assert(BgWriterPID == 0);
! 			BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
--- 2154,2161 ----
  			 * Crank up the background writer.	It doesn't matter if this
  			 * fails, we'll just try again later.
  			 */
! 			if (BgWriterPID == 0)
! 				BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
***************
*** 2812,2817 ****
--- 2818,2825 ----
  	 */
  	MyCancelKey = PostmasterRandom();
  
+ 	InRedo = (pmState != PM_RUN);
+ 
  	/*
  	 * Make room for backend data structure.  Better before the fork() so we
  	 * can handle failure cleanly.
***************
*** 3821,3826 ****
--- 3829,3880 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 			InRedo = true;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database yet,
+ 			 * so it may change following recovery.
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer.	It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* XXX at this point we could accept read-only connections */
+ 			ereport(DEBUG1,
+ 				 (errmsg("database system is in consistent recovery mode")));
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
Index: src/include/miscadmin.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/miscadmin.h,v
retrieving revision 1.202
diff -c -r1.202 miscadmin.h
*** src/include/miscadmin.h	23 Apr 2008 13:44:59 -0000	1.202
--- src/include/miscadmin.h	31 Aug 2008 11:23:45 -0000
***************
*** 64,69 ****
--- 64,71 ----
   *
   *****************************************************************************/
  
+ extern bool InRedo;
+ 
  /* in globals.c */
  /* these are marked volatile because they are set by signal handlers: */
  extern PGDLLIMPORT volatile bool InterruptPending;
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h	12 May 2008 08:35:05 -0000	1.88
--- src/include/access/xlog.h	31 Aug 2008 13:33:43 -0000
***************
*** 205,210 ****
--- 205,211 ----
  extern void ShutdownXLOG(int code, Datum arg);
  extern void InitXLOGAccess(void);
  extern void CreateCheckPoint(int flags);
+ extern void CreateRestartPoint(void);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
  extern XLogRecPtr GetInsertRecPtr(void);
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h	11 Aug 2008 11:05:11 -0000	1.12
--- src/include/postmaster/bgwriter.h	31 Aug 2008 15:35:30 -0000
***************
*** 25,36 ****
--- 25,40 ----
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(void);
  extern void CheckpointWriteDelay(int flags, double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
  								BlockNumber segno);
  extern void AbsorbFsyncRequests(void);
  
+ extern void BgWriterRecoveryComplete(void);
+ extern void BgWriterCompleteRestartPointImmediately(void);
+ 
  extern Size BgWriterShmemSize(void);
  extern void BgWriterShmemInit(void);
  
Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h	19 Jun 2008 21:32:56 -0000	1.20
--- src/include/storage/pmsignal.h	31 Aug 2008 11:26:40 -0000
***************
*** 22,27 ****
--- 22,28 ----
   */
  typedef enum
  {
+ 	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
  	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
  	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
  	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */
-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches

Reply via email to