Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Simon Riggs Thu, 18 Sep 2008 05:18:15 -0700

On Thu, 2008-09-18 at 09:05 +0100, Simon Riggs wrote:

> Feels like I should shutdown the bgwriter after recovery and then
> allow it to be cranked up again after normal processing starts, and do
> all of this through postmaster state changes. That way bgwriter
> doesn't need to do a dynamic state change.


This approach appears to be working nicely so far. Some ugly bits of
former patch removed.

Patch passes basic tests and changes state cleanly. 

Restarting test cycle on this patch now, confirm tomorrow.

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support

Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c	11 Aug 2008 11:05:10 -0000	1.317
--- src/backend/access/transam/xlog.c	18 Sep 2008 11:39:07 -0000
***************
*** 119,124 ****
--- 119,126 ----
  
  /* Are we doing recovery from XLOG? */
  bool		InRecovery = false;
+ bool		reachedSafeStopPoint = false;
+ bool		StartupCanWriteWAL = false;
  
  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;
***************
*** 131,137 ****
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
--- 133,138 ----
***************
*** 286,295 ****
--- 287,298 ----
  /*
   * Total shared-memory state for XLOG.
   */
+ #define	XLOGCTL_BUFFER_SPACING	128
  typedef struct XLogCtlData
  {
  	/* Protected by WALInsertLock: */
  	XLogCtlInsert Insert;
+ 	char	InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
  
  	/* Protected by info_lck: */
  	XLogwrtRqst LogwrtRqst;
***************
*** 297,305 ****
--- 300,315 ----
  	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
  	TransactionId ckptXid;
  	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
+ 	/* add data structure padding for above info_lck declarations */
+ 	char	InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) 
+ 												- sizeof(XLogwrtResult)
+ 												- sizeof(uint32)
+ 												- sizeof(TransactionId)
+ 												- sizeof(XLogRecPtr)];
  
  	/* Protected by WALWriteLock: */
  	XLogCtlWrite Write;
+ 	char	WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
  
  	/*
  	 * These values do not change after startup, although the pointed-to pages
***************
*** 311,316 ****
--- 321,337 ----
  	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
  	TimeLineID	ThisTimeLineID;
  
+ 	/*
+ 	 * canWriteWAL changes at the end of recovery only and is only ever set
+ 	 * by the Startup process. We assume that changes to it are atomic,
+ 	 * so accesses to it is never locked. When it does change bgwriter
+ 	 * must immediately begin using it, since this helps it decide whether
+ 	 * to flush WAL or not when it writes dirty blocks. If bgwriter does
+ 	 * it too soon, we will write invalid WAL records and if it reflects the
+ 	 * change too late it could skip flushing WAL for a data block change.
+ 	 */
+ 	bool		canWriteWAL;
+ 
  	slock_t		info_lck;		/* locks shared variables shown above */
  } XLogCtlData;
  
***************
*** 480,485 ****
--- 501,510 ----
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
  
+ 	/* cross-check on whether we should be here or not */
+ 	if (!(canWriteWAL() || (InRecovery && StartupCanWriteWAL)))
+ 		elog(FATAL, "cannot make new WAL entries during recovery");
+ 
  	/* info's high bits are reserved for use by me */
  	if (info & XLR_INFO_MASK)
  		elog(PANIC, "invalid xlog info mask %02X", info);
***************
*** 1677,1684 ****
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	/* Disabled during REDO */
! 	if (InRedo)
  		return;
  
  	/* Quick exit if already known flushed */
--- 1702,1709 ----
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	/* Disabled during StartupXLog */
! 	if (!canWriteWAL())
  		return;
  
  	/* Quick exit if already known flushed */
***************
*** 1766,1774 ****
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while InRedo is true, but if the bad page is brought in
! 	 * and marked dirty during recovery then CreateCheckPoint will try to
! 	 * flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
--- 1791,1799 ----
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while canWriteWAL() is false, but if the bad page is
! 	 * brought in and marked dirty during recovery then CreateCheckPoint will
! 	 * try to flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
***************
*** 2051,2057 ****
  		unlink(tmppath);
  	}
  
! 	elog(DEBUG2, "done creating and filling new WAL file");
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
--- 2076,2083 ----
  		unlink(tmppath);
  	}
  
! 	XLogFileName(tmppath, ThisTimeLineID, log, seg);
! 	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
***************
*** 4532,4546 ****
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
- 			/*
- 			 * does nothing if a recovery_target is not also set
- 			 */
- 			if (!parse_bool(tok2, &recoveryLogRestartpoints))
- 				  ereport(ERROR,
- 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
  			ereport(LOG,
! 					(errmsg("log_restartpoints = %s", tok2)));
  		}
  		else
  			ereport(FATAL,
--- 4558,4566 ----
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
  			ereport(LOG,
! 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					  errmsg("parameter \"log_restartpoints\" has been deprecated")));
  		}
  		else
  			ereport(FATAL,
***************
*** 4823,4828 ****
--- 4843,4850 ----
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
+ 	XLogCtl->canWriteWAL = false;
+ 
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
***************
*** 5039,5044 ****
--- 5061,5071 ----
  		UpdateControlFile();
  
  		/*
+ 		 * Reset pgstat data, because it may be invalid after recovery.
+ 		 */
+ 		pgstat_reset_all();
+ 
+ 		/*
  		 * If there was a backup label file, it's done its job and the info
  		 * has now been propagated into pg_control.  We must get rid of the
  		 * label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5175,5195 ----
  
  				LastRec = ReadRecPtr;
  
+ 				/*
+ 				 * Have we reached our safe stopping point? If so, we can
+ 				 * signal Postmaster to enter consistent recovery mode
+ 				 */
+ 				if (!reachedSafeStopPoint && 
+ 					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ 				{
+ 					reachedSafeStopPoint = true;
+ 					ereport(LOG,
+ 						(errmsg("consistent recovery state reached at %X/%X",
+ 							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ 					if (IsUnderPostmaster)
+ 						SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ 				}
+ 
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
***************
*** 5169,5174 ****
--- 5211,5217 ----
  			/* there are no WAL records following the checkpoint */
  			ereport(LOG,
  					(errmsg("redo is not required")));
+ 			reachedSafeStopPoint = true;
  		}
  	}
  
***************
*** 5184,5190 ****
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
--- 5227,5233 ----
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (InRecovery && !reachedSafeStopPoint)
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
***************
*** 5306,5314 ****
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Reset pgstat data, because it may be invalid after recovery.
  		 */
! 		pgstat_reset_all();
  
  		/*
  		 * Perform a checkpoint to update all our recovery activity to disk.
--- 5349,5365 ----
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Let Postmaster know we are about to write Shutdown checkpoint,
! 		 * so that we can shutdown bgwriter cleanly and be left in peace
! 		 * to get on with this.
  		 */
! 		if (IsUnderPostmaster)
! 			SendPostmasterSignal(PMSIGNAL_RECOVERY_CKPT);
! 
! 		/*
! 		 * Startup process is now allowed to write WAL records
! 		 */
! 		StartupCanWriteWAL = true;
  
  		/*
  		 * Perform a checkpoint to update all our recovery activity to disk.
***************
*** 5318,5323 ****
--- 5369,5377 ----
  		 * assigning a new TLI, using a shutdown checkpoint allows us to have
  		 * the rule that TLI only changes in shutdown checkpoints, which
  		 * allows some extra error checking in xlog_redo.
+ 		 *
+ 		 * Note that this will wait behind any restartpoint that the bgwriter
+ 		 * is currently performing, though will be much faster as a result.
  		 */
  		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
  	}
***************
*** 5372,5377 ****
--- 5426,5442 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 
+ 	/*
+ 	 * Lastly, allow everybody else to write WAL now
+ 	 */
+ 	XLogCtl->canWriteWAL = true;
+ }
+ 
+ bool
+ canWriteWAL(void)
+ {
+ 	return XLogCtl->canWriteWAL;
  }
  
  /*
***************
*** 6071,6099 ****
  			}
  	}
  
  	/*
! 	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
  
  	/*
! 	 * Update pg_control so that any subsequent crash will restart from this
! 	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
! 	 * record itself.
  	 */
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadRecPtr;
! 	ControlFile->checkPointCopy = *checkPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
  
! 	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 	if (recoveryLastXTime)
! 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
  }
  
  /*
--- 6136,6200 ----
  			}
  	}
  
+ 	if (recoveryLastXTime)
+ 		ereport((log_checkpoints ? LOG : DEBUG2),
+ 				(errmsg("last completed transaction was at log time %s",
+ 						timestamptz_to_str(recoveryLastXTime))));
+ 
+ 	RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStopPoint);
+ }
+ 
+ /*
+  * As of 8.4, RestartPoints are always created by the bgwriter
+  * once we have reachedSafeStopPoint. We use bgwriter's shared memory
+  * area wherever we call it from, to keep better code structure.
+  */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint)
+ {
+ 	if (log_checkpoints)
+ 	{
+ 		/*
+ 		 * Prepare to accumulate statistics.
+ 		 */
+ 
+ 		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ 		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ 
+ 		/*
+ 		 * Do the restartpoint equivalent of LogCheckpointStart()
+ 		 */
+ 		elog(LOG, "restartpoint starting:");
+ 	}
+ 
+ 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+ 
  	/*
! 	 * OK, write out dirty blocks smoothly
  	 */
! 	CheckPointGuts(restartPoint->redo, 0);
  
  	/*
! 	 * Update pg_control, using current time
  	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadPtr;
! 	ControlFile->checkPointCopy = *restartPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
+ 	LWLockRelease(ControlFileLock);
  
! 	/* All real work is done, but log before releasing lock. */
! 	if (log_checkpoints)
! 		LogCheckpointEnd();
! 
! 	ereport((log_checkpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
! 
! 	LWLockRelease(CheckpointLock);
! 
  }
  
  /*
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c	11 Aug 2008 11:05:11 -0000	1.51
--- src/backend/postmaster/bgwriter.c	18 Sep 2008 12:06:05 -0000
***************
*** 49,54 ****
--- 49,55 ----
  #include <unistd.h>
  
  #include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
***************
*** 130,135 ****
--- 131,144 ----
  
  	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
  
+ 	/* 
+ 	 * When the Startup process wants a restartpoint, it sets these fields
+ 	 * so that whoever performs the restartpoint can update the control file,
+ 	 * allowing the caller to continue, if it is running in another process.
+ 	 */
+ 	XLogRecPtr	ReadPtr;		/* ReadRecPtr for RestartPoint request */
+ 	CheckPoint *restartPoint;	/* restartPoint data for ControlFile */
+ 
  	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
  
  	int			num_requests;	/* current # of requests */
***************
*** 164,172 ****
  
  static bool ckpt_active = false;
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
--- 173,184 ----
  
  static bool ckpt_active = false;
  
+ static bool BgWriterInRecovery = true;
+ 
+ 
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;	/* not used if canWriteWAL */
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
***************
*** 186,192 ****
  static void ReqCheckpointHandler(SIGNAL_ARGS);
  static void ReqShutdownHandler(SIGNAL_ARGS);
  
- 
  /*
   * Main entry point for bgwriter process
   *
--- 198,203 ----
***************
*** 357,371 ****
  	PG_SETMASK(&UnBlockSig);
  
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
- 		bool		do_checkpoint = false;
- 		int			flags = 0;
- 		pg_time_t	now;
- 		int			elapsed_secs;
- 
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
--- 368,389 ----
  	PG_SETMASK(&UnBlockSig);
  
  	/*
+ 	 * We are either in recovery mode, or we're not. bgwriter never
+ 	 * attepts to change state, we just shutdown and startup in new mode.
+ 	 * That's fine don't really want to hang around while Startup process
+ 	 * performs it's shutdown checkpoint.
+ 	 */
+ 	BgWriterInRecovery = !canWriteWAL();
+ 
+ 	if (BgWriterInRecovery)
+ 		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+ 			BgWriterShmem->bgwriter_pid);
+ 
+ 	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
- 		if (shutdown_requested)
- 		{
- 			/*
- 			 * From here on, elog(ERROR) should end with exit(1), not send
- 			 * control back to the sigsetjmp block above
- 			 */
- 			ExitOnAnyError = true;
- 			/* Close down the database */
- 			ShutdownXLOG(0, 0);
- 			DumpFreeSpaceMap(0, 0);
- 			/* Normal exit from the bgwriter is here */
- 			proc_exit(0);		/* done */
- 		}
  
! 		/*
! 		 * Force a checkpoint if too much time has elapsed since the last one.
! 		 * Note that we count a timed checkpoint in stats only when this
! 		 * occurs without an external request, but we set the CAUSE_TIME flag
! 		 * bit even if there is also an external request.
! 		 */
! 		now = (pg_time_t) time(NULL);
! 		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
! 			if (!do_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
! 			do_checkpoint = true;
! 			flags |= CHECKPOINT_CAUSE_TIME;
  		}
! 
! 		/*
! 		 * Do a checkpoint if requested, otherwise do one cycle of
! 		 * dirty-buffer writing.
! 		 */
! 		if (do_checkpoint)
  		{
! 			/* use volatile pointer to prevent code rearrangement */
! 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 			/*
! 			 * Atomically fetch the request flags to figure out what kind of a
! 			 * checkpoint we should perform, and increase the started-counter
! 			 * to acknowledge that we've started a new checkpoint.
! 			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			flags |= bgs->ckpt_flags;
! 			bgs->ckpt_flags = 0;
! 			bgs->ckpt_started++;
! 			SpinLockRelease(&bgs->ckpt_lck);
  
  			/*
! 			 * We will warn if (a) too soon since last checkpoint (whatever
! 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 			 * since the last checkpoint start.  Note in particular that this
! 			 * implementation will not generate warnings caused by
! 			 * CheckPointTimeout < CheckPointWarning.
  			 */
! 			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 				elapsed_secs < CheckPointWarning)
! 				ereport(LOG,
! 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 								elapsed_secs),
! 						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  
  			/*
! 			 * Initialize bgwriter-private variables used during checkpoint.
  			 */
! 			ckpt_active = true;
! 			ckpt_start_recptr = GetInsertRecPtr();
! 			ckpt_start_time = now;
! 			ckpt_cached_elapsed = 0;
  
! 			/*
! 			 * Do the checkpoint.
! 			 */
! 			CreateCheckPoint(flags);
! 
! 			/*
! 			 * After any checkpoint, close all smgr files.	This is so we
! 			 * won't hang onto smgr references to deleted files indefinitely.
! 			 */
! 			smgrcloseall();
! 
! 			/*
! 			 * Indicate checkpoint completion to any waiting backends.
! 			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			bgs->ckpt_done = bgs->ckpt_started;
! 			SpinLockRelease(&bgs->ckpt_lck);
! 
! 			ckpt_active = false;
  
! 			/*
! 			 * Note we record the checkpoint start time not end time as
! 			 * last_checkpoint_time.  This is so that time-driven checkpoints
! 			 * happen at a predictable spacing.
! 			 */
! 			last_checkpoint_time = now;
  		}
- 		else
- 			BgBufferSync();
- 
- 		/* Check for archive_timeout and switch xlog files if necessary. */
- 		CheckArchiveTimeout();
- 
- 		/* Nap for the configured time. */
- 		BgWriterNap();
  	}
  }
  
--- 401,586 ----
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
  
! 		if (BgWriterInRecovery)
  		{
! 			if (checkpoint_requested) 
! 			{
! 				XLogRecPtr		ReadPtr;
! 				CheckPoint		restartPoint;
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_time = (pg_time_t) time(NULL);
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Get the requested values from shared memory that the 
! 				 * Startup process has put there for us
! 				 */
! 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! 				ReadPtr = BgWriterShmem->ReadPtr;
! 				memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
! 
! 				CreateRestartPoint(ReadPtr, &restartPoint);
! 
! 				/* 
! 				 * Reset any flags if we requested immediate completion part 
! 				 * way through the restart point
! 				 */
! 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! 				BgWriterShmem->ckpt_flags = 0;
! 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
! 
! 				ckpt_active = false;
! 				checkpoint_requested = false;
! 			}
! 			else
! 			{
! 				/* Clean buffers dirtied by recovery */
! 				BgBufferSync();
! 
! 				/* Nap for the configured time. */
! 				BgWriterNap();
! 			}
! 
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  		}
! 		else	/* Normal processing */
  		{
! 			bool		do_checkpoint = false;
! 			int			flags = 0;
! 			pg_time_t	now;
! 			int			elapsed_secs;
! 
! 			Assert(canWriteWAL());
! 
! 			if (checkpoint_requested) 
! 			{
! 				checkpoint_requested = false;
! 				do_checkpoint = true;
! 				BgWriterStats.m_requested_checkpoints++;
! 			}
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Close down the database */
! 				ShutdownXLOG(0, 0);
! 				DumpFreeSpaceMap(0, 0);
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Force a checkpoint if too much time has elapsed since the last one.
! 			 * Note that we count a timed checkpoint in stats only when this
! 			 * occurs without an external request, but we set the CAUSE_TIME flag
! 			 * bit even if there is also an external request.
  			 */
! 			now = (pg_time_t) time(NULL);
! 			elapsed_secs = now - last_checkpoint_time;
! 			if (elapsed_secs >= CheckPointTimeout)
! 			{
! 				if (!do_checkpoint)
! 					BgWriterStats.m_timed_checkpoints++;
! 				do_checkpoint = true;
! 				flags |= CHECKPOINT_CAUSE_TIME;
! 			}
  
  			/*
! 			 * Do a checkpoint if requested, otherwise do one cycle of
! 			 * dirty-buffer writing.
  			 */
! 			if (do_checkpoint)
! 			{
! 				/* use volatile pointer to prevent code rearrangement */
! 				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 				/*
! 				 * Atomically fetch the request flags to figure out what kind of a
! 				 * checkpoint we should perform, and increase the started-counter
! 				 * to acknowledge that we've started a new checkpoint.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				flags |= bgs->ckpt_flags;
! 				bgs->ckpt_flags = 0;
! 				bgs->ckpt_started++;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				/*
! 				 * We will warn if (a) too soon since last checkpoint (whatever
! 				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 				 * since the last checkpoint start.  Note in particular that this
! 				 * implementation will not generate warnings caused by
! 				 * CheckPointTimeout < CheckPointWarning.
! 				 */
! 				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 					elapsed_secs < CheckPointWarning)
! 					ereport(LOG,
! 							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 									elapsed_secs),
! 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_recptr = GetInsertRecPtr();
! 				ckpt_start_time = now;
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Do the checkpoint.
! 				 */
! 				CreateCheckPoint(flags);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				/*
! 				 * Indicate checkpoint completion to any waiting backends.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				bgs->ckpt_done = bgs->ckpt_started;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * Note we record the checkpoint start time not end time as
! 				 * last_checkpoint_time.  This is so that time-driven checkpoints
! 				 * happen at a predictable spacing.
! 				 */
! 				last_checkpoint_time = now;
! 			}
! 			else
! 				BgBufferSync();
  
! 			/* Check for archive_timeout and switch xlog files if necessary. */
! 			CheckArchiveTimeout();
  
! 			/* Nap for the configured time. */
! 			BgWriterNap();
  		}
  	}
  }
  
***************
*** 588,594 ****
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
--- 673,680 ----
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		if (canWriteWAL())
! 			AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
***************
*** 642,647 ****
--- 728,745 ----
  	if (!am_bg_writer)
  		return;
  
+ 	/* Perform minimal duties during recovery and skip wait if requested */
+ 	if (!canWriteWAL())
+ 	{
+ 		BgBufferSync();
+ 
+ 		if (!shutdown_requested &&
+ 			IsCheckpointOnSchedule(progress))
+ 			BgWriterNap();
+ 
+ 		return;
+ 	}
+ 
  	/*
  	 * Perform the usual bgwriter duties and take a nap, unless we're behind
  	 * schedule, in which case we just try to catch up as quickly as possible.
***************
*** 716,731 ****
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	recptr = GetInsertRecPtr();
! 	elapsed_xlogs =
! 		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 		CheckPointSegments;
! 
! 	if (progress < elapsed_xlogs)
  	{
! 		ckpt_cached_elapsed = elapsed_xlogs;
! 		return false;
  	}
  
  	/*
--- 814,832 ----
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	if (canWriteWAL())
  	{
! 		recptr = GetInsertRecPtr();
! 		elapsed_xlogs =
! 			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 			CheckPointSegments;
! 
! 		if (progress < elapsed_xlogs)
! 		{
! 			ckpt_cached_elapsed = elapsed_xlogs;
! 			return false;
! 		}
  	}
  
  	/*
***************
*** 967,972 ****
--- 1068,1104 ----
  }
  
  /*
+  * Always runs in Startup process (see xlog.c)
+  */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ 	/*
+ 	 * Should we just do it ourselves?
+ 	 */
+ 	if (!IsPostmasterEnvironment || !sendToBGWriter)
+ 	{
+ 		CreateRestartPoint(ReadPtr, restartPoint);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Push requested values into shared memory, then signal to request restartpoint.
+ 	 */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = ReadPtr;
+ 	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 
+ 
+ 	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");	
+ }
+ 
+ /*
   * ForwardFsyncRequest
   *		Forward a file-fsync request from a backend to the bgwriter
   *
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c	26 Jun 2008 02:47:19 -0000	1.561
--- src/backend/postmaster/postmaster.c	18 Sep 2008 12:03:07 -0000
***************
*** 254,259 ****
--- 254,267 ----
  {
  	PM_INIT,					/* postmaster starting */
  	PM_STARTUP,					/* waiting for startup subprocess */
+ 	PM_RECOVERY,				/* consistent recovery mode; state only
+ 								 * entered for archive and streaming recovery,
+ 								 * and only after the point where the 
+ 								 * all data is in consistent state.
+ 								 */
+ 	PM_RECOVERY_CKPT,			/* Startup process writes shutdown checkpoint
+ 								 * so bgwriter must shutdown immediately.
+ 								 */
  	PM_RUN,						/* normal "database is alive" state */
  	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
  	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
***************
*** 1294,1300 ****
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && pmState == PM_RUN)
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
--- 1302,1308 ----
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
***************
*** 2104,2110 ****
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
--- 2112,2119 ----
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY 
! 										 || pmState == PM_RECOVERY_CKPT);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
***************
*** 2116,2121 ****
--- 2125,2133 ----
  				ExitPostmaster(1);
  			}
  
+ 			/* We should never exit normally during PM_RECOVERY state */
+ 			Assert(pmState != PM_RECOVERY);
+ 
  			/*
  			 * Startup succeeded - we are done with system startup or
  			 * recovery.
***************
*** 2148,2155 ****
  			 * Crank up the background writer.	It doesn't matter if this
  			 * fails, we'll just try again later.
  			 */
! 			Assert(BgWriterPID == 0);
! 			BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
--- 2160,2167 ----
  			 * Crank up the background writer.	It doesn't matter if this
  			 * fails, we'll just try again later.
  			 */
! 			if (BgWriterPID == 0)
! 				BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
***************
*** 2211,2216 ****
--- 2223,2238 ----
  				if (PgStatPID != 0)
  					signal_child(PgStatPID, SIGQUIT);
  			}
+ 			else if (EXIT_STATUS_0(exitstatus) && pmState == PM_RECOVERY_CKPT)
+ 			{
+ 				/*
+ 				 * bgwriter has shutdown as it was requested, so this is OK.
+ 				 * There is no state change associated with this event,
+ 				 * since this was caused by the state change.
+ 				 * We now wait for Startup process to complete, before
+ 				 * moving to PM_RUN state.
+ 				 */
+ 			}
  			else
  			{
  				/*
***************
*** 2570,2575 ****
--- 2592,2619 ----
  static void
  PostmasterStateMachine(void)
  {
+ 	if (pmState == PM_RECOVERY)
+ 	{
+ 		/* Start the bgwriter if not running */
+ 		if (BgWriterPID == 0)
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 		/* If we have lost the stats collector, try to start a new one */
+ 		if (PgStatPID == 0)
+ 			PgStatPID = pgstat_start();
+ 	}
+ 
+ 	if (pmState == PM_RECOVERY_CKPT)
+ 	{
+ 		/* Tell bgwriter to shut down, if its still active */
+ 		if (BgWriterPID != 0)
+ 			signal_child(BgWriterPID, SIGUSR2);
+ 
+ 		/* If we have lost the stats collector, try to start a new one */
+ 		if (PgStatPID == 0)
+ 			PgStatPID = pgstat_start();
+ 	}
+ 
  	if (pmState == PM_WAIT_BACKUP)
  	{
  		/*
***************
*** 3821,3826 ****
--- 3865,3940 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database yet,
+ 			 * so we it may change following recovery. 
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer.	It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* XXX at this point we could accept read-only connections */
+ 			ereport(DEBUG1,
+ 				 (errmsg("database system is in consistent recovery mode")));
+ 		}
+ 	}
+ 
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CKPT))
+ 	{
+ 		Assert(pmState == PM_RECOVERY);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery checkpoint phase
+ 			 */
+ 			pmState = PM_RECOVERY_CKPT;
+ 
+ 			/* Tell bgwriter to shut down */
+ 			if (BgWriterPID != 0)
+ 				signal_child(BgWriterPID, SIGUSR2);
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h	12 May 2008 08:35:05 -0000	1.88
--- src/include/access/xlog.h	18 Sep 2008 11:40:27 -0000
***************
*** 197,202 ****
--- 197,204 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
  
+ extern bool canWriteWAL(void);
+ 
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
Index: src/include/access/xlog_internal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog_internal.h,v
retrieving revision 1.24
diff -c -r1.24 xlog_internal.h
*** src/include/access/xlog_internal.h	11 Aug 2008 11:05:11 -0000	1.24
--- src/include/access/xlog_internal.h	18 Sep 2008 05:54:53 -0000
***************
*** 17,22 ****
--- 17,23 ----
  #define XLOG_INTERNAL_H
  
  #include "access/xlog.h"
+ #include "catalog/pg_control.h"
  #include "fmgr.h"
  #include "pgtime.h"
  #include "storage/block.h"
***************
*** 245,250 ****
--- 246,254 ----
  extern pg_time_t GetLastSegSwitchTime(void);
  extern XLogRecPtr RequestXLogSwitch(void);
  
+ 
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint);
+ 
  /*
   * These aren't in xlog.h because I'd rather not include fmgr.h there.
   */
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h	11 Aug 2008 11:05:11 -0000	1.12
--- src/include/postmaster/bgwriter.h	18 Sep 2008 10:44:27 -0000
***************
*** 12,17 ****
--- 12,18 ----
  #ifndef _BGWRITER_H
  #define _BGWRITER_H
  
+ #include "catalog/pg_control.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
  
***************
*** 25,30 ****
--- 26,32 ----
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
  extern void CheckpointWriteDelay(int flags, double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h	19 Jun 2008 21:32:56 -0000	1.20
--- src/include/storage/pmsignal.h	18 Sep 2008 11:03:03 -0000
***************
*** 22,27 ****
--- 22,29 ----
   */
  typedef enum
  {
+ 	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
+ 	PMSIGNAL_RECOVERY_CKPT,		/* move to PM_RECOVERY_CKPT state */
  	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
  	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
  	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches

Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Reply via email to