On Thu, 2008-09-18 at 10:09 -0400, Tom Lane wrote:
> Simon Riggs <[EMAIL PROTECTED]> writes:
> > On Thu, 2008-09-18 at 09:06 -0400, Tom Lane wrote:
> >> Do we really need a checkpoint there at all?
>
> > "Timelines only change at shutdown checkpoints".
>
> Hmm. I *think* that that is just a debugging crosscheck rather than a
> critical property. But yeah, it would take some close investigation,
> which maybe isn't warranted if you have a less-invasive solution.
OK, new patch, version 6. Some major differences from the previous patch:
* new IsRecoveryProcessingMode() in shmem
* padding in XLogCtl to ensure above call is cheap
* specific part of bgwriter shmem for passing restartpoint data
* avoid Shutdown checkpoint at end of recovery, with carefully
considered positioning of statements (beware!)
* only one new postmaster mode, PM_RECOVERY
* bgwriter changes state without stopping/starting
Modes I have tested so far
* make check
* Start, Stop
* Crash Recovery
* Archive Recovery
* Archive Recovery, switch in middle of restartpoint
Modes not yet tested
* EXEC_BACKEND
Ready for serious review prior to commit. I will be performing further
testing also.
backend/access/transam/multixact.c | 2
backend/access/transam/xlog.c | 328 ++++++++++++---!!!!!!!!!!!!
backend/postmaster/bgwriter.c | 371 +++++---!!!!!!!!!!!!!!!!!!!!!
backend/postmaster/postmaster.c | 62 ++++!!
backend/storage/buffer/README | 5
backend/storage/buffer/bufmgr.c | 34 +!!
include/access/xlog.h | 14 !
include/access/xlog_internal.h | 3
include/catalog/pg_control.h | 2
include/postmaster/bgwriter.h | 2
include/storage/bufmgr.h | 2
include/storage/pmsignal.h | 1
12 files changed, 279 insertions(+), 56 deletions(-), 491 mods(!)
There are a few subtle points along the way. I've tried to explain them
all in code comments, but questions are welcome. As of v6, most things are
now done a particular way for a specific reason.
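Look especially at InRecovery, which is used extensively in different
parts of the code. Its meaning has been split in two: InRecovery now means
"this code is being executed by the Startup process", while the new
IsRecoveryProcessingMode() means "the system as a whole is still in
recovery". As a result, only *some* of the places that use InRecovery have
been changed. All have been checked.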
--
Simon Riggs www.2ndQuadrant.com
PostgreSQL Training, Services and Support
Index: src/backend/access/transam/multixact.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/multixact.c,v
retrieving revision 1.28
diff -c -r1.28 multixact.c
*** src/backend/access/transam/multixact.c 1 Aug 2008 13:16:08 -0000 1.28
--- src/backend/access/transam/multixact.c 22 Sep 2008 19:28:56 -0000
***************
*** 1543,1549 ****
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
*/
! if (!InRecovery)
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
--- 1543,1549 ----
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
*/
! if (!IsRecoveryProcessingMode())
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c 11 Aug 2008 11:05:10 -0000 1.317
--- src/backend/access/transam/xlog.c 22 Sep 2008 21:30:24 -0000
***************
*** 119,124 ****
--- 119,125 ----
/* Are we doing recovery from XLOG? */
bool InRecovery = false;
+ bool reachedSafeStopPoint = false;
/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;
***************
*** 131,137 ****
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static TimestampTz recoveryLastXTime = 0;
--- 132,137 ----
***************
*** 286,295 ****
--- 286,297 ----
/*
* Total shared-memory state for XLOG.
*/
+ #define XLOGCTL_BUFFER_SPACING 128
typedef struct XLogCtlData
{
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
+ char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
***************
*** 297,305 ****
--- 299,314 ----
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */
+ /* add data structure padding for above info_lck declarations */
+ char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst)
+ - sizeof(XLogwrtResult)
+ - sizeof(uint32)
+ - sizeof(TransactionId)
+ - sizeof(XLogRecPtr)];
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
+ char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
/*
* These values do not change after startup, although the pointed-to pages
***************
*** 311,316 ****
--- 320,344 ----
int XLogCacheBlck; /* highest allocated xlog buffer index */
TimeLineID ThisTimeLineID;
+ /*
+ * IsRecoveryProcessingMode shows whether the postmaster is in a
+ * postmaster state earlier than PM_RUN, or not. This is a globally
+ * accessible state to allow EXEC_BACKEND case.
+ *
+ * We also retain a local state variable InRecovery. InRecovery=true
+ * means the code is being executed by Startup process and therefore
+ * always during RecoveryProcessingMode. This allows us to retain the
+ * often important distinction between code executed *during*
+ * RecoveryProcessingMode but not necessarily by the Startup process.
+ *
+ * Reviewer's note: all call points of InRecovery and InRedo have been checked
+ * for correctness and have been changed to IsRecoveryProcessingMode()
+ * if appropriate.
+ */
+ bool IsRecoveryProcessingMode;
+
+ char InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
***************
*** 396,401 ****
--- 424,430 ----
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI,
uint32 endLogId, uint32 endLogSeg);
+ static XLogRecPtr exitRecovery(void);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
***************
*** 479,484 ****
--- 508,518 ----
bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ bool isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+
+ /* cross-check on whether we should be here or not */
+ if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ elog(FATAL, "cannot make new WAL entries during recovery");
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
***************
*** 1677,1684 ****
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! /* Disabled during REDO */
! if (InRedo)
return;
/* Quick exit if already known flushed */
--- 1711,1717 ----
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! if (IsRecoveryProcessingMode())
return;
/* Quick exit if already known flushed */
***************
*** 1766,1774 ****
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while InRedo is true, but if the bad page is brought in
! * and marked dirty during recovery then CreateCheckPoint will try to
! * flush it at the end of recovery.)
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
--- 1799,1807 ----
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
! * brought in and marked dirty during recovery then CreateCheckPoint will
! * try to flush it at the end of recovery.)
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
***************
*** 2051,2057 ****
unlink(tmppath);
}
! elog(DEBUG2, "done creating and filling new WAL file");
/* Set flag to tell caller there was no existent file */
*use_existent = false;
--- 2084,2091 ----
unlink(tmppath);
}
! XLogFileName(tmppath, ThisTimeLineID, log, seg);
! elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
/* Set flag to tell caller there was no existent file */
*use_existent = false;
***************
*** 4532,4546 ****
}
else if (strcmp(tok1, "log_restartpoints") == 0)
{
- /*
- * does nothing if a recovery_target is not also set
- */
- if (!parse_bool(tok2, &recoveryLogRestartpoints))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
ereport(LOG,
! (errmsg("log_restartpoints = %s", tok2)));
}
else
ereport(FATAL,
--- 4566,4574 ----
}
else if (strcmp(tok1, "log_restartpoints") == 0)
{
ereport(LOG,
! (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! errmsg("parameter \"log_restartpoints\" has been deprecated")));
}
else
ereport(FATAL,
***************
*** 4811,4828 ****
CheckPoint checkPoint;
bool wasShutdown;
bool reachedStopPoint = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
LastRec,
checkPointLoc,
minRecoveryLoc,
! EndOfLog;
uint32 endLogId;
uint32 endLogSeg;
XLogRecord *record;
uint32 freespace;
TransactionId oldestActiveXID;
/*
* Read control file and check XLOG status looks valid.
*
--- 4839,4860 ----
CheckPoint checkPoint;
bool wasShutdown;
bool reachedStopPoint = false;
+ bool performedArchiveRecovery = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
LastRec,
checkPointLoc,
minRecoveryLoc,
! EndOfLog,
! RecoveryCompletionPtr;
uint32 endLogId;
uint32 endLogSeg;
XLogRecord *record;
uint32 freespace;
TransactionId oldestActiveXID;
+ XLogCtl->IsRecoveryProcessingMode = true;
+
/*
* Read control file and check XLOG status looks valid.
*
***************
*** 5039,5044 ****
--- 5071,5081 ----
UpdateControlFile();
/*
+ * Reset pgstat data, because it may be invalid after recovery.
+ */
+ pgstat_reset_all();
+
+ /*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5185,5205 ----
LastRec = ReadRecPtr;
+ /*
+ * Have we reached our safe stopping point? If so, we can
+ * signal Postmaster to enter consistent recovery mode
+ */
+ if (!reachedSafeStopPoint &&
+ XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ {
+ reachedSafeStopPoint = true;
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ }
+
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);
***************
*** 5169,5174 ****
--- 5221,5227 ----
/* there are no WAL records following the checkpoint */
ereport(LOG,
(errmsg("redo is not required")));
+ reachedSafeStopPoint = true;
}
}
***************
*** 5184,5190 ****
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
--- 5237,5243 ----
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (InRecovery && !reachedSafeStopPoint)
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
***************
*** 5227,5233 ****
--- 5280,5289 ----
* we will use that below.)
*/
if (InArchiveRecovery)
+ {
+ performedArchiveRecovery = true;
exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
+ }
/*
* Prepare to write WAL starting at EndOfLog position, and init xlog
***************
*** 5286,5291 ****
--- 5342,5349 ----
/* Pre-scan prepared transactions to find out the range of XIDs present */
oldestActiveXID = PrescanPreparedTransactions();
+ RecoveryCompletionPtr = EndOfLog;
+
if (InRecovery)
{
int rmid;
***************
*** 5306,5343 ****
XLogCheckInvalidPages();
/*
! * Reset pgstat data, because it may be invalid after recovery.
! */
! pgstat_reset_all();
!
! /*
! * Perform a checkpoint to update all our recovery activity to disk.
! *
! * Note that we write a shutdown checkpoint rather than an on-line
! * one. This is not particularly critical, but since we may be
! * assigning a new TLI, using a shutdown checkpoint allows us to have
! * the rule that TLI only changes in shutdown checkpoints, which
! * allows some extra error checking in xlog_redo.
*/
! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
- /*
- * Preallocate additional log files, if wanted.
- */
- PreallocXlogFiles(EndOfLog);
-
- /*
- * Okay, we're officially UP.
- */
- InRecovery = false;
-
- ControlFile->state = DB_IN_PRODUCTION;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
-
/* start the archive_timeout timer running */
! XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
--- 5364,5377 ----
XLogCheckInvalidPages();
/*
! * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
! * a shutdown checkpoint here, but we ask bgwriter to do that now.
*/
! RecoveryCompletionPtr = exitRecovery();
}
/* start the archive_timeout timer running */
! XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
***************
*** 5372,5377 ****
--- 5406,5446 ----
readRecordBuf = NULL;
readRecordBufSize = 0;
}
+
+ /*
+ * Okay, we're officially UP.
+ */
+ XLogCtl->IsRecoveryProcessingMode = false;
+
+ /*
+ * If we had to perform archive recovery we don't mark the control file,
+ * yet, since we haven't definitely got a safe point to recover from that
+ * doesn't rely on archived WAL files. So we switch quickly into normal
+ * processing and rely on the bgwriter's checkpoint (NOT a restartpoint)
+ * to define a safe recovery point and put us into full production state.
+ * We specifically do not want to wait for checkpoint completion here,
+ * so we can reduce startup time in a standby mode replication failover.
+ * The checkpoint creation will also flush WAL, so we wait for that
+ * otherwise we may need to prepare WAL files ourselves.
+ *
+ * If we are doing crash recovery, we know we have WAL files accessible
+ * so we just get started again as quickly as possible.
+ */
+ if (performedArchiveRecovery)
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ else
+ {
+ XLogFlush(RecoveryCompletionPtr);
+ ControlFile->state = DB_IN_PRODUCTION;
+ ControlFile->time = (pg_time_t) time(NULL);
+ UpdateControlFile();
+ }
+ }
+
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ return XLogCtl->IsRecoveryProcessingMode;
}
/*
***************
*** 5642,5648 ****
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(void)
{
long write_secs,
sync_secs,
--- 5711,5717 ----
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(int flags)
{
long write_secs,
sync_secs,
***************
*** 5665,5681 ****
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
--- 5734,5759 ----
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! if (flags & CHECKPOINT_RESTARTPOINT)
! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
! else
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
***************
*** 5944,5949 ****
--- 6022,6029 ----
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
+ else
+ ControlFile->state = DB_IN_PRODUCTION;
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
***************
*** 6002,6008 ****
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd();
LWLockRelease(CheckpointLock);
}
--- 6082,6088 ----
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd(flags);
LWLockRelease(CheckpointLock);
}
***************
*** 6071,6099 ****
}
}
/*
! * OK, force data out to disk
*/
! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
/*
! * Update pg_control so that any subsequent crash will restart from this
! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint
! * record itself.
*/
ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = ReadRecPtr;
! ControlFile->checkPointCopy = *checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
! checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
}
/*
--- 6151,6215 ----
}
}
+ if (recoveryLastXTime)
+ ereport((log_checkpoints ? LOG : DEBUG2),
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(recoveryLastXTime))));
+
+ RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStopPoint);
+ }
+
+ /*
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ * once we have reachedSafeStopPoint. We use bgwriter's shared memory
+ * area wherever we call it from, to keep better code structure.
+ */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint)
+ {
+ if (log_checkpoints)
+ {
+ /*
+ * Prepare to accumulate statistics.
+ */
+
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ /*
+ * Do the restartpoint equivalent of LogCheckpointStart()
+ */
+ elog(LOG, "restartpoint starting");
+ }
+
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
/*
! * OK, write out dirty blocks smoothly
*/
! CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT);
/*
! * Update pg_control, using current time
*/
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = ReadPtr;
! ControlFile->checkPointCopy = *restartPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /* All real work is done, but log before releasing lock. */
+ if (log_checkpoints)
+ LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
! ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
! restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
!
! LWLockRelease(CheckpointLock);
!
}
/*
***************
*** 6158,6164 ****
}
/*
! * XLOG resource manager's routines
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 6274,6313 ----
}
/*
! * exitRecovery()
! *
! * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
! * only record type that can record a change of timelineID. We assume
! * caller has already set ThisTimeLineID, if appropriate.
! */
! static XLogRecPtr
! exitRecovery(void)
! {
! XLogRecPtr RecPtr;
! XLogRecData rdata;
!
! rdata.buffer = InvalidBuffer;
! rdata.data = (char *) (&ThisTimeLineID);
! rdata.len = sizeof(TimeLineID);
! rdata.next = NULL;
!
! /*
! * This is the only type of WAL message that can be inserted during
! * recovery. This ensures that we don't allow others to get access
! * until after we have changed state.
! */
! RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
!
! InRecovery = false;
!
! return RecPtr;
! }
!
! /*
! * XLOG resource manager's routines.
! *
! * Definitions of message info are in include/catalog/pg_control.h,
! * though not all messages relate to control file processing.
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6193,6213 ****
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
/*
! * TLI may change in a shutdown checkpoint, but it shouldn't decrease
*/
! if (checkPoint.ThisTimeLineID != ThisTimeLineID)
{
! if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
!list_member_int(expectedTLIs,
! (int) checkPoint.ThisTimeLineID))
ereport(PANIC,
! (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
! checkPoint.ThisTimeLineID, ThisTimeLineID)));
/* Following WAL records should be run with new TLI */
! ThisTimeLineID = checkPoint.ThisTimeLineID;
}
-
- RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
--- 6342,6379 ----
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
/*
! * TLI no longer changes at shutdown checkpoint, since as of 8.4,
! * shutdown checkpoints only occur at shutdown. Much less confusing.
*/
!
! RecoveryRestartPoint(&checkPoint);
! }
! else if (info == XLOG_RECOVERY_END)
! {
! TimeLineID tli;
!
! memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
!
! /*
! * TLI may change when recovery ends, but it shouldn't decrease.
! *
! * This is the only WAL record that can tell us to change timelineID
! * while we process WAL records.
! *
! * We can *choose* to stop recovery at any point, generating a
! * new timelineID which is recorded using this record type.
! */
! if (tli != ThisTimeLineID)
{
! if (tli < ThisTimeLineID ||
!list_member_int(expectedTLIs,
! (int) tli))
ereport(PANIC,
! (errmsg("unexpected timeline ID %u (after %u) at recovery end record",
! tli, ThisTimeLineID)));
/* Following WAL records should be run with new TLI */
! ThisTimeLineID = tli;
}
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c 11 Aug 2008 11:05:11 -0000 1.51
--- src/backend/postmaster/bgwriter.c 22 Sep 2008 20:58:59 -0000
***************
*** 49,54 ****
--- 49,55 ----
#include <unistd.h>
#include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
***************
*** 130,135 ****
--- 131,143 ----
int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
+ /*
+ * When the Startup process wants bgwriter to perform a restartpoint, it
+ * sets these fields so that we can update the control file afterwards.
+ */
+ XLogRecPtr ReadPtr; /* ReadRecPtr for RestartPoint request */
+ CheckPoint *restartPoint; /* restartPoint data for ControlFile */
+
uint32 num_backend_writes; /* counts non-bgwriter buffer writes */
int num_requests; /* current # of requests */
***************
*** 166,172 ****
/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;
static pg_time_t last_checkpoint_time;
--- 174,180 ----
/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */
static double ckpt_cached_elapsed;
static pg_time_t last_checkpoint_time;
***************
*** 198,203 ****
--- 206,212 ----
{
sigjmp_buf local_sigjmp_buf;
MemoryContext bgwriter_context;
+ bool BgWriterRecoveryMode;
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
***************
*** 356,371 ****
*/
PG_SETMASK(&UnBlockSig);
/*
* Loop forever
*/
for (;;)
{
- bool do_checkpoint = false;
- int flags = 0;
- pg_time_t now;
- int elapsed_secs;
-
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
--- 365,381 ----
*/
PG_SETMASK(&UnBlockSig);
+ BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+ if (BgWriterRecoveryMode)
+ elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
+ BgWriterShmem->bgwriter_pid);
+
/*
* Loop forever
*/
for (;;)
{
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- if (checkpoint_requested)
- {
- checkpoint_requested = false;
- do_checkpoint = true;
- BgWriterStats.m_requested_checkpoints++;
- }
- if (shutdown_requested)
- {
- /*
- * From here on, elog(ERROR) should end with exit(1), not send
- * control back to the sigsetjmp block above
- */
- ExitOnAnyError = true;
- /* Close down the database */
- ShutdownXLOG(0, 0);
- DumpFreeSpaceMap(0, 0);
- /* Normal exit from the bgwriter is here */
- proc_exit(0); /* done */
- }
! /*
! * Force a checkpoint if too much time has elapsed since the last one.
! * Note that we count a timed checkpoint in stats only when this
! * occurs without an external request, but we set the CAUSE_TIME flag
! * bit even if there is also an external request.
! */
! now = (pg_time_t) time(NULL);
! elapsed_secs = now - last_checkpoint_time;
! if (elapsed_secs >= CheckPointTimeout)
{
! if (!do_checkpoint)
! BgWriterStats.m_timed_checkpoints++;
! do_checkpoint = true;
! flags |= CHECKPOINT_CAUSE_TIME;
}
!
! /*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
! */
! if (do_checkpoint)
{
! /* use volatile pointer to prevent code rearrangement */
! volatile BgWriterShmemStruct *bgs = BgWriterShmem;
/*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
*/
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);
!
! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;
!
! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);
/*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
*/
! smgrcloseall();
! /*
! * Indicate checkpoint completion to any waiting backends.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
!
! ckpt_active = false;
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
}
- else
- BgBufferSync();
-
- /* Check for archive_timeout and switch xlog files if necessary. */
- CheckArchiveTimeout();
-
- /* Nap for the configured time. */
- BgWriterNap();
}
}
--- 393,592 ----
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
! if (BgWriterRecoveryMode)
{
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }
!
! if (!IsRecoveryProcessingMode())
! {
! elog(DEBUG2, "bgwriter changing from recovery to normal mode");
!
! InitXLOGAccess();
! BgWriterRecoveryMode = false;
!
! /*
! * Start time-driven events from now
! */
! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
!
! continue;
! }
!
! if (checkpoint_requested)
! {
! XLogRecPtr ReadPtr;
! CheckPoint restartPoint;
!
! checkpoint_requested = false;
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_time = (pg_time_t) time(NULL);
! ckpt_cached_elapsed = 0;
!
! /*
! * Get the requested values from shared memory that the
! * Startup process has put there for us.
! */
! SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! ReadPtr = BgWriterShmem->ReadPtr;
! memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! SpinLockRelease(&BgWriterShmem->ckpt_lck);
!
! CreateRestartPoint(ReadPtr, &restartPoint);
!
! ckpt_active = false;
!
! /*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
! */
! smgrcloseall();
! }
! else
! {
! /* Clean buffers dirtied by recovery */
! BgBufferSync(true);
!
! /* Nap for the configured time. */
! BgWriterNap();
! }
}
! else /* Normal processing */
{
! bool do_checkpoint = false;
! int flags = 0;
! pg_time_t now;
! int elapsed_secs;
!
! Assert(!IsRecoveryProcessingMode());
!
! if (checkpoint_requested)
! {
! checkpoint_requested = false;
! do_checkpoint = true;
! BgWriterStats.m_requested_checkpoints++;
! }
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Close down the database */
! ShutdownXLOG(0, 0);
! DumpFreeSpaceMap(0, 0);
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }
/*
! * Force a checkpoint if too much time has elapsed since the last one.
! * Note that we count a timed checkpoint in stats only when this
! * occurs without an external request, but we set the CAUSE_TIME flag
! * bit even if there is also an external request.
*/
! now = (pg_time_t) time(NULL);
! elapsed_secs = now - last_checkpoint_time;
! if (elapsed_secs >= CheckPointTimeout)
! {
! if (!do_checkpoint)
! BgWriterStats.m_timed_checkpoints++;
! do_checkpoint = true;
! flags |= CHECKPOINT_CAUSE_TIME;
! }
/*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
*/
! if (do_checkpoint)
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile BgWriterShmemStruct *bgs = BgWriterShmem;
!
! /*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);
!
! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;
!
! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);
!
! /*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
! */
! smgrcloseall();
!
! /*
! * Indicate checkpoint completion to any waiting backends.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
!
! ckpt_active = false;
!
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
! }
! else
! BgBufferSync(false);
! /* Check for archive_timeout and switch xlog files if necessary. */
! CheckArchiveTimeout();
! /* Nap for the configured time. */
! BgWriterNap();
}
}
}
***************
*** 588,594 ****
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
--- 679,686 ----
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! if (!IsRecoveryProcessingMode())
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
***************
*** 642,647 ****
--- 734,751 ----
if (!am_bg_writer)
return;
+ /* Perform minimal duties during recovery and skip wait if requested */
+ if (IsRecoveryProcessingMode())
+ {
+ BgBufferSync(true);
+
+ if (!shutdown_requested &&
+ IsCheckpointOnSchedule(progress))
+ BgWriterNap();
+
+ return;
+ }
+
/*
* Perform the usual bgwriter duties and take a nap, unless we're behind
* schedule, in which case we just try to catch up as quickly as possible.
***************
*** 660,666 ****
AbsorbFsyncRequests();
absorb_counter = WRITES_PER_ABSORB;
! BgBufferSync();
CheckArchiveTimeout();
BgWriterNap();
}
--- 764,770 ----
AbsorbFsyncRequests();
absorb_counter = WRITES_PER_ABSORB;
! BgBufferSync(false);
CheckArchiveTimeout();
BgWriterNap();
}
***************
*** 716,731 ****
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
{
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
}
/*
--- 820,838 ----
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! if (!IsRecoveryProcessingMode())
{
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
! {
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
! }
}
/*
***************
*** 967,972 ****
--- 1074,1109 ----
}
/*
+ * Always runs in Startup process (see xlog.c)
+ */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ 	/*
+ 	 * Request a restartpoint described by ReadPtr/restartPoint.
+ 	 *
+ 	 * If we are not running under the postmaster, or the caller asked us
+ 	 * not to involve the bgwriter, perform the restartpoint right here in
+ 	 * the calling process.
+ 	 */
+ 	if (!IsPostmasterEnvironment || !sendToBGWriter)
+ 	{
+ 		CreateRestartPoint(ReadPtr, restartPoint);
+ 		return;
+ 	}
+
+ 	/*
+ 	 * Without a bgwriter there is nobody to signal.  We must not fall
+ 	 * through to kill() with a pid of zero: kill(0, sig) delivers the
+ 	 * signal to every process in our own process group rather than
+ 	 * failing, so log the problem and give up on this request.
+ 	 */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 	{
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 		return;
+ 	}
+
+ 	/*
+ 	 * Push requested values into shared memory, then signal to request
+ 	 * restartpoint.  Both fields are written under ckpt_lck so that they
+ 	 * are published together.
+ 	 */
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = ReadPtr;
+ 	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ 	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");
+ }
+
+ /*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
*
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c 26 Jun 2008 02:47:19 -0000 1.561
--- src/backend/postmaster/postmaster.c 22 Sep 2008 20:25:36 -0000
***************
*** 254,259 ****
--- 254,264 ----
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* consistent recovery mode; state only
+ * entered for archive and streaming recovery,
+ * and only after the point where
+ * all data is in a consistent state.
+ */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
***************
*** 1294,1300 ****
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 && pmState == PM_RUN)
BgWriterPID = StartBackgroundWriter();
/*
--- 1299,1305 ----
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
BgWriterPID = StartBackgroundWriter();
/*
***************
*** 2104,2110 ****
if (pid == StartupPID)
{
StartupPID = 0;
! Assert(pmState == PM_STARTUP);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
--- 2109,2115 ----
if (pid == StartupPID)
{
StartupPID = 0;
! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
***************
*** 2145,2155 ****
load_role();
/*
! * Crank up the background writer. It doesn't matter if this
! * fails, we'll just try again later.
*/
! Assert(BgWriterPID == 0);
! BgWriterPID = StartBackgroundWriter();
/*
* Likewise, start other special children as needed. In a restart
--- 2150,2160 ----
load_role();
/*
! * Check whether we need to start background writer, if not
! * already running.
*/
! if (BgWriterPID == 0)
! BgWriterPID = StartBackgroundWriter();
/*
* Likewise, start other special children as needed. In a restart
***************
*** 3821,3826 ****
--- 3826,3876 ----
PG_SETMASK(&BlockSig);
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ {
+ Assert(pmState == PM_STARTUP);
+
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Startup process has entered recovery
+ */
+ pmState = PM_RECOVERY;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process won't have recomputed this from the database yet,
+ * so it may change following recovery.
+ */
+ load_role();
+
+ /*
+ * Crank up the background writer. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ /* XXX at this point we could accept read-only connections */
+ ereport(DEBUG1,
+ (errmsg("database system is in consistent recovery mode")));
+ }
+ }
+
if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
{
/*
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.14
diff -c -r1.14 README
*** src/backend/storage/buffer/README 21 Mar 2008 13:23:28 -0000 1.14
--- src/backend/storage/buffer/README 22 Sep 2008 21:54:20 -0000
***************
*** 264,266 ****
--- 264,271 ----
This ensures that the page image transferred to disk is reasonably consistent.
We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.
+
+ As of 8.4, the background writer starts during recovery mode when there is
+ some form of potentially extended recovery to perform. It performs an
+ identical service to normal processing, except that the checkpoints it writes
+ are technically restartpoints, and it does not need to flush WAL for dirty buffers.
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.237
diff -c -r1.237 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c 11 Aug 2008 11:05:11 -0000 1.237
--- src/backend/storage/buffer/bufmgr.c 22 Sep 2008 21:01:02 -0000
***************
*** 1080,1086 ****
*
* This is called at checkpoint time to write out all dirty shared buffers.
* The checkpoint request flags should be passed in; currently the only one
! * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
*/
static void
BufferSync(int flags)
--- 1080,1087 ----
*
* This is called at checkpoint time to write out all dirty shared buffers.
* The checkpoint request flags should be passed in; currently the only one
! * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes,
! * except for restartpoints, which use CHECKPOINT_RESTARTPOINT.
*/
static void
BufferSync(int flags)
***************
*** 1089,1094 ****
--- 1090,1096 ----
int num_to_scan;
int num_to_write;
int num_written;
+ bool cleanup_and_exit = false;
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
***************
*** 1163,1169 ****
*/
if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
! if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
--- 1165,1180 ----
*/
if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
! if (cleanup_and_exit)
! {
! LockBufHdr(bufHdr);
!
! if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
! bufHdr->flags &= ~BM_CHECKPOINT_NEEDED;
!
! UnlockBufHdr(bufHdr);
! }
! else if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
***************
*** 1194,1199 ****
--- 1205,1218 ----
if (++buf_id >= NBuffers)
buf_id = 0;
+
+ /*
+ * Quit scanning if state changes while we're here. If it
+ * does we want to stop performing the current restartpoint
+ * as quickly as possible. We'll write a real checkpoint instead.
+ */
+ if (flags & CHECKPOINT_RESTARTPOINT && !IsRecoveryProcessingMode())
+ cleanup_and_exit = true;
}
TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
***************
*** 1211,1217 ****
* This is called periodically by the background writer process.
*/
void
! BgBufferSync(void)
{
/* info obtained from freelist.c */
int strategy_buf_id;
--- 1230,1236 ----
* This is called periodically by the background writer process.
*/
void
! BgBufferSync(bool calledInRecovery)
{
/* info obtained from freelist.c */
int strategy_buf_id;
***************
*** 1423,1428 ****
--- 1442,1456 ----
{
int buffer_state = SyncOneBuffer(next_to_clean, true);
+ /*
+ * Quit scanning if state changes while we're here, otherwise
+ * we might skip writing WAL for a buffer that was modified
+ * by a transaction that committed very soon after recovery
+ * mode completes.
+ */
+ if (calledInRecovery && !IsRecoveryProcessingMode())
+ break;
+
if (++next_to_clean >= NBuffers)
{
next_to_clean = 0;
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h 12 May 2008 08:35:05 -0000 1.88
--- src/include/access/xlog.h 22 Sep 2008 19:04:56 -0000
***************
*** 133,139 ****
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! extern bool InRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
--- 133,148 ----
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
!
! /*
! * Prior to 8.4, all activity during recovery was carried out by the Startup
! * process. This local variable continues to be used in many parts of the
! * code to indicate actions taken by RecoveryManagers. Other processes that
! * potentially perform work during recovery should check
! * IsRecoveryProcessingMode(); see the XLogCtl notes in xlog.c.
! */
! extern bool InRecovery;
!
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
***************
*** 166,171 ****
--- 175,181 ----
/* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */
+ #define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */
/* Checkpoint statistics */
typedef struct CheckpointStatsData
***************
*** 197,202 ****
--- 207,214 ----
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool IsRecoveryProcessingMode(void);
+
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
Index: src/include/access/xlog_internal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog_internal.h,v
retrieving revision 1.24
diff -c -r1.24 xlog_internal.h
*** src/include/access/xlog_internal.h 11 Aug 2008 11:05:11 -0000 1.24
--- src/include/access/xlog_internal.h 22 Sep 2008 20:26:56 -0000
***************
*** 17,22 ****
--- 17,23 ----
#define XLOG_INTERNAL_H
#include "access/xlog.h"
+ #include "catalog/pg_control.h"
#include "fmgr.h"
#include "pgtime.h"
#include "storage/block.h"
***************
*** 245,250 ****
--- 246,253 ----
extern pg_time_t GetLastSegSwitchTime(void);
extern XLogRecPtr RequestXLogSwitch(void);
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint);
+
/*
* These aren't in xlog.h because I'd rather not include fmgr.h there.
*/
Index: src/include/catalog/pg_control.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/catalog/pg_control.h,v
retrieving revision 1.41
diff -c -r1.41 pg_control.h
*** src/include/catalog/pg_control.h 21 Apr 2008 00:26:47 -0000 1.41
--- src/include/catalog/pg_control.h 22 Sep 2008 19:21:37 -0000
***************
*** 46,52 ****
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
!
/* System status indicator */
typedef enum DBState
--- 46,52 ----
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
! #define XLOG_RECOVERY_END 0x50
/* System status indicator */
typedef enum DBState
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h 11 Aug 2008 11:05:11 -0000 1.12
--- src/include/postmaster/bgwriter.h 22 Sep 2008 15:53:22 -0000
***************
*** 12,17 ****
--- 12,18 ----
#ifndef _BGWRITER_H
#define _BGWRITER_H
+ #include "catalog/pg_control.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
***************
*** 25,30 ****
--- 26,32 ----
extern void BackgroundWriterMain(void);
extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
extern void CheckpointWriteDelay(int flags, double progress);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.115
diff -c -r1.115 bufmgr.h
*** src/include/storage/bufmgr.h 11 Aug 2008 11:05:11 -0000 1.115
--- src/include/storage/bufmgr.h 22 Sep 2008 18:36:54 -0000
***************
*** 193,199 ****
extern void AbortBufferIO(void);
extern void BufmgrCommit(void);
! extern void BgBufferSync(void);
extern void AtProcExit_LocalBuffers(void);
--- 193,199 ----
extern void AbortBufferIO(void);
extern void BufmgrCommit(void);
! extern void BgBufferSync(bool calledInRecovery);
extern void AtProcExit_LocalBuffers(void);
Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h 19 Jun 2008 21:32:56 -0000 1.20
--- src/include/storage/pmsignal.h 22 Sep 2008 15:54:57 -0000
***************
*** 22,27 ****
--- 22,28 ----
*/
typedef enum
{
+ PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */
PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */
PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers