On Thu, 2008-08-07 at 12:44 +0100, Simon Riggs wrote: > I would like to propose some changes to the infrastructure for recovery. > These changes are beneficial in themselves, but also form the basis for > other work we might later contemplate. > > Currently > * the startup process performs restartpoints during recovery > * the death of the startup process is tied directly to the change of > state in the postmaster following recovery > > I propose to > * have startup process signal postmaster when it starts Redo phase (if > it starts it)
> Decoupling things in this way allows us to > 1. arrange for the bgwriter to start during Redo, so it can: > i) clean dirty blocks for the startup process > ii) perform restartpoints in background > Both of these aspects will increase performance of recovery Taking into account comments from Tom and Alvaro, I have included a patch with the following changes: * new postmaster mode known as consistent recovery, entered only when recovery passes the safe/consistent point. InRedo is now set in all processes when started/connected in consistent recovery mode. * bgwriter and stats processes start in consistent recovery mode. bgwriter changes mode when startup process completes. * bgwriter now performs restartpoints and also cleans shared_buffers while the startup process performs redo apply * recovery.conf parameter log_restartpoints is now deprecated, since its function overlaps with log_checkpoints too much. I've kept the distinction between restartpoints and checkpoints in code, to avoid convoluted code. Minor change, not critical. [Replying to one of Alvaro's other comments: Startup process still uses XLogReadBuffer. I'm not planning on changing that either, at least not in this patch.] Patch doesn't conflict with rmgr plugin patch. Passes make check, but that's easy. Various other tests all seem to be working. -- Simon Riggs www.2ndQuadrant.com PostgreSQL Training, Services and Support
Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.317 diff -c -r1.317 xlog.c *** src/backend/access/transam/xlog.c 11 Aug 2008 11:05:10 -0000 1.317 --- src/backend/access/transam/xlog.c 31 Aug 2008 19:07:40 -0000 *************** *** 131,137 **** static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; - static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static TimestampTz recoveryLastXTime = 0; --- 131,136 ---- *************** *** 386,392 **** static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; ! static bool InRedo = false; static void XLogArchiveNotify(const char *xlog); --- 385,391 ---- static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; ! bool InRedo = false; static void XLogArchiveNotify(const char *xlog); *************** *** 480,485 **** --- 479,488 ---- bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + /* cross-check on whether we should be here or not */ + if (InRedo) + elog(FATAL, "cannot write new WAL data during recovery mode"); + /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) elog(PANIC, "invalid xlog info mask %02X", info); *************** *** 2051,2057 **** unlink(tmppath); } ! elog(DEBUG2, "done creating and filling new WAL file"); /* Set flag to tell caller there was no existent file */ *use_existent = false; --- 2054,2061 ---- unlink(tmppath); } ! XLogFileName(tmppath, ThisTimeLineID, log, seg); ! 
elog(DEBUG2, "done creating and filling new WAL file %s", tmppath); /* Set flag to tell caller there was no existent file */ *use_existent = false; *************** *** 4532,4546 **** } else if (strcmp(tok1, "log_restartpoints") == 0) { - /* - * does nothing if a recovery_target is not also set - */ - if (!parse_bool(tok2, &recoveryLogRestartpoints)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); ereport(LOG, ! (errmsg("log_restartpoints = %s", tok2))); } else ereport(FATAL, --- 4536,4544 ---- } else if (strcmp(tok1, "log_restartpoints") == 0) { ereport(LOG, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("parameter \"log_restartpoints\" has been deprecated"))); } else ereport(FATAL, *************** *** 4811,4816 **** --- 4809,4815 ---- CheckPoint checkPoint; bool wasShutdown; bool reachedStopPoint = false; + bool reachedSafeStopPoint = false; bool haveBackupLabel = false; XLogRecPtr RecPtr, LastRec, *************** *** 5039,5044 **** --- 5038,5048 ---- UpdateControlFile(); /* + * Reset pgstat data, because it may be invalid after recovery. + */ + pgstat_reset_all(); + + /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the * label file so that if we crash during recovery, we'll pick up at *************** *** 5148,5153 **** --- 5152,5172 ---- LastRec = ReadRecPtr; + /* + * Have we reached our safe stopping point? 
If so, we can + * signal Postmaster to enter consistent recovery mode + */ + if (!reachedSafeStopPoint && + XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr)) + { + reachedSafeStopPoint = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + EndRecPtr.xlogid, EndRecPtr.xrecoff))); + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_START); + } + record = ReadRecord(NULL, LOG); } while (record != NULL && recoveryContinue); *************** *** 5169,5174 **** --- 5188,5194 ---- /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); + reachedSafeStopPoint = true; } } *************** *** 5184,5190 **** * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, --- 5204,5210 ---- * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! if (InRecovery && !reachedSafeStopPoint) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, *************** *** 5305,5314 **** */ XLogCheckInvalidPages(); ! /* ! * Reset pgstat data, because it may be invalid after recovery. ! */ ! pgstat_reset_all(); /* * Perform a checkpoint to update all our recovery activity to disk. --- 5325,5332 ---- */ XLogCheckInvalidPages(); ! if (IsUnderPostmaster) ! BgWriterCompleteRestartPointImmediately(); /* * Perform a checkpoint to update all our recovery activity to disk. *************** *** 5318,5323 **** --- 5336,5344 ---- * assigning a new TLI, using a shutdown checkpoint allows us to have * the rule that TLI only changes in shutdown checkpoints, which * allows some extra error checking in xlog_redo. + * + * Note that this will wait behind any restartpoint that the bgwriter + * is currently performing, though will be much faster as a result. 
*/ CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } *************** *** 5372,5377 **** --- 5393,5401 ---- readRecordBuf = NULL; readRecordBufSize = 0; } + + if (IsUnderPostmaster) + BgWriterRecoveryComplete(); } /* *************** *** 5642,5648 **** * Log end of a checkpoint. */ static void ! LogCheckpointEnd(void) { long write_secs, sync_secs, --- 5666,5672 ---- * Log end of a checkpoint. */ static void ! LogCheckpointEnd(bool checkpoint) { long write_secs, sync_secs, *************** *** 5665,5673 **** CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " "%d transaction log file(s) added, %d removed, %d recycled; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, --- 5689,5698 ---- CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! elog(LOG, "%s complete: wrote %d buffers (%.1f%%); " "%d transaction log file(s) added, %d removed, %d recycled; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + (checkpoint ? " checkpoint" : "restartpoint"), CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, *************** *** 6002,6008 **** /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(); LWLockRelease(CheckpointLock); } --- 6027,6033 ---- /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(true); LWLockRelease(CheckpointLock); } *************** *** 6071,6099 **** } } /* ! * OK, force data out to disk ! */ ! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); ! ! /* ! * Update pg_control so that any subsequent crash will restart from this ! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint ! * record itself. 
*/ ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = ReadRecPtr; ControlFile->checkPointCopy = *checkPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); ! if (recoveryLastXTime) ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), ! (errmsg("last completed transaction was at log time %s", ! timestamptz_to_str(recoveryLastXTime)))); } /* --- 6096,6164 ---- } } + if (recoveryLastXTime) + ereport((log_checkpoints ? LOG : DEBUG2), + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(recoveryLastXTime)))); /* ! * Update ControlFile data in shared memory. ! * Note: ReadRecPtr gives the XLOG address of the checkpoint record itself. */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = ReadRecPtr; ControlFile->checkPointCopy = *checkPoint; + RequestRestartPoint(); + LWLockRelease(ControlFileLock); + } + + /* + * As of 8.4, RestartPoints are always created by the bgwriter + */ + void + CreateRestartPoint(void) + { + CheckPoint *checkPoint; + + if (log_checkpoints) + { + /* + * Prepare to accumulate statistics. + */ + + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + elog(LOG, "restartpoint starting:"); + } + + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + + checkPoint = &ControlFile->checkPointCopy; + + /* + * OK, write out dirty blocks smoothly + */ + CheckPointGuts(checkPoint->redo, 0); + + /* + * Update pg_control, using current time + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* All real work is done, but log before releasing lock. */ + if (log_checkpoints) + LogCheckpointEnd(true); ! 
ereport((log_checkpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); ! ! LWLockRelease(CheckpointLock); ! } /* Index: src/backend/postmaster/bgwriter.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v retrieving revision 1.51 diff -c -r1.51 bgwriter.c *** src/backend/postmaster/bgwriter.c 11 Aug 2008 11:05:11 -0000 1.51 --- src/backend/postmaster/bgwriter.c 31 Aug 2008 19:45:39 -0000 *************** *** 122,127 **** --- 122,129 ---- { pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */ + bool InRedo; + slock_t ckpt_lck; /* protects all the ckpt_* fields */ int ckpt_started; /* advances when checkpoint starts */ *************** *** 166,172 **** /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! static XLogRecPtr ckpt_start_recptr; static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; --- 168,174 ---- /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! 
static XLogRecPtr ckpt_start_recptr; /* not used if InRedo */ static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; *************** *** 186,191 **** --- 188,208 ---- static void ReqCheckpointHandler(SIGNAL_ARGS); static void ReqShutdownHandler(SIGNAL_ARGS); + /* notify bgwriter of change in mode */ + void + BgWriterRecoveryComplete(void) + { + BgWriterShmem->InRedo = false; + elog(DEBUG1, "recovery complete"); + } + + /* ask bgwriter to complete any restartpoint, if any, with zero delay */ + void + BgWriterCompleteRestartPointImmediately(void) + { + BgWriterShmem->ckpt_flags = CHECKPOINT_IMMEDIATE; + elog(DEBUG2, "asking bgwriter to complete any restartpoint with zero delay"); + } /* * Main entry point for bgwriter process *************** *** 202,207 **** --- 219,230 ---- BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; + /* + * Follow the postmaster's current mode at startup. If we are InRedo then + * the startup process will later tell us when it is complete. + */ + BgWriterShmem->InRedo = InRedo; + /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (bgwriter probably never has any *************** *** 356,371 **** */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. --- 379,393 ---- */ PG_SETMASK(&UnBlockSig); + if (InRedo) + elog(DEBUG1, "bgwriter starting in recovery mode, pid = %u", + BgWriterShmem->bgwriter_pid); + /* * Loop forever */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. 
*************** *** 383,501 **** got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; - } - if (shutdown_requested) - { - /* - * From here on, elog(ERROR) should end with exit(1), not send - * control back to the sigsetjmp block above - */ - ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); - DumpFreeSpaceMap(0, 0); - /* Normal exit from the bgwriter is here */ - proc_exit(0); /* done */ - } - - /* - * Force a checkpoint if too much time has elapsed since the last one. - * Note that we count a timed checkpoint in stats only when this - * occurs without an external request, but we set the CAUSE_TIME flag - * bit even if there is also an external request. - */ - now = (pg_time_t) time(NULL); - elapsed_secs = now - last_checkpoint_time; - if (elapsed_secs >= CheckPointTimeout) - { - if (!do_checkpoint) - BgWriterStats.m_timed_checkpoints++; - do_checkpoint = true; - flags |= CHECKPOINT_CAUSE_TIME; - } ! /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. ! */ ! if (do_checkpoint) { - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if ((flags & CHECKPOINT_CAUSE_XLOG) && ! 
elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. */ ! smgrcloseall(); /* ! * Indicate checkpoint completion to any waiting backends. */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! ckpt_active = false; /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven checkpoints ! * happen at a predictable spacing. */ ! last_checkpoint_time = now; ! } ! else ! BgBufferSync(); ! /* Check for archive_timeout and switch xlog files if necessary. */ ! CheckArchiveTimeout(); ! /* Nap for the configured time. */ ! BgWriterNap(); } } --- 405,610 ---- got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } ! if (InRedo) { /* ! * Check to see whether startup process has completed redo. ! * If so, we can permanently change out of recovery mode. */ ! if (BgWriterShmem->InRedo == false) ! { ! elog(LOG, "changing to InRedo = false"); ! InitXLOGAccess(); ! InRedo = false; ! /* ! * Start time-driven events from now ! */ ! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); ! } ! ! if (checkpoint_requested) ! { ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_time = (pg_time_t) time(NULL); ! ckpt_cached_elapsed = 0; ! ! CreateRestartPoint(); ! ! ckpt_active = false; ! checkpoint_requested = false; ! /* ! 
* Reset any flags if we requested immediate completion part ! * way through the restart point ! */ ! BgWriterShmem->ckpt_flags = 0; ! } ! else ! { ! /* Clean buffers dirtied by recovery */ ! BgBufferSync(); ! ! /* Nap for the configured time. */ ! BgWriterNap(); ! } ! ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } /* ! * Check to see whether startup process has completed redo. ! * If so, we can permanently change out of recovery mode. */ ! if (BgWriterShmem->InRedo == false) ! { ! elog(DEBUG2, "changing to InRedo = false"); ! ! InitXLOGAccess(); ! InRedo = false; ! ! /* ! * Start time-driven events from now ! */ ! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); ! } ! } ! else /* Normal processing */ ! { ! bool do_checkpoint = false; ! int flags = 0; ! pg_time_t now; ! int elapsed_secs; ! ! Assert(!InRedo); ! ! if (checkpoint_requested) ! { ! checkpoint_requested = false; ! do_checkpoint = true; ! BgWriterStats.m_requested_checkpoints++; ! } ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Close down the database */ ! ShutdownXLOG(0, 0); ! DumpFreeSpaceMap(0, 0); ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) ! { ! if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! 
flags |= CHECKPOINT_CAUSE_TIME; ! } /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. */ ! if (do_checkpoint) ! { ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; ! ! /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if ((flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! ! /* ! * Indicate checkpoint completion to any waiting backends. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! ckpt_active = false; ! ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven checkpoints ! * happen at a predictable spacing. ! */ ! 
last_checkpoint_time = now; ! } ! else ! BgBufferSync(); ! /* Check for archive_timeout and switch xlog files if necessary. */ ! CheckArchiveTimeout(); ! /* Nap for the configured time. */ ! BgWriterNap(); ! } } } *************** *** 588,594 **** (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! AbsorbFsyncRequests(); udelay -= 1000000L; } --- 697,704 ---- (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! if (!InRedo) ! AbsorbFsyncRequests(); udelay -= 1000000L; } *************** *** 642,647 **** --- 752,770 ---- if (!am_bg_writer) return; + /* Perform minimal duties during recovery and skip wait if requested */ + if (InRedo) + { + BgBufferSync(); + + if (!ImmediateCheckpointRequested() && + !shutdown_requested && + IsCheckpointOnSchedule(progress)) + BgWriterNap(); + + return; + } + /* * Perform the usual bgwriter duties and take a nap, unless we're behind * schedule, in which case we just try to catch up as quickly as possible. *************** *** 716,731 **** * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; } /* --- 839,857 ---- * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! if (!InRedo) { ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) ! { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; ! 
} } /* *************** *** 966,971 **** --- 1092,1118 ---- } } + void + RequestRestartPoint(void) + { + /* + * If in a standalone backend, just do it ourselves. + */ + if (!IsPostmasterEnvironment) + { + CreateRestartPoint(); + return; + } + + /* + * Send signal to request restartpoint. + */ + if (BgWriterShmem->bgwriter_pid == 0) + elog(LOG, "could not request restartpoint because bgwriter not running"); + if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint: %m"); + } + /* * ForwardFsyncRequest * Forward a file-fsync request from a backend to the bgwriter Index: src/backend/postmaster/postmaster.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v retrieving revision 1.561 diff -c -r1.561 postmaster.c *** src/backend/postmaster/postmaster.c 26 Jun 2008 02:47:19 -0000 1.561 --- src/backend/postmaster/postmaster.c 31 Aug 2008 17:10:31 -0000 *************** *** 254,259 **** --- 254,264 ---- { PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* consistent recovery mode; state only + * entered for archive and streaming recovery, + * and only after the point where the + * all data is in consistent state. + */ PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ *************** *** 2104,2110 **** if (pid == StartupPID) { StartupPID = 0; ! Assert(pmState == PM_STARTUP); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) --- 2109,2115 ---- if (pid == StartupPID) { StartupPID = 0; ! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) *************** *** 2136,2141 **** --- 2141,2147 ---- * Otherwise, commence normal operations. 
*/ pmState = PM_RUN; + InRedo = false; /* * Load the flat authorization file into postmaster's cache. The *************** *** 2148,2155 **** * Crank up the background writer. It doesn't matter if this * fails, we'll just try again later. */ ! Assert(BgWriterPID == 0); ! BgWriterPID = StartBackgroundWriter(); /* * Likewise, start other special children as needed. In a restart --- 2154,2161 ---- * Crank up the background writer. It doesn't matter if this * fails, we'll just try again later. */ ! if (BgWriterPID == 0) ! BgWriterPID = StartBackgroundWriter(); /* * Likewise, start other special children as needed. In a restart *************** *** 2812,2817 **** --- 2818,2825 ---- */ MyCancelKey = PostmasterRandom(); + InRedo = (pmState != PM_RUN); + /* * Make room for backend data structure. Better before the fork() so we * can handle failure cleanly. *************** *** 3821,3826 **** --- 3829,3880 ---- PG_SETMASK(&BlockSig); + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START)) + { + Assert(pmState == PM_STARTUP); + + /* + * Go to shutdown mode if a shutdown request was pending. + */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Startup process has entered recovery + */ + pmState = PM_RECOVERY; + InRedo = true; + + /* + * Load the flat authorization file into postmaster's cache. The + * startup process won't have recomputed this from the database yet, + * so we it may change following recovery. + */ + load_role(); + + /* + * Crank up the background writer. It doesn't matter if this + * fails, we'll just try again later. + */ + Assert(BgWriterPID == 0); + BgWriterPID = StartBackgroundWriter(); + + /* + * Likewise, start other special children as needed. 
+ */ + Assert(PgStatPID == 0); + PgStatPID = pgstat_start(); + + /* XXX at this point we could accept read-only connections */ + ereport(DEBUG1, + (errmsg("database system is in consistent recovery mode"))); + } + } + if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE)) { /* Index: src/include/miscadmin.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/miscadmin.h,v retrieving revision 1.202 diff -c -r1.202 miscadmin.h *** src/include/miscadmin.h 23 Apr 2008 13:44:59 -0000 1.202 --- src/include/miscadmin.h 31 Aug 2008 11:23:45 -0000 *************** *** 64,69 **** --- 64,71 ---- * *****************************************************************************/ + extern bool InRedo; + /* in globals.c */ /* these are marked volatile because they are set by signal handlers: */ extern PGDLLIMPORT volatile bool InterruptPending; Index: src/include/access/xlog.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v retrieving revision 1.88 diff -c -r1.88 xlog.h *** src/include/access/xlog.h 12 May 2008 08:35:05 -0000 1.88 --- src/include/access/xlog.h 31 Aug 2008 13:33:43 -0000 *************** *** 205,210 **** --- 205,211 ---- extern void ShutdownXLOG(int code, Datum arg); extern void InitXLOGAccess(void); extern void CreateCheckPoint(int flags); + extern void CreateRestartPoint(void); extern void XLogPutNextOid(Oid nextOid); extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); Index: src/include/postmaster/bgwriter.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v retrieving revision 1.12 diff -c -r1.12 bgwriter.h *** src/include/postmaster/bgwriter.h 11 Aug 2008 11:05:11 -0000 1.12 --- src/include/postmaster/bgwriter.h 31 Aug 2008 15:35:30 -0000 *************** *** 25,36 **** --- 25,40 ---- 
extern void BackgroundWriterMain(void); extern void RequestCheckpoint(int flags); + extern void RequestRestartPoint(void); extern void CheckpointWriteDelay(int flags, double progress); extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); extern void AbsorbFsyncRequests(void); + extern void BgWriterRecoveryComplete(void); + extern void BgWriterCompleteRestartPointImmediately(void); + extern Size BgWriterShmemSize(void); extern void BgWriterShmemInit(void); Index: src/include/storage/pmsignal.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v retrieving revision 1.20 diff -c -r1.20 pmsignal.h *** src/include/storage/pmsignal.h 19 Jun 2008 21:32:56 -0000 1.20 --- src/include/storage/pmsignal.h 31 Aug 2008 11:26:40 -0000 *************** *** 22,27 **** --- 22,28 ---- */ typedef enum { + PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */ PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */ PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
-- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches