WIP patch to switch from one log file to the next when we do a pg_stop_backup(), including WAL replay handling at recovery time.
The patch currently crashes the server at various points, so don't stare too hard, but it applies cleanly on CVS tip, compiles, and passes make check. The main issue is the need to poke the xlog record pointer with a new value after the log switch. I'm a little uncertain about that approach and I'm very likely getting it wrong at the moment. Better ideas welcome. The patch is incomplete in that it doesn't handle shutdown checkpoints as log switches in archive mode (yet). Also, there is nothing in here about standby databases (yet). Any comments appreciated before I spend too much time on this. Best Regards, Simon Riggs
Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.187 diff -d -c -r1.187 xlog.c *** src/backend/access/transam/xlog.c 17 Apr 2005 03:04:29 -0000 1.187 --- src/backend/access/transam/xlog.c 19 Apr 2005 18:31:27 -0000 *************** *** 403,408 **** --- 403,410 ---- static uint32 openLogId = 0; static uint32 openLogSeg = 0; static uint32 openLogOff = 0; + static uint32 newLogId = 0; /* used when switching log segment file */ + static uint32 newLogSeg = 0; /* * These variables are used similarly to the ones above, but for reading *************** *** 428,433 **** --- 430,437 ---- static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; + static bool PerformSwitchXLogFile = false; + static bool InRedo = false; *************** *** 442,447 **** --- 446,452 ---- static bool AdvanceXLInsertBuffer(void); static void XLogWrite(XLogwrtRqst WriteRqst); + static void XLogSwitchLogSegment(void); static int XLogFileInit(uint32 log, uint32 seg, bool *use_existent, bool use_lock); static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath, *************** *** 1150,1156 **** XLogCtlWrite *Write = &XLogCtl->Write; char *from; bool ispartialpage; - bool use_existent; /* We should always be inside a critical section here */ Assert(CritSectionCount > 0); --- 1155,1160 ---- *************** *** 1181,1253 **** if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg)) { ! /* ! * Switch to new logfile segment. ! */ ! if (openLogFile >= 0) ! { ! if (close(openLogFile)) ! ereport(PANIC, ! (errcode_for_file_access(), ! errmsg("could not close log file %u, segment %u: %m", ! openLogId, openLogSeg))); ! openLogFile = -1; ! } ! XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg); ! ! /* create/use new log file */ ! use_existent = true; ! openLogFile = XLogFileInit(openLogId, openLogSeg, ! 
&use_existent, true); ! openLogOff = 0; ! ! /* update pg_control, unless someone else already did */ ! LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ! if (ControlFile->logId < openLogId || ! (ControlFile->logId == openLogId && ! ControlFile->logSeg < openLogSeg + 1)) ! { ! ControlFile->logId = openLogId; ! ControlFile->logSeg = openLogSeg + 1; ! ControlFile->time = time(NULL); ! UpdateControlFile(); ! ! /* ! * Signal bgwriter to start a checkpoint if it's been ! * too long since the last one. (We look at local copy of ! * RedoRecPtr which might be a little out of date, but ! * should be close enough for this purpose.) ! * ! * A straight computation of segment number could overflow ! * 32 bits. Rather than assuming we have working 64-bit ! * arithmetic, we compare the highest-order bits separately, ! * and force a checkpoint immediately when they change. ! */ ! if (IsUnderPostmaster) ! { ! uint32 old_segno, ! new_segno; ! uint32 old_highbits, ! new_highbits; ! ! old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile + ! (RedoRecPtr.xrecoff / XLogSegSize); ! old_highbits = RedoRecPtr.xlogid / XLogSegSize; ! new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + ! openLogSeg; ! new_highbits = openLogId / XLogSegSize; ! if (new_highbits != old_highbits || ! new_segno >= old_segno + (uint32) CheckPointSegments) ! { ! #ifdef WAL_DEBUG ! if (XLOG_DEBUG) ! elog(LOG, "time for a checkpoint, signaling bgwriter"); ! #endif ! RequestCheckpoint(false); ! } ! } ! } ! LWLockRelease(ControlFileLock); } if (openLogFile < 0) --- 1185,1191 ---- if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg)) { ! XLogSwitchLogSegment(); } if (openLogFile < 0) *************** *** 1285,1290 **** --- 1223,1229 ---- /* * If we just wrote the whole last page of a logfile segment, + * or if we have been instructed to switch to a new logfile segment * fsync the segment immediately. 
This avoids having to go back * and re-open prior segments when an fsync request comes along * later. Doing it here ensures that one and only one backend will *************** *** 1293,1301 **** * This is also the right place to notify the Archiver that the * segment is ready to copy to archival storage. */ ! if (openLogOff >= XLogSegSize && !ispartialpage) { issue_xlog_fsync(); LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */ if (XLogArchivingActive()) --- 1232,1258 ---- * This is also the right place to notify the Archiver that the * segment is ready to copy to archival storage. */ ! if (PerformSwitchXLogFile || ! (openLogOff >= XLogSegSize && !ispartialpage)) { issue_xlog_fsync(); + /* If we are switching logfiles, then allow the xlog switch record + * to be written and then afterwards move the rec pointer + * forwards. If writing the xlog switch record took us exactly + * to the end of the logfile segment, there is no need to + * switch logfiles segments twice + */ + if (PerformSwitchXLogFile && + !(openLogOff >= XLogSegSize && !ispartialpage)) + { + newLogId = openLogId; + newLogSeg = openLogSeg; + NextLogSeg(newLogId, newLogSeg); + LogwrtResult.Write.xlogid = newLogId; + LogwrtResult.Write.xrecoff = newLogSeg * XLogSegSize; + XLogSwitchLogSegment(); + PerformSwitchXLogFile = false; + } LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */ if (XLogArchivingActive()) *************** *** 1369,1374 **** --- 1326,1408 ---- } /* + * Switch to new logfile segment + * + * Convenience function to avoid duplicated code in XLogWrite + * and should not be called from another location + */ + void + XLogSwitchLogSegment(void) + { + bool use_existent; + + if (openLogFile >= 0) + { + if (close(openLogFile)) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log file %u, segment %u: %m", + openLogId, openLogSeg))); + openLogFile = -1; + } + XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg); + + /* create/use new log 
file */ + use_existent = true; + openLogFile = XLogFileInit(openLogId, openLogSeg, + &use_existent, true); + openLogOff = 0; + + /* update pg_control, unless someone else already did */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (ControlFile->logId < openLogId || + (ControlFile->logId == openLogId && + ControlFile->logSeg < openLogSeg + 1)) + { + ControlFile->logId = openLogId; + ControlFile->logSeg = openLogSeg + 1; + ControlFile->time = time(NULL); + UpdateControlFile(); + + /* + * Signal bgwriter to start a checkpoint if it's been + * too long since the last one. (We look at local copy of + * RedoRecPtr which might be a little out of date, but + * should be close enough for this purpose.) + * + * A straight computation of segment number could overflow + * 32 bits. Rather than assuming we have working 64-bit + * arithmetic, we compare the highest-order bits separately, + * and force a checkpoint immediately when they change. + */ + if (IsUnderPostmaster) + { + uint32 old_segno, + new_segno; + uint32 old_highbits, + new_highbits; + + old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile + + (RedoRecPtr.xrecoff / XLogSegSize); + old_highbits = RedoRecPtr.xlogid / XLogSegSize; + new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + + openLogSeg; + new_highbits = openLogId / XLogSegSize; + if (new_highbits != old_highbits || + new_segno >= old_segno + (uint32) CheckPointSegments) + { + #ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "time for a checkpoint, signaling bgwriter"); + #endif + RequestCheckpoint(false); + } + } + } + LWLockRelease(ControlFileLock); + } + + /* * Ensure that all XLOG data through the given position is flushed to disk. * * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not *************** *** 2408,2414 **** { RecPtr = &tmpRecPtr; /* fast case if next record is on same page */ ! 
if (nextRecord != NULL) { record = nextRecord; goto got_record; --- 2442,2448 ---- { RecPtr = &tmpRecPtr; /* fast case if next record is on same page */ ! if (nextRecord != NULL && !PerformSwitchXLogFile) { record = nextRecord; goto got_record; *************** *** 2416,2425 **** /* align old recptr to next page */ if (tmpRecPtr.xrecoff % BLCKSZ != 0) tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ); ! if (tmpRecPtr.xrecoff >= XLogFileSize) { (tmpRecPtr.xlogid)++; tmpRecPtr.xrecoff = 0; } /* We will account for page header size below */ } --- 2450,2460 ---- /* align old recptr to next page */ if (tmpRecPtr.xrecoff % BLCKSZ != 0) tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ); ! if (PerformSwitchXLogFile || tmpRecPtr.xrecoff >= XLogFileSize) { (tmpRecPtr.xlogid)++; tmpRecPtr.xrecoff = 0; + PerformSwitchXLogFile = false; } /* We will account for page header size below */ } *************** *** 5054,5059 **** --- 5089,5118 ---- } /* + * Writes an record to xlog to show that an xlog switch has taken place + * This will also cause a physical xlog switch to occur immediately + * afterwards, as if the xlog file had been completely filled + */ + void + RequestXLogSwitch(void) + { + XLogRecData rdata; + + xl_xlog_switch xlrec; + + xlrec.xtime = time(NULL); + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) (&xlrec); + rdata.len = sizeof(xl_xlog_switch); + rdata.next = NULL; + + PerformSwitchXLogFile = true; + + (void) XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata); + } + + /* * XLOG resource manager's routines */ void *************** *** 5072,5077 **** --- 5131,5141 ---- ShmemVariableCache->oidCount = 0; } } + else if (info == XLOG_SWITCH) + { + /* Processing instruction only. 
Does not effect database contents */ + PerformSwitchXLogFile = true; + } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; *************** *** 5150,5155 **** --- 5214,5228 ---- memcpy(&nextOid, rec, sizeof(Oid)); sprintf(buf + strlen(buf), "nextOid: %u", nextOid); } + else if (info == XLOG_SWITCH) + { + xl_xlog_switch *xlrec = (xl_xlog_switch *) rec; + struct tm *tm = localtime(&xlrec->xtime); + + sprintf(buf + strlen(buf), "xlog switch: %04u-%02u-%02u %02u:%02u:%02u", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + } else strcat(buf, "UNKNOWN"); } *************** *** 5555,5560 **** --- 5628,5642 ---- } /* + * Force xlog switch so that the archive set is complete + * We do this after the history file, since the history file points + * to the xlog file, so if we switched too soon then the pointer + * would still point to an unarchived xlog file + * There are no recoverable errors from this action. + */ + RequestXLogSwitch(); + + /* * We're done. As a convenience, return the ending WAL offset. 
*/ snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X", Index: src/backend/postmaster/pgarch.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/pgarch.c,v retrieving revision 1.15 diff -d -c -r1.15 pgarch.c *** src/backend/postmaster/pgarch.c 10 Mar 2005 07:14:03 -0000 1.15 --- src/backend/postmaster/pgarch.c 19 Apr 2005 18:31:28 -0000 *************** *** 82,87 **** --- 82,88 ---- */ static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t wakened = false; + static volatile sig_atomic_t archshutdown = false; /* ---------- * Local function forward declarations *************** *** 95,100 **** --- 96,103 ---- static void pgarch_exit(SIGNAL_ARGS); static void ArchSigHupHandler(SIGNAL_ARGS); static void pgarch_waken(SIGNAL_ARGS); + static void pgarch_shutdown(SIGNAL_ARGS); + static void pgarch_MainLoop(void); static void pgarch_ArchiverCopyLoop(void); static bool pgarch_archiveXlog(char *xlog); *************** *** 234,240 **** pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, pgarch_waken); ! pqsignal(SIGUSR2, SIG_IGN); pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); --- 237,243 ---- pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, pgarch_waken); ! 
pqsignal(SIGUSR2, pgarch_shutdown); pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); *************** *** 280,287 **** --- 283,300 ---- pgarch_waken(SIGNAL_ARGS) { wakened = true; + archshutdown = false; + } + + /* SIGUSR2 signal handler for archiver process */ + static void + pgarch_shutdown(SIGNAL_ARGS) + { + wakened = true; + archshutdown = true; } + /* * pgarch_MainLoop * *************** *** 317,324 **** --- 330,344 ---- if (wakened) { wakened = false; + /* Allow ourselves to be signalled while archiving + * since we may start doing a clean shutdown, then + * have the user decide they dont want to wait that long + */ pgarch_ArchiverCopyLoop(); last_copy_time = time(NULL); + /* check to see if we should shutdown after archiving... */ + if (archshutdown) + exit (0); } /* Index: src/backend/postmaster/postmaster.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/postmaster.c,v retrieving revision 1.450 diff -d -c -r1.450 postmaster.c *** src/backend/postmaster/postmaster.c 25 Mar 2005 00:34:21 -0000 1.450 --- src/backend/postmaster/postmaster.c 19 Apr 2005 18:31:44 -0000 *************** *** 1867,1875 **** /* And tell it to shut down */ if (BgWriterPID != 0) kill(BgWriterPID, SIGUSR2); ! /* Tell pgarch to shut down too; nothing left for it to do */ if (PgArchPID != 0) ! kill(PgArchPID, SIGQUIT); /* Tell pgstat to shut down too; nothing left for it to do */ if (PgStatPID != 0) kill(PgStatPID, SIGQUIT); --- 1867,1877 ---- /* And tell it to shut down */ if (BgWriterPID != 0) kill(BgWriterPID, SIGUSR2); ! /* Tell pgarch to shut down too, but finish archiving first */ ! if (XLogArchivingActive() && PgArchPID == 0) ! PgArchPID = pgarch_start(); if (PgArchPID != 0) ! 
kill(PgArchPID, SIGUSR2); /* Tell pgstat to shut down too; nothing left for it to do */ if (PgStatPID != 0) kill(PgStatPID, SIGQUIT); Index: src/include/access/xlog.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/access/xlog.h,v retrieving revision 1.59 diff -d -c -r1.59 xlog.h *** src/include/access/xlog.h 31 Dec 2004 22:03:21 -0000 1.59 --- src/include/access/xlog.h 19 Apr 2005 18:31:45 -0000 *************** *** 133,138 **** --- 133,140 ---- extern void InitXLOGAccess(void); extern void CreateCheckPoint(bool shutdown, bool force); extern void XLogPutNextOid(Oid nextOid); + extern void RequestXLogSwitch(void); + extern XLogRecPtr GetRedoRecPtr(void); #endif /* XLOG_H */ Index: src/include/catalog/pg_control.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/catalog/pg_control.h,v retrieving revision 1.20 diff -d -c -r1.20 pg_control.h *** src/include/catalog/pg_control.h 29 Mar 2005 03:01:32 -0000 1.20 --- src/include/catalog/pg_control.h 19 Apr 2005 18:31:45 -0000 *************** *** 47,52 **** --- 47,57 ---- #define XLOG_CHECKPOINT_ONLINE 0x10 #define XLOG_NEXTOID 0x30 + #define XLOG_SWITCH 0x40 + typedef struct xl_xlog_switch + { + time_t xtime; + } xl_xlog_switch; /* System status indicator */ typedef enum DBState
---------------------------(end of broadcast)--------------------------- TIP 3: if posting/reading through Usenet, please send an appropriate subscribe-nomail command to [EMAIL PROTECTED] so that your message can get through to the mailing list cleanly