On Sat, Sep 19, 2015 at 8:27 AM, Michael Paquier wrote: > On Fri, Sep 18, 2015 at 6:25 PM, Michael Paquier wrote: >> The refactoring of getTimelineHistory as you propose looks like a good >> idea to me, I tried to remove by myself the difference between source >> and target in copy_fetch.c and friends but this gets uglier, >> particularly because of datadir_source in copy_file_range. Not worth >> it. > > Forgot that: > if (ControlFile_target.state != DB_SHUTDOWNED) > pg_fatal("target server must be shut down cleanly\n"); > We may want to allow a target node shutdowned in recovery as well here.
So, attached is a more polished version of this patch, with its typos cleaned up along with a few other things. For example, I noticed that it is more useful to print the debug information for a timeline history file fetched from the source or target server directly in getTimelineHistory. I have also updated a couple of comments in the code to reflect the fact that the target node is not necessarily a master, and mentioned in findCommonAncestorTimeline that we also check the start position of a timeline, to cover the case where the target and source nodes forked at the same timeline number but at different WAL positions. I am marking this patch as ready for committer. It would be nice in the future to use the recovery test suite to test more advanced scenarios, but it would be a shame to block this patch because of that. Regards, -- Michael
From 563f49ac3b07a49e89844112887eee2d4d982879 Mon Sep 17 00:00:00 2001 From: Michael Paquier <mich...@otacoo.com> Date: Wed, 30 Sep 2015 15:39:34 +0900 Subject: [PATCH] Handle timeline switches in pg_rewind This feature allows pg_rewind to handle data directory synchronization in a much more general way, one example being to be able to return a promoted standby to its old master. This patch softens the shutdown condition of a target and source node (when this one's data directory is used as a synchronization point) to allow the use of nodes that have been shut down while in recovery. Patch by Alexander Korotkov. --- src/bin/pg_rewind/Makefile | 2 +- src/bin/pg_rewind/parsexlog.c | 39 ++++++--- src/bin/pg_rewind/pg_rewind.c | 189 ++++++++++++++++++++++++++++++------------ src/bin/pg_rewind/pg_rewind.h | 10 ++- 4 files changed, 170 insertions(+), 70 deletions(-) diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 92b5d20..48dc770 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -8,7 +8,7 @@ # #------------------------------------------------------------------------- -PGFILEDESC = "pg_rewind - repurpose an old master server as standby" +PGFILEDESC = "pg_rewind - synchronize a data directory with another one forked from" PGAPPICON = win32 subdir = src/bin/pg_rewind diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 2081cf8..d69eafb 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -45,7 +45,7 @@ static char xlogfpath[MAXPGPATH]; typedef struct XLogPageReadPrivate { const char *datadir; - TimeLineID tli; + int tliIndex; } XLogPageReadPrivate; static int SimpleXLogPageRead(XLogReaderState *xlogreader, @@ -55,11 +55,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, /* * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline - * 'tli', until 'endpoint'. 
Make note of the data blocks touched by the WAL - * records, and return them in a page map. + * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of + * the data blocks touched by the WAL records, and return them in a page map. */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli, +extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint) { XLogRecord *record; @@ -68,7 +68,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli, XLogPageReadPrivate private; private.datadir = datadir; - private.tli = tli; + private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); @@ -112,7 +112,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli, * doing anything with the record itself. */ XLogRecPtr -readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli) +readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex) { XLogRecord *record; XLogReaderState *xlogreader; @@ -121,7 +121,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli) XLogRecPtr endptr; private.datadir = datadir; - private.tli = tli; + private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); @@ -152,7 +152,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli) * Find the previous checkpoint preceding given WAL position. */ void -findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli, +findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo) { @@ -173,7 +173,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli, forkptr += (forkptr % XLogSegSize == 0) ? 
SizeOfXLogLongPHD : SizeOfXLogShortPHD; private.datadir = datadir; - private.tli = tli; + private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); @@ -236,9 +236,11 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, { XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; uint32 targetPageOff; - XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; + XLogRecPtr targetSegEnd; + XLogSegNo targetSegNo; XLByteToSeg(targetPagePtr, targetSegNo); + XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, targetSegEnd); targetPageOff = targetPagePtr % XLogSegSize; /* @@ -257,7 +259,20 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, { char xlogfname[MAXFNAMELEN]; - XLogFileName(xlogfname, private->tli, xlogreadsegno); + /* + * Since incomplete segments are copied into next timelines, switch to + * the timeline holding the required segment. Assuming this scan can be + * done both forward and backward, consider also switching timeline + * accordingly. 
+ */ + while (private->tliIndex < targetNentries - 1 && + targetHistory[private->tliIndex].end < targetSegEnd) + private->tliIndex++; + while (private->tliIndex > 0 && + targetHistory[private->tliIndex].begin >= targetSegEnd) + private->tliIndex--; + + XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno); snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname); @@ -293,7 +308,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, Assert(targetSegNo == xlogreadsegno); - *pageTLI = private->tli; + *pageTLI = targetHistory[private->tliIndex].tli; return XLOG_BLCKSZ; } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 4b7e26a..8ad3af8 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_rewind.c - * Synchronizes an old master server to a new timeline + * Synchronizes a PostgreSQL data directory to a new timeline * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * @@ -37,7 +37,7 @@ static void digestControlFile(ControlFileData *ControlFile, char *source, size_t size); static void updateControlFile(ControlFileData *ControlFile); static void sanityChecks(void); -static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli); +static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex); static ControlFileData ControlFile_target; static ControlFileData ControlFile_source; @@ -53,6 +53,10 @@ bool debug = false; bool showprogress = false; bool dry_run = false; +/* Target history */ +TimeLineHistoryEntry *targetHistory; +int targetNentries; + static void usage(const char *progname) { @@ -88,7 +92,7 @@ main(int argc, char **argv) int option_index; int c; XLogRecPtr divergerec; - TimeLineID lastcommontli; + int lastcommontliIndex; XLogRecPtr chkptrec; TimeLineID chkpttli; XLogRecPtr chkptredo; @@ -214,9 
+218,10 @@ main(int argc, char **argv) if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID) pg_fatal("source and target cluster are on the same timeline\n"); - findCommonAncestorTimeline(&divergerec, &lastcommontli); + findCommonAncestorTimeline(&divergerec, &lastcommontliIndex); printf(_("The servers diverged at WAL position %X/%X on timeline %u.\n"), - (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli); + (uint32) (divergerec >> 32), (uint32) divergerec, + targetHistory[lastcommontliIndex].tli); /* * Check for the possibility that the target is in fact a direct ancestor @@ -234,7 +239,7 @@ main(int argc, char **argv) /* Read the checkpoint record on the target to see where it ends. */ chkptendrec = readOneRecord(datadir_target, ControlFile_target.checkPoint, - ControlFile_target.checkPointCopy.ThisTimeLineID); + targetNentries - 1); /* * If the histories diverged exactly at the end of the shutdown @@ -254,7 +259,8 @@ main(int argc, char **argv) exit(0); } - findLastCheckpoint(datadir_target, divergerec, lastcommontli, + findLastCheckpoint(datadir_target, divergerec, + lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo); printf(_("Rewinding from last common checkpoint at %X/%X on timeline %u\n"), (uint32) (chkptrec >> 32), (uint32) chkptrec, @@ -277,7 +283,7 @@ main(int argc, char **argv) * we would need to replay until the end of WAL here. */ pg_log(PG_PROGRESS, "reading WAL in target\n"); - extractPageMap(datadir_target, chkptrec, lastcommontli, + extractPageMap(datadir_target, chkptrec, lastcommontliIndex, ControlFile_target.checkPoint); filemap_finalize(); @@ -374,10 +380,11 @@ sanityChecks(void) /* * Target cluster better not be running. This doesn't guard against * someone starting the cluster concurrently. Also, this is probably more - * strict than necessary; it's OK if the master was not shut down cleanly, - * as long as it isn't running at the moment. 
+ * strict than necessary; it's OK if the target node was not shut down + * cleanly, as long as it isn't running at the moment. */ - if (ControlFile_target.state != DB_SHUTDOWNED) + if (ControlFile_target.state != DB_SHUTDOWNED && + ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("target server must be shut down cleanly\n"); /* @@ -385,75 +392,149 @@ sanityChecks(void) * server is shut down. There isn't any very strong reason for this * limitation, but better safe than sorry. */ - if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED) + if (datadir_source && + ControlFile_source.state != DB_SHUTDOWNED && + ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("source data directory must be shut down cleanly\n"); } /* - * Determine the TLI of the last common timeline in the histories of the two - * clusters. *tli is set to the last common timeline, and *recptr is set to - * the position where the histories diverged (ie. the first WAL record that's - * not the same in both clusters). - * - * Control files of both clusters must be read into ControlFile_target/source - * before calling this. + * Find minimum from two XLOG positions assuming InvalidXLogRecPtr means + * infinity as src/include/access/timeline.h states. This routine should + * be used only when comparing XLOG positions related to history files. */ -static void -findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli) +static XLogRecPtr +MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b) { - TimeLineID targettli; - TimeLineHistoryEntry *sourceHistory; - int nentries; - int i; - TimeLineID sourcetli; + if (XLogRecPtrIsInvalid(a)) + return b; + else if (XLogRecPtrIsInvalid(b)) + return a; + else + return Min(a, b); +} + +/* + * Retrieve timeline history for given control file which should behold + * either source or target. 
+ */ +static TimeLineHistoryEntry * +getTimelineHistory(ControlFileData *controlFile, int *nentries) +{ + TimeLineHistoryEntry *history; + TimeLineID tli; - targettli = ControlFile_target.checkPointCopy.ThisTimeLineID; - sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID; + tli = controlFile->checkPointCopy.ThisTimeLineID; - /* Timeline 1 does not have a history file, so no need to check */ - if (sourcetli == 1) + /* + * Timeline 1 does not have a history file, so there is no need to check and + * fake an entry with infinite start and end positions. + */ + if (tli == 1) { - sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); - sourceHistory->tli = sourcetli; - sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr; - nentries = 1; + history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); + history->tli = tli; + history->begin = history->end = InvalidXLogRecPtr; + *nentries = 1; } else { char path[MAXPGPATH]; char *histfile; - TLHistoryFilePath(path, sourcetli); - histfile = fetchFile(path, NULL); + TLHistoryFilePath(path, tli); + + /* Get history file from appropriate source */ + if (controlFile == &ControlFile_source) + histfile = fetchFile(path, NULL); + else if (controlFile == &ControlFile_target) + histfile = slurpFile(datadir_target, path, NULL); + else + pg_fatal("Invalid control file"); - sourceHistory = rewind_parseTimeLineHistory(histfile, - ControlFile_source.checkPointCopy.ThisTimeLineID, - &nentries); + history = rewind_parseTimeLineHistory(histfile, tli, nentries); pg_free(histfile); } - /* - * Trace the history backwards, until we hit the target timeline. - * - * TODO: This assumes that there are no timeline switches on the target - * cluster after the fork. 
- */ - for (i = nentries - 1; i >= 0; i--) + if (debug) { - TimeLineHistoryEntry *entry = &sourceHistory[i]; + int i; + + if (controlFile == &ControlFile_source) + printf("Source timeline history:\n"); + else if (controlFile == &ControlFile_target) + printf("Target timeline history:\n"); + else + Assert(false); - if (entry->tli == targettli) + /* + * Print the target timeline history. + */ + for (i = 0; i < targetNentries; i++) { - /* found it */ - *recptr = entry->end; - *tli = entry->tli; + TimeLineHistoryEntry *entry; - pg_free(sourceHistory); - return; + entry = &history[i]; + printf("%d: %X/%X - %X/%X\n", entry->tli, + (uint32) (entry->begin >> 32), (uint32) (entry->begin), + (uint32) (entry->end >> 32), (uint32) (entry->end)); } } - pg_fatal("could not find common ancestor of the source and target cluster's timelines\n"); + return history; +} + +/* + * Determine the TLI of the last common timeline in the timeline history of the + * two clusters. targetHistory is filled with target timeline history and + * targetNentries is number of items in targetHistory. *tliIndex is set to the + * index of last common timeline in targetHistory array, and *recptr is set to + * the position where the timeline history diverged (ie. the first WAL record + * that's not the same in both clusters). + * + * Control files of both clusters must be read into ControlFile_target/source + * before calling this routine. + */ +static void +findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) +{ + TimeLineHistoryEntry *sourceHistory; + int sourceNentries; + int i, n; + + /* Retrieve timelines for both source and target */ + sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries); + targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries); + + /* + * Trace the history forward, until we hit the timeline diverge. 
It may + * still be possible that the source and target nodes used the same + * timeline number in their history but with different start position + * depending on the history files that each node has fetched in previous + * recovery processes. Hence check the start position of the new timeline + * as well and move down by one extra timeline entry if they do not match. + */ + n = Min(sourceNentries, targetNentries); + for (i = 0; i < n; i++) + { + if (sourceHistory[i].tli != targetHistory[i].tli || + sourceHistory[i].begin != targetHistory[i].begin) + break; + } + + if (i > 0) + { + i--; + *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end); + *tliIndex = i; + + pg_free(sourceHistory); + return; + } + else + { + pg_fatal("could not find common ancestor of the source and target cluster's timelines\n"); + } } diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index e281369..4826dde 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -27,15 +27,19 @@ extern bool debug; extern bool showprogress; extern bool dry_run; +/* Target history */ +extern TimeLineHistoryEntry *targetHistory; +extern int targetNentries; + /* in parsexlog.c */ extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, - TimeLineID tli, XLogRecPtr endpoint); + int tliIndex, XLogRecPtr endpoint); extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr, - TimeLineID tli, + int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, - TimeLineID tli); + int tliIndex); /* in timeline.c */ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, -- 2.6.0
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers