On Sat, Sep 19, 2015 at 8:27 AM, Michael Paquier wrote:
> On Fri, Sep 18, 2015 at 6:25 PM, Michael Paquier wrote:
>> The refactoring of getTimelineHistory as you propose looks like a good
>> idea to me, I tried to remove by myself the difference between source
>> and target in copy_fetch.c and friends but this gets uglier,
>> particularly because of datadir_source in copy_file_range. Not worth
>> it.
>
> Forgot that:
>     if (ControlFile_target.state != DB_SHUTDOWNED)
>         pg_fatal("target server must be shut down cleanly\n");
> We may want to allow a target node that was shut down in recovery as well here.

So, attached is a more polished version of this patch, with its typos
fixed along with a few other cleanups. For example, I noticed that it
is more useful to print the debug information for a timeline history
file, whether fetched from the source or the target server, directly in
getTimelineHistory. I have also updated a couple of comments in the
code to reflect the fact that the target node is not necessarily a
master, and mentioned in findCommonAncestorTimeline that we also check
the start position of a timeline to cover the case where both target
and source nodes forked at the same timeline number but at a different
WAL fork position.
I am marking this patch as ready for committer. It would be cool in
the future to use the recovery test suite to have more advanced
scenarios tested, but it seems a shame to block this patch because of
that.
Regards,
-- 
Michael
From 563f49ac3b07a49e89844112887eee2d4d982879 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@otacoo.com>
Date: Wed, 30 Sep 2015 15:39:34 +0900
Subject: [PATCH] Handle timeline switches in pg_rewind

This feature allows pg_rewind to handle data directory synchronization
in a much more general way, one example being the ability to return a
promoted standby to its old master. This patch softens the shutdown
condition of the target node and of the source node (when the latter's
data directory is used as a synchronization point) to allow the use of
nodes that have been shut down while in recovery.

Patch by Alexander Korotkov.
---
 src/bin/pg_rewind/Makefile    |   2 +-
 src/bin/pg_rewind/parsexlog.c |  39 ++++++---
 src/bin/pg_rewind/pg_rewind.c | 189 ++++++++++++++++++++++++++++++------------
 src/bin/pg_rewind/pg_rewind.h |  10 ++-
 4 files changed, 170 insertions(+), 70 deletions(-)

diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile
index 92b5d20..48dc770 100644
--- a/src/bin/pg_rewind/Makefile
+++ b/src/bin/pg_rewind/Makefile
@@ -8,7 +8,7 @@
 #
 #-------------------------------------------------------------------------
 
-PGFILEDESC = "pg_rewind - repurpose an old master server as standby"
+PGFILEDESC = "pg_rewind - synchronize a data directory with another one forked from"
 PGAPPICON = win32
 
 subdir = src/bin/pg_rewind
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 2081cf8..d69eafb 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -45,7 +45,7 @@ static char xlogfpath[MAXPGPATH];
 typedef struct XLogPageReadPrivate
 {
 	const char *datadir;
-	TimeLineID	tli;
+	int			tliIndex;
 } XLogPageReadPrivate;
 
 static int SimpleXLogPageRead(XLogReaderState *xlogreader,
@@ -55,11 +55,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
 
 /*
  * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline
- * 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL
- * records, and return them in a page map.
+ * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
+ * the data blocks touched by the WAL records, and return them in a page map.
  */
 void
-extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
+extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
 			   XLogRecPtr endpoint)
 {
 	XLogRecord *record;
@@ -68,7 +68,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
 	XLogPageReadPrivate private;
 
 	private.datadir = datadir;
-	private.tli = tli;
+	private.tliIndex = tliIndex;
 	xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
 	if (xlogreader == NULL)
 		pg_fatal("out of memory\n");
@@ -112,7 +112,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
  * doing anything with the record itself.
  */
 XLogRecPtr
-readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
+readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex)
 {
 	XLogRecord *record;
 	XLogReaderState *xlogreader;
@@ -121,7 +121,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
 	XLogRecPtr	endptr;
 
 	private.datadir = datadir;
-	private.tli = tli;
+	private.tliIndex = tliIndex;
 	xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
 	if (xlogreader == NULL)
 		pg_fatal("out of memory\n");
@@ -152,7 +152,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
  * Find the previous checkpoint preceding given WAL position.
  */
 void
-findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
+findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 				   XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
 				   XLogRecPtr *lastchkptredo)
 {
@@ -173,7 +173,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
 		forkptr += (forkptr % XLogSegSize == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
 
 	private.datadir = datadir;
-	private.tli = tli;
+	private.tliIndex = tliIndex;
 	xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
 	if (xlogreader == NULL)
 		pg_fatal("out of memory\n");
@@ -236,9 +236,11 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 {
 	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
 	uint32		targetPageOff;
-	XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+	XLogRecPtr	targetSegEnd;
+	XLogSegNo	targetSegNo;
 
 	XLByteToSeg(targetPagePtr, targetSegNo);
+	XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, targetSegEnd);
 	targetPageOff = targetPagePtr % XLogSegSize;
 
 	/*
@@ -257,7 +259,20 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 	{
 		char		xlogfname[MAXFNAMELEN];
 
-		XLogFileName(xlogfname, private->tli, xlogreadsegno);
+		/*
+		 * Since incomplete segments are copied into next timelines, switch to
+		 * the timeline holding the required segment. Assuming this scan can be
+		 * done both forward and backward, consider also switching timeline
+		 * accordingly.
+		 */
+		while (private->tliIndex < targetNentries - 1 &&
+				targetHistory[private->tliIndex].end < targetSegEnd)
+			private->tliIndex++;
+		while (private->tliIndex > 0 &&
+				targetHistory[private->tliIndex].begin >= targetSegEnd)
+			private->tliIndex--;
+
+		XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno);
 
 		snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname);
 
@@ -293,7 +308,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 
 	Assert(targetSegNo == xlogreadsegno);
 
-	*pageTLI = private->tli;
+	*pageTLI = targetHistory[private->tliIndex].tli;
 	return XLOG_BLCKSZ;
 }
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 4b7e26a..8ad3af8 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_rewind.c
- *	  Synchronizes an old master server to a new timeline
+ *	  Synchronizes a PostgreSQL data directory to a new timeline
  *
  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
  *
@@ -37,7 +37,7 @@ static void digestControlFile(ControlFileData *ControlFile, char *source,
 				  size_t size);
 static void updateControlFile(ControlFileData *ControlFile);
 static void sanityChecks(void);
-static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli);
+static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
 
 static ControlFileData ControlFile_target;
 static ControlFileData ControlFile_source;
@@ -53,6 +53,10 @@ bool		debug = false;
 bool		showprogress = false;
 bool		dry_run = false;
 
+/* Target history */
+TimeLineHistoryEntry *targetHistory;
+int targetNentries;
+
 static void
 usage(const char *progname)
 {
@@ -88,7 +92,7 @@ main(int argc, char **argv)
 	int			option_index;
 	int			c;
 	XLogRecPtr	divergerec;
-	TimeLineID	lastcommontli;
+	int			lastcommontliIndex;
 	XLogRecPtr	chkptrec;
 	TimeLineID	chkpttli;
 	XLogRecPtr	chkptredo;
@@ -214,9 +218,10 @@ main(int argc, char **argv)
 	if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)
 		pg_fatal("source and target cluster are on the same timeline\n");
 
-	findCommonAncestorTimeline(&divergerec, &lastcommontli);
+	findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
 	printf(_("The servers diverged at WAL position %X/%X on timeline %u.\n"),
-		   (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli);
+		   (uint32) (divergerec >> 32), (uint32) divergerec,
+		   targetHistory[lastcommontliIndex].tli);
 
 	/*
 	 * Check for the possibility that the target is in fact a direct ancestor
@@ -234,7 +239,7 @@ main(int argc, char **argv)
 		/* Read the checkpoint record on the target to see where it ends. */
 		chkptendrec = readOneRecord(datadir_target,
 									ControlFile_target.checkPoint,
-						   ControlFile_target.checkPointCopy.ThisTimeLineID);
+									targetNentries - 1);
 
 		/*
 		 * If the histories diverged exactly at the end of the shutdown
@@ -254,7 +259,8 @@ main(int argc, char **argv)
 		exit(0);
 	}
 
-	findLastCheckpoint(datadir_target, divergerec, lastcommontli,
+	findLastCheckpoint(datadir_target, divergerec,
+					   lastcommontliIndex,
 					   &chkptrec, &chkpttli, &chkptredo);
 	printf(_("Rewinding from last common checkpoint at %X/%X on timeline %u\n"),
 		   (uint32) (chkptrec >> 32), (uint32) chkptrec,
@@ -277,7 +283,7 @@ main(int argc, char **argv)
 	 * we would need to replay until the end of WAL here.
 	 */
 	pg_log(PG_PROGRESS, "reading WAL in target\n");
-	extractPageMap(datadir_target, chkptrec, lastcommontli,
+	extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
 				   ControlFile_target.checkPoint);
 	filemap_finalize();
 
@@ -374,10 +380,11 @@ sanityChecks(void)
 	/*
 	 * Target cluster better not be running. This doesn't guard against
 	 * someone starting the cluster concurrently. Also, this is probably more
-	 * strict than necessary; it's OK if the master was not shut down cleanly,
-	 * as long as it isn't running at the moment.
+	 * strict than necessary; it's OK if the target node was not shut down
+	 * cleanly, as long as it isn't running at the moment.
 	 */
-	if (ControlFile_target.state != DB_SHUTDOWNED)
+	if (ControlFile_target.state != DB_SHUTDOWNED &&
+		ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
 		pg_fatal("target server must be shut down cleanly\n");
 
 	/*
@@ -385,75 +392,149 @@ sanityChecks(void)
 	 * server is shut down. There isn't any very strong reason for this
 	 * limitation, but better safe than sorry.
 	 */
-	if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED)
+	if (datadir_source &&
+		ControlFile_source.state != DB_SHUTDOWNED &&
+		ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
 		pg_fatal("source data directory must be shut down cleanly\n");
 }
 
 /*
- * Determine the TLI of the last common timeline in the histories of the two
- * clusters. *tli is set to the last common timeline, and *recptr is set to
- * the position where the histories diverged (ie. the first WAL record that's
- * not the same in both clusters).
- *
- * Control files of both clusters must be read into ControlFile_target/source
- * before calling this.
+ * Find minimum from two XLOG positions assuming InvalidXLogRecPtr means
+ * infinity as src/include/access/timeline.h states. This routine should
+ * be used only when comparing XLOG positions related to history files.
  */
-static void
-findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli)
+static XLogRecPtr
+MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
 {
-	TimeLineID	targettli;
-	TimeLineHistoryEntry *sourceHistory;
-	int			nentries;
-	int			i;
-	TimeLineID	sourcetli;
+	if (XLogRecPtrIsInvalid(a))
+		return b;
+	else if (XLogRecPtrIsInvalid(b))
+		return a;
+	else
+		return Min(a, b);
+}
+
+/*
+ * Retrieve timeline history for given control file, which should belong to
+ * either source or target.
+ */
+static TimeLineHistoryEntry *
+getTimelineHistory(ControlFileData *controlFile, int *nentries)
+{
+	TimeLineHistoryEntry   *history;
+	TimeLineID				tli;
 
-	targettli = ControlFile_target.checkPointCopy.ThisTimeLineID;
-	sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+	tli = controlFile->checkPointCopy.ThisTimeLineID;
 
-	/* Timeline 1 does not have a history file, so no need to check */
-	if (sourcetli == 1)
+	/*
+	 * Timeline 1 does not have a history file, so there is no need to look for
+	 * one; instead, fake an entry with infinite start and end positions.
+	 */
+	if (tli == 1)
 	{
-		sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
-		sourceHistory->tli = sourcetli;
-		sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr;
-		nentries = 1;
+		history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
+		history->tli = tli;
+		history->begin = history->end = InvalidXLogRecPtr;
+		*nentries = 1;
 	}
 	else
 	{
 		char		path[MAXPGPATH];
 		char	   *histfile;
 
-		TLHistoryFilePath(path, sourcetli);
-		histfile = fetchFile(path, NULL);
+		TLHistoryFilePath(path, tli);
+
+		/* Get history file from appropriate source */
+		if (controlFile == &ControlFile_source)
+			histfile = fetchFile(path, NULL);
+		else if (controlFile == &ControlFile_target)
+			histfile = slurpFile(datadir_target, path, NULL);
+		else
+			pg_fatal("Invalid control file");
 
-		sourceHistory = rewind_parseTimeLineHistory(histfile,
-							ControlFile_source.checkPointCopy.ThisTimeLineID,
-													&nentries);
+		history = rewind_parseTimeLineHistory(histfile, tli, nentries);
 		pg_free(histfile);
 	}
 
-	/*
-	 * Trace the history backwards, until we hit the target timeline.
-	 *
-	 * TODO: This assumes that there are no timeline switches on the target
-	 * cluster after the fork.
-	 */
-	for (i = nentries - 1; i >= 0; i--)
+	if (debug)
 	{
-		TimeLineHistoryEntry *entry = &sourceHistory[i];
+		int		i;
+
+		if (controlFile == &ControlFile_source)
+			printf("Source timeline history:\n");
+		else if (controlFile == &ControlFile_target)
+			printf("Target timeline history:\n");
+		else
+			Assert(false);
 
-		if (entry->tli == targettli)
+		/*
+		 * Print the target timeline history.
+		 */
+		for (i = 0; i < targetNentries; i++)
 		{
-			/* found it */
-			*recptr = entry->end;
-			*tli = entry->tli;
+			TimeLineHistoryEntry *entry;
 
-			pg_free(sourceHistory);
-			return;
+			entry = &history[i];
+			printf("%d: %X/%X - %X/%X\n", entry->tli,
+				(uint32) (entry->begin >> 32), (uint32) (entry->begin),
+				(uint32) (entry->end >> 32), (uint32) (entry->end));
 		}
 	}
 
-	pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
+	return history;
+}
+
+/*
+ * Determine the TLI of the last common timeline in the timeline history of the
+ * two clusters. targetHistory is filled with target timeline history and
+ * targetNentries is number of items in targetHistory. *tliIndex is set to the
+ * index of last common timeline in targetHistory array, and *recptr is set to
+ * the position where the timeline history diverged (ie. the first WAL record
+ * that's not the same in both clusters).
+ *
+ * Control files of both clusters must be read into ControlFile_target/source
+ * before calling this routine.
+ */
+static void
+findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
+{
+	TimeLineHistoryEntry *sourceHistory;
+	int			sourceNentries;
+	int			i, n;
+
+	/* Retrieve timelines for both source and target */
+	sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
+	targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
+
+	/*
+	 * Trace the history forward, until the timelines diverge. It may
+	 * still be possible that the source and target nodes used the same
+	 * timeline number in their history but with different start position
+	 * depending on the history files that each node has fetched in previous
+	 * recovery processes. Hence check the start position of the new timeline
+	 * as well and move down by one extra timeline entry if they do not match.
+	 */
+	n = Min(sourceNentries, targetNentries);
+	for (i = 0; i < n; i++)
+	{
+		if (sourceHistory[i].tli != targetHistory[i].tli ||
+			sourceHistory[i].begin != targetHistory[i].begin)
+			break;
+	}
+
+	if (i > 0)
+	{
+		i--;
+		*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
+		*tliIndex = i;
+
+		pg_free(sourceHistory);
+		return;
+	}
+	else
+	{
+		pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
+	}
 }
 
 
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index e281369..4826dde 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -27,15 +27,19 @@ extern bool debug;
 extern bool showprogress;
 extern bool dry_run;
 
+/* Target history */
+extern TimeLineHistoryEntry *targetHistory;
+extern int targetNentries;
+
 /* in parsexlog.c */
 extern void extractPageMap(const char *datadir, XLogRecPtr startpoint,
-			   TimeLineID tli, XLogRecPtr endpoint);
+			   int tliIndex, XLogRecPtr endpoint);
 extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr,
-				   TimeLineID tli,
+				   int tliIndex,
 				   XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
 				   XLogRecPtr *lastchkptredo);
 extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
-			  TimeLineID tli);
+			  int tliIndex);
 
 /* in timeline.c */
 extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,
-- 
2.6.0

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to