At Tue, 07 Jun 2022 12:39:38 +0900 (JST), Kyotaro Horiguchi
<[email protected]> wrote in
> One possible way to detect promotion reliably is to look into timeline
> history files. It is written immediately at promotion even on
> standbys.
The attached patch seems to work. It uses timeline history files to
identify the source timeline. With this change pg_waldump no longer needs
to wait for the end-of-recovery checkpoint to finish.
(It lacks the documentation part and tests, but I'm not sure how we can
test this behavior.)
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 6cb288f099..2a407da1e4 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -309,9 +309,11 @@ sync_target_dir(void)
* buffer is actually *filesize + 1. That's handy when reading a text file.
* This function can be used to read binary files as well, you can just
* ignore the zero-terminator in that case.
+ *
+ * If noerror is true, returns NULL when the file is not found.
*/
char *
-slurpFile(const char *datadir, const char *path, size_t *filesize)
+slurpFile(const char *datadir, const char *path, size_t *filesize, bool
noerror)
{
int fd;
char *buffer;
@@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t
*filesize)
snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1)
+ {
+ if (noerror && errno == ENOENT)
+ return NULL;
+
pg_fatal("could not open file \"%s\" for reading: %m",
fullpath);
+ }
if (fstat(fd, &statbuf) < 0)
pg_fatal("could not open file \"%s\" for reading: %m",
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 54a853bd42..92e19042cb 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -21,7 +21,8 @@ extern void create_target(file_entry_t *t);
extern void remove_target(file_entry_t *t);
extern void sync_target_dir(void);
-extern char *slurpFile(const char *datadir, const char *path, size_t
*filesize);
+extern char *slurpFile(const char *datadir, const char *path, size_t *filesize,
+ bool noerror);
typedef void (*process_file_callback_t) (const char *path, file_type_t type,
size_t size, const char *link_target);
extern void traverse_datadir(const char *datadir, process_file_callback_t
callback);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..92067d4f2c 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source,
const char *path,
off_t
off, size_t len);
static void libpq_finish_fetch(rewind_source *source);
static char *libpq_fetch_file(rewind_source *source, const char *path,
- size_t *filesize);
+ size_t *filesize,
bool noerror);
static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
static void libpq_destroy(rewind_source *source);
@@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char *str)
/*
* Fetch a single file as a malloc'd buffer.
+ *
+ * If noerror is true, returns NULL when the file is not found on the source.
*/
static char *
-libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+ bool noerror)
{
PGconn *conn = ((libpq_source *) source)->conn;
PGresult *res;
@@ -631,6 +634,34 @@ libpq_fetch_file(rewind_source *source, const char *path,
size_t *filesize)
const char *paramValues[1];
paramValues[0] = path;
+
+ /*
+ * Check the existence of the file first. We do this separately from
+ * pg_read_binary_file() so that the server doesn't emit an error.
+ */
+ if (noerror)
+ {
+ res = PQexecParams(conn, "SELECT pg_stat_file($1, true)",
+ 1, NULL, paramValues, NULL,
NULL, 1);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ pg_fatal("could not stat remote file \"%s\": %s",
+ path, PQresultErrorMessage(res));
+ }
+
+ /* sanity check the result set */
+ if (PQntuples(res) != 1)
+ pg_fatal("unexpected result set while stating remote
file \"%s\"",
+ path);
+
+ /* Return NULL if the file was not found */
+ if (PQgetisnull(res, 0, 0))
+ return NULL;
+
+ PQclear(res);
+ }
+
res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
1, NULL, paramValues, NULL, NULL, 1);
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..fc2e1e9f11 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -28,7 +28,7 @@ typedef struct
static void local_traverse_files(rewind_source *source,
process_file_callback_t callback);
static char *local_fetch_file(rewind_source *source, const char *path,
- size_t *filesize);
+ size_t *filesize,
bool noerror);
static void local_queue_fetch_file(rewind_source *source, const char *path,
size_t len);
static void local_queue_fetch_range(rewind_source *source, const char *path,
@@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source,
process_file_callback_t callback)
}
static char *
-local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+local_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+ bool noerror)
{
- return slurpFile(((local_source *) source)->datadir, path, filesize);
+ return slurpFile(((local_source *) source)->datadir, path, filesize,
+ noerror);
}
/*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 1ff8da1676..f9c7853f08 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint,
TimeLineID starttli,
static void digestControlFile(ControlFileData *ControlFile,
const char *content,
size_t size);
+static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile,
+
int *nentries);
static void getRestoreCommand(const char *argv0);
static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
@@ -141,6 +143,7 @@ main(int argc, char **argv)
bool rewind_needed;
bool writerecoveryconf = false;
filemap_t *filemap;
+ TimeLineID source_tli;
pg_logging_init(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
@@ -311,7 +314,7 @@ main(int argc, char **argv)
* need to make sure by themselves that the target cluster is in a clean
* state.
*/
- buffer = slurpFile(datadir_target, "global/pg_control", &size);
+ buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
@@ -321,25 +324,47 @@ main(int argc, char **argv)
{
ensureCleanShutdown(argv[0]);
- buffer = slurpFile(datadir_target, "global/pg_control", &size);
+ buffer = slurpFile(datadir_target, "global/pg_control", &size,
false);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
}
- buffer = source->fetch_file(source, "global/pg_control", &size);
+ buffer = source->fetch_file(source, "global/pg_control", &size, false);
digestControlFile(&ControlFile_source, buffer, size);
pg_free(buffer);
sanityChecks();
+ /*
+ * There may be a case where the source has been promoted but the
+ * end-of-recovery checkpoint has not completed. In this case the source
+ * control file has slightly stale content for this purpose. Look into
+ * the timeline history file, which is written immediately at promotion.
+ */
+ source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+ if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
+ {
+ int nentries;
+ TimeLineHistoryEntry *hist;
+
+ hist = getTimelineHistory(&ControlFile_source, &nentries);
+
+ /* last line of history file is the newest timeline */
+ if (nentries > 0 && hist[nentries - 1].tli > source_tli)
+ {
+ pg_log_info("source's actual timeline ID (%d) is newer
than control file (%d)", hist[nentries - 1].tli, source_tli);
+ source_tli = hist[nentries - 1].tli;
+ }
+ pg_free(hist);
+ }
+
/*
* Find the common ancestor timeline between the clusters.
*
* If both clusters are already on the same timeline, there's nothing to
* do.
*/
- if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
- ControlFile_source.checkPointCopy.ThisTimeLineID)
+ if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
{
pg_log_info("source and target cluster are on the same
timeline");
rewind_needed = false;
@@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
* Fetch the control file from the source last. This ensures that the
* minRecoveryPoint is up-to-date.
*/
- buffer = source->fetch_file(source, "global/pg_control", &size);
+ buffer = source->fetch_file(source, "global/pg_control", &size, false);
digestControlFile(&ControlFile_source_after, buffer, size);
pg_free(buffer);
@@ -630,6 +655,10 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
*/
if (connstr_source)
{
+ int nentries;
+ TimeLineHistoryEntry *hist;
+ int i;
+
/*
* The source is a live server. Like in an online backup, it's
* important that we recover all the WAL that was generated
while we
@@ -655,6 +684,29 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
endrec = source->get_current_wal_insert_lsn(source);
endtli =
ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+
+ /*
+ * Find the timeline ID corresponding to endrec on the
source.
+ *
+ * In most cases we can rely on control file, but that
is not the
+ * case after promotion until end-of-recovery
checkpoint completes.
+ * Identify the timeline ID the hard way since we don't have an
+ * easier way to detect that case. In case we fail to do
+ * that, fall back to the control file's value.
+ */
+ hist = getTimelineHistory(&ControlFile_source,
&nentries);
+ if (hist[nentries - 1].tli != endtli)
+ {
+ for (i = 0; i < nentries; i++)
+ {
+ if ((hist[i].begin == 0 ||
hist[i].begin <= endrec) &&
+ (hist[i].end == 0 || endrec <
hist[i].end))
+ {
+ endtli = hist[i].tli;
+ break;
+ }
+ }
+ }
}
}
else
@@ -804,9 +856,32 @@ getTimelineHistory(ControlFileData *controlFile, int
*nentries)
{
TimeLineHistoryEntry *history;
TimeLineID tli;
+ TimeLineID probe_tli;
tli = controlFile->checkPointCopy.ThisTimeLineID;
+ Assert(tli > 0);
+ for (probe_tli = tli + 1 ;; probe_tli++)
+ {
+ char path[MAXPGPATH];
+ char *histfile;
+
+ TLHistoryFilePath(path, probe_tli);
+
+ /* Get history file from appropriate source */
+ if (controlFile == &ControlFile_source)
+ histfile = source->fetch_file(source, path, NULL, true);
+ else if (controlFile == &ControlFile_target)
+ histfile = slurpFile(datadir_target, path, NULL, true);
+
+ if (!histfile)
+ break;
+
+ pg_free(histfile);
+ }
+
+ tli = probe_tli - 1;
+
/*
* Timeline 1 does not have a history file, so there is no need to check
* and fake an entry with infinite start and end positions.
@@ -827,9 +902,9 @@ getTimelineHistory(ControlFileData *controlFile, int
*nentries)
/* Get history file from appropriate source */
if (controlFile == &ControlFile_source)
- histfile = source->fetch_file(source, path, NULL);
+ histfile = source->fetch_file(source, path, NULL,
false);
else if (controlFile == &ControlFile_target)
- histfile = slurpFile(datadir_target, path, NULL);
+ histfile = slurpFile(datadir_target, path, NULL, false);
else
pg_fatal("invalid control file");
diff --git a/src/bin/pg_rewind/rewind_source.h
b/src/bin/pg_rewind/rewind_source.h
index 1310e86e75..6975848668 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -35,7 +35,7 @@ typedef struct rewind_source
* handy for text files.
*/
char *(*fetch_file) (struct rewind_source *, const char *path,
- size_t *filesize);
+ size_t *filesize,
bool noerror);
/*
* Request to fetch (part of) a file in the source system, specified by
an