From 6a87644bb05b1e7473a79f8c64552a8895ebc25c Mon Sep 17 00:00:00 2001
From: Paul Guo <paulguo@gmail.com>
Date: Mon, 25 Jan 2021 11:19:40 +0800
Subject: [PATCH 1/2] Fsync the affected files/directories only in pg_rewind.

Previously in the end, pg_rewind would fsync the whole pgdata directory on the
target, but that is a waste since usually just part of the files/directories on
the target are modified. Other files on the target should have been flushed
since pg_rewind requires a clean shutdown before doing the real work. This
would help the scenario that the target postgres instance includes millions of
files, which has been seen in a real environment.
---
 src/backend/storage/file/fd.c   |  9 ----
 src/bin/pg_rewind/file_ops.c    | 19 --------
 src/bin/pg_rewind/file_ops.h    |  1 -
 src/bin/pg_rewind/pg_rewind.c   | 98 ++++++++++++++++++++++++++++++++++++++++-
 src/common/file_utils.c         | 12 +----
 src/include/common/file_utils.h |  3 ++
 src/include/pg_config_manual.h  |  9 ++++
 7 files changed, 110 insertions(+), 41 deletions(-)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b58502837a..6cafdac0b8 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -99,15 +99,6 @@
 #include "utils/guc.h"
 #include "utils/resowner_private.h"
 
-/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
-#if defined(HAVE_SYNC_FILE_RANGE)
-#define PG_FLUSH_DATA_WORKS 1
-#elif !defined(WIN32) && defined(MS_ASYNC)
-#define PG_FLUSH_DATA_WORKS 1
-#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-#define PG_FLUSH_DATA_WORKS 1
-#endif
-
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index c50f283ede..0dd9c6c95e 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -281,25 +281,6 @@ remove_target_symlink(const char *path)
 				 dstpath);
 }
 
-/*
- * Sync target data directory to ensure that modifications are safely on disk.
- *
- * We do this once, for the whole data directory, for performance reasons.  At
- * the end of pg_rewind's run, the kernel is likely to already have flushed
- * most dirty buffers to disk.  Additionally fsync_pgdata uses a two-pass
- * approach (only initiating writeback in the first pass), which often reduces
- * the overall amount of IO noticeably.
- */
-void
-sync_target_dir(void)
-{
-	if (!do_sync || dry_run)
-		return;
-
-	fsync_pgdata(datadir_target, PG_VERSION_NUM);
-}
-
-
 /*
  * Read a file into memory. The file to be read is <datadir>/<path>.
  * The file contents are returned in a malloc'd buffer, and *filesize
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 611981f293..b6b8b319d5 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -19,7 +19,6 @@ extern void remove_target_file(const char *path, bool missing_ok);
 extern void truncate_target_file(const char *path, off_t newsize);
 extern void create_target(file_entry_t *t);
 extern void remove_target(file_entry_t *t);
-extern void sync_target_dir(void);
 
 extern char *slurpFile(const char *datadir, const char *path, size_t *filesize);
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 359a6a587c..c85b75b285 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -20,6 +20,7 @@
 #include "catalog/pg_control.h"
 #include "common/controldata_utils.h"
 #include "common/file_perm.h"
+#include "common/file_utils.h"
 #include "common/restricted_token.h"
 #include "common/string.h"
 #include "fe_utils/recovery_gen.h"
@@ -32,6 +33,7 @@
 
 static void usage(const char *progname);
 
+static void perform_sync(filemap_t *filemap);
 static void perform_rewind(filemap_t *filemap, rewind_source *source,
 						   XLogRecPtr chkptrec,
 						   TimeLineID chkpttli,
@@ -464,7 +466,7 @@ main(int argc, char **argv)
 
 	if (showprogress)
 		pg_log_info("syncing target data directory");
-	sync_target_dir();
+	perform_sync(filemap);
 
 	/* Also update the standby configuration, if requested. */
 	if (writerecoveryconf && !dry_run)
@@ -484,6 +486,100 @@ main(int argc, char **argv)
 	return 0;
 }
 
+/*
+ * Fsync the modified files and directories on the target.
+ *
+ * The target should be cleanly shutdown before rewinding so we do not need
+ * to fsync the whole pg data directory, instead, we just flush those modified
+ * files/directories.
+ */
+static void
+perform_sync(filemap_t *filemap)
+{
+	char		parentpath[MAXPGPATH];
+
+	if (!do_sync || dry_run)
+		return;
+
+#ifdef PG_FLUSH_DATA_WORKS
+	/* Hint the OS to flush data. */
+	for (int i = 0; i < filemap->nentries; i++)
+	{
+		file_entry_t *entry = filemap->entries[i];
+
+		if (entry->target_pages_to_overwrite.bitmapsize > 0)
+			pre_sync_fname(entry->path, false);
+
+		switch (entry->action)
+		{
+			case FILE_ACTION_COPY:
+			case FILE_ACTION_TRUNCATE:
+			case FILE_ACTION_COPY_TAIL:
+				pre_sync_fname(entry->path, false);
+				break;
+
+			case FILE_ACTION_CREATE:
+				pre_sync_fname(entry->path,
+							entry->source_type == FILE_TYPE_DIRECTORY);
+				/* FALLTHROUGH */
+			case FILE_ACTION_REMOVE:
+				/*
+				 * Fsync the parent directory if we either create or delete
+				 * files/directories in the parent directory. The parent
+				 * directory might be missing as expected, but we ignore that
+				 * error.
+				 */
+				strlcpy(parentpath, entry->path, MAXPGPATH);
+				get_parent_directory(parentpath);
+				pre_sync_fname(parentpath, true);
+				break;
+
+			default:
+				break;
+		}
+	}
+#endif
+
+	/* Do the fsync()s finally */
+	for (int i = 0; i < filemap->nentries; i++)
+	{
+		file_entry_t *entry = filemap->entries[i];
+
+		if (entry->target_pages_to_overwrite.bitmapsize > 0)
+			fsync_fname(entry->path, false);
+
+		switch (entry->action)
+		{
+			case FILE_ACTION_COPY:
+			case FILE_ACTION_TRUNCATE:
+			case FILE_ACTION_COPY_TAIL:
+				fsync_fname(entry->path, false);
+				break;
+
+			case FILE_ACTION_CREATE:
+				fsync_fname(entry->path,
+							entry->source_type == FILE_TYPE_DIRECTORY);
+				/* FALLTHROUGH */
+			case FILE_ACTION_REMOVE:
+				/*
+				 * Fsync the parent directory if we either create or delete
+				 * files/directories in the parent directory. The parent
+				 * directory might be missing as expected, but we ignore that
+				 * error.
+				 */
+				fsync_parent_path(entry->path);
+				break;
+
+			default:
+				break;
+		}
+	}
+
+	fsync_fname("global/pg_control", false);
+	fsync_fname("backup_label", false);
+}
+
+
 /*
  * Perform the rewind.
  *
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 40b73bbe1a..68da0615f7 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -31,21 +31,11 @@
 
 #ifdef FRONTEND
 
-/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
-#if defined(HAVE_SYNC_FILE_RANGE)
-#define PG_FLUSH_DATA_WORKS 1
-#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-#define PG_FLUSH_DATA_WORKS 1
-#endif
-
 /*
  * pg_xlog has been renamed to pg_wal in version 10.
  */
 #define MINIMUM_VERSION_FOR_PG_WAL	100000
 
-#ifdef PG_FLUSH_DATA_WORKS
-static int	pre_sync_fname(const char *fname, bool isdir);
-#endif
 static void walkdir(const char *path,
 					int (*action) (const char *fname, bool isdir),
 					bool process_symlinks);
@@ -218,7 +208,7 @@ walkdir(const char *path,
  */
 #ifdef PG_FLUSH_DATA_WORKS
 
-static int
+int
 pre_sync_fname(const char *fname, bool isdir)
 {
 	int			fd;
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 978a57460a..114c196556 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -25,6 +25,9 @@ typedef enum PGFileType
 } PGFileType;
 
 #ifdef FRONTEND
+#ifdef PG_FLUSH_DATA_WORKS
+extern int	pre_sync_fname(const char *fname, bool isdir);
+#endif
 extern int	fsync_fname(const char *fname, bool isdir);
 extern void fsync_pgdata(const char *pg_data, int serverVersion);
 extern void fsync_dir_recurse(const char *dir);
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index d27c8601fa..e69f9af2bb 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -390,3 +390,12 @@
  * Enable tracing of syncscan operations (see also the trace_syncscan GUC var).
  */
 /* #define TRACE_SYNCSCAN */
+
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(HAVE_SYNC_FILE_RANGE)
+#define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
-- 
2.14.3

