On Wed, Apr 13, 2022 at 06:54:12AM -0500, Justin Pryzby wrote:
> I didn't pursue this patch, as it's easier for me to use /bin/sync -f.  
> Someone
> should adopt it if interested.

I was about to start a new thread, but I found this one with some good
preliminary discussion.  I came to the same conclusion about introducing a
new option instead of using syncfs() by default wherever it is available.
The attached patch is still a work-in-progress, but it seems to behave as
expected.  I began investigating this because I noticed that the
sync-data-directory step on pg_upgrade takes quite a while when there are
many files, and I am looking for ways to reduce the amount of downtime
required for pg_upgrade.

The attached patch adds a new --sync-method option to the relevant frontend
utilities, but I am not wedded to that name/approach.

Thoughts?

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 4f484cd0268e63da8226d78dd21a8d7e29aa2b78 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 28 Jul 2023 15:56:24 -0700
Subject: [PATCH v1 1/1] allow syncfs in frontend utilities

---
 src/bin/initdb/initdb.c               | 11 +++-
 src/bin/pg_basebackup/pg_basebackup.c |  8 ++-
 src/bin/pg_checksums/pg_checksums.c   |  8 ++-
 src/bin/pg_rewind/file_ops.c          |  2 +-
 src/bin/pg_rewind/pg_rewind.c         |  8 +++
 src/bin/pg_rewind/pg_rewind.h         |  2 +
 src/bin/pg_upgrade/option.c           | 12 ++++
 src/bin/pg_upgrade/pg_upgrade.c       |  6 +-
 src/bin/pg_upgrade/pg_upgrade.h       |  1 +
 src/common/file_utils.c               | 79 ++++++++++++++++++++++++++-
 src/fe_utils/option_utils.c           | 18 ++++++
 src/include/common/file_utils.h       | 15 ++++-
 src/include/fe_utils/option_utils.h   |  3 +
 src/include/storage/fd.h              |  4 ++
 src/tools/pgindent/typedefs.list      |  1 +
 15 files changed, 168 insertions(+), 10 deletions(-)

diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 3f4167682a..908263ee62 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -76,6 +76,7 @@
 #include "common/restricted_token.h"
 #include "common/string.h"
 #include "common/username.h"
+#include "fe_utils/option_utils.h"
 #include "fe_utils/string_utils.h"
 #include "getopt_long.h"
 #include "mb/pg_wchar.h"
@@ -165,6 +166,7 @@ static bool data_checksums = false;
 static char *xlog_dir = NULL;
 static char *str_wal_segment_size_mb = NULL;
 static int	wal_segment_size_mb;
+static SyncMethod sync_method = SYNC_METHOD_FSYNC;
 
 
 /* internal vars */
@@ -3106,6 +3108,7 @@ main(int argc, char *argv[])
 		{"locale-provider", required_argument, NULL, 15},
 		{"icu-locale", required_argument, NULL, 16},
 		{"icu-rules", required_argument, NULL, 17},
+		{"sync-method", required_argument, NULL, 18},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -3285,6 +3288,10 @@ main(int argc, char *argv[])
 			case 17:
 				icu_rules = pg_strdup(optarg);
 				break;
+			case 18:
+				if (!parse_sync_method(optarg, &sync_method))
+					exit(1);
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -3332,7 +3339,7 @@ main(int argc, char *argv[])
 
 		fputs(_("syncing data to disk ... "), stdout);
 		fflush(stdout);
-		fsync_pgdata(pg_data, PG_VERSION_NUM);
+		fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
 		check_ok();
 		return 0;
 	}
@@ -3409,7 +3416,7 @@ main(int argc, char *argv[])
 	{
 		fputs(_("syncing data to disk ... "), stdout);
 		fflush(stdout);
-		fsync_pgdata(pg_data, PG_VERSION_NUM);
+		fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
 		check_ok();
 	}
 	else
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1dc8efe0cb..548e764b2f 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -148,6 +148,7 @@ static bool verify_checksums = true;
 static bool manifest = true;
 static bool manifest_force_encode = false;
 static char *manifest_checksums = NULL;
+static SyncMethod sync_method = SYNC_METHOD_FSYNC;
 
 static bool success = false;
 static bool made_new_pgdata = false;
@@ -2203,7 +2204,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail,
 		}
 		else
 		{
-			(void) fsync_pgdata(basedir, serverVersion);
+			(void) fsync_pgdata(basedir, serverVersion, sync_method);
 		}
 	}
 
@@ -2281,6 +2282,7 @@ main(int argc, char **argv)
 		{"no-manifest", no_argument, NULL, 5},
 		{"manifest-force-encode", no_argument, NULL, 6},
 		{"manifest-checksums", required_argument, NULL, 7},
+		{"sync-method", required_argument, NULL, 8},
 		{NULL, 0, NULL, 0}
 	};
 	int			c;
@@ -2452,6 +2454,10 @@ main(int argc, char **argv)
 			case 7:
 				manifest_checksums = pg_strdup(optarg);
 				break;
+			case 8:
+				if (!parse_sync_method(optarg, &sync_method))
+					exit(1);
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 19eb67e485..a1dfc51273 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -44,6 +44,7 @@ static char *only_filenode = NULL;
 static bool do_sync = true;
 static bool verbose = false;
 static bool showprogress = false;
+static SyncMethod sync_method = SYNC_METHOD_FSYNC;
 
 typedef enum
 {
@@ -445,6 +446,7 @@ main(int argc, char *argv[])
 		{"no-sync", no_argument, NULL, 'N'},
 		{"progress", no_argument, NULL, 'P'},
 		{"verbose", no_argument, NULL, 'v'},
+		{"sync-method", required_argument, NULL, 1},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -503,6 +505,10 @@ main(int argc, char *argv[])
 			case 'v':
 				verbose = true;
 				break;
+			case 1:
+				if (!parse_sync_method(optarg, &sync_method))
+					exit(1);
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -633,7 +639,7 @@ main(int argc, char *argv[])
 		if (do_sync)
 		{
 			pg_log_info("syncing data directory");
-			fsync_pgdata(DataDir, PG_VERSION_NUM);
+			fsync_pgdata(DataDir, PG_VERSION_NUM, sync_method);
 		}
 
 		pg_log_info("updating control file");
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 25996b4da4..451fb1856e 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -296,7 +296,7 @@ sync_target_dir(void)
 	if (!do_sync || dry_run)
 		return;
 
-	fsync_pgdata(datadir_target, PG_VERSION_NUM);
+	fsync_pgdata(datadir_target, PG_VERSION_NUM, sync_method);
 }
 
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index f7f3b8227f..a424762f1e 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -22,6 +22,7 @@
 #include "common/file_perm.h"
 #include "common/restricted_token.h"
 #include "common/string.h"
+#include "fe_utils/option_utils.h"
 #include "fe_utils/recovery_gen.h"
 #include "fe_utils/string_utils.h"
 #include "file_ops.h"
@@ -74,6 +75,7 @@ bool		showprogress = false;
 bool		dry_run = false;
 bool		do_sync = true;
 bool		restore_wal = false;
+SyncMethod	sync_method = SYNC_METHOD_FSYNC;
 
 /* Target history */
 TimeLineHistoryEntry *targetHistory;
@@ -131,6 +133,7 @@ main(int argc, char **argv)
 		{"no-sync", no_argument, NULL, 'N'},
 		{"progress", no_argument, NULL, 'P'},
 		{"debug", no_argument, NULL, 3},
+		{"sync-method", required_argument, NULL, 6},
 		{NULL, 0, NULL, 0}
 	};
 	int			option_index;
@@ -218,6 +221,11 @@ main(int argc, char **argv)
 				config_file = pg_strdup(optarg);
 				break;
 
+			case 6:
+				if (!parse_sync_method(optarg, &sync_method))
+					exit(1);
+				break;
+
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index ef8bdc1fbb..a7ce754880 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -13,6 +13,7 @@
 
 #include "access/timeline.h"
 #include "common/logging.h"
+#include "common/file_utils.h"
 #include "datapagemap.h"
 #include "libpq-fe.h"
 #include "storage/block.h"
@@ -24,6 +25,7 @@ extern bool showprogress;
 extern bool dry_run;
 extern bool do_sync;
 extern int	WalSegSz;
+extern SyncMethod sync_method;
 
 /* Target history */
 extern TimeLineHistoryEntry *targetHistory;
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 640361009e..4f9da8e685 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -14,6 +14,7 @@
 #endif
 
 #include "common/string.h"
+#include "fe_utils/option_utils.h"
 #include "getopt_long.h"
 #include "pg_upgrade.h"
 #include "utils/pidfile.h"
@@ -57,12 +58,14 @@ parseCommandLine(int argc, char *argv[])
 		{"verbose", no_argument, NULL, 'v'},
 		{"clone", no_argument, NULL, 1},
 		{"copy", no_argument, NULL, 2},
+		{"sync-method", required_argument, NULL, 3},
 
 		{NULL, 0, NULL, 0}
 	};
 	int			option;			/* Command line option */
 	int			optindex = 0;	/* used by getopt_long */
 	int			os_user_effective_id;
+	SyncMethod	unused;
 
 	user_opts.do_sync = true;
 	user_opts.transfer_mode = TRANSFER_MODE_COPY;
@@ -199,6 +202,12 @@ parseCommandLine(int argc, char *argv[])
 				user_opts.transfer_mode = TRANSFER_MODE_COPY;
 				break;
 
+			case 3:
+				if (!parse_sync_method(optarg, &unused))
+					exit(1);
+				user_opts.sync_method = pg_strdup(optarg);
+				break;
+
 			default:
 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
 						os_info.progname);
@@ -209,6 +218,9 @@ parseCommandLine(int argc, char *argv[])
 	if (optind < argc)
 		pg_fatal("too many command-line arguments (first is \"%s\")", argv[optind]);
 
+	if (!user_opts.sync_method)
+		user_opts.sync_method = pg_strdup("fsync");
+
 	if (log_opts.verbose)
 		pg_log(PG_REPORT, "Running in verbose mode");
 
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 4562dafcff..96bfb67167 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -192,8 +192,10 @@ main(int argc, char **argv)
 	{
 		prep_status("Sync data directory to disk");
 		exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-				  "\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir,
-				  new_cluster.pgdata);
+				  "\"%s/initdb\" --sync-only \"%s\" --sync-method %s",
+				  new_cluster.bindir,
+				  new_cluster.pgdata,
+				  user_opts.sync_method);
 		check_ok();
 	}
 
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 3eea0139c7..13457b2f75 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -304,6 +304,7 @@ typedef struct
 	transferMode transfer_mode; /* copy files or link them? */
 	int			jobs;			/* number of processes/threads to use */
 	char	   *socketdir;		/* directory to use for Unix sockets */
+	char	   *sync_method;
 } UserOpts;
 
 typedef struct
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 74833c4acb..b3d814d0c4 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -51,6 +51,33 @@ static void walkdir(const char *path,
 					int (*action) (const char *fname, bool isdir),
 					bool process_symlinks);
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int			fd;
+
+	fd = open(path, O_RDONLY | PG_BINARY, 0);
+
+	if (fd < 0)
+	{
+		if (errno == EACCES || errno == EISDIR)
+			return;
+		pg_log_error("could not open file \"%s\": %m", path);
+		return;
+	}
+
+	if (syncfs(fd) < 0)
+	{
+		pg_log_error("could not synchronize file system for file \"%s\": %m", path);
+		(void) close(fd);
+		exit(EXIT_FAILURE);
+	}
+
+	(void) close(fd);
+}
+#endif
+
 /*
  * Issue fsync recursively on PGDATA and all its contents.
  *
@@ -63,7 +90,8 @@ static void walkdir(const char *path,
  */
 void
 fsync_pgdata(const char *pg_data,
-			 int serverVersion)
+			 int serverVersion,
+			 SyncMethod sync_method)
 {
 	bool		xlog_is_symlink;
 	char		pg_wal[MAXPGPATH];
@@ -89,6 +117,55 @@ fsync_pgdata(const char *pg_data,
 			xlog_is_symlink = true;
 	}
 
+#ifdef HAVE_SYNCFS
+	if (sync_method == SYNC_METHOD_SYNCFS)
+	{
+		DIR		   *dir;
+		struct dirent *de;
+
+		/*
+		 * On Linux, we don't have to open every single file one by one.  We
+		 * can use syncfs() to sync whole filesystems.  We only expect
+		 * filesystem boundaries to exist where we tolerate symlinks, namely
+		 * pg_wal and the tablespaces, so we call syncfs() for each of those
+		 * directories.
+		 */
+
+		/* Sync the top level pgdata directory. */
+		do_syncfs(pg_data);
+
+		/* If any tablespaces are configured, sync each of those. */
+		dir = opendir(pg_tblspc);
+		if (dir == NULL)
+			pg_log_error("could not open directory \"%s\": %m", pg_tblspc);
+		else
+		{
+			while (errno = 0, (de = readdir(dir)) != NULL)
+			{
+				char		subpath[MAXPGPATH * 2];
+
+				if (strcmp(de->d_name, ".") == 0 ||
+					strcmp(de->d_name, "..") == 0)
+					continue;
+
+				snprintf(subpath, sizeof(subpath), "%s/%s", pg_tblspc, de->d_name);
+				do_syncfs(subpath);
+			}
+
+			if (errno)
+				pg_log_error("could not read directory \"%s\": %m", pg_tblspc);
+
+			(void) closedir(dir);
+		}
+
+		/* If pg_wal is a symlink, process that too. */
+		if (xlog_is_symlink)
+			do_syncfs(pg_wal);
+
+		return;
+	}
+#endif							/* HAVE_SYNCFS */
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.
diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c
index 763c991015..c65aedf8f5 100644
--- a/src/fe_utils/option_utils.c
+++ b/src/fe_utils/option_utils.c
@@ -82,3 +82,21 @@ option_parse_int(const char *optarg, const char *optname,
 		*result = val;
 	return true;
 }
+
+bool
+parse_sync_method(const char *optarg, SyncMethod *sync_method)
+{
+	if (strcmp(optarg, "fsync") == 0)
+		*sync_method = SYNC_METHOD_FSYNC;
+#ifdef HAVE_SYNCFS
+	else if (strcmp(optarg, "syncfs") == 0)
+		*sync_method = SYNC_METHOD_SYNCFS;
+#endif
+	else
+	{
+		pg_log_error("unrecognized sync method: %s", optarg);
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index b7efa1226d..fd5162fef1 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -27,12 +27,23 @@ typedef enum PGFileType
 struct iovec;					/* avoid including port/pg_iovec.h here */
 
 #ifdef FRONTEND
+
+typedef enum SyncMethod
+{
+	SYNC_METHOD_FSYNC,
+#ifdef HAVE_SYNCFS
+	SYNC_METHOD_SYNCFS
+#endif
+} SyncMethod;
+
 extern int	fsync_fname(const char *fname, bool isdir);
-extern void fsync_pgdata(const char *pg_data, int serverVersion);
+extern void fsync_pgdata(const char *pg_data, int serverVersion,
+						 SyncMethod sync_method);
 extern void fsync_dir_recurse(const char *dir);
 extern int	durable_rename(const char *oldfile, const char *newfile);
 extern int	fsync_parent_path(const char *fname);
-#endif
+
+#endif							/* FRONTEND */
 
 extern PGFileType get_dirent_type(const char *path,
 								  const struct dirent *de,
diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h
index b7b0654cee..3ad8737219 100644
--- a/src/include/fe_utils/option_utils.h
+++ b/src/include/fe_utils/option_utils.h
@@ -14,6 +14,8 @@
 
 #include "postgres_fe.h"
 
+#include "common/file_utils.h"
+
 typedef void (*help_handler) (const char *progname);
 
 extern void handle_help_version_opts(int argc, char *argv[],
@@ -22,5 +24,6 @@ extern void handle_help_version_opts(int argc, char *argv[],
 extern bool option_parse_int(const char *optarg, const char *optname,
 							 int min_range, int max_range,
 							 int *result);
+extern bool parse_sync_method(const char *optarg, SyncMethod *sync_method);
 
 #endif							/* OPTION_UTILS_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 6791a406fc..196f20e716 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -43,6 +43,8 @@
 #ifndef FD_H
 #define FD_H
 
+#ifndef FRONTEND
+
 #include <dirent.h>
 #include <fcntl.h>
 
@@ -195,6 +197,8 @@ extern int	durable_unlink(const char *fname, int elevel);
 extern void SyncDataDirectory(void);
 extern int	data_sync_elevel(int elevel);
 
+#endif							/* ! FRONTEND */
+
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
 #define PG_TEMP_FILE_PREFIX "pgsql_tmp"
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 11d47294cf..3bbf70fdf3 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2678,6 +2678,7 @@ SupportRequestSelectivity
 SupportRequestSimplify
 SupportRequestWFuncMonotonic
 Syn
+SyncMethod
 SyncOps
 SyncRepConfigData
 SyncRepStandbyData
-- 
2.25.1

Reply via email to