On Wed, Apr 13, 2022 at 06:54:12AM -0500, Justin Pryzby wrote: > I didn't pursue this patch, as it's easier for me to use /bin/sync -f. > Someone > should adopt it if interested.
I was about to start a new thread, but I found this one with some good preliminary discussion. I came to the same conclusion about introducing a new option instead of using syncfs() by default wherever it is available. The attached patch is still a work-in-progress, but it seems to behave as expected. I began investigating this because I noticed that the sync-data-directory step on pg_upgrade takes quite a while when there are many files, and I am looking for ways to reduce the amount of downtime required for pg_upgrade. The attached patch adds a new --sync-method option to the relevant frontend utilities, but I am not wedded to that name/approach. Thoughts? -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 4f484cd0268e63da8226d78dd21a8d7e29aa2b78 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Fri, 28 Jul 2023 15:56:24 -0700 Subject: [PATCH v1 1/1] allow syncfs in frontend utilities --- src/bin/initdb/initdb.c | 11 +++- src/bin/pg_basebackup/pg_basebackup.c | 8 ++- src/bin/pg_checksums/pg_checksums.c | 8 ++- src/bin/pg_rewind/file_ops.c | 2 +- src/bin/pg_rewind/pg_rewind.c | 8 +++ src/bin/pg_rewind/pg_rewind.h | 2 + src/bin/pg_upgrade/option.c | 12 ++++ src/bin/pg_upgrade/pg_upgrade.c | 6 +- src/bin/pg_upgrade/pg_upgrade.h | 1 + src/common/file_utils.c | 79 ++++++++++++++++++++++++++- src/fe_utils/option_utils.c | 18 ++++++ src/include/common/file_utils.h | 15 ++++- src/include/fe_utils/option_utils.h | 3 + src/include/storage/fd.h | 4 ++ src/tools/pgindent/typedefs.list | 1 + 15 files changed, 168 insertions(+), 10 deletions(-) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 3f4167682a..908263ee62 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -76,6 +76,7 @@ #include "common/restricted_token.h" #include "common/string.h" #include "common/username.h" +#include "fe_utils/option_utils.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "mb/pg_wchar.h" @@ -165,6 +166,7 @@ static bool data_checksums = false; static char *xlog_dir = NULL; static char *str_wal_segment_size_mb = NULL; static int wal_segment_size_mb; +static SyncMethod sync_method = SYNC_METHOD_FSYNC; /* internal vars */ @@ -3106,6 +3108,7 @@ main(int argc, char *argv[]) {"locale-provider", required_argument, NULL, 15}, {"icu-locale", required_argument, NULL, 16}, {"icu-rules", required_argument, NULL, 17}, + {"sync-method", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -3285,6 +3288,10 @@ main(int argc, char *argv[]) case 17: icu_rules = pg_strdup(optarg); break; + case 18: + if (!parse_sync_method(optarg, &sync_method)) + exit(1); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -3332,7 +3339,7 @@ main(int argc, char *argv[]) fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - fsync_pgdata(pg_data, PG_VERSION_NUM); + fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method); check_ok(); return 0; } @@ -3409,7 +3416,7 @@ main(int argc, char *argv[]) { fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - fsync_pgdata(pg_data, PG_VERSION_NUM); + fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method); check_ok(); } else diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1dc8efe0cb..548e764b2f 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -148,6 +148,7 @@ static bool verify_checksums = true; static bool manifest = true; static bool manifest_force_encode = false; static char *manifest_checksums = NULL; +static SyncMethod sync_method = SYNC_METHOD_FSYNC; static bool success = false; static bool made_new_pgdata = false; @@ -2203,7 +2204,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, } else { - (void) fsync_pgdata(basedir, serverVersion); + (void) fsync_pgdata(basedir, serverVersion, sync_method); } } @@ -2281,6 +2282,7 @@ main(int argc, char **argv) {"no-manifest", no_argument, NULL, 5}, {"manifest-force-encode", no_argument, NULL, 6}, {"manifest-checksums", required_argument, NULL, 7}, + {"sync-method", required_argument, NULL, 8}, {NULL, 0, NULL, 0} }; int c; @@ -2452,6 +2454,10 @@ main(int argc, char **argv) case 7: manifest_checksums = pg_strdup(optarg); break; + case 8: + if (!parse_sync_method(optarg, &sync_method)) + exit(1); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 19eb67e485..a1dfc51273 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -44,6 +44,7 @@ static char *only_filenode = NULL; static bool do_sync = true; static bool verbose = false; static bool showprogress = false; +static SyncMethod sync_method = SYNC_METHOD_FSYNC; typedef enum { @@ -445,6 +446,7 @@ main(int argc, char *argv[]) {"no-sync", no_argument, NULL, 'N'}, {"progress", no_argument, NULL, 'P'}, {"verbose", no_argument, NULL, 'v'}, + {"sync-method", required_argument, NULL, 1}, {NULL, 0, NULL, 0} }; @@ -503,6 +505,10 @@ main(int argc, char *argv[]) case 'v': verbose = true; break; + case 1: + if (!parse_sync_method(optarg, &sync_method)) + exit(1); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -633,7 +639,7 @@ main(int argc, char *argv[]) if (do_sync) { pg_log_info("syncing data directory"); - fsync_pgdata(DataDir, PG_VERSION_NUM); + fsync_pgdata(DataDir, PG_VERSION_NUM, sync_method); } pg_log_info("updating control file"); diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 25996b4da4..451fb1856e 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -296,7 +296,7 @@ sync_target_dir(void) if (!do_sync || dry_run) return; - fsync_pgdata(datadir_target, PG_VERSION_NUM); + fsync_pgdata(datadir_target, PG_VERSION_NUM, sync_method); } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index f7f3b8227f..a424762f1e 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -22,6 +22,7 @@ #include "common/file_perm.h" #include "common/restricted_token.h" #include "common/string.h" +#include "fe_utils/option_utils.h" #include "fe_utils/recovery_gen.h" #include "fe_utils/string_utils.h" #include "file_ops.h" @@ -74,6 +75,7 @@ bool showprogress = false; bool dry_run = false; bool do_sync = true; bool restore_wal = false; +SyncMethod sync_method = SYNC_METHOD_FSYNC; /* Target history */ TimeLineHistoryEntry *targetHistory; @@ -131,6 +133,7 @@ main(int argc, char **argv) {"no-sync", no_argument, NULL, 'N'}, {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, + {"sync-method", required_argument, NULL, 6}, {NULL, 0, NULL, 0} }; int option_index; @@ -218,6 +221,11 @@ main(int argc, char **argv) config_file = pg_strdup(optarg); break; + case 6: + if (!parse_sync_method(optarg, &sync_method)) + exit(1); + break; + default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ef8bdc1fbb..a7ce754880 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -13,6 +13,7 @@ #include "access/timeline.h" #include "common/logging.h" +#include "common/file_utils.h" #include "datapagemap.h" #include "libpq-fe.h" #include "storage/block.h" @@ -24,6 +25,7 @@ extern bool showprogress; extern bool dry_run; extern bool do_sync; extern int WalSegSz; +extern SyncMethod sync_method; /* Target history */ extern TimeLineHistoryEntry *targetHistory; diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 640361009e..4f9da8e685 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -14,6 +14,7 @@ #endif #include "common/string.h" +#include "fe_utils/option_utils.h" #include "getopt_long.h" #include "pg_upgrade.h" #include "utils/pidfile.h" @@ -57,12 +58,14 @@ parseCommandLine(int argc, char *argv[]) {"verbose", no_argument, NULL, 'v'}, {"clone", no_argument, NULL, 1}, {"copy", no_argument, NULL, 2}, + {"sync-method", required_argument, NULL, 3}, {NULL, 0, NULL, 0} }; int option; /* Command line option */ int optindex = 0; /* used by getopt_long */ int os_user_effective_id; + SyncMethod unused; user_opts.do_sync = true; user_opts.transfer_mode = TRANSFER_MODE_COPY; @@ -199,6 +202,12 @@ parseCommandLine(int argc, char *argv[]) user_opts.transfer_mode = TRANSFER_MODE_COPY; break; + case 3: + if (!parse_sync_method(optarg, &unused)) + exit(1); + user_opts.sync_method = pg_strdup(optarg); + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); @@ -209,6 +218,9 @@ parseCommandLine(int argc, char *argv[]) if (optind < argc) pg_fatal("too many command-line arguments (first is \"%s\")", argv[optind]); + if (!user_opts.sync_method) + user_opts.sync_method = pg_strdup("fsync"); + if (log_opts.verbose) pg_log(PG_REPORT, "Running in verbose mode"); diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 4562dafcff..96bfb67167 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -192,8 +192,10 @@ main(int argc, char **argv) { prep_status("Sync data directory to disk"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir, - new_cluster.pgdata); + "\"%s/initdb\" --sync-only \"%s\" --sync-method %s", + new_cluster.bindir, + new_cluster.pgdata, + user_opts.sync_method); check_ok(); } diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 3eea0139c7..13457b2f75 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -304,6 +304,7 @@ typedef struct transferMode transfer_mode; /* copy files or link them? */ int jobs; /* number of processes/threads to use */ char *socketdir; /* directory to use for Unix sockets */ + char *sync_method; } UserOpts; typedef struct diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 74833c4acb..b3d814d0c4 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -51,6 +51,33 @@ static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks); +#ifdef HAVE_SYNCFS +static void +do_syncfs(const char *path) +{ + int fd; + + fd = open(path, O_RDONLY | PG_BINARY, 0); + + if (fd < 0) + { + if (errno == EACCES || errno == EISDIR) + return; + pg_log_error("could not open file \"%s\": %m", path); + return; + } + + if (syncfs(fd) < 0) + { + pg_log_error("could not synchronize file system for file \"%s\": %m", path); + (void) close(fd); + exit(EXIT_FAILURE); + } + + (void) close(fd); +} +#endif + /* * Issue fsync recursively on PGDATA and all its contents. * @@ -63,7 +90,8 @@ static void walkdir(const char *path, */ void fsync_pgdata(const char *pg_data, - int serverVersion) + int serverVersion, + SyncMethod sync_method) { bool xlog_is_symlink; char pg_wal[MAXPGPATH]; @@ -89,6 +117,55 @@ fsync_pgdata(const char *pg_data, xlog_is_symlink = true; } +#ifdef HAVE_SYNCFS + if (sync_method == SYNC_METHOD_SYNCFS) + { + DIR *dir; + struct dirent *de; + + /* + * On Linux, we don't have to open every single file one by one. We + * can use syncfs() to sync whole filesystems. We only expect + * filesystem boundaries to exist where we tolerate symlinks, namely + * pg_wal and the tablespaces, so we call syncfs() for each of those + * directories. + */ + + /* Sync the top level pgdata directory. */ + do_syncfs(pg_data); + + /* If any tablespaces are configured, sync each of those. */ + dir = opendir(pg_tblspc); + if (dir == NULL) + pg_log_error("could not open directory \"%s\": %m", pg_tblspc); + else + { + while (errno = 0, (de = readdir(dir)) != NULL) + { + char subpath[MAXPGPATH * 2]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, sizeof(subpath), "%s/%s", pg_tblspc, de->d_name); + do_syncfs(subpath); + } + + if (errno) + pg_log_error("could not read directory \"%s\": %m", pg_tblspc); + + (void) closedir(dir); + } + + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs(pg_wal); + + return; + } +#endif /* HAVE_SYNCFS */ + /* * If possible, hint to the kernel that we're soon going to fsync the data * directory and its contents. diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c index 763c991015..c65aedf8f5 100644 --- a/src/fe_utils/option_utils.c +++ b/src/fe_utils/option_utils.c @@ -82,3 +82,21 @@ option_parse_int(const char *optarg, const char *optname, *result = val; return true; } + +bool +parse_sync_method(const char *optarg, SyncMethod *sync_method) +{ + if (strcmp(optarg, "fsync") == 0) + *sync_method = SYNC_METHOD_FSYNC; +#ifdef HAVE_SYNCFS + else if (strcmp(optarg, "syncfs") == 0) + *sync_method = SYNC_METHOD_SYNCFS; +#endif + else + { + pg_log_error("unrecognized sync method: %s", optarg); + return false; + } + + return true; +} diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index b7efa1226d..fd5162fef1 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -27,12 +27,23 @@ typedef enum PGFileType struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND + +typedef enum SyncMethod +{ + SYNC_METHOD_FSYNC, +#ifdef HAVE_SYNCFS + SYNC_METHOD_SYNCFS +#endif +} SyncMethod; + extern int fsync_fname(const char *fname, bool isdir); -extern void fsync_pgdata(const char *pg_data, int serverVersion); +extern void fsync_pgdata(const char *pg_data, int serverVersion, + SyncMethod sync_method); extern void fsync_dir_recurse(const char *dir); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); -#endif + +#endif /* FRONTEND */ extern PGFileType get_dirent_type(const char *path, const struct dirent *de, diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h index b7b0654cee..3ad8737219 100644 --- a/src/include/fe_utils/option_utils.h +++ b/src/include/fe_utils/option_utils.h @@ -14,6 +14,8 @@ #include "postgres_fe.h" +#include "common/file_utils.h" + typedef void (*help_handler) (const char *progname); extern void handle_help_version_opts(int argc, char *argv[], @@ -22,5 +24,6 @@ extern void handle_help_version_opts(int argc, char *argv[], extern bool option_parse_int(const char *optarg, const char *optname, int min_range, int max_range, int *result); +extern bool parse_sync_method(const char *optarg, SyncMethod *sync_method); #endif /* OPTION_UTILS_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 6791a406fc..196f20e716 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -43,6 +43,8 @@ #ifndef FD_H #define FD_H +#ifndef FRONTEND + #include <dirent.h> #include <fcntl.h> @@ -195,6 +197,8 @@ extern int durable_unlink(const char *fname, int elevel); extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); +#endif /* ! FRONTEND */ + /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 11d47294cf..3bbf70fdf3 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2678,6 +2678,7 @@ SupportRequestSelectivity SupportRequestSimplify SupportRequestWFuncMonotonic Syn +SyncMethod SyncOps SyncRepConfigData SyncRepStandbyData -- 2.25.1