On Wed, Mar 19, 2025 at 04:28:23PM -0500, Nathan Bossart wrote:
> On Wed, Mar 19, 2025 at 02:32:01PM -0500, Nathan Bossart wrote:
>> In addition to testing with in-place tablespaces, we might also want to
>> teach the transfer modes test to do cross-version testing when possible.
>> In that case, we can test normal (non-in-place) tablespaces.  However, that
>> would be limited to the buildfarm.
> 
> Actually, this one was pretty easy to do.

And here is yet another new version of the full patch set.  I'm planning to
commit 0001 (the new pg_upgrade transfer mode test) tomorrow so that I can
deal with any buildfarm indigestion before committing swap mode.  I did run
the test locally for upgrades from v9.6, v13, and v17, but who knows what
unique configurations I've failed to anticipate...

-- 
nathan
>From 5b5fbd87faac7041ad5dd2defacd29cf1eaf6397 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 19 Mar 2025 20:24:41 -0500
Subject: [PATCH v8 1/4] Add test for pg_upgrade file transfer modes.

This new test checks all of pg_upgrade's file transfer modes.  For
each mode, we verify that pg_upgrade either succeeds (and some test
objects successfully reach the new version) or fails with an error
that indicates the mode is not supported on the current platform.
For cross-version tests, we also check that pg_upgrade transfers
non-default tablespaces correctly.  (Tablespaces can't be tested on
same version upgrades because of the version-specific subdirectory
conflict, but we might be able to enable such tests once we teach
pg_upgrade how to handle in-place tablespaces.)

Suggested-by: Robert Haas <robertmh...@gmail.com>
Reviewed-by: Andres Freund <and...@anarazel.de>
Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan
---
 src/bin/pg_upgrade/meson.build           |   1 +
 src/bin/pg_upgrade/t/006_modes.pl        | 101 +++++++++++++++++++++++
 src/test/perl/PostgreSQL/Test/Cluster.pm |  19 +++++
 src/test/perl/PostgreSQL/Test/Utils.pm   |  25 ++++++
 4 files changed, 146 insertions(+)
 create mode 100644 src/bin/pg_upgrade/t/006_modes.pl

diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index da84344966a..16cd9247e76 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -46,6 +46,7 @@ tests += {
       't/003_logical_slots.pl',
       't/004_subscription.pl',
       't/005_char_signedness.pl',
+      't/006_modes.pl',
     ],
     'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow
   },
diff --git a/src/bin/pg_upgrade/t/006_modes.pl b/src/bin/pg_upgrade/t/006_modes.pl
new file mode 100644
index 00000000000..518e0994145
--- /dev/null
+++ b/src/bin/pg_upgrade/t/006_modes.pl
@@ -0,0 +1,101 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Tests for file transfer modes
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+sub test_mode
+{
+       my ($mode) = @_;
+
+       my $old = PostgreSQL::Test::Cluster->new('old', install_path => 
$ENV{oldinstall});
+       my $new = PostgreSQL::Test::Cluster->new('new');
+
+       if (defined($ENV{oldinstall}))
+       {
+               # Checksums are now enabled by default, but weren't before 18, so pass
+               # '-k' to initdb on older versions so that upgrades work.
+               $old->init(extra => ['-k']);
+       }
+       else
+       {
+               $old->init();
+       }
+       $new->init();
+
+       # Create a small variety of simple test objects on the old cluster.  We'll
+       # check that these reach the new version after upgrading.
+       $old->start;
+       $old->safe_psql('postgres', "CREATE TABLE test1 AS SELECT 
generate_series(1, 100)");
+       $old->safe_psql('postgres', "CREATE DATABASE testdb1");
+       $old->safe_psql('testdb1', "CREATE TABLE test2 AS SELECT 
generate_series(200, 300)");
+       $old->safe_psql('testdb1', "VACUUM FULL test2");
+       $old->safe_psql('testdb1', "CREATE SEQUENCE testseq START 5432");
+
+       # For cross-version tests, we can also check that pg_upgrade handles
+       # tablespaces.
+       if (defined($ENV{oldinstall}))
+       {
+               my $tblspc = PostgreSQL::Test::Utils::tempdir_short();
+               $old->safe_psql('postgres', "CREATE TABLESPACE test_tblspc 
LOCATION '$tblspc'");
+               $old->safe_psql('postgres', "CREATE DATABASE testdb2 TABLESPACE 
test_tblspc");
+               $old->safe_psql('postgres', "CREATE TABLE test3 TABLESPACE 
test_tblspc AS SELECT generate_series(300, 401)");
+               $old->safe_psql('testdb2', "CREATE TABLE test4 AS SELECT 
generate_series(400, 502)");
+       }
+       $old->stop;
+
+       my $result = command_ok_or_fails_like(
+               [
+                       'pg_upgrade', '--no-sync',
+                       '--old-datadir' => $old->data_dir,
+                       '--new-datadir' => $new->data_dir,
+                       '--old-bindir' => $old->config_data('--bindir'),
+                       '--new-bindir' => $new->config_data('--bindir'),
+                       '--socketdir' => $new->host,
+                       '--old-port' => $old->port,
+                       '--new-port' => $new->port,
+                       $mode
+               ],
+               qr/.* not supported on this platform|could not .* between old 
and new data directories: .*/,
+               qr/^$/,
+               "pg_upgrade with transfer mode $mode");
+
+       # If pg_upgrade was successful, check that all of our test objects reached
+       # the new version.
+       if ($result)
+       {
+               $new->start;
+               $result = $new->safe_psql('postgres', "SELECT COUNT(*) FROM 
test1");
+               is($result, '100', "test1 data after pg_upgrade $mode");
+               $result = $new->safe_psql('testdb1', "SELECT COUNT(*) FROM 
test2");
+               is($result, '101', "test2 data after pg_upgrade $mode");
+               $result = $new->safe_psql('testdb1', "SELECT 
nextval('testseq')");
+               is($result, '5432', "sequence data after pg_upgrade $mode");
+
+               # For cross-version tests, we should have some objects in a non-default
+               # tablespace.
+               if (defined($ENV{oldinstall}))
+               {
+                       $result = $new->safe_psql('postgres', "SELECT COUNT(*) 
FROM test3");
+                       is($result, '102', "test3 data after pg_upgrade $mode");
+                       $result = $new->safe_psql('testdb2', "SELECT COUNT(*) 
FROM test4");
+                       is($result, '103', "test4 data after pg_upgrade $mode");
+               }
+               $new->stop;
+       }
+
+       $old->clean_node();
+       $new->clean_node();
+}
+
+test_mode('--clone');
+test_mode('--copy');
+test_mode('--copy-file-range');
+test_mode('--link');
+
+done_testing();
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 05bd94609d4..8759ed2cbba 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -2801,6 +2801,25 @@ sub command_fails_like
 
 =pod
 
+=item $node->command_ok_or_fails_like(...)
+
+PostgreSQL::Test::Utils::command_ok_or_fails_like with our connection parameters. See command_ok(...)
+
+=cut
+
+sub command_ok_or_fails_like
+{
+       local $Test::Builder::Level = $Test::Builder::Level + 1;
+
+       my $self = shift;
+
+       local %ENV = $self->_get_env();
+
+       return PostgreSQL::Test::Utils::command_ok_or_fails_like(@_);
+}
+
+=pod
+
 =item $node->command_checks_all(...)
 
 PostgreSQL::Test::Utils::command_checks_all with our connection parameters. See
diff --git a/src/test/perl/PostgreSQL/Test/Utils.pm b/src/test/perl/PostgreSQL/Test/Utils.pm
index d1ad131eadf..7d7ca83495f 100644
--- a/src/test/perl/PostgreSQL/Test/Utils.pm
+++ b/src/test/perl/PostgreSQL/Test/Utils.pm
@@ -89,6 +89,7 @@ our @EXPORT = qw(
   command_like
   command_like_safe
   command_fails_like
+  command_ok_or_fails_like
   command_checks_all
 
   $windows_os
@@ -1067,6 +1068,30 @@ sub command_fails_like
 
 =pod
 
+=item command_ok_or_fails_like(cmd, expected_stdout, expected_stderr, test_name)
+
+Check that the command either succeeds or fails with an error that matches the
+given regular expressions.
+
+=cut
+
+sub command_ok_or_fails_like
+{
+       local $Test::Builder::Level = $Test::Builder::Level + 1;
+       my ($cmd, $expected_stdout, $expected_stderr, $test_name) = @_;
+       my ($stdout, $stderr);
+       print("# Running: " . join(" ", @{$cmd}) . "\n");
+       my $result = IPC::Run::run $cmd, '>' => \$stdout, '2>' => \$stderr;
+       if (!$result)
+       {
+               like($stdout, $expected_stdout, "$test_name: stdout matches");
+               like($stderr, $expected_stderr, "$test_name: stderr matches");
+       }
+       return $result;
+}
+
+=pod
+
 =item command_checks_all(cmd, ret, out, err, test_name)
 
 Run a command and check its status and outputs.
-- 
2.39.5 (Apple Git-154)

>From 1afc1225ce3e49b1da3d97ada50fa01444bdafc4 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 19 Feb 2025 09:14:51 -0600
Subject: [PATCH v8 2/4] initdb: Add --no-sync-data-files.

This new option instructs initdb to skip synchronizing any files
in database directories and the database directories themselves,
i.e., everything in the base/ subdirectory and any other
tablespace directories.  Other files, such as those in pg_wal/ and
pg_xact/, will still be synchronized unless --no-sync is also
specified.  --no-sync-data-files is primarily intended for internal
use by tools that separately ensure the skipped files are
synchronized to disk.  A follow-up commit will use this to help
optimize pg_upgrade's file transfer step.

Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan
---
 doc/src/sgml/ref/initdb.sgml                | 27 +++++++
 src/bin/initdb/initdb.c                     | 10 ++-
 src/bin/initdb/t/001_initdb.pl              |  1 +
 src/bin/pg_basebackup/pg_basebackup.c       |  2 +-
 src/bin/pg_checksums/pg_checksums.c         |  2 +-
 src/bin/pg_combinebackup/pg_combinebackup.c |  2 +-
 src/bin/pg_rewind/file_ops.c                |  2 +-
 src/common/file_utils.c                     | 85 +++++++++++++--------
 src/include/common/file_utils.h             |  2 +-
 9 files changed, 96 insertions(+), 37 deletions(-)

diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
index 0026318485a..2f1f9a42f90 100644
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@@ -527,6 +527,33 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry id="app-initdb-option-no-sync-data-files">
+      <term><option>--no-sync-data-files</option></term>
+      <listitem>
+       <para>
+        By default, <command>initdb</command> safely writes all database files
+        to disk.  This option instructs <command>initdb</command> to skip
+        synchronizing all files in the individual database directories, the
+        database directories themselves, and the tablespace directories, i.e.,
+        everything in the <filename>base</filename> subdirectory and any other
+        tablespace directories.  Other files, such as those in
+        <literal>pg_wal</literal> and <literal>pg_xact</literal>, will still be
+        synchronized unless the <option>--no-sync</option> option is also
+        specified.
+       </para>
+       <para>
+        Note that if <option>--no-sync-data-files</option> is used in
+        conjunction with <option>--sync-method=syncfs</option>, some or all of
+        the aforementioned files and directories will be synchronized because
+        <literal>syncfs</literal> processes entire file systems.
+       </para>
+       <para>
+        This option is primarily intended for internal use by tools that
+        separately ensure the skipped files are synchronized to disk.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="app-initdb-option-no-instructions">
       <term><option>--no-instructions</option></term>
       <listitem>
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 21a0fe3ecd9..22b7d31b165 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -168,6 +168,7 @@ static bool data_checksums = true;
 static char *xlog_dir = NULL;
 static int     wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024);
 static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+static bool sync_data_files = true;
 
 
 /* internal vars */
@@ -2566,6 +2567,7 @@ usage(const char *progname)
        printf(_("  -L DIRECTORY              where to find the input 
files\n"));
        printf(_("  -n, --no-clean            do not clean up after errors\n"));
        printf(_("  -N, --no-sync             do not wait for changes to be 
written safely to disk\n"));
+       printf(_("      --no-sync-data-files  do not sync files within database 
directories\n"));
        printf(_("      --no-instructions     do not print instructions for 
next steps\n"));
        printf(_("  -s, --show                show internal settings, then 
exit\n"));
        printf(_("      --sync-method=METHOD  set method for syncing files to 
disk\n"));
@@ -3208,6 +3210,7 @@ main(int argc, char *argv[])
                {"icu-rules", required_argument, NULL, 18},
                {"sync-method", required_argument, NULL, 19},
                {"no-data-checksums", no_argument, NULL, 20},
+               {"no-sync-data-files", no_argument, NULL, 21},
                {NULL, 0, NULL, 0}
        };
 
@@ -3402,6 +3405,9 @@ main(int argc, char *argv[])
                        case 20:
                                data_checksums = false;
                                break;
+                       case 21:
+                               sync_data_files = false;
+                               break;
                        default:
                                /* getopt_long already emitted a complaint */
                                pg_log_error_hint("Try \"%s --help\" for more 
information.", progname);
@@ -3453,7 +3459,7 @@ main(int argc, char *argv[])
 
                fputs(_("syncing data to disk ... "), stdout);
                fflush(stdout);
-               sync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
+               sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, 
sync_data_files);
                check_ok();
                return 0;
        }
@@ -3516,7 +3522,7 @@ main(int argc, char *argv[])
        {
                fputs(_("syncing data to disk ... "), stdout);
                fflush(stdout);
-               sync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
+               sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, 
sync_data_files);
                check_ok();
        }
        else
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index 01cc4a1602b..15dd10ce40a 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -76,6 +76,7 @@ command_like(
        'checksums are enabled in control file');
 
 command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only');
+command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], 
'--no-sync-data-files');
 command_fails([ 'initdb', $datadir ], 'existing data directory');
 
 if ($supports_syncfs)
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index d4b4e334014..1da4bfc2351 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -2310,7 +2310,7 @@ BaseBackup(char *compression_algorithm, char 
*compression_detail,
                }
                else
                {
-                       (void) sync_pgdata(basedir, serverVersion, sync_method);
+                       (void) sync_pgdata(basedir, serverVersion, sync_method, 
true);
                }
        }
 
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 867aeddc601..f20be82862a 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -633,7 +633,7 @@ main(int argc, char *argv[])
                if (do_sync)
                {
                        pg_log_info("syncing data directory");
-                       sync_pgdata(DataDir, PG_VERSION_NUM, sync_method);
+                       sync_pgdata(DataDir, PG_VERSION_NUM, sync_method, true);
                }
 
                pg_log_info("updating control file");
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index d480dc74436..050260ee832 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -424,7 +424,7 @@ main(int argc, char *argv[])
                else
                {
                        pg_log_debug("recursively fsyncing \"%s\"", opt.output);
-                       sync_pgdata(opt.output, version * 10000, 
opt.sync_method);
+                       sync_pgdata(opt.output, version * 10000, 
opt.sync_method, true);
                }
        }
 
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 467845419ed..55659ce201f 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -296,7 +296,7 @@ sync_target_dir(void)
        if (!do_sync || dry_run)
                return;
 
-       sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method);
+       sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method, true);
 }
 
 
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 0e3cfede935..78e272916f5 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -50,7 +50,8 @@ static int    pre_sync_fname(const char *fname, bool isdir);
 #endif
 static void walkdir(const char *path,
                                        int (*action) (const char *fname, bool 
isdir),
-                                       bool process_symlinks);
+                                       bool process_symlinks,
+                                       const char *exclude_dir);
 
 #ifdef HAVE_SYNCFS
 
@@ -93,11 +94,15 @@ do_syncfs(const char *path)
  * syncing, and might not have privileges to write at all.
  *
  * serverVersion indicates the version of the server to be sync'd.
+ *
+ * If sync_data_files is false, this function skips syncing "base/" and any
+ * other tablespace directories.
  */
 void
 sync_pgdata(const char *pg_data,
                        int serverVersion,
-                       DataDirSyncMethod sync_method)
+                       DataDirSyncMethod sync_method,
+                       bool sync_data_files)
 {
        bool            xlog_is_symlink;
        char            pg_wal[MAXPGPATH];
@@ -147,30 +152,33 @@ sync_pgdata(const char *pg_data,
                                do_syncfs(pg_data);
 
                                /* If any tablespaces are configured, sync each 
of those. */
-                               dir = opendir(pg_tblspc);
-                               if (dir == NULL)
-                                       pg_log_error("could not open directory 
\"%s\": %m",
-                                                                pg_tblspc);
-                               else
+                               if (sync_data_files)
                                {
-                                       while (errno = 0, (de = readdir(dir)) 
!= NULL)
+                                       dir = opendir(pg_tblspc);
+                                       if (dir == NULL)
+                                               pg_log_error("could not open 
directory \"%s\": %m",
+                                                                        
pg_tblspc);
+                                       else
                                        {
-                                               char            
subpath[MAXPGPATH * 2];
+                                               while (errno = 0, (de = 
readdir(dir)) != NULL)
+                                               {
+                                                       char            
subpath[MAXPGPATH * 2];
 
-                                               if (strcmp(de->d_name, ".") == 
0 ||
-                                                       strcmp(de->d_name, 
"..") == 0)
-                                                       continue;
+                                                       if (strcmp(de->d_name, 
".") == 0 ||
+                                                               
strcmp(de->d_name, "..") == 0)
+                                                               continue;
 
-                                               snprintf(subpath, 
sizeof(subpath), "%s/%s",
-                                                                pg_tblspc, 
de->d_name);
-                                               do_syncfs(subpath);
-                                       }
+                                                       snprintf(subpath, 
sizeof(subpath), "%s/%s",
+                                                                        
pg_tblspc, de->d_name);
+                                                       do_syncfs(subpath);
+                                               }
 
-                                       if (errno)
-                                               pg_log_error("could not read 
directory \"%s\": %m",
-                                                                        
pg_tblspc);
+                                               if (errno)
+                                                       pg_log_error("could not 
read directory \"%s\": %m",
+                                                                               
 pg_tblspc);
 
-                                       (void) closedir(dir);
+                                               (void) closedir(dir);
+                                       }
                                }
 
                                /* If pg_wal is a symlink, process that too. */
@@ -182,15 +190,21 @@ sync_pgdata(const char *pg_data,
 
                case DATA_DIR_SYNC_METHOD_FSYNC:
                        {
+                               char       *exclude_dir = NULL;
+
+                               if (!sync_data_files)
+                                       exclude_dir = psprintf("%s/base", 
pg_data);
+
                                /*
                                 * If possible, hint to the kernel that we're 
soon going to
                                 * fsync the data directory and its contents.
                                 */
 #ifdef PG_FLUSH_DATA_WORKS
-                               walkdir(pg_data, pre_sync_fname, false);
+                               walkdir(pg_data, pre_sync_fname, false, 
exclude_dir);
                                if (xlog_is_symlink)
-                                       walkdir(pg_wal, pre_sync_fname, false);
-                               walkdir(pg_tblspc, pre_sync_fname, true);
+                                       walkdir(pg_wal, pre_sync_fname, false, 
NULL);
+                               if (sync_data_files)
+                                       walkdir(pg_tblspc, pre_sync_fname, 
true, NULL);
 #endif
 
                                /*
@@ -203,10 +217,14 @@ sync_pgdata(const char *pg_data,
                                 * get fsync'd twice. That's not an expected 
case so we don't
                                 * worry about optimizing it.
                                 */
-                               walkdir(pg_data, fsync_fname, false);
+                               walkdir(pg_data, fsync_fname, false, 
exclude_dir);
                                if (xlog_is_symlink)
-                                       walkdir(pg_wal, fsync_fname, false);
-                               walkdir(pg_tblspc, fsync_fname, true);
+                                       walkdir(pg_wal, fsync_fname, false, 
NULL);
+                               if (sync_data_files)
+                                       walkdir(pg_tblspc, fsync_fname, true, 
NULL);
+
+                               if (exclude_dir)
+                                       pfree(exclude_dir);
                        }
                        break;
        }
@@ -245,10 +263,10 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod 
sync_method)
                                 * fsync the data directory and its contents.
                                 */
 #ifdef PG_FLUSH_DATA_WORKS
-                               walkdir(dir, pre_sync_fname, false);
+                               walkdir(dir, pre_sync_fname, false, NULL);
 #endif
 
-                               walkdir(dir, fsync_fname, false);
+                               walkdir(dir, fsync_fname, false, NULL);
                        }
                        break;
        }
@@ -264,6 +282,9 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod 
sync_method)
  * ignored in subdirectories, ie we intentionally don't pass down the
  * process_symlinks flag to recursive calls.
  *
+ * If exclude_dir is not NULL, it specifies a directory path to skip
+ * processing.
+ *
  * Errors are reported but not considered fatal.
  *
  * See also walkdir in fd.c, which is a backend version of this logic.
@@ -271,11 +292,15 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod 
sync_method)
 static void
 walkdir(const char *path,
                int (*action) (const char *fname, bool isdir),
-               bool process_symlinks)
+               bool process_symlinks,
+               const char *exclude_dir)
 {
        DIR                *dir;
        struct dirent *de;
 
+       if (exclude_dir && strcmp(exclude_dir, path) == 0)
+               return;
+
        dir = opendir(path);
        if (dir == NULL)
        {
@@ -299,7 +324,7 @@ walkdir(const char *path,
                                (*action) (subpath, false);
                                break;
                        case PGFILETYPE_DIR:
-                               walkdir(subpath, action, false);
+                               walkdir(subpath, action, false, exclude_dir);
                                break;
                        default:
 
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index a832210adc1..8274bc877ab 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -35,7 +35,7 @@ struct iovec;                                 /* avoid 
including port/pg_iovec.h here */
 #ifdef FRONTEND
 extern int     fsync_fname(const char *fname, bool isdir);
 extern void sync_pgdata(const char *pg_data, int serverVersion,
-                                               DataDirSyncMethod sync_method);
+                                               DataDirSyncMethod sync_method, 
bool sync_data_files);
 extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method);
 extern int     durable_rename(const char *oldfile, const char *newfile);
 extern int     fsync_parent_path(const char *fname);
-- 
2.39.5 (Apple Git-154)

>From 4325f2786554c79480993284117bb583298127a3 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 19 Feb 2025 11:25:28 -0600
Subject: [PATCH v8 3/4] pg_dump: Add --sequence-data.

This new option instructs pg_dump to dump sequence data when the
--no-data, --schema-only, or --statistics-only option is specified.
This was originally considered for commit a7e5457db8, but it was
left out at that time because there was no known use-case.  A
follow-up commit will use this to optimize pg_upgrade's file
transfer step.

Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan
---
 doc/src/sgml/ref/pg_dump.sgml               | 11 +++++++++++
 src/bin/pg_dump/pg_dump.c                   | 10 ++--------
 src/bin/pg_dump/t/002_pg_dump.pl            |  1 +
 src/bin/pg_upgrade/dump.c                   |  2 +-
 src/test/modules/test_pg_dump/t/001_base.pl |  2 +-
 5 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 0ae40f9be58..63cca18711a 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -1298,6 +1298,17 @@ PostgreSQL documentation
        </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--sequence-data</option></term>
+      <listitem>
+       <para>
+        Include sequence data in the dump.  This is the default behavior except
+        when <option>--no-data</option>, <option>--schema-only</option>, or
+        <option>--statistics-only</option> is specified.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>--serializable-deferrable</option></term>
       <listitem>
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 428ed2d60fc..e6253331e27 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -518,6 +518,7 @@ main(int argc, char **argv)
                {"sync-method", required_argument, NULL, 15},
                {"filter", required_argument, NULL, 16},
                {"exclude-extension", required_argument, NULL, 17},
+               {"sequence-data", no_argument, &dopt.sequence_data, 1},
 
                {NULL, 0, NULL, 0}
        };
@@ -801,14 +802,6 @@ main(int argc, char **argv)
        if (dopt.column_inserts && dopt.dump_inserts == 0)
                dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT;
 
-       /*
-        * Binary upgrade mode implies dumping sequence data even in schema-only
-        * mode.  This is not exposed as a separate option, but kept separate
-        * internally for clarity.
-        */
-       if (dopt.binary_upgrade)
-               dopt.sequence_data = 1;
-
        if (data_only && schema_only)
                pg_fatal("options -s/--schema-only and -a/--data-only cannot be 
used together");
        if (schema_only && statistics_only)
@@ -1275,6 +1268,7 @@ help(const char *progname)
        printf(_("  --quote-all-identifiers      quote all identifiers, even if 
not key words\n"));
        printf(_("  --rows-per-insert=NROWS      number of rows per INSERT; 
implies --inserts\n"));
        printf(_("  --section=SECTION            dump named section (pre-data, 
data, or post-data)\n"));
+       printf(_("  --sequence-data              include sequence data in 
dump\n"));
        printf(_("  --serializable-deferrable    wait until the dump can run 
without anomalies\n"));
        printf(_("  --snapshot=SNAPSHOT          use given snapshot for the 
dump\n"));
        printf(_("  --statistics-only            dump only the statistics, not 
schema or data\n"));
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index d281e27aa67..ed379033da7 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -66,6 +66,7 @@ my %pgdump_runs = (
                        '--file' => "$tempdir/binary_upgrade.dump",
                        '--no-password',
                        '--no-data',
+                       '--sequence-data',
                        '--binary-upgrade',
                        '--dbname' => 'postgres',    # alternative way to 
specify database
                ],
diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c
index 23fe7280a16..b8fd0d0acee 100644
--- a/src/bin/pg_upgrade/dump.c
+++ b/src/bin/pg_upgrade/dump.c
@@ -52,7 +52,7 @@ generate_old_dump(void)
                snprintf(log_file_name, sizeof(log_file_name), 
DB_DUMP_LOG_FILE_MASK, old_db->db_oid);
 
                parallel_exec_prog(log_file_name, NULL,
-                                                  "\"%s/pg_dump\" %s --no-data 
%s --quote-all-identifiers "
+                                                  "\"%s/pg_dump\" %s --no-data 
%s --sequence-data --quote-all-identifiers "
                                                   "--binary-upgrade 
--format=custom %s --no-sync --file=\"%s/%s\" %s",
                                                   new_cluster.bindir, 
cluster_conn_opts(&old_cluster),
                                                   log_opts.verbose ? 
"--verbose" : "",
diff --git a/src/test/modules/test_pg_dump/t/001_base.pl 
b/src/test/modules/test_pg_dump/t/001_base.pl
index a9bcac4169d..adcaa419616 100644
--- a/src/test/modules/test_pg_dump/t/001_base.pl
+++ b/src/test/modules/test_pg_dump/t/001_base.pl
@@ -48,7 +48,7 @@ my %pgdump_runs = (
                dump_cmd => [
                        'pg_dump', '--no-sync',
                        '--file' => "$tempdir/binary_upgrade.sql",
-                       '--schema-only', '--binary-upgrade',
+                       '--schema-only', '--sequence-data', '--binary-upgrade',
                        '--dbname' => 'postgres',
                ],
        },
-- 
2.39.5 (Apple Git-154)

>From 0bb275bea08d724a32d3f5154cd5d583b9c87ace Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 5 Mar 2025 17:36:54 -0600
Subject: [PATCH v8 4/4] pg_upgrade: Add --swap for faster file transfer.

This new option instructs pg_upgrade to move the data directories
from the old cluster to the new cluster and then to replace the
catalog files with those generated for the new cluster.  This mode
can outperform --link, --clone, --copy, and --copy-file-range,
especially on clusters with many relations.

However, this mode creates many garbage files in the old cluster,
which can prolong the file synchronization step.  To handle that,
we use "initdb --sync-only --no-sync-data-files" for file
synchronization, and we synchronize the catalog files as they are
transferred.  We assume that the database files transferred from
the old cluster were synchronized prior to upgrade.  This mode also
complicates reverting to the old cluster, so we recommend restoring
from backup upon failure during or after file transfer.

The new mode is limited to clusters located in the same file system
and to upgrades from version 10 and newer.

Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan
---
 doc/src/sgml/ref/pgupgrade.sgml    |  59 ++++-
 src/bin/pg_upgrade/TESTING         |   6 +-
 src/bin/pg_upgrade/check.c         |  29 ++-
 src/bin/pg_upgrade/controldata.c   |  21 +-
 src/bin/pg_upgrade/dump.c          |   4 +-
 src/bin/pg_upgrade/file.c          |  14 +-
 src/bin/pg_upgrade/info.c          |   4 +-
 src/bin/pg_upgrade/option.c        |   7 +
 src/bin/pg_upgrade/pg_upgrade.c    |  16 +-
 src/bin/pg_upgrade/pg_upgrade.h    |   5 +-
 src/bin/pg_upgrade/relfilenumber.c | 371 +++++++++++++++++++++++++++++
 src/bin/pg_upgrade/t/006_modes.pl  |  10 +
 src/common/file_utils.c            |  14 +-
 src/include/common/file_utils.h    |   1 +
 14 files changed, 527 insertions(+), 34 deletions(-)

diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index 5db761d1ff1..da261619043 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -244,7 +244,8 @@ PostgreSQL documentation
       <listitem>
        <para>
         Copy files to the new cluster.  This is the default.  (See also
-        <option>--link</option> and <option>--clone</option>.)
+        <option>--link</option>, <option>--clone</option>,
+        <option>--copy-file-range</option>, and <option>--swap</option>.)
        </para>
       </listitem>
      </varlistentry>
@@ -262,6 +263,32 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--swap</option></term>
+      <listitem>
+       <para>
+        Move the data directories from the old cluster to the new cluster.
+        Then, replace the catalog files with those generated for the new
+        cluster.  This mode can outperform <option>--link</option>,
+        <option>--clone</option>, <option>--copy</option>, and
+        <option>--copy-file-range</option>, especially on clusters with many
+        relations.
+       </para>
+       <para>
+        However, this mode creates many garbage files in the old cluster, which
+        can prolong the file synchronization step if
+        <option>--sync-method=syncfs</option> is used.  Therefore, it is
+        recommended to use <option>--sync-method=fsync</option> with
+        <option>--swap</option>.
+       </para>
+       <para>
+        Additionally, once the file transfer step begins, the old cluster will
+        be destructively modified and therefore will no longer be safe to
+        start.  See <xref linkend="pgupgrade-step-revert"/> for details.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       
<term><option>--sync-method=</option><replaceable>method</replaceable></term>
       <listitem>
@@ -530,6 +557,10 @@ NET STOP postgresql-&majorversion;
      is started.  Clone mode also requires that the old and new data
      directories be in the same file system.  This mode is only available
      on certain operating systems and file systems.
+     Swap mode may be the fastest if there are many relations, but you will not
+     be able to access your old cluster once the file transfer step begins.
+     Swap mode also requires that the old and new cluster data directories be
+     in the same file system.
     </para>
 
     <para>
@@ -889,6 +920,32 @@ psql --username=postgres --file=script.sql postgres
 
         </itemizedlist></para>
       </listitem>
+
+      <listitem>
+       <para>
+        If the <option>--swap</option> option was used, the old cluster might
+        be destructively modified:
+
+        <itemizedlist>
+         <listitem>
+          <para>
+           If <command>pg_upgrade</command> aborts before reporting that the
+           old cluster is no longer safe to start, the old cluster was
+           unmodified; it can be restarted.
+          </para>
+         </listitem>
+
+         <listitem>
+          <para>
+           If <command>pg_upgrade</command> has reported that the old cluster
+           is no longer safe to start, the old cluster was destructively
+           modified.  The old cluster will need to be restored from backup in
+           this case.
+          </para>
+         </listitem>
+        </itemizedlist>
+       </para>
+      </listitem>
      </itemizedlist></para>
    </step>
   </procedure>
diff --git a/src/bin/pg_upgrade/TESTING b/src/bin/pg_upgrade/TESTING
index 00842ac6ec3..c3d463c9c29 100644
--- a/src/bin/pg_upgrade/TESTING
+++ b/src/bin/pg_upgrade/TESTING
@@ -20,13 +20,13 @@ export oldinstall=...otherversion/  (old version's install 
base path)
 See DETAILS below for more information about creation of the dump.
 
 You can also test the different transfer modes (--copy, --link,
---clone, --copy-file-range) by setting the environment variable
+--clone, --copy-file-range, --swap) by setting the environment variable
 PG_TEST_PG_UPGRADE_MODE to the respective command-line option, like
 
        make check PG_TEST_PG_UPGRADE_MODE=--link
 
-The default is --copy.  Note that the other modes are not supported on
-all operating systems.
+The default is --copy.  Note that not all modes are supported on all
+operating systems.
 
 DETAILS
 -------
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 88daa808035..564a9116ca5 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -709,7 +709,34 @@ check_new_cluster(void)
                        check_copy_file_range();
                        break;
                case TRANSFER_MODE_LINK:
-                       check_hard_link();
+                       check_hard_link(TRANSFER_MODE_LINK);
+                       break;
+               case TRANSFER_MODE_SWAP:
+
+                       /*
+                        * We do the hard link check for --swap, too, since 
it's an easy
+                        * way to verify the clusters are in the same file 
system.  This
+                        * allows us to take some shortcuts in the file 
synchronization
+                        * step.  With some more effort, we could probably 
support the
+                        * separate-file-system use case, but this mode is 
unlikely to
+                        * offer much benefit if we have to copy the files 
across file
+                        * system boundaries.
+                        */
+                       check_hard_link(TRANSFER_MODE_SWAP);
+
+                       /*
+                        * There are a few known issues with using --swap to 
upgrade from
+                        * versions older than 10.  For example, the sequence 
tuple format
+                        * changed in v10, and the visibility map format 
changed in 9.6.
+                        * While such problems are not insurmountable (and we 
may have to
+                        * deal with similar problems in the future, anyway), 
it doesn't
+                        * seem worth the effort to support swap mode for 
upgrades from
+                        * long-unsupported versions.
+                        */
+                       if (GET_MAJOR_VERSION(old_cluster.major_version) < 1000)
+                               pg_fatal("Swap mode can only upgrade clusters 
from PostgreSQL version %s and later.",
+                                                "10");
+
                        break;
        }
 
diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c
index bd49ea867bf..47ee27ec835 100644
--- a/src/bin/pg_upgrade/controldata.c
+++ b/src/bin/pg_upgrade/controldata.c
@@ -751,7 +751,7 @@ check_control_data(ControlData *oldctrl,
 
 
 void
-disable_old_cluster(void)
+disable_old_cluster(transferMode transfer_mode)
 {
        char            old_path[MAXPGPATH],
                                new_path[MAXPGPATH];
@@ -766,10 +766,17 @@ disable_old_cluster(void)
                                 old_path, new_path);
        check_ok();
 
-       pg_log(PG_REPORT, "\n"
-                  "If you want to start the old cluster, you will need to 
remove\n"
-                  "the \".old\" suffix from %s/global/pg_control.old.\n"
-                  "Because \"link\" mode was used, the old cluster cannot be 
safely\n"
-                  "started once the new cluster has been started.",
-                  old_cluster.pgdata);
+       if (transfer_mode == TRANSFER_MODE_LINK)
+               pg_log(PG_REPORT, "\n"
+                          "If you want to start the old cluster, you will need 
to remove\n"
+                          "the \".old\" suffix from 
%s/global/pg_control.old.\n"
+                          "Because \"link\" mode was used, the old cluster 
cannot be safely\n"
+                          "started once the new cluster has been started.",
+                          old_cluster.pgdata);
+       else if (transfer_mode == TRANSFER_MODE_SWAP)
+               pg_log(PG_REPORT, "\n"
+                          "Because \"swap\" mode was used, the old cluster can 
no longer be\n"
+                          "safely started.");
+       else
+               pg_fatal("unrecognized transfer mode");
 }
diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c
index b8fd0d0acee..23cb08e8347 100644
--- a/src/bin/pg_upgrade/dump.c
+++ b/src/bin/pg_upgrade/dump.c
@@ -52,9 +52,11 @@ generate_old_dump(void)
                snprintf(log_file_name, sizeof(log_file_name), 
DB_DUMP_LOG_FILE_MASK, old_db->db_oid);
 
                parallel_exec_prog(log_file_name, NULL,
-                                                  "\"%s/pg_dump\" %s --no-data 
%s --sequence-data --quote-all-identifiers "
+                                                  "\"%s/pg_dump\" %s --no-data 
%s %s --quote-all-identifiers "
                                                   "--binary-upgrade 
--format=custom %s --no-sync --file=\"%s/%s\" %s",
                                                   new_cluster.bindir, 
cluster_conn_opts(&old_cluster),
+                                                  (user_opts.transfer_mode == 
TRANSFER_MODE_SWAP) ?
+                                                  "" : "--sequence-data",
                                                   log_opts.verbose ? 
"--verbose" : "",
                                                   user_opts.do_statistics ? "" 
: "--no-statistics",
                                                   log_opts.dumpdir,
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 7fd1991204a..91ed16acb08 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -434,7 +434,7 @@ check_copy_file_range(void)
 }
 
 void
-check_hard_link(void)
+check_hard_link(transferMode transfer_mode)
 {
        char            existing_file[MAXPGPATH];
        char            new_link_file[MAXPGPATH];
@@ -444,8 +444,16 @@ check_hard_link(void)
        unlink(new_link_file);          /* might fail */
 
        if (link(existing_file, new_link_file) < 0)
-               pg_fatal("could not create hard link between old and new data 
directories: %m\n"
-                                "In link mode the old and new data directories 
must be on the same file system.");
+       {
+               if (transfer_mode == TRANSFER_MODE_LINK)
+                       pg_fatal("could not create hard link between old and 
new data directories: %m\n"
+                                        "In link mode the old and new data 
directories must be on the same file system.");
+               else if (transfer_mode == TRANSFER_MODE_SWAP)
+                       pg_fatal("could not create hard link between old and 
new data directories: %m\n"
+                                        "In swap mode the old and new data 
directories must be on the same file system.");
+               else
+                       pg_fatal("unrecognized transfer mode");
+       }
 
        unlink(new_link_file);
 }
diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index ad52de8b607..4b7a56f5b3b 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -490,7 +490,7 @@ get_rel_infos_query(void)
                                          "  FROM pg_catalog.pg_class c JOIN 
pg_catalog.pg_namespace n "
                                          "         ON c.relnamespace = n.oid "
                                          "  WHERE relkind IN (" 
CppAsString2(RELKIND_RELATION) ", "
-                                         CppAsString2(RELKIND_MATVIEW) ") AND "
+                                         CppAsString2(RELKIND_MATVIEW) "%s) 
AND "
        /* exclude possible orphaned temp tables */
                                          "    ((n.nspname !~ '^pg_temp_' AND "
                                          "      n.nspname !~ '^pg_toast_temp_' 
AND "
@@ -499,6 +499,8 @@ get_rel_infos_query(void)
                                          "      c.oid >= %u::pg_catalog.oid) 
OR "
                                          "     (n.nspname = 'pg_catalog' AND "
                                          "      relname IN ('pg_largeobject') 
))), ",
+                                         (user_opts.transfer_mode == 
TRANSFER_MODE_SWAP) ?
+                                         ", " CppAsString2(RELKIND_SEQUENCE) : 
"",
                                          FirstNormalObjectId);
 
        /*
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 188dd8d8a8b..7fd7f1d33fc 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -62,6 +62,7 @@ parseCommandLine(int argc, char *argv[])
                {"sync-method", required_argument, NULL, 4},
                {"no-statistics", no_argument, NULL, 5},
                {"set-char-signedness", required_argument, NULL, 6},
+               {"swap", no_argument, NULL, 7},
 
                {NULL, 0, NULL, 0}
        };
@@ -228,6 +229,11 @@ parseCommandLine(int argc, char *argv[])
                                else
                                        pg_fatal("invalid argument for option 
%s", "--set-char-signedness");
                                break;
+
+                       case 7:
+                               user_opts.transfer_mode = TRANSFER_MODE_SWAP;
+                               break;
+
                        default:
                                fprintf(stderr, _("Try \"%s --help\" for more 
information.\n"),
                                                os_info.progname);
@@ -325,6 +331,7 @@ usage(void)
        printf(_("  --no-statistics               do not import statistics from 
old cluster\n"));
        printf(_("  --set-char-signedness=OPTION  set new cluster char 
signedness to \"signed\" or\n"
                         "                                \"unsigned\"\n"));
+       printf(_("  --swap                        move data directories to new 
cluster\n"));
        printf(_("  --sync-method=METHOD          set method for syncing files 
to disk\n"));
        printf(_("  -?, --help                    show this help, then 
exit\n"));
        printf(_("\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 174cd920840..9295e46aed3 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -170,12 +170,14 @@ main(int argc, char **argv)
 
        /*
         * Most failures happen in create_new_objects(), which has completed at
-        * this point.  We do this here because it is just before linking, which
-        * will link the old and new cluster data files, preventing the old
-        * cluster from being safely started once the new cluster is started.
+        * this point.  We do this here because it is just before file transfer,
+        * which for --link will make it unsafe to start the old cluster once 
the
+        * new cluster is started, and for --swap will make it unsafe to start 
the
+        * old cluster at all.
         */
-       if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
-               disable_old_cluster();
+       if (user_opts.transfer_mode == TRANSFER_MODE_LINK ||
+               user_opts.transfer_mode == TRANSFER_MODE_SWAP)
+               disable_old_cluster(user_opts.transfer_mode);
 
        transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr,
                                                                 
old_cluster.pgdata, new_cluster.pgdata);
@@ -212,8 +214,10 @@ main(int argc, char **argv)
        {
                prep_status("Sync data directory to disk");
                exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-                                 "\"%s/initdb\" --sync-only \"%s\" 
--sync-method %s",
+                                 "\"%s/initdb\" --sync-only %s \"%s\" 
--sync-method %s",
                                  new_cluster.bindir,
+                                 (user_opts.transfer_mode == 
TRANSFER_MODE_SWAP) ?
+                                 "--no-sync-data-files" : "",
                                  new_cluster.pgdata,
                                  user_opts.sync_method);
                check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 4c9d0172149..69c965bb7d0 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -262,6 +262,7 @@ typedef enum
        TRANSFER_MODE_COPY,
        TRANSFER_MODE_COPY_FILE_RANGE,
        TRANSFER_MODE_LINK,
+       TRANSFER_MODE_SWAP,
 } transferMode;
 
 /*
@@ -391,7 +392,7 @@ void                
create_script_for_old_cluster_deletion(char **deletion_script_file_name);
 
 void           get_control_data(ClusterInfo *cluster);
 void           check_control_data(ControlData *oldctrl, ControlData *newctrl);
-void           disable_old_cluster(void);
+void           disable_old_cluster(transferMode transfer_mode);
 
 
 /* dump.c */
@@ -423,7 +424,7 @@ void                rewriteVisibilityMap(const char 
*fromfile, const char *tofile,
                                                                 const char 
*schemaName, const char *relName);
 void           check_file_clone(void);
 void           check_copy_file_range(void);
-void           check_hard_link(void);
+void           check_hard_link(transferMode transfer_mode);
 
 /* fopen_priv() is no longer different from fopen() */
 #define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/bin/pg_upgrade/relfilenumber.c 
b/src/bin/pg_upgrade/relfilenumber.c
index 8c23c583172..b07f3330fee 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -11,11 +11,92 @@
 
 #include <sys/stat.h>
 
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "common/int.h"
+#include "common/logging.h"
 #include "pg_upgrade.h"
 
 static void transfer_single_new_db(FileNameMap *maps, int size, char 
*old_tablespace);
 static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool 
vm_must_add_frozenbit);
 
+/*
+ * The following set of sync_queue_* functions are used for --swap to reduce
+ * the amount of time spent synchronizing the swapped catalog files.  When a
+ * file is added to the queue, we also alert the file system that we'd like it
+ * to be persisted to disk in the near future (if that operation is supported
+ * by the current platform).  Once the queue is full, all of the files are
+ * synchronized to disk.  This strategy should generally be much faster than
+ * simply calling fsync() on the files right away.
+ *
+ * The general usage pattern should be something like:
+ *
+ *     for (int i = 0; i < num_files; i++)
+ *         sync_queue_push(files[i]);
+ *
+ *     // be sure to sync any remaining files in the queue
+ *     sync_queue_sync_all();
+ *     sync_queue_destroy();
+ */
+
+#define SYNC_QUEUE_MAX_LEN     (1024)
+
+static char *sync_queue[SYNC_QUEUE_MAX_LEN];
+static bool sync_queue_inited;
+static int     sync_queue_len;
+
+static inline void
+sync_queue_init(void)
+{
+       if (sync_queue_inited)
+               return;
+
+       sync_queue_inited = true;
+       for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++)
+               sync_queue[i] = palloc(MAXPGPATH);
+}
+
+static inline void
+sync_queue_sync_all(void)
+{
+       if (!sync_queue_inited)
+               return;
+
+       for (int i = 0; i < sync_queue_len; i++)
+       {
+               if (fsync_fname(sync_queue[i], false) != 0)
+                       pg_fatal("could not synchronize file \"%s\": %m", 
sync_queue[i]);
+       }
+
+       sync_queue_len = 0;
+}
+
+static inline void
+sync_queue_push(const char *fname)
+{
+       sync_queue_init();
+
+       pre_sync_fname(fname, false);
+       /* strlcpy() always NUL-terminates the queued path; strncpy() might not */
+       strlcpy(sync_queue[sync_queue_len++], fname, MAXPGPATH);
+       if (sync_queue_len >= SYNC_QUEUE_MAX_LEN)
+               sync_queue_sync_all();
+}
+
+static inline void
+sync_queue_destroy(void)
+{
+       if (!sync_queue_inited)
+               return;
+
+       sync_queue_inited = false;
+       sync_queue_len = 0;
+       for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++)
+       {
+               pfree(sync_queue[i]);
+               sync_queue[i] = NULL;
+       }
+}
 
 /*
  * transfer_all_new_tablespaces()
@@ -41,6 +122,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, 
DbInfoArr *new_db_arr,
                case TRANSFER_MODE_LINK:
                        prep_status_progress("Linking user relation files");
                        break;
+               case TRANSFER_MODE_SWAP:
+                       prep_status_progress("Swapping data directories");
+                       break;
        }
 
        /*
@@ -125,6 +209,274 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr 
*new_db_arr,
                /* We allocate something even for n_maps == 0 */
                pg_free(mappings);
        }
+
+       /*
+        * Make sure anything pending synchronization in swap mode is fully
+        * persisted to disk.  This is a no-op for other transfer modes.
+        */
+       sync_queue_sync_all();
+       sync_queue_destroy();
+}
+
+/*
+ * prepare_for_swap()
+ *
+ * This function moves the database directory from the old cluster to the new
+ * cluster in preparation for moving the pg_restore-generated catalog files
+ * into place.  Returns false if the database with the given OID does not have
+ * a directory in the given tablespace, otherwise returns true.
+ *
+ * old_cat (the directory for the old catalog files), new_dat (the database
+ * directory in the new cluster), and moved_dat (the destination for the
+ * pg_restore-generated database directory) should be sized to MAXPGPATH bytes.
+ * This function will return the appropriate paths in those variables.
+ */
+static bool
+prepare_for_swap(const char *old_tablespace, Oid db_oid,
+                                char *old_cat, char *new_dat, char *moved_dat)
+{
+       const char *new_tablespace;
+       const char *old_tblspc_suffix;
+       const char *new_tblspc_suffix;
+       char            old_tblspc[MAXPGPATH];
+       char            new_tblspc[MAXPGPATH];
+       char            moved_tblspc[MAXPGPATH];
+       char            old_dat[MAXPGPATH];
+       struct stat st;
+
+       if (strcmp(old_tablespace, old_cluster.pgdata) == 0)
+       {
+               new_tablespace = new_cluster.pgdata;
+               new_tblspc_suffix = "/base";
+               old_tblspc_suffix = "/base";
+       }
+       else
+       {
+               /*
+                * XXX: The below line is a hack to deal with the fact that we
+                * presently don't have an easy way to find the corresponding new
+                * tablespace's path.  This will need to be fixed if/when we add
+                * pg_upgrade support for in-place tablespaces.
+                */
+               new_tablespace = old_tablespace;
+
+               new_tblspc_suffix = new_cluster.tablespace_suffix;
+               old_tblspc_suffix = old_cluster.tablespace_suffix;
+       }
+
+       /* Old and new cluster paths. */
+       snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix);
+       snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", new_tablespace, new_tblspc_suffix);
+       snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, db_oid);
+       snprintf(new_dat, MAXPGPATH, "%s/%u", new_tblspc, db_oid);
+
+       /*
+        * Paths for "moved aside" stuff.  We intentionally put these in the old
+        * cluster so that the delete_old_cluster.{sh,bat} script handles them.
+        */
+       snprintf(moved_tblspc, sizeof(moved_tblspc), "%s/moved_for_upgrade", old_tblspc);
+       snprintf(old_cat, MAXPGPATH, "%s/%u_old_catalogs", moved_tblspc, db_oid);
+       snprintf(moved_dat, MAXPGPATH, "%s/%u", moved_tblspc, db_oid);
+
+       /* Check that the database directory exists in the given tablespace. */
+       if (stat(old_dat, &st) != 0)
+       {
+               if (errno != ENOENT)
+                       pg_fatal("could not stat file \"%s\": %m", old_dat);
+               return false;
+       }
+
+       /* Create directory for stuff that is moved aside. */
+       if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) != 0 && errno != EEXIST)
+               pg_fatal("could not create directory \"%s\": %m", moved_tblspc);
+
+       /* Create directory for old catalog files. */
+       if (pg_mkdir_p(old_cat, pg_dir_create_mode) != 0)
+               pg_fatal("could not create directory \"%s\": %m", old_cat);
+
+       /* Move the new cluster's database directory aside. */
+       if (rename(new_dat, moved_dat) != 0)
+               pg_fatal("could not rename \"%s\" to \"%s\": %m", new_dat, moved_dat);
+
+       /* Move the old cluster's database directory into place. */
+       if (rename(old_dat, new_dat) != 0)
+               pg_fatal("could not rename \"%s\" to \"%s\": %m", old_dat, new_dat);
+
+       return true;
+}
+
+/*
+ * FileNameMapCmp()
+ *
+ * qsort() comparator for FileNameMap that sorts by RelFileNumber.
+ */
+static int
+FileNameMapCmp(const void *a, const void *b)
+{
+       const FileNameMap *map1 = (const FileNameMap *) a;
+       const FileNameMap *map2 = (const FileNameMap *) b;
+
+       return pg_cmp_u32(map1->relfilenumber, map2->relfilenumber);
+}
+
+/*
+ * parse_relfilenumber()
+ *
+ * Attempt to parse the RelFileNumber of the given file name.  If we can't,
+ * return InvalidRelFileNumber.  Note that this code snippet is lifted from
+ * parse_filename_for_nontemp_relation().
+ */
+static RelFileNumber
+parse_relfilenumber(const char *filename)
+{
+       char       *endp;
+       unsigned long n;
+
+       if (filename[0] < '1' || filename[0] > '9')
+               return InvalidRelFileNumber;
+
+       errno = 0;
+       n = strtoul(filename, &endp, 10);
+       if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX)
+               return InvalidRelFileNumber;
+
+       return (RelFileNumber) n;
+}
+
+/*
+ * swap_catalog_files()
+ *
+ * Moves the old catalog files aside, and moves the new catalog files into
+ * place.
+ */
+static void
+swap_catalog_files(FileNameMap *maps, int size, const char *old_cat,
+                                  const char *new_dat, const char *moved_dat)
+{
+       DIR                *dir;
+       struct dirent *de;
+       char            path[MAXPGPATH];
+       char            dest[MAXPGPATH];
+       RelFileNumber rfn;
+
+       /* Move the old catalog files aside. */
+       dir = opendir(new_dat);
+       if (dir == NULL)
+               pg_fatal("could not open directory \"%s\": %m", new_dat);
+       while (errno = 0, (de = readdir(dir)) != NULL)
+       {
+               snprintf(path, sizeof(path), "%s/%s", new_dat, de->d_name);
+               if (get_dirent_type(path, de, false, PG_LOG_ERROR) != 
PGFILETYPE_REG)
+                       continue;
+
+               rfn = parse_relfilenumber(de->d_name);
+               if (RelFileNumberIsValid(rfn))
+               {
+                       FileNameMap key = {.relfilenumber = rfn};
+
+                       if (bsearch(&key, maps, size, sizeof(FileNameMap), 
FileNameMapCmp))
+                               continue;
+               }
+
+               snprintf(dest, sizeof(dest), "%s/%s", old_cat, de->d_name);
+               if (rename(path, dest) != 0)
+                       pg_fatal("could not rename \"%s\" to \"%s\": %m", path, 
dest);
+       }
+       if (errno)
+               pg_fatal("could not read directory \"%s\": %m", new_dat);
+       (void) closedir(dir);
+
+       /* Move the new catalog files into place. */
+       dir = opendir(moved_dat);
+       if (dir == NULL)
+               pg_fatal("could not open directory \"%s\": %m", moved_dat);
+       while (errno = 0, (de = readdir(dir)) != NULL)
+       {
+               snprintf(path, sizeof(path), "%s/%s", moved_dat, de->d_name);
+               if (get_dirent_type(path, de, false, PG_LOG_ERROR) != 
PGFILETYPE_REG)
+                       continue;
+
+               rfn = parse_relfilenumber(de->d_name);
+               if (RelFileNumberIsValid(rfn))
+               {
+                       FileNameMap key = {.relfilenumber = rfn};
+
+                       if (bsearch(&key, maps, size, sizeof(FileNameMap), 
FileNameMapCmp))
+                               continue;
+               }
+
+               snprintf(dest, sizeof(dest), "%s/%s", new_dat, de->d_name);
+               if (rename(path, dest) != 0)
+                       pg_fatal("could not rename \"%s\" to \"%s\": %m", path, 
dest);
+
+               /*
+                * We don't fsync() the database files in the file 
synchronization
+                * stage of pg_upgrade in swap mode, so we need to synchronize 
them
+                * ourselves.  We only do this for the catalog files because 
they were
+                * created during pg_restore with fsync=off.  We assume that 
the user
+                * data files were properly persisted to disk when the old cluster
+                * was last shut down.
+                */
+               if (user_opts.do_sync)
+                       sync_queue_push(dest);
+       }
+       if (errno)
+               pg_fatal("could not read directory \"%s\": %m", moved_dat);
+       (void) closedir(dir);
+
+       /* Ensure the directory entries are persisted to disk. */
+       if (fsync_fname(new_dat, true) != 0)
+               pg_fatal("could not synchronize directory \"%s\": %m", new_dat);
+       if (fsync_parent_path(new_dat) != 0)
+               pg_fatal("could not synchronize parent directory of \"%s\": 
%m", new_dat);
+}
+
+/*
+ * do_swap()
+ *
+ * Perform the required steps for --swap for a single database.  In short this
+ * moves the old cluster's database directory into the new cluster and then
+ * replaces any files for system catalogs with the ones that were generated
+ * during pg_restore.
+ */
+static void
+do_swap(FileNameMap *maps, int size, char *old_tablespace)
+{
+       char            old_cat[MAXPGPATH];
+       char            new_dat[MAXPGPATH];
+       char            moved_dat[MAXPGPATH];
+
+       /*
+        * We perform many lookups on maps by relfilenumber in swap mode, so 
make
+        * sure it's sorted by relfilenumber.  maps should already be sorted by
+        * OID, so in general this shouldn't have much work to do.
+        */
+       qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp);
+
+       /*
+        * If an old tablespace is given, we only need to process that one.  If 
no
+        * old tablespace is specified, we need to process all the tablespaces 
on
+        * the system.
+        */
+       if (old_tablespace)
+       {
+               if (prepare_for_swap(old_tablespace, maps[0].db_oid,
+                                                        old_cat, new_dat, 
moved_dat))
+                       swap_catalog_files(maps, size, old_cat, new_dat, 
moved_dat);
+       }
+       else
+       {
+               if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid,
+                                                        old_cat, new_dat, 
moved_dat))
+                       swap_catalog_files(maps, size, old_cat, new_dat, 
moved_dat);
+
+               for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; 
tblnum++)
+               {
+                       if (prepare_for_swap(os_info.old_tablespaces[tblnum], 
maps[0].db_oid,
+                                                                old_cat, 
new_dat, moved_dat))
+                               swap_catalog_files(maps, size, old_cat, 
new_dat, moved_dat);
+               }
+       }
 }
 
 /*
@@ -145,6 +497,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
                new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
                vm_must_add_frozenbit = true;
 
+       /* --swap has its own subroutine */
+       if (user_opts.transfer_mode == TRANSFER_MODE_SWAP)
+       {
+               /*
+                * We don't support --swap to upgrade from versions that require
+                * rewriting the visibility map.  We should've failed already if
+                * someone tries to do that.
+                */
+               Assert(!vm_must_add_frozenbit);
+
+               do_swap(maps, size, old_tablespace);
+               return;
+       }
+
        for (mapnum = 0; mapnum < size; mapnum++)
        {
                if (old_tablespace == NULL ||
@@ -259,6 +625,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
                                        pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
                                                   old_file, new_file);
                                        linkFile(old_file, new_file, map->nspname, map->relname);
+                                       break;
+                               case TRANSFER_MODE_SWAP:
+                                       /* swap mode is handled in its own code path */
+                                       pg_fatal("should never happen");
+                                       break;
                        }
        }
 }
diff --git a/src/bin/pg_upgrade/t/006_modes.pl b/src/bin/pg_upgrade/t/006_modes.pl
index 518e0994145..34fddbcdab5 100644
--- a/src/bin/pg_upgrade/t/006_modes.pl
+++ b/src/bin/pg_upgrade/t/006_modes.pl
@@ -16,6 +16,15 @@ sub test_mode
        my $old = PostgreSQL::Test::Cluster->new('old', install_path => $ENV{oldinstall});
        my $new = PostgreSQL::Test::Cluster->new('new');
 
+       # --swap can't be used to upgrade from versions older than 10, so just skip
+       # the test if the old cluster version is too old.
+       if ($old->pg_version < 10 && $mode eq "--swap")
+       {
+               $old->clean_node();
+               $new->clean_node();
+               return;
+       }
+
        if (defined($ENV{oldinstall}))
        {
                # Checksums are now enabled by default, but weren't before 18, so pass
@@ -97,5 +106,6 @@ test_mode('--clone');
 test_mode('--copy');
 test_mode('--copy-file-range');
 test_mode('--link');
+test_mode('--swap');
 
 done_testing();
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 78e272916f5..4405ef8b425 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -45,9 +45,6 @@
  */
 #define MINIMUM_VERSION_FOR_PG_WAL     100000
 
-#ifdef PG_FLUSH_DATA_WORKS
-static int     pre_sync_fname(const char *fname, bool isdir);
-#endif
 static void walkdir(const char *path,
                                        int (*action) (const char *fname, bool isdir),
                                        bool process_symlinks,
@@ -352,16 +349,16 @@ walkdir(const char *path,
 }
 
 /*
- * Hint to the OS that it should get ready to fsync() this file.
+ * Hint to the OS that it should get ready to fsync() this file, if supported
+ * by the platform.
  *
  * Ignores errors trying to open unreadable files, and reports other errors
  * non-fatally.
  */
-#ifdef PG_FLUSH_DATA_WORKS
-
-static int
+int
 pre_sync_fname(const char *fname, bool isdir)
 {
+#ifdef PG_FLUSH_DATA_WORKS
        int                     fd;
 
        fd = open(fname, O_RDONLY | PG_BINARY, 0);
@@ -388,11 +385,10 @@ pre_sync_fname(const char *fname, bool isdir)
 #endif
 
        (void) close(fd);
+#endif                                                 /* PG_FLUSH_DATA_WORKS */
        return 0;
 }
 
-#endif                                                 /* PG_FLUSH_DATA_WORKS */
-
 /*
  * fsync_fname -- Try to fsync a file or directory
  *
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 8274bc877ab..9fd88953e43 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -33,6 +33,7 @@ typedef enum DataDirSyncMethod
 struct iovec;                                  /* avoid including port/pg_iovec.h here */
 
 #ifdef FRONTEND
+extern int     pre_sync_fname(const char *fname, bool isdir);
 extern int     fsync_fname(const char *fname, bool isdir);
 extern void sync_pgdata(const char *pg_data, int serverVersion,
                                                DataDirSyncMethod sync_method, bool sync_data_files);
-- 
2.39.5 (Apple Git-154)

Reply via email to