At Mon, 28 Mar 2022 10:01:05 +0900 (JST), Kyotaro Horiguchi 
<horikyota....@gmail.com> wrote in 
> At Fri, 25 Mar 2022 13:26:05 +0100, Alvaro Herrera <alvhe...@alvh.no-ip.org> 
> wrote in 
> > Pushed this, backpatching to 14 and 13.  It would have been good to
> > backpatch further, but there's an (textually trivial) merge conflict
> > related to commit e6d8069522c8.  Because that commit conceptually
> > touches the same area that this bugfix is about, I'm not sure that
> > backpatching further without a lot more thought is wise -- particularly
> > so when there's no way to automate the test in branches older than
> > master.
> 
> Thanks for committing.
> 
> > This is quite annoying, considering that the bug was reported shortly
> > before 12 went into beta.
> 
> Sure.  I'm going to look into that.

This is a preparatory patch and a tentative (yes, it's just tentative)
test. It is made for 12 but applies with some warnings to 10-11.

(Hope the attachments are attached as "attachment", not  "inline".)

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 3d5b24691517c1aac4b49728abb122c66a4e33be Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota....@gmail.com>
Date: Mon, 28 Mar 2022 16:29:04 +0900
Subject: [PATCH 1/2] Tentative test for tsp replay fix

---
 src/test/perl/PostgresNode.pm             | 342 +++++++++++++++++++++-
 src/test/recovery/t/011_crash_recovery.pl | 108 ++++++-
 2 files changed, 447 insertions(+), 3 deletions(-)

diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 7b2ec29bb7..88fa08b61d 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -104,6 +104,8 @@ use TestLib ();
 use Time::HiRes qw(usleep);
 use Scalar::Util qw(blessed);
 
+my $windows_os = 0;
+
 our @EXPORT = qw(
   get_new_node
   get_free_port
@@ -323,6 +325,64 @@ sub archive_dir
 
 =pod
 
+=item $node->tablespace_storage([, nocreate])
+
+Directory to store tablespace directories.
+If nocreate is true, returns undef if not yet created.
+
+=cut
+
+sub tablespace_storage
+{
+       my ($self, $nocreate) = @_;
+
+       if (!defined $self->{_tsproot})
+       {
+               # tablespace is not used, return undef if nocreate is specified.
+               return undef if ($nocreate);
+
+               # create and remember the tablespace root directory.
+               $self->{_tsproot} = TestLib::tempdir_short();
+       }
+
+       return $self->{_tsproot};
+}
+
+=pod
+
+=item $node->tablespaces()
+
+Returns a hash from tablespace OID to tablespace directory name.  For
+example, an oid 16384 pointing to /tmp/jWAhkT_fs0/ts1 is stored as
+$hash{16384} = "ts1".
+
+=cut
+
+sub tablespaces
+{
+       my ($self) = @_;
+       my $pg_tblspc = $self->data_dir . '/pg_tblspc';
+       my %ret;
+
+       # return undef if no tablespace is used
+       return undef if (!defined $self->tablespace_storage(1));
+
+       # collect tablespace entries in pg_tblspc directory
+       opendir(my $dir, $pg_tblspc);
+       while (my $oid = readdir($dir))
+       {
+               next if ($oid !~ /^([0-9]+)$/);
+               my $linkpath = "$pg_tblspc/$oid";
+               my $tsppath = dir_readlink($linkpath);
+               $ret{$oid} = File::Basename::basename($tsppath);
+       }
+       closedir($dir);
+
+       return %ret;
+}
+
+=pod
+
 =item $node->backup_dir()
 
 The output path for backups taken with $node->backup()
@@ -338,6 +398,77 @@ sub backup_dir
 
 =pod
 
+=item $node->backup_tablespace_storage_path(backup_name)
+
+Returns tablespace location path for backup_name.
+Returns the parent directory if backup_name is not given.
+
+=cut
+
+sub backup_tablespace_storage_path
+{
+       my ($self, $backup_name) = @_;
+       my $dir = $self->backup_dir . '/__tsps';
+
+       $dir .= "/$backup_name" if (defined $backup_name);
+
+       return $dir;
+}
+
+=pod
+
+=item $node->backup_create_tablespace_storage(backup_name)
+
+Create the tablespace location directory for backup_name if it does not exist yet.
+Create the parent tablespace storage that holds all location
+directories if backup_name is not supplied.
+
+=cut
+
+sub backup_create_tablespace_storage
+{
+       my ($self, $backup_name) = @_;
+       my $dir = $self->backup_tablespace_storage_path($backup_name);
+
+       File::Path::make_path $dir if (! -d $dir);
+}
+
+=pod
+
+=item $node->backup_tablespaces(backup_name)
+
+Returns a reference to a hash that maps each tablespace OID to the name of
+the tablespace directory held by the specified backup.
+For example, an oid 16384 pointing to <backup_dir>/__tsps/backup1/ts1 is stored as
+$hash{16384} = "ts1".
+
+=cut
+
+sub backup_tablespaces
+{
+       my ($self, $backup_name) = @_;
+       my $pg_tblspc = $self->backup_dir . '/' . $backup_name . '/pg_tblspc';
+       my %ret;
+
+       #return undef if this backup holds no tablespaces
+       return undef if (! -d 
$self->backup_tablespace_storage_path($backup_name));
+
+       # scan pg_tblspc directory of the backup
+       opendir(my $dir, $pg_tblspc);
+       while (my $oid = readdir($dir))
+       {
+               next if ($oid !~ /^([0-9]+)$/);
+               my $linkpath = "$pg_tblspc/$oid";
+               my $tsppath = dir_readlink($linkpath);
+               $ret{$oid} = File::Basename::basename($tsppath);
+       }
+       closedir($dir);
+
+       return \%ret;
+}
+
+=pod
+
 =item $node->info()
 
 Return a string containing human-readable diagnostic information (paths, etc)
@@ -354,6 +485,7 @@ sub info
        print $fh "Data directory: " . $self->data_dir . "\n";
        print $fh "Backup directory: " . $self->backup_dir . "\n";
        print $fh "Archive directory: " . $self->archive_dir . "\n";
+       print $fh "Tablespace directory: " . $self->tablespace_storage . "\n";
        print $fh "Connection string: " . $self->connstr . "\n";
        print $fh "Log file: " . $self->logfile . "\n";
        close $fh or die;
@@ -536,6 +668,43 @@ sub append_conf
 
 =pod
 
+=item $node->new_tablespace(name)
+
+Creates a tablespace directory with the given name and returns its path.
+
+=cut
+
+sub new_tablespace
+{
+       my ($self, $name) = @_;
+
+       my $path = $self->tablespace_storage . '/' . $name;
+
+       die "tablespace \"$name\" already exists" if (!mkdir($path));
+
+       return $path;
+}
+
+=pod
+
+=item $node->tablespace_dir(name)
+
+Returns the path of the tablespace directory with the given name, or undef if it does not exist.
+
+=cut
+
+sub tablespace_dir
+{
+       my ($self, $name) = @_;
+
+       my $path = $self->tablespace_storage . '/' . $name;
+       return undef if (!-d $path);
+
+       return $path;
+}
+
+=pod
+
 =item $node->backup(backup_name)
 
 Create a hot backup with B<pg_basebackup> in subdirectory B<backup_name> of
@@ -555,13 +724,54 @@ sub backup
        my ($self, $backup_name, %params) = @_;
        my $backup_path = $self->backup_dir . '/' . $backup_name;
        my $name        = $self->name;
+       my @tsp_maps;
+
+       # Build tablespace mappings.  We first let pg_basebackup copy
+       # tablespaces into temporary tablespace storage with a short name
+       # so that we can work on pathnames that fit the tar format that
+       # pg_basebackup depends on.
+       my $map_src_root = $self->tablespace_storage(1);
+       my $backup_tmptsp_root = TestLib::tempdir_short();
+       my %tsps = $self->tablespaces();
+       foreach my $tspname (values %tsps)
+       {
+               my $src = "$map_src_root/$tspname";
+               my $dst = "$backup_tmptsp_root/$tspname";
+               push(@tsp_maps, "--tablespace-mapping=$src=$dst");
+       }
 
        print "# Taking pg_basebackup $backup_name from node \"$name\"\n";
        TestLib::system_or_bail(
                'pg_basebackup', '-D', $backup_path, '-h',
                $self->host,     '-p', $self->port,  '--checkpoint',
                'fast',          '--no-sync',
+               @tsp_maps,
                @{ $params{backup_options} });
+
+       # Move the tablespaces from temporary storage into backup
+       # directory, unless the backup is in tar mode.
+       if (%tsps && ! -f "$backup_path/base.tar")
+       {
+               $self->backup_create_tablespace_storage();
+               RecursiveCopy::copypath(
+                       $backup_tmptsp_root,
+                       $self->backup_tablespace_storage_path($backup_name));
+               # delete the temporary directory right away
+               rmtree $backup_tmptsp_root;
+
+               # Fix tablespace symlinks.  This is not necessarily required
+               # in backups but keep them consistent.
+               my $linkdst_root = "$backup_path/pg_tblspc";
+               my $linksrc_root = 
$self->backup_tablespace_storage_path($backup_name);
+               foreach my $oid (keys %tsps)
+               {
+                       my $tspdst = "$linkdst_root/$oid";
+                       my $tspsrc = "$linksrc_root/" . $tsps{$oid};
+                       unlink $tspdst;
+                       dir_symlink($tspsrc, $tspdst);
+               }
+       }
+
        print "# Backup finished\n";
        return;
 }
@@ -623,11 +833,32 @@ sub _backup_fs
        RecursiveCopy::copypath(
                $self->data_dir,
                $backup_path,
+               # Skipping some files and tablespace symlinks
                filterfn => sub {
                        my $src = shift;
-                       return ($src ne 'log' and $src ne 'postmaster.pid');
+                       return ($src ne 'log' and $src ne 'postmaster.pid' and
+                                       $src !~ m!^pg_tblspc/[0-9]+$!);
                });
 
+       # Copy tablespaces if any
+       my %tsps = $self->tablespaces();
+       if (%tsps)
+       {
+               $self->backup_create_tablespace_storage();
+               RecursiveCopy::copypath(
+                       $self->tablespace_storage,
+                       $self->backup_tablespace_storage_path($backup_name));
+
+               my $linkdst_root = $backup_path . '/pg_tblspc';
+               my $linksrc_root = 
$self->backup_tablespace_storage_path($backup_name);
+               foreach my $oid (keys %tsps)
+               {
+                       my $tspdst = "$linkdst_root/$oid";
+                       my $tspsrc = "$linksrc_root/" . $tsps{$oid};
+                       dir_symlink($tspsrc, $tspdst);
+               }
+       }
+
        if ($hot)
        {
 
@@ -645,6 +876,80 @@ sub _backup_fs
 
 
 
+=pod
+
+=item dir_symlink(oldname, newname)
+
+Portably create a symlink for a directory. On Windows this creates a junction
+point. Elsewhere it just calls perl's builtin symlink.
+
+=cut
+
+sub dir_symlink
+{
+       my $oldname = shift;
+       my $newname = shift;
+       if ($windows_os)
+       {
+               $oldname =~ s,/,\\,g;
+               $newname =~ s,/,\\,g;
+               my $cmd = qq{mklink /j "$newname" "$oldname"};
+               if ($Config{osname} eq 'msys')
+               {
+                       # need some indirection on msys
+                       $cmd = qq{echo '$cmd' | \$COMSPEC /Q};
+               }
+               system($cmd);
+       }
+       else
+       {
+               symlink $oldname, $newname;
+       }
+       die "No $newname" unless -e $newname;
+}
+
+=pod
+
+=item dir_readlink(name)
+
+Portably read a symlink for a directory. On Windows this reads a junction
+point. Elsewhere it just calls perl's builtin readlink.
+
+=cut
+
+sub dir_readlink
+{
+       my $name = shift;
+       if ($windows_os)
+       {
+               $name .= '/..';
+               $name =~ s,/,\\,g;
+               # Split the path into parent directory and link name
+               die "invalid path spec: $name" if ($name !~ 
m!^(.*)\\([^\\]+)\\?$!);
+               my ($dir, $fname) = ($1, $2);
+               my $cmd = qq{cmd /c "dir /A:L $dir"};
+               if ($Config{osname} eq 'msys')
+               {
+                       # need some indirection on msys
+                       $cmd = qq{echo '$cmd' | \$COMSPEC /Q};
+               }
+
+               my $result;
+               foreach my $l (split /[\r\n]+/, `$cmd`)
+               {
+                       $result = $1 if ($l =~ m/<JUNCTION>\W+$fname \[(.*)\]/)
+               }
+               die "junction $name not found" if (!defined $result);
+
+               $name =~ s,\\,/,g;
+               return $result;
+       }
+       else
+       {
+               return readlink $name;
+       }
+}
+
 =pod
 
 =item $node->init_from_backup(root_node, backup_name)
@@ -689,7 +994,40 @@ sub init_from_backup
 
        my $data_path = $self->data_dir;
        rmdir($data_path);
-       RecursiveCopy::copypath($backup_path, $data_path);
+
+       RecursiveCopy::copypath(
+               $backup_path,
+               $data_path,
+               # Skipping tablespace symlinks
+               filterfn => sub {
+                       my $src = shift;
+                       return ($src !~ m!^pg_tblspc/[0-9]+$!);
+               });
+
+       # Copy tablespaces if any
+       my $tsps = $root_node->backup_tablespaces($backup_name);
+
+       if ($tsps)
+       {
+               my $tsp_src = 
$root_node->backup_tablespace_storage_path($backup_name);
+               my $tsp_dst = $self->tablespace_storage();
+               my $linksrc_root = $data_path . '/pg_tblspc';
+
+               # copypath() refuses to copy into an existing directory.
+               # Copy individual directories in the storage.
+               foreach my $oid (keys %{$tsps})
+               {
+                       my $tsp = ${$tsps}{$oid};
+                       my $tspsrc = "$tsp_src/$tsp";
+                       my $tspdst = "$tsp_dst/$tsp";
+                       RecursiveCopy::copypath($tspsrc, $tspdst);
+
+                       # Create tablespace symlink for this tablespace
+                       my $linkdst = "$linksrc_root/$oid";
+                       dir_symlink($tspdst, $linkdst);
+               }
+       }
+
        chmod(0700, $data_path);
 
        # Base configuration for this node
diff --git a/src/test/recovery/t/011_crash_recovery.pl 
b/src/test/recovery/t/011_crash_recovery.pl
index 5dc52412ca..30aaf763e5 100644
--- a/src/test/recovery/t/011_crash_recovery.pl
+++ b/src/test/recovery/t/011_crash_recovery.pl
@@ -15,7 +15,7 @@ if ($Config{osname} eq 'MSWin32')
 }
 else
 {
-       plan tests => 3;
+       plan tests => 5;
 }
 
 my $node = get_new_node('master');
@@ -66,3 +66,109 @@ is($node->safe_psql('postgres', qq[SELECT 
txid_status('$xid');]),
        'aborted', 'xid is aborted after crash');
 
 $tx->kill_kill;
+
+my $node_primary = get_new_node('primary2');
+$node_primary->init(allows_streaming => 1);
+$node_primary->start;
+my $dropme_ts_primary1 = $node_primary->new_tablespace('dropme_ts1');
+my $dropme_ts_primary2 = $node_primary->new_tablespace('dropme_ts2');
+my $soruce_ts_primary = $node_primary->new_tablespace('source_ts');
+my $target_ts_primary = $node_primary->new_tablespace('target_ts');
+
+$node_primary->psql('postgres',
+qq[
+       CREATE TABLESPACE dropme_ts1 LOCATION '$dropme_ts_primary1';
+       CREATE TABLESPACE dropme_ts2 LOCATION '$dropme_ts_primary2';
+       CREATE TABLESPACE source_ts  LOCATION '$soruce_ts_primary';
+       CREATE TABLESPACE target_ts  LOCATION '$target_ts_primary';
+    CREATE DATABASE template_db IS_TEMPLATE = true;
+]);
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+my $node_standby = get_new_node('standby2');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 
1);
+$node_standby->start;
+
+# Make sure connection is made
+$node_primary->poll_query_until(
+       'postgres', 'SELECT count(*) = 1 FROM pg_stat_replication');
+
+$node_standby->safe_psql('postgres', 'CHECKPOINT');
+
+# Do immediate shutdown just after a sequence of CREATE DATABASE / DROP
+# DATABASE / DROP TABLESPACE. This causes CREATE DATABASE WAL records
+# to be applied to already-removed directories.
+$node_primary->safe_psql('postgres',
+                                               q[CREATE DATABASE dropme_db1 
WITH TABLESPACE dropme_ts1;
+                                                 CREATE DATABASE dropme_db2 
WITH TABLESPACE dropme_ts2;
+                                                 CREATE DATABASE moveme_db 
TABLESPACE source_ts;
+                                                 ALTER DATABASE moveme_db SET 
TABLESPACE target_ts;
+                                                 CREATE DATABASE newdb 
TEMPLATE template_db;
+                                                 ALTER DATABASE template_db 
IS_TEMPLATE = false;
+                                                 DROP DATABASE dropme_db1;
+                                                 DROP DATABASE dropme_db2; 
DROP TABLESPACE dropme_ts2;
+                                                 DROP TABLESPACE source_ts;
+                                                 DROP DATABASE template_db;]);
+
+$node_primary->wait_for_catchup($node_standby, 'replay',
+                                                          
$node_primary->lsn('replay'));
+$node_standby->stop('immediate');
+
+# Should restart ignoring directory creation error.
+is($node_standby->start(fail_ok => 1), 1);
+
+
+# TEST 5
+#
+# Ensure that a missing tablespace directory during create database
+# replay immediately causes panic if the standby has already reached
+# consistent state (archive recovery is in progress).
+
+$node_primary = get_new_node('primary3');
+$node_primary->init(allows_streaming => 1);
+$node_primary->start;
+
+# Create tablespace
+my $ts_primary = $node_primary->new_tablespace('dropme_ts1');
+$node_primary->safe_psql('postgres',
+                                                "CREATE TABLESPACE ts1 
LOCATION '$ts_primary'");
+$node_primary->safe_psql('postgres', "CREATE DATABASE db1 TABLESPACE ts1");
+
+# Take backup
+$backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+$node_standby = get_new_node('standby3');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 
1);
+$node_standby->start;
+
+# Make sure standby reached consistency and starts accepting connections
+$node_standby->poll_query_until('postgres', 'SELECT 1', '1');
+
+# Remove standby tablespace directory so it will be missing when
+# replay resumes.
+File::Path::rmtree($node_standby->tablespace_dir('dropme_ts1'));
+
+# Create a database in the tablespace and a table in default tablespace
+$node_primary->safe_psql('postgres',
+                                               q[CREATE TABLE 
should_not_replay_insertion(a int);
+                                                 CREATE DATABASE db2 WITH 
TABLESPACE ts1;
+                                                 INSERT INTO 
should_not_replay_insertion VALUES (1);]);
+
+# Standby should fail and should not silently skip replaying the wal
+if ($node_primary->poll_query_until(
+               'postgres',
+               'SELECT count(*) = 0 FROM pg_stat_replication',
+               't') == 1)
+{
+       pass('standby failed as expected');
+       # We know that the standby has failed.  Setting its pid to
+       # undefined avoids error when PostgresNode module tries to stop the
+       # standby node as part of tear_down sequence.
+       $node_standby->{_pid} = undef;
+}
+else
+{
+       fail('standby did not fail within 5 seconds');
+}
+
-- 
2.27.0

>From bfd70d2ab7aaf5b5791c46d78e6bf087041abb0f Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota....@gmail.com>
Date: Mon, 28 Mar 2022 16:29:33 +0900
Subject: [PATCH 2/2] Fix replay of create database records on standby

Crash recovery on standby may encounter missing directories when
replaying create database WAL records.  Prior to this patch, the standby
would fail to recover in such a case.  However, the directories could be
legitimately missing.  Consider a sequence of WAL records as follows:

    CREATE DATABASE
    DROP DATABASE
    DROP TABLESPACE

If, after replaying the last WAL record and removing the tablespace
directory, the standby crashes and has to replay the create database
record again, the crash recovery must be able to move on.

This patch adds a mechanism similar to invalid-page tracking, to keep a
tally of missing directories during crash recovery.  If all the missing
directory references are matched with corresponding drop records at the
end of crash recovery, the standby can safely continue following the
primary.

Backpatch to 10 through 12. This fix has already been committed to 13
and later.

A new TAP test file is added to verify the condition.

Diagnosed-by: Paul Guo <paul...@gmail.com>
Author: Paul Guo <paul...@gmail.com>
Author: Kyotaro Horiguchi <horikyota....@gmail.com>
Author: Asim R Praveen <aprav...@pivotal.io>
Discussion: 
https://postgr.es/m/CAEET0ZGx9AvioViLf7nbR_8tH9-=27dn5xwj2p9-roh16e4...@mail.gmail.com
---
 src/backend/access/transam/xlog.c      |   6 +
 src/backend/access/transam/xlogutils.c | 159 ++++++++++++++++++++++++-
 src/backend/commands/dbcommands.c      |  55 +++++++++
 src/backend/commands/tablespace.c      |  17 +++
 src/include/access/xlogutils.h         |   4 +
 src/tools/pgindent/typedefs.list       |   2 +
 6 files changed, 242 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 7141e5dca8..3d3342b714 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8006,6 +8006,12 @@ CheckRecoveryConsistency(void)
                 */
                XLogCheckInvalidPages();
 
+               /*
+                * Check if the XLOG sequence contained any unresolved 
references to
+                * missing directories.
+                */
+               XLogCheckMissingDirs();
+
                reachedConsistency = true;
                ereport(LOG,
                                (errmsg("consistent recovery state reached at 
%X/%X",
diff --git a/src/backend/access/transam/xlogutils.c 
b/src/backend/access/transam/xlogutils.c
index 10a663bae6..11c40b7446 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -31,6 +31,164 @@
 #include "utils/rel.h"
 
 
+
+/*
+ * If a create database WAL record is being replayed more than once during
+ * crash recovery on a standby, it is possible that either the tablespace
+ * directory or the template database directory is missing.  This happens when
+ * the directories are removed by replay of subsequent drop records.  Note
+ * that this problem happens only on standby and not on master.  On master, a
+ * checkpoint is created at the end of create database operation. On standby,
+ * however, such a strategy (creating restart points during replay) is not
+ * viable because it will slow down WAL replay.
+ *
+ * The alternative is to track references to each missing directory
+ * encountered when performing crash recovery in the following hash table.
+ * Similar to invalid page table below, the expectation is that each missing
+ * directory entry should be matched with a drop database or drop tablespace
+ * WAL record by the end of crash recovery.
+ */
+typedef struct xl_missing_dir_key
+{
+       Oid                     spcNode;
+       Oid                     dbNode;
+} xl_missing_dir_key;
+
+typedef struct xl_missing_dir
+{
+       xl_missing_dir_key key;
+       char            path[MAXPGPATH];
+} xl_missing_dir;
+
+static HTAB *missing_dir_tab = NULL;
+
+
+/*
+ * Keep track of a directory that wasn't found while replaying database
+ * creation records.  These should match up with tablespace removal records
+ * later in the WAL stream; we verify that before reaching consistency.
+ */
+void
+XLogRememberMissingDir(Oid spcNode, Oid dbNode, char *path)
+{
+       xl_missing_dir_key key;
+       bool            found;
+       xl_missing_dir *entry;
+
+       /*
+        * Database OID may be invalid but tablespace OID must be valid.  If
+        * dbNode is InvalidOid, we are logging a missing tablespace directory,
+        * otherwise we are logging a missing database directory.
+        */
+       Assert(OidIsValid(spcNode));
+
+       if (missing_dir_tab == NULL)
+       {
+               /* create hash table when first needed */
+               HASHCTL         ctl;
+
+               memset(&ctl, 0, sizeof(ctl));
+               ctl.keysize = sizeof(xl_missing_dir_key);
+               ctl.entrysize = sizeof(xl_missing_dir);
+
+               missing_dir_tab = hash_create("XLOG missing directory table",
+                                                                         100,
+                                                                         &ctl,
+                                                                         
HASH_ELEM | HASH_BLOBS);
+       }
+
+       key.spcNode = spcNode;
+       key.dbNode = dbNode;
+
+       entry = hash_search(missing_dir_tab, &key, HASH_ENTER, &found);
+
+       if (found)
+       {
+               if (dbNode == InvalidOid)
+                       elog(DEBUG1, "missing directory %s (tablespace %u) 
already exists: %s",
+                                path, spcNode, entry->path);
+               else
+                       elog(DEBUG1, "missing directory %s (tablespace %u 
database %u) already exists: %s",
+                                path, spcNode, dbNode, entry->path);
+       }
+       else
+       {
+               strlcpy(entry->path, path, sizeof(entry->path));
+               if (dbNode == InvalidOid)
+                       elog(DEBUG1, "logged missing dir %s (tablespace %u)",
+                                path, spcNode);
+               else
+                       elog(DEBUG1, "logged missing dir %s (tablespace %u 
database %u)",
+                                path, spcNode, dbNode);
+       }
+}
+
+/*
+ * Remove an entry from the list of directories not found.  This is to be done
+ * when the matching tablespace removal WAL record is found.
+ */
+void
+XLogForgetMissingDir(Oid spcNode, Oid dbNode)
+{
+       xl_missing_dir_key key;
+
+       key.spcNode = spcNode;
+       key.dbNode = dbNode;
+
+       /* Database OID may be invalid but tablespace OID must be valid. */
+       Assert(OidIsValid(spcNode));
+
+       if (missing_dir_tab == NULL)
+               return;
+
+       if (hash_search(missing_dir_tab, &key, HASH_REMOVE, NULL) != NULL)
+       {
+               if (dbNode == InvalidOid)
+               {
+                       elog(DEBUG2, "forgot missing dir (tablespace %u)", 
spcNode);
+               }
+               else
+               {
+                       char       *path = GetDatabasePath(dbNode, spcNode);
+
+                       elog(DEBUG2, "forgot missing dir %s (tablespace %u 
database %u)",
+                                path, spcNode, dbNode);
+                       pfree(path);
+               }
+       }
+}
+
+/*
+ * This is called at the end of crash recovery, before entering archive
+ * recovery on a standby.  PANIC if the hash table is not empty.
+ */
+void
+XLogCheckMissingDirs(void)
+{
+       HASH_SEQ_STATUS status;
+       xl_missing_dir *hentry;
+       bool            foundone = false;
+
+       if (missing_dir_tab == NULL)
+               return;                                 /* nothing to do */
+
+       hash_seq_init(&status, missing_dir_tab);
+
+       while ((hentry = (xl_missing_dir *) hash_seq_search(&status)) != NULL)
+       {
+               elog(WARNING, "missing directory \"%s\" tablespace %u database 
%u",
+                        hentry->path, hentry->key.spcNode, hentry->key.dbNode);
+               foundone = true;
+       }
+
+       if (foundone)
+               elog(PANIC, "WAL contains references to missing directories");
+
+       hash_destroy(missing_dir_tab);
+       missing_dir_tab = NULL;
+}
+
+
 /*
  * During XLOG replay, we may see XLOG records for incremental updates of
  * pages that no longer exist, because their relation was later dropped or
@@ -56,7 +214,6 @@ typedef struct xl_invalid_page
 
 static HTAB *invalid_page_tab = NULL;
 
-
 /* Report a reference to an invalid page */
 static void
 report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno,
diff --git a/src/backend/commands/dbcommands.c 
b/src/backend/commands/dbcommands.c
index 863f89f19d..44512a8a30 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -2108,7 +2108,9 @@ dbase_redo(XLogReaderState *record)
                xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) 
XLogRecGetData(record);
                char       *src_path;
                char       *dst_path;
+               char       *parent_path;
                struct stat st;
+               bool            skip = false;
 
                src_path = GetDatabasePath(xlrec->src_db_id, 
xlrec->src_tablespace_id);
                dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
@@ -2126,6 +2128,56 @@ dbase_redo(XLogReaderState *record)
                                                (errmsg("some useless files may 
be left behind in old database directory \"%s\"",
                                                                dst_path)));
                }
+               else if (!reachedConsistency)
+               {
+                       /*
+                        * It is possible that a drop tablespace record 
appearing later in
+                        * WAL has already been replayed -- in other words, 
that we are
+                        * replaying the database creation record a second time 
with no
+                        * intervening checkpoint.  In that case, the 
tablespace directory
+                        * has already been removed and the create database 
operation
+                        * cannot be replayed.  Skip the replay itself, but 
remember the
+                        * fact that the tablespace directory is missing, to be 
matched
+                        * with the expected tablespace drop record later.
+                        */
+                       parent_path = pstrdup(dst_path);
+                       get_parent_directory(parent_path);
+                       if (!(stat(parent_path, &st) == 0 && 
S_ISDIR(st.st_mode)))
+                       {
+                               XLogRememberMissingDir(xlrec->tablespace_id, 
InvalidOid, parent_path);
+                               skip = true;
+                               ereport(WARNING,
+                                               (errmsg("skipping replay of 
database creation WAL record"),
+                                                errdetail("The target 
tablespace \"%s\" directory was not found.",
+                                                                  parent_path),
+                                                errhint("A future WAL record 
that removes the directory before reaching consistent mode is expected.")));
+                       }
+                       pfree(parent_path);
+               }
+
+               /*
+                * If the source directory is missing, skip the copy and make a 
note of
+                * it for later.
+                *
+                * One possible reason for this is that the template database 
used for
+                * creating this database may have been dropped, as noted above.
+                * Moving a database from one tablespace may also be a partner 
in the
+                * crime.
+                */
+               if (!(stat(src_path, &st) == 0 && S_ISDIR(st.st_mode)) &&
+                       !reachedConsistency)
+               {
+                       XLogRememberMissingDir(xlrec->src_tablespace_id, 
xlrec->src_db_id, src_path);
+                       skip = true;
+                       ereport(WARNING,
+                                       (errmsg("skipping replay of database 
creation WAL record"),
+                                        errdetail("The source database 
directory \"%s\" was not found.",
+                                                          src_path),
+                                        errhint("A future WAL record that 
removes the directory before reaching consistent mode is expected.")));
+               }
+
+               if (skip)
+                       return;
 
                /*
                 * Force dirty buffers out to disk, to ensure source database is
@@ -2181,6 +2233,9 @@ dbase_redo(XLogReaderState *record)
                                        (errmsg("some useless files may be left 
behind in old database directory \"%s\"",
                                                        dst_path)));
 
+               if (!reachedConsistency)
+                       XLogForgetMissingDir(xlrec->tablespace_id, 
xlrec->db_id);
+
                if (InHotStandby)
                {
                        /*
diff --git a/src/backend/commands/tablespace.c 
b/src/backend/commands/tablespace.c
index f060c24599..5b600a98ff 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -58,6 +58,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/dependency.h"
 #include "catalog/indexing.h"
@@ -1530,6 +1531,22 @@ tblspc_redo(XLogReaderState *record)
        {
                xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) 
XLogRecGetData(record);
 
+               if (!reachedConsistency)
+                       XLogForgetMissingDir(xlrec->ts_id, InvalidOid);
+
+               /*
+                * Before we remove the tablespace directory, update minimum 
recovery
+                * point to cover this WAL record. Once the tablespace is 
removed,
+                * there's no going back.  This manually enforces the WAL-first 
rule.
+                * Doing this before the removal means that if the removal 
fails for
+                * some reason, the directory is left alone and needs to be 
manually
+                * removed.  Alternatively we could update the minimum recovery 
point
+                * after removal, but that would leave a small window where the
+                * WAL-first rule could be violated.
+                */
+               if (!reachedConsistency)
+                       XLogFlush(record->EndRecPtr);
+
                /*
                 * If we issued a WAL record for a drop tablespace it implies 
that
                 * there were no files in it at all when the DROP was done. 
That means
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 4105b59904..a17c204638 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -23,6 +23,10 @@ extern void XLogDropDatabase(Oid dbid);
 extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
                                                                 BlockNumber 
nblocks);
 
+extern void XLogRememberMissingDir(Oid spcNode, Oid dbNode, char *path);
+extern void XLogForgetMissingDir(Oid spcNode, Oid dbNode);
+extern void XLogCheckMissingDirs(void);
+
 /* Result codes for XLogReadBufferForRedo[Extended] */
 typedef enum
 {
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index daebb77387..bdf6b25d59 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3404,6 +3404,8 @@ xl_invalid_page
 xl_invalid_page_key
 xl_invalidations
 xl_logical_message
+xl_missing_dir_key
+xl_missing_dir
 xl_multi_insert_tuple
 xl_multixact_create
 xl_multixact_truncate
-- 
2.27.0

Reply via email to