Hello After reading the whole thread a couple of times to make sure I understood the problem correctly, I think the approach in the v10 patch is a reasonable one. I agree that it's better for maintainability to keep a separate hash table. I made some cosmetic adjustments -- didn't find any fault in the code. I also verified that the test is hitting the problematic case.
So here's v11, which I'll try to get out tomorrow. I also assessed back-patching this. There's some conflicts, but nothing serious, back to 14. Unfortunately, 13 lacks requisite infrastructure, so I think it'll have to stay as it is. (12 is dead.) Can you please verify that my explanation in the commit message is sound? Thanks -- Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/ "Cada quien es cada cual y baja las escaleras como quiere" (JMSerrat)
>From 21ac0241c66f8612e233b0512a775b6b6f4dc6d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= <alvhe...@alvh.no-ip.org> Date: Tue, 12 Nov 2024 20:17:15 +0100 Subject: [PATCH v11] Be more picky with WAL segment deletion in pg_rewind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make pg_rewind to be a bit wiser in terms of creating filemap: preserve on the target all WAL segments that contain records between the last common checkpoint and the point of divergence. Previously, it was possible (in unlucky cases) for pg_rewind to remove WAL segments from the rewound demoted master, if they had been marked for archival (.ready files created) but not archived. This is because pg_rewind sees that they aren't present in the new primary. However, they are essential for recovery of the demoted primary to catch up to the new primary. We fix this by keeping a hash table of files in this situation, which pg_rewind can consult so that it knows to preserve them. Co-authored-by: Полина Бунгина (Polina Bungina) <bung...@gmail.com> Co-authored-by: Alexander Kukushkin <cyberd...@gmail.com> Reviewed-by: Kyotaro Horiguchi <horikyota....@gmail.com> Reviewed-by: Atsushi Torikoshi <torikos...@oss.nttdata.com> Discussion: https://postgr.es/m/caatgl4ahzmbrsesaddz7065t+k+bscnadftqp1ncpmsqwa5...@mail.gmail.com --- src/bin/pg_rewind/filemap.c | 83 +++++++++++++++++-- src/bin/pg_rewind/filemap.h | 3 + src/bin/pg_rewind/meson.build | 1 + src/bin/pg_rewind/parsexlog.c | 21 +++++ src/bin/pg_rewind/pg_rewind.c | 3 + src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 62 ++++++++++++++ src/tools/pgindent/typedefs.list | 2 + 7 files changed, 168 insertions(+), 7 deletions(-) create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 00e644d9886..e8af93f5dde 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -38,14 +38,14 @@ * Define a hash table which we can use to store information about the files * appearing in source and target systems. */ -#define SH_PREFIX filehash -#define SH_ELEMENT_TYPE file_entry_t -#define SH_KEY_TYPE const char * -#define SH_KEY path +#define SH_PREFIX filehash +#define SH_ELEMENT_TYPE file_entry_t +#define SH_KEY_TYPE const char * +#define SH_KEY path #define SH_HASH_KEY(tb, key) hash_string(key) #define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) -#define SH_SCOPE static inline -#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc0 #define SH_DECLARE #define SH_DEFINE #include "lib/simplehash.h" @@ -60,7 +60,33 @@ static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum, static file_entry_t *insert_filehash_entry(const char *path); static file_entry_t *lookup_filehash_entry(const char *path); + +/* + * A separate hash table which tracks WAL files that must not be deleted. + */ +typedef struct keepwal_entry +{ + const char *path; + uint32 status; +} keepwal_entry; + +#define SH_PREFIX keepwal +#define SH_ELEMENT_TYPE keepwal_entry +#define SH_KEY_TYPE const char * +#define SH_KEY path +#define SH_HASH_KEY(tb, key) hash_string(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +static keepwal_hash *keepwal = NULL; +static bool keepwal_entry_exists(const char *path); + static int final_filemap_cmp(const void *a, const void *b); + static bool check_file_excluded(const char *path, bool is_source); /* @@ -206,6 +232,41 @@ lookup_filehash_entry(const char *path) return filehash_lookup(filehash, path); } +/* + * Initialize a hash table to store WAL file names that must be kept. + */ +void +keepwal_init(void) +{ + /* + * This hash table is empty in the vast majority of cases, so set an + * initial size of 0. + */ + keepwal = keepwal_create(0, NULL); +} + +/* Prevent deletion of the given file */ +void +keepwal_add_entry(const char *path) +{ + keepwal_entry *entry; + bool found; + + /* Should only be called with keepwal initialized */ + Assert(keepwal != NULL); + + entry = keepwal_insert(keepwal, path, &found); + + if (!found) + entry->path = pg_strdup(path); +} + +static bool +keepwal_entry_exists(const char *path) +{ + return keepwal_lookup(keepwal, path) != NULL; +} + /* * Callback for processing source file list. * @@ -685,7 +746,15 @@ decide_file_action(file_entry_t *entry) } else if (entry->target_exists && !entry->source_exists) { - /* File exists in target, but not source. Remove it. */ + /* + * For files that exist in target but not in source, we check the + * keepwal hash table; any files listed therein must not be removed. + */ + if (keepwal_entry_exists(path)) + { + pg_log_debug("Not removing %s because it is required for recovery", path); + return FILE_ACTION_NONE; + } return FILE_ACTION_REMOVE; } else if (!entry->target_exists && !entry->source_exists) diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h index 007e0f17cf4..5fceaeb64df 100644 --- a/src/bin/pg_rewind/filemap.h +++ b/src/bin/pg_rewind/filemap.h @@ -110,4 +110,7 @@ extern filemap_t *decide_file_actions(void); extern void calculate_totals(filemap_t *filemap); extern void print_filemap(filemap_t *filemap); +extern void keepwal_init(void); +extern void keepwal_add_entry(const char *path); + #endif /* FILEMAP_H */ diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index e0f88bde221..200ebf84eb9 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -43,6 +43,7 @@ tests += { 't/007_standby_source.pl', 't/008_min_recovery_point.pl', 't/009_growing_files.pl', + 't/010_keep_recycled_wals.pl', ], }, } diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 22f7351fdcd..242326c97a7 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -175,6 +175,8 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogReaderState *xlogreader; char *errormsg; XLogPageReadPrivate private; + XLogSegNo current_segno = 0; + TimeLineID current_tli = 0; /* * The given fork pointer points to the end of the last common record, @@ -217,6 +219,25 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, LSN_FORMAT_ARGS(searchptr)); } + /* Detect if a new WAL file has been opened */ + if (xlogreader->seg.ws_tli != current_tli || + xlogreader->seg.ws_segno != current_segno) + { + char xlogfname[MAXFNAMELEN]; + + snprintf(xlogfname, MAXFNAMELEN, XLOGDIR "/"); + + /* update curent values */ + current_tli = xlogreader->seg.ws_tli; + current_segno = xlogreader->seg.ws_segno; + + XLogFileName(xlogfname + sizeof(XLOGDIR), + current_tli, current_segno, WalSegSz); + + /* Track this filename as one to not remove */ + keepwal_add_entry(xlogfname); + } + /* * Check if it is a checkpoint record. This checkpoint record needs to * be the latest checkpoint before WAL forked and not the checkpoint diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 960916a1e86..c4fe4e37040 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -455,6 +455,9 @@ main(int argc, char **argv) exit(0); } + /* Initialize hashtable that tracks WAL files protected from removal */ + keepwal_init(); + findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo, restore_command); pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u", diff --git a/src/bin/pg_rewind/t/010_keep_recycled_wals.pl b/src/bin/pg_rewind/t/010_keep_recycled_wals.pl new file mode 100644 index 00000000000..49b87617ed8 --- /dev/null +++ b/src/bin/pg_rewind/t/010_keep_recycled_wals.pl @@ -0,0 +1,62 @@ +# Copyright (c) 2021-2024, PostgreSQL Global Development Group +# +# Test situation where a target data directory contains +# WAL files that were already recycled by the new primary. +# + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; +use RewindTest; + +RewindTest::setup_cluster(); +$node_primary->enable_archiving(); +RewindTest::start_primary(); + +RewindTest::create_standby(); +$node_standby->enable_restoring($node_primary, 0); +$node_standby->reload(); + +RewindTest::primary_psql("CHECKPOINT"); # last common checkpoint + +# We use "perl -e 'exit(1)'" as an alternative to "false", because the latter +# might not be available on Windows. +my $false = "$^X -e 'exit(1)'"; +$node_primary->append_conf( + 'postgresql.conf', qq( +archive_command = '$false' +)); +$node_primary->reload(); + +# advance WAL on primary; this WAL segment will never make it to the archive +RewindTest::primary_psql("CREATE TABLE t(a int)"); +RewindTest::primary_psql("INSERT INTO t VALUES(0)"); +RewindTest::primary_psql("SELECT pg_switch_wal()"); + +RewindTest::promote_standby; + +# new primary loses diverging WAL segment +RewindTest::standby_psql("INSERT INTO t values(0)"); +RewindTest::standby_psql("SELECT pg_switch_wal()"); + +$node_standby->stop(); +$node_primary->stop(); + +my ($stdout, $stderr) = run_command( + [ + 'pg_rewind', '--debug', + '--source-pgdata', $node_standby->data_dir, + '--target-pgdata', $node_primary->data_dir, + '--no-sync', + ]); + +like( + $stderr, + qr/Not removing pg_wal.* because it is required for recovery/, + "some WAL files were skipped"); + +done_testing(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 100afe40e1d..7d06d69d171 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3594,6 +3594,8 @@ json_manifest_version_callback json_ofield_action json_scalar_action json_struct_action +keepwal_entry +keepwal_hash keyEntryData key_t lclContext -- 2.39.5