I pushed the retry-loop-in-frontend-executables patch and the
missing-locking-in-SQL-functions patch yesterday.  That leaves the
backup ones, which I've rebased and attached, no change.  It sounds
like we need some more healthy debate about that backup label idea
that would mean we don't need these (two birds with one stone), so
I'll just leave these here to keep the cfbot happy in the meantime.
From 3ab478ba3bb1712e6fc529f4bdaf48b58a6d72c5 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Wed, 26 Jul 2023 08:46:19 +1200
Subject: [PATCH v7 1/2] Update control file atomically during backups.

If there is a backup in progress, switch to a rename-based approach when
writing out the control file to avoid problems with torn reads that can
occur on some systems (at least ext4, ntfs).  We don't want to make
UpdateControlFile() slower in general, because it's called frequently
during recovery, so we only do it while runningBackups > 0.  In order to
activate the new behavior without races, we now also acquire
ControlFileLock when backups begin and end.

Back-patch to all supported releases.

Reviewed-by: Anton A. Melnikov <aamelni...@inbox.ru>
Reviewed-by: David Steele <da...@pgmasters.net>
Discussion: https://postgr.es/m/20221123014224.xisi44byq3cf5psi%40awork3.anarazel.de
---
 src/backend/access/transam/xlog.c      | 33 +++++++++++++++++++--
 src/bin/pg_checksums/pg_checksums.c    |  2 +-
 src/bin/pg_resetwal/pg_resetwal.c      |  2 +-
 src/bin/pg_rewind/pg_rewind.c          |  2 +-
 src/common/controldata_utils.c         | 41 ++++++++++++++++++++++++--
 src/include/common/controldata_utils.h |  4 ++-
 6 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c0e4ca5089..c82e5c6ad4 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -442,7 +442,9 @@ typedef struct XLogCtlInsert
 	/*
 	 * runningBackups is a counter indicating the number of backups currently
 	 * in progress. lastBackupStart is the latest checkpoint redo location
-	 * used as a starting point for an online backup.
+	 * used as a starting point for an online backup.  runningBackups is
+	 * protected by both ControlFileLock and WAL insertion lock (in that
+	 * order), because it affects the behavior of UpdateControlFile().
 	 */
 	int			runningBackups;
 	XLogRecPtr	lastBackupStart;
@@ -4208,7 +4210,13 @@ ReadControlFile(void)
 static void
 UpdateControlFile(void)
 {
-	update_controlfile(DataDir, ControlFile, true);
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	bool		atomic;
+
+	Assert(LWLockHeldByMeInMode(ControlFileLock, LW_EXCLUSIVE));
+	atomic = Insert->runningBackups > 0;
+
+	update_controlfile(DataDir, ControlFile, atomic, true);
 }
 
 /*
@@ -5344,9 +5352,12 @@ StartupXLOG(void)
 		 * pg_control with any minimum recovery stop point obtained from a
 		 * backup history file.
 		 *
-		 * No need to hold ControlFileLock yet, we aren't up far enough.
+		 * ControlFileLock not really needed yet, we aren't up far enough, but
+		 * makes assertions simpler.
 		 */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
 
 		/*
 		 * If there was a backup label file, it's done its job and the info
@@ -8335,6 +8346,12 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
 
 	memcpy(state->name, backupidstr, strlen(backupidstr));
 
+	/*
+	 * UpdateControlFile() behaves differently while backups are running, and
+	 * we need to avoid races when the backups start.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
 	/*
 	 * Mark backup active in shared memory.  We must do full-page WAL writes
 	 * during an on-line backup even if not doing so at other times, because
@@ -8360,6 +8377,8 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
 	XLogCtl->Insert.runningBackups++;
 	WALInsertLockRelease();
 
+	LWLockRelease(ControlFileLock);
+
 	/*
 	 * Ensure we decrement runningBackups if we fail below. NB -- for this to
 	 * work correctly, it is critical that sessionBackupState is only updated
@@ -8650,6 +8669,12 @@ do_pg_backup_stop(BackupState *state, bool waitforarchive)
 				 errmsg("WAL level not sufficient for making an online backup"),
 				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
 
+	/*
+	 * Interlocking with UpdateControlFile(), so that it can start using atomic
+	 * mode while backups are running.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
 	/*
 	 * OK to update backup counter and session-level lock.
 	 *
@@ -8679,6 +8704,8 @@ do_pg_backup_stop(BackupState *state, bool waitforarchive)
 
 	WALInsertLockRelease();
 
+	LWLockRelease(ControlFileLock);
+
 	/*
 	 * If we are taking an online backup from the standby, we confirm that the
 	 * standby has not been promoted during the backup.
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index e009ba5e0b..4dab08b4a0 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -634,7 +634,7 @@ main(int argc, char *argv[])
 		}
 
 		pg_log_info("updating control file");
-		update_controlfile(DataDir, ControlFile, do_sync);
+		update_controlfile(DataDir, ControlFile, false, do_sync);
 
 		if (verbose)
 			printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index 3ae3fc06df..a9142a213d 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -889,7 +889,7 @@ RewriteControlFile(void)
 	ControlFile.max_locks_per_xact = 64;
 
 	/* The control file gets flushed here. */
-	update_controlfile(".", &ControlFile, true);
+	update_controlfile(".", &ControlFile, false, true);
 }
 
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index bfd44a284e..6b0531f4d1 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -723,7 +723,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 	ControlFile_new.minRecoveryPointTLI = endtli;
 	ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
 	if (!dry_run)
-		update_controlfile(datadir_target, &ControlFile_new, do_sync);
+		update_controlfile(datadir_target, &ControlFile_new, false, do_sync);
 }
 
 static void
diff --git a/src/common/controldata_utils.c b/src/common/controldata_utils.c
index 4d1cd1ce98..8f2587661a 100644
--- a/src/common/controldata_utils.c
+++ b/src/common/controldata_utils.c
@@ -170,14 +170,27 @@ retry:
  * optionally used to flush the updated control file.  Note that it is up
  * to the caller to properly lock ControlFileLock when calling this
  * routine in the backend.
+ *
+ * If atomic is true, a slower procedure is used in backend code that renames a
+ * temporary file into place.  This is intended for use while backups are in
+ * progress, to avoid the possibility of a torn read of the control file.  Note
+ * that this refers to atomicity of concurrent reads and writes.  (Atomicity
+ * under power failure is a separate topic; we always assume that writes of
+ * size PG_CONTROL_MAX_SAFE_SIZE are power-failure-atomic, even when 'atomic'
+ * is false here.)
  */
 void
 update_controlfile(const char *DataDir,
-				   ControlFileData *ControlFile, bool do_sync)
+				   ControlFileData *ControlFile,
+				   bool atomic,
+				   bool do_sync)
 {
 	int			fd;
 	char		buffer[PG_CONTROL_FILE_SIZE];
 	char		ControlFilePath[MAXPGPATH];
+#ifndef FRONTEND
+	int			flags = 0;
+#endif
 
 	/* Update timestamp  */
 	ControlFile->time = (pg_time_t) time(NULL);
@@ -197,7 +210,18 @@ update_controlfile(const char *DataDir,
 	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
 	memcpy(buffer, ControlFile, sizeof(ControlFileData));
 
-	snprintf(ControlFilePath, sizeof(ControlFilePath), "%s/%s", DataDir, XLOG_CONTROL_FILE);
+	/* In atomic mode, we'll write into a new temporary file first. */
+	snprintf(ControlFilePath,
+			 sizeof(ControlFilePath),
+			 "%s/%s%s",
+			 DataDir,
+			 XLOG_CONTROL_FILE,
+			 atomic ? ".tmp" : "");
+#ifndef FRONTEND
+	flags = 0;
+	if (atomic)
+		flags = O_CREAT;
+#endif
 
 #ifndef FRONTEND
 
@@ -205,7 +229,7 @@ update_controlfile(const char *DataDir,
 	 * All errors issue a PANIC, so no need to use OpenTransientFile() and to
 	 * worry about file descriptor leaks.
 	 */
-	if ((fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY)) < 0)
+	if ((fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY | flags)) < 0)
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m",
@@ -266,4 +290,15 @@ update_controlfile(const char *DataDir,
 		pg_fatal("could not close file \"%s\": %m", ControlFilePath);
 #endif
 	}
+
+#ifndef FRONTEND
+	if (atomic)
+	{
+		char		path[MAXPGPATH];
+
+		snprintf(path, sizeof(path), "%s/%s", DataDir, XLOG_CONTROL_FILE);
+
+		durable_rename(ControlFilePath, path, PANIC);
+	}
+#endif
 }
diff --git a/src/include/common/controldata_utils.h b/src/include/common/controldata_utils.h
index 49e7c52d31..0d9b41a2ba 100644
--- a/src/include/common/controldata_utils.h
+++ b/src/include/common/controldata_utils.h
@@ -14,6 +14,8 @@
 
 extern ControlFileData *get_controlfile(const char *DataDir, bool *crc_ok_p);
 extern void update_controlfile(const char *DataDir,
-							   ControlFileData *ControlFile, bool do_sync);
+							   ControlFileData *ControlFile,
+							   bool atomic,
+							   bool do_sync);
 
 #endif							/* COMMON_CONTROLDATA_UTILS_H */
-- 
2.42.0

From 800f5c290cc73c5b461d44c66d4d9ebe830bd08f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Tue, 31 Jan 2023 20:01:49 +1300
Subject: [PATCH v7 2/2] Acquire ControlFileLock in base backups.

Even though we now rename() the control file to gain atomicity during
backups, we can avoid some unpleasant retry behaviors and problems
observed with the atomicity of rename() on Windows if we also acquire
ControlFileLock to read the file during base backups.

Back-patch to all supported releases.

Reviewed-by: Anton A. Melnikov <aamelni...@inbox.ru>
Reviewed-by: David Steele <da...@pgmasters.net>
Discussion: https://postgr.es/m/20221123014224.xisi44byq3cf5psi%40awork3.anarazel.de
---
 src/backend/backup/basebackup.c | 36 ++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 7d025bcf38..93a8949de1 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -24,6 +24,7 @@
 #include "backup/basebackup_target.h"
 #include "commands/defrem.h"
 #include "common/compression.h"
+#include "common/controldata_utils.h"
 #include "common/file_perm.h"
 #include "common/file_utils.h"
 #include "lib/stringinfo.h"
@@ -39,6 +40,7 @@
 #include "storage/checksum.h"
 #include "storage/dsm_impl.h"
 #include "storage/ipc.h"
+#include "storage/lwlock.h"
 #include "storage/reinit.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
@@ -93,7 +95,7 @@ static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
 								 BlockNumber blkno,
 								 uint16 *expected_checksum);
 static void sendFileWithContent(bbsink *sink, const char *filename,
-								const char *content,
+								const char *content, int len,
 								backup_manifest_info *manifest);
 static int64 _tarWriteHeader(bbsink *sink, const char *filename,
 							 const char *linktarget, struct stat *statbuf,
@@ -324,7 +326,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 
 			if (ti->path == NULL)
 			{
-				struct stat statbuf;
+				ControlFileData *control_file;
+				bool		crc_ok;
 				bool		sendtblspclinks = true;
 				char	   *backup_label;
 
@@ -333,14 +336,14 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 				/* In the main tar, include the backup_label first... */
 				backup_label = build_backup_content(backup_state, false);
 				sendFileWithContent(sink, BACKUP_LABEL_FILE,
-									backup_label, &manifest);
+									backup_label, -1, &manifest);
 				pfree(backup_label);
 
 				/* Then the tablespace_map file, if required... */
 				if (opt->sendtblspcmapfile)
 				{
 					sendFileWithContent(sink, TABLESPACE_MAP,
-										tablespace_map->data, &manifest);
+										tablespace_map->data, -1, &manifest);
 					sendtblspclinks = false;
 				}
 
@@ -349,13 +352,13 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 						sendtblspclinks, &manifest, NULL);
 
 				/* ... and pg_control after everything else. */
-				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not stat file \"%s\": %m",
-									XLOG_CONTROL_FILE)));
-				sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
-						 false, InvalidOid, &manifest, NULL);
+				LWLockAcquire(ControlFileLock, LW_SHARED);
+				control_file = get_controlfile(DataDir, &crc_ok);
+				LWLockRelease(ControlFileLock);
+				sendFileWithContent(sink, XLOG_CONTROL_FILE,
+									(char *) control_file, sizeof(*control_file),
+									&manifest);
+				pfree(control_file);
 			}
 			else
 			{
@@ -600,7 +603,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 			 * complete segment.
 			 */
 			StatusFilePath(pathbuf, walFileName, ".done");
-			sendFileWithContent(sink, pathbuf, "", &manifest);
+			sendFileWithContent(sink, pathbuf, "", -1, &manifest);
 		}
 
 		/*
@@ -628,7 +631,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 
 			/* unconditionally mark file as archived */
 			StatusFilePath(pathbuf, fname, ".done");
-			sendFileWithContent(sink, pathbuf, "", &manifest);
+			sendFileWithContent(sink, pathbuf, "", -1, &manifest);
 		}
 
 		/* Properly terminate the tar file. */
@@ -1039,18 +1042,19 @@ SendBaseBackup(BaseBackupCmd *cmd)
  */
 static void
 sendFileWithContent(bbsink *sink, const char *filename, const char *content,
+					int len,
 					backup_manifest_info *manifest)
 {
 	struct stat statbuf;
-	int			bytes_done = 0,
-				len;
+	int			bytes_done = 0;
 	pg_checksum_context checksum_ctx;
 
 	if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
 		elog(ERROR, "could not initialize checksum of file \"%s\"",
 			 filename);
 
-	len = strlen(content);
+	if (len < 0)
+		len = strlen(content);
 
 	/*
 	 * Construct a stat struct for the backup_label file we're injecting in
-- 
2.42.0

Reply via email to