On Tue, Feb 2, 2016 at 4:20 PM, Michael Paquier wrote:
> Not wrong, and this leads to the following:
> void rename_safe(const char *old, const char *new, bool isdir, int elevel);
> Controlling elevel is necessary per the multiple code paths that would
> use it. Some use ERROR, most of them FATAL, and a bit of WARNING. Does
> that look fine?

After really coding it, I finished with the following thing:
+int
+rename_safe(const char *old, const char *new)

There is no need to extend that for directories, well we could of
course but all the renames happen on files so I see no need to make
that more complicated. More refactoring of the other rename() calls
could be done as well by extending rename_safe() with a flag to fsync
the data within a critical section, particularly for the replication
slot code. I have let that out to not complicate more the patch.
-- 
Michael
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index f6da673..11287aa 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -418,9 +418,10 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	TLHistoryFilePath(path, newTLI);
 
 	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing file.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
+	 * Prefer link() to rename_safe() here just to be really sure that we
+	 * don't overwrite an existing file.  However, there shouldn't be one,
+	 * so rename_safe() is an acceptable substitute except for the truly
+	 * paranoid.
 	 */
 #if HAVE_WORKING_LINK
 	if (link(tmppath, path) < 0)
@@ -430,7 +431,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 						tmppath, path)));
 	unlink(tmppath);
 #else
-	if (rename(tmppath, path) < 0)
+	if (rename_safe(tmppath, path) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -508,9 +509,10 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
 	TLHistoryFilePath(path, tli);
 
 	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing logfile.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
+	 * Prefer link() to rename_safe() here just to be really sure that we
+	 * don't overwrite an existing logfile.  However, there shouldn't be
+	 * one, so rename_safe() is an acceptable substitute except for the
+	 * truly paranoid.
 	 */
 #if HAVE_WORKING_LINK
 	if (link(tmppath, path) < 0)
@@ -520,7 +522,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
 						tmppath, path)));
 	unlink(tmppath);
 #else
-	if (rename(tmppath, path) < 0)
+	if (rename_safe(tmppath, path) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a2846c4..b61fcc7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3251,9 +3251,10 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 	}
 
 	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing logfile.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
+	 * Prefer link() to rename_safe() here just to be really sure that we
+	 * don't overwrite an existing logfile.  However, there shouldn't be
+	 * one, so rename_safe() is an acceptable substitute except for the
+	 * truly paranoid.
 	 */
 #if HAVE_WORKING_LINK
 	if (link(tmppath, path) < 0)
@@ -3268,7 +3269,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 	}
 	unlink(tmppath);
 #else
-	if (rename(tmppath, path) < 0)
+	if (rename_safe(tmppath, path) < 0)
 	{
 		if (use_lock)
 			LWLockRelease(ControlFileLock);
@@ -3792,7 +3793,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
 		 * flag, rename will fail. We'll try again at the next checkpoint.
 		 */
 		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
-		if (rename(path, newpath) != 0)
+		if (rename_safe(path, newpath) != 0)
 		{
 			ereport(LOG,
 					(errcode_for_file_access(),
@@ -3800,10 +3801,12 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
 					  path)));
 			return;
 		}
+
 		rc = unlink(newpath);
 #else
 		rc = unlink(path);
 #endif
+
 		if (rc != 0)
 		{
 			ereport(LOG,
@@ -5291,7 +5294,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
 	 * re-enter archive recovery mode in a subsequent crash.
 	 */
 	unlink(RECOVERY_COMMAND_DONE);
-	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+	if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
 		ereport(FATAL,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6138,7 +6141,7 @@ StartupXLOG(void)
 		if (stat(TABLESPACE_MAP, &st) == 0)
 		{
 			unlink(TABLESPACE_MAP_OLD);
-			if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+			if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
 				ereport(LOG,
 					(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
 							TABLESPACE_MAP, BACKUP_LABEL_FILE),
@@ -6501,7 +6504,7 @@ StartupXLOG(void)
 		if (haveBackupLabel)
 		{
 			unlink(BACKUP_LABEL_OLD);
-			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+			if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6518,7 +6521,7 @@ StartupXLOG(void)
 		if (haveTblspcMap)
 		{
 			unlink(TABLESPACE_MAP_OLD);
-			if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
+			if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -7299,7 +7302,7 @@ StartupXLOG(void)
 				 */
 				XLogArchiveCleanup(partialfname);
 
-				if (rename(origpath, partialpath) != 0)
+				if (rename_safe(origpath, partialpath) != 0)
 					ereport(ERROR,
 							(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -10863,7 +10866,7 @@ CancelBackup(void)
 	/* remove leftover file from previously canceled backup if it exists */
 	unlink(BACKUP_LABEL_OLD);
 
-	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+	if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
 	{
 		ereport(WARNING,
 				(errcode_for_file_access(),
@@ -10886,7 +10889,7 @@ CancelBackup(void)
 	/* remove leftover file from previously canceled backup if it exists */
 	unlink(TABLESPACE_MAP_OLD);
 
-	if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+	if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
 	{
 		ereport(LOG,
 				(errmsg("online backup mode canceled"),
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 277c14a..13ab7fa 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
 		 */
 		snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
 				 xlogfpath, deletedcounter++);
-		if (rename(xlogfpath, oldpath) != 0)
+		if (rename_safe(xlogfpath, oldpath) != 0)
 		{
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
 		reload = true;
 	}
 
-	if (rename(path, xlogfpath) < 0)
+	if (rename_safe(path, xlogfpath) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -580,12 +580,11 @@ XLogArchiveForceDone(const char *xlog)
 	StatusFilePath(archiveReady, xlog, ".ready");
 	if (stat(archiveReady, &stat_buf) == 0)
 	{
-		if (rename(archiveReady, archiveDone) < 0)
+		if (rename_safe(archiveReady, archiveDone) < 0)
 			ereport(WARNING,
 					(errcode_for_file_access(),
 					 errmsg("could not rename file \"%s\" to \"%s\": %m",
 							archiveReady, archiveDone)));
-
 		return;
 	}
 
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 397f802..db54889 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog)
 
 	StatusFilePath(rlogready, xlog, ".ready");
 	StatusFilePath(rlogdone, xlog, ".done");
-	if (rename(rlogready, rlogdone) < 0)
+	if (rename_safe(rlogready, rlogdone) < 0)
 		ereport(WARNING,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c
index 757b50e..6327972 100644
--- a/src/backend/replication/logical/origin.c
+++ b/src/backend/replication/logical/origin.c
@@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void)
 	CloseTransientFile(tmpfd);
 
 	/* rename to permanent file, fsync file and directory */
-	if (rename(tmppath, path) != 0)
+	if (rename_safe(tmppath, path) != 0)
 	{
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						tmppath, path)));
 	}
-
-	fsync_fname((char *) path, false);
-	fsync_fname("pg_logical", true);
 }
 
 /*
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index ed823ec..1f17f42 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1488,8 +1488,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
 		 * That ought to be cheap because in most scenarios it should already
 		 * be safely on disk.
 		 */
-		fsync_fname(path, false);
-		fsync_fname("pg_logical/snapshots", true);
+		fsync_fname(path, false, false);
+		fsync_fname("pg_logical/snapshots", true, false);
 
 		builder->last_serialized_snapshot = lsn;
 		goto out;
@@ -1593,7 +1593,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
 	}
 	CloseTransientFile(fd);
 
-	fsync_fname("pg_logical/snapshots", true);
+	fsync_fname("pg_logical/snapshots", true, false);
 
 	/*
 	 * We may overwrite the work from some other backend, but that's ok, our
@@ -1608,8 +1608,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
 	}
 
 	/* make sure we persist */
-	fsync_fname(path, false);
-	fsync_fname("pg_logical/snapshots", true);
+	fsync_fname(path, false, false);
+	fsync_fname("pg_logical/snapshots", true, false);
 
 	/*
 	 * Now there's no way we can loose the dumped state anymore, remember this
@@ -1660,8 +1660,8 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
 	 * either...
 	 * ----
 	 */
-	fsync_fname(path, false);
-	fsync_fname("pg_logical/snapshots", true);
+	fsync_fname(path, false, false);
+	fsync_fname("pg_logical/snapshots", true, false);
 
 
 	/* read statically sized portion of snapshot */
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 251b549..65d01e8 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -453,8 +453,8 @@ ReplicationSlotDropAcquired(void)
 		 * restart.
 		 */
 		START_CRIT_SECTION();
-		fsync_fname(tmppath, true);
-		fsync_fname("pg_replslot", true);
+		fsync_fname(tmppath, true, false);
+		fsync_fname("pg_replslot", true, false);
 		END_CRIT_SECTION();
 	}
 	else
@@ -912,7 +912,7 @@ StartupReplicationSlots(void)
 						 errmsg("could not remove directory \"%s\"", path)));
 				continue;
 			}
-			fsync_fname("pg_replslot", true);
+			fsync_fname("pg_replslot", true, false);
 			continue;
 		}
 
@@ -968,7 +968,7 @@ CreateSlotOnDisk(ReplicationSlot *slot)
 				(errcode_for_file_access(),
 				 errmsg("could not create directory \"%s\": %m",
 						tmppath)));
-	fsync_fname(tmppath, true);
+	fsync_fname(tmppath, true, false);
 
 	/* Write the actual state file. */
 	slot->dirty = true;			/* signal that we really need to write */
@@ -988,8 +988,8 @@ CreateSlotOnDisk(ReplicationSlot *slot)
 	 */
 	START_CRIT_SECTION();
 
-	fsync_fname(path, true);
-	fsync_fname("pg_replslot", true);
+	fsync_fname(path, true, false);
+	fsync_fname("pg_replslot", true, false);
 
 	END_CRIT_SECTION();
 }
@@ -1094,9 +1094,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 	/* Check CreateSlot() for the reasoning of using a crit. section. */
 	START_CRIT_SECTION();
 
-	fsync_fname(path, false);
-	fsync_fname((char *) dir, true);
-	fsync_fname("pg_replslot", true);
+	fsync_fname(path, false, false);
+	fsync_fname((char *) dir, true, false);
+	fsync_fname("pg_replslot", true, false);
 
 	END_CRIT_SECTION();
 
@@ -1165,7 +1165,7 @@ RestoreSlotFromDisk(const char *name)
 
 	/* Also sync the parent directory */
 	START_CRIT_SECTION();
-	fsync_fname(path, true);
+	fsync_fname(path, true, false);
 	END_CRIT_SECTION();
 
 	/* read part of statefile that's guaranteed to be version independent */
@@ -1248,7 +1248,7 @@ RestoreSlotFromDisk(const char *name)
 					(errcode_for_file_access(),
 					 errmsg("could not remove directory \"%s\"", path)));
 		}
-		fsync_fname("pg_replslot", true);
+		fsync_fname("pg_replslot", true, false);
 		return;
 	}
 
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index 522f420..20f49ab 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -115,7 +115,7 @@ copydir(char *fromdir, char *todir, bool recurse)
 					 errmsg("could not stat file \"%s\": %m", tofile)));
 
 		if (S_ISREG(fst.st_mode))
-			fsync_fname(tofile, false);
+			fsync_fname(tofile, false, false);
 	}
 	FreeDir(xldir);
 
@@ -125,7 +125,7 @@ copydir(char *fromdir, char *todir, bool recurse)
 	 * synced. Recent versions of ext4 have made the window much wider but
 	 * it's been true for ext3 and other filesystems in the past.
 	 */
-	fsync_fname(todir, true);
+	fsync_fname(todir, true, false);
 }
 
 /*
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1b30100..c0825e2 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -410,10 +410,11 @@ pg_flush_data(int fd, off_t offset, off_t amount)
  * fsync_fname -- fsync a file or directory, handling errors properly
  *
  * Try to fsync a file or directory. When doing the latter, ignore errors that
- * indicate the OS just doesn't allow/require fsyncing directories.
+ * indicate the OS just doesn't allow/require fsyncing directories. Optionally
+ * one can skip error regarding non-existing entries attempted to be fsync'ed.
  */
 void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(char *fname, bool isdir, bool missing_ok)
 {
 	int			fd;
 	int			returncode;
@@ -440,9 +441,14 @@ fsync_fname(char *fname, bool isdir)
 		return;
 
 	else if (fd < 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not open file \"%s\": %m", fname)));
+	{
+		if (missing_ok)
+			return;
+		else
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\": %m", fname)));
+	}
 
 	returncode = pg_fsync(fd);
 
@@ -461,6 +467,52 @@ fsync_fname(char *fname, bool isdir)
 	CloseTransientFile(fd);
 }
 
+/*
+ * rename_safe -- rename of a file, making it on-disk persistent
+ *
+ * This routine ensures that a rename file persists in case of a crash by using
+ * fsync on the old and new files before and after performing the rename so as
+ * this categorizes as an all-or-nothing operation.
+ */
+int
+rename_safe(const char *old, const char *new)
+{
+	char	*parentpath;
+
+	/*
+	 * First fsync the old and new entries to ensure that they are properly
+	 * persistent on disk.
+	 */
+	fsync_fname(old, false, false);
+	fsync_fname(new, false, true);
+
+	/* Time to do the real deal... */
+	if (rename(old, new) != 0)
+		return -1;
+
+	/*
+	 * Make change persistent in case of an OS crash, both the new entry and
+	 * its parent directory need to be flushed.
+	 */
+	fsync_fname(new, false, false);
+
+	/* Same for parent directory */
+	parentpath = pstrdup(new);
+	get_parent_directory(parentpath);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument
+	 * is just a file name (see comments in path.c), so handle that as being
+	 * the current directory.
+	 */
+	if (strlen(parentpath) == 0)
+		fsync_fname(".", true, false);
+	else
+		fsync_fname(parentpath, true, false);
+	pfree(parentpath);
+	return 0;
+}
+
 
 /*
  * InitFileAccess --- initialize this module during backend startup
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 4ad85380..a3a0a77 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -380,12 +380,12 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
 					 strlen(forkNames[INIT_FORKNUM]));
 
-			fsync_fname(mainpath, false);
+			fsync_fname(mainpath, false, false);
 		}
 
 		FreeDir(dbspace_dir);
 
-		fsync_fname((char *) dbspacedirname, true);
+		fsync_fname((char *) dbspacedirname, true, false);
 	}
 }
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 4a3fccb..24f04f9 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -113,7 +113,8 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
-extern void fsync_fname(char *fname, bool isdir);
+extern void fsync_fname(char *fname, bool isdir, bool missing_ok);
+extern int	rename_safe(const char *old, const char *new);
 extern void SyncDataDirectory(void);
 
 /* Filename components for OpenTemporaryFile */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to