On Tue, Feb 2, 2016 at 4:20 PM, Michael Paquier wrote:
> Not wrong, and this leads to the following:
> void rename_safe(const char *old, const char *new, bool isdir, int elevel);
> Controlling elevel is necessary per the multiple code paths that would
> use it. Some use ERROR, most of them FATAL, and a bit of WARNING. Does
> that look fine?
After really coding it, I finished with the following thing:
+int
+rename_safe(const char *old, const char *new)
There is no need to extend that for directories, well we could of
course but all the renames happen on files so I see no need to make
that more complicated. More refactoring of the other rename() calls
could be done as well by extending rename_safe() with a flag to fsync
the data within a critical section, particularly for the replication
slot code. I have let that out to not complicate more the patch.
--
Michael
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index f6da673..11287aa 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -418,9 +418,10 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
TLHistoryFilePath(path, newTLI);
/*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing file. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
+ * Prefer link() to rename_safe() here just to be really sure that we
+ * don't overwrite an existing file. However, there shouldn't be one,
+ * so rename_safe() is an acceptable substitute except for the truly
+ * paranoid.
*/
#if HAVE_WORKING_LINK
if (link(tmppath, path) < 0)
@@ -430,7 +431,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
tmppath, path)));
unlink(tmppath);
#else
- if (rename(tmppath, path) < 0)
+ if (rename_safe(tmppath, path) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -508,9 +509,10 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
TLHistoryFilePath(path, tli);
/*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing logfile. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
+ * Prefer link() to rename_safe() here just to be really sure that we
+ * don't overwrite an existing logfile. However, there shouldn't be
+ * one, so rename_safe() is an acceptable substitute except for the
+ * truly paranoid.
*/
#if HAVE_WORKING_LINK
if (link(tmppath, path) < 0)
@@ -520,7 +522,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
tmppath, path)));
unlink(tmppath);
#else
- if (rename(tmppath, path) < 0)
+ if (rename_safe(tmppath, path) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a2846c4..b61fcc7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3251,9 +3251,10 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
}
/*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing logfile. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
+ * Prefer link() to rename_safe() here just to be really sure that we
+ * don't overwrite an existing logfile. However, there shouldn't be
+ * one, so rename_safe() is an acceptable substitute except for the
+ * truly paranoid.
*/
#if HAVE_WORKING_LINK
if (link(tmppath, path) < 0)
@@ -3268,7 +3269,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
}
unlink(tmppath);
#else
- if (rename(tmppath, path) < 0)
+ if (rename_safe(tmppath, path) < 0)
{
if (use_lock)
LWLockRelease(ControlFileLock);
@@ -3792,7 +3793,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
* flag, rename will fail. We'll try again at the next checkpoint.
*/
snprintf(newpath, MAXPGPATH, "%s.deleted", path);
- if (rename(path, newpath) != 0)
+ if (rename_safe(path, newpath) != 0)
{
ereport(LOG,
(errcode_for_file_access(),
@@ -3800,10 +3801,12 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
path)));
return;
}
+
rc = unlink(newpath);
#else
rc = unlink(path);
#endif
+
if (rc != 0)
{
ereport(LOG,
@@ -5291,7 +5294,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
* re-enter archive recovery mode in a subsequent crash.
*/
unlink(RECOVERY_COMMAND_DONE);
- if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6138,7 +6141,7 @@ StartupXLOG(void)
if (stat(TABLESPACE_MAP, &st) == 0)
{
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
ereport(LOG,
(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
TABLESPACE_MAP, BACKUP_LABEL_FILE),
@@ -6501,7 +6504,7 @@ StartupXLOG(void)
if (haveBackupLabel)
{
unlink(BACKUP_LABEL_OLD);
- if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+ if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6518,7 +6521,7 @@ StartupXLOG(void)
if (haveTblspcMap)
{
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -7299,7 +7302,7 @@ StartupXLOG(void)
*/
XLogArchiveCleanup(partialfname);
- if (rename(origpath, partialpath) != 0)
+ if (rename_safe(origpath, partialpath) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -10863,7 +10866,7 @@ CancelBackup(void)
/* remove leftover file from previously canceled backup if it exists */
unlink(BACKUP_LABEL_OLD);
- if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+ if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
{
ereport(WARNING,
(errcode_for_file_access(),
@@ -10886,7 +10889,7 @@ CancelBackup(void)
/* remove leftover file from previously canceled backup if it exists */
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
{
ereport(LOG,
(errmsg("online backup mode canceled"),
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 277c14a..13ab7fa 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
*/
snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
xlogfpath, deletedcounter++);
- if (rename(xlogfpath, oldpath) != 0)
+ if (rename_safe(xlogfpath, oldpath) != 0)
{
ereport(ERROR,
(errcode_for_file_access(),
@@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
reload = true;
}
- if (rename(path, xlogfpath) < 0)
+ if (rename_safe(path, xlogfpath) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -580,12 +580,11 @@ XLogArchiveForceDone(const char *xlog)
StatusFilePath(archiveReady, xlog, ".ready");
if (stat(archiveReady, &stat_buf) == 0)
{
- if (rename(archiveReady, archiveDone) < 0)
+ if (rename_safe(archiveReady, archiveDone) < 0)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
archiveReady, archiveDone)));
-
return;
}
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 397f802..db54889 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog)
StatusFilePath(rlogready, xlog, ".ready");
StatusFilePath(rlogdone, xlog, ".done");
- if (rename(rlogready, rlogdone) < 0)
+ if (rename_safe(rlogready, rlogdone) < 0)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c
index 757b50e..6327972 100644
--- a/src/backend/replication/logical/origin.c
+++ b/src/backend/replication/logical/origin.c
@@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void)
CloseTransientFile(tmpfd);
/* rename to permanent file, fsync file and directory */
- if (rename(tmppath, path) != 0)
+ if (rename_safe(tmppath, path) != 0)
{
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
tmppath, path)));
}
-
- fsync_fname((char *) path, false);
- fsync_fname("pg_logical", true);
}
/*
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index ed823ec..1f17f42 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1488,8 +1488,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
* That ought to be cheap because in most scenarios it should already
* be safely on disk.
*/
- fsync_fname(path, false);
- fsync_fname("pg_logical/snapshots", true);
+ fsync_fname(path, false, false);
+ fsync_fname("pg_logical/snapshots", true, false);
builder->last_serialized_snapshot = lsn;
goto out;
@@ -1593,7 +1593,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
}
CloseTransientFile(fd);
- fsync_fname("pg_logical/snapshots", true);
+ fsync_fname("pg_logical/snapshots", true, false);
/*
* We may overwrite the work from some other backend, but that's ok, our
@@ -1608,8 +1608,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
}
/* make sure we persist */
- fsync_fname(path, false);
- fsync_fname("pg_logical/snapshots", true);
+ fsync_fname(path, false, false);
+ fsync_fname("pg_logical/snapshots", true, false);
/*
* Now there's no way we can loose the dumped state anymore, remember this
@@ -1660,8 +1660,8 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
* either...
* ----
*/
- fsync_fname(path, false);
- fsync_fname("pg_logical/snapshots", true);
+ fsync_fname(path, false, false);
+ fsync_fname("pg_logical/snapshots", true, false);
/* read statically sized portion of snapshot */
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 251b549..65d01e8 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -453,8 +453,8 @@ ReplicationSlotDropAcquired(void)
* restart.
*/
START_CRIT_SECTION();
- fsync_fname(tmppath, true);
- fsync_fname("pg_replslot", true);
+ fsync_fname(tmppath, true, false);
+ fsync_fname("pg_replslot", true, false);
END_CRIT_SECTION();
}
else
@@ -912,7 +912,7 @@ StartupReplicationSlots(void)
errmsg("could not remove directory \"%s\"", path)));
continue;
}
- fsync_fname("pg_replslot", true);
+ fsync_fname("pg_replslot", true, false);
continue;
}
@@ -968,7 +968,7 @@ CreateSlotOnDisk(ReplicationSlot *slot)
(errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
tmppath)));
- fsync_fname(tmppath, true);
+ fsync_fname(tmppath, true, false);
/* Write the actual state file. */
slot->dirty = true; /* signal that we really need to write */
@@ -988,8 +988,8 @@ CreateSlotOnDisk(ReplicationSlot *slot)
*/
START_CRIT_SECTION();
- fsync_fname(path, true);
- fsync_fname("pg_replslot", true);
+ fsync_fname(path, true, false);
+ fsync_fname("pg_replslot", true, false);
END_CRIT_SECTION();
}
@@ -1094,9 +1094,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
/* Check CreateSlot() for the reasoning of using a crit. section. */
START_CRIT_SECTION();
- fsync_fname(path, false);
- fsync_fname((char *) dir, true);
- fsync_fname("pg_replslot", true);
+ fsync_fname(path, false, false);
+ fsync_fname((char *) dir, true, false);
+ fsync_fname("pg_replslot", true, false);
END_CRIT_SECTION();
@@ -1165,7 +1165,7 @@ RestoreSlotFromDisk(const char *name)
/* Also sync the parent directory */
START_CRIT_SECTION();
- fsync_fname(path, true);
+ fsync_fname(path, true, false);
END_CRIT_SECTION();
/* read part of statefile that's guaranteed to be version independent */
@@ -1248,7 +1248,7 @@ RestoreSlotFromDisk(const char *name)
(errcode_for_file_access(),
errmsg("could not remove directory \"%s\"", path)));
}
- fsync_fname("pg_replslot", true);
+ fsync_fname("pg_replslot", true, false);
return;
}
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index 522f420..20f49ab 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -115,7 +115,7 @@ copydir(char *fromdir, char *todir, bool recurse)
errmsg("could not stat file \"%s\": %m", tofile)));
if (S_ISREG(fst.st_mode))
- fsync_fname(tofile, false);
+ fsync_fname(tofile, false, false);
}
FreeDir(xldir);
@@ -125,7 +125,7 @@ copydir(char *fromdir, char *todir, bool recurse)
* synced. Recent versions of ext4 have made the window much wider but
* it's been true for ext3 and other filesystems in the past.
*/
- fsync_fname(todir, true);
+ fsync_fname(todir, true, false);
}
/*
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1b30100..c0825e2 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -410,10 +410,11 @@ pg_flush_data(int fd, off_t offset, off_t amount)
* fsync_fname -- fsync a file or directory, handling errors properly
*
* Try to fsync a file or directory. When doing the latter, ignore errors that
- * indicate the OS just doesn't allow/require fsyncing directories.
+ * indicate the OS just doesn't allow/require fsyncing directories. Optionally
+ * one can skip error regarding non-existing entries attempted to be fsync'ed.
*/
void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(char *fname, bool isdir, bool missing_ok)
{
int fd;
int returncode;
@@ -440,9 +441,14 @@ fsync_fname(char *fname, bool isdir)
return;
else if (fd < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", fname)));
+ {
+ if (missing_ok)
+ return;
+ else
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fname)));
+ }
returncode = pg_fsync(fd);
@@ -461,6 +467,52 @@ fsync_fname(char *fname, bool isdir)
CloseTransientFile(fd);
}
+/*
+ * rename_safe -- rename of a file, making it on-disk persistent
+ *
+ * This routine ensures that a rename file persists in case of a crash by using
+ * fsync on the old and new files before and after performing the rename so as
+ * this categorizes as an all-or-nothing operation.
+ */
+int
+rename_safe(const char *old, const char *new)
+{
+ char *parentpath;
+
+ /*
+ * First fsync the old and new entries to ensure that they are properly
+ * persistent on disk.
+ */
+ fsync_fname(old, false, false);
+ fsync_fname(new, false, true);
+
+ /* Time to do the real deal... */
+ if (rename(old, new) != 0)
+ return -1;
+
+ /*
+ * Make change persistent in case of an OS crash, both the new entry and
+ * its parent directory need to be flushed.
+ */
+ fsync_fname(new, false, false);
+
+ /* Same for parent directory */
+ parentpath = pstrdup(new);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument
+ * is just a file name (see comments in path.c), so handle that as being
+ * the current directory.
+ */
+ if (strlen(parentpath) == 0)
+ fsync_fname(".", true, false);
+ else
+ fsync_fname(parentpath, true, false);
+ pfree(parentpath);
+ return 0;
+}
+
/*
* InitFileAccess --- initialize this module during backend startup
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 4ad85380..a3a0a77 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -380,12 +380,12 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
strlen(forkNames[INIT_FORKNUM]));
- fsync_fname(mainpath, false);
+ fsync_fname(mainpath, false, false);
}
FreeDir(dbspace_dir);
- fsync_fname((char *) dbspacedirname, true);
+ fsync_fname((char *) dbspacedirname, true, false);
}
}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 4a3fccb..24f04f9 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -113,7 +113,8 @@ extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
-extern void fsync_fname(char *fname, bool isdir);
+extern void fsync_fname(char *fname, bool isdir, bool missing_ok);
+extern int rename_safe(const char *old, const char *new);
extern void SyncDataDirectory(void);
/* Filename components for OpenTemporaryFile */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers