On Sat, Mar 5, 2016 at 7:47 AM, Andres Freund <[email protected]> wrote:
> On 2016-03-05 07:43:00 +0900, Michael Paquier wrote:
>> On Sat, Mar 5, 2016 at 7:35 AM, Andres Freund <[email protected]> wrote:
>> > On 2016-03-04 14:51:50 +0900, Michael Paquier wrote:
>> >> On Fri, Mar 4, 2016 at 4:06 AM, Andres Freund <[email protected]> wrote:
>> >> Hm. OK. I don't see any reason why switching to link() even in code
>> >> paths like KeepFileRestoredFromArchive() or pgarch_archiveDone() would
>> >> be a problem thinking about it. Should HAVE_WORKING_LINK be available
>> >> on a platform we can combine it with unlink. Is that in line with what
>> >> you think?
>> >
>> > I wasn't trying to suggest we should replace all rename codepaths with
>> > the link wrapper, just the ones that already have a HAVE_WORKING_LINK
>> > check. The name of the routine I suggested is bad though...
>>
>> So we'd introduce a first routine rename_or_link_safe(), say replace_safe().
>
> Or actually maybe just link_safe(), which falls back to access() &&
> rename() if !HAVE_WORKING_LINK.
>
>> > That's one approach, yes. Combined with the fact that you can't actually
>> > reliably rename across directories, the two could be on different
>> > filesystems after all, that'd be a suitable defense. It just needs to be
>> > properly documented in the function header, not at the bottom.
>>
>> OK. Got it. Or the two could be on the same filesystem.
>
>> Still, link() and rename() do not support doing their stuff on
>> different filesystems (EXDEV).
>
> That's my point ...
OK, I hacked a v7:
- Move the link()/rename() group with HAVE_WORKING_LINK into a single
routine, making the previous link_safe renamed to replace_safe. This
is sharing a lot of things with rename_safe. I am not sure it is worth
complicating the code more this way by having a common single routine
for whole. Thoughts welcome. Honestly, I kind of liked the separation
with link_safe/rename_safe of previous patches because link_safe could
have been directly used by extensions and plugins btw.
- Remove the call of stat() in rename_safe() and implement a logic
depending on OpenTransientFile()/pg_fsync() to flush any existing
target file before performing the rename.
Andres, feel free to use this patch as a base, perhaps that will help.
--
Michael
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index f6da673..80ec293 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -417,25 +417,12 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
*/
TLHistoryFilePath(path, newTLI);
- /*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing file. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
- */
-#if HAVE_WORKING_LINK
- if (link(tmppath, path) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not link file \"%s\" to \"%s\": %m",
- tmppath, path)));
- unlink(tmppath);
-#else
- if (rename(tmppath, path) < 0)
+ /* And perform the rename */
+ if (replace_safe(tmppath, path) < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not rename file \"%s\" to \"%s\": %m",
+ errmsg("could not replace file \"%s\" to \"%s\": %m",
tmppath, path)));
-#endif
/* The history file can be archived immediately. */
if (XLogArchivingActive())
@@ -507,25 +494,12 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
*/
TLHistoryFilePath(path, tli);
- /*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing logfile. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
- */
-#if HAVE_WORKING_LINK
- if (link(tmppath, path) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not link file \"%s\" to \"%s\": %m",
- tmppath, path)));
- unlink(tmppath);
-#else
- if (rename(tmppath, path) < 0)
+ /* And perform the rename */
+ if (replace_safe(tmppath, path) < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not rename file \"%s\" to \"%s\": %m",
+ errmsg("could not replace file \"%s\" to \"%s\": %m",
tmppath, path)));
-#endif
}
/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 94b79ac..6c4f36d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3298,35 +3298,17 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
}
}
- /*
- * Prefer link() to rename() here just to be really sure that we don't
- * overwrite an existing logfile. However, there shouldn't be one, so
- * rename() is an acceptable substitute except for the truly paranoid.
- */
-#if HAVE_WORKING_LINK
- if (link(tmppath, path) < 0)
+ /* rename the segment file */
+ if (replace_safe(tmppath, path) < 0)
{
if (use_lock)
LWLockRelease(ControlFileLock);
ereport(LOG,
(errcode_for_file_access(),
- errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
+ errmsg("could not replace file \"%s\" to \"%s\" (initialization of log file): %m",
tmppath, path)));
return false;
}
- unlink(tmppath);
-#else
- if (rename(tmppath, path) < 0)
- {
- if (use_lock)
- LWLockRelease(ControlFileLock);
- ereport(LOG,
- (errcode_for_file_access(),
- errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
- tmppath, path)));
- return false;
- }
-#endif
if (use_lock)
LWLockRelease(ControlFileLock);
@@ -3840,7 +3822,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
* flag, rename will fail. We'll try again at the next checkpoint.
*/
snprintf(newpath, MAXPGPATH, "%s.deleted", path);
- if (rename(path, newpath) != 0)
+ if (rename_safe(path, newpath) != 0)
{
ereport(LOG,
(errcode_for_file_access(),
@@ -5339,7 +5321,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
* re-enter archive recovery mode in a subsequent crash.
*/
unlink(RECOVERY_COMMAND_DONE);
- if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6186,7 +6168,7 @@ StartupXLOG(void)
if (stat(TABLESPACE_MAP, &st) == 0)
{
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
ereport(LOG,
(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
TABLESPACE_MAP, BACKUP_LABEL_FILE),
@@ -6549,7 +6531,7 @@ StartupXLOG(void)
if (haveBackupLabel)
{
unlink(BACKUP_LABEL_OLD);
- if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+ if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6566,7 +6548,7 @@ StartupXLOG(void)
if (haveTblspcMap)
{
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -7347,7 +7329,7 @@ StartupXLOG(void)
*/
XLogArchiveCleanup(partialfname);
- if (rename(origpath, partialpath) != 0)
+ if (rename_safe(origpath, partialpath) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -10907,7 +10889,7 @@ CancelBackup(void)
/* remove leftover file from previously canceled backup if it exists */
unlink(BACKUP_LABEL_OLD);
- if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+ if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
{
ereport(WARNING,
(errcode_for_file_access(),
@@ -10930,7 +10912,7 @@ CancelBackup(void)
/* remove leftover file from previously canceled backup if it exists */
unlink(TABLESPACE_MAP_OLD);
- if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+ if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
{
ereport(LOG,
(errmsg("online backup mode canceled"),
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 277c14a..65c03b2 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
*/
snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
xlogfpath, deletedcounter++);
- if (rename(xlogfpath, oldpath) != 0)
+ if (rename_safe(xlogfpath, oldpath) != 0)
{
ereport(ERROR,
(errcode_for_file_access(),
@@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
reload = true;
}
- if (rename(path, xlogfpath) < 0)
+ if (rename_safe(path, xlogfpath) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -580,7 +580,7 @@ XLogArchiveForceDone(const char *xlog)
StatusFilePath(archiveReady, xlog, ".ready");
if (stat(archiveReady, &stat_buf) == 0)
{
- if (rename(archiveReady, archiveDone) < 0)
+ if (rename_safe(archiveReady, archiveDone) < 0)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 397f802..db54889 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog)
StatusFilePath(rlogready, xlog, ".ready");
StatusFilePath(rlogdone, xlog, ".done");
- if (rename(rlogready, rlogdone) < 0)
+ if (rename_safe(rlogready, rlogdone) < 0)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c
index 0caf7a3..96368c7 100644
--- a/src/backend/replication/logical/origin.c
+++ b/src/backend/replication/logical/origin.c
@@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void)
CloseTransientFile(tmpfd);
/* rename to permanent file, fsync file and directory */
- if (rename(tmppath, path) != 0)
+ if (rename_safe(tmppath, path) != 0)
{
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
tmppath, path)));
}
-
- fsync_fname((char *) path, false);
- fsync_fname("pg_logical", true);
}
/*
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index affa9b9..ead221d 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1095,7 +1095,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
START_CRIT_SECTION();
fsync_fname(path, false);
- fsync_fname((char *) dir, true);
+ fsync_fname(dir, true);
fsync_fname("pg_replslot", true);
END_CRIT_SECTION();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1b30100..85bab37 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -307,6 +307,7 @@ static void walkdir(const char *path,
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
+static void fsync_parent_path(const char *fname);
/*
@@ -413,7 +414,7 @@ pg_flush_data(int fd, off_t offset, off_t amount)
* indicate the OS just doesn't allow/require fsyncing directories.
*/
void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(const char *fname, bool isdir)
{
int fd;
int returncode;
@@ -424,11 +425,11 @@ fsync_fname(char *fname, bool isdir)
* cases here
*/
if (!isdir)
- fd = OpenTransientFile(fname,
+ fd = OpenTransientFile((char *) fname,
O_RDWR | PG_BINARY,
S_IRUSR | S_IWUSR);
else
- fd = OpenTransientFile(fname,
+ fd = OpenTransientFile((char *) fname,
O_RDONLY | PG_BINARY,
S_IRUSR | S_IWUSR);
@@ -461,6 +462,100 @@ fsync_fname(char *fname, bool isdir)
CloseTransientFile(fd);
}
+/*
+ * rename_safe -- rename of a file, making it on-disk persistent
+ *
+ * This routine ensures that a rename file persists in case of a crash by
+ * using fsync on the old and new files before and after performing the
+ * rename so as this categorizes as an all-or-nothing operation.
+ *
+ * rename() is not reliable across directories, particularly if the origin
+ * point and the target point are located on different mounted partitions
+ * so this routine should be called when the replacement of a file is
+ * located in the same directory as its origin file.
+ */
+int
+rename_safe(const char *oldfile, const char *newfile)
+{
+ int fd;
+
+ /*
+ * First fsync the old entry and new entry, it this one exists, to ensure
+ * that they are properly persistent on disk. Calling this routine with
+ * an existing new target file is fine, rename() will first remove it
+ * before performing its operation.
+ */
+ fsync_fname(oldfile, false);
+
+ fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDONLY, 0);
+ if (fd < 0)
+ {
+ if (errno != ENOENT)
+ return -1;
+ }
+ else
+ {
+ if (pg_fsync(fd) != 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ newfile)));
+ (void) CloseTransientFile(fd);
+ }
+
+ /* Time to do the real deal... */
+ if (rename(oldfile, newfile) != 0)
+ return -1;
+
+ /*
+ * Make change persistent in case of an OS crash, both the new entry and
+ * its parent directory need to be flushed.
+ */
+ fsync_fname(newfile, false);
+
+ /* Same for parent directory */
+ fsync_parent_path(newfile);
+ return 0;
+}
+
+/*
+ * replace_safe -- replace a file, making it on-disk persistent
+ *
+ * This routine ensures that a file link or rename on a file persists on
+ * system in case of a crash by using fsync where on the link generated
+ * as well as on its parent directory. link() is preferred to rename() just
+ * to be really sure that an existing file is not overwritten. However,
+ * there should not be an existing file when calling this routine, so rename()
+ * is an acceptable substitute except for the truly paranoid.
+ *
+ * rename() and link() are not reliable across directories, particularly
+ * if the origin point and the target point are located on different mounted
+ * partitions, so this routine should be called when the replacement of a
+ * file is located in the same directory as its origin file.
+ */
+int
+replace_safe(const char *oldfile, const char *newfile)
+{
+#if HAVE_WORKING_LINK
+ if (link(oldfile, newfile) < 0)
+ return -1;
+ unlink(oldfile);
+#else
+ if (rename(oldfile, newfile) < 0)
+ return -1;
+#endif
+
+ /*
+ * Make change persistent in case of an OS crash, both the new entry and
+ * its parent directory need to be flushed.
+ */
+ fsync_fname(newfile, false);
+
+ /* Same for parent directory */
+ fsync_parent_path(newfile);
+ return 0;
+}
+
/*
* InitFileAccess --- initialize this module during backend startup
@@ -2719,3 +2814,29 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
(void) CloseTransientFile(fd);
}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static void
+fsync_parent_path(const char *fname)
+{
+ char parentpath[MAXPGPATH];
+
+ /* Same for parent directory */
+ snprintf(parentpath, MAXPGPATH, "%s", fname);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument
+ * is just a file name (see comments in path.c), so handle that as being
+ * the current directory.
+ */
+ if (strlen(parentpath) == 0)
+ fsync_fname(".", true);
+ else
+ fsync_fname(parentpath, true);
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 4a3fccb..60836a1 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -113,7 +113,9 @@ extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
-extern void fsync_fname(char *fname, bool isdir);
+extern void fsync_fname(const char *fname, bool isdir);
+extern int rename_safe(const char *oldfile, const char *newfile);
+extern int replace_safe(const char *oldfile, const char *newfile);
extern void SyncDataDirectory(void);
/* Filename components for OpenTemporaryFile */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers