On Tue, Feb 2, 2016 at 4:20 PM, Michael Paquier wrote: > Not wrong, and this leads to the following: > void rename_safe(const char *old, const char *new, bool isdir, int elevel); > Controlling elevel is necessary per the multiple code paths that would > use it. Some use ERROR, most of them FATAL, and a bit of WARNING. Does > that look fine?
After really coding it, I finished with the following thing: +int +rename_safe(const char *old, const char *new) There is no need to extend that for directories, well we could of course but all the renames happen on files so I see no need to make that more complicated. More refactoring of the other rename() calls could be done as well by extending rename_safe() with a flag to fsync the data within a critical section, particularly for the replication slot code. I have let that out to not complicate more the patch. -- Michael
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index f6da673..11287aa 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -418,9 +418,10 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TLHistoryFilePath(path, newTLI); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing file. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link() to rename_safe() here just to be really sure that we + * don't overwrite an existing file. However, there shouldn't be one, + * so rename_safe() is an acceptable substitute except for the truly + * paranoid. */ #if HAVE_WORKING_LINK if (link(tmppath, path) < 0) @@ -430,7 +431,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -508,9 +509,10 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) TLHistoryFilePath(path, tli); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link() to rename_safe() here just to be really sure that we + * don't overwrite an existing logfile. However, there shouldn't be + * one, so rename_safe() is an acceptable substitute except for the + * truly paranoid. */ #if HAVE_WORKING_LINK if (link(tmppath, path) < 0) @@ -520,7 +522,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a2846c4..b61fcc7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3251,9 +3251,10 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link() to rename_safe() here just to be really sure that we + * don't overwrite an existing logfile. However, there shouldn't be + * one, so rename_safe() is an acceptable substitute except for the + * truly paranoid. */ #if HAVE_WORKING_LINK if (link(tmppath, path) < 0) @@ -3268,7 +3269,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) { if (use_lock) LWLockRelease(ControlFileLock); @@ -3792,7 +3793,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) * flag, rename will fail. We'll try again at the next checkpoint. */ snprintf(newpath, MAXPGPATH, "%s.deleted", path); - if (rename(path, newpath) != 0) + if (rename_safe(path, newpath) != 0) { ereport(LOG, (errcode_for_file_access(), @@ -3800,10 +3801,12 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) path))); return; } + rc = unlink(newpath); #else rc = unlink(path); #endif + if (rc != 0) { ereport(LOG, @@ -5291,7 +5294,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * re-enter archive recovery mode in a subsequent crash. */ unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) + if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6138,7 +6141,7 @@ StartupXLOG(void) if (stat(TABLESPACE_MAP, &st) == 0) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP, BACKUP_LABEL_FILE), @@ -6501,7 +6504,7 @@ StartupXLOG(void) if (haveBackupLabel) { unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6518,7 +6521,7 @@ StartupXLOG(void) if (haveTblspcMap) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -7299,7 +7302,7 @@ StartupXLOG(void) */ XLogArchiveCleanup(partialfname); - if (rename(origpath, partialpath) != 0) + if (rename_safe(origpath, partialpath) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -10863,7 +10866,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) { ereport(WARNING, (errcode_for_file_access(), @@ -10886,7 +10889,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) { ereport(LOG, (errmsg("online backup mode canceled"), diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 277c14a..13ab7fa 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) */ snprintf(oldpath, MAXPGPATH, "%s.deleted%u", xlogfpath, deletedcounter++); - if (rename(xlogfpath, oldpath) != 0) + if (rename_safe(xlogfpath, oldpath) != 0) { ereport(ERROR, (errcode_for_file_access(), @@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) reload = true; } - if (rename(path, xlogfpath) < 0) + if (rename_safe(path, xlogfpath) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -580,12 +580,11 @@ XLogArchiveForceDone(const char *xlog) StatusFilePath(archiveReady, xlog, ".ready"); if (stat(archiveReady, &stat_buf) == 0) { - if (rename(archiveReady, archiveDone) < 0) + if (rename_safe(archiveReady, archiveDone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", archiveReady, archiveDone))); - return; } diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 397f802..db54889 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog) StatusFilePath(rlogready, xlog, ".ready"); StatusFilePath(rlogdone, xlog, ".done"); - if (rename(rlogready, rlogdone) < 0) + if (rename_safe(rlogready, rlogdone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 757b50e..6327972 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void) CloseTransientFile(tmpfd); /* rename to permanent file, fsync file and directory */ - if (rename(tmppath, path) != 0) + if (rename_safe(tmppath, path) != 0) { ereport(PANIC, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); } - - fsync_fname((char *) path, false); - fsync_fname("pg_logical", true); } /* diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index ed823ec..1f17f42 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1488,8 +1488,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) * That ought to be cheap because in most scenarios it should already * be safely on disk. */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); builder->last_serialized_snapshot = lsn; goto out; @@ -1593,7 +1593,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) } CloseTransientFile(fd); - fsync_fname("pg_logical/snapshots", true); + fsync_fname("pg_logical/snapshots", true, false); /* * We may overwrite the work from some other backend, but that's ok, our @@ -1608,8 +1608,8 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) } /* make sure we persist */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); /* * Now there's no way we can loose the dumped state anymore, remember this @@ -1660,8 +1660,8 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) * either... * ---- */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); + fsync_fname(path, false, false); + fsync_fname("pg_logical/snapshots", true, false); /* read statically sized portion of snapshot */ diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 251b549..65d01e8 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -453,8 +453,8 @@ ReplicationSlotDropAcquired(void) * restart. */ START_CRIT_SECTION(); - fsync_fname(tmppath, true); - fsync_fname("pg_replslot", true); + fsync_fname(tmppath, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); } else @@ -912,7 +912,7 @@ StartupReplicationSlots(void) errmsg("could not remove directory \"%s\"", path))); continue; } - fsync_fname("pg_replslot", true); + fsync_fname("pg_replslot", true, false); continue; } @@ -968,7 +968,7 @@ CreateSlotOnDisk(ReplicationSlot *slot) (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", tmppath))); - fsync_fname(tmppath, true); + fsync_fname(tmppath, true, false); /* Write the actual state file. */ slot->dirty = true; /* signal that we really need to write */ @@ -988,8 +988,8 @@ CreateSlotOnDisk(ReplicationSlot *slot) */ START_CRIT_SECTION(); - fsync_fname(path, true); - fsync_fname("pg_replslot", true); + fsync_fname(path, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); } @@ -1094,9 +1094,9 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) /* Check CreateSlot() for the reasoning of using a crit. section. */ START_CRIT_SECTION(); - fsync_fname(path, false); - fsync_fname((char *) dir, true); - fsync_fname("pg_replslot", true); + fsync_fname(path, false, false); + fsync_fname((char *) dir, true, false); + fsync_fname("pg_replslot", true, false); END_CRIT_SECTION(); @@ -1165,7 +1165,7 @@ RestoreSlotFromDisk(const char *name) /* Also sync the parent directory */ START_CRIT_SECTION(); - fsync_fname(path, true); + fsync_fname(path, true, false); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ @@ -1248,7 +1248,7 @@ RestoreSlotFromDisk(const char *name) (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } - fsync_fname("pg_replslot", true); + fsync_fname("pg_replslot", true, false); return; } diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index 522f420..20f49ab 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -115,7 +115,7 @@ copydir(char *fromdir, char *todir, bool recurse) errmsg("could not stat file \"%s\": %m", tofile))); if (S_ISREG(fst.st_mode)) - fsync_fname(tofile, false); + fsync_fname(tofile, false, false); } FreeDir(xldir); @@ -125,7 +125,7 @@ copydir(char *fromdir, char *todir, bool recurse) * synced. Recent versions of ext4 have made the window much wider but * it's been true for ext3 and other filesystems in the past. */ - fsync_fname(todir, true); + fsync_fname(todir, true, false); } /* diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1b30100..c0825e2 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -410,10 +410,11 @@ pg_flush_data(int fd, off_t offset, off_t amount) * fsync_fname -- fsync a file or directory, handling errors properly * * Try to fsync a file or directory. When doing the latter, ignore errors that - * indicate the OS just doesn't allow/require fsyncing directories. + * indicate the OS just doesn't allow/require fsyncing directories. Optionally + * one can skip error regarding non-existing entries attempted to be fsync'ed. */ void -fsync_fname(char *fname, bool isdir) +fsync_fname(char *fname, bool isdir, bool missing_ok) { int fd; int returncode; @@ -440,9 +441,14 @@ fsync_fname(char *fname, bool isdir) return; else if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); + { + if (missing_ok) + return; + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + } returncode = pg_fsync(fd); @@ -461,6 +467,52 @@ fsync_fname(char *fname, bool isdir) CloseTransientFile(fd); } +/* + * rename_safe -- rename of a file, making it on-disk persistent + * + * This routine ensures that a rename file persists in case of a crash by using + * fsync on the old and new files before and after performing the rename so as + * this categorizes as an all-or-nothing operation. + */ +int +rename_safe(const char *old, const char *new) +{ + char *parentpath; + + /* + * First fsync the old and new entries to ensure that they are properly + * persistent on disk. + */ + fsync_fname(old, false, false); + fsync_fname(new, false, true); + + /* Time to do the real deal... */ + if (rename(old, new) != 0) + return -1; + + /* + * Make change persistent in case of an OS crash, both the new entry and + * its parent directory need to be flushed. + */ + fsync_fname(new, false, false); + + /* Same for parent directory */ + parentpath = pstrdup(new); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument + * is just a file name (see comments in path.c), so handle that as being + * the current directory. + */ + if (strlen(parentpath) == 0) + fsync_fname(".", true, false); + else + fsync_fname(parentpath, true, false); + pfree(parentpath); + return 0; +} + /* * InitFileAccess --- initialize this module during backend startup diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 4ad85380..a3a0a77 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -380,12 +380,12 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); - fsync_fname(mainpath, false); + fsync_fname(mainpath, false, false); } FreeDir(dbspace_dir); - fsync_fname((char *) dbspacedirname, true); + fsync_fname((char *) dbspacedirname, true, false); } } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 4a3fccb..24f04f9 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -113,7 +113,8 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); -extern void fsync_fname(char *fname, bool isdir); +extern void fsync_fname(char *fname, bool isdir, bool missing_ok); +extern int rename_safe(const char *old, const char *new); extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers