On Tue, Feb 15, 2022 at 09:09:52AM -0800, Andres Freund wrote: > On 2022-02-10 21:30:45 +0530, Bharath Rupireddy wrote: >> Replace ReadDir with ReadDirExtended (in CheckPointSnapBuild) and >> get rid of lstat entirely. > > I think this might be based on a slight misunderstanding / bad phrasing on my > part. We can use get_dirent_type() to optimize away the lstat on most > platforms, ReadDirExtended itself doesn't do that automatically. I was trying > to reference removing lstat calls by using get_dirent_type() in more places... > > >> We still use ReadDir in CheckPointLogicalRewriteHeap >> because unable to read directory would result a NULL from >> ReadDirExtended and we may miss to fsync the remaining map files, >> so here let's error out with ReadDir. > > Then why is this skipping the lstat? > > >> Also, convert "could not parse filename" and "could not remove file" >> errors to LOG messages in CheckPointLogicalRewriteHeap. This will >> enable checkpoint not to waste the amount of work that it had done. > > I still doubt this is a good idea.
IIUC you are advocating for something more like the attached patches. -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 8dddbf0241258e7b52ab664e077193d5008ee6cd Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Tue, 15 Feb 2022 09:40:53 -0800 Subject: [PATCH v9 1/3] make use of get_dirent_type() in replication code --- src/backend/access/heap/rewriteheap.c | 4 ++-- src/backend/replication/logical/snapbuild.c | 5 ++--- src/backend/replication/slot.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2a53826736..d64d7aae2e 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -113,6 +113,7 @@ #include "access/xact.h" #include "access/xloginsert.h" #include "catalog/catalog.h" +#include "common/file_utils.h" #include "lib/ilist.h" #include "miscadmin.h" #include "pgstat.h" @@ -1213,7 +1214,6 @@ CheckPointLogicalRewriteHeap(void) mappings_dir = AllocateDir("pg_logical/mappings"); while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL) { - struct stat statbuf; Oid dboid; Oid relid; XLogRecPtr lsn; @@ -1227,7 +1227,7 @@ CheckPointLogicalRewriteHeap(void) continue; snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name); - if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + if (get_dirent_type(path, mapping_de, false, ERROR) != PGFILETYPE_REG) continue; /* Skip over files that cannot be ours. */ diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 83fca8a77d..df50abfd98 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -123,6 +123,7 @@ #include "access/heapam_xlog.h" #include "access/transam.h" #include "access/xact.h" +#include "common/file_utils.h" #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" @@ -1947,15 +1948,13 @@ CheckPointSnapBuild(void) uint32 hi; uint32 lo; XLogRecPtr lsn; - struct stat statbuf; if (strcmp(snap_de->d_name, ".") == 0 || strcmp(snap_de->d_name, "..") == 0) continue; snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name); - - if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + if (get_dirent_type(path, snap_de, false, ERROR) != PGFILETYPE_REG) { elog(DEBUG1, "only regular files expected: %s", path); continue; diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 5da5fa825a..480a26bc7f 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -41,6 +41,7 @@ #include "access/transam.h" #include "access/xlog_internal.h" +#include "common/file_utils.h" #include "common/string.h" #include "miscadmin.h" #include "pgstat.h" @@ -1428,7 +1429,6 @@ StartupReplicationSlots(void) replication_dir = AllocateDir("pg_replslot"); while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL) { - struct stat statbuf; char path[MAXPGPATH + 12]; if (strcmp(replication_de->d_name, ".") == 0 || @@ -1438,7 +1438,7 @@ StartupReplicationSlots(void) snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name); /* we're only creating directories here, skip if it's not our's */ - if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + if (get_dirent_type(path, replication_de, false, ERROR) != PGFILETYPE_DIR) continue; /* we crashed while a slot was being setup or deleted, clean up */ -- 2.25.1
>From 1cb3fdc3e7ed323691ec37b76b95055ad9af9757 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Tue, 15 Feb 2022 09:46:44 -0800 Subject: [PATCH v9 2/3] add error checking for call to lstat() in replication code --- src/backend/replication/logical/reorderbuffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index c2d9be81fa..9b9e15858e 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -4812,7 +4812,11 @@ ReorderBufferCleanupSerializedTXNs(const char *slotname) sprintf(path, "pg_replslot/%s", slotname); /* we're only handling directories here, skip if it's not ours */ - if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + if (lstat(path, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + else if (!S_ISDIR(statbuf.st_mode)) return; spill_dir = AllocateDir(path); -- 2.25.1
>From 7d2bed2e783aab2506bae27d1e4df90cc1b65dd4 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Tue, 15 Feb 2022 09:53:58 -0800 Subject: [PATCH v9 3/3] minor improvements to replication code --- src/backend/replication/logical/snapbuild.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index df50abfd98..6a60e942b7 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1965,16 +1965,10 @@ CheckPointSnapBuild(void) * everything but are postfixed by .$pid.tmp. We can just remove them * the same as other files because there can be none that are * currently being written that are older than cutoff. - * - * We just log a message if a file doesn't fit the pattern, it's - * probably some editors lock/state file or similar... */ if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) - { - ereport(LOG, + ereport(ERROR, (errmsg("could not parse file name \"%s\"", path))); - continue; - } lsn = ((uint64) hi) << 32 | lo; @@ -1983,19 +1977,11 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); - /* - * It's not particularly harmful, though strange, if we can't - * remove the file here. Don't prevent the checkpoint from - * completing, that'd be a cure worse than the disease. - */ if (unlink(path) < 0) - { - ereport(LOG, + ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); - continue; - } } } FreeDir(snap_dir); -- 2.25.1