On Tue, Feb 15, 2022 at 09:09:52AM -0800, Andres Freund wrote:
> On 2022-02-10 21:30:45 +0530, Bharath Rupireddy wrote:
>> Replace ReadDir with ReadDirExtended (in CheckPointSnapBuild) and
>> get rid of lstat entirely.
> 
> I think this might be based on a slight misunderstanding / bad phrasing on my
> part.  We can use get_dirent_type() to optimize away the lstat on most
> platforms, ReadDirExtended itself doesn't do that automatically. I was trying
> to reference removing lstat calls by using get_dirent_type() in more places...
> 
> 
>> We still use ReadDir in CheckPointLogicalRewriteHeap
>> because unable to read directory would result a NULL from
>> ReadDirExtended and we may miss to fsync the remaining map files,
>> so here let's error out with ReadDir.
> 
> Then why is this skipping the lstat?
> 
> 
>> Also, convert "could not parse filename" and "could not remove file"
>> errors to LOG messages in  CheckPointLogicalRewriteHeap. This will
>> enable checkpoint not to waste the amount of work that it had done.
> 
> I still doubt this is a good idea.

IIUC you are advocating for something more like the attached patches.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 8dddbf0241258e7b52ab664e077193d5008ee6cd Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Tue, 15 Feb 2022 09:40:53 -0800
Subject: [PATCH v9 1/3] make use of get_dirent_type() in replication code

---
 src/backend/access/heap/rewriteheap.c       | 4 ++--
 src/backend/replication/logical/snapbuild.c | 5 ++---
 src/backend/replication/slot.c              | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..d64d7aae2e 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -113,6 +113,7 @@
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
+#include "common/file_utils.h"
 #include "lib/ilist.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -1213,7 +1214,6 @@ CheckPointLogicalRewriteHeap(void)
 	mappings_dir = AllocateDir("pg_logical/mappings");
 	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
 	{
-		struct stat statbuf;
 		Oid			dboid;
 		Oid			relid;
 		XLogRecPtr	lsn;
@@ -1227,7 +1227,7 @@ CheckPointLogicalRewriteHeap(void)
 			continue;
 
 		snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (get_dirent_type(path, mapping_de, false, ERROR) != PGFILETYPE_REG)
 			continue;
 
 		/* Skip over files that cannot be ours. */
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..df50abfd98 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -123,6 +123,7 @@
 #include "access/heapam_xlog.h"
 #include "access/transam.h"
 #include "access/xact.h"
+#include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "replication/logical.h"
@@ -1947,15 +1948,13 @@ CheckPointSnapBuild(void)
 		uint32		hi;
 		uint32		lo;
 		XLogRecPtr	lsn;
-		struct stat statbuf;
 
 		if (strcmp(snap_de->d_name, ".") == 0 ||
 			strcmp(snap_de->d_name, "..") == 0)
 			continue;
 
 		snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
-
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (get_dirent_type(path, snap_de, false, ERROR) != PGFILETYPE_REG)
 		{
 			elog(DEBUG1, "only regular files expected: %s", path);
 			continue;
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 5da5fa825a..480a26bc7f 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -41,6 +41,7 @@
 
 #include "access/transam.h"
 #include "access/xlog_internal.h"
+#include "common/file_utils.h"
 #include "common/string.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -1428,7 +1429,6 @@ StartupReplicationSlots(void)
 	replication_dir = AllocateDir("pg_replslot");
 	while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL)
 	{
-		struct stat statbuf;
 		char		path[MAXPGPATH + 12];
 
 		if (strcmp(replication_de->d_name, ".") == 0 ||
@@ -1438,7 +1438,7 @@ StartupReplicationSlots(void)
 		snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
 
 		/* we're only creating directories here, skip if it's not our's */
-		if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+		if (get_dirent_type(path, replication_de, false, ERROR) != PGFILETYPE_DIR)
 			continue;
 
 		/* we crashed while a slot was being setup or deleted, clean up */
-- 
2.25.1

>From 1cb3fdc3e7ed323691ec37b76b95055ad9af9757 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Tue, 15 Feb 2022 09:46:44 -0800
Subject: [PATCH v9 2/3] add error checking for call to lstat() in replication
 code

---
 src/backend/replication/logical/reorderbuffer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index c2d9be81fa..9b9e15858e 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -4812,7 +4812,11 @@ ReorderBufferCleanupSerializedTXNs(const char *slotname)
 	sprintf(path, "pg_replslot/%s", slotname);
 
 	/* we're only handling directories here, skip if it's not ours */
-	if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+	if (lstat(path, &statbuf) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
+	else if (!S_ISDIR(statbuf.st_mode))
 		return;
 
 	spill_dir = AllocateDir(path);
-- 
2.25.1

>From 7d2bed2e783aab2506bae27d1e4df90cc1b65dd4 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Tue, 15 Feb 2022 09:53:58 -0800
Subject: [PATCH v9 3/3] minor improvements to replication code

---
 src/backend/replication/logical/snapbuild.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index df50abfd98..6a60e942b7 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1965,16 +1965,10 @@ CheckPointSnapBuild(void)
 		 * everything but are postfixed by .$pid.tmp. We can just remove them
 		 * the same as other files because there can be none that are
 		 * currently being written that are older than cutoff.
-		 *
-		 * We just log a message if a file doesn't fit the pattern, it's
-		 * probably some editors lock/state file or similar...
 		 */
 		if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
-		{
-			ereport(LOG,
+			ereport(ERROR,
 					(errmsg("could not parse file name \"%s\"", path)));
-			continue;
-		}
 
 		lsn = ((uint64) hi) << 32 | lo;
 
@@ -1983,19 +1977,11 @@ CheckPointSnapBuild(void)
 		{
 			elog(DEBUG1, "removing snapbuild snapshot %s", path);
 
-			/*
-			 * It's not particularly harmful, though strange, if we can't
-			 * remove the file here. Don't prevent the checkpoint from
-			 * completing, that'd be a cure worse than the disease.
-			 */
 			if (unlink(path) < 0)
-			{
-				ereport(LOG,
+				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								path)));
-				continue;
-			}
 		}
 	}
 	FreeDir(snap_dir);
-- 
2.25.1

Reply via email to