On Wed, Nov 25, 2020 at 8:00 AM Pavel Borisov <pashkin.e...@gmail.com> wrote:
> The new status of this patch is: Ready for Committer

Thanks!  One small thing bothered me about the last version of the
patch.  It tried to unlink when ENOENT had already been encountered by
open(2), so COMMIT of a DROP looks like:

openat(AT_FDCWD, "base/14208/16384", O_RDWR) = 54
ftruncate(54, 0)                        = 0
close(54)                               = 0
openat(AT_FDCWD, "base/14208/16384.1", O_RDWR) = -1 ENOENT
unlink("base/14208/16384.1")            = -1 ENOENT
openat(AT_FDCWD, "base/14208/16384_fsm", O_RDWR) = -1 ENOENT
unlink("base/14208/16384_fsm")          = -1 ENOENT
openat(AT_FDCWD, "base/14208/16384_vm", O_RDWR) = -1 ENOENT
unlink("base/14208/16384_vm")           = -1 ENOENT
openat(AT_FDCWD, "base/14208/16384_init", O_RDWR) = -1 ENOENT
unlink("base/14208/16384_init")         = -1 ENOENT

So I fixed that, by adding a return value to do_truncate() and
checking it.  That's the version I plan to commit tomorrow, unless
there are further comments or objections.  I've also attached a
version suitable for REL_11_STABLE and earlier branches (with a name
that cfbot should ignore), where things are slightly different.  In
those branches, the register_forget_request() logic is elsewhere.

While looking at trace output, I figured we should just use
truncate(2) on non-Windows, on the master branch only.  It's not like
it really makes much difference, but I don't see why we shouldn't
allow ourselves to use ancient standardised Unix syscalls when we can.
That'd get us down to just the following when committing a DROP:

truncate("base/14208/16396", 0)         = 0
truncate("base/14208/16396.1", 0)       = -1 ENOENT
truncate("base/14208/16396_fsm", 0)     = -1 ENOENT
truncate("base/14208/16396_vm", 0)      = -1 ENOENT
truncate("base/14208/16396_init", 0)    = -1 ENOENT
From 8c2c189ce0c2aa25d0c3f414034b939abe4eb4ee Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmu...@postgresql.org>
Date: Mon, 30 Nov 2020 11:33:38 +1300
Subject: [PATCH v3] Free disk space for dropped relations on commit.

When committing a transaction that dropped a relation, we previously
truncated only the first segment file to free up disk space (the one
that won't be unlinked until the next checkpoint).

Truncate higher numbered segments too, even though we unlink them on
commit.  This frees the disk space immediately, even if other backends
have open file descriptors and might take a long time to get around to
handling shared invalidation events and closing them.  Also extend the
same behavior to the first segment, in recovery.

Back-patch to all supported releases.

Bug: #16663
Reported-by: Denis Patron <denis.pat...@previnet.it>
Reviewed-by: Pavel Borisov <pashkin.e...@gmail.com>
Reviewed-by: Neil Chen <carpenter.nail...@gmail.com>
Reviewed-by: David Zhang <david.zh...@highgo.ca>
Discussion: https://postgr.es/m/16663-fe97ccf9932fc800%40postgresql.org
---
 src/backend/storage/smgr/md.c | 100 +++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 31 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 1d4aa482cc..c697af00d9 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -286,6 +286,41 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 		mdunlinkfork(rnode, forkNum, isRedo);
 }
 
+/*
+ * Truncate a file to release disk space.
+ */
+static int
+do_truncate(const char *path)
+{
+	int			save_errno;
+	int			ret;
+	int			fd;
+
+	/* truncate(2) would be easier here, but Windows hasn't got it */
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd >= 0)
+	{
+		ret = ftruncate(fd, 0);
+		save_errno = errno;
+		CloseTransientFile(fd);
+		errno = save_errno;
+	}
+	else
+		ret = -1;
+
+	/* Log a warning here to avoid repetition in callers. */
+	if (ret < 0 && errno != ENOENT)
+	{
+		save_errno = errno;
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("could not truncate file \"%s\": %m", path)));
+		errno = save_errno;
+	}
+
+	return ret;
+}
+
 static void
 mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
@@ -299,38 +334,31 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 	 */
 	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
 	{
-		/* First, forget any pending sync requests for the first segment */
 		if (!RelFileNodeBackendIsTemp(rnode))
+		{
+			/* Prevent other backends' fds from holding on to the disk space */
+			ret = do_truncate(path);
+
+			/* Forget any pending sync requests for the first segment */
 			register_forget_request(rnode, forkNum, 0 /* first seg */ );
+		}
+		else
+			ret = 0;
 
-		/* Next unlink the file */
-		ret = unlink(path);
-		if (ret < 0 && errno != ENOENT)
-			ereport(WARNING,
-					(errcode_for_file_access(),
-					 errmsg("could not remove file \"%s\": %m", path)));
+		/* Next unlink the file, unless it was already found to be missing */
+		if (ret == 0 || errno != ENOENT)
+		{
+			ret = unlink(path);
+			if (ret < 0 && errno != ENOENT)
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not remove file \"%s\": %m", path)));
+		}
 	}
 	else
 	{
-		/* truncate(2) would be easier here, but Windows hasn't got it */
-		int			fd;
-
-		fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
-		if (fd >= 0)
-		{
-			int			save_errno;
-
-			ret = ftruncate(fd, 0);
-			save_errno = errno;
-			CloseTransientFile(fd);
-			errno = save_errno;
-		}
-		else
-			ret = -1;
-		if (ret < 0 && errno != ENOENT)
-			ereport(WARNING,
-					(errcode_for_file_access(),
-					 errmsg("could not truncate file \"%s\": %m", path)));
+		/* Prevent other backends' fds from holding on to the disk space */
+		ret = do_truncate(path);
 
 		/* Register request to unlink first segment later */
 		register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
@@ -350,14 +378,24 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 		 */
 		for (segno = 1;; segno++)
 		{
-			/*
-			 * Forget any pending sync requests for this segment before we try
-			 * to unlink.
-			 */
+			sprintf(segpath, "%s.%u", path, segno);
+
 			if (!RelFileNodeBackendIsTemp(rnode))
+			{
+				/*
+				 * Prevent other backends' fds from holding on to the disk
+				 * space.
+				 */
+				if (do_truncate(segpath) < 0 && errno == ENOENT)
+					break;
+
+				/*
+				 * Forget any pending sync requests for this segment before we
+				 * try to unlink.
+				 */
 				register_forget_request(rnode, forkNum, segno);
+			}
 
-			sprintf(segpath, "%s.%u", path, segno);
 			if (unlink(segpath) < 0)
 			{
 				/* ENOENT is expected after the last segment... */
-- 
2.20.1

Attachment: v3-0001-Free-disk-space-for-dropped-relations-on-commit.patch-REL_11_STABLE
Description: Binary data

From 0857703de891ccd1ffb8c96ab35ae5e90929ddb4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmu...@postgresql.org>
Date: Mon, 30 Nov 2020 18:32:32 +1300
Subject: [PATCH 2/2] Use truncate(2) instead of open+ftruncate+close.

When truncating files, use POSIX truncate(2).  Windows doesn't have
that, so provide a fallback.

Discussion: https://postgr.es/m/16663-fe97ccf9932fc800%40postgresql.org
---
 src/backend/storage/file/fd.c | 27 +++++++++++++++++++++++++++
 src/backend/storage/smgr/md.c | 13 +------------
 src/include/storage/fd.h      |  1 +
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 05abcf72d6..046953022b 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -622,6 +622,33 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
 #endif
 }
 
+/*
+ * Truncate a file to a given length by name.
+ */
+int
+pg_truncate(const char *path, off_t length)
+{
+#ifdef WIN32
+	int		save_errno;
+	int		ret;
+	int		fd;
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd >= 0)
+	{
+		ret = ftruncate(fd, 0);
+		save_errno = errno;
+		CloseTransientFile(fd);
+		errno = save_errno;
+	}
+	else
+		ret = -1;
+
+	return ret;
+#else
+	return truncate(path, length);
+#endif
+}
 
 /*
  * fsync_fname -- fsync a file or directory, handling errors properly
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index c697af00d9..9889ad6ad8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -294,19 +294,8 @@ do_truncate(const char *path)
 {
 	int			save_errno;
 	int			ret;
-	int			fd;
 
-	/* truncate(2) would be easier here, but Windows hasn't got it */
-	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
-	if (fd >= 0)
-	{
-		ret = ftruncate(fd, 0);
-		save_errno = errno;
-		CloseTransientFile(fd);
-		errno = save_errno;
-	}
-	else
-		ret = -1;
+	ret = pg_truncate(path, 0);
 
 	/* Log a warning here to avoid repetition in callers. */
 	if (ret < 0 && errno != ENOENT)
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index e209f047e8..4e1cc12e23 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -153,6 +153,7 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern void pg_flush_data(int fd, off_t offset, off_t amount);
+extern int	pg_truncate(const char *path, off_t length);
 extern void fsync_fname(const char *fname, bool isdir);
 extern int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
 extern int	durable_rename(const char *oldfile, const char *newfile, int loglevel);
-- 
2.20.1

Reply via email to