From 7b7087a34d69fe10718dd24310c8626c39a68d92 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 3 Apr 2019 22:15:19 +1300
Subject: [PATCH 2/2] Use the fsync queue for SLRU files.

Previously, we called fsync() after writing out SLRU page.  Use the
same mechanism for deferring and handing off fsync work to the
checkpointer that md.c uses.

This is a proof-of-concept only for now.
---
 src/backend/access/transam/clog.c      |  13 +++-
 src/backend/access/transam/commit_ts.c |  12 ++-
 src/backend/access/transam/multixact.c |  24 +++++-
 src/backend/access/transam/slru.c      | 104 +++++++++++++++++++------
 src/backend/access/transam/subtrans.c  |   4 +-
 src/backend/commands/async.c           |   5 +-
 src/backend/storage/lmgr/predicate.c   |   4 +-
 src/backend/storage/sync/sync.c        |  22 +++++-
 src/include/access/clog.h              |   3 +
 src/include/access/commit_ts.h         |   3 +
 src/include/access/multixact.h         |   4 +
 src/include/access/slru.h              |  12 ++-
 src/include/storage/sync.h             |   7 +-
 13 files changed, 173 insertions(+), 44 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 3bd55fbdd33..a3d3f9a304e 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
 #include "pgstat.h"
 #include "pg_trace.h"
 #include "storage/proc.h"
+#include "storage/sync.h"
 
 /*
  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
@@ -699,7 +700,8 @@ CLOGShmemInit(void)
 {
 	ClogCtl->PagePrecedes = CLOGPagePrecedes;
 	SimpleLruInit(ClogCtl, "clog", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
-				  CLogControlLock, "pg_xact", LWTRANCHE_CLOG_BUFFERS);
+				  CLogControlLock, "pg_xact", LWTRANCHE_CLOG_BUFFERS,
+				  SYNC_HANDLER_CLOG);
 }
 
 /*
@@ -1041,3 +1043,12 @@ clog_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "clog_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync clog files.
+ */
+int
+clogsyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(ClogCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 8162f884bd1..e35480d89ec 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -494,7 +494,8 @@ CommitTsShmemInit(void)
 	CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
 	SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
 				  CommitTsControlLock, "pg_commit_ts",
-				  LWTRANCHE_COMMITTS_BUFFERS);
+				  LWTRANCHE_COMMITTS_BUFFERS,
+				  SYNC_HANDLER_COMMIT_TS);
 
 	commitTsShared = ShmemInitStruct("CommitTs shared",
 									 sizeof(CommitTimestampShared),
@@ -1022,3 +1023,12 @@ commit_ts_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "commit_ts_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync commit_ts files.
+ */
+int
+committssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(CommitTsCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 763b9997071..bf2e9886032 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1829,11 +1829,13 @@ MultiXactShmemInit(void)
 	SimpleLruInit(MultiXactOffsetCtl,
 				  "multixact_offset", NUM_MXACTOFFSET_BUFFERS, 0,
 				  MultiXactOffsetControlLock, "pg_multixact/offsets",
-				  LWTRANCHE_MXACTOFFSET_BUFFERS);
+				  LWTRANCHE_MXACTOFFSET_BUFFERS,
+				  SYNC_HANDLER_MULTIXACT_OFFSET);
 	SimpleLruInit(MultiXactMemberCtl,
 				  "multixact_member", NUM_MXACTMEMBER_BUFFERS, 0,
 				  MultiXactMemberControlLock, "pg_multixact/members",
-				  LWTRANCHE_MXACTMEMBER_BUFFERS);
+				  LWTRANCHE_MXACTMEMBER_BUFFERS,
+				  SYNC_HANDLER_MULTIXACT_MEMBER);
 
 	/* Initialize our shared state struct */
 	MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -3392,3 +3394,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS)
 
 	SRF_RETURN_DONE(funccxt);
 }
+
+/*
+ * Entrypoint for sync.c to sync offsets files.
+ */
+int
+multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(MultiXactOffsetCtl, ftag, path);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+multixactmemberssyncfiletag(const FileTag *ftag, char *path)
+{
+	return slrusyncfiletag(MultiXactMemberCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 974d42fc866..467e265ff8c 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -81,6 +81,20 @@ typedef struct SlruFlushData
 
 typedef struct SlruFlushData *SlruFlush;
 
+/*
+ * Populate a file tag describing a segment file.  We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_segno) \
+( \
+	(a).rnode.spcNode = 0, \
+	(a).rnode.dbNode = 0, \
+	(a).rnode.relNode = 0, \
+	(a).forknum = 0, \
+	(a).segno = (xx_segno) \
+)
+
 /*
  * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
  * of arguments!
@@ -163,7 +177,8 @@ SimpleLruShmemSize(int nslots, int nlsns)
 
 void
 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-			  LWLock *ctllock, const char *subdir, int tranche_id)
+			  LWLock *ctllock, const char *subdir, int tranche_id,
+			  SyncRequestHandler sync_handler)
 {
 	SlruShared	shared;
 	bool		found;
@@ -247,7 +262,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 	 * assume caller set PagePrecedes.
 	 */
 	ctl->shared = shared;
-	ctl->do_fsync = true;		/* default behavior */
+	ctl->sync_handler = sync_handler;
 	StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir));
 }
 
@@ -862,23 +877,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	}
 	pgstat_report_wait_end();
 
-	/*
-	 * If not part of Flush, need to fsync now.  We assume this happens
-	 * infrequently enough that it's not a performance issue.
-	 */
-	if (!fdata)
+	/* Queue up a sync request for the checkpointer. */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
-		if (ctl->do_fsync && pg_fsync(fd))
+		FileTag		tag;
+
+		INIT_SLRUFILETAG(tag, segno);
+		if (!RegisterSyncRequest(&tag, SYNC_REQUEST, ctl->sync_handler, false))
 		{
+			/* No space to enqueue sync request.  Do it synchronously. */
+			pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
+			if (pg_fsync(fd) < 0)
+			{
+				pgstat_report_wait_end();
+				slru_errcause = SLRU_FSYNC_FAILED;
+				slru_errno = errno;
+				CloseTransientFile(fd);
+				return false;
+			}
 			pgstat_report_wait_end();
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			CloseTransientFile(fd);
-			return false;
 		}
-		pgstat_report_wait_end();
+	}
 
+	/* Close file, unless part of flush request. */
+	if (!fdata)
+	{
 		if (CloseTransientFile(fd))
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1140,21 +1163,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 	LWLockRelease(shared->ControlLock);
 
 	/*
-	 * Now fsync and close any files that were open
+	 * Now close any files that were open
 	 */
 	ok = true;
 	for (i = 0; i < fdata.num_files; i++)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
-		if (ctl->do_fsync && pg_fsync(fdata.fd[i]))
-		{
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
-			ok = false;
-		}
-		pgstat_report_wait_end();
-
 		if (CloseTransientFile(fdata.fd[i]))
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1270,6 +1283,7 @@ SlruDeleteSegment(SlruCtl ctl, int segno)
 	int			slotno;
 	char		path[MAXPGPATH];
 	bool		did_write;
+	FileTag		tag;
 
 	/* Clean out any possibly existing references to the segment. */
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -1313,6 +1327,18 @@ restart:
 	snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
 	ereport(DEBUG2,
 			(errmsg("removing file \"%s\"", path)));
+
+	/*
+	 * Tell the checkpointer to forget any sync requests, before we unlink the
+	 * file.
+	 */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
+	{
+		INIT_SLRUFILETAG(tag, segno);
+		RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, ctl->sync_handler,
+							true);
+	}
+
 	unlink(path);
 
 	LWLockRelease(shared->ControlLock);
@@ -1411,3 +1437,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 
 	return retval;
 }
+
+/*
+ * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
+ * that they can provide the correct "SlruCtl" (otherwise we don't know how to
+ * build the path), but they just forward to this common implementation that
+ * performs the fsync.
+ */
+int
+slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path)
+{
+	int			fd;
+	int			save_errno;
+	int			result;
+
+	SlruFileName(ctl, path, ftag->segno);
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd < 0)
+		return -1;
+
+	result = pg_fsync(fd);
+	save_errno = errno;
+
+	CloseTransientFile(fd);
+
+	errno = save_errno;
+	return result;
+}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index e667fd02385..aa71a6ddbc6 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -193,9 +193,7 @@ SUBTRANSShmemInit(void)
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
 	SimpleLruInit(SubTransCtl, "subtrans", NUM_SUBTRANS_BUFFERS, 0,
 				  SubtransControlLock, "pg_subtrans",
-				  LWTRANCHE_SUBTRANS_BUFFERS);
-	/* Override default assumption that writes should be fsync'd */
-	SubTransCtl->do_fsync = false;
+				  LWTRANCHE_SUBTRANS_BUFFERS, SYNC_HANDLER_NONE);
 }
 
 /*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 5a7ee0de4cf..9c68358da6a 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -479,9 +479,8 @@ AsyncShmemInit(void)
 	 */
 	AsyncCtl->PagePrecedes = asyncQueuePagePrecedes;
 	SimpleLruInit(AsyncCtl, "async", NUM_ASYNC_BUFFERS, 0,
-				  AsyncCtlLock, "pg_notify", LWTRANCHE_ASYNC_BUFFERS);
-	/* Override default assumption that writes should be fsync'd */
-	AsyncCtl->do_fsync = false;
+				  AsyncCtlLock, "pg_notify", LWTRANCHE_ASYNC_BUFFERS,
+				  SYNC_HANDLER_NONE);
 
 	if (!found)
 	{
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 4e4d04bae37..ab41194930f 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -824,9 +824,7 @@ OldSerXidInit(void)
 	OldSerXidSlruCtl->PagePrecedes = OldSerXidPagePrecedesLogically;
 	SimpleLruInit(OldSerXidSlruCtl, "oldserxid",
 				  NUM_OLDSERXID_BUFFERS, 0, OldSerXidLock, "pg_serial",
-				  LWTRANCHE_OLDSERXID_BUFFERS);
-	/* Override default assumption that writes should be fsync'd */
-	OldSerXidSlruCtl->do_fsync = false;
+				  LWTRANCHE_OLDSERXID_BUFFERS, SYNC_HANDLER_NONE);
 
 	/*
 	 * Create or attach to the OldSerXidControl structure.
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 7bcfa6b6c90..80838ef3dfa 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -20,6 +20,9 @@
 
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
 #include "access/xlogutils.h"
 #include "access/xlog.h"
 #include "commands/tablespace.h"
@@ -96,13 +99,30 @@ typedef struct SyncOps
 										const FileTag *candidate);
 } SyncOps;
 
+/* These indexes must correspond to the values of the SyncRequestType enum. */
 static const SyncOps syncsw[] = {
 	/* magnetic disk */
 	{
 		.sync_syncfiletag = mdsyncfiletag,
 		.sync_unlinkfiletag = mdunlinkfiletag,
 		.sync_filetagmatches = mdfiletagmatches
-	}
+	},
+	/* pg_xact */
+	{
+		.sync_syncfiletag = clogsyncfiletag
+	},
+	/* pg_commit_ts */
+	{
+		.sync_syncfiletag = committssyncfiletag
+	},
+	/* pg_multixact/offsets */
+	{
+		.sync_syncfiletag = multixactoffsetssyncfiletag
+	},
+	/* pg_multixact/members */
+	{
+		.sync_syncfiletag = multixactmemberssyncfiletag
+	},
 };
 
 /*
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 57ef9fe858e..f55391c73e3 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -12,6 +12,7 @@
 #define CLOG_H
 
 #include "access/xlogreader.h"
+#include "storage/sync.h"
 #include "lib/stringinfo.h"
 
 /*
@@ -50,6 +51,8 @@ extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
 
+extern int clogsyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define CLOG_ZEROPAGE		0x00
 #define CLOG_TRUNCATE		0x10
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index 123c91128b8..1f32196873d 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -14,6 +14,7 @@
 #include "access/xlog.h"
 #include "datatype/timestamp.h"
 #include "replication/origin.h"
+#include "storage/sync.h"
 #include "utils/guc.h"
 
 
@@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact,
 				 TransactionId newestXact);
 extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
 
+extern int committssyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define COMMIT_TS_ZEROPAGE		0x00
 #define COMMIT_TS_TRUNCATE		0x10
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 83ae5b6b795..05dcbc8ae35 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -13,6 +13,7 @@
 
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
+#include "storage/sync.h"
 
 
 /*
@@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
 							MultiXactId multi2);
 
+extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
+extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path);
+
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
 extern void PostPrepare_MultiXact(TransactionId xid);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index b6e66f56a0a..deccde4cc44 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -15,6 +15,7 @@
 
 #include "access/xlogdefs.h"
 #include "storage/lwlock.h"
+#include "storage/sync.h"
 
 
 /*
@@ -115,10 +116,10 @@ typedef struct SlruCtlData
 	SlruShared	shared;
 
 	/*
-	 * This flag tells whether to fsync writes (true for pg_xact and multixact
-	 * stuff, false for pg_subtrans and pg_notify).
+	 * Which sync handler function to use when handing sync requests over to
+	 * the checkpointer.  SYNC_HANDLER_NONE to disable fsync (eg pg_notify).
 	 */
-	bool		do_fsync;
+	SyncRequestHandler sync_handler;
 
 	/*
 	 * Decide which of two page numbers is "older" for truncation purposes. We
@@ -139,7 +140,8 @@ typedef SlruCtlData *SlruCtl;
 
 extern Size SimpleLruShmemSize(int nslots, int nlsns);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-			  LWLock *ctllock, const char *subdir, int tranche_id);
+			  LWLock *ctllock, const char *subdir, int tranche_id,
+			  SyncRequestHandler sync_handler);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
 extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 				  TransactionId xid);
@@ -155,6 +157,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
 extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
 extern void SlruDeleteSegment(SlruCtl ctl, int segno);
 
+extern int slrusyncfiletag(SlruCtl ctl, const FileTag *ftag, char *path);
+
 /* SlruScanDirectory public callbacks */
 extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
 							int segpage, void *data);
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index 6fb066a53b7..53d39926339 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -35,7 +35,12 @@ typedef enum SyncRequestType
  */
 typedef enum SyncRequestHandler
 {
-	SYNC_HANDLER_MD = 0			/* md smgr */
+	SYNC_HANDLER_MD = 0,
+	SYNC_HANDLER_CLOG,
+	SYNC_HANDLER_COMMIT_TS,
+	SYNC_HANDLER_MULTIXACT_OFFSET,
+	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
 /*
-- 
2.21.0

