From afd2eb044eacf07601e711f2d292c66ecb8bb134 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Mon, 31 Dec 2018 15:25:16 +1300
Subject: [PATCH 2/2] Refactor the fsync machinery to support future SMGR
 implementations.

In anticipation of proposed block storage managers alongside md.c that
map bufmgr.c blocks to files optimised for different usage patterns:

1.  Move the system for requesting fsyncs out of md.c into a new
translation unit smgrsync.c.

2.  Have smgrsync.c perform the actual fsync() calls via the existing
polymorphic smgrimmedsync() interface, extended to allow an individual
segment number to be specified.

3.  Teach the checkpointer how to forget individual segments that are
unlinked from the 'front' after having been dropped from shared
buffers.

4.  Move the request tracking from a bitmapset into a sorted vector,
because the proposed block storage managers are not anchored at zero
and use potentially very large and sparse integers.

Author: Thomas Munro
Reviewed-by:
Discussion: https://postgr.es/m/CAEepm=2gTANm=e3ARnJT=n0h8hf88wqmaZxk0JYkxw+b21fNrw@mail.gmail.com
---
 src/backend/access/heap/heapam.c      |   4 +-
 src/backend/access/nbtree/nbtree.c    |   2 +-
 src/backend/access/nbtree/nbtsort.c   |   2 +-
 src/backend/access/spgist/spginsert.c |   2 +-
 src/backend/access/transam/xlog.c     |   2 +
 src/backend/bootstrap/bootstrap.c     |   1 +
 src/backend/catalog/heap.c            |   2 +-
 src/backend/commands/dbcommands.c     |   2 +-
 src/backend/commands/tablecmds.c      |   2 +-
 src/backend/commands/tablespace.c     |   2 +-
 src/backend/postmaster/bgwriter.c     |   1 +
 src/backend/postmaster/checkpointer.c |  21 +-
 src/backend/storage/buffer/bufmgr.c   |   2 +
 src/backend/storage/ipc/ipci.c        |   1 +
 src/backend/storage/smgr/Makefile     |   2 +-
 src/backend/storage/smgr/md.c         | 801 ++-----------------------
 src/backend/storage/smgr/smgr.c       | 104 ++--
 src/backend/storage/smgr/smgrsync.c   | 834 ++++++++++++++++++++++++++
 src/backend/tcop/utility.c            |   2 +-
 src/backend/utils/misc/guc.c          |   1 +
 src/include/postmaster/bgwriter.h     |  24 +-
 src/include/storage/smgr.h            |  29 +-
 src/include/storage/smgrsync.h        |  36 ++
 23 files changed, 992 insertions(+), 887 deletions(-)
 create mode 100644 src/backend/storage/smgr/smgrsync.c
 create mode 100644 src/include/storage/smgrsync.h

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 96501456422..f3d53bb47dd 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -9361,7 +9361,7 @@ heap_sync(Relation rel)
 	/* main heap */
 	FlushRelationBuffers(rel);
 	/* FlushRelationBuffers will have opened rd_smgr */
-	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 
 	/* FSM is not critical, don't bother syncing it */
 
@@ -9372,7 +9372,7 @@ heap_sync(Relation rel)
 
 		toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
 		FlushRelationBuffers(toastrel);
-		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 		heap_close(toastrel, AccessShareLock);
 	}
 }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e8725fbbe1e..a0f957d1ef4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -178,7 +178,7 @@ btbuildempty(Relation index)
 	 * write did not go through shared_buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 16f57557776..a829c9cc034 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1207,7 +1207,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	if (RelationNeedsWAL(wstate->index))
 	{
 		RelationOpenSmgr(wstate->index);
-		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 	}
 }
 
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 7dd0d61fbbc..7201b6533f3 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -205,7 +205,7 @@ spgbuildempty(Relation index)
 	 * writes did not go through shared buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a28be4f7db8..23e840ede2b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -44,6 +44,7 @@
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/walwriter.h"
 #include "postmaster/startup.h"
 #include "replication/basebackup.h"
@@ -64,6 +65,7 @@
 #include "storage/procarray.h"
 #include "storage/reinit.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "storage/spin.h"
 #include "utils/backend_random.h"
 #include "utils/builtins.h"
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index fc1927c537b..f04cb86d650 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -31,6 +31,7 @@
 #include "pg_getopt.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 4d5b82aaa95..7927b353fcf 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1405,7 +1405,7 @@ heap_create_init_fork(Relation rel)
 	RelationOpenSmgr(rel);
 	smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
 	log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
-	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index f640f469729..b59414b3350 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -47,7 +47,7 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
-#include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "replication/slot.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index c8c50e8c989..a5f19eaf3f0 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11300,7 +11300,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	 * here, they might still not be on disk when the crash occurs.
 	 */
 	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
-		smgrimmedsync(dst, forkNum);
+		smgrimmedsync(dst, forkNum, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 4a714f6e2be..aa76b8d25ec 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -70,7 +70,7 @@
 #include "commands/tablespace.h"
 #include "common/file_perm.h"
 #include "miscadmin.h"
-#include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "storage/fd.h"
 #include "storage/lmgr.h"
 #include "storage/standby.h"
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 7612b17b442..b37a25fc2a6 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -44,6 +44,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
 #include "storage/condition_variable.h"
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index b9c118e1560..f420fce60dd 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -47,6 +47,8 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
+#include "postmaster/postmaster.h"
 #include "replication/syncrep.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
@@ -108,10 +110,10 @@
  */
 typedef struct
 {
-	RelFileNode rnode;
+	int			type;
+	RelFileNode	rnode;
 	ForkNumber	forknum;
-	BlockNumber segno;			/* see md.c for special values */
-	/* might add a real request-type field later; not needed yet */
+	SegmentNumber segno;
 } CheckpointerRequest;
 
 typedef struct
@@ -1077,9 +1079,7 @@ RequestCheckpoint(int flags)
  * RelFileNodeBackend.
  *
  * segno specifies which segment (not block!) of the relation needs to be
- * fsync'd.  (Since the valid range is much less than BlockNumber, we can
- * use high values for special flags; that's all internal to md.c, which
- * see for details.)
+ * fsync'd.
  *
  * To avoid holding the lock for longer than necessary, we normally write
  * to the requests[] queue without checking for duplicates.  The checkpointer
@@ -1092,13 +1092,14 @@ RequestCheckpoint(int flags)
  * let the backend know by returning false.
  */
 bool
-ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+ForwardFsyncRequest(int type, RelFileNode rnode, ForkNumber forknum,
+					SegmentNumber segno)
 {
 	CheckpointerRequest *request;
 	bool		too_full;
 
 	if (!IsUnderPostmaster)
-		return false;			/* probably shouldn't even get here */
+		elog(ERROR, "ForwardFsyncRequest must not be called in single user mode");
 
 	if (AmCheckpointerProcess())
 		elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
@@ -1130,6 +1131,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 	/* OK, insert request */
 	request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
+	request->type = type;
 	request->rnode = rnode;
 	request->forknum = forknum;
 	request->segno = segno;
@@ -1314,7 +1316,8 @@ AbsorbFsyncRequests(void)
 	LWLockRelease(CheckpointerCommLock);
 
 	for (request = requests; n > 0; request++, n--)
-		RememberFsyncRequest(request->rnode, request->forknum, request->segno);
+		RememberFsyncRequest(request->type, request->rnode, request->forknum,
+							 request->segno);
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9817770affc..52c4801ddf4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -42,11 +42,13 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "storage/standby.h"
 #include "utils/rel.h"
 #include "utils/resowner_private.h"
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 0c86a581c03..4531e4dc4f7 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -27,6 +27,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/slot.h"
diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile
index 2b95cb0df16..c9c4be325ed 100644
--- a/src/backend/storage/smgr/Makefile
+++ b/src/backend/storage/smgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = md.o smgr.o smgrtype.o
+OBJS = md.o smgr.o smgrsync.o smgrtype.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 4c6a50509f8..114963ff42a 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -30,37 +30,24 @@
 #include "access/xlog.h"
 #include "pgstat.h"
 #include "portability/instr_time.h"
-#include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
 #include "storage/relfilenode.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
 
 
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB		10
-#define UNLINKS_PER_ABSORB		10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
 
 /*
  * On Windows, we have to interpret EACCES as possibly meaning the same as
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  * that's what you get.  Ugh.  This code is designed so that we don't
  * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
+ * a pending fsync request getting canceled ... see smgrsync).
  */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
@@ -134,30 +121,9 @@ static MemoryContext MdCxt;		/* context for all MdfdVec objects */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef uint16 CycleCtr;		/* can be any convenient integer size */
+typedef uint32 CycleCtr;		/* can be any convenient integer size */
 
-typedef struct
-{
-	RelFileNode rnode;			/* hash table key (must be first!) */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
-	/* requests[f] has bit n set if we need to fsync segment n of fork f */
-	Bitmapset  *requests[MAX_FORKNUM + 1];
-	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
-	bool		canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
-	RelFileNode rnode;			/* the dead relation to delete */
-	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
 
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above  */
-
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
 
 
 /*** behavior for mdopen & _mdfd_getseg ***/
@@ -184,8 +150,7 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
 			 bool isRedo);
 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
-					   MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
+							   MdfdVec *seg);
 static void _fdvec_resize(SMgrRelation reln,
 			  ForkNumber forknum,
 			  int nseg);
@@ -208,64 +173,6 @@ mdinit(void)
 	MdCxt = AllocSetContextCreate(TopMemoryContext,
 								  "MdSmgr",
 								  ALLOCSET_DEFAULT_SIZES);
-
-	/*
-	 * Create pending-operations hashtable if we need it.  Currently, we need
-	 * it if we are standalone (not under a postmaster) or if we are a startup
-	 * or checkpointer auxiliary process.
-	 */
-	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
-	{
-		HASHCTL		hash_ctl;
-
-		/*
-		 * XXX: The checkpointer needs to add entries to the pending ops table
-		 * when absorbing fsync requests.  That is done within a critical
-		 * section, which isn't usually allowed, but we make an exception. It
-		 * means that there's a theoretical possibility that you run out of
-		 * memory while absorbing fsync requests, which leads to a PANIC.
-		 * Fortunately the hash table is small so that's unlikely to happen in
-		 * practice.
-		 */
-		pendingOpsCxt = AllocSetContextCreate(MdCxt,
-											  "Pending ops context",
-											  ALLOCSET_DEFAULT_SIZES);
-		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
-		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(RelFileNode);
-		hash_ctl.entrysize = sizeof(PendingOperationEntry);
-		hash_ctl.hcxt = pendingOpsCxt;
-		pendingOpsTable = hash_create("Pending Ops Table",
-									  100L,
-									  &hash_ctl,
-									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-		pendingUnlinks = NIL;
-	}
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process.  Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
-	/* Perform any pending fsyncs we may have queued up, then drop table */
-	if (pendingOpsTable)
-	{
-		mdsync();
-		hash_destroy(pendingOpsTable);
-	}
-	pendingOpsTable = NULL;
-
-	/*
-	 * We should not have any pending unlink requests, since mdunlink doesn't
-	 * queue unlink requests when isRedo.
-	 */
-	Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -388,7 +295,7 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
 	/*
 	 * We have to clean out any pending fsync requests for the doomed
-	 * relation, else the next mdsync() will fail.  There can't be any such
+	 * relation, else the next smgrsync() will fail.  There can't be any such
 	 * requests for a temp relation, though.  We can send just one request
 	 * even when deleting multiple forks, since the fsync queuing code accepts
 	 * the "InvalidForkNumber = all forks" convention.
@@ -448,7 +355,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 					 errmsg("could not truncate file \"%s\": %m", path)));
 
 		/* Register request to unlink first segment later */
-		register_unlink(rnode);
+		UnlinkAfterCheckpoint(rnode);
 	}
 
 	/*
@@ -993,423 +900,55 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *
  * Note that only writes already issued are synced; this routine knows
  * nothing of dirty buffers that may exist inside the buffer manager.
+ *
+ * See smgrimmedsync comment for contract.
  */
-void
-mdimmedsync(SMgrRelation reln, ForkNumber forknum)
+bool
+mdimmedsync(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
 {
-	int			segno;
+	MdfdVec	   *segments;
+	size_t		num_segments;
+	size_t		i;
 
-	/*
-	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
-	 * fsync loop will get them all!
-	 */
-	mdnblocks(reln, forknum);
-
-	segno = reln->md_num_open_segs[forknum];
+	if (segno != InvalidSegmentNumber)
+	{
+		/*
+		 * Get the specified segment, or report failure if it doesn't seem to
+		 * exist.
+		 */
+		segments = _mdfd_openseg(reln, forknum, segno * RELSEG_SIZE,
+								 EXTENSION_RETURN_NULL);
+		if (segments == NULL)
+			return false;
+		num_segments = 1;
+	}
+	else
+	{
+		/*
+		 * NOTE: mdnblocks makes sure we have opened all active segments, so that
+		 * fsync loop will get them all!
+		 */
+		mdnblocks(reln, forknum);
+		num_segments = reln->md_num_open_segs[forknum];
+		segments = &reln->md_seg_fds[forknum][0];
+	}
 
-	while (segno > 0)
+	for (i = 0; i < num_segments; ++i)
 	{
-		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
+		MdfdVec    *v = &segments[i];
 
 		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		segno--;
-	}
-}
-
-/*
- *	mdsync() -- Sync previous writes to stable storage.
- */
-void
-mdsync(void)
-{
-	static bool mdsync_in_progress = false;
-
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
-
-	/* Statistics on sync times */
-	int			processed = 0;
-	instr_time	sync_start,
-				sync_end,
-				sync_diff;
-	uint64		elapsed;
-	uint64		longest = 0;
-	uint64		total_elapsed = 0;
-
-	/*
-	 * This is only called during checkpoints, and checkpoints should only
-	 * occur in processes that have created a pendingOpsTable.
-	 */
-	if (!pendingOpsTable)
-		elog(ERROR, "cannot sync without a pendingOpsTable");
-
-	/*
-	 * If we are in the checkpointer, the sync had better include all fsync
-	 * requests that were queued by backends up to this point.  The tightest
-	 * race condition that could occur is that a buffer that must be written
-	 * and fsync'd for the checkpoint could have been dumped by a backend just
-	 * before it was visited by BufferSync().  We know the backend will have
-	 * queued an fsync request before clearing the buffer's dirtybit, so we
-	 * are safe as long as we do an Absorb after completing BufferSync().
-	 */
-	AbsorbFsyncRequests();
-
-	/*
-	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
-	 * checkpoint), we want to ignore fsync requests that are entered into the
-	 * hashtable after this point --- they should be processed next time,
-	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
-	 * ones: new ones will have cycle_ctr equal to the incremented value of
-	 * mdsync_cycle_ctr.
-	 *
-	 * In normal circumstances, all entries present in the table at this point
-	 * will have cycle_ctr exactly equal to the current (about to be old)
-	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
-	 * fsync'ing loop, then older values of cycle_ctr might remain when we
-	 * come back here to try again.  Repeated checkpoint failures would
-	 * eventually wrap the counter around to the point where an old entry
-	 * might appear new, causing us to skip it, possibly allowing a checkpoint
-	 * to succeed that should not have.  To forestall wraparound, any time the
-	 * previous mdsync() failed to complete, run through the table and
-	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
-	 *
-	 * Think not to merge this loop with the main loop, as the problem is
-	 * exactly that that loop may fail before having visited all the entries.
-	 * From a performance point of view it doesn't matter anyway, as this path
-	 * will never be taken in a system that's functioning normally.
-	 */
-	if (mdsync_in_progress)
-	{
-		/* prior try failed, so update any stale cycle_ctr values */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		}
 	}
 
-	/* Advance counter so that new hashtable entries are distinguishable */
-	mdsync_cycle_ctr++;
-
-	/* Set flag to detect failure if we don't reach the end of the loop */
-	mdsync_in_progress = true;
-
-	/* Now scan the hashtable for fsync requests to process */
-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
-		ForkNumber	forknum;
-
-		/*
-		 * If the entry is new then don't process it this time; it might
-		 * contain multiple fsync-request bits, but they are all new.  Note
-		 * "continue" bypasses the hash-remove call at the bottom of the loop.
-		 */
-		if (entry->cycle_ctr == mdsync_cycle_ctr)
-			continue;
-
-		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
-		/*
-		 * Scan over the forks and segments represented by the entry.
-		 *
-		 * The bitmap manipulations are slightly tricky, because we can call
-		 * AbsorbFsyncRequests() inside the loop and that could result in
-		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
-		 * So we detach it, but if we fail we'll merge it with any new
-		 * requests that have arrived in the meantime.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			Bitmapset  *requests = entry->requests[forknum];
-			int			segno;
-
-			entry->requests[forknum] = NULL;
-			entry->canceled[forknum] = false;
-
-			segno = -1;
-			while ((segno = bms_next_member(requests, segno)) >= 0)
-			{
-				int			failures;
-
-				/*
-				 * If fsync is off then we don't have to bother opening the
-				 * file at all.  (We delay checking until this point so that
-				 * changing fsync on the fly behaves sensibly.)
-				 */
-				if (!enableFsync)
-					continue;
-
-				/*
-				 * If in checkpointer, we want to absorb pending requests
-				 * every so often to prevent overflow of the fsync request
-				 * queue.  It is unspecified whether newly-added entries will
-				 * be visited by hash_seq_search, but we don't care since we
-				 * don't need to process them anyway.
-				 */
-				if (--absorb_counter <= 0)
-				{
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB;
-				}
-
-				/*
-				 * The fsync table could contain requests to fsync segments
-				 * that have been deleted (unlinked) by the time we get to
-				 * them. Rather than just hoping an ENOENT (or EACCES on
-				 * Windows) error can be ignored, what we do on error is
-				 * absorb pending requests and then retry.  Since mdunlink()
-				 * queues a "cancel" message before actually unlinking, the
-				 * fsync request is guaranteed to be marked canceled after the
-				 * absorb if it really was this case. DROP DATABASE likewise
-				 * has to tell us to forget fsync requests before it starts
-				 * deletions.
-				 */
-				for (failures = 0;; failures++) /* loop exits at "break" */
-				{
-					SMgrRelation reln;
-					MdfdVec    *seg;
-					char	   *path;
-					int			save_errno;
-
-					/*
-					 * Find or create an smgr hash entry for this relation.
-					 * This may seem a bit unclean -- md calling smgr?	But
-					 * it's really the best solution.  It ensures that the
-					 * open file reference isn't permanently leaked if we get
-					 * an error here. (You may say "but an unreferenced
-					 * SMgrRelation is still a leak!" Not really, because the
-					 * only case in which a checkpoint is done by a process
-					 * that isn't about to shut down is in the checkpointer,
-					 * and it will periodically do smgrcloseall(). This fact
-					 * justifies our not closing the reln in the success path
-					 * either, which is a good thing since in non-checkpointer
-					 * cases we couldn't safely do that.)
-					 */
-					reln = smgropen(entry->rnode, InvalidBackendId);
-
-					/* Attempt to open and fsync the target segment */
-					seg = _mdfd_getseg(reln, forknum,
-									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
-									   false,
-									   EXTENSION_RETURN_NULL
-									   | EXTENSION_DONT_CHECK_SIZE);
-
-					INSTR_TIME_SET_CURRENT(sync_start);
-
-					if (seg != NULL &&
-						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
-					{
-						/* Success; update statistics about sync timing */
-						INSTR_TIME_SET_CURRENT(sync_end);
-						sync_diff = sync_end;
-						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-						if (elapsed > longest)
-							longest = elapsed;
-						total_elapsed += elapsed;
-						processed++;
-						requests = bms_del_member(requests, segno);
-						if (log_checkpoints)
-							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed,
-								 FilePathName(seg->mdfd_vfd),
-								 (double) elapsed / 1000);
-
-						break;	/* out of retry loop */
-					}
-
-					/* Compute file name for use in message */
-					save_errno = errno;
-					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
-					errno = save_errno;
-
-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered.
-					 * Therefore, allow ENOENT, but only if we didn't fail
-					 * already on this file.  This applies both for
-					 * _mdfd_getseg() and for FileSync, since fd.c might have
-					 * closed the file behind our back.
-					 *
-					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the
-					 * test here if so.
-					 */
-					if (!FILE_POSSIBLY_DELETED(errno) ||
-						failures > 0)
-					{
-						Bitmapset  *new_requests;
-
-						/*
-						 * We need to merge these unsatisfied requests with
-						 * any others that have arrived since we started.
-						 */
-						new_requests = entry->requests[forknum];
-						entry->requests[forknum] =
-							bms_join(new_requests, requests);
-
-						errno = save_errno;
-						ereport(data_sync_elevel(ERROR),
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m",
-										path)));
-					}
-					else
-						ereport(DEBUG1,
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\" but retrying: %m",
-										path)));
-					pfree(path);
-
-					/*
-					 * Absorb incoming requests and check to see if a cancel
-					 * arrived for this relation fork.
-					 */
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
-					if (entry->canceled[forknum])
-						break;
-				}				/* end retry loop */
-			}
-			bms_free(requests);
-		}
-
-		/*
-		 * We've finished everything that was requested before we started to
-		 * scan the entry.  If no new requests have been inserted meanwhile,
-		 * remove the entry.  Otherwise, update its cycle counter, as all the
-		 * requests now in it must have arrived during this cycle.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			if (entry->requests[forknum] != NULL)
-				break;
-		}
-		if (forknum <= MAX_FORKNUM)
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		else
-		{
-			/* Okay to remove it */
-			if (hash_search(pendingOpsTable, &entry->rnode,
-							HASH_REMOVE, NULL) == NULL)
-				elog(ERROR, "pendingOpsTable corrupted");
-		}
-	}							/* end loop over hashtable entries */
-
-	/* Return sync performance metrics for report at checkpoint end */
-	CheckpointStats.ckpt_sync_rels = processed;
-	CheckpointStats.ckpt_longest_sync = longest;
-	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
-
-	/* Flag successful completion of mdsync */
-	mdsync_in_progress = false;
-}
-
-/*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
- *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
- *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
- */
-void
-mdpreckpt(void)
-{
-	/*
-	 * Any unlink requests arriving after this point will be assigned the next
-	 * cycle counter, and won't be unlinked until next checkpoint.
-	 */
-	mdckpt_cycle_ctr++;
-}
-
-/*
- * mdpostckpt() -- Do post-checkpoint work
- *
- * Remove any lingering files that can now be safely removed.
- */
-void
-mdpostckpt(void)
-{
-	int			absorb_counter;
-
-	absorb_counter = UNLINKS_PER_ABSORB;
-	while (pendingUnlinks != NIL)
-	{
-		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
-		char	   *path;
-
-		/*
-		 * New entries are appended to the end, so if the entry is new we've
-		 * reached the end of old entries.
-		 *
-		 * Note: if just the right number of consecutive checkpoints fail, we
-		 * could be fooled here by cycle_ctr wraparound.  However, the only
-		 * consequence is that we'd delay unlinking for one more checkpoint,
-		 * which is perfectly tolerable.
-		 */
-		if (entry->cycle_ctr == mdckpt_cycle_ctr)
-			break;
-
-		/* Unlink the file */
-		path = relpathperm(entry->rnode, MAIN_FORKNUM);
-		if (unlink(path) < 0)
-		{
-			/*
-			 * There's a race condition, when the database is dropped at the
-			 * same time that we process the pending unlink requests. If the
-			 * DROP DATABASE deletes the file before we do, we will get ENOENT
-			 * here. rmtree() also has to ignore ENOENT errors, to deal with
-			 * the possibility that we delete the file first.
-			 */
-			if (errno != ENOENT)
-				ereport(WARNING,
-						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
-		}
-		pfree(path);
-
-		/* And remove the list entry */
-		pendingUnlinks = list_delete_first(pendingUnlinks);
-		pfree(entry);
-
-		/*
-		 * As in mdsync, we don't want to stop absorbing fsync requests for a
-		 * long time when there are many deletions to be done.  We can safely
-		 * call AbsorbFsyncRequests() at this point in the loop (note it might
-		 * try to delete list entries).
-		 */
-		if (--absorb_counter <= 0)
-		{
-			AbsorbFsyncRequests();
-			absorb_counter = UNLINKS_PER_ABSORB;
-		}
-	}
+	return true;
 }
 
 /*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
- *
- * If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later.  Otherwise, try to pass off the fsync request
- * to the checkpointer process.  If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
  */
 static void
 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
@@ -1417,16 +956,8 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	/* Temp relations should never be fsync'd */
 	Assert(!SmgrIsTemp(reln));
 
-	if (pendingOpsTable)
+	if (!FsyncAtCheckpoint(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
 	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
-	}
-	else
-	{
-		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
-			return;				/* passed it off successfully */
-
 		ereport(DEBUG1,
 				(errmsg("could not forward fsync request because request queue is full")));
 
@@ -1438,258 +969,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	}
 }
 
-/*
- * register_unlink() -- Schedule a file to be deleted after next checkpoint
- *
- * We don't bother passing in the fork number, because this is only used
- * with main forks.
- *
- * As with register_dirty_segment, this could involve either a local or
- * a remote pending-ops table.
- */
-static void
-register_unlink(RelFileNodeBackend rnode)
-{
-	/* Should never be used with temp relations */
-	Assert(!RelFileNodeBackendIsTemp(rnode));
-
-	if (pendingOpsTable)
-	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
-							 UNLINK_RELATION_REQUEST);
-	}
-	else
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the request
-		 * message, we have to sleep and try again, because we can't simply
-		 * delete the file now.  Ugly, but hopefully won't happen often.
-		 *
-		 * XXX should we just leave the file orphaned instead?
-		 */
-		Assert(IsUnderPostmaster);
-		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
-									UNLINK_RELATION_REQUEST))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
-}
-
-/*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint.  UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- *	 either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- *	 checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
- */
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
-{
-	Assert(pendingOpsTable);
-
-	if (segno == FORGET_RELATION_FSYNC)
-	{
-		/* Remove any pending requests for the relation (one or all forks) */
-		PendingOperationEntry *entry;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_FIND,
-													  NULL);
-		if (entry)
-		{
-			/*
-			 * We can't just delete the entry since mdsync could have an
-			 * active hashtable scan.  Instead we delete the bitmapsets; this
-			 * is safe because of the way mdsync is coded.  We also set the
-			 * "canceled" flags so that mdsync can tell that a cancel arrived
-			 * for the fork(s).
-			 */
-			if (forknum == InvalidForkNumber)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-			else
-			{
-				/* remove requests for single fork */
-				bms_free(entry->requests[forknum]);
-				entry->requests[forknum] = NULL;
-				entry->canceled[forknum] = true;
-			}
-		}
-	}
-	else if (segno == FORGET_DATABASE_FSYNC)
-	{
-		/* Remove any pending requests for the entire database */
-		HASH_SEQ_STATUS hstat;
-		PendingOperationEntry *entry;
-		ListCell   *cell,
-				   *prev,
-				   *next;
-
-		/* Remove fsync requests */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-		}
-
-		/* Remove unlink requests */
-		prev = NULL;
-		for (cell = list_head(pendingUnlinks); cell; cell = next)
-		{
-			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-			next = lnext(cell);
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
-				pfree(entry);
-			}
-			else
-				prev = cell;
-		}
-	}
-	else if (segno == UNLINK_RELATION_REQUEST)
-	{
-		/* Unlink request: put it in the linked list */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingUnlinkEntry *entry;
-
-		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
-		Assert(forknum == MAIN_FORKNUM);
-
-		entry = palloc(sizeof(PendingUnlinkEntry));
-		entry->rnode = rnode;
-		entry->cycle_ctr = mdckpt_cycle_ctr;
-
-		pendingUnlinks = lappend(pendingUnlinks, entry);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-	else
-	{
-		/* Normal case: enter a request to fsync this segment */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingOperationEntry *entry;
-		bool		found;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_ENTER,
-													  &found);
-		/* if new entry, initialize it */
-		if (!found)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-			MemSet(entry->requests, 0, sizeof(entry->requests));
-			MemSet(entry->canceled, 0, sizeof(entry->canceled));
-		}
-
-		/*
-		 * NB: it's intentional that we don't change cycle_ctr if the entry
-		 * already exists.  The cycle_ctr must represent the oldest fsync
-		 * request that could be in the entry.
-		 */
-
-		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
-												  (int) segno);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-}
-
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the cancel
-		 * message, we have to sleep and try again ... ugly, but hopefully
-		 * won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
-		 * error would leave the no-longer-used file still present on disk,
-		 * which would be bad, so I'm inclined to assume that the checkpointer
-		 * will always empty the queue soon.
-		 */
-		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-
-		/*
-		 * Note we don't wait for the checkpointer to actually absorb the
-		 * cancel message; see mdsync() for the implications.
-		 */
-	}
-}
-
-/*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
- */
-void
-ForgetDatabaseFsyncRequests(Oid dbid)
-{
-	RelFileNode rnode;
-
-	rnode.dbNode = dbid;
-	rnode.spcNode = 0;
-	rnode.relNode = 0;
-
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/* see notes in ForgetRelationFsyncRequests */
-		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
-									FORGET_DATABASE_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
-}
-
 /*
  * DropRelationFiles -- drop files of all given relations
  */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 189342ef86a..42596f14e9e 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -21,6 +21,7 @@
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
@@ -58,10 +59,8 @@ typedef struct f_smgr
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
-	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_pre_ckpt) (void);	/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
-	void		(*smgr_post_ckpt) (void);	/* may be NULL */
+	bool		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum,
+								   SegmentNumber segno);
 } f_smgr;
 
 
@@ -81,10 +80,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_writeback = mdwriteback,
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
-		.smgr_immedsync = mdimmedsync,
-		.smgr_pre_ckpt = mdpreckpt,
-		.smgr_sync = mdsync,
-		.smgr_post_ckpt = mdpostckpt
+		.smgr_immedsync = mdimmedsync
 	}
 };
 
@@ -104,6 +100,14 @@ static void smgrshutdown(int code, Datum arg);
 static void add_to_unowned_list(SMgrRelation reln);
 static void remove_from_unowned_list(SMgrRelation reln);
 
+/*
+ * For now there is only one implementation.
+ */
+static inline int
+which_for_relfilenode(RelFileNode rnode)
+{
+	return 0;	/* we only have md.c at present */
+}
 
 /*
  *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -118,6 +122,8 @@ smgrinit(void)
 {
 	int			i;
 
+	smgrsync_init();
+
 	for (i = 0; i < NSmgr; i++)
 	{
 		if (smgrsw[i].smgr_init)
@@ -185,7 +191,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 		reln->smgr_targblock = InvalidBlockNumber;
 		reln->smgr_fsm_nblocks = InvalidBlockNumber;
 		reln->smgr_vm_nblocks = InvalidBlockNumber;
-		reln->smgr_which = 0;	/* we only have md.c at present */
+		reln->smgr_which = which_for_relfilenode(rnode);
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -726,17 +732,20 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *	smgrimmedsync() -- Force the specified relation to stable storage.
  *
  *		Synchronously force all previous writes to the specified relation
- *		down to disk.
- *
- *		This is useful for building completely new relations (eg, new
- *		indexes).  Instead of incrementally WAL-logging the index build
- *		steps, we can just write completed index pages to disk with smgrwrite
- *		or smgrextend, and then fsync the completed index file before
- *		committing the transaction.  (This is sufficient for purposes of
- *		crash recovery, since it effectively duplicates forcing a checkpoint
- *		for the completed index.  But it is *not* sufficient if one wishes
- *		to use the WAL log for PITR or replication purposes: in that case
- *		we have to make WAL entries as well.)
+ *		down to disk.  If segnum is >= 0, only applies to data in
+ *		one segment file.
+ *
+ *		Used for checkpointing dirty files.
+ *
+ *		This can also be used for building completely new relations (eg, new
+ *		indexes).  Instead of incrementally WAL-logging the index build steps,
+ *		we can just write completed index pages to disk with smgrwrite or
+ *		smgrextend, and then fsync the completed index file before committing
+ *		the transaction.  (This is sufficient for purposes of crash recovery,
+ *		since it effectively duplicates forcing a checkpoint for the completed
+ *		index.  But it is *not* sufficient if one wishes to use the WAL log
+ *		for PITR or replication purposes: in that case we have to make WAL
+ *		entries as well.)
  *
  *		The preceding writes should specify skipFsync = true to avoid
  *		duplicative fsyncs.
@@ -744,57 +753,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *		Note that you need to do FlushRelationBuffers() first if there is
  *		any possibility that there are dirty buffers for the relation;
  *		otherwise the sync is not very meaningful.
+ *
+ *		Failure to fsync raises an error, but non-existence of a requested
+ *		segment is reported with a false return value.
  */
-void
-smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
-{
-	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
-}
-
-
-/*
- *	smgrpreckpt() -- Prepare for checkpoint.
- */
-void
-smgrpreckpt(void)
-{
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_pre_ckpt)
-			smgrsw[i].smgr_pre_ckpt();
-	}
-}
-
-/*
- *	smgrsync() -- Sync files to disk during checkpoint.
- */
-void
-smgrsync(void)
-{
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_sync)
-			smgrsw[i].smgr_sync();
-	}
-}
-
-/*
- *	smgrpostckpt() -- Post-checkpoint cleanup.
- */
-void
-smgrpostckpt(void)
+bool
+smgrimmedsync(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
 {
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_post_ckpt)
-			smgrsw[i].smgr_post_ckpt();
-	}
+	return smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum, segno);
 }
 
 /*
diff --git a/src/backend/storage/smgr/smgrsync.c b/src/backend/storage/smgr/smgrsync.c
new file mode 100644
index 00000000000..b202acef7e1
--- /dev/null
+++ b/src/backend/storage/smgr/smgrsync.c
@@ -0,0 +1,834 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrsync.c
+ *	  management of file synchronization.
+ *
+ * This module tracks which files need to be fsynced or unlinked at the
+ * next checkpoint, and performs those actions.  Normally the work is done
+ * when called by the checkpointer, but it is also done in standalone mode
+ * and startup.
+ *
+ * Originally this logic was inside md.c, but it is now made more general,
+ * for reuse by other SMGR implementations that work with files.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/smgr/smgrsync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "nodes/pg_list.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
+#include "storage/relfilenode.h"
+#include "storage/smgrsync.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static MemoryContext pendingOpsCxt; /* context for the pending ops state  */
+
+#define SV_PREFIX segnum_vector
+#define SV_DECLARE
+#define SV_DEFINE
+#define SV_ELEMENT_TYPE BlockNumber
+#define SV_SCOPE static inline
+#define SV_GLOBAL_MEMORY_CONTEXT pendingOpsCxt
+#include "lib/simplevector.h"
+
+#define SA_PREFIX segnum_array
+#define SA_COMPARE(a,b) (*a < *b ? -1 : *a == *b ? 0 : 1)
+#define SA_DECLARE
+#define SA_DEFINE
+#define SA_ELEMENT_TYPE SV_ELEMENT_TYPE
+#define SA_SCOPE static inline
+#include "lib/simplealgo.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  A hash
+ * table remembers the pending operations.  We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+
+typedef uint32 CycleCtr;		/* can be any convenient integer size */
+
+/*
+ * Values for the "type" member of CheckpointerRequest.
+ *
+ * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
+#define FSYNC_SEGMENT_REQUEST	1
+#define FORGET_SEGMENT_FSYNC	2
+#define FORGET_RELATION_FSYNC	3
+#define FORGET_DATABASE_FSYNC	4
+#define UNLINK_RELATION_REQUEST 5
+#define UNLINK_SEGMENT_REQUEST	6
+
+/* intervals for calling AbsorbFsyncRequests in smgrsync and smgrpostckpt */
+#define FSYNCS_PER_ABSORB		10
+#define UNLINKS_PER_ABSORB		10
+
+/*
+ * An entry in the hash table of files that need to be flushed for the next
+ * checkpoint.
+ */
+typedef struct PendingFsyncEntry
+{
+	RelFileNode	rnode;
+	segnum_vector requests[MAX_FORKNUM + 1];
+	segnum_vector requests_in_progress[MAX_FORKNUM + 1];
+	CycleCtr	cycle_ctr;
+} PendingFsyncEntry;
+
+typedef struct PendingUnlinkEntry
+{
+	RelFileNode rnode;			/* the dead relation to delete */
+	CycleCtr	cycle_ctr;		/* ckpt_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
+static bool sync_in_progress = false;
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr ckpt_cycle_ctr = 0;
+
+static HTAB *pendingFsyncTable = NULL;
+static List *pendingUnlinks = NIL;
+
+/*
+ * Initialize the pending operations state, if necessary.
+ */
+void
+smgrsync_init(void)
+{
+	/*
+	 * Create pending-operations hashtable if we need it.  Currently, we need
+	 * it if we are standalone (not under a postmaster) or if we are a startup
+	 * or checkpointer auxiliary process.
+	 */
+	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+	{
+		HASHCTL		hash_ctl;
+
+		/*
+		 * XXX: The checkpointer needs to add entries to the pending ops table
+		 * when absorbing fsync requests.  That is done within a critical
+		 * section, which isn't usually allowed, but we make an exception. It
+		 * means that there's a theoretical possibility that you run out of
+		 * memory while absorbing fsync requests, which leads to a PANIC.
+		 * Fortunately the hash table is small so that's unlikely to happen in
+		 * practice.
+		 */
+		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+											  "Pending ops context",
+											  ALLOCSET_DEFAULT_SIZES);
+		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(RelFileNode);
+		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+		hash_ctl.hcxt = pendingOpsCxt;
+		pendingFsyncTable = hash_create("Pending Ops Table",
+									  100L,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+		pendingUnlinks = NIL;
+	}
+}
+
+/*
+ * Do pre-checkpoint work.
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+smgrpreckpt(void)
+{
+	/*
+	 * Any unlink requests arriving after this point will be assigned the next
+	 * cycle counter, and won't be unlinked until next checkpoint.
+	 */
+	ckpt_cycle_ctr++;
+}
+
+/*
+ * Sync previous writes to stable storage.
+ */
+void
+smgrsync(void)
+{
+	HASH_SEQ_STATUS hstat;
+	PendingFsyncEntry *entry;
+	int			absorb_counter;
+
+	/* Statistics on sync times */
+	instr_time	sync_start,
+				sync_end,
+				sync_diff;
+	uint64		elapsed;
+	int			processed = CheckpointStats.ckpt_sync_rels;
+	uint64		longest = CheckpointStats.ckpt_longest_sync;
+	uint64		total_elapsed = CheckpointStats.ckpt_agg_sync_time;
+
+	/*
+	 * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingFsyncTable.
+	 */
+	if (!pendingFsyncTable)
+		elog(ERROR, "cannot sync without a pendingFsyncTable");
+
+	/*
+	 * If we are in the checkpointer, the sync had better include all fsync
+	 * requests that were queued by backends up to this point.  The tightest
+	 * race condition that could occur is that a buffer that must be written
+	 * and fsync'd for the checkpoint could have been dumped by a backend just
+	 * before it was visited by BufferSync().  We know the backend will have
+	 * queued an fsync request before clearing the buffer's dirtybit, so we
+	 * are safe as long as we do an Absorb after completing BufferSync().
+	 */
+	AbsorbFsyncRequests();
+
+	/*
+	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+	 * checkpoint), we want to ignore fsync requests that are entered into the
+	 * hashtable after this point --- they should be processed next time,
+	 * instead.  We use the sync_cycle_ctr counter to tell old entries apart
+	 * from new ones: new ones will have cycle_ctr equal to the incremented
+	 * value of sync_cycle_ctr.
+	 *
+	 * In normal circumstances, all entries present in the table at this point
+	 * will have cycle_ctr exactly equal to the current (about to be old)
+	 * value of sync_cycle_ctr.  However, if we fail partway through the
+	 * fsync'ing loop, then older values of cycle_ctr might remain when we
+	 * come back here to try again.  Repeated checkpoint failures would
+	 * eventually wrap the counter around to the point where an old entry
+	 * might appear new, causing us to skip it, possibly allowing a checkpoint
+	 * to succeed that should not have.  To forestall wraparound, any time the
+	 * previous smgrsync() failed to complete, run through the table and
+	 * forcibly set cycle_ctr = sync_cycle_ctr.
+	 *
+	 * Think not to merge this loop with the main loop, as the problem is
+	 * exactly that that loop may fail before having visited all the entries.
+	 * From a performance point of view it doesn't matter anyway, as this path
+	 * will never be taken in a system that's functioning normally.
+	 */
+	if (sync_in_progress)
+	{
+		/* prior try failed, so update any stale cycle_ctr values */
+		hash_seq_init(&hstat, pendingFsyncTable);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			ForkNumber		forknum;
+
+			entry->cycle_ctr = sync_cycle_ctr;
+
+			/*
+			 * If any requests remain unprocessed, they need to be merged with
+			 * the segment numbers that have arrived since.
+			 */
+			for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+			{
+				segnum_vector *requests = &entry->requests[forknum];
+				segnum_vector *requests_in_progress =
+					&entry->requests_in_progress[forknum];
+
+				if (!segnum_vector_empty(requests_in_progress))
+				{
+					/* Append the unfinished requests that were not yet handled. */
+					segnum_vector_append_n(requests,
+										   segnum_vector_data(requests_in_progress),
+										   segnum_vector_size(requests_in_progress));
+					segnum_vector_reset(requests_in_progress);
+
+					/* Sort and make unique. */
+					segnum_array_sort(segnum_vector_begin(requests),
+									  segnum_vector_end(requests));
+					segnum_vector_resize(requests,
+									 segnum_array_unique(segnum_vector_begin(requests),
+														 segnum_vector_end(requests)) -
+										 segnum_vector_begin(requests));
+				}
+			}
+		}
+	}
+
+	/* Advance counter so that new hashtable entries are distinguishable */
+	sync_cycle_ctr++;
+
+	/* Set flag to detect failure if we don't reach the end of the loop */
+	sync_in_progress = true;
+
+	/* Now scan the hashtable for fsync requests to process */
+	absorb_counter = FSYNCS_PER_ABSORB;
+	hash_seq_init(&hstat, pendingFsyncTable);
+	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)))
+	{
+		ForkNumber forknum;
+		SMgrRelation reln;
+
+		/*
+		 * If the entry is new then don't process it this time; it might
+		 * contain multiple fsync requests, but they are all new.  Note
+		 * "continue" bypasses the hash-remove call at the bottom of the loop.
+		 */
+		if (entry->cycle_ctr == sync_cycle_ctr)
+			continue;
+
+		/* Else assert we haven't missed it */
+		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+		/*
+		 * Scan over the forks and segments represented by the entry.
+		 *
+		 * The vector manipulations are slightly tricky, because we can call
+		 * AbsorbFsyncRequests() inside the loop and that could result in new
+		 * segment numbers being added.  So we swap the contents of "requests"
+		 * with "requests_in_progress", and if we fail we'll merge it with any
+		 * new requests that have arrived in the meantime.
+		 */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			segnum_vector *requests_in_progress =
+				&entry->requests_in_progress[forknum];
+
+			/*
+			 * Transfer the current set of segment numbers into the "in
+			 * progress" vector (which must be empty initially).
+			 */
+			Assert(segnum_vector_empty(requests_in_progress));
+			segnum_vector_swap(&entry->requests[forknum], requests_in_progress);
+
+			/* Loop until all requests have been handled. */
+			while (!segnum_vector_empty(requests_in_progress))
+			{
+				SegmentNumber	segno = *segnum_vector_back(requests_in_progress);
+
+				INSTR_TIME_SET_CURRENT(sync_start);
+
+				reln = smgropen(entry->rnode, InvalidBackendId);
+				if (!smgrimmedsync(reln, forknum, segno))
+				{
+					/*
+					 * The underlying file couldn't be found.  Check if a
+					 * later message in the queue reports that it has been
+					 * unlinked; if so it will be removed from the vector,
+					 * indicating that we can safely skip it.
+					 */
+					AbsorbFsyncRequests();
+					if (!segnum_array_binary_search(segnum_vector_begin(requests_in_progress),
+													segnum_vector_end(requests_in_progress),
+													&segno))
+						continue;
+
+					/* Otherwise it's an unexpectedly missing file. */
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not open backing file to fsync: %u/%u/%u",
+									entry->rnode.dbNode,
+									entry->rnode.relNode,
+									segno)));
+				}
+
+				/* Success; update statistics about sync timing */
+				INSTR_TIME_SET_CURRENT(sync_end);
+				sync_diff = sync_end;
+				INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+				elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+				if (elapsed > longest)
+					longest = elapsed;
+				total_elapsed += elapsed;
+				processed++;
+
+				/* Remove this segment number. */
+				Assert(segno == *segnum_vector_back(requests_in_progress));
+				segnum_vector_pop_back(requests_in_progress);
+
+				if (log_checkpoints)
+					ereport(DEBUG1,
+							(errmsg("checkpoint sync: number=%d db=%u rel=%u seg=%u time=%.3f msec",
+									processed,
+									entry->rnode.dbNode,
+									entry->rnode.relNode,
+									segno,
+									(double) elapsed / 1000),
+							 errhidestmt(true),
+							 errhidecontext(true)));
+			}
+		}
+
+		/*
+		 * We've finished everything that was requested before we started to
+		 * scan the entry.  If no new requests have been inserted meanwhile,
+		 * remove the entry.  Otherwise, update its cycle counter, as all the
+		 * requests now in it must have arrived during this cycle.
+		 */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			Assert(segnum_vector_empty(&entry->requests_in_progress[forknum]));
+			if (!segnum_vector_empty(&entry->requests[forknum]))
+				break;
+			segnum_vector_reset(&entry->requests[forknum]);
+		}
+		if (forknum <= MAX_FORKNUM)
+			entry->cycle_ctr = sync_cycle_ctr;
+		else
+		{
+			/* Okay to remove it */
+			if (hash_search(pendingFsyncTable, &entry->rnode,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
+		}
+	}							/* end loop over hashtable entries */
+
+	/* Maintain sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+	/* Flag successful completion of smgrsync */
+	sync_in_progress = false;
+}
+
+/*
+ * Do post-checkpoint work.
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+smgrpostckpt(void)
+{
+	int			absorb_counter;
+
+	absorb_counter = UNLINKS_PER_ABSORB;
+	while (pendingUnlinks != NIL)
+	{
+		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+		char	   *path;
+
+		/*
+		 * New entries are appended to the end, so if the entry is new we've
+		 * reached the end of old entries.
+		 *
+		 * Note: if just the right number of consecutive checkpoints fail, we
+		 * could be fooled here by cycle_ctr wraparound.  However, the only
+		 * consequence is that we'd delay unlinking for one more checkpoint,
+		 * which is perfectly tolerable.
+		 */
+		if (entry->cycle_ctr == ckpt_cycle_ctr)
+			break;
+
+		/* Unlink the file */
+		path = relpathperm(entry->rnode, MAIN_FORKNUM);
+		if (unlink(path) < 0)
+		{
+			/*
+			 * There's a race condition, when the database is dropped at the
+			 * same time that we process the pending unlink requests. If the
+			 * DROP DATABASE deletes the file before we do, we will get ENOENT
+			 * here. rmtree() also has to ignore ENOENT errors, to deal with
+			 * the possibility that we delete the file first.
+			 */
+			if (errno != ENOENT)
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not remove file \"%s\": %m", path)));
+		}
+		pfree(path);
+
+		/* And remove the list entry */
+		pendingUnlinks = list_delete_first(pendingUnlinks);
+		pfree(entry);
+
+		/*
+		 * As in smgrsync, we don't want to stop absorbing fsync requests for a
+		 * long time when there are many deletions to be done.  We can safely
+		 * call AbsorbFsyncRequests() at this point in the loop (note it might
+		 * try to delete list entries).
+		 */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = UNLINKS_PER_ABSORB;
+		}
+	}
+}
+
+
+/*
+ * Mark a file as needing fsync.
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * smgrsync to process later.  Otherwise, try to pass off the fsync request to
+ * the checkpointer process.
+ *
+ * Returns true on success, but false if the queue was full and we couldn't
+ * pass the request to the checkpointer, meaning that the caller must
+ * perform the fsync.
+ */
+bool
+FsyncAtCheckpoint(RelFileNode rnode, ForkNumber forknum, SegmentNumber segno)
+{
+	if (pendingFsyncTable)
+	{
+		RememberFsyncRequest(FSYNC_SEGMENT_REQUEST, rnode, forknum, segno);
+		return true;
+	}
+	else
+		return ForwardFsyncRequest(FSYNC_SEGMENT_REQUEST, rnode, forknum,
+								   segno);
+}
+
+/*
+ * Schedule a file to be deleted after next checkpoint.
+ *
+ * As with FsyncAtCheckpoint, this could involve either a local or a remote
+ * pending-ops table.
+ */
+void
+UnlinkAfterCheckpoint(RelFileNodeBackend rnode)
+{
+	/* Should never be used with temp relations */
+	Assert(!RelFileNodeBackendIsTemp(rnode));
+
+	if (pendingFsyncTable)
+	{
+		/* push it into local pending-ops table */
+		RememberFsyncRequest(UNLINK_RELATION_REQUEST,
+							 rnode.node,
+							 MAIN_FORKNUM,
+							 InvalidSegmentNumber);
+	}
+	else
+	{
+		/*
+		 * Notify the checkpointer about it.  If the request queue is full we
+		 * must sleep and retry, as the forget-request functions below do:
+		 * unlike an fsync request, we cannot fall back to performing the
+		 * work ourselves here, and silently dropping the request would leave
+		 * the file permanently undeleted.
+		 */
+		Assert(IsUnderPostmaster);
+
+		while (!ForwardFsyncRequest(UNLINK_RELATION_REQUEST,
+									rnode.node,
+									MAIN_FORKNUM,
+									InvalidSegmentNumber))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+	}
+}
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingFsyncTable during initialization of the startup
+ * process.  Calling this function drops the local pendingFsyncTable so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+SetForwardFsyncRequests(void)
+{
+	if (pendingFsyncTable != NULL)
+	{
+		/* Execute any fsyncs queued up so far, then discard the table. */
+		smgrsync();
+		hash_destroy(pendingFsyncTable);
+		pendingFsyncTable = NULL;
+	}
+
+	/*
+	 * We should not have any pending unlink requests, since mdunlink doesn't
+	 * queue unlink requests when isRedo.
+	 */
+	Assert(pendingUnlinks == NIL);
+}
+
+/*
+ * Remove a segment number from a sorted vector, if present, locating it by
+ * binary search.
+ */
+static inline void
+delete_segno(segnum_vector *vec, SegmentNumber segno)
+{
+	SegmentNumber *slot;
+
+	slot = segnum_array_lower_bound(segnum_vector_begin(vec),
+									segnum_vector_end(vec),
+									&segno);
+
+	/* lower_bound gives the first element >= segno; check for exact hit */
+	if (slot == segnum_vector_end(vec) || *slot != segno)
+		return;
+
+	segnum_vector_erase(vec, slot);
+}
+
+/*
+ * Add a segment number by binary search.  Hopefully these tend to be added at
+ * the high end, which is cheap.
+ */
+static inline void
+insert_segno(segnum_vector *vec, SegmentNumber segno)
+{
+	segnum_vector_insert(vec,
+						 segnum_array_lower_bound(segnum_vector_begin(vec),
+												  segnum_vector_end(vec),
+												  &segno),
+						 &segno);
+}
+
+/*
+ * RememberFsyncRequest() -- callback from checkpointer side of fsync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint.  UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * Valid values for 'type':
+ * - FSYNC_SEGMENT_REQUEST means to schedule an fsync
+ * - FORGET_SEGMENT_FSYNC means to cancel pending fsyncs for one segment;
+ *	 forknum must identify a valid fork
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
+ *	 either for one fork, or all forks if forknum is InvalidForkNumber
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs and unlinks for a
+ *	 whole database
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
+ *	 checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
+ *
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
+ */
+void
+RememberFsyncRequest(int type, RelFileNode rnode, ForkNumber forknum,
+					 SegmentNumber segno)
+{
+	Assert(pendingFsyncTable);
+
+	if (type == FORGET_SEGMENT_FSYNC || type == FORGET_RELATION_FSYNC)
+	{
+		PendingFsyncEntry *entry;
+
+		entry = hash_search(pendingFsyncTable, &rnode, HASH_FIND, NULL);
+		if (entry)
+		{
+			if (type == FORGET_SEGMENT_FSYNC)
+			{
+				/* forknum must be valid here; no InvalidForkNumber case */
+				delete_segno(&entry->requests[forknum], segno);
+				delete_segno(&entry->requests_in_progress[forknum], segno);
+			}
+			else if (forknum == InvalidForkNumber)
+			{
+				/* Remove requests for all forks. */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					segnum_vector_reset(&entry->requests[forknum]);
+					segnum_vector_reset(&entry->requests_in_progress[forknum]);
+				}
+			}
+			else
+			{
+				/* Forget about all segments for one fork. */
+				segnum_vector_reset(&entry->requests[forknum]);
+				segnum_vector_reset(&entry->requests_in_progress[forknum]);
+			}
+		}
+	}
+	else if (type == FORGET_DATABASE_FSYNC)
+	{
+		HASH_SEQ_STATUS hstat;
+		PendingFsyncEntry *entry;
+		ListCell   *cell,
+				   *next,
+				   *prev;
+
+		/* Remove fsync requests */
+		hash_seq_init(&hstat, pendingFsyncTable);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (rnode.dbNode == entry->rnode.dbNode)
+			{
+				/* Remove requests for all forks. */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					segnum_vector_reset(&entry->requests[forknum]);
+					segnum_vector_reset(&entry->requests_in_progress[forknum]);
+				}
+			}
+		}
+
+		/*
+		 * Remove unlink requests, too.  (Formerly this was guarded by a
+		 * comparison of segno against the request-type constant, a leftover
+		 * from when request types were encoded in segno; since callers pass
+		 * InvalidSegmentNumber it never matched, so unlink requests for the
+		 * dropped database were never discarded.)
+		 */
+		prev = NULL;
+		for (cell = list_head(pendingUnlinks); cell; cell = next)
+		{
+			PendingUnlinkEntry *unlink_entry =
+				(PendingUnlinkEntry *) lfirst(cell);
+
+			next = lnext(cell);
+			if (rnode.dbNode == unlink_entry->rnode.dbNode)
+			{
+				pendingUnlinks = list_delete_cell(pendingUnlinks, cell,
+												  prev);
+				pfree(unlink_entry);
+			}
+			else
+				prev = cell;
+		}
+	}
+	else if (type == UNLINK_RELATION_REQUEST)
+	{
+		/* Unlink request: put it in the linked list */
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+		PendingUnlinkEntry *entry;
+
+		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+		Assert(forknum == MAIN_FORKNUM);
+
+		entry = palloc(sizeof(PendingUnlinkEntry));
+		entry->rnode = rnode;
+		entry->cycle_ctr = ckpt_cycle_ctr;
+
+		pendingUnlinks = lappend(pendingUnlinks, entry);
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+	else
+	{
+		/* Normal case: enter a request to fsync this segment */
+		PendingFsyncEntry *entry;
+		bool		found;
+
+		entry = (PendingFsyncEntry *) hash_search(pendingFsyncTable,
+												  &rnode,
+												  HASH_ENTER,
+												  &found);
+		/* if new entry, initialize it */
+		if (!found)
+		{
+			ForkNumber	f;
+
+			entry->cycle_ctr = ckpt_cycle_ctr;
+			for (f = 0; f <= MAX_FORKNUM; f++)
+			{
+				segnum_vector_init(&entry->requests[f]);
+				segnum_vector_init(&entry->requests_in_progress[f]);
+			}
+		}
+
+		/*
+		 * NB: it's intentional that we don't change cycle_ctr if the entry
+		 * already exists.  The cycle_ctr must represent the oldest fsync
+		 * request that could be in the entry.
+		 */
+
+		insert_segno(&entry->requests[forknum], segno);
+	}
+}
+
+/*
+ * ForgetSegmentFsyncRequests -- forget any fsyncs for one segment of a
+ * relation fork
+ */
+void
+ForgetSegmentFsyncRequests(RelFileNode rnode, ForkNumber forknum,
+						   SegmentNumber segno)
+{
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_SEGMENT_FSYNC, rnode, forknum, segno);
+		return;
+	}
+
+	if (!IsUnderPostmaster)
+		return;
+
+	/*
+	 * Retry until the checkpointer accepts the message.  We don't wait for
+	 * it to actually absorb the cancel; see smgrsync() for the implications
+	 * of that.
+	 */
+	while (!ForwardFsyncRequest(FORGET_SEGMENT_FSYNC, rnode, forknum, segno))
+		pg_usleep(10000L);	/* 10 msec seems a good number */
+}
+
+/*
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
+ *
+ * forknum == InvalidForkNumber means all forks, although this code doesn't
+ * actually know that, since it's just forwarding the request elsewhere.
+ */
+void
+ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
+{
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_RELATION_FSYNC, rnode, forknum,
+							 InvalidSegmentNumber);
+		return;
+	}
+
+	if (!IsUnderPostmaster)
+		return;
+
+	/*
+	 * Retry until the checkpointer accepts the message.  We don't wait for
+	 * it to actually absorb the cancel; see smgrsync() for the implications
+	 * of that.
+	 */
+	while (!ForwardFsyncRequest(FORGET_RELATION_FSYNC, rnode, forknum,
+								InvalidSegmentNumber))
+		pg_usleep(10000L);	/* 10 msec seems a good number */
+}
+
+/*
+ * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ */
+void
+ForgetDatabaseFsyncRequests(Oid dbid)
+{
+	RelFileNode rnode;
+
+	/* Only dbNode is meaningful for a whole-database request. */
+	rnode.spcNode = 0;
+	rnode.dbNode = dbid;
+	rnode.relNode = 0;
+
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_DATABASE_FSYNC, rnode, 0,
+							 InvalidSegmentNumber);
+		return;
+	}
+
+	if (!IsUnderPostmaster)
+		return;
+
+	/* see notes in ForgetRelationFsyncRequests */
+	while (!ForwardFsyncRequest(FORGET_DATABASE_FSYNC, rnode, 0,
+								InvalidSegmentNumber))
+		pg_usleep(10000L);	/* 10 msec seems a good number */
+}
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 970c94ee805..32bc91102d7 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -59,7 +59,7 @@
 #include "commands/view.h"
 #include "miscadmin.h"
 #include "parser/parse_utilcmd.h"
-#include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "rewrite/rewriteDefine.h"
 #include "rewrite/rewriteRemove.h"
 #include "storage/fd.h"
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6fe19398812..db23de3a131 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -60,6 +60,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
 #include "postmaster/walwriter.h"
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 941c6aba7d1..137c748dfaf 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -1,10 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * bgwriter.h
- *	  Exports from postmaster/bgwriter.c and postmaster/checkpointer.c.
- *
- * The bgwriter process used to handle checkpointing duties too.  Now
- * there is a separate process, but we did not bother to split this header.
+ *	  Exports from postmaster/bgwriter.c.
  *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  *
@@ -15,29 +12,10 @@
 #ifndef _BGWRITER_H
 #define _BGWRITER_H
 
-#include "storage/block.h"
-#include "storage/relfilenode.h"
-
-
 /* GUC options */
 extern int	BgWriterDelay;
-extern int	CheckPointTimeout;
-extern int	CheckPointWarning;
-extern double CheckPointCompletionTarget;
 
 extern void BackgroundWriterMain(void) pg_attribute_noreturn();
-extern void CheckpointerMain(void) pg_attribute_noreturn();
-
-extern void RequestCheckpoint(int flags);
-extern void CheckpointWriteDelay(int flags, double progress);
-
-extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					BlockNumber segno);
-extern void AbsorbFsyncRequests(void);
-
-extern Size CheckpointerShmemSize(void);
-extern void CheckpointerShmemInit(void);
 
-extern bool FirstCallSinceLastCheckpoint(void);
 
 #endif							/* _BGWRITER_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index c843bbc9692..61fe0276f74 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -18,6 +18,15 @@
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
+/*
+ * The type used to identify segment numbers.  Generally, segments are an
+ * internal detail of individual storage manager implementations, but since
+ * they appear in various places to allow them to be passed between processes,
+ * it seemed worthwhile to have a typename.
+ */
+typedef uint32 SegmentNumber;
+
+#define InvalidSegmentNumber ((SegmentNumber) 0xFFFFFFFF)
 
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
@@ -105,10 +114,9 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
-extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void smgrpreckpt(void);
-extern void smgrsync(void);
-extern void smgrpostckpt(void);
+extern bool smgrimmedsync(SMgrRelation reln, ForkNumber forknum,
+						  SegmentNumber segno);
+
 extern void AtEOXact_SMgr(void);
 
 
@@ -133,16 +141,9 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
-extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void mdpreckpt(void);
-extern void mdsync(void);
-extern void mdpostckpt(void);
-
-extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					 BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
-extern void ForgetDatabaseFsyncRequests(Oid dbid);
+extern bool mdimmedsync(SMgrRelation reln, ForkNumber forknum,
+						SegmentNumber segno);
+
 extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
 
 #endif							/* SMGR_H */
diff --git a/src/include/storage/smgrsync.h b/src/include/storage/smgrsync.h
new file mode 100644
index 00000000000..8ef7093f801
--- /dev/null
+++ b/src/include/storage/smgrsync.h
@@ -0,0 +1,36 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrsync.h
+ *	  management of file synchronization
+ *
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/smgrsync.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SMGRSYNC_H
+#define SMGRSYNC_H
+
+#include "storage/smgr.h"
+
+extern void smgrsync_init(void);
+extern void smgrpreckpt(void);
+extern void smgrsync(void);
+extern void smgrpostckpt(void);
+
+extern void UnlinkAfterCheckpoint(RelFileNodeBackend rnode);
+extern bool FsyncAtCheckpoint(RelFileNode rnode, ForkNumber forknum,
+							  SegmentNumber segno);
+extern void RememberFsyncRequest(int type, RelFileNode rnode,
+								 ForkNumber forknum, SegmentNumber segno);
+extern void SetForwardFsyncRequests(void);
+extern void ForgetSegmentFsyncRequests(RelFileNode rnode, ForkNumber forknum,
+									   SegmentNumber segno);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
+extern void ForgetDatabaseFsyncRequests(Oid dbid);
+
+
+#endif
-- 
2.19.1

