At Mon, 17 Aug 2020 11:22:15 -0700, Andres Freund <and...@anarazel.de> wrote in 
> Hi,
> 
> On 2020-08-17 14:05:37 +0300, Heikki Linnakangas wrote:
> > On 14/04/2020 22:04, Teja Mupparti wrote:
> > > Thanks Kyotaro and Masahiko for the feedback. I think there is a
> > > consensus on the critical-section around truncate,
> > 
> > +1
> 
> I'm inclined to think that we should do that independent of the far more
> complicated fix for other related issues.
...
> > Perhaps a better approach would be to prevent the checkpoint from
> > completing, until all in-progress truncations have completed. We have a
> > mechanism to wait out in-progress commits at the beginning of a checkpoint,
> > right after the redo point has been established. See comments around the
> > GetVirtualXIDsDelayingChkpt() function call in CreateCheckPoint(). We could
> > have a similar mechanism to wait out the truncations before *completing* a
> > checkpoint.
> 
> What I outlined earlier *is* essentially a way to do so, by preventing
> checkpointing from finishing the buffer scan while a dangerous state
> exists.

Seems reasonable. The attached does that. It actually works for the
initial case.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 1233448481..e1d3068f14 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -3062,8 +3062,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	 * crash/basebackup, even though the state of the data directory would
 	 * require it.
 	 */
-	Assert(!MyProc->delayChkpt);
-	MyProc->delayChkpt = true;
+	Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
+	MyProc->delayChkpt = DELAY_CHKPT_START;
 
 	/* WAL log truncation */
 	WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3089,7 +3089,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	/* Then offsets */
 	PerformOffsetsTruncation(oldestMulti, newOldestMulti);
 
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 
 	END_CRIT_SECTION();
 	LWLockRelease(MultiXactTruncationLock);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index fc18b77832..1b74a229d6 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -463,7 +463,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
 	proc->lxid = (LocalTransactionId) xid;
 	proc->xid = xid;
 	Assert(proc->xmin == InvalidTransactionId);
-	proc->delayChkpt = false;
+	proc->delayChkpt = DELAY_CHKPT_NONE;
 	proc->statusFlags = 0;
 	proc->pid = 0;
 	proc->backendId = InvalidBackendId;
@@ -1108,7 +1108,8 @@ EndPrepare(GlobalTransaction gxact)
 
 	START_CRIT_SECTION();
 
-	MyProc->delayChkpt = true;
+	Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
+	MyProc->delayChkpt = DELAY_CHKPT_START;
 
 	XLogBeginInsert();
 	for (record = records.head; record != NULL; record = record->next)
@@ -1151,7 +1152,7 @@ EndPrepare(GlobalTransaction gxact)
 	 * checkpoint starting after this will certainly see the gxact as a
 	 * candidate for fsyncing.
 	 */
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 
 	/*
 	 * Remember that we have this GlobalTransaction entry locked for us.  If
@@ -2199,7 +2200,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	START_CRIT_SECTION();
 
 	/* See notes in RecordTransactionCommit */
-	MyProc->delayChkpt = true;
+	Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
+	MyProc->delayChkpt = DELAY_CHKPT_START;
 
 	/*
 	 * Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2247,7 +2249,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	TransactionIdCommitTree(xid, nchildren, children);
 
 	/* Checkpoint can proceed now */
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index a2068e3fd4..000206e506 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1334,8 +1334,9 @@ RecordTransactionCommit(void)
 		 * This makes checkpoint's determination of which xacts are delayChkpt
 		 * a bit fuzzy, but it doesn't matter.
 		 */
+		Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
 		START_CRIT_SECTION();
-		MyProc->delayChkpt = true;
+		MyProc->delayChkpt = DELAY_CHKPT_START;
 
 		SetCurrentTransactionStopTimestamp();
 
@@ -1436,7 +1437,7 @@ RecordTransactionCommit(void)
 	 */
 	if (markXidCommitted)
 	{
-		MyProc->delayChkpt = false;
+		MyProc->delayChkpt = DELAY_CHKPT_NONE;
 		END_CRIT_SECTION();
 	}
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ede93ad7fd..5b0a653408 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9024,18 +9024,30 @@ CreateCheckPoint(int flags)
 	 * and we will correctly flush the update below.  So we cannot miss any
 	 * xacts we need to wait for.
 	 */
-	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
 	if (nvxids > 0)
 	{
 		do
 		{
 			pg_usleep(10000L);	/* wait for 10 msec */
-		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+											  DELAY_CHKPT_START));
 	}
 	pfree(vxids);
 
 	CheckPointGuts(checkPoint.redo, flags);
 
+	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
+	if (nvxids > 0)
+	{
+		do
+		{
+			pg_usleep(10000L);	/* wait for 10 msec */
+		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+											  DELAY_CHKPT_COMPLETE));
+	}
+	pfree(vxids);
+
 	/*
 	 * Take a snapshot of running transactions and write this to WAL. This
 	 * allows us to reconstruct the state of running transactions during
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 7052dc245e..51f2581c06 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -923,7 +923,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 	/*
 	 * Ensure no checkpoint can change our view of RedoRecPtr.
 	 */
-	Assert(MyProc->delayChkpt);
+	Assert(MyProc->delayChkpt == DELAY_CHKPT_START);
 
 	/*
 	 * Update RedoRecPtr so that we can make the right decision
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index cba7a9ada0..b75ea97b04 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -325,6 +325,16 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 
 	RelationPreTruncate(rel);
 
+	/*
+	 * If the file truncation fails but the concurrent checkpoint completes
+	 * just before that, the next crash recovery can fail due to WAL records
+	 * inconsistent with the untruncated pages. To avoid that situation we
+	 * delay the checkpoint completion until we confirm the truncation to be
+	 * successful.
+	 */
+	Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
+	MyProc->delayChkpt = DELAY_CHKPT_COMPLETE;
+
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
 	 * trouble if the truncation fails. If we then crash, the WAL replay
@@ -373,6 +383,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	 */
 	if (need_fsm_vacuum)
 		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
+
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8f2c482bc8..52d70019c4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3615,7 +3615,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	{
 		XLogRecPtr	lsn = InvalidXLogRecPtr;
 		bool		dirtied = false;
-		bool		delayChkpt = false;
+		int			delayChkpt = DELAY_CHKPT_NONE;
 		uint32		buf_state;
 
 		/*
@@ -3665,7 +3665,8 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			 * essential that CreateCheckpoint waits for virtual transactions
 			 * rather than full transactionids.
 			 */
-			MyProc->delayChkpt = delayChkpt = true;
+			Assert(MyProc->delayChkpt == DELAY_CHKPT_NONE);
+			MyProc->delayChkpt = delayChkpt = DELAY_CHKPT_START;
 			lsn = XLogSaveBufferForHint(buffer, buffer_std);
 		}
 
@@ -3698,7 +3699,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		UnlockBufHdr(bufHdr, buf_state);
 
 		if (delayChkpt)
-			MyProc->delayChkpt = false;
+			MyProc->delayChkpt = DELAY_CHKPT_NONE;
 
 		if (dirtied)
 		{
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index cf12eda504..20757274ec 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -655,7 +655,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 
 		proc->lxid = InvalidLocalTransactionId;
 		proc->xmin = InvalidTransactionId;
-		proc->delayChkpt = false;	/* be sure this is cleared in abort */
+
+		/* be sure this is cleared in abort */
+		proc->delayChkpt = DELAY_CHKPT_NONE;
+
 		proc->recoveryConflictPending = false;
 
 		/* must be cleared with xid/xmin: */
@@ -694,7 +697,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	proc->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
-	proc->delayChkpt = false;	/* be sure this is cleared in abort */
+
+	/* be sure this is cleared in abort */
+	proc->delayChkpt = DELAY_CHKPT_NONE;
+
 	proc->recoveryConflictPending = false;
 
 	/* must be cleared with xid/xmin: */
@@ -2929,7 +2935,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * delaying checkpoint because they have critical actions in progress.
  *
  * Constructs an array of VXIDs of transactions that are currently in commit
- * critical sections, as shown by having delayChkpt set in their PGPROC.
+ * critical sections, as shown by having delayChkpt set to the specified value
+ * in their PGPROC.
  *
  * Returns a palloc'd array that should be freed by the caller.
  * *nvxids is the number of valid entries.
@@ -2943,13 +2950,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * for clearing of delayChkpt to propagate is unimportant for correctness.
  */
 VirtualTransactionId *
-GetVirtualXIDsDelayingChkpt(int *nvxids)
+GetVirtualXIDsDelayingChkpt(int *nvxids, DelayChkptType type)
 {
 	VirtualTransactionId *vxids;
 	ProcArrayStruct *arrayP = procArray;
 	int			count = 0;
 	int			index;
 
+	Assert(type != DELAY_CHKPT_NONE);
+
 	/* allocate what's certainly enough result space */
 	vxids = (VirtualTransactionId *)
 		palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -2961,7 +2970,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
 		int			pgprocno = arrayP->pgprocnos[index];
 		PGPROC	   *proc = &allProcs[pgprocno];
 
-		if (proc->delayChkpt)
+		if (proc->delayChkpt == type)
 		{
 			VirtualTransactionId vxid;
 
@@ -2987,12 +2996,15 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
  * those numbers should be small enough for it not to be a problem.
  */
 bool
-HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids,
+							 DelayChkptType type)
 {
 	bool		result = false;
 	ProcArrayStruct *arrayP = procArray;
 	int			index;
 
+	Assert(type != DELAY_CHKPT_NONE);
+
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
 	for (index = 0; index < arrayP->numProcs; index++)
@@ -3003,7 +3015,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
 
 		GET_VXID_FROM_PGPROC(vxid, *proc);
 
-		if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
+		if (proc->delayChkpt == type &&	VirtualTransactionIdIsValid(vxid))
 		{
 			int			i;
 
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 9b6aa2fe0d..b7f9310afe 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -392,7 +392,7 @@ InitProcess(void)
 	MyProc->roleId = InvalidOid;
 	MyProc->tempNamespaceId = InvalidOid;
 	MyProc->isBackgroundWorker = IsBackgroundWorker;
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 	MyProc->statusFlags = 0;
 	/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
 	if (IsAutoVacuumWorkerProcess())
@@ -573,7 +573,7 @@ InitAuxiliaryProcess(void)
 	MyProc->roleId = InvalidOid;
 	MyProc->tempNamespaceId = InvalidOid;
 	MyProc->isBackgroundWorker = IsBackgroundWorker;
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt = DELAY_CHKPT_NONE;
 	MyProc->statusFlags = 0;
 	MyProc->lwWaiting = false;
 	MyProc->lwWaitMode = 0;
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 989c5849d4..ca764a1a72 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -78,6 +78,14 @@ struct XidCache
  */
 #define INVALID_PGPROCNO		PG_INT32_MAX
 
+/* type for PGPROC.delayChkpt */
+typedef enum DelayChkptType
+{
+	DELAY_CHKPT_NONE = 0,
+	DELAY_CHKPT_START,
+	DELAY_CHKPT_COMPLETE
+} DelayChkptType;
+
 typedef enum
 {
 	PROC_WAIT_STATUS_OK,
@@ -181,7 +189,8 @@ struct PGPROC
 	LOCKMASK	heldLocks;		/* bitmask for lock types already held on this
 								 * lock object by this backend */
 
-	bool		delayChkpt;		/* true if this proc delays checkpoint start */
+	DelayChkptType	delayChkpt;	/* if this proc delays checkpoint start and/or
+								 * completion.  */
 
 	uint8		statusFlags;	/* this backend's status flags, see PROC_*
 								 * above. mirrored in
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index b01fa52139..e560cada2b 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -15,11 +15,11 @@
 #define PROCARRAY_H
 
 #include "storage/lock.h"
+#include "storage/proc.h"
 #include "storage/standby.h"
 #include "utils/relcache.h"
 #include "utils/snapshot.h"
 
-
 extern Size ProcArrayShmemSize(void);
 extern void CreateSharedProcArray(void);
 extern void ProcArrayAdd(PGPROC *proc);
@@ -59,8 +59,10 @@ extern TransactionId GetOldestActiveTransactionId(void);
 extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
 extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
 
-extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
-extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids,
+														 DelayChkptType type);
+extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
+										 int nvxids, DelayChkptType type);
 
 extern PGPROC *BackendPidGetProc(int pid);
 extern PGPROC *BackendPidGetProcWithLock(int pid);

Reply via email to