On 26/06/2019 06:07, Thomas Munro wrote:
On Wed, Jun 26, 2019 at 2:33 PM Michael Paquier <mich...@paquier.xyz> wrote:
On Tue, Jun 25, 2019 at 02:38:43PM +0500, Andrey Borodin wrote:
I feel a little uncomfortable with number 0x7fffffff right in code.

PG_INT32_MAX...

MaxTransactionId / 2?

Yeah, that makes sense.

Here's a new version of the patches. Changes:

* I changed the reuse-logging so that we always write a reuse WAL record, even if the deleteXid is very old. I tried to avoid that with the check for MaxTransactionId / 2 or 0x7fffffff, but it had some problems. In the previous patch version, it wasn't just an optimization. Without the check, we would write 32-bit XIDs to the log that had already wrapped around, and that caused the standby to unnecessarily wait for or kill backends. But checking for MaxTransaction / 2 isn't quite enough: there was a small chance that the next XID counter advanced just after checking for that, so that we still logged an XID that had just wrapped around. A more robust way to deal with this is to log a full 64-bit XID, and check for wraparound at redo in the standby. And if we do that, trying to optimize this in the master doesn't seem that important anymore. So in this patch version, we always log the 64-bit XID, and check for the MaxTransaction / 2 when replaying the WAL record instead.

* I moved the logic to extend a 32-bit XID to 64-bits to a new helper function in varsup.c.

* Instead of storing just a naked FullTransactionId in the "page contents" of a deleted page, I created a new struct for that. The effect is the same, but I think the struct clarifies what's happening, and it's a good location to attach a comment explaining it.

* Fixed the mixup between < and >

I haven't done any testing on this yet. Andrey, would you happen to have an environment ready to test this?

- Heikki
>From 7fd5901e15ac9e0f1928eeecbb9ae1212bacf3f3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 4 Apr 2019 18:06:48 +0300
Subject: [PATCH 1/2] Refactor checks for deleted GiST pages.

The explicit check in gistScanPage() isn't currently really necessary, as
a deleted page is always empty, so the loop would fall through without
doing anything, anyway. But it's a marginal optimization, and it gives a
nice place to attach a comment to explain how it works.
---
 src/backend/access/gist/gist.c    | 40 ++++++++++++-------------------
 src/backend/access/gist/gistget.c | 14 +++++++++++
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 470b121e7da..79030e77153 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -709,14 +709,15 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 			continue;
 		}
 
-		if (stack->blkno != GIST_ROOT_BLKNO &&
-			stack->parent->lsn < GistPageGetNSN(stack->page))
+		if ((stack->blkno != GIST_ROOT_BLKNO &&
+			 stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+			GistPageIsDeleted(stack->page))
 		{
 			/*
-			 * Concurrent split detected. There's no guarantee that the
-			 * downlink for this page is consistent with the tuple we're
-			 * inserting anymore, so go back to parent and rechoose the best
-			 * child.
+			 * Concurrent split or page deletion detected. There's no
+			 * guarantee that the downlink for this page is consistent with
+			 * the tuple we're inserting anymore, so go back to parent and
+			 * rechoose the best child.
 			 */
 			UnlockReleaseBuffer(stack->buffer);
 			xlocked = false;
@@ -735,9 +736,6 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
-			/* currently, internal pages are never deleted */
-			Assert(!GistPageIsDeleted(stack->page));
-
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -858,12 +856,13 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 					 * leaf/inner is enough to recognize split for root
 					 */
 				}
-				else if (GistFollowRight(stack->page) ||
-						 stack->parent->lsn < GistPageGetNSN(stack->page))
+				else if ((GistFollowRight(stack->page) ||
+						  stack->parent->lsn < GistPageGetNSN(stack->page)) &&
+						 GistPageIsDeleted(stack->page))
 				{
 					/*
-					 * The page was split while we momentarily unlocked the
-					 * page. Go back to parent.
+					 * The page was split or deleted while we momentarily
+					 * unlocked the page. Go back to parent.
 					 */
 					UnlockReleaseBuffer(stack->buffer);
 					xlocked = false;
@@ -872,18 +871,6 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 				}
 			}
 
-			/*
-			 * The page might have been deleted after we scanned the parent
-			 * and saw the downlink.
-			 */
-			if (GistPageIsDeleted(stack->page))
-			{
-				UnlockReleaseBuffer(stack->buffer);
-				xlocked = false;
-				state.stack = stack = stack->parent;
-				continue;
-			}
-
 			/* now state.stack->(page, buffer and blkno) points to leaf page */
 
 			gistinserttuple(&state, stack, giststate, itup,
@@ -947,6 +934,9 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
 			break;
 		}
 
+		/* currently, internal pages are never deleted */
+		Assert(!GistPageIsDeleted(page));
+
 		top->lsn = BufferGetLSNAtomic(buffer);
 
 		/*
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 8108fbb7d8e..77ae2fb3395 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -377,6 +377,20 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
 		MemoryContextSwitchTo(oldcxt);
 	}
 
+	/*
+	 * Check if the page was deleted after we saw the downlink. There's
+	 * nothing of interest on a deleted page. Note that we must do this
+	 * after checking the NSN for concurrent splits! It's possible that
+	 * the page originally contained some tuples that are visible to use,
+	 * but was split so that all the visible tuples were moved to another
+	 * page, and then this page was deleted.
+	 */
+	if (GistPageIsDeleted(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		return;
+	}
+
 	so->nPageData = so->curPageData = 0;
 	scan->xs_hitup = NULL;		/* might point into pageDataCxt */
 	if (so->pageDataCxt)
-- 
2.20.1

>From 0cc869d0f5e7ef33f0ccdcd2ccbd8f89d5711590 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 27 Jun 2019 14:37:56 +0300
Subject: [PATCH 2/2] Use full 64-bit XID for checking if a deleted GiST page
 is old enough.

Otherwise, after a deleted page gets even older, it becomes unrecyclable
again. B-tree has the same problem, and has had since time immemorial,
but let's at least fix this in GiST, where this is new.
---
 src/backend/access/gist/gistutil.c     | 26 +++++++++++++++--
 src/backend/access/gist/gistvacuum.c   |  4 +--
 src/backend/access/gist/gistxlog.c     | 29 +++++++++++++++----
 src/backend/access/rmgrdesc/gistdesc.c | 11 +++++---
 src/backend/access/transam/varsup.c    | 39 ++++++++++++++++++++++++++
 src/include/access/gist.h              | 35 +++++++++++++++++++++--
 src/include/access/gist_private.h      |  4 +--
 src/include/access/gistxlog.h          |  6 ++--
 src/include/access/transam.h           |  1 +
 9 files changed, 133 insertions(+), 22 deletions(-)

diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 49df05653b3..3a09197a242 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -882,9 +882,29 @@ gistNewBuffer(Relation r)
 bool
 gistPageRecyclable(Page page)
 {
-	return PageIsNew(page) ||
-		(GistPageIsDeleted(page) &&
-		 TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalXmin));
+	if (PageIsNew(page))
+		return true;
+	if (GistPageIsDeleted(page))
+	{
+		/*
+		 * The page was deleted, but when? If it was just deleted, a scan
+		 * might have seen the downlink to it, and will read the page later.
+		 * As long as that can happen, we must keep the deleted page around as
+		 * a tombstone.
+		 *
+		 * Compare the deletion XID with RecentGlobalXmin. If deleteXid <
+		 * RecentGlobalXmin, then no scan that's still in progress could have
+		 * seen its downlink, and we can recycle it.
+		 */
+		FullTransactionId deletexid_full = GistPageGetDeleteXid(page);
+		FullTransactionId recentxmin_full;
+
+		recentxmin_full = FullTransactionIdFromRecentXid(RecentGlobalXmin);
+
+		if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full))
+			return true;
+	}
+	return false;
 }
 
 bytea *
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 4270226eee2..ad1576da5b8 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -595,7 +595,7 @@ gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
 	ItemId		iid;
 	IndexTuple	idxtuple;
 	XLogRecPtr	recptr;
-	TransactionId txid;
+	FullTransactionId txid;
 
 	/*
 	 * Check that the leaf is still empty and deletable.
@@ -648,7 +648,7 @@ gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
 	 * currently in progress must have ended.  (That's much more conservative
 	 * than needed, but let's keep it safe and simple.)
 	 */
-	txid = ReadNewTransactionId();
+	txid = ReadNextFullTransactionId();
 
 	START_CRIT_SECTION();
 
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 503db34d863..9fc7282200d 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -396,8 +396,27 @@ gistRedoPageReuse(XLogReaderState *record)
 	 */
 	if (InHotStandby)
 	{
-		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
-											xlrec->node);
+		FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
+		FullTransactionId nextFullXid = ReadNextFullTransactionId();
+		uint64		diff;
+
+		/*
+		 * ResolveRecoveryConflictWithSnapshot operates on 32-bit
+		 * TransactionIds, so truncate the logged FullTransactionId. If the
+		 * logged value is very old, so that XID wrap-around already happened
+		 * on it, there can't be any snapshots that still see it.
+		 */
+		nextFullXid = ReadNextFullTransactionId();
+		diff = U64FromFullTransactionId(nextFullXid) -
+			U64FromFullTransactionId(latestRemovedFullXid);
+		if (diff < MaxTransactionId / 2)
+		{
+			TransactionId latestRemovedXid;
+
+			latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+			ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
+												xlrec->node);
+		}
 	}
 }
 
@@ -554,7 +573,7 @@ gistXLogSplit(bool page_is_leaf,
  * downlink from the parent page.
  */
 XLogRecPtr
-gistXLogPageDelete(Buffer buffer, TransactionId xid,
+gistXLogPageDelete(Buffer buffer, FullTransactionId xid,
 				   Buffer parentBuffer, OffsetNumber downlinkOffset)
 {
 	gistxlogPageDelete xlrec;
@@ -578,7 +597,7 @@ gistXLogPageDelete(Buffer buffer, TransactionId xid,
  * Write XLOG record about reuse of a deleted page.
  */
 void
-gistXLogPageReuse(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid)
 {
 	gistxlogPageReuse xlrec_reuse;
 
@@ -591,7 +610,7 @@ gistXLogPageReuse(Relation rel, BlockNumber blkno, TransactionId latestRemovedXi
 	/* XLOG stuff */
 	xlrec_reuse.node = rel->rd_node;
 	xlrec_reuse.block = blkno;
-	xlrec_reuse.latestRemovedXid = latestRemovedXid;
+	xlrec_reuse.latestRemovedFullXid = latestRemovedXid;
 
 	XLogBeginInsert();
 	XLogRegisterData((char *) &xlrec_reuse, SizeOfGistxlogPageReuse);
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index 767864b58e6..eccb6fd9428 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -26,10 +26,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
 static void
 out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec)
 {
-	appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u",
+	appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u",
 					 xlrec->node.spcNode, xlrec->node.dbNode,
 					 xlrec->node.relNode, xlrec->block,
-					 xlrec->latestRemovedXid);
+					 EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
+					 XidFromFullTransactionId(xlrec->latestRemovedFullXid));
 }
 
 static void
@@ -50,8 +51,10 @@ out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
 static void
 out_gistxlogPageDelete(StringInfo buf, gistxlogPageDelete *xlrec)
 {
-	appendStringInfo(buf, "deleteXid %u; downlink %u",
-					 xlrec->deleteXid, xlrec->downlinkOffset);
+	appendStringInfo(buf, "deleteXid %u:%u; downlink %u",
+					 EpochFromFullTransactionId(xlrec->deleteXid),
+					 XidFromFullTransactionId(xlrec->deleteXid),
+					 xlrec->downlinkOffset);
 }
 
 void
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 5b759ec7f3f..e305a3be028 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -300,6 +300,45 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid)
 	LWLockRelease(XidGenLock);
 }
 
+/*
+ * Extend a 32-bit TransactionId into a 64-bit FullTransactionId.
+ *
+ * This assumes that the xid is "recent", not older than 2 billion XIDs
+ * from the next xid.
+ */
+FullTransactionId
+FullTransactionIdFromRecentXid(TransactionId xid)
+{
+	FullTransactionId nextxid_full;
+	uint32		nextxid_epoch;
+	TransactionId nextxid_xid;
+	uint32		xid_epoch;
+
+	if (TransactionIdIsNormal(xid))
+	{
+		/*
+		 * Compute the epoch of the target xid from the next XID's epoch.
+		 * This assumes that the target XID is within the 2 billion XID
+		 * horizon from the next XID.
+		 */
+		nextxid_full = ReadNextFullTransactionId();
+		nextxid_epoch = EpochFromFullTransactionId(nextxid_full);
+		nextxid_xid = XidFromFullTransactionId(nextxid_full);
+
+		if (xid > nextxid_xid)
+			xid_epoch = nextxid_epoch - 1;
+		else
+			xid_epoch = nextxid_epoch;
+	}
+	else
+	{
+		/* Use epoch 0 for special XIDs. */
+		xid_epoch = 0;
+	}
+
+	return FullTransactionIdFromEpochAndXid(xid_epoch, xid);
+}
+
 /*
  * Advance the cluster-wide value for the oldest valid clog entry.
  *
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 6902f4115b7..14fa9646b23 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -16,6 +16,7 @@
 #ifndef GIST_H
 #define GIST_H
 
+#include "access/transam.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
 #include "storage/block.h"
@@ -158,9 +159,37 @@ typedef struct GISTENTRY
 #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
 #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
 
-/* For deleted pages we store last xid which could see the page in scan */
-#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
-#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
+/*
+ * On a deleted page, we store this struct. A deleted page doesn't contain any
+ * tuples, so we don't use the normal page layout with line pointers. Instead,
+ * this struct is stored right after the standard page header. pd_lower points
+ * to the end of this struct. If we add fields to this struct in the future, we
+ * can distinguish the old and new formats by pd_lower.
+ */
+typedef struct GISTDeletedPageContents
+{
+	/* last xid which could see the page in a scan */
+	FullTransactionId deleteXid;
+} GISTDeletedPageContents;
+
+static inline FullTransactionId
+GistPageGetDeleteXid(Page page)
+{
+	Assert(GistPageIsDeleted(page));
+	Assert(((PageHeader) page)->pd_lower == MAXALIGN(SizeOfPageHeaderData) + sizeof(GISTDeletedPageContents));
+
+	return ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid;
+}
+
+static inline void
+GistPageSetDeleteXid(Page page, FullTransactionId deletexid)
+{
+	Assert(PageIsEmpty(page));
+	((PageHeader) page)->pd_lower = MAXALIGN(SizeOfPageHeaderData) + sizeof(GISTDeletedPageContents);
+
+	((GISTDeletedPageContents *) PageGetContents(page))->deleteXid = deletexid;
+}
 
 /*
  * Vector of GISTENTRY structs; user-defined methods union and picksplit
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index f80694bf9a8..f84ea71ecba 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -426,11 +426,11 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 
 /* gistxlog.c */
 extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
-									 TransactionId xid, Buffer parentBuffer,
+									 FullTransactionId xid, Buffer parentBuffer,
 									 OffsetNumber downlinkOffset);
 
 extern void gistXLogPageReuse(Relation rel, BlockNumber blkno,
-							  TransactionId latestRemovedXid);
+							  FullTransactionId latestRemovedXid);
 
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 								 OffsetNumber *todelete, int ntodelete,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 969a5376b5e..e44922d915c 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -83,7 +83,7 @@ typedef struct gistxlogPageSplit
  */
 typedef struct gistxlogPageDelete
 {
-	TransactionId deleteXid;	/* last Xid which could see page in scan */
+	FullTransactionId deleteXid;	/* last Xid which could see page in scan */
 	OffsetNumber downlinkOffset;	/* Offset of downlink referencing this
 									 * page */
 } gistxlogPageDelete;
@@ -98,10 +98,10 @@ typedef struct gistxlogPageReuse
 {
 	RelFileNode node;
 	BlockNumber block;
-	TransactionId latestRemovedXid;
+	FullTransactionId latestRemovedFullXid;
 } gistxlogPageReuse;
 
-#define SizeOfGistxlogPageReuse	(offsetof(gistxlogPageReuse, latestRemovedXid) + sizeof(TransactionId))
+#define SizeOfGistxlogPageReuse	(offsetof(gistxlogPageReuse, latestRemovedFullXid) + sizeof(FullTransactionId))
 
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 6cbb0c82c73..e5e5e4b3bf3 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -225,6 +225,7 @@ extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
 extern FullTransactionId GetNewTransactionId(bool isSubXact);
 extern void AdvanceNextFullTransactionIdPastXid(TransactionId xid);
 extern FullTransactionId ReadNextFullTransactionId(void);
+extern FullTransactionId FullTransactionIdFromRecentXid(TransactionId xid);
 extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
 								  Oid oldest_datoid);
 extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid);
-- 
2.20.1

Reply via email to