Hi all

There's a minor race between commit_ts SLRU truncation and concurrent
commit_ts lookups, where a lookup can check the lower valid bound xid
without knowing it's already been truncated away. This would result in
a SLRU lookup error.

It's pretty low-harm since it's hard to trigger and the only problem
is an error being emitted when we should otherwise return null/zero.
Most notably you have to pass an xid that used to be within the
datrozenxid but due to a concurrent vacuum has just moved outside it.
This can't happen if you're passing the xmin of a tuple that still
exists so it only matters for callers passing arbitrary XIDs in via
pg_xact_commit_timestamp(...).

The race window is bigger on standby because there we don't find out
about the advance of the lower commit ts bound until the next
checkpoint. But you still have to be looking at very old xids that
don't exist on the heap anymore.

We might as well fix it in HEAD, but it's totally pointless to
back-patch, and the only part of the race that can be realistically
hit is on standby, where we can't backpatch a fix w/o changing the
xlog format. Nope. We could narrow the scope by limiting commit_ts
slru truncation to just before a checkpoint, but given how hard this
is to hit... I don't care.

(This came up as part of the investigation I've been doing on the
txid_status thread, where Robert pointed out a similar problem that
can arise where txid_status races with clog truncation. I noticed the
issue with standby while looking into that.)

-- 
 Craig Ringer                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
From 361a92ecbe45226ed25ce87da6a8c7bd749cca4a Mon Sep 17 00:00:00 2001
From: Craig Ringer <cr...@2ndquadrant.com>
Date: Wed, 28 Dec 2016 21:23:59 +0800
Subject: [PATCH] Fix a minor race between commit_ts slru truncation and
 lookups

vac_truncate_clog was truncating the commit timestamp SLRU before
it advanced the oldest xid limit for commit timestamp lookups. This
created a small race window where a concurrent pg_xact_commit_timestamp
calling TransactionIdGetCommitTsData(...) could check the oldest
xid after the SLRU page containing it is truncated away but before
the threshold is updated.

Fix this by advancing the lower bound before removing the relevant SLRU pages.

A larger race window also existed on a standby. The lower bound for commit
timetamp validity was only advanced on a standby when we replayed a checkpoint.
This could happen a long time after the commit timestamp SLRU truncation.

This race only affects XIDs that vac_truncate_clog has determined are no longer
referenced in the system, so the minimum datfrozenxid has advanced past them.
It's never going to be triggered by code that looks up commit timestamps of
rows it's just read during an in-progress transaction. Also, the worst outcome
should the race be triggered is an I/O error from the SLRU code (where the
commit ts lookup should more correctly return null). So this race is pretty
harmless.

The nearly identical clog code has the same race, but it doesn't matter there
since there's no way for users to pass arbitrary XIDs to look them up in clog.
---
 src/backend/access/rmgrdesc/committsdesc.c |  8 +++++---
 src/backend/access/transam/commit_ts.c     | 23 +++++++++++++++--------
 src/backend/commands/vacuum.c              |  3 ++-
 src/include/access/commit_ts.h             |  8 ++++++++
 4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/backend/access/rmgrdesc/committsdesc.c b/src/backend/access/rmgrdesc/committsdesc.c
index 527e5dc..2b117e4 100644
--- a/src/backend/access/rmgrdesc/committsdesc.c
+++ b/src/backend/access/rmgrdesc/committsdesc.c
@@ -33,10 +33,12 @@ commit_ts_desc(StringInfo buf, XLogReaderState *record)
 	}
 	else if (info == COMMIT_TS_TRUNCATE)
 	{
-		int			pageno;
+		xl_commit_ts_truncate trunc;
 
-		memcpy(&pageno, rec, sizeof(int));
-		appendStringInfo(buf, "%d", pageno);
+		memcpy(&trunc, rec, SizeOfCommitTsTruncate);
+
+		appendStringInfo(buf, "pageno %d, xid %u",
+			trunc.pageno, trunc.oldestXid);
 	}
 	else if (info == COMMIT_TS_SETTS)
 	{
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index a5b270c..86ac1d6 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -113,7 +113,7 @@ static bool CommitTsPagePrecedes(int page1, int page2);
 static void ActivateCommitTs(void);
 static void DeactivateCommitTs(void);
 static void WriteZeroPageXlogRec(int pageno);
-static void WriteTruncateXlogRec(int pageno);
+static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid);
 static void WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids,
 						 TransactionId *subxids, TimestampTz timestamp,
 						 RepOriginId nodeid);
@@ -824,7 +824,7 @@ TruncateCommitTs(TransactionId oldestXact)
 		return;					/* nothing to remove */
 
 	/* Write XLOG record */
-	WriteTruncateXlogRec(cutoffPage);
+	WriteTruncateXlogRec(cutoffPage, oldestXact);
 
 	/* Now we can remove the old CommitTs segment(s) */
 	SimpleLruTruncate(CommitTsCtl, cutoffPage);
@@ -910,10 +910,15 @@ WriteZeroPageXlogRec(int pageno)
  * Write a TRUNCATE xlog record
  */
 static void
-WriteTruncateXlogRec(int pageno)
+WriteTruncateXlogRec(int pageno, TransactionId oldestXid)
 {
+	xl_commit_ts_truncate xlrec;
+
+	xlrec.pageno = pageno;
+	xlrec.oldestXid = oldestXid;
+
 	XLogBeginInsert();
-	XLogRegisterData((char *) (&pageno), sizeof(int));
+	XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate);
 	(void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE);
 }
 
@@ -967,17 +972,19 @@ commit_ts_redo(XLogReaderState *record)
 	}
 	else if (info == COMMIT_TS_TRUNCATE)
 	{
-		int			pageno;
+		xl_commit_ts_truncate trunc;
 
-		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		memcpy(&trunc, XLogRecGetData(record), SizeOfCommitTsTruncate);
+
+		AdvanceOldestCommitTsXid(trunc.oldestXid);
 
 		/*
 		 * During XLOG replay, latest_page_number isn't set up yet; insert a
 		 * suitable value to bypass the sanity test in SimpleLruTruncate.
 		 */
-		CommitTsCtl->shared->latest_page_number = pageno;
+		CommitTsCtl->shared->latest_page_number = trunc.pageno;
 
-		SimpleLruTruncate(CommitTsCtl, pageno);
+		SimpleLruTruncate(CommitTsCtl, trunc.pageno);
 	}
 	else if (info == COMMIT_TS_SETTS)
 	{
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index b1be2f7..4336eda 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1152,6 +1152,8 @@ vac_truncate_clog(TransactionId frozenXID,
 	if (bogus)
 		return;
 
+	AdvanceOldestCommitTsXid(frozenXID);
+
 	/*
 	 * Truncate CLOG, multixact and CommitTs to the oldest computed value.
 	 */
@@ -1167,7 +1169,6 @@ vac_truncate_clog(TransactionId frozenXID,
 	 */
 	SetTransactionIdLimit(frozenXID, oldestxid_datoid);
 	SetMultiXactIdLimit(minMulti, minmulti_datoid);
-	AdvanceOldestCommitTsXid(frozenXID);
 }
 
 
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index cf343ca..06991c6 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -61,6 +61,14 @@ typedef struct xl_commit_ts_set
 #define SizeOfCommitTsSet	(offsetof(xl_commit_ts_set, mainxid) + \
 							 sizeof(TransactionId))
 
+typedef struct xl_commit_ts_truncate
+{
+	int pageno;
+	TransactionId oldestXid;
+} xl_commit_ts_truncate;
+
+#define SizeOfCommitTsTruncate	(offsetof(xl_commit_ts_truncate, oldestXid) +\
+								sizeof(TransactionId))
 
 extern void commit_ts_redo(XLogReaderState *record);
 extern void commit_ts_desc(StringInfo buf, XLogReaderState *record);
-- 
2.5.5

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to