From 1fbe8648abc70d97c6773b14c833f3fcf40411cc Mon Sep 17 00:00:00 2001
From: Jasper Smit <jasper.smit@servicenow.com>
Date: Thu, 9 Oct 2025 16:23:38 +0200
Subject: [PATCH] DEF0734066 Delay marking aborted tuples unused, until no
 visibile tuple has a ctid link to that tuple

---
 contrib/pgstattuple/pgstatapprox.c          |  1 +
 src/backend/access/heap/heapam.c            |  1 +
 src/backend/access/heap/heapam_handler.c    |  3 ++
 src/backend/access/heap/heapam_visibility.c | 39 +++++++++++++--------
 src/backend/access/heap/pruneheap.c         | 18 +++++++++-
 src/backend/access/heap/vacuumlazy.c        |  2 ++
 src/include/access/heapam.h                 |  3 +-
 7 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..50092793b99 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -156,6 +156,7 @@ statapprox_heap(Relation rel, output_type *stat)
 					break;
 				case HEAPTUPLE_DEAD:
 				case HEAPTUPLE_RECENTLY_DEAD:
+				case HEAPTUPLE_RECENTLY_ABORTED:
 				case HEAPTUPLE_INSERT_IN_PROGRESS:
 					stat->dead_tuple_len += tuple.t_len;
 					stat->dead_tuple_count++;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0dcd6ee817e..43e8d343de1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -9264,6 +9264,7 @@ HeapCheckForSerializableConflictOut(bool visible, Relation relation,
 				return;
 			xid = HeapTupleHeaderGetXmin(tuple->t_data);
 			break;
+		case HEAPTUPLE_RECENTLY_ABORTED:
 		case HEAPTUPLE_RECENTLY_DEAD:
 		case HEAPTUPLE_DELETE_IN_PROGRESS:
 			if (visible)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index cb4bc35c93e..29df8dbc372 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -846,6 +846,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 				isdead = true;
 				break;
 			case HEAPTUPLE_RECENTLY_DEAD:
+			case HEAPTUPLE_RECENTLY_ABORTED:
 				*tups_recently_dead += 1;
 				/* fall through */
 			case HEAPTUPLE_LIVE:
@@ -1079,6 +1080,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 
 			case HEAPTUPLE_DEAD:
 			case HEAPTUPLE_RECENTLY_DEAD:
+			case HEAPTUPLE_RECENTLY_ABORTED:
 				/* Count dead and recently-dead rows */
 				*deadrows += 1;
 				break;
@@ -1427,6 +1429,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 					/* Count it as live, too */
 					reltuples += 1;
 					break;
+				case HEAPTUPLE_RECENTLY_ABORTED:
 				case HEAPTUPLE_RECENTLY_DEAD:
 
 					/*
diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c
index 05f6946fe60..ac6f7dc14a8 100644
--- a/src/backend/access/heap/heapam_visibility.c
+++ b/src/backend/access/heap/heapam_visibility.c
@@ -1176,7 +1176,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
 
 	res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after);
 
-	if (res == HEAPTUPLE_RECENTLY_DEAD)
+	if (res == HEAPTUPLE_RECENTLY_DEAD || res == HEAPTUPLE_RECENTLY_ABORTED)
 	{
 		Assert(TransactionIdIsValid(dead_after));
 
@@ -1195,7 +1195,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
  * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a
  * tuple that could still be visible to some backend, stores the xid that
  * needs to be compared with the horizon in *dead_after, and returns
- * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with
+ * HEAPTUPLE_RECENTLY_DEAD || HEAPTUPLE_RECENTLY_ABORTED. The caller then can perform the comparison with
  * the horizon.  This is e.g. useful when comparing with different horizons.
  *
  * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting
@@ -1215,13 +1215,20 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de
 	/*
 	 * Has inserting transaction committed?
 	 *
-	 * If the inserting transaction aborted, then the tuple was never visible
-	 * to any other transaction, so we can delete it immediately.
+	 * For aborted and committed transactions we need to take care of dead_after,
+	 * as there can be someone taking a tuple lock on it and in the process of traversing the tuple chain.
+	 * If we allow the tuple to be deleted immediately, it can be that another tuple is placed there.
 	 */
 	if (!HeapTupleHeaderXminCommitted(tuple))
 	{
-		if (HeapTupleHeaderXminInvalid(tuple))
+		TransactionId xmin = HeapTupleHeaderGetRawXmin(tuple);
+		if (HeapTupleHeaderXminInvalid(tuple)) {
+			if (TransactionIdIsNormal(xmin)) {
+				*dead_after = xmin;
+				return HEAPTUPLE_RECENTLY_ABORTED;
+			}
 			return HEAPTUPLE_DEAD;
+		}
 		/* Used by pre-9.0 binary upgrades */
 		else if (tuple->t_infomask & HEAP_MOVED_OFF)
 		{
@@ -1256,10 +1263,11 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de
 			{
 				SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
 							InvalidTransactionId);
+				// TODO: should also here treat HEAPTUPLE_RECENTLY_ABORTED
 				return HEAPTUPLE_DEAD;
 			}
 		}
-		else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple)))
+		else if (TransactionIdIsCurrentTransactionId(xmin))
 		{
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
@@ -1273,7 +1281,7 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de
 			/* deleting subtransaction must have aborted */
 			return HEAPTUPLE_INSERT_IN_PROGRESS;
 		}
-		else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple)))
+		else if (TransactionIdIsInProgress(xmin))
 		{
 			/*
 			 * It'd be possible to discern between INSERT/DELETE in progress
@@ -1285,17 +1293,20 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de
 			 */
 			return HEAPTUPLE_INSERT_IN_PROGRESS;
 		}
-		else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-						HeapTupleHeaderGetRawXmin(tuple));
+		else if (TransactionIdDidCommit(xmin))
+			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, xmin);
 		else
 		{
 			/*
 			 * Not in Progress, Not Committed, so either Aborted or crashed
 			 */
-			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-						InvalidTransactionId);
-			return HEAPTUPLE_DEAD;
+			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, xmin);
+			if(TransactionIdIsNormal(xmin)) {
+				*dead_after = xmin;
+				return HEAPTUPLE_RECENTLY_ABORTED;
+			}
+			else
+				return HEAPTUPLE_DEAD;
 		}
 
 		/*
@@ -1443,7 +1454,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot,
 
 	res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after);
 
-	if (res == HEAPTUPLE_RECENTLY_DEAD)
+	if (res == HEAPTUPLE_RECENTLY_DEAD || res == HEAPTUPLE_RECENTLY_ABORTED)
 	{
 		Assert(TransactionIdIsValid(dead_after));
 
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index a8025889be0..ecda8daf0e3 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -921,7 +921,7 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
 
 	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
 
-	if (res != HEAPTUPLE_RECENTLY_DEAD)
+	if (res != HEAPTUPLE_RECENTLY_DEAD && res != HEAPTUPLE_RECENTLY_ABORTED)
 		return res;
 
 	/*
@@ -1108,6 +1108,9 @@ heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff,
 				 */
 				break;
 
+			case HEAPTUPLE_RECENTLY_ABORTED:
+				break;
+
 			case HEAPTUPLE_DELETE_IN_PROGRESS:
 			case HEAPTUPLE_LIVE:
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1415,6 +1418,19 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb
 			}
 			break;
 
+		case HEAPTUPLE_RECENTLY_ABORTED:
+			prstate->recently_dead_tuples++;
+			prstate->all_visible = false;
+
+			/*
+			 * This tuple will soon become DEAD.  Update the hint field so
+			 * that the page is reconsidered for pruning in future.
+			 */
+			heap_prune_record_prunable(prstate,
+									   HeapTupleHeaderGetRawXmin(htup));
+			break;
+
+
 		case HEAPTUPLE_RECENTLY_DEAD:
 			prstate->recently_dead_tuples++;
 			prstate->all_visible = false;
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 0fef8e49e2b..def04eb758c 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -2352,6 +2352,7 @@ lazy_scan_noprune(LVRelState *vacrel,
 				 */
 				missed_dead_tuples++;
 				break;
+			case HEAPTUPLE_RECENTLY_ABORTED:
 			case HEAPTUPLE_RECENTLY_DEAD:
 
 				/*
@@ -3694,6 +3695,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 				break;
 
 			case HEAPTUPLE_DEAD:
+			case HEAPTUPLE_RECENTLY_ABORTED:
 			case HEAPTUPLE_RECENTLY_DEAD:
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 			case HEAPTUPLE_DELETE_IN_PROGRESS:
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 3a9424c19c9..139f1f01037 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -123,7 +123,8 @@ typedef enum
 {
 	HEAPTUPLE_DEAD,				/* tuple is dead and deletable */
 	HEAPTUPLE_LIVE,				/* tuple is live (committed, no deleter) */
-	HEAPTUPLE_RECENTLY_DEAD,	/* tuple is dead, but not deletable yet */
+	HEAPTUPLE_RECENTLY_DEAD,	/* tuple is dead, but not deletable yet; check xmax for when to delete */
+	HEAPTUPLE_RECENTLY_ABORTED,	/* tuple is aborted, but not deletable yet; check xmin for when to delete */
 	HEAPTUPLE_INSERT_IN_PROGRESS,	/* inserting xact is still in progress */
 	HEAPTUPLE_DELETE_IN_PROGRESS,	/* deleting xact is still in progress */
 } HTSV_Result;
-- 
2.39.5

