From a7c8d1da5d0496b2a7d05d52e37511a37075e13d Mon Sep 17 00:00:00 2001
From: alterego655 <824662526@qq.com>
Date: Tue, 16 Dec 2025 15:15:14 +0800
Subject: [PATCH v3 1/2] Optimize SnapBuild by maintaining committed.xip in
 sorted order

Maintain committed.xip in xidComparator order using a two-phase approach
that balances performance during slot initialization with efficiency during
steady-state decoding:

Pre-CONSISTENT phase:
- Use fast O(1) append for unsorted XIDs
- Sort once at transition to CONSISTENT state
- Avoids merge overhead during initial snapshot building

Post-CONSISTENT phase:
- Use per-commit batch insertion with sorted merge
- Collect all relevant XIDs for each commit
- Sort the batch once: O(m log m)
- Reverse-merge into the global array: O(N + m)

With this change, snapshot builds can skip the qsort() step and simply
memcpy the already-sorted array.  Per-commit work after CONSISTENT
increases from O(m) (plain append) to O(m log m + N) (sort-and-merge),
but eliminating the repeated O(N log N) sort at every snapshot build
substantially reduces overall cost in steady state, where snapshots are
built frequently.
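
As a rough, purely illustrative comparison (hypothetical numbers): with
N = 50,000 tracked XIDs and a commit adding m = 5 XIDs, re-sorting at
each snapshot build costs on the order of N log2 N ~= 780,000
comparisons, whereas the sort-and-merge costs roughly m log2 m + N ~=
50,012 comparisons and moves per commit, and the snapshot build itself
becomes a plain memcpy.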
---
 src/backend/replication/logical/snapbuild.c  | 164 +++++++++++++++++--
 src/include/replication/snapbuild_internal.h |  12 +-
 2 files changed, 151 insertions(+), 25 deletions(-)

diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index d6ab1e017eb..e0d7d3e0ae8 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -407,8 +407,25 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
 		   builder->committed.xip,
 		   builder->committed.xcnt * sizeof(TransactionId));
 
-	/* sort so we can bsearch() */
-	qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+	if (builder->state < SNAPBUILD_CONSISTENT)
+	{
+		/*
+		 * Pre-CONSISTENT: committed.xip is unsorted; sort the snapshot copy.
+		 */
+		if (snapshot->xcnt > 1)
+			qsort(snapshot->xip, snapshot->xcnt,
+				  sizeof(TransactionId), xidComparator);
+	}
+#ifdef USE_ASSERT_CHECKING
+	else
+	{
+		/*
+		 * Post-CONSISTENT: committed.xip should already be sorted; verify.
+		 */
+		for (int i = 1; i < snapshot->xcnt; i++)
+			Assert(snapshot->xip[i - 1] < snapshot->xip[i]);
+	}
+#endif
 
 	/*
 	 * Initially, subxip is empty, i.e. it's a snapshot to be used by
@@ -822,17 +839,33 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact
 }
 
 /*
- * Keep track of a new catalog changing transaction that has committed.
+ * Keep track of new catalog changing transactions that have committed.
+ *
+ * Before reaching CONSISTENT state we append unsorted, O(1) per XID.  After
+ * CONSISTENT, batch_xids must arrive sorted in xidComparator order; we merge
+ * it in, which avoids re-sorting the whole array at every snapshot build.
  */
 static void
-SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+SnapBuildAddCommittedTxns(SnapBuild *builder,
+						  const TransactionId *batch_xids,
+						  int batch_cnt)
 {
-	Assert(TransactionIdIsValid(xid));
+	TransactionId *committed_xids;
+	size_t		old_xcnt;
+
+	old_xcnt = builder->committed.xcnt;
 
-	if (builder->committed.xcnt == builder->committed.xcnt_space)
+	/* Ensure we have space for all elements */
+	if (old_xcnt + batch_cnt > builder->committed.xcnt_space)
 	{
 		builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
 
+		/*
+		 * Keep doubling until there is room for the whole batch.
+		 */
+		while (old_xcnt + batch_cnt > builder->committed.xcnt_space)
+			builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
 		elog(DEBUG1, "increasing space for committed transactions to %u",
 			 (uint32) builder->committed.xcnt_space);
 
@@ -840,12 +873,57 @@ SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
 										  builder->committed.xcnt_space * sizeof(TransactionId));
 	}
 
+	committed_xids = builder->committed.xip;
+
 	/*
-	 * TODO: It might make sense to keep the array sorted here instead of
-	 * doing it every time we build a new snapshot. On the other hand this
-	 * gets called repeatedly when a transaction with subtransactions commits.
+	 * Before CONSISTENT state, just append unsorted for O(1) performance. The
+	 * array will be sorted once when transitioning to CONSISTENT.
 	 */
-	builder->committed.xip[builder->committed.xcnt++] = xid;
+	if (builder->state < SNAPBUILD_CONSISTENT)
+	{
+		for (int i = 0; i < batch_cnt; i++)
+		{
+			Assert(TransactionIdIsValid(batch_xids[i]));
+			committed_xids[old_xcnt++] = batch_xids[i];
+		}
+		builder->committed.xcnt = old_xcnt;
+	}
+	else
+	{
+		/*
+		 * After CONSISTENT, maintain sorted order via merge.  Merge from the
+		 * end to avoid overwriting unread data.
+		 */
+		int			old_idx = old_xcnt - 1;
+		int			batch_idx = batch_cnt - 1;
+		int			write_idx = old_xcnt + batch_cnt - 1;
+
+		while (old_idx >= 0 && batch_idx >= 0)
+		{
+			Assert(TransactionIdIsValid(batch_xids[batch_idx]));
+
+			if (committed_xids[old_idx] > batch_xids[batch_idx])
+			{
+				committed_xids[write_idx--] = committed_xids[old_idx--];
+			}
+			else
+			{
+				/* Duplicates should never occur */
+				Assert(committed_xids[old_idx] != batch_xids[batch_idx]);
+				committed_xids[write_idx--] = batch_xids[batch_idx--];
+			}
+		}
+
+		/* Copy any remaining batch elements */
+		while (batch_idx >= 0)
+		{
+			Assert(TransactionIdIsValid(batch_xids[batch_idx]));
+			committed_xids[write_idx--] = batch_xids[batch_idx--];
+		}
+
+		/* Old elements that weren't moved are already in correct position */
+		builder->committed.xcnt = old_xcnt + batch_cnt;
+	}
 }
 
 /*
@@ -947,6 +1025,13 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 
 	TransactionId xmax = xid;
 
+	/*
+	 * Collect XIDs that need tracking into a batch.  We'll sort and merge
+	 * them into committed.xip in one pass at the end.
+	 */
+	TransactionId *batch_xids = NULL;
+	int			batch_cnt = 0;
+
 	/*
 	 * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
 	 * will they be part of a snapshot.  So we don't need to record anything.
@@ -977,6 +1062,12 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 		}
 	}
 
+	if (nsubxacts > 0 || builder->building_full_snapshot ||
+		SnapBuildXidHasCatalogChanges(builder, xid, xinfo))
+	{
+		batch_xids = palloc((nsubxacts + 1) * sizeof(TransactionId));
+	}
+
 	for (nxact = 0; nxact < nsubxacts; nxact++)
 	{
 		TransactionId subxid = subxacts[nxact];
@@ -993,7 +1084,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 			elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
 				 xid, subxid);
 
-			SnapBuildAddCommittedTxn(builder, subxid);
+			batch_xids[batch_cnt++] = subxid;
 
 			if (NormalTransactionIdFollows(subxid, xmax))
 				xmax = subxid;
@@ -1007,7 +1098,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 		 */
 		else if (needs_timetravel)
 		{
-			SnapBuildAddCommittedTxn(builder, subxid);
+			batch_xids[batch_cnt++] = subxid;
 			if (NormalTransactionIdFollows(subxid, xmax))
 				xmax = subxid;
 		}
@@ -1020,7 +1111,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 			 xid);
 		needs_snapshot = true;
 		needs_timetravel = true;
-		SnapBuildAddCommittedTxn(builder, xid);
+		batch_xids[batch_cnt++] = xid;
 	}
 	else if (sub_needs_timetravel)
 	{
@@ -1028,15 +1119,33 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 		elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
 			 xid);
 		needs_timetravel = true;
-		SnapBuildAddCommittedTxn(builder, xid);
+		batch_xids[batch_cnt++] = xid;
 	}
 	else if (needs_timetravel)
 	{
 		elog(DEBUG2, "forced transaction %u to do timetravel", xid);
 
-		SnapBuildAddCommittedTxn(builder, xid);
+		batch_xids[batch_cnt++] = xid;
+	}
+
+	/*
+	 * Sort the batch and add it to committed.xip.  Once the builder is
+	 * CONSISTENT this keeps committed.xip globally sorted in xidComparator
+	 * (raw uint32) order; before that, the batch is simply appended.
+	 */
+	if (batch_cnt > 0)
+	{
+		/* Sort by raw uint32 order */
+		qsort(batch_xids, batch_cnt, sizeof(TransactionId), xidComparator);
+
+		/* Merge into global array */
+		SnapBuildAddCommittedTxns(builder, batch_xids, batch_cnt);
 	}
 
+	/* Free batch array if allocated (even if nothing was added to it) */
+	if (batch_xids != NULL)
+		pfree(batch_xids);
+
 	if (!needs_timetravel)
 	{
 		/* record that we cannot export a general snapshot anymore */
@@ -1305,6 +1414,15 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn
 		Assert(TransactionIdIsNormal(builder->xmin));
 		Assert(TransactionIdIsNormal(builder->xmax));
 
+		/*
+		 * Sort committed.xip before transitioning to CONSISTENT.  During the
+		 * pre-CONSISTENT phase, XIDs were appended unsorted for performance.
+		 * Now we need sorted order for efficient snapshot building.
+		 */
+		if (builder->committed.xcnt > 1)
+			qsort(builder->committed.xip, builder->committed.xcnt,
+				  sizeof(TransactionId), xidComparator);
+
 		builder->state = SNAPBUILD_CONSISTENT;
 		builder->next_phase_at = InvalidTransactionId;
 
@@ -1402,6 +1520,15 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn
 			 TransactionIdPrecedesOrEquals(builder->next_phase_at,
 										   running->oldestRunningXid))
 	{
+		/*
+		 * Sort committed.xip before transitioning to CONSISTENT.  During the
+		 * pre-CONSISTENT phase, XIDs were appended unsorted for performance.
+		 * Now we need sorted order for efficient snapshot building.
+		 */
+		if (builder->committed.xcnt > 1)
+			qsort(builder->committed.xip, builder->committed.xcnt,
+				  sizeof(TransactionId), xidComparator);
+
 		builder->state = SNAPBUILD_CONSISTENT;
 		builder->next_phase_at = InvalidTransactionId;
 
@@ -1889,6 +2016,13 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
 		pfree(builder->committed.xip);
 		builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
 		builder->committed.xip = ondisk.builder.committed.xip;
+
+		/*
+		 * Sort the restored array into xidComparator order; snapshot files
+		 * written before this change may contain unsorted arrays.
+		 */
+		qsort(builder->committed.xip, builder->committed.xcnt,
+			  sizeof(TransactionId), xidComparator);
 	}
 	ondisk.builder.committed.xip = NULL;
 
diff --git a/src/include/replication/snapbuild_internal.h b/src/include/replication/snapbuild_internal.h
index 3b915dc8793..164160e5e5e 100644
--- a/src/include/replication/snapbuild_internal.h
+++ b/src/include/replication/snapbuild_internal.h
@@ -115,16 +115,8 @@ struct SnapBuild
 		/*
 		 * Array of committed transactions that have modified the catalog.
 		 *
-		 * As this array is frequently modified we do *not* keep it in
-		 * xidComparator order. Instead we sort the array when building &
-		 * distributing a snapshot.
-		 *
-		 * TODO: It's unclear whether that reasoning has much merit. Every
-		 * time we add something here after becoming consistent will also
-		 * require distributing a snapshot. Storing them sorted would
-		 * potentially also make it easier to purge (but more complicated wrt
-		 * wraparound?). Should be improved if sorting while building the
-		 * snapshot shows up in profiles.
+		 * Kept sorted in xidComparator order (raw uint32 value) once the
+		 * builder is CONSISTENT, so snapshot builds need not re-sort it.
 		 */
 		TransactionId *xip;
 	}			committed;
-- 
2.51.0

