On Wed, Dec 10, 2014 at 9:49 AM, Robert Haas <robertmh...@gmail.com> wrote:
> I guess this bears some further thought.  I certainly don't like the
> fact that this makes the whole system crap out at a lower number of
> subtransactions than presently.  The actual performance numbers don't
> bother me very much; I'm comfortable with the possibility that closing
> a cursor will be some modest percentage slower if you've got thousands
> of active savepoints.

Here's a new version with two changes:

1. I changed the traversal of the resource owner tree to iterate
instead of recurse.  It now does a depth-first, pre-order traversal of
the tree; when we reach the last child of a node, we follow its parent
pointer to get back to where we were.  That way, we don't need to keep
anything on the stack.  That fixed the crash at 100k cursors, but it
was still 4x slower.

2. Instead of traversing the tree until we find an xmin equal to the
one we're currently advertising, the code now traverses the entire
tree each time it runs. However, it also keeps a record of how many
times the oldest xmin occurred in the tree, which is decremented each
time we unregister a snapshot with that xmin; the traversal doesn't
run again until that count reaches 0.  That fixed the performance
regression on your test case.

With a million subtransactions:

master 34.464s 33.742s 34.317s
advance-xmin 34.516s 34.069s 34.196s

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index 0955bcc..529209f 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -21,6 +21,7 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "miscadmin.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
 #include "utils/memutils.h"
@@ -113,6 +114,7 @@ typedef struct ResourceOwnerData
 ResourceOwner CurrentResourceOwner = NULL;
 ResourceOwner CurTransactionResourceOwner = NULL;
 ResourceOwner TopTransactionResourceOwner = NULL;
+ResourceOwner FirstRootResourceOwner = NULL;
 
 /*
  * List of add-on callbacks for resource releasing
@@ -167,6 +169,11 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 		owner->nextchild = parent->firstchild;
 		parent->firstchild = owner;
 	}
+	else
+	{
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
+	}
 
 	return owner;
 }
@@ -442,6 +449,8 @@ ResourceOwnerDelete(ResourceOwner owner)
 	 * the owner tree.  Better a leak than a crash.
 	 */
 	ResourceOwnerNewParent(owner, NULL);
+	Assert(FirstRootResourceOwner == owner);
+	FirstRootResourceOwner = owner->nextchild;
 
 	/* And free the object. */
 	if (owner->buffers)
@@ -502,6 +511,14 @@ ResourceOwnerNewParent(ResourceOwner owner,
 			}
 		}
 	}
+	else
+	{
+		ResourceOwner *link = &FirstRootResourceOwner;
+
+		while (*link != owner)
+			link = &((*link)->nextchild);
+		*link = owner->nextchild;
+	}
 
 	if (newparent)
 	{
@@ -513,7 +530,8 @@ ResourceOwnerNewParent(ResourceOwner owner,
 	else
 	{
 		owner->parent = NULL;
-		owner->nextchild = NULL;
+		owner->nextchild = FirstRootResourceOwner;
+		FirstRootResourceOwner = owner;
 	}
 }
 
@@ -1161,6 +1179,59 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot)
 }
 
 /*
+ * Invoke a caller-supplied function on every snapshot registered with any
+ * resource owner in the system.  The callback can abort the traversal by
+ * returning true.
+ */
+bool
+ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback, void *arg)
+{
+	ResourceOwner	owner = FirstRootResourceOwner;
+	bool		visited = false;
+
+	/*
+	 * We do this traversal non-recursively in order to avoid running out of
+	 * stack space, which can otherwise happen with large numbers of nested
+	 * subtransactions.  The easiest way to do that is to search depth-first,
+	 * so that we visit all of a given node's descendents before reaching its
+	 * right sibling.  When we've visited all of the node's descendents, we'll
+	 * follow the last child's parent pointer back to that node, but visited
+	 * will be set to true at that point, so we'll advance to the right
+	 * sibling instead of traversing it again.
+	 */
+	while (owner != NULL)
+	{
+		if (!visited)
+		{
+			int	i;
+
+			for (i = 0; i < owner->nsnapshots; ++i)
+				if (callback(owner->snapshots[i], arg))
+					return true;
+
+			if (owner->firstchild != NULL)
+			{
+				owner = owner->firstchild;
+				continue;
+			}
+		}
+
+		if (owner->nextchild != NULL)
+		{
+			owner = owner->nextchild;
+			visited = false;
+		}
+		else
+		{
+			owner = owner->parent;
+			visited = true;
+		}
+	}
+
+	return false;
+}
+
+/*
  * Debugging subroutine
  */
 static void
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index d601efe..1fd73c8 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -122,14 +122,28 @@ typedef struct ActiveSnapshotElt
 /* Top of the stack of active snapshots */
 static ActiveSnapshotElt *ActiveSnapshot = NULL;
 
+/* Data for recomputing xmin */
+typedef struct SnapshotXminData
+{
+	TransactionId	new_xmin;
+	int				new_xmin_count;
+	int				snapshots_remaining;
+} SnapshotXminData;
+
+/* How many snapshots is resowner.c tracking for us? */
+static int	RegisteredSnapshots = 0;
+
 /*
- * How many snapshots is resowner.c tracking for us?
- *
- * Note: for now, a simple counter is enough.  However, if we ever want to be
- * smarter about advancing our MyPgXact->xmin we will need to be more
- * sophisticated about this, perhaps keeping our own list of snapshots.
+ * When a cursor is released, it can be advantageous to recompute our
+ * advertised xmin to avoid system-wide impacts.  However, we don't want
+ * to do this recomputation needlessly.  After the first xmin recomputation
+ * in a particular transaction, these variables will indicate the result of
+ * the previous recomputation and the number of snapshots which at that time
+ * had that xmin.  The actual number of snapshots with that xmin can be higher
+ * than the value indicated here, but cannot be lower.
  */
-static int	RegisteredSnapshots = 0;
+static TransactionId RegisteredSnapshotOldestXmin = InvalidTransactionId;
+static int	RegisteredSnapshotOldestXminCount = 0;
 
 /* first GetTransactionSnapshot call in a transaction? */
 bool		FirstSnapshotSet = false;
@@ -153,6 +167,7 @@ static List *exportedSnapshots = NIL;
 
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
+static bool SnapshotResetXminWorker(Snapshot snapshot, void *data);
 static void SnapshotResetXmin(void);
 
 
@@ -675,6 +690,8 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
 
 	ResourceOwnerForgetSnapshot(owner, snapshot);
 	RegisteredSnapshots--;
+	if (TransactionIdEquals(snapshot->xmin, RegisteredSnapshotOldestXmin))
+		RegisteredSnapshotOldestXminCount--;
 	if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
 	{
 		FreeSnapshot(snapshot);
@@ -688,12 +705,77 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
  * Note we can do this without locking because we assume that storing an Xid
  * is atomic.
+ *
+ * Even if there are some remaining snapshots, we may be able to advance our
+ * PGXACT->xmin to some degree.  This typically happens when a portal is
+ * dropped.  For efficiency, we only consider recomputing PGXACT->xmin when
+ * the active snapshot stack is empty.
  */
 static void
 SnapshotResetXmin(void)
 {
-	if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+	SnapshotXminData	sxd;
+	ListCell	   *lc;
+
+	if (ActiveSnapshot != NULL)
+		return;
+
+	if (RegisteredSnapshots == 0)
+	{
 		MyPgXact->xmin = InvalidTransactionId;
+		RegisteredSnapshotOldestXmin = InvalidTransactionId;
+		RegisteredSnapshotOldestXminCount = 0;
+		return;
+	}
+
+	if (RegisteredSnapshotOldestXminCount > 0)
+		return;
+
+	sxd.new_xmin = InvalidTransactionId;
+	sxd.new_xmin_count = 0;
+	sxd.snapshots_remaining = RegisteredSnapshots;
+
+	if (FirstXactSnapshot != NULL)
+		SnapshotResetXminWorker(FirstXactSnapshot, &sxd);
+
+	foreach(lc, exportedSnapshots)
+	{
+		Snapshot	snapshot = lfirst(lc);
+
+		SnapshotResetXminWorker(snapshot, &sxd);
+	}
+
+	ResourceOwnerWalkAllSnapshots(SnapshotResetXminWorker, &sxd);
+
+	if (sxd.snapshots_remaining > 0)
+		elog(FATAL, "failed to traverse all snapshots");
+
+	MyPgXact->xmin = sxd.new_xmin;
+	RegisteredSnapshotOldestXmin = sxd.new_xmin;
+	RegisteredSnapshotOldestXminCount = sxd.new_xmin_count;
+}
+
+static bool
+SnapshotResetXminWorker(Snapshot snapshot, void *data)
+{
+	SnapshotXminData   *sxd = data;
+
+	if (TransactionIdEquals(snapshot->xmin, sxd->new_xmin))
+	{
+		Assert(TransactionIdIsNormal(snapshot->xmin));
+		sxd->new_xmin_count++;
+	}
+	else if (!TransactionIdIsValid(sxd->new_xmin)
+		|| TransactionIdPrecedes(snapshot->xmin, sxd->new_xmin))
+	{
+		sxd->new_xmin = snapshot->xmin;
+		sxd->new_xmin_count = 1;
+	}
+
+	if (--sxd->snapshots_remaining < 0)
+		elog(FATAL, "traversed too many snapshots");
+
+	return false;
 }
 
 /*
@@ -830,6 +912,8 @@ AtEOXact_Snapshot(bool isCommit)
 	 */
 	ActiveSnapshot = NULL;
 	RegisteredSnapshots = 0;
+	RegisteredSnapshotOldestXmin = InvalidTransactionId;
+	RegisteredSnapshotOldestXminCount = 0;
 
 	CurrentSnapshot = NULL;
 	SecondarySnapshot = NULL;
diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h
index b8fd1a9..9944144 100644
--- a/src/include/utils/resowner_private.h
+++ b/src/include/utils/resowner_private.h
@@ -73,6 +73,9 @@ extern void ResourceOwnerRememberSnapshot(ResourceOwner owner,
 							  Snapshot snapshot);
 extern void ResourceOwnerForgetSnapshot(ResourceOwner owner,
 							Snapshot snapshot);
+typedef bool (*ResourceWalkSnapshotCallback) (Snapshot snapshot, void *arg);
+extern bool ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback,
+							  void *arg);
 
 /* support for temporary file management */
 extern void ResourceOwnerEnlargeFiles(ResourceOwner owner);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to