When inserting into a B-tree index, all the pages are read-locked when descending the tree. When we reach the leaf page, the read-lock is exchanged for a write-lock.

There's nothing wrong with that, but why don't we just directly grab a write-lock on the leaf page? When descending, we know the level we're on, and what level the child page is. The only downside I can see is that we would unnecessarily hold a write-lock when a read-lock would suffice, if the page was just split and we have to move right. But that seems like a really bad bet - hitting the page when it was just split is highly unlikely.

Grabbing the write-lock directly would obviously save one buffer unlock+lock sequence, for what it's worth, but I think it would also slightly simplify the code. Am I missing something?

See attached patch. The new contract of _bt_getroot is a bit weird: it locks the returned page in the requested lock-mode, shared or exclusive, if the root page was also the leaf page. Otherwise it's locked in shared mode regardless off the requested lock mode. But OTOH, the new contract for _bt_search() is more clear now: it actually locks the returned page in the requested mode, where it used to only use the access parameter to decide whether to create a root page if the index was empty.

- Heikki
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5f7953f..3cb6404 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -113,24 +113,11 @@ _bt_doinsert(Relation rel, IndexTuple itup,
 	itup_scankey = _bt_mkscankey(rel, itup);
 
 top:
-	/* find the first page containing this key */
+	/* find the first page containing this key, and write-lock it */
 	stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
 
 	offset = InvalidOffsetNumber;
 
-	/* trade in our read lock for a write lock */
-	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(buf, BT_WRITE);
-
-	/*
-	 * If the page was split between the time that we surrendered our read
-	 * lock and acquired our write lock, then this page may no longer be the
-	 * right place for the key we want to insert.  In this case, we need to
-	 * move right in the tree.	See Lehman and Yao for an excruciatingly
-	 * precise description.
-	 */
-	buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
-
 	/*
 	 * If we're not allowing duplicates, make sure the key isn't already in
 	 * the index.
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 14e422a..d6a4933 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -78,12 +78,13 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
  *		standard class of race conditions exists here; I think I covered
  *		them all in the Hopi Indian rain dance of lock requests below.
  *
- *		The access type parameter (BT_READ or BT_WRITE) controls whether
- *		a new root page will be created or not.  If access = BT_READ,
- *		and no root page exists, we just return InvalidBuffer.	For
- *		BT_WRITE, we try to create the root page if it doesn't exist.
- *		NOTE that the returned root page will have only a read lock set
- *		on it even if access = BT_WRITE!
+ *		The access type parameter (BT_READ or BT_WRITE) indicates whether
+ *		we're fetching the root for a search or an insertion.  If access =
+ *		BT_READ, and no root page exists, we just return InvalidBuffer.
+ *		For BT_WRITE, we try to create the root page if it doesn't exist.
+ *		If the root page is also a leaf-page, it is locked in the mode
+ *		specified.  NOTE that otherwise the returned root page will have
+ *		only a read lock on it,	even if access = BT_WRITE!
  *
  *		The returned page is not necessarily the true root --- it could be
  *		a "fast root" (a page that is alone in its level due to deletions).
@@ -127,7 +128,8 @@ _bt_getroot(Relation rel, int access)
 		Assert(rootblkno != P_NONE);
 		rootlevel = metad->btm_fastlevel;
 
-		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+		rootbuf = _bt_getbuf(rel, rootblkno,
+							 (rootlevel == 0) ? access : BT_READ);
 		rootpage = BufferGetPage(rootbuf);
 		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 
@@ -253,14 +255,6 @@ _bt_getroot(Relation rel, int access)
 
 		END_CRIT_SECTION();
 
-		/*
-		 * swap root write lock for read lock.	There is no danger of anyone
-		 * else accessing the new root page while it's unlocked, since no one
-		 * else knows where it is yet.
-		 */
-		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(rootbuf, BT_READ);
-
 		/* okay, metadata is correct, release lock on it */
 		_bt_relbuf(rel, metabuf);
 	}
@@ -285,7 +279,8 @@ _bt_getroot(Relation rel, int access)
 
 		for (;;)
 		{
-			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno,
+									   (rootlevel == 0) ? access : BT_READ);
 			rootpage = BufferGetPage(rootbuf);
 			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index bc62fbc..5e11aeb 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -48,16 +48,19 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * address of the leaf-page buffer, which is read-locked and pinned.
  * No locks are held on the parent pages, however!
  *
- * NOTE that the returned buffer is read-locked regardless of the access
- * parameter.  However, access = BT_WRITE will allow an empty root page
- * to be created and returned.	When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * The returned buffer is locked in the mode specified by the access
+ * parameter.  When access = BT_READ, an empty index will result in
+ * *bufP being set to InvalidBuffer, while when access = BT_WRITE, an
+ * empty root page is created and returned.
  */
 BTStack
 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		   Buffer *bufP, int access)
 {
 	BTStack		stack_in = NULL;
+	Page		page;
+	BTPageOpaque opaque;
+	bool		isleaf;
 
 	/* Get the root page to start with */
 	*bufP = _bt_getroot(rel, access);
@@ -66,11 +69,13 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 	if (!BufferIsValid(*bufP))
 		return (BTStack) NULL;
 
+	page = BufferGetPage(*bufP);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	isleaf = P_ISLEAF(opaque);
+
 	/* Loop iterates once per level descended in the tree */
 	for (;;)
 	{
-		Page		page;
-		BTPageOpaque opaque;
 		OffsetNumber offnum;
 		ItemId		itemid;
 		IndexTuple	itup;
@@ -83,12 +88,14 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		 * pointer in the parent (or metapage).  If it has, we may need to
 		 * move right to its new sibling.  Do that.
 		 */
-		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+							  isleaf ? access : BT_READ);
 
 		/* if this is a leaf page, we're done */
 		page = BufferGetPage(*bufP);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		if (P_ISLEAF(opaque))
+		Assert(isleaf == P_ISLEAF(opaque));
+		if (isleaf)
 			break;
 
 		/*
@@ -118,7 +125,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		new_stack->bts_parent = stack_in;
 
 		/* drop the read lock on the parent page, acquire one on the child */
-		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
+		isleaf = (opaque->btpo.level - 1) == 0;
+		*bufP = _bt_relandgetbuf(rel, *bufP, blkno,
+								 isleaf ? access : BT_READ);
 
 		/* okay, all set to move down a level */
 		stack_in = new_stack;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to