When inserting into a B-tree index, all the pages are read-locked when
descending the tree. When we reach the leaf page, the read-lock is
exchanged for a write-lock.
There's nothing wrong with that, but why don't we just directly grab a
write-lock on the leaf page? When descending, we know the level we're
on, and what level the child page is. The only downside I can see is
that we would unnecessarily hold a write-lock when a read-lock would
suffice, if the page was just split and we have to move right. But that
seems like a really bad bet - hitting the page when it was just split is
highly unlikely.
Grabbing the write-lock directly would obviously save one buffer
unlock+lock sequence, for what it's worth, but I think it would also
slightly simplify the code. Am I missing something?
See attached patch. The new contract of _bt_getroot is a bit weird: it
locks the returned page in the requested lock-mode, shared or exclusive,
if the root page was also the leaf page. Otherwise it's locked in shared
mode regardless off the requested lock mode. But OTOH, the new contract
for _bt_search() is more clear now: it actually locks the returned page
in the requested mode, where it used to only use the access parameter to
decide whether to create a root page if the index was empty.
- Heikki
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5f7953f..3cb6404 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -113,24 +113,11 @@ _bt_doinsert(Relation rel, IndexTuple itup,
itup_scankey = _bt_mkscankey(rel, itup);
top:
- /* find the first page containing this key */
+ /* find the first page containing this key, and write-lock it */
stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
offset = InvalidOffsetNumber;
- /* trade in our read lock for a write lock */
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockBuffer(buf, BT_WRITE);
-
- /*
- * If the page was split between the time that we surrendered our read
- * lock and acquired our write lock, then this page may no longer be the
- * right place for the key we want to insert. In this case, we need to
- * move right in the tree. See Lehman and Yao for an excruciatingly
- * precise description.
- */
- buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
-
/*
* If we're not allowing duplicates, make sure the key isn't already in
* the index.
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 14e422a..d6a4933 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -78,12 +78,13 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
* standard class of race conditions exists here; I think I covered
* them all in the Hopi Indian rain dance of lock requests below.
*
- * The access type parameter (BT_READ or BT_WRITE) controls whether
- * a new root page will be created or not. If access = BT_READ,
- * and no root page exists, we just return InvalidBuffer. For
- * BT_WRITE, we try to create the root page if it doesn't exist.
- * NOTE that the returned root page will have only a read lock set
- * on it even if access = BT_WRITE!
+ * The access type parameter (BT_READ or BT_WRITE) indicates whether
+ * we're fetching the root for a search or an insertion. If access =
+ * BT_READ, and no root page exists, we just return InvalidBuffer.
+ * For BT_WRITE, we try to create the root page if it doesn't exist.
+ * If the root page is also a leaf-page, it is locked in the mode
+ * specified. NOTE that otherwise the returned root page will have
+ * only a read lock on it, even if access = BT_WRITE!
*
* The returned page is not necessarily the true root --- it could be
* a "fast root" (a page that is alone in its level due to deletions).
@@ -127,7 +128,8 @@ _bt_getroot(Relation rel, int access)
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
- rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+ rootbuf = _bt_getbuf(rel, rootblkno,
+ (rootlevel == 0) ? access : BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
@@ -253,14 +255,6 @@ _bt_getroot(Relation rel, int access)
END_CRIT_SECTION();
- /*
- * swap root write lock for read lock. There is no danger of anyone
- * else accessing the new root page while it's unlocked, since no one
- * else knows where it is yet.
- */
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_READ);
-
/* okay, metadata is correct, release lock on it */
_bt_relbuf(rel, metabuf);
}
@@ -285,7 +279,8 @@ _bt_getroot(Relation rel, int access)
for (;;)
{
- rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
+ rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno,
+ (rootlevel == 0) ? access : BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index bc62fbc..5e11aeb 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -48,16 +48,19 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* address of the leaf-page buffer, which is read-locked and pinned.
* No locks are held on the parent pages, however!
*
- * NOTE that the returned buffer is read-locked regardless of the access
- * parameter. However, access = BT_WRITE will allow an empty root page
- * to be created and returned. When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * The returned buffer is locked in the mode specified by the access
+ * parameter. When access = BT_READ, an empty index will result in
+ * *bufP being set to InvalidBuffer, while when access = BT_WRITE, an
+ * empty root page is created and returned.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access)
{
BTStack stack_in = NULL;
+ Page page;
+ BTPageOpaque opaque;
+ bool isleaf;
/* Get the root page to start with */
*bufP = _bt_getroot(rel, access);
@@ -66,11 +69,13 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
if (!BufferIsValid(*bufP))
return (BTStack) NULL;
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ isleaf = P_ISLEAF(opaque);
+
/* Loop iterates once per level descended in the tree */
for (;;)
{
- Page page;
- BTPageOpaque opaque;
OffsetNumber offnum;
ItemId itemid;
IndexTuple itup;
@@ -83,12 +88,14 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* pointer in the parent (or metapage). If it has, we may need to
* move right to its new sibling. Do that.
*/
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+ isleaf ? access : BT_READ);
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (P_ISLEAF(opaque))
+ Assert(isleaf == P_ISLEAF(opaque));
+ if (isleaf)
break;
/*
@@ -118,7 +125,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
- *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
+ isleaf = (opaque->btpo.level - 1) == 0;
+ *bufP = _bt_relandgetbuf(rel, *bufP, blkno,
+ isleaf ? access : BT_READ);
/* okay, all set to move down a level */
stack_in = new_stack;
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers