This patch makes some improvements to the rtree index implementation:

(1) Keep a pin on the scan's current buffer and mark buffer. This avoids
the need to do a ReadBuffer() for each tuple produced by the scan.

(2) Convert a ReleaseBuffer() ; ReadBuffer() pair into
ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock
acquire/release...

(3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle
both the "initial result" and "subsequent result" cases.

(4) Add support for index tuple killing

(5) Remove rtscancache(): it is dead code, for the same reason that
gistscancache() is dead code (an index scan ought not be invoked with
NoMovementScanDirection).

The end result is about a 10% improvement in index scan performance,
according to contrib/rtree_gist/bench.

These changes (with the exception of #2) are analogous to changes I've
already made for GiST (it's clear that GiST was started as a fork of
rtree). I'm not hugely interested in further improvements to rtree; I
just did this stuff because it is low-hanging fruit and I've already
made the same changes for GiST.

-Neil

# 
# patch "src/backend/access/rtree/rtget.c"
#  from [3db9a1c0391d9e319ac8455167f23215e00bf832]
#    to [ebfe121a3bcd3e2a15da91e14a84b69c87afac67]
# 
# patch "src/backend/access/rtree/rtree.c"
#  from [274583f574bc483a709b46912a567c58c6abb98c]
#    to [8df26216af5a60db10afd38e3f1dc5c6355ec79c]
# 
# patch "src/backend/access/rtree/rtscan.c"
#  from [9ace8f57b28397296b386955fdf3b7b712178a9a]
#    to [92b177352fd28ffab208ac9abf502cb4c81d2740]
# 
# patch "src/include/access/rtree.h"
#  from [d3b048ba826b36bc766de324779c0b5bbb143b42]
#    to [e4947131a99641d8f43c4b7b770485c8c1043094]
# 
--- src/backend/access/rtree/rtget.c
+++ src/backend/access/rtree/rtget.c
@@ -19,10 +19,8 @@
 #include "access/relscan.h"
 #include "access/rtree.h"
 
-static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
+static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
 		 ScanDirection dir);
-static bool rtscancache(IndexScanDesc s, ScanDirection dir);
-static bool rtfirst(IndexScanDesc s, ScanDirection dir);
 static bool rtnext(IndexScanDesc s, ScanDirection dir);
 
 
@@ -31,138 +29,106 @@
 {
 	IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
 	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
-	bool		res;
-
-	/* if we have it cached in the scan desc, just return the value */
-	if (rtscancache(s, dir))
-		PG_RETURN_BOOL(true);
-
-	/* not cached, so we'll have to do some work */
-	if (ItemPointerIsValid(&(s->currentItemData)))
-		res = rtnext(s, dir);
-	else
-		res = rtfirst(s, dir);
-	PG_RETURN_BOOL(res);
-}
-
-static bool
-rtfirst(IndexScanDesc s, ScanDirection dir)
-{
-	Buffer		b;
-	Page		p;
-	OffsetNumber n;
-	OffsetNumber maxoff;
-	RTreePageOpaque po;
+	Page page;
+	OffsetNumber offnum;
 	RTreeScanOpaque so;
-	RTSTACK    *stk;
-	BlockNumber blk;
-	IndexTuple	it;
 
-	b = ReadBuffer(s->indexRelation, P_ROOT);
-	p = BufferGetPage(b);
-	po = (RTreePageOpaque) PageGetSpecialPointer(p);
 	so = (RTreeScanOpaque) s->opaque;
 
-	for (;;)
+	/*
+	 * If we've already produced a tuple and the executor has informed
+	 * us that it should be marked "killed", do so know.
+	 */
+	if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
 	{
-		maxoff = PageGetMaxOffsetNumber(p);
-		if (ScanDirectionIsBackward(dir))
-			n = findnext(s, p, maxoff, dir);
-		else
-			n = findnext(s, p, FirstOffsetNumber, dir);
+		offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+		page = BufferGetPage(so->curbuf);
+		PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
+		SetBufferCommitInfoNeedsSave(so->curbuf);
+	}
 
-		while (n < FirstOffsetNumber || n > maxoff)
-		{
-			ReleaseBuffer(b);
-			if (so->s_stack == NULL)
-				return false;
+	/*
+	 * Get the next tuple that matches the search key; if asked to
+	 * skip killed tuples, find the first non-killed tuple that
+	 * matches. Return as soon as we've run out of matches or we've
+	 * found an acceptable match.
+	 */
+	for (;;)
+	{
+		bool res = rtnext(s, dir);
 
-			stk = so->s_stack;
-			b = ReadBuffer(s->indexRelation, stk->rts_blk);
-			p = BufferGetPage(b);
-			po = (RTreePageOpaque) PageGetSpecialPointer(p);
-			maxoff = PageGetMaxOffsetNumber(p);
-
-			if (ScanDirectionIsBackward(dir))
-				n = OffsetNumberPrev(stk->rts_child);
-			else
-				n = OffsetNumberNext(stk->rts_child);
-			so->s_stack = stk->rts_parent;
-			pfree(stk);
-
-			n = findnext(s, p, n, dir);
-		}
-		if (po->flags & F_LEAF)
+		if (res == true && s->ignore_killed_tuples)
 		{
-			ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
-			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
-			s->xs_ctup.t_self = it->t_tid;
-
-			ReleaseBuffer(b);
-			return true;
+			offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+			page = BufferGetPage(so->curbuf);
+			if (ItemIdDeleted(PageGetItemId(page, offnum)))
+				continue;
 		}
-		else
-		{
-			stk = (RTSTACK *) palloc(sizeof(RTSTACK));
-			stk->rts_child = n;
-			stk->rts_blk = BufferGetBlockNumber(b);
-			stk->rts_parent = so->s_stack;
-			so->s_stack = stk;
 
-			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-			blk = ItemPointerGetBlockNumber(&(it->t_tid));
-
-			ReleaseBuffer(b);
-			b = ReadBuffer(s->indexRelation, blk);
-			p = BufferGetPage(b);
-			po = (RTreePageOpaque) PageGetSpecialPointer(p);
-		}
+		PG_RETURN_BOOL(res);
 	}
 }
 
 static bool
 rtnext(IndexScanDesc s, ScanDirection dir)
 {
-	Buffer		b;
 	Page		p;
 	OffsetNumber n;
-	OffsetNumber maxoff;
 	RTreePageOpaque po;
 	RTreeScanOpaque so;
-	RTSTACK    *stk;
-	BlockNumber blk;
-	IndexTuple	it;
 
-	blk = ItemPointerGetBlockNumber(&(s->currentItemData));
-	n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+	so = (RTreeScanOpaque) s->opaque;
 
-	if (ScanDirectionIsForward(dir))
-		n = OffsetNumberNext(n);
-	else
-		n = OffsetNumberPrev(n);
+	if (!ItemPointerIsValid(&(s->currentItemData)))
+	{
+		/* first call: start at the root */
+		Assert(BufferIsValid(so->curbuf) == false);
+		so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
+	}
 
-	b = ReadBuffer(s->indexRelation, blk);
-	p = BufferGetPage(b);
+	p = BufferGetPage(so->curbuf);
 	po = (RTreePageOpaque) PageGetSpecialPointer(p);
-	so = (RTreeScanOpaque) s->opaque;
 
+	if (!ItemPointerIsValid(&(s->currentItemData)))
+	{
+		/* first call: start at first/last offset */
+		if (ScanDirectionIsForward(dir))
+			n = FirstOffsetNumber;
+		else
+			n = PageGetMaxOffsetNumber(p);
+	}
+	else
+	{
+		/* go on to the next offset */
+		n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+		if (ScanDirectionIsForward(dir))
+			n = OffsetNumberNext(n);
+		else
+			n = OffsetNumberPrev(n);
+	}
+
 	for (;;)
 	{
-		maxoff = PageGetMaxOffsetNumber(p);
-		n = findnext(s, p, n, dir);
+		IndexTuple	it;
+		RTSTACK    *stk;
 
-		while (n < FirstOffsetNumber || n > maxoff)
+		n = findnext(s, n, dir);
+
+		/* no match on this page, so read in the next stack entry */
+		if (n == InvalidOffsetNumber)
 		{
-			ReleaseBuffer(b);
+			/* if out of stack entries, we're done */
 			if (so->s_stack == NULL)
+			{
+				ReleaseBuffer(so->curbuf);
+				so->curbuf = InvalidBuffer;
 				return false;
+			}
 
 			stk = so->s_stack;
-			b = ReadBuffer(s->indexRelation, stk->rts_blk);
-			p = BufferGetPage(b);
-			maxoff = PageGetMaxOffsetNumber(p);
+			so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
+											  stk->rts_blk);
+			p = BufferGetPage(so->curbuf);
 			po = (RTreePageOpaque) PageGetSpecialPointer(p);
 
 			if (ScanDirectionIsBackward(dir))
@@ -172,33 +138,41 @@
 			so->s_stack = stk->rts_parent;
 			pfree(stk);
 
-			n = findnext(s, p, n, dir);
+			continue;
 		}
+
 		if (po->flags & F_LEAF)
 		{
-			ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
+			ItemPointerSet(&(s->currentItemData),
+						   BufferGetBlockNumber(so->curbuf),
+						   n);
 			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
 			s->xs_ctup.t_self = it->t_tid;
-
-			ReleaseBuffer(b);
 			return true;
 		}
 		else
 		{
+			BlockNumber blk;
+
 			stk = (RTSTACK *) palloc(sizeof(RTSTACK));
 			stk->rts_child = n;
-			stk->rts_blk = BufferGetBlockNumber(b);
+			stk->rts_blk = BufferGetBlockNumber(so->curbuf);
 			stk->rts_parent = so->s_stack;
 			so->s_stack = stk;
 
 			it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
 			blk = ItemPointerGetBlockNumber(&(it->t_tid));
 
-			ReleaseBuffer(b);
-			b = ReadBuffer(s->indexRelation, blk);
-			p = BufferGetPage(b);
+			/*
+			 * Note that we release the pin on the page as we descend
+			 * down the tree, even though there's a good chance we'll
+			 * eventually need to re-read the buffer later in this
+			 * scan. This may or may not be optimal, but it doesn't
+			 * seem likely to make a huge performance difference
+			 * either way.
+			 */
+			so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
+			p = BufferGetPage(so->curbuf);
 			po = (RTreePageOpaque) PageGetSpecialPointer(p);
 
 			if (ScanDirectionIsBackward(dir))
@@ -210,16 +184,19 @@
 }
 
 static OffsetNumber
-findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
+findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
 {
 	OffsetNumber maxoff;
 	IndexTuple	it;
 	RTreePageOpaque po;
 	RTreeScanOpaque so;
+	Page p;
 
+	so = (RTreeScanOpaque) s->opaque;
+	p = BufferGetPage(so->curbuf);
+
 	maxoff = PageGetMaxOffsetNumber(p);
 	po = (RTreePageOpaque) PageGetSpecialPointer(p);
-	so = (RTreeScanOpaque) s->opaque;
 
 	/*
 	 * If we modified the index during the scan, we may have a pointer to
@@ -256,28 +233,8 @@
 			n = OffsetNumberNext(n);
 	}
 
-	return n;
+	if (n >= FirstOffsetNumber && n <= maxoff)
+		return n;						/* found a match on this page */
+	else
+		return InvalidOffsetNumber;		/* no match, go to next page */
 }
-
-static bool
-rtscancache(IndexScanDesc s, ScanDirection dir)
-{
-	Buffer		b;
-	Page		p;
-	OffsetNumber n;
-	IndexTuple	it;
-
-	if (!(ScanDirectionIsNoMovement(dir)
-		  && ItemPointerIsValid(&(s->currentItemData))))
-		return false;
-
-	b = ReadBuffer(s->indexRelation,
-				   ItemPointerGetBlockNumber(&(s->currentItemData)));
-	p = BufferGetPage(b);
-	n = ItemPointerGetOffsetNumber(&(s->currentItemData));
-	it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-	s->xs_ctup.t_self = it->t_tid;
-	ReleaseBuffer(b);
-
-	return true;
-}
--- src/backend/access/rtree/rtree.c
+++ src/backend/access/rtree/rtree.c
@@ -280,12 +280,8 @@
 
 	do
 	{
-		/* let go of current buffer before getting next */
-		if (buffer != InvalidBuffer)
-			ReleaseBuffer(buffer);
-
-		/* get next buffer */
-		buffer = ReadBuffer(r, blk);
+		/* release the current buffer, read in the next one */
+		buffer = ReleaseAndReadBuffer(buffer, r, blk);
 		page = (Page) BufferGetPage(buffer);
 
 		opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
--- src/backend/access/rtree/rtscan.c
+++ src/backend/access/rtree/rtscan.c
@@ -89,12 +89,24 @@
 		freestack(p->s_markstk);
 		p->s_stack = p->s_markstk = NULL;
 		p->s_flags = 0x0;
+		/* drop pins on buffers -- no locks held */
+		if (BufferIsValid(p->curbuf))
+		{
+			ReleaseBuffer(p->curbuf);
+			p->curbuf = InvalidBuffer;
+		}
+		if (BufferIsValid(p->markbuf))
+		{
+			ReleaseBuffer(p->markbuf);
+			p->markbuf = InvalidBuffer;
+		}
 	}
 	else
 	{
 		/* initialize opaque data */
 		p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
 		p->s_stack = p->s_markstk = NULL;
+		p->curbuf = p->markbuf = InvalidBuffer;
 		p->s_internalNKey = s->numberOfKeys;
 		p->s_flags = 0x0;
 		s->opaque = p;
@@ -175,6 +187,18 @@
 	freestack(p->s_markstk);
 	p->s_markstk = o;
 
+	/* Update markbuf: make sure to bump ref count on curbuf */
+	if (BufferIsValid(p->markbuf))
+	{
+		ReleaseBuffer(p->markbuf);
+		p->markbuf = InvalidBuffer;
+	}
+	if (BufferIsValid(p->curbuf))
+	{
+		IncrBufferRefCount(p->curbuf);
+		p->markbuf = p->curbuf;
+	}
+
 	PG_RETURN_VOID();
 }
 
@@ -211,6 +235,18 @@
 	freestack(p->s_stack);
 	p->s_stack = o;
 
+	/* Update curbuf; be sure to bump ref count on markbuf */
+	if (BufferIsValid(p->curbuf))
+	{
+		ReleaseBuffer(p->curbuf);
+		p->curbuf = InvalidBuffer;
+	}
+	if (BufferIsValid(p->markbuf))
+	{
+		IncrBufferRefCount(p->markbuf);
+		p->curbuf = p->markbuf;
+	}
+
 	PG_RETURN_VOID();
 }
 
@@ -226,11 +262,14 @@
 	{
 		freestack(p->s_stack);
 		freestack(p->s_markstk);
+		if (BufferIsValid(p->curbuf))
+			ReleaseBuffer(p->curbuf);
+		if (BufferIsValid(p->markbuf))
+			ReleaseBuffer(p->markbuf);
 		pfree(s->opaque);
 	}
 
 	rtdropscan(s);
-	/* XXX don't unset read lock -- two-phase locking */
 
 	PG_RETURN_VOID();
 }
--- src/include/access/rtree.h
+++ src/include/access/rtree.h
@@ -59,11 +59,14 @@
 /*
  *	When we're doing a scan, we need to keep track of the parent stack
  *	for the marked and current items.  Also, rtrees have the following
- *	property:  if you're looking for the box (1,1,2,2), on the internal
- *	nodes you have to search for all boxes that *contain* (1,1,2,2), and
- *	not the ones that match it.  We have a private scan key for internal
- *	nodes in the opaque structure for rtrees for this reason.  See
- *	access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
+ *	property: if you're looking for the box (1,1,2,2), on the internal
+ *	nodes you have to search for all boxes that *contain* (1,1,2,2),
+ *	and not the ones that match it.  We have a private scan key for
+ *	internal nodes in the opaque structure for rtrees for this reason.
+ *	See access/index-rtree/rtscan.c and rtstrat.c for how it gets
+ *	initialized. We also keep pins on the scan's current buffer and
+ *	marked buffer, if any: this avoids the need to invoke ReadBuffer()
+ *	for each tuple produced by the index scan.
  */
 
 typedef struct RTreeScanOpaqueData
@@ -73,6 +76,8 @@
 	uint16		s_flags;
 	int			s_internalNKey;
 	ScanKey		s_internalKey;
+	Buffer		curbuf;
+	Buffer		markbuf;
 } RTreeScanOpaqueData;
 
 typedef RTreeScanOpaqueData *RTreeScanOpaque;
---------------------------(end of broadcast)---------------------------
TIP 7: don't forget to increase your free space map settings

Reply via email to