Barring any objections, I intend to apply this patch tomorrow. The
patch, as well as the original -patches email, are included below.
-Neil
On Wed, 2004-11-24 at 11:15 +1100, Neil Conway wrote:
> This patch makes some improvements to the rtree index implementation:
>
> (1) Keep a pin on the scan's current buffer and mark buffer. This avoids
> the need to do a ReadBuffer() for each tuple produced by the scan.
>
> (2) Convert a ReleaseBuffer() ; ReadBuffer() pair into
> ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock
> acquire/release...
>
> (3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle
> both the "initial result" and "subsequent result" cases.
>
> (4) Add support for index tuple killing
>
> (5) Remove rtscancache(): it is dead code, for the same reason that
> gistscancache() is dead code (an index scan ought not be invoked with
> NoMovementScanDirection).
>
> The end result is about a 10% improvement in index scan performance,
> according to contrib/rtree_gist/bench.
>
> These changes (with the exception of #2) are analogous to changes I've
> already made for GiST (it's clear that GiST was started as a fork of
> rtree). I'm not hugely interested in further improvements to rtree; I
> just did this stuff because it is low-hanging fruit and I've already
> made the same changes for GiST.
#
# patch "src/backend/access/rtree/rtget.c"
# from [3db9a1c0391d9e319ac8455167f23215e00bf832]
# to [ebfe121a3bcd3e2a15da91e14a84b69c87afac67]
#
# patch "src/backend/access/rtree/rtree.c"
# from [274583f574bc483a709b46912a567c58c6abb98c]
# to [8df26216af5a60db10afd38e3f1dc5c6355ec79c]
#
# patch "src/backend/access/rtree/rtscan.c"
# from [9ace8f57b28397296b386955fdf3b7b712178a9a]
# to [92b177352fd28ffab208ac9abf502cb4c81d2740]
#
# patch "src/include/access/rtree.h"
# from [d3b048ba826b36bc766de324779c0b5bbb143b42]
# to [e4947131a99641d8f43c4b7b770485c8c1043094]
#
--- src/backend/access/rtree/rtget.c
+++ src/backend/access/rtree/rtget.c
@@ -19,10 +19,8 @@
#include "access/relscan.h"
#include "access/rtree.h"
-static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
+static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
ScanDirection dir);
-static bool rtscancache(IndexScanDesc s, ScanDirection dir);
-static bool rtfirst(IndexScanDesc s, ScanDirection dir);
static bool rtnext(IndexScanDesc s, ScanDirection dir);
@@ -31,138 +29,106 @@
{
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
- bool res;
-
- /* if we have it cached in the scan desc, just return the value */
- if (rtscancache(s, dir))
- PG_RETURN_BOOL(true);
-
- /* not cached, so we'll have to do some work */
- if (ItemPointerIsValid(&(s->currentItemData)))
- res = rtnext(s, dir);
- else
- res = rtfirst(s, dir);
- PG_RETURN_BOOL(res);
-}
-
-static bool
-rtfirst(IndexScanDesc s, ScanDirection dir)
-{
- Buffer b;
- Page p;
- OffsetNumber n;
- OffsetNumber maxoff;
- RTreePageOpaque po;
+ Page page;
+ OffsetNumber offnum;
RTreeScanOpaque so;
- RTSTACK *stk;
- BlockNumber blk;
- IndexTuple it;
- b = ReadBuffer(s->indexRelation, P_ROOT);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
- for (;;)
+ /*
+ * If we've already produced a tuple and the executor has informed
+ * us that it should be marked "killed", do so know.
+ */
+ if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
{
- maxoff = PageGetMaxOffsetNumber(p);
- if (ScanDirectionIsBackward(dir))
- n = findnext(s, p, maxoff, dir);
- else
- n = findnext(s, p, FirstOffsetNumber, dir);
+ offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ page = BufferGetPage(so->curbuf);
+ PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
+ SetBufferCommitInfoNeedsSave(so->curbuf);
+ }
- while (n < FirstOffsetNumber || n > maxoff)
- {
- ReleaseBuffer(b);
- if (so->s_stack == NULL)
- return false;
+ /*
+ * Get the next tuple that matches the search key; if asked to
+ * skip killed tuples, find the first non-killed tuple that
+ * matches. Return as soon as we've run out of matches or we've
+ * found an acceptable match.
+ */
+ for (;;)
+ {
+ bool res = rtnext(s, dir);
- stk = so->s_stack;
- b = ReadBuffer(s->indexRelation, stk->rts_blk);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
- maxoff = PageGetMaxOffsetNumber(p);
-
- if (ScanDirectionIsBackward(dir))
- n = OffsetNumberPrev(stk->rts_child);
- else
- n = OffsetNumberNext(stk->rts_child);
- so->s_stack = stk->rts_parent;
- pfree(stk);
-
- n = findnext(s, p, n, dir);
- }
- if (po->flags & F_LEAF)
+ if (res == true && s->ignore_killed_tuples)
{
- ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
- s->xs_ctup.t_self = it->t_tid;
-
- ReleaseBuffer(b);
- return true;
+ offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ page = BufferGetPage(so->curbuf);
+ if (ItemIdDeleted(PageGetItemId(page, offnum)))
+ continue;
}
- else
- {
- stk = (RTSTACK *) palloc(sizeof(RTSTACK));
- stk->rts_child = n;
- stk->rts_blk = BufferGetBlockNumber(b);
- stk->rts_parent = so->s_stack;
- so->s_stack = stk;
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- blk = ItemPointerGetBlockNumber(&(it->t_tid));
-
- ReleaseBuffer(b);
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
- }
+ PG_RETURN_BOOL(res);
}
}
static bool
rtnext(IndexScanDesc s, ScanDirection dir)
{
- Buffer b;
Page p;
OffsetNumber n;
- OffsetNumber maxoff;
RTreePageOpaque po;
RTreeScanOpaque so;
- RTSTACK *stk;
- BlockNumber blk;
- IndexTuple it;
- blk = ItemPointerGetBlockNumber(&(s->currentItemData));
- n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ so = (RTreeScanOpaque) s->opaque;
- if (ScanDirectionIsForward(dir))
- n = OffsetNumberNext(n);
- else
- n = OffsetNumberPrev(n);
+ if (!ItemPointerIsValid(&(s->currentItemData)))
+ {
+ /* first call: start at the root */
+ Assert(BufferIsValid(so->curbuf) == false);
+ so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
+ }
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
- so = (RTreeScanOpaque) s->opaque;
+ if (!ItemPointerIsValid(&(s->currentItemData)))
+ {
+ /* first call: start at first/last offset */
+ if (ScanDirectionIsForward(dir))
+ n = FirstOffsetNumber;
+ else
+ n = PageGetMaxOffsetNumber(p);
+ }
+ else
+ {
+ /* go on to the next offset */
+ n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ if (ScanDirectionIsForward(dir))
+ n = OffsetNumberNext(n);
+ else
+ n = OffsetNumberPrev(n);
+ }
+
for (;;)
{
- maxoff = PageGetMaxOffsetNumber(p);
- n = findnext(s, p, n, dir);
+ IndexTuple it;
+ RTSTACK *stk;
- while (n < FirstOffsetNumber || n > maxoff)
+ n = findnext(s, n, dir);
+
+ /* no match on this page, so read in the next stack entry */
+ if (n == InvalidOffsetNumber)
{
- ReleaseBuffer(b);
+ /* if out of stack entries, we're done */
if (so->s_stack == NULL)
+ {
+ ReleaseBuffer(so->curbuf);
+ so->curbuf = InvalidBuffer;
return false;
+ }
stk = so->s_stack;
- b = ReadBuffer(s->indexRelation, stk->rts_blk);
- p = BufferGetPage(b);
- maxoff = PageGetMaxOffsetNumber(p);
+ so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
+ stk->rts_blk);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@@ -172,33 +138,41 @@
so->s_stack = stk->rts_parent;
pfree(stk);
- n = findnext(s, p, n, dir);
+ continue;
}
+
if (po->flags & F_LEAF)
{
- ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
+ ItemPointerSet(&(s->currentItemData),
+ BufferGetBlockNumber(so->curbuf),
+ n);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
s->xs_ctup.t_self = it->t_tid;
-
- ReleaseBuffer(b);
return true;
}
else
{
+ BlockNumber blk;
+
stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n;
- stk->rts_blk = BufferGetBlockNumber(b);
+ stk->rts_blk = BufferGetBlockNumber(so->curbuf);
stk->rts_parent = so->s_stack;
so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid));
- ReleaseBuffer(b);
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
+ /*
+ * Note that we release the pin on the page as we descend
+ * down the tree, even though there's a good chance we'll
+ * eventually need to re-read the buffer later in this
+ * scan. This may or may not be optimal, but it doesn't
+ * seem likely to make a huge performance difference
+ * either way.
+ */
+ so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@@ -210,16 +184,19 @@
}
static OffsetNumber
-findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
+findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
{
OffsetNumber maxoff;
IndexTuple it;
RTreePageOpaque po;
RTreeScanOpaque so;
+ Page p;
+ so = (RTreeScanOpaque) s->opaque;
+ p = BufferGetPage(so->curbuf);
+
maxoff = PageGetMaxOffsetNumber(p);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
- so = (RTreeScanOpaque) s->opaque;
/*
* If we modified the index during the scan, we may have a pointer to
@@ -256,28 +233,8 @@
n = OffsetNumberNext(n);
}
- return n;
+ if (n >= FirstOffsetNumber && n <= maxoff)
+ return n; /* found a match on this page */
+ else
+ return InvalidOffsetNumber; /* no match, go to next page */
}
-
-static bool
-rtscancache(IndexScanDesc s, ScanDirection dir)
-{
- Buffer b;
- Page p;
- OffsetNumber n;
- IndexTuple it;
-
- if (!(ScanDirectionIsNoMovement(dir)
- && ItemPointerIsValid(&(s->currentItemData))))
- return false;
-
- b = ReadBuffer(s->indexRelation,
- ItemPointerGetBlockNumber(&(s->currentItemData)));
- p = BufferGetPage(b);
- n = ItemPointerGetOffsetNumber(&(s->currentItemData));
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- s->xs_ctup.t_self = it->t_tid;
- ReleaseBuffer(b);
-
- return true;
-}
--- src/backend/access/rtree/rtree.c
+++ src/backend/access/rtree/rtree.c
@@ -280,12 +280,8 @@
do
{
- /* let go of current buffer before getting next */
- if (buffer != InvalidBuffer)
- ReleaseBuffer(buffer);
-
- /* get next buffer */
- buffer = ReadBuffer(r, blk);
+ /* release the current buffer, read in the next one */
+ buffer = ReleaseAndReadBuffer(buffer, r, blk);
page = (Page) BufferGetPage(buffer);
opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
--- src/backend/access/rtree/rtscan.c
+++ src/backend/access/rtree/rtscan.c
@@ -89,12 +89,24 @@
freestack(p->s_markstk);
p->s_stack = p->s_markstk = NULL;
p->s_flags = 0x0;
+ /* drop pins on buffers -- no locks held */
+ if (BufferIsValid(p->curbuf))
+ {
+ ReleaseBuffer(p->curbuf);
+ p->curbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->markbuf))
+ {
+ ReleaseBuffer(p->markbuf);
+ p->markbuf = InvalidBuffer;
+ }
}
else
{
/* initialize opaque data */
p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
p->s_stack = p->s_markstk = NULL;
+ p->curbuf = p->markbuf = InvalidBuffer;
p->s_internalNKey = s->numberOfKeys;
p->s_flags = 0x0;
s->opaque = p;
@@ -175,6 +187,18 @@
freestack(p->s_markstk);
p->s_markstk = o;
+ /* Update markbuf: make sure to bump ref count on curbuf */
+ if (BufferIsValid(p->markbuf))
+ {
+ ReleaseBuffer(p->markbuf);
+ p->markbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->curbuf))
+ {
+ IncrBufferRefCount(p->curbuf);
+ p->markbuf = p->curbuf;
+ }
+
PG_RETURN_VOID();
}
@@ -211,6 +235,18 @@
freestack(p->s_stack);
p->s_stack = o;
+ /* Update curbuf; be sure to bump ref count on markbuf */
+ if (BufferIsValid(p->curbuf))
+ {
+ ReleaseBuffer(p->curbuf);
+ p->curbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->markbuf))
+ {
+ IncrBufferRefCount(p->markbuf);
+ p->curbuf = p->markbuf;
+ }
+
PG_RETURN_VOID();
}
@@ -226,11 +262,14 @@
{
freestack(p->s_stack);
freestack(p->s_markstk);
+ if (BufferIsValid(p->curbuf))
+ ReleaseBuffer(p->curbuf);
+ if (BufferIsValid(p->markbuf))
+ ReleaseBuffer(p->markbuf);
pfree(s->opaque);
}
rtdropscan(s);
- /* XXX don't unset read lock -- two-phase locking */
PG_RETURN_VOID();
}
--- src/include/access/rtree.h
+++ src/include/access/rtree.h
@@ -59,11 +59,14 @@
/*
* When we're doing a scan, we need to keep track of the parent stack
* for the marked and current items. Also, rtrees have the following
- * property: if you're looking for the box (1,1,2,2), on the internal
- * nodes you have to search for all boxes that *contain* (1,1,2,2), and
- * not the ones that match it. We have a private scan key for internal
- * nodes in the opaque structure for rtrees for this reason. See
- * access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
+ * property: if you're looking for the box (1,1,2,2), on the internal
+ * nodes you have to search for all boxes that *contain* (1,1,2,2),
+ * and not the ones that match it. We have a private scan key for
+ * internal nodes in the opaque structure for rtrees for this reason.
+ * See access/index-rtree/rtscan.c and rtstrat.c for how it gets
+ * initialized. We also keep pins on the scan's current buffer and
+ * marked buffer, if any: this avoids the need to invoke ReadBuffer()
+ * for each tuple produced by the index scan.
*/
typedef struct RTreeScanOpaqueData
@@ -73,6 +76,8 @@
uint16 s_flags;
int s_internalNKey;
ScanKey s_internalKey;
+ Buffer curbuf;
+ Buffer markbuf;
} RTreeScanOpaqueData;
typedef RTreeScanOpaqueData *RTreeScanOpaque;
---------------------------(end of broadcast)---------------------------
TIP 8: explain analyze is your friend