Hello, Matthias!
Updated patches attached.
Changes:
* cleanup test logic a little bit
* resolve issue with rescan in GIST (item->blkno == InvalidBlockNumber)
* move test to the main isolation suite
* add test for SpGist
* update comment I mentioned before
* allow GIST to set LP_DEAD in cases where it is safe even if the LSN has been updated
Also, seems like SP-GIST version is broken, it fails like this:
TRAP: failed Assert("BufferIsValid(so->pagePin)"), File:
"../src/backend/access/spgist/spgscan.c", Line: 1139, PID: 612214
FETCH(ExceptionalCondition+0xbe)[0x644a0b9dfdbc]
FETCH(spggettuple+0x289)[0x644a0b3743c6]
FETCH(index_getnext_tid+0x166)[0x644a0b3382f7]
FETCH(+0x3b392b)[0x644a0b56d92b]
FETCH(+0x3887df)[0x644a0b5427df]
FETCH(ExecScan+0x77)[0x644a0b542858]
FETCH(+0x3b3b9b)[0x644a0b56db9b]
FETCH(+0x376d8b)[0x644a0b530d8b]
FETCH(+0x379bd9)[0x644a0b533bd9]
FETCH(standard_ExecutorRun+0x19f)[0x644a0b531457]
FETCH(ExecutorRun+0x5a)[0x644a0b5312b5]
FETCH(+0x6276dc)[0x644a0b7e16dc]
FETCH(+0x628936)[0x644a0b7e2936]
FETCH(PortalRunFetch+0x1a0)[0x644a0b7e229c]
FETCH(PerformPortalFetch+0x13b)[0x644a0b49d7e5]
FETCH(standard_ProcessUtility+0x5f0)[0x644a0b7e3aab]
FETCH(ProcessUtility+0x140)[0x644a0b7e34b4]
FETCH(+0x627ceb)[0x644a0b7e1ceb]
FETCH(+0x627a28)[0x644a0b7e1a28]
FETCH(PortalRun+0x273)[0x644a0b7e12bb]
FETCH(+0x61fae1)[0x644a0b7d9ae1]
FETCH(PostgresMain+0x9eb)[0x644a0b7df170]
FETCH(+0x61b3e2)[0x644a0b7d53e2]
FETCH(postmaster_child_launch+0x137)[0x644a0b6e6e2d]
FETCH(+0x53384b)[0x644a0b6ed84b]
FETCH(+0x530f31)[0x644a0b6eaf31]
FETCH(PostmasterMain+0x161f)[0x644a0b6ea812]
FETCH(main+0x3a1)[0x644a0b5c29cf]
Best regards,
Mikhail.
>
From 59b7746c96cca144c1d1d0362a96d8aa019e2a23 Mon Sep 17 00:00:00 2001
From: nkey <[email protected]>
Date: Fri, 10 Jan 2025 17:55:30 +0100
Subject: [PATCH v5 2/3] RFC: Extend buffer pinning for GIST IOS
This should fix issues with incorrect results when a GIST IOS encounters tuples removed from pages by a concurrent vacuum operation.
Also, add the ability to set LP_DEAD bits in more cases of IOS scans over GIST.
---
src/backend/access/gist/README | 16 ++++
src/backend/access/gist/gistget.c | 46 +++++++++--
src/backend/access/gist/gistscan.c | 115 ++++++++++++++++++++++++---
src/backend/access/gist/gistvacuum.c | 6 +-
src/include/access/gist_private.h | 2 +
5 files changed, 166 insertions(+), 19 deletions(-)
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 8015ff19f05..c7c2afad088 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -287,6 +287,22 @@ would complicate the insertion algorithm. So when an insertion sees a page
with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
crashed in the middle to completion by adding the downlink in the parent.
+Index-only scans and VACUUM
+---------------------------
+
+Index-only scans require that any tuple returned by the index scan has not
+been removed from the index by a call to ambulkdelete through VACUUM.
+To ensure this invariant, bulkdelete now requires a buffer cleanup lock, and
+every Index-only scan (IOS) will keep a pin on each page that it is returning
+tuples from. For ordered scans, we keep one pin for each matching leaf tuple,
+for unordered scans we just keep an additional pin while we're still working
+on the page's tuples. This ensures that pages seen by the scan won't be
+cleaned up until after the tuples have been returned.
+
+These longer pin lifetimes can cause buffer exhaustion with messages like "no
+unpinned buffers available" when the index has many pages that have similar
+ordering; but future work can figure out how to best work that out.
+
Buffering build algorithm
-------------------------
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index cc40e928e0a..adf86fed67b 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -35,7 +35,7 @@
* away and the TID was re-used by a completely different heap tuple.
*/
static void
-gistkillitems(IndexScanDesc scan)
+gistkillitems(IndexScanDesc scan, bool pagePinned)
{
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
Buffer buffer;
@@ -60,9 +60,10 @@ gistkillitems(IndexScanDesc scan)
/*
* If page LSN differs it means that the page was modified since the last
* read. killedItems could be not valid so LP_DEAD hints applying is not
- * safe.
+ * safe. But if the page was pinned, it is safe, because VACUUM is
+ * unable to remove tuples due to the locking protocol.
*/
- if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+ if (!pagePinned && BufferGetLSNAtomic(buffer) != so->curPageLSN)
{
UnlockReleaseBuffer(buffer);
so->numKilled = 0; /* reset counter */
@@ -395,6 +396,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
}
so->nPageData = so->curPageData = 0;
+ Assert(so->pagePin == InvalidBuffer);
scan->xs_hitup = NULL; /* might point into pageDataCxt */
if (so->pageDataCxt)
MemoryContextReset(so->pageDataCxt);
@@ -402,7 +404,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
/*
* We save the LSN of the page as we read it, so that we know whether it
* safe to apply LP_DEAD hints to the page later. This allows us to drop
- * the pin for MVCC scans, which allows vacuum to avoid blocking.
+ * the pin for MVCC scans (except index-only scans), which allows vacuum
+ * to avoid blocking.
*/
so->curPageLSN = BufferGetLSNAtomic(buffer);
@@ -460,6 +463,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
so->pageData[so->nPageData].heapPtr = it->t_tid;
so->pageData[so->nPageData].recheck = recheck;
so->pageData[so->nPageData].offnum = i;
+ so->pageData[so->nPageData].buffer = InvalidBuffer;
/*
* In an index-only scan, also fetch the data from the tuple. The
@@ -471,7 +475,18 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
so->pageData[so->nPageData].recontup =
gistFetchTuple(giststate, r, it);
MemoryContextSwitchTo(oldcxt);
+
+ /*
+ * Only maintain a single additional buffer pin for unordered
+ * IOS scans; as we have all data already in one place.
+ */
+ if (so->nPageData == 0)
+ {
+ so->pagePin = buffer;
+ IncrBufferRefCount(buffer);
+ }
}
+
so->nPageData++;
}
else
@@ -501,7 +516,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
* In an index-only scan, also fetch the data from the tuple.
*/
if (scan->xs_want_itup)
+ {
item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+ item->data.heap.buffer = buffer;
+ IncrBufferRefCount(buffer);
+ }
}
else
{
@@ -567,6 +586,10 @@ getNextNearest(IndexScanDesc scan)
/* free previously returned tuple */
pfree(scan->xs_hitup);
scan->xs_hitup = NULL;
+
+ Assert(BufferIsValid(so->pagePin));
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
}
do
@@ -588,7 +611,11 @@ getNextNearest(IndexScanDesc scan)
/* in an index-only scan, also return the reconstructed tuple. */
if (scan->xs_want_itup)
+ {
+ Assert(BufferIsValid(item->data.heap.buffer));
scan->xs_hitup = item->data.heap.recontup;
+ so->pagePin = item->data.heap.buffer;
+ }
res = true;
}
else
@@ -688,7 +715,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
&& so->curPageData > 0
&& so->curPageData == so->nPageData)
{
-
if (so->killedItems == NULL)
{
MemoryContext oldCxt =
@@ -704,13 +730,21 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
so->killedItems[so->numKilled++] =
so->pageData[so->curPageData - 1].offnum;
}
+
+ if (scan->xs_want_itup && so->nPageData > 0)
+ {
+ Assert(BufferIsValid(so->pagePin));
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
+ }
+
/* find and process the next index page */
do
{
GISTSearchItem *item;
if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0))
- gistkillitems(scan);
+ gistkillitems(scan, BufferIsValid(so->pagePin));
item = getNextGISTSearchItem(so);
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 700fa959d03..932c2271510 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -110,6 +110,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys)
so->numKilled = 0;
so->curBlkno = InvalidBlockNumber;
so->curPageLSN = InvalidXLogRecPtr;
+ so->pagePin = InvalidBuffer;
scan->opaque = so;
@@ -151,18 +152,73 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
Assert(so->queueCxt == so->giststate->scanCxt);
first_time = true;
}
- else if (so->queueCxt == so->giststate->scanCxt)
- {
- /* second time through */
- so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
- "GiST queue context",
- ALLOCSET_DEFAULT_SIZES);
- first_time = false;
- }
else
{
- /* third or later time through */
- MemoryContextReset(so->queueCxt);
+ /*
+ * In the first scan of a query we allocate IOS items in the scan
+ * context, which is never reset. To not leak this memory, we
+ * manually free the queue entries.
+ */
+ const bool freequeue = so->queueCxt == so->giststate->scanCxt;
+ /*
+ * Index-only scans require that vacuum can't clean up entries that
+ * we're still planning to return, so we hold a pin on the buffer until
+ * we're past the returned item (1 pin count for every index tuple).
+ * When rescan is called, however, we need to clean up the pins that
+ * we still hold, lest we leak them and lose a buffer entry to that
+ * page.
+ */
+ const bool unpinqueue = scan->xs_want_itup;
+
+ if (freequeue || unpinqueue)
+ {
+ while (!pairingheap_is_empty(so->queue))
+ {
+ pairingheap_node *node;
+ GISTSearchItem *item;
+
+ node = pairingheap_remove_first(so->queue);
+ item = pairingheap_container(GISTSearchItem, phNode, node);
+
+ /*
+ * If we need to unpin a buffer for IOS' heap items, do so
+ * now.
+ */
+ if (unpinqueue && item->blkno == InvalidBlockNumber)
+ {
+ Assert(BufferIsValid(item->data.heap.buffer));
+ ReleaseBuffer(item->data.heap.buffer);
+ }
+
+ /*
+ * item->data.heap.recontup is stored in the separate memory
+ * context so->pageDataCxt, which is always reset; so we don't
+ * need to free that.
+ * "item" itself is allocated into the queue context, which is
+ * generally reset in rescan.
+ * However, only in the first scan, we allocate these items
+ * into the main scan context, which isn't reset; so we must
+ * free these items, or else we'd leak the memory for the
+ * duration of the query.
+ */
+ if (freequeue)
+ pfree(item);
+ }
+ }
+
+ if (so->queueCxt == so->giststate->scanCxt)
+ {
+ /* second time through */
+ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
+ "GiST queue context",
+ ALLOCSET_DEFAULT_SIZES);
+ }
+ else
+ {
+ /* third or later time through */
+ MemoryContextReset(so->queueCxt);
+ }
+
first_time = false;
}
@@ -341,6 +397,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
/* any previous xs_hitup will have been pfree'd in context resets above */
scan->xs_hitup = NULL;
+
+ if (scan->xs_want_itup)
+ {
+ if (BufferIsValid(so->pagePin))
+ {
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
+ }
+ }
}
void
@@ -348,6 +413,36 @@ gistendscan(IndexScanDesc scan)
{
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ if (scan->xs_want_itup)
+ {
+ if (BufferIsValid(so->pagePin))
+ {
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
+ }
+
+ /* unpin any leftover buffers */
+ while (!pairingheap_is_empty(so->queue))
+ {
+ pairingheap_node *node;
+ GISTSearchItem *item;
+
+ /*
+ * Note: unlike gistrescan, there is no need to actually free the
+ * items here, as that's handled by memory context reset in the
+ * call to freeGISTstate() below.
+ */
+ node = pairingheap_remove_first(so->queue);
+ item = pairingheap_container(GISTSearchItem, phNode, node);
+
+ if (item->blkno == InvalidBlockNumber)
+ {
+ Assert(BufferIsValid(item->data.heap.buffer));
+ ReleaseBuffer(item->data.heap.buffer);
+ }
+ }
+ }
+
/*
* freeGISTstate is enough to clean up everything made by gistbeginscan,
* as well as the queueCxt if there is a separate context for it.
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index fe0bfb781ca..840b3d586ed 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -289,10 +289,10 @@ restart:
info->strategy);
/*
- * We are not going to stay here for a long time, aggressively grab an
- * exclusive lock.
+ * We are not going to stay here for a long time, aggressively grab a
+ * cleanup lock.
*/
- LockBuffer(buffer, GIST_EXCLUSIVE);
+ LockBufferForCleanup(buffer);
page = (Page) BufferGetPage(buffer);
if (gistPageRecyclable(page))
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 39404ec7cdb..e559117e7d7 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -124,6 +124,7 @@ typedef struct GISTSearchHeapItem
* index-only scans */
OffsetNumber offnum; /* track offset in page to mark tuple as
* LP_DEAD */
+ Buffer buffer; /* buffer to unpin, when IOS */
} GISTSearchHeapItem;
/* Unvisited item, either index page or heap tuple */
@@ -176,6 +177,7 @@ typedef struct GISTScanOpaqueData
OffsetNumber curPageData; /* next item to return */
MemoryContext pageDataCxt; /* context holding the fetched tuples, for
* index-only scans */
+ Buffer pagePin; /* buffer of page, if pinned */
} GISTScanOpaqueData;
typedef GISTScanOpaqueData *GISTScanOpaque;
--
2.43.0
From 5423affdba594ca0c1575f4f35c6ec479f82b216 Mon Sep 17 00:00:00 2001
From: nkey <[email protected]>
Date: Fri, 10 Jan 2025 16:22:29 +0100
Subject: [PATCH v5 1/3] isolation tester showing broken index-only scans with
GiST and SP-GiST
Co-authored-by: Matthias van de Meent <[email protected]>, Michail Nikolaev <[email protected]>
---
.../expected/index-only-scan-gist-vacuum.out | 67 +++++++++++
.../index-only-scan-spgist-vacuum.out | 67 +++++++++++
src/test/isolation/isolation_schedule | 2 +
.../specs/index-only-scan-gist-vacuum.spec | 113 ++++++++++++++++++
.../specs/index-only-scan-spgist-vacuum.spec | 113 ++++++++++++++++++
5 files changed, 362 insertions(+)
create mode 100644 src/test/isolation/expected/index-only-scan-gist-vacuum.out
create mode 100644 src/test/isolation/expected/index-only-scan-spgist-vacuum.out
create mode 100644 src/test/isolation/specs/index-only-scan-gist-vacuum.spec
create mode 100644 src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
diff --git a/src/test/isolation/expected/index-only-scan-gist-vacuum.out b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
new file mode 100644
index 00000000000..19117402f52
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
@@ -0,0 +1,67 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted:
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1:
+ FETCH FROM foo;
+
+ x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all:
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+
+(1 row)
+
+x
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted:
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1:
+ FETCH FROM foo;
+
+a
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all:
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+
+(1 row)
+
+a
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/expected/index-only-scan-spgist-vacuum.out b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
new file mode 100644
index 00000000000..19117402f52
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
@@ -0,0 +1,67 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted:
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1:
+ FETCH FROM foo;
+
+ x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all:
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+
+(1 row)
+
+x
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted:
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1:
+ FETCH FROM foo;
+
+a
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all:
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+
+(1 row)
+
+a
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 143109aa4da..9720c9a2dc8 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -17,6 +17,8 @@ test: partial-index
test: two-ids
test: multiple-row-versions
test: index-only-scan
+test: index-only-scan-gist-vacuum
+test: index-only-scan-spgist-vacuum
test: predicate-lock-hot-tuple
test: update-conflict-out
test: deadlock-simple
diff --git a/src/test/isolation/specs/index-only-scan-gist-vacuum.spec b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
new file mode 100644
index 00000000000..b1688d44fa7
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
@@ -0,0 +1,113 @@
+# index-only-scan test showing wrong results with GiST
+#
+setup
+{
+ -- by using a low fillfactor and a wide tuple we can get multiple blocks
+ -- with just few rows
+ CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+ WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+ INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+ CREATE INDEX ios_gist_a ON ios_needs_cleanup_lock USING gist(a);
+}
+setup
+{
+ VACUUM ios_needs_cleanup_lock;
+}
+
+teardown
+{
+ DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+ SET enable_bitmapscan = false;
+ SET enable_indexonlyscan = true;
+ SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+ FETCH FROM foo;
+}
+
+step s1_fetch_all {
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1 so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+ # delete nearly all rows, to make issue visible
+ s2_mod
+ # create a cursor
+ s1_begin
+ s1_prepare_sorted
+
+ # fetch one row from the cursor, that ensures the index scan portion is done
+ # before the vacuum in the next step
+ s1_fetch_1
+
+ # with the bug this vacuum will mark pages as all-visible that the scan in
+ # the next step then considers all-visible, despite all rows from those
+ # pages having been removed.
+ # Because this should block on buffer-level locks, this won't ever be
+ # considered "blocked" by isolation tester, and so we only have a single
+ # step we can work with concurrently.
+ s2_vacuum (*)
+
+ # if this returns any rows, we're busted
+ s1_fetch_all
+
+ s1_commit
+
+permutation
+ # delete nearly all rows, to make issue visible
+ s2_mod
+ # create a cursor
+ s1_begin
+ s1_prepare_unsorted
+
+ # fetch one row from the cursor, that ensures the index scan portion is done
+ # before the vacuum in the next step
+ s1_fetch_1
+
+ # with the bug this vacuum will mark pages as all-visible that the scan in
+ # the next step then considers all-visible, despite all rows from those
+ # pages having been removed.
+ # Because this should block on buffer-level locks, this won't ever be
+ # considered "blocked" by isolation tester, and so we only have a single
+ # step we can work with concurrently.
+ s2_vacuum (*)
+
+ # if this returns any rows, we're busted
+ s1_fetch_all
+
+ s1_commit
diff --git a/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
new file mode 100644
index 00000000000..b414c5d1695
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
@@ -0,0 +1,113 @@
+# index-only-scan test showing wrong results with SPGiST
+#
+setup
+{
+ -- by using a low fillfactor and a wide tuple we can get multiple blocks
+ -- with just few rows
+ CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+ WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+ INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+ CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING spgist(a);
+}
+setup
+{
+ VACUUM ios_needs_cleanup_lock;
+}
+
+teardown
+{
+ DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+ SET enable_bitmapscan = false;
+ SET enable_indexonlyscan = true;
+ SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+ DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+ FETCH FROM foo;
+}
+
+step s1_fetch_all {
+ SELECT pg_sleep_for(INTERVAL '50ms');
+ FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1 so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+ DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+ # delete nearly all rows, to make issue visible
+ s2_mod
+ # create a cursor
+ s1_begin
+ s1_prepare_sorted
+
+ # fetch one row from the cursor, that ensures the index scan portion is done
+ # before the vacuum in the next step
+ s1_fetch_1
+
+ # with the bug this vacuum will mark pages as all-visible that the scan in
+ # the next step then considers all-visible, despite all rows from those
+ # pages having been removed.
+ # Because this should block on buffer-level locks, this won't ever be
+ # considered "blocked" by isolation tester, and so we only have a single
+ # step we can work with concurrently.
+ s2_vacuum (*)
+
+ # if this returns any rows, we're busted
+ s1_fetch_all
+
+ s1_commit
+
+permutation
+ # delete nearly all rows, to make issue visible
+ s2_mod
+ # create a cursor
+ s1_begin
+ s1_prepare_unsorted
+
+ # fetch one row from the cursor, that ensures the index scan portion is done
+ # before the vacuum in the next step
+ s1_fetch_1
+
+ # with the bug this vacuum will mark pages as all-visible that the scan in
+ # the next step then considers all-visible, despite all rows from those
+ # pages having been removed.
+ # Because this should block on buffer-level locks, this won't ever be
+ # considered "blocked" by isolation tester, and so we only have a single
+ # step we can work with concurrently.
+ s2_vacuum (*)
+
+ # if this returns any rows, we're busted
+ s1_fetch_all
+
+ s1_commit
--
2.43.0
From bf5e033d291a9dcfdd95ca95d576f5b40a8d34c2 Mon Sep 17 00:00:00 2001
From: nkey <[email protected]>
Date: Fri, 10 Jan 2025 18:00:49 +0100
Subject: [PATCH v5 3/3] RFC: Extend buffer pinning for SP-GIST IOS
This should fix issues with incorrect results when a SP-GIST
IOS encounters tuples removed from pages by a concurrent vacuum
operation.
---
src/backend/access/spgist/spgscan.c | 112 ++++++++++++++++++++++----
src/backend/access/spgist/spgvacuum.c | 2 +-
src/include/access/spgist_private.h | 3 +
3 files changed, 100 insertions(+), 17 deletions(-)
diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c
index 986362a777f..32e6a0a8a03 100644
--- a/src/backend/access/spgist/spgscan.c
+++ b/src/backend/access/spgist/spgscan.c
@@ -30,7 +30,8 @@
typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr,
Datum leafValue, bool isNull,
SpGistLeafTuple leafTuple, bool recheck,
- bool recheckDistances, double *distances);
+ bool recheckDistances, double *distances,
+ Buffer pin);
/*
* Pairing heap comparison function for the SpGistSearchItem queue.
@@ -300,6 +301,38 @@ spgPrepareScanKeys(IndexScanDesc scan)
}
}
+/*
+ * Note: This removes all items from the pairingheap.
+ */
+static void
+spgScanEndDropAllPagePins(IndexScanDesc scan, SpGistScanOpaque so)
+{
+ /* Guaranteed no pinned pages */
+ if (so->scanQueue == NULL || !scan->xs_want_itup)
+ return;
+
+ if (so->nPtrs > 0)
+ {
+ Assert(BufferIsValid(so->pagePin));
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
+ }
+
+ while (!pairingheap_is_empty(so->scanQueue))
+ {
+ pairingheap_node *node;
+ SpGistSearchItem *item;
+
+ node = pairingheap_remove_first(so->scanQueue);
+ item = pairingheap_container(SpGistSearchItem, phNode, node);
+ if (!item->isLeaf)
+ continue;
+
+ Assert(BufferIsValid(item->buffer));
+ ReleaseBuffer(item->buffer);
+ }
+}
+
IndexScanDesc
spgbeginscan(Relation rel, int keysz, int orderbysz)
{
@@ -416,6 +449,9 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
/* preprocess scankeys, set up the representation in *so */
spgPrepareScanKeys(scan);
+ /* release any pinned buffers from earlier rescans */
+ spgScanEndDropAllPagePins(scan, so);
+
/* set up starting queue entries */
resetSpGistScanOpaque(so);
@@ -428,6 +464,12 @@ spgendscan(IndexScanDesc scan)
{
SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+ /*
+ * release any pinned buffers from earlier rescans, before we drop their
+ * data by dropping the memory contexts.
+ */
+ spgScanEndDropAllPagePins(scan, so);
+
MemoryContextDelete(so->tempCxt);
MemoryContextDelete(so->traversalCxt);
@@ -460,7 +502,7 @@ spgendscan(IndexScanDesc scan)
static SpGistSearchItem *
spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
Datum leafValue, bool recheck, bool recheckDistances,
- bool isnull, double *distances)
+ bool isnull, double *distances, Buffer addPin)
{
SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances);
@@ -479,6 +521,10 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
datumCopy(leafValue, so->state.attType.attbyval,
so->state.attType.attlen);
+ Assert(BufferIsValid(addPin));
+ IncrBufferRefCount(addPin);
+ item->buffer = addPin;
+
/*
* If we're going to need to reconstruct INCLUDE attributes, store the
* whole leaf tuple so we can get the INCLUDE attributes out of it.
@@ -495,6 +541,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
{
item->value = (Datum) 0;
item->leafTuple = NULL;
+ item->buffer = InvalidBuffer;
}
item->traversalValue = NULL;
item->isLeaf = true;
@@ -513,7 +560,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
static bool
spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
SpGistLeafTuple leafTuple, bool isnull,
- bool *reportedSome, storeRes_func storeRes)
+ bool *reportedSome, storeRes_func storeRes, Buffer buffer)
{
Datum leafValue;
double *distances;
@@ -580,7 +627,8 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
recheck,
recheckDistances,
isnull,
- distances);
+ distances,
+ buffer);
spgAddSearchItemToQueue(so, heapItem);
@@ -591,7 +639,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
/* non-ordered scan, so report the item right away */
Assert(!recheckDistances);
storeRes(so, &leafTuple->heapPtr, leafValue, isnull,
- leafTuple, recheck, false, NULL);
+ leafTuple, recheck, false, NULL, InvalidBuffer);
*reportedSome = true;
}
}
@@ -760,7 +808,7 @@ enum SpGistSpecialOffsetNumbers
static OffsetNumber
spgTestLeafTuple(SpGistScanOpaque so,
SpGistSearchItem *item,
- Page page, OffsetNumber offset,
+ Page page, OffsetNumber offset, Buffer buffer,
bool isnull, bool isroot,
bool *reportedSome,
storeRes_func storeRes)
@@ -799,7 +847,8 @@ spgTestLeafTuple(SpGistScanOpaque so,
Assert(ItemPointerIsValid(&leafTuple->heapPtr));
- spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes);
+ spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes,
+ buffer);
return SGLT_GET_NEXTOFFSET(leafTuple);
}
@@ -835,7 +884,8 @@ redirect:
Assert(so->numberOfNonNullOrderBys > 0);
storeRes(so, &item->heapPtr, item->value, item->isNull,
item->leafTuple, item->recheck,
- item->recheckDistances, item->distances);
+ item->recheckDistances, item->distances,
+ item->buffer);
reportedSome = true;
}
else
@@ -873,7 +923,7 @@ redirect:
/* When root is a leaf, examine all its tuples */
for (offset = FirstOffsetNumber; offset <= max; offset++)
(void) spgTestLeafTuple(so, item, page, offset,
- isnull, true,
+ buffer, isnull, true,
&reportedSome, storeRes);
}
else
@@ -883,10 +933,24 @@ redirect:
{
Assert(offset >= FirstOffsetNumber && offset <= max);
offset = spgTestLeafTuple(so, item, page, offset,
- isnull, false,
+ buffer, isnull, false,
&reportedSome, storeRes);
if (offset == SpGistRedirectOffsetNumber)
+ {
+ Assert(so->nPtrs == 0);
goto redirect;
+ }
+ }
+
+ /*
+ * IOS: Make sure we have one additional pin on the buffer,
+ * so that vacuum won't remove any deleted TIDs and mark
+ * their pages ALL_VISIBLE while we still have a copy.
+ */
+ if (so->want_itup && reportedSome)
+ {
+ IncrBufferRefCount(buffer);
+ so->pagePin = buffer;
}
}
}
@@ -929,9 +993,10 @@ static void
storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr,
Datum leafValue, bool isnull,
SpGistLeafTuple leafTuple, bool recheck,
- bool recheckDistances, double *distances)
+ bool recheckDistances, double *distances,
+ Buffer pin)
{
- Assert(!recheckDistances && !distances);
+ Assert(!recheckDistances && !distances && !BufferIsValid(pin));
tbm_add_tuples(so->tbm, heapPtr, 1, recheck);
so->ntids++;
}
@@ -954,10 +1019,9 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
/* storeRes subroutine for gettuple case */
static void
-storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
- Datum leafValue, bool isnull,
- SpGistLeafTuple leafTuple, bool recheck,
- bool recheckDistances, double *nonNullDistances)
+storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue,
+ bool isnull, SpGistLeafTuple leafTuple, bool recheck,
+ bool recheckDistances, double *nonNullDistances, Buffer pin)
{
Assert(so->nPtrs < MaxIndexTuplesPerPage);
so->heapPtrs[so->nPtrs] = *heapPtr;
@@ -1016,6 +1080,10 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc,
leafDatums,
leafIsnulls);
+
+ /* move the buffer pin, if required */
+ if (BufferIsValid(pin))
+ so->pagePin = pin;
}
so->nPtrs++;
}
@@ -1065,7 +1133,19 @@ spggettuple(IndexScanDesc scan, ScanDirection dir)
for (i = 0; i < so->nPtrs; i++)
pfree(so->reconTups[i]);
+
+ if (so->nPtrs > 0)
+ {
+ Assert(BufferIsValid(so->pagePin));
+ ReleaseBuffer(so->pagePin);
+ so->pagePin = InvalidBuffer;
+ }
}
+ else
+ {
+ Assert(!BufferIsValid(so->pagePin));
+ }
+
so->iPtr = so->nPtrs = 0;
spgWalk(scan->indexRelation, so, false, storeGettuple);
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index 894aefa19e1..f02c270c5cc 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -629,7 +629,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
RBM_NORMAL, bds->info->strategy);
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ LockBufferForCleanup(buffer);
page = (Page) BufferGetPage(buffer);
if (PageIsNew(page))
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
index cb43a278f46..1948e53e2ff 100644
--- a/src/include/access/spgist_private.h
+++ b/src/include/access/spgist_private.h
@@ -175,6 +175,8 @@ typedef struct SpGistSearchItem
bool isLeaf; /* SearchItem is heap item */
bool recheck; /* qual recheck is needed */
bool recheckDistances; /* distance recheck is needed */
+ Buffer buffer; /* buffer pinned for this leaf tuple
+ * (IOS-only) */
/* array with numberOfOrderBys entries */
double distances[FLEXIBLE_ARRAY_MEMBER];
@@ -226,6 +228,7 @@ typedef struct SpGistScanOpaqueData
TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */
int nPtrs; /* number of TIDs found on current page */
int iPtr; /* index for scanning through same */
+ Buffer pagePin; /* output tuple's pinned buffer, if IOS */
ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */
bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */
bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck
--
2.43.0