Hello! Rebased/reworked to align with the changes in [0].
Best regards.

[0]: https://www.postgresql.org/message-id/flat/CAH2-WznZBhWqDBDVGh1VhVBLgLqaYHEkPhmVV7mJCr1Y3ZQhQQ%40mail.gmail.com#d6f1debb1405d1b4a983cbb46b24f41b
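In case it helps review, here is roughly what the isolation test in the
first patch arranges (a condensed sketch of the spec below, not the literal
file):

    -- one row to fight over
    CREATE UNLOGGED TABLE test.tbl(i int PRIMARY KEY, n int);
    INSERT INTO test.tbl VALUES (42, 1);

    -- s1: starts a SnapshotDirty scan of the primary key index and pauses
    -- at the injection point after collecting TIDs from the leaf page but
    -- before fetching the heap tuple
    INSERT INTO test.tbl VALUES (42, 1)
        ON CONFLICT (i) DO UPDATE SET n = EXCLUDED.n + 1;

    -- s2: concurrently deletes the old tuple version and inserts a new
    -- one with a different TID on the same page
    UPDATE test.tbl SET n = n + 1 WHERE i = 42;

Once s1 resumes, it fetches only the stale TID it cached, sees no live
conflicting tuple, and the check_exclusion_or_unique_constraint_no_conflict
injection point (attached with mode 'error') fires.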
From 13934bd0b3588e71a96228f31395ab661e0e749d Mon Sep 17 00:00:00 2001
From: nkey <michail.nikolaev@gmail.com>
Date: Sat, 23 Nov 2024 13:25:11 +0100
Subject: [PATCH v7 1/2] Add an isolation test to reproduce a dirty snapshot scan issue

This commit introduces an isolation test to reliably reproduce the issue
where non-MVCC index scans using SnapshotDirty can miss tuples due to
concurrent modifications. When using non-MVCC snapshot types, a scan could
miss tuples if concurrent transactions delete existing tuples and insert
new ones with different TIDs on the same page.
---
 src/backend/access/index/indexam.c            |  8 ++++
 src/backend/executor/execIndexing.c           |  3 ++
 src/test/modules/injection_points/Makefile    |  2 +-
 .../expected/dirty_index_scan.out             | 27 ++++++++++++++
 src/test/modules/injection_points/meson.build |  1 +
 .../specs/dirty_index_scan.spec               | 37 +++++++++++++++++++
 6 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 src/test/modules/injection_points/expected/dirty_index_scan.out
 create mode 100644 src/test/modules/injection_points/specs/dirty_index_scan.spec

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 219df1971da..676e06c095c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -57,6 +57,7 @@
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
+#include "utils/injection_point.h"
 
 
 /* ----------------------------------------------------------------
@@ -741,6 +742,13 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
 			 * the index.
 			 */
 			Assert(ItemPointerIsValid(&scan->xs_heaptid));
+#ifdef USE_INJECTION_POINTS
+			if (!IsCatalogRelationOid(scan->indexRelation->rd_id))
+			{
+				INJECTION_POINT("index_getnext_slot_before_fetch", NULL);
+			}
+#endif
+
 			if (index_fetch_heap(scan, slot))
 				return true;
 		}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index bdf862b2406..36748b39e68 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -117,6 +117,7 @@
 #include "utils/multirangetypes.h"
 #include "utils/rangetypes.h"
 #include "utils/snapmgr.h"
+#include "utils/injection_point.h"
 
 /* waitMode argument to check_exclusion_or_unique_constraint() */
 typedef enum
@@ -943,6 +944,8 @@ retry:
 
 	ExecDropSingleTupleTableSlot(existing_slot);
 
+	if (!conflict)
+		INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict", NULL);
 	return !conflict;
 }
 
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index e680991f8d4..b73f8ac80f2 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points"
 REGRESS = injection_points hashagg reindex_conc
 REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic inplace syscache-update-pruned dirty_index_scan
 
 TAP_TESTS = 1
 
diff --git a/src/test/modules/injection_points/expected/dirty_index_scan.out b/src/test/modules/injection_points/expected/dirty_index_scan.out
new file mode 100644
index 00000000000..c286a9fd5b6
--- /dev/null
+++ b/src/test/modules/injection_points/expected/dirty_index_scan.out
@@ -0,0 +1,27 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1_s1 s2_s1 s3_s1
+injection_points_attach
+-----------------------
+
+(1 row)
+
+step s1_s1: INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; <waiting ...>
+step s2_s1: UPDATE test.tbl SET n = n + 1 WHERE i = 42; <waiting ...>
+step s3_s1: 
+  SELECT injection_points_detach('index_getnext_slot_before_fetch');
+  SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+ <waiting ...>
+step s1_s1: <... completed>
+step s2_s1: <... completed>
+step s3_s1: <... completed>
+injection_points_detach
+-----------------------
+
+(1 row)
+
+injection_points_wakeup
+-----------------------
+
+(1 row)
+
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index d61149712fd..bb3869f9a75 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -47,6 +47,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'dirty_index_scan',
     ],
     'runningcheck': false, # see syscache-update-pruned
   },
diff --git a/src/test/modules/injection_points/specs/dirty_index_scan.spec b/src/test/modules/injection_points/specs/dirty_index_scan.spec
new file mode 100644
index 00000000000..373bcaf4929
--- /dev/null
+++ b/src/test/modules/injection_points/specs/dirty_index_scan.spec
@@ -0,0 +1,37 @@
+setup
+{
+	CREATE EXTENSION injection_points;
+	CREATE SCHEMA test;
+	CREATE UNLOGGED TABLE test.tbl(i int primary key, n int);
+	CREATE INDEX tbl_n_idx ON test.tbl(n);
+	INSERT INTO test.tbl VALUES(42,1);
+}
+
+teardown
+{
+	DROP SCHEMA test CASCADE;
+	DROP EXTENSION injection_points;
+}
+
+session s1
+setup	{
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
+	SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
+}
+
+step s1_s1	{ INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; }
+
+session s2
+step s2_s1	{ UPDATE test.tbl SET n = n + 1 WHERE i = 42; }
+
+session s3
+step s3_s1	{
+	SELECT injection_points_detach('index_getnext_slot_before_fetch');
+	SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+}
+
+permutation
+	s1_s1
+	s2_s1(*)
+	s3_s1(s1_s1)
\ No newline at end of file
-- 
2.43.0
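As for how the second patch closes the window (my understanding, in
timeline form):

    s1: reads the leaf page and now keeps the read lock; pauses before
        the heap fetch
    s2: UPDATE writes the new heap tuple, then blocks trying to insert
        the new index entry into the still-locked leaf page
    s1: fetches the old TID; its xmax is s2's xid, still in progress, so
        the dirty snapshot still reports the tuple and the caller can
        wait on s2 and retry instead of concluding there is no conflict
    s1: finishes the page and releases the lock; s2 proceeds

The key point is that retaining the read lock keeps s2 from completing
(and committing) its index insertion while s1 is still working through
the page's tuples.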
From 261361b0016a729a46b6b8a24548645f118fd79b Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu <mihailnikalayeu@gmail.com>
Date: Mon, 9 Jun 2025 23:31:18 +0200
Subject: [PATCH v7 2/2] Fix btree index scan concurrency issues with dirty snapshots

This patch addresses an issue where non-MVCC index scans using
SnapshotDirty or SnapshotSelf could miss tuples due to concurrent
modifications. The fix retains read locks on pages for these special
snapshot types until the scan is done with the page's tuples, preventing
concurrent modifications from causing inconsistent results.

Also update the README to document this special case in the btree locking
mechanism.
---
 src/backend/access/nbtree/README      | 13 ++++++++++++-
 src/backend/access/nbtree/nbtree.c    | 19 ++++++++++++++++++-
 src/backend/access/nbtree/nbtsearch.c | 16 ++++++++++++----
 src/backend/access/nbtree/nbtutils.c  |  4 +++-
 src/include/access/nbtree.h           |  1 +
 5 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 53d4a61dc3f..a9280415633 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we came
 from.  (Actually, it's even harder than that; see page deletion discussion
 below.)
 
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(with an exception for SnapshotDirty and SnapshotSelf scans; see below).
 To minimize lock/unlock traffic, an index scan always searches a leaf page
 to identify all the matching items at once, copying their heap tuple IDs
 into backend-local storage.  The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
 (though this requires extra handling to account for concurrent splits of
 the left sibling; see detailed move-left algorithm below).
 
+Despite these mechanics, inconsistent results may still occur during
+non-MVCC scans (SnapshotDirty and SnapshotSelf).  This can happen if a
+concurrent transaction deletes a tuple and inserts a new tuple with a new
+TID on the same page.  If the scan has already visited the page and cached
+its content in backend-local storage, it might miss both the old tuple
+(deleted) and the new one (the page is not re-read).  To address this, for
+SnapshotDirty and SnapshotSelf scans we retain the read lock on the page
+until we're completely done processing its tuples, preventing concurrent
+modifications that could lead to inconsistent results.
+
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to.  In a few places
 it is necessary to lock the next page before releasing the current one.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 03a1d7b027a..f9ef256561d 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -393,10 +393,22 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		else if (!so->dropLock)	/* _bt_killitems always releases lock */
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
 	}
 
+	/*
+	 * For SnapshotDirty and SnapshotSelf scans, we don't unlock the buffer
+	 * here; the lock is kept until we're completely done with this page.
+	 * This prevents concurrent modifications from causing inconsistent
+	 * results during non-MVCC scans.
+	 *
+	 * See nbtree/README for information about SnapshotDirty and SnapshotSelf.
+	 */
+	so->dropLock = scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY
+		&& scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF;
 	/*
 	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
 	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
@@ -418,7 +430,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 *
 	 * See nbtree/README section on making concurrent TID recycling safe.
 	 */
-	so->dropPin = (!scan->xs_want_itup &&
+	so->dropPin = (so->dropLock &&
+				   !scan->xs_want_itup &&
 				   IsMVCCSnapshot(scan->xs_snapshot) &&
 				   RelationNeedsWAL(scan->indexRelation) &&
 				   scan->heapRelation != NULL);
@@ -475,6 +488,8 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		else if (!so->dropLock)	/* _bt_killitems always releases lock */
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
@@ -555,6 +570,8 @@ btrestrpos(IndexScanDesc scan)
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
+			else if (!so->dropLock)	/* _bt_killitems always releases lock */
+				_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 070f14c8b91..6e7f3c76162 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -57,12 +57,14 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
  * _bt_drop_lock_and_maybe_pin()
  *
- * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
+ * Unlock so->currPos.buf if so->dropLock.  If scan is so->dropPin, drop the pin, too.
 * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
 */
 static inline void
 _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
+	if (!so->dropLock)
+		return;
 	if (!so->dropPin)
 	{
 		/* Just drop the lock (not the pin) */
@@ -1532,7 +1534,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 * _bt_next() -- Get the next item in a scan.
 *
 * On entry, so->currPos describes the current page, which may be pinned
- * but is not locked, and so->currPos.itemIndex identifies which item was
+ * but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
+ * the page remains locked), and so->currPos.itemIndex identifies which item was
 * previously returned.
 *
 * On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2111,7 +2114,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 * Wrapper on _bt_readnextpage that performs final steps for the current page.
 *
 * On entry, so->currPos must be valid.  Its buffer will be pinned, though
- * never locked.  (Actually, when so->dropPin there won't even be a pin held,
+ * never locked, except for SnapshotDirty and SnapshotSelf scans where the
+ * buffer remains locked until we're done with all tuples from the page.
+ * (Actually, when so->dropPin there won't even be a pin held,
 * though so->currPos.currPage must still be set to a valid block number.)
 */
 static bool
@@ -2126,6 +2131,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
 		_bt_killitems(scan);
+	else if (!so->dropLock)	/* _bt_killitems always releases lock */
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 
 	/*
 	 * Before we modify currPos, make a copy of the page data if there was a
@@ -2265,7 +2272,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	}
 
 	/* There's no actually-matching data on the page in so->currPos.buf */
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	if (so->dropLock)
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 
 	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
 	if (!_bt_steppage(scan, dir))
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 29f0dca1b08..a79d8bfc906 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3369,7 +3369,9 @@ _bt_killitems(IndexScanDesc scan)
 		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_lockbuf(rel, so->currPos.buf, BT_READ);
+		/* Re-acquire the read lock only if it was dropped earlier */
+		if (so->dropLock)
+			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 	}
 	else
 	{
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0afe..ca8ebd7a418 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1070,6 +1070,7 @@ typedef struct BTScanOpaqueData
 	/* info about killed items if any (killedItems is NULL if never used) */
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
+	bool		dropLock;		/* drop leaf lock before btgettuple returns? */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
 	/*
-- 
2.43.0
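For anyone wanting to reproduce this locally: with injection points
compiled in (--enable-injection-points, or -Dinjection_points=true under
meson), something like "make -C src/test/modules/injection_points check"
should run the new dirty_index_scan spec. Without the second patch the
permutation should fail at the 'error' injection point; with both patches
applied, all three steps complete as in the expected output.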