On 04/06/2024 01:49, Heikki Linnakangas wrote:
A straightforward fix is to modify RelationFlushRelation() so that if !IsTransactionState(), it just marks the entry as invalid instead of calling RelationClearRelation(). That's what RelationClearRelation() would do anyway, if it didn't hit the assertion first.
Here's a patch with that straightforward fix. Your test case hit the "rd_createSubid != InvalidSubTransactionId" case, I expanded it to also cover the "rd_firstRelfilelocatorSubid != InvalidSubTransactionId" case. Barring objections, I'll commit this later today or tomorrow. Thanks for the report!
I started a new thread with the bigger refactorings I had in mind: https://www.postgresql.org/message-id/9c9e8908-7b3e-4ce7-85a8-00c0e165a3d6%40iki.fi
-- Heikki Linnakangas Neon (https://neon.tech)
From 227f173d1066955a2ab6aba12c508aaa1f416c27 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas <heikki.linnakan...@iki.fi> Date: Wed, 5 Jun 2024 13:29:51 +0300 Subject: [PATCH 1/3] Make RelationFlushRelation() work without ResourceOwner during abort ReorderBufferImmediateInvalidation() executes invalidation messages in an aborted transaction. However, RelationFlushRelation sometimes required a valid resource owner, to temporarily increment the refcount of the relache entry. Commit b8bff07daa worked around that in the main subtransaction abort function, AbortSubTransaction(), but missed this similar case in ReorderBufferImmediateInvalidation(). To fix, introduce a separate function to invalidate a relcache entry. It does the same thing as RelationClearRelation(rebuild==true) does, when outside a transaction, but can be called without incrementing the refcount. Add regression test. Before this fix, it failed with: ERROR: ResourceOwnerEnlarge called after release started Reported-by: Alexander Lakhin <exclus...@gmail.com> Disussion: https://www.postgresql.org/message-id/e56be7d9-14b1-664d-0bfc-00ce97727...@gmail.com --- .../expected/decoding_in_xact.out | 48 +++++++++++++++++ .../test_decoding/sql/decoding_in_xact.sql | 27 ++++++++++ src/backend/access/transam/xact.c | 13 ----- src/backend/utils/cache/relcache.c | 54 ++++++++++++++++--- 4 files changed, 122 insertions(+), 20 deletions(-) diff --git a/contrib/test_decoding/expected/decoding_in_xact.out b/contrib/test_decoding/expected/decoding_in_xact.out index b65253f4630..8555b3d9436 100644 --- a/contrib/test_decoding/expected/decoding_in_xact.out +++ b/contrib/test_decoding/expected/decoding_in_xact.out @@ -79,6 +79,54 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc COMMIT (6 rows) +-- Decoding works in transaction that issues DDL +-- +-- We had issues handling relcache invalidations with these, see +-- https://www.postgresql.org/message-id/e56be7d9-14b1-664d-0bfc-00ce97727...@gmail.com +CREATE TABLE tbl_created_outside_xact(id SERIAL PRIMARY KEY); +BEGIN; + -- TRUNCATE changes the relfilenode and sends relcache invalidation + TRUNCATE tbl_created_outside_xact; + INSERT INTO tbl_created_outside_xact(id) VALUES('1'); + -- don't show yet, haven't committed + SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +------ +(0 rows) + +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +-------------------------------------------------------------- + BEGIN + table public.tbl_created_outside_xact: TRUNCATE: (no-flags) + table public.tbl_created_outside_xact: INSERT: id[integer]:1 + COMMIT +(4 rows) + +set debug_logical_replication_streaming=immediate; +BEGIN; + CREATE TABLE tbl_created_in_xact(id SERIAL PRIMARY KEY); + INSERT INTO tbl_created_in_xact VALUES (1); + CHECKPOINT; -- Force WAL flush, so that the above changes will be streamed + SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); + data +------------------------------------------ + opening a streamed block for transaction + streaming change for transaction + closing a streamed block for transaction +(3 rows) + +COMMIT; +reset debug_logical_replication_streaming; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +--------------------------------------------------------- + BEGIN + table public.tbl_created_in_xact: INSERT: id[integer]:1 + COMMIT +(3 rows) + SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); ?column? ---------- diff --git a/contrib/test_decoding/sql/decoding_in_xact.sql b/contrib/test_decoding/sql/decoding_in_xact.sql index 108782dc2e9..841a92fa780 100644 --- a/contrib/test_decoding/sql/decoding_in_xact.sql +++ b/contrib/test_decoding/sql/decoding_in_xact.sql @@ -38,4 +38,31 @@ COMMIT; INSERT INTO nobarf(data) VALUES('3'); SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); +-- Decoding works in transaction that issues DDL +-- +-- We had issues handling relcache invalidations with these, see +-- https://www.postgresql.org/message-id/e56be7d9-14b1-664d-0bfc-00ce97727...@gmail.com +CREATE TABLE tbl_created_outside_xact(id SERIAL PRIMARY KEY); +BEGIN; + -- TRUNCATE changes the relfilenode and sends relcache invalidation + TRUNCATE tbl_created_outside_xact; + INSERT INTO tbl_created_outside_xact(id) VALUES('1'); + + -- don't show yet, haven't committed + SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + +set debug_logical_replication_streaming=immediate; +BEGIN; + CREATE TABLE tbl_created_in_xact(id SERIAL PRIMARY KEY); + INSERT INTO tbl_created_in_xact VALUES (1); + + CHECKPOINT; -- Force WAL flush, so that the above changes will be streamed + + SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); +COMMIT; +reset debug_logical_replication_streaming; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4f4ce757623..9bda1aa6bc6 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5279,20 +5279,7 @@ AbortSubTransaction(void) AtEOSubXact_RelationCache(false, s->subTransactionId, s->parent->subTransactionId); - - - /* - * AtEOSubXact_Inval sometimes needs to temporarily bump the refcount - * on the relcache entries that it processes. We cannot use the - * subtransaction's resource owner anymore, because we've already - * started releasing it. But we can use the parent resource owner. - */ - CurrentResourceOwner = s->parent->curTransactionOwner; - AtEOSubXact_Inval(false); - - CurrentResourceOwner = s->curTransactionOwner; - ResourceOwnerRelease(s->curTransactionOwner, RESOURCE_RELEASE_LOCKS, false, false); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index cc9b0c6524f..35dbb87ae3d 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -275,6 +275,7 @@ static HTAB *OpClassCache = NULL; static void RelationCloseCleanup(Relation relation); static void RelationDestroyRelation(Relation relation, bool remember_tupdesc); +static void RelationInvalidateRelation(Relation relation); static void RelationClearRelation(Relation relation, bool rebuild); static void RelationReloadIndexInfo(Relation relation); @@ -2512,6 +2513,31 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation); } +/* + * RelationInvalidateRelation - mark a relation cache entry as invalid + * + * An entry that's marked as invalid will be reloaded on next access. + */ +static void +RelationInvalidateRelation(Relation relation) +{ + /* + * Make sure smgr and lower levels close the relation's files, if they + * weren't closed already. If the relation is not getting deleted, the + * next smgr access should reopen the files automatically. This ensures + * that the low-level file access state is updated after, say, a vacuum + * truncation. + */ + RelationCloseSmgr(relation); + + /* Free AM cached data, if any */ + if (relation->rd_amcache) + pfree(relation->rd_amcache); + relation->rd_amcache = NULL; + + relation->rd_isvalid = false; +} + /* * RelationClearRelation * @@ -2846,14 +2872,28 @@ RelationFlushRelation(Relation relation) * New relcache entries are always rebuilt, not flushed; else we'd * forget the "new" status of the relation. Ditto for the * new-relfilenumber status. - * - * The rel could have zero refcnt here, so temporarily increment the - * refcnt to ensure it's safe to rebuild it. We can assume that the - * current transaction has some lock on the rel already. */ - RelationIncrementReferenceCount(relation); - RelationClearRelation(relation, true); - RelationDecrementReferenceCount(relation); + if (IsTransactionState() && relation->rd_droppedSubid == InvalidSubTransactionId) + { + /* + * The rel could have zero refcnt here, so temporarily increment + * the refcnt to ensure it's safe to rebuild it. We can assume + * that the current transaction has some lock on the rel already. + */ + RelationIncrementReferenceCount(relation); + RelationClearRelation(relation, true); + RelationDecrementReferenceCount(relation); + } + else + { + /* + * During abort processing, the current resource owner is not + * valid and we cannot hold a refcnt. Without a valid + * transaction, RelationClearRelation() would just mark the rel as + * invalid anyway, so we can do the same directly. + */ + RelationInvalidateRelation(relation); + } } else { -- 2.39.2