On Fri, Mar 27, 2026 at 3:17 PM Melanie Plageman <[email protected]> wrote: > > Done in v47.
Attached v48 does a bit more cleanup. No functional changes. I'm planning to push this soon. I think my remaining question is whether I should move the row marks and result relation bitmaps into the estate. I'm leaning toward not doing that and leaving them in the PlannedStmt. Anyway, if I want to replace the list of result relation RTIs in the PlannedStmt, I have to leave the bitmapset version there. - Melanie
From 04d24039ec7c14672955aaaba37e3aa512858a0d Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Fri, 27 Mar 2026 09:21:22 -0400 Subject: [PATCH v48 1/6] Make it cheap to check if a relation is modified by a query Save the range table indexes of result relations and row mark relations in separate bitmaps in the PlannedStmt. Precomputing them allows cheap membership checks during execution. With a few exceptions, these two groups comprise all relations that will be modified by a query. This includes relations targeted by INSERT, UPDATE, DELETE, and MERGE as well as relations with any row mark (like SELECT FOR UPDATE). A later commit will use this information during scans to control whether or not on-access pruning is allowed to set the visibility map -- which would be counterproductive if the query will modify the page. PlannedStmt->resultRelations is only used in a membership check, so it may make sense to replace its usage with the new resultRelationRelids. Author: Melanie Plageman <[email protected]> Reviewed-by: Andres Freund <[email protected]> Reviewed-by: David Rowley <[email protected]> Reviewed-by: Chao Li <[email protected]> Discussion: https://postgr.es/m/F5CDD1B5-628C-44A1-9F85-3958C626F6A9%40gmail.com --- src/backend/executor/execParallel.c | 2 ++ src/backend/optimizer/plan/planner.c | 19 ++++++++++++++++++- src/include/nodes/plannodes.h | 9 +++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index ac84af294c9..791fcb88de9 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -188,6 +188,8 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->partPruneInfos = estate->es_part_prune_infos; pstmt->rtable = estate->es_range_table; pstmt->unprunableRelids = estate->es_unpruned_relids; + pstmt->resultRelationRelids = estate->es_plannedstmt->resultRelationRelids; + pstmt->rowMarkRelids = 
estate->es_plannedstmt->rowMarkRelids; pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d19800ad6a5..df4c99fc3ff 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -340,8 +340,11 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, RelOptInfo *final_rel; Path *best_path; Plan *top_plan; + Bitmapset *resultRelationRelids = NULL; + Bitmapset *rowMarkRelids = NULL; ListCell *lp, - *lr; + *lr, + *lc; /* * Set up global state for this planner invocation. This data is needed @@ -661,6 +664,20 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->subplans = glob->subplans; result->rewindPlanIDs = glob->rewindPlanIDs; result->rowMarks = glob->finalrowmarks; + + /* + * Compute resultRelationRelids and rowMarkRelids from resultRelations and + * rowMarks for quick access. 
+ */ + foreach(lc, glob->resultRelations) + resultRelationRelids = bms_add_member(resultRelationRelids, + lfirst_int(lc)); + foreach(lc, glob->finalrowmarks) + rowMarkRelids = bms_add_member(rowMarkRelids, + ((PlanRowMark *) lfirst(lc))->rti); + result->resultRelationRelids = resultRelationRelids; + result->rowMarkRelids = rowMarkRelids; + result->relationOids = glob->relationOids; result->invalItems = glob->invalItems; result->paramExecTypes = glob->paramExecTypes; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index b6185825fcb..88be65d7bde 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -121,6 +121,9 @@ typedef struct PlannedStmt /* integer list of RT indexes, or NIL */ List *resultRelations; + /* RT indexes of result relations targeted by INSERT/UPDATE/DELETE/MERGE */ + Bitmapset *resultRelationRelids; + /* list of AppendRelInfo nodes */ List *appendRelations; @@ -138,6 +141,12 @@ typedef struct PlannedStmt /* a list of PlanRowMark's */ List *rowMarks; + /* + * RT indexes of relations with row marks. Useful for quick membership + * checks instead of iterating through rowMarks. + */ + Bitmapset *rowMarkRelids; + /* OIDs of relations the plan depends on */ List *relationOids; -- 2.43.0
From 7c331c575a377b40a1dd1142b23fa3a8692de38f Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Fri, 27 Mar 2026 08:35:00 -0400 Subject: [PATCH v48 2/6] Remove PlannedStmt->resultRelations in favor of resultRelationRelids PlannedStmt->resultRelations was an integer list of range table indexes. Now that we have a bitmapset, which offers cheap membership checks, remove the list and update all consumers to use the bitmapset. --- contrib/pg_overexplain/pg_overexplain.c | 5 +++-- src/backend/executor/execParallel.c | 1 - src/backend/executor/execUtils.c | 2 +- src/backend/optimizer/plan/planner.c | 1 - src/include/nodes/plannodes.h | 4 ---- 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index c2b90493cc6..b4e90909289 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -780,8 +780,9 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) overexplain_bitmapset("Unprunable RTIs", plannedstmt->unprunableRelids, es); if (es->format != EXPLAIN_FORMAT_TEXT || - plannedstmt->resultRelations != NIL) - overexplain_intlist("Result RTIs", plannedstmt->resultRelations, es); + !bms_is_empty(plannedstmt->resultRelationRelids)) + overexplain_bitmapset("Result RTIs", plannedstmt->resultRelationRelids, + es); /* Close group, we're all done */ ExplainCloseGroup("Range Table", "Range Table", false, es); diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 791fcb88de9..1bab6160036 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -191,7 +191,6 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->resultRelationRelids = estate->es_plannedstmt->resultRelationRelids; pstmt->rowMarkRelids = estate->es_plannedstmt->rowMarkRelids; pstmt->permInfos = estate->es_rteperminfos; - pstmt->resultRelations = NIL; pstmt->appendRelations = 
NIL; pstmt->planOrigin = PLAN_STMT_INTERNAL; diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 9886ab06b69..36c5285d252 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -733,7 +733,7 @@ ExecCreateScanSlotFromOuterPlan(EState *estate, bool ExecRelationIsTargetRelation(EState *estate, Index scanrelid) { - return list_member_int(estate->es_plannedstmt->resultRelations, scanrelid); + return bms_is_member(scanrelid, estate->es_plannedstmt->resultRelationRelids); } /* ---------------------------------------------------------------- diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index df4c99fc3ff..9853443209d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -659,7 +659,6 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->prunableRelids); result->permInfos = glob->finalrteperminfos; result->subrtinfos = glob->subrtinfos; - result->resultRelations = glob->resultRelations; result->appendRelations = glob->appendRelations; result->subplans = glob->subplans; result->rewindPlanIDs = glob->rewindPlanIDs; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 88be65d7bde..19e5d814c59 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -117,10 +117,6 @@ typedef struct PlannedStmt */ List *permInfos; - /* rtable indexes of target relations for INSERT/UPDATE/DELETE/MERGE */ - /* integer list of RT indexes, or NIL */ - List *resultRelations; - /* RT indexes of result relations targeted by INSERT/UPDATE/DELETE/MERGE */ Bitmapset *resultRelationRelids; -- 2.43.0
From 05cc37abae70327fda4bee4a392dfebcc08ec3c5 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Thu, 19 Mar 2026 17:05:55 -0400 Subject: [PATCH v48 3/6] Thread flags through begin-scan APIs Add an AM user-settable flags parameter to several of the table scan functions, one table AM callback, and index_beginscan(). This allows users to pass additional context to be used when building the scan descriptors. For index scans, a new flags field is added to IndexFetchTableData, and the heap AM saves the caller-provided flags there. This introduces an extension point for follow-up work to pass per-scan information (such as whether the relation is read-only for the current query) from the executor to the AM layer. Author: Melanie Plageman <[email protected]> Reviewed-by: Andres Freund <[email protected]> Reviewed-by: Tomas Vondra <[email protected]> Reviewed-by: David Rowley <[email protected]> Reviewed-by: Chao Li <[email protected]> Discussion: https://postgr.es/m/F5CDD1B5-628C-44A1-9F85-3958C626F6A9%40gmail.com --- contrib/pgrowlocks/pgrowlocks.c | 2 +- src/backend/access/brin/brin.c | 3 +- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/heap/heapam_handler.c | 9 +- src/backend/access/index/genam.c | 6 +- src/backend/access/index/indexam.c | 13 ++- src/backend/access/nbtree/nbtsort.c | 3 +- src/backend/access/table/tableam.c | 22 ++--- src/backend/commands/constraint.c | 3 +- src/backend/commands/copyto.c | 3 +- src/backend/commands/tablecmds.c | 13 +-- src/backend/commands/typecmds.c | 6 +- src/backend/executor/execIndexing.c | 4 +- src/backend/executor/execReplication.c | 12 ++- src/backend/executor/nodeBitmapHeapscan.c | 3 +- src/backend/executor/nodeIndexonlyscan.c | 9 +- src/backend/executor/nodeIndexscan.c | 12 ++- src/backend/executor/nodeSamplescan.c | 3 +- src/backend/executor/nodeSeqscan.c | 9 +- src/backend/executor/nodeTidrangescan.c | 7 +- src/backend/partitioning/partbounds.c | 3 +- src/backend/utils/adt/selfuncs.c | 3 
+- src/include/access/genam.h | 6 +- src/include/access/heapam.h | 5 +- src/include/access/relscan.h | 6 ++ src/include/access/tableam.h | 103 ++++++++++++++++------ 26 files changed, 185 insertions(+), 86 deletions(-) diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index ff3692c87c4..d164c4c03ad 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -115,7 +115,7 @@ pgrowlocks(PG_FUNCTION_ARGS) RelationGetRelationName(rel)); /* Scan the relation */ - scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL, SO_NONE); hscan = (HeapScanDesc) scan; attinmeta = TupleDescGetAttInMetadata(rsinfo->setDesc); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2a0f8c8e3b8..bdb30752e09 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2844,7 +2844,8 @@ _brin_parallel_scan_and_build(BrinBuildState *state, indexInfo->ii_Concurrent = brinshared->isconcurrent; scan = table_beginscan_parallel(heap, - ParallelTableScanFromBrinShared(brinshared)); + ParallelTableScanFromBrinShared(brinshared), + SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, true, brinbuildCallbackParallel, state, scan); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e54782d9dd8..9d83a495775 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2068,7 +2068,8 @@ _gin_parallel_scan_and_build(GinBuildState *state, indexInfo->ii_Concurrent = ginshared->isconcurrent; scan = table_beginscan_parallel(heap, - ParallelTableScanFromGinBuildShared(ginshared)); + ParallelTableScanFromGinBuildShared(ginshared), + SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, progress, ginBuildCallbackParallel, state, scan); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c 
index d40878928e1..99280cd8159 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -81,11 +81,12 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, uint32 flags) { IndexFetchHeapData *hscan = palloc0_object(IndexFetchHeapData); hscan->xs_base.rel = rel; + hscan->xs_base.flags = flags; hscan->xs_cbuf = InvalidBuffer; hscan->xs_vmbuffer = InvalidBuffer; @@ -763,7 +764,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0, + SO_NONE); index_rescan(indexScan, NULL, 0, NULL, 0); } else @@ -772,7 +774,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, pgstat_progress_update_param(PROGRESS_REPACK_PHASE, PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP); - tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); + tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL, + SO_NONE); heapScan = (HeapScanDesc) tableScan; indexScan = NULL; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 5e89b86a62c..1408989c568 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -455,7 +455,8 @@ systable_beginscan(Relation heapRelation, } sysscan->iscan = index_beginscan(heapRelation, irel, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, + SO_NONE); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -716,7 +717,8 @@ systable_beginscan_ordered(Relation heapRelation, bsysscan = true; sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, + SO_NONE); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; diff --git 
a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index fbfc33159eb..44496ae0963 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -258,7 +258,8 @@ index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys) + int nkeys, int norderbys, + uint32 flags) { IndexScanDesc scan; @@ -285,7 +286,7 @@ index_beginscan(Relation heapRelation, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = table_index_fetch_begin(heapRelation, flags); return scan; } @@ -588,13 +589,17 @@ index_parallelrescan(IndexScanDesc scan) /* * index_beginscan_parallel - join parallel index scan * + * flags is a bitmask of ScanOptions affecting the underlying table scan. No + * SO_INTERNAL_FLAGS are permitted. + * * Caller must be holding suitable locks on the heap and the index. 
*/ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan) + ParallelIndexScanDesc pscan, + uint32 flags) { Snapshot snapshot; IndexScanDesc scan; @@ -616,7 +621,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->instrument = instrument; /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, flags); return scan; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 47a9bda30c9..756dfa3dcf4 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1928,7 +1928,8 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + ParallelTableScanFromBTShared(btshared), + SO_NONE); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index dfda1af412e..86481d7c029 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -118,7 +118,7 @@ table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key) Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); return table_beginscan_common(relation, snapshot, nkeys, key, - NULL, flags); + NULL, flags, SO_NONE); } @@ -163,10 +163,11 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 
flags = SO_TYPE_SEQSCAN | + uint32 internal_flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -176,7 +177,7 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); RegisterSnapshot(snapshot); - flags |= SO_TEMP_SNAPSHOT; + internal_flags |= SO_TEMP_SNAPSHOT; } else { @@ -185,16 +186,17 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) } return table_beginscan_common(relation, snapshot, 0, NULL, - pscan, flags); + pscan, internal_flags, flags); } TableScanDesc table_beginscan_parallel_tidrange(Relation relation, - ParallelTableScanDesc pscan) + ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; TableScanDesc sscan; + uint32 internal_flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -206,7 +208,7 @@ table_beginscan_parallel_tidrange(Relation relation, /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); RegisterSnapshot(snapshot); - flags |= SO_TEMP_SNAPSHOT; + internal_flags |= SO_TEMP_SNAPSHOT; } else { @@ -215,7 +217,7 @@ table_beginscan_parallel_tidrange(Relation relation, } sscan = table_beginscan_common(relation, snapshot, 0, NULL, - pscan, flags); + pscan, internal_flags, flags); return sscan; } @@ -248,7 +250,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, SO_NONE); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 
cc11c47b6f2..421d8c359f0 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, + SO_NONE); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index faf62d959b4..f0e0147c665 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -1336,7 +1336,8 @@ CopyRelationTo(CopyToState cstate, Relation rel, Relation root_rel, uint64 *proc AttrMap *map = NULL; TupleTableSlot *root_slot = NULL; - scandesc = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + scandesc = table_beginscan(rel, GetActiveSnapshot(), 0, NULL, + SO_NONE); slot = table_slot_create(rel, NULL); /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index c69c12dc014..ec0063287d0 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6411,7 +6411,8 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap) * checking all the constraints. 
*/ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(oldrel, snapshot, 0, NULL); + scan = table_beginscan(oldrel, snapshot, 0, NULL, + SO_NONE); /* * Switch to per-tuple memory context and reset it for each tuple @@ -13980,8 +13981,8 @@ validateForeignKeyConstraint(char *conname, */ snapshot = RegisterSnapshot(GetLatestSnapshot()); slot = table_slot_create(rel, NULL); - scan = table_beginscan(rel, snapshot, 0, NULL); - + scan = table_beginscan(rel, snapshot, 0, NULL, + SO_NONE); perTupCxt = AllocSetContextCreate(CurrentMemoryContext, "validateForeignKeyConstraint", ALLOCSET_SMALL_SIZES); @@ -22882,7 +22883,8 @@ MergePartitionsMoveRows(List **wqueue, List *mergingPartitions, Relation newPart /* Scan through the rows. */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(mergingPartition, snapshot, 0, NULL); + scan = table_beginscan(mergingPartition, snapshot, 0, NULL, + SO_NONE); /* * Switch to per-tuple memory context and reset it for each tuple @@ -23346,7 +23348,8 @@ SplitPartitionMoveRows(List **wqueue, Relation rel, Relation splitRel, /* Scan through the rows. 
*/ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(splitRel, snapshot, 0, NULL); + scan = table_beginscan(splitRel, snapshot, 0, NULL, + SO_NONE); /* * Switch to per-tuple memory context and reset it for each tuple diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 3dab6bb5a79..cd38e9cddf4 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -3185,7 +3185,8 @@ validateDomainNotNullConstraint(Oid domainoid) /* Scan all tuples in this relation */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(testrel, snapshot, 0, NULL); + scan = table_beginscan(testrel, snapshot, 0, NULL, + SO_NONE); slot = table_slot_create(testrel, NULL); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3266,7 +3267,8 @@ validateDomainCheckConstraint(Oid domainoid, const char *ccbin, LOCKMODE lockmod /* Scan all tuples in this relation */ snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(testrel, snapshot, 0, NULL); + scan = table_beginscan(testrel, snapshot, 0, NULL, + SO_NONE); slot = table_slot_create(testrel, NULL); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 9d071e495c6..cc6eb3a6ee9 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -815,7 +815,9 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, + &DirtySnapshot, NULL, indnkeyatts, 0, + SO_NONE); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 
2497ee7edc5..fea8991cb04 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -205,7 +205,8 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, + &snap, NULL, skey_attoff, 0, SO_NONE); retry: found = false; @@ -383,7 +384,8 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, /* Start a heap scan. */ InitDirtySnapshot(snap); - scan = table_beginscan(rel, &snap, 0, NULL); + scan = table_beginscan(rel, &snap, 0, NULL, + SO_NONE); scanslot = table_slot_create(rel, NULL); retry: @@ -602,7 +604,8 @@ RelationFindDeletedTupleInfoSeq(Relation rel, TupleTableSlot *searchslot, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). */ - scan = table_beginscan(rel, SnapshotAny, 0, NULL); + scan = table_beginscan(rel, SnapshotAny, 0, NULL, + SO_NONE); scanslot = table_slot_create(rel, NULL); table_rescan(scan, NULL); @@ -666,7 +669,8 @@ RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). 
*/ - scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, + SnapshotAny, NULL, skey_attoff, 0, SO_NONE); index_rescan(scan, skey, skey_attoff, NULL, 0); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 7cf8d23c742..69683d81527 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -148,7 +148,8 @@ BitmapTableScanSetup(BitmapHeapScanState *node) table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + SO_NONE); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9eab81fd1c8..02df40f32c5 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -95,7 +95,8 @@ IndexOnlyNext(IndexOnlyScanState *node) estate->es_snapshot, node->ioss_Instrument, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, + SO_NONE); node->ioss_ScanDesc = scandesc; @@ -794,7 +795,8 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + SO_NONE); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -860,7 +862,8 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + SO_NONE); node->ioss_ScanDesc->xs_want_itup = true; /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 06143e94c5a..3c0b8daf664 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -113,7 +113,8 @@ IndexNext(IndexScanState *node) estate->es_snapshot, node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); 
+ node->iss_NumOrderByKeys, + SO_NONE); node->iss_ScanDesc = scandesc; @@ -209,7 +210,8 @@ IndexNextWithReorder(IndexScanState *node) estate->es_snapshot, node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + SO_NONE); node->iss_ScanDesc = scandesc; @@ -1730,7 +1732,8 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + SO_NONE); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1794,7 +1797,8 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + SO_NONE); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 6b0d65f752f..cf32df33d82 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -298,7 +298,8 @@ tablesample_init(SampleScanState *scanstate) 0, NULL, scanstate->use_bulkread, allow_sync, - scanstate->use_pagemode); + scanstate->use_pagemode, + SO_NONE); } else { diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 8f219f60a93..09ccc65de1c 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -71,7 +71,8 @@ SeqNext(SeqScanState *node) */ scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, - 0, NULL); + 0, NULL, + SO_NONE); node->ss.ss_currentScanDesc = scandesc; } @@ -375,7 +376,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel(node->ss.ss_currentRelation, pscan, + SO_NONE); } /* 
---------------------------------------------------------------- @@ -408,5 +410,6 @@ ExecSeqScanInitializeWorker(SeqScanState *node, pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel(node->ss.ss_currentRelation, pscan, + SO_NONE); } diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 617713bde04..084e4c6ec90 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -245,7 +245,8 @@ TidRangeNext(TidRangeScanState *node) scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation, estate->es_snapshot, &node->trss_mintid, - &node->trss_maxtid); + &node->trss_maxtid, + SO_NONE); node->ss.ss_currentScanDesc = scandesc; } else @@ -460,7 +461,7 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan); + pscan, SO_NONE); } /* ---------------------------------------------------------------- @@ -494,5 +495,5 @@ ExecTidRangeScanInitializeWorker(TidRangeScanState *node, pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan); + pscan, SO_NONE); } diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 0ca312ac27d..f867d1b75a5 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -3362,7 +3362,8 @@ check_default_partition_contents(Relation parent, Relation default_rel, econtext = GetPerTupleExprContext(estate); snapshot = RegisterSnapshot(GetLatestSnapshot()); tupslot = table_slot_create(part_rel, &estate->es_tupleTable); - scan = 
table_beginscan(part_rel, snapshot, 0, NULL); + scan = table_beginscan(part_rel, snapshot, 0, NULL, + SO_NONE); /* * Switch to per-tuple memory context and reset it for each tuple diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 53f85ccde01..4160d2d6e24 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7178,7 +7178,8 @@ get_actual_variable_endpoint(Relation heapRel, index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, NULL, - 1, 0); + 1, 0, + SO_NONE); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 1a27bf060b3..b69320a7fc8 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -158,7 +158,8 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys); + int nkeys, int norderbys, + uint32 flags); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -184,7 +185,8 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan); + ParallelIndexScanDesc pscan, + uint32 flags); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9b403203006..e2e07348f37 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -95,10 +95,7 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; - /* - * For sequential scans and bitmap heap scans. 
The current heap block's - * corresponding page in the visibility map. - */ + /* Current heap block's corresponding page in the visibility map */ Buffer rs_vmbuffer; /* these fields only used in page-at-a-time mode and for bitmap scans */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce340c076f8..960abf6c214 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -122,6 +122,12 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + /* + * Bitmask of ScanOptions affecting the relation. No SO_INTERNAL_FLAGS are + * permitted. + */ + uint32 flags; } IndexFetchTableData; struct IndexScanInstrumentation; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 06084752245..f8d1423b2d0 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -45,6 +45,8 @@ typedef struct ValidateIndexState ValidateIndexState; */ typedef enum ScanOptions { + SO_NONE = 0, + /* one of SO_TYPE_* may be specified */ SO_TYPE_SEQSCAN = 1 << 0, SO_TYPE_BITMAPSCAN = 1 << 1, @@ -65,6 +67,19 @@ typedef enum ScanOptions SO_TEMP_SNAPSHOT = 1 << 9, } ScanOptions; +/* + * Mask of flags that are set internally by the table scan functions and + * shouldn't be passed by callers. Some of these are effectively set by callers + * through parameters to table scan functions (e.g. SO_ALLOW_STRAT/allow_strat), + * however, for now, retain tight control over them and don't allow users to + * pass these themselves to table scan functions. + */ +#define SO_INTERNAL_FLAGS \ + (SO_TYPE_SEQSCAN | SO_TYPE_BITMAPSCAN | SO_TYPE_SAMPLESCAN | \ + SO_TYPE_TIDSCAN | SO_TYPE_TIDRANGESCAN | SO_TYPE_ANALYZE | \ + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | \ + SO_TEMP_SNAPSHOT) + /* * Result codes for table_{update,delete,lock_tuple}, and for visibility * routines inside table AMs. 
@@ -321,8 +336,9 @@ typedef struct TableAmRoutine * `flags` is a bitmask indicating the type of scan (ScanOptions's * SO_TYPE_*, currently only one may be specified), options controlling * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be - * specified, an AM may ignore unsupported ones) and whether the snapshot - * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT). + * specified, an AM may ignore unsupported ones), whether the snapshot + * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT), + * and any number of the other ScanOptions values. */ TableScanDesc (*scan_begin) (Relation rel, Snapshot snapshot, @@ -418,9 +434,12 @@ typedef struct TableAmRoutine * IndexFetchTableData, which the AM will typically embed in a larger * structure with additional information. * + * flags is a bitmask of ScanOptions affecting underlying table scan + * behavior. See scan_begin() for more information on passing these. + * * Tuples for an index scan can then be fetched via index_fetch_tuple. */ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, uint32 flags); /* * Reset index fetch. Typically this will release cross index fetch @@ -871,12 +890,19 @@ extern TupleTableSlot *table_slot_create(Relation relation, List **reglist); * A wrapper around the Table Access Method scan_begin callback, to centralize * error checking. All calls to ->scan_begin() should go through this * function. + * + * The caller-provided user_flags are validated against SO_INTERNAL_FLAGS to + * catch callers that accidentally pass scan-type or other internal flags. 
*/ static TableScanDesc table_beginscan_common(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, ParallelTableScanDesc pscan, - uint32 flags) + uint32 flags, uint32 user_flags) { + Assert((user_flags & SO_INTERNAL_FLAGS) == 0); + Assert((flags & ~SO_INTERNAL_FLAGS) == 0); + flags |= user_flags; + /* * We don't allow scans to be started while CheckXidAlive is set, except * via systable_beginscan() et al. See detailed comments in xact.c where @@ -891,15 +917,18 @@ table_beginscan_common(Relation rel, Snapshot snapshot, int nkeys, /* * Start a scan of `rel`. Returned tuples pass a visibility test of * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys. + * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. */ static inline TableScanDesc table_beginscan(Relation rel, Snapshot snapshot, - int nkeys, ScanKeyData *key) + int nkeys, ScanKeyData *key, uint32 flags) { - uint32 flags = SO_TYPE_SEQSCAN | + uint32 internal_flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; - return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, + internal_flags, flags); } /* @@ -928,7 +957,8 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, if (allow_sync) flags |= SO_ALLOW_SYNC; - return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, + flags, SO_NONE); } /* @@ -936,14 +966,17 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, * TableScanDesc for a bitmap heap scan. Although that scan technology is * really quite unlike a standard seqscan, there is just enough commonality to * make it worth using the same data structure. + * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. 
*/ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, ScanKeyData *key) + int nkeys, ScanKeyData *key, uint32 flags) { - uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + uint32 internal_flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; - return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, + internal_flags, flags); } /* @@ -952,23 +985,26 @@ table_beginscan_bm(Relation rel, Snapshot snapshot, * using the same data structure although the behavior is rather different. * In addition to the options offered by table_beginscan_strat, this call * also allows control of whether page-mode visibility checking is used. + * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. */ static inline TableScanDesc table_beginscan_sampling(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, bool allow_strat, bool allow_sync, - bool allow_pagemode) + bool allow_pagemode, uint32 flags) { - uint32 flags = SO_TYPE_SAMPLESCAN; + uint32 internal_flags = SO_TYPE_SAMPLESCAN; if (allow_strat) - flags |= SO_ALLOW_STRAT; + internal_flags |= SO_ALLOW_STRAT; if (allow_sync) - flags |= SO_ALLOW_SYNC; + internal_flags |= SO_ALLOW_SYNC; if (allow_pagemode) - flags |= SO_ALLOW_PAGEMODE; + internal_flags |= SO_ALLOW_PAGEMODE; - return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, + internal_flags, flags); } /* @@ -981,7 +1017,8 @@ table_beginscan_tid(Relation rel, Snapshot snapshot) { uint32 flags = SO_TYPE_TIDSCAN; - return table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags); + return table_beginscan_common(rel, snapshot, 0, NULL, NULL, + flags, SO_NONE); } /* @@ -994,7 +1031,8 @@ table_beginscan_analyze(Relation rel) { uint32 flags = SO_TYPE_ANALYZE; - return table_beginscan_common(rel, NULL, 0, NULL, NULL, flags); + return table_beginscan_common(rel, 
NULL, 0, NULL, NULL, + flags, SO_NONE); } /* @@ -1055,16 +1093,19 @@ table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableS /* * table_beginscan_tidrange is the entry point for setting up a TableScanDesc * for a TID range scan. + * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. */ static inline TableScanDesc table_beginscan_tidrange(Relation rel, Snapshot snapshot, ItemPointer mintid, - ItemPointer maxtid) + ItemPointer maxtid, uint32 flags) { TableScanDesc sscan; - uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; + uint32 internal_flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; - sscan = table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags); + sscan = table_beginscan_common(rel, snapshot, 0, NULL, NULL, + internal_flags, flags); /* Set the range of TIDs to scan */ sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid); @@ -1136,20 +1177,26 @@ extern void table_parallelscan_initialize(Relation rel, * table_parallelscan_initialize(), for the same relation. The initialization * does not need to have happened in this backend. * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. + * * Caller must hold a suitable lock on the relation. */ extern TableScanDesc table_beginscan_parallel(Relation relation, - ParallelTableScanDesc pscan); + ParallelTableScanDesc pscan, + uint32 flags); /* * Begin a parallel tid range scan. `pscan` needs to have been initialized * with table_parallelscan_initialize(), for the same relation. The * initialization does not need to have happened in this backend. * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. + * * Caller must hold a suitable lock on the relation. */ extern TableScanDesc table_beginscan_parallel_tidrange(Relation relation, - ParallelTableScanDesc pscan); + ParallelTableScanDesc pscan, + uint32 flags); /* * Restart a parallel scan. Call this in the leader process. 
Caller is @@ -1172,11 +1219,15 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) * Prepare to fetch tuples from the relation, as needed when fetching tuples * for an index scan. * + * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. + * * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). */ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel) +table_index_fetch_begin(Relation rel, uint32 flags) { + Assert((flags & SO_INTERNAL_FLAGS) == 0); + /* * We don't allow scans to be started while CheckXidAlive is set, except * via systable_beginscan() et al. See detailed comments in xact.c where @@ -1185,7 +1236,7 @@ table_index_fetch_begin(Relation rel) if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "scan started during logical decoding"); - return rel->rd_tableam->index_fetch_begin(rel); + return rel->rd_tableam->index_fetch_begin(rel, flags); } /* -- 2.43.0
From 239ec276e5bee0f59ae0a91d0bd9eff8842c8a63 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Mon, 2 Mar 2026 16:31:33 -0500 Subject: [PATCH v48 4/6] Pass down information on table modification to scan node Pass down information to sequential scan, index [only] scan, bitmap table scan, sample scan, and TID range scan nodes on whether or not the query modifies the relation being scanned. A later commit will use this information to update the VM during on-access pruning only if the relation is not modified by the query. Author: Melanie Plageman <[email protected]> Reviewed-by: Andres Freund <[email protected]> Reviewed-by: Andrey Borodin <[email protected]> Reviewed-by: Tomas Vondra <[email protected]> Reviewed-by: Chao Li <[email protected]> Discussion: https://postgr.es/m/4379FDA3-9446-4E2C-9C15-32EFE8D4F31B%40yandex-team.ru --- src/backend/executor/execUtils.c | 21 +++++++++++++++++++++ src/backend/executor/nodeBitmapHeapscan.c | 3 ++- src/backend/executor/nodeIndexonlyscan.c | 9 ++++++--- src/backend/executor/nodeIndexscan.c | 12 ++++++++---- src/backend/executor/nodeSamplescan.c | 3 ++- src/backend/executor/nodeSeqscan.c | 10 +++++++--- src/backend/executor/nodeTidrangescan.c | 11 ++++++++--- src/include/access/tableam.h | 3 +++ src/include/executor/executor.h | 2 ++ 9 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 36c5285d252..f090de49921 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -736,6 +736,27 @@ ExecRelationIsTargetRelation(EState *estate, Index scanrelid) return bms_is_member(scanrelid, estate->es_plannedstmt->resultRelationRelids); } +/* + * Return true if the scan node's relation is not modified by the query. + * + * This is not perfectly accurate. INSERT ... 
SELECT from the same table does + * not add the scan relation to resultRelationRelids, so it will be reported + * as read-only even though the query modifies it. + * + * Conversely, when any relation in the query has a modifying row mark, all + * other relations get a ROW_MARK_REFERENCE, causing them to be reported as + * not read-only even though they may only be read. + */ +bool +ScanRelIsReadOnly(ScanState *ss) +{ + Index scanrelid = ((Scan *) ss->ps.plan)->scanrelid; + PlannedStmt *pstmt = ss->ps.state->es_plannedstmt; + + return !bms_is_member(scanrelid, pstmt->resultRelationRelids) && + !bms_is_member(scanrelid, pstmt->rowMarkRelids); +} + /* ---------------------------------------------------------------- * ExecOpenScanRelation * diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 69683d81527..73831aed451 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,7 +149,8 @@ BitmapTableScanSetup(BitmapHeapScanState *node) node->ss.ps.state->es_snapshot, 0, NULL, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 02df40f32c5..de6154fd541 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -96,7 +96,8 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->ioss_ScanDesc = scandesc; @@ -796,7 +797,8 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, piscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? 
+ SO_HINT_REL_READ_ONLY : SO_NONE); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -863,7 +865,8 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, piscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->ioss_ScanDesc->xs_want_itup = true; /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 3c0b8daf664..1620d146071 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -114,7 +114,8 @@ IndexNext(IndexScanState *node) node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->iss_ScanDesc = scandesc; @@ -211,7 +212,8 @@ IndexNextWithReorder(IndexScanState *node) node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->iss_ScanDesc = scandesc; @@ -1733,7 +1735,8 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_NumScanKeys, node->iss_NumOrderByKeys, piscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1798,7 +1801,8 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_NumScanKeys, node->iss_NumOrderByKeys, piscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? 
+ SO_HINT_REL_READ_ONLY : SO_NONE); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index cf32df33d82..f3d273e1c5e 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -299,7 +299,8 @@ tablesample_init(SampleScanState *scanstate) scanstate->use_bulkread, allow_sync, scanstate->use_pagemode, - SO_NONE); + ScanRelIsReadOnly(&scanstate->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); } else { diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 09ccc65de1c..04803b0e37d 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -72,7 +72,8 @@ SeqNext(SeqScanState *node) scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, 0, NULL, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->ss.ss_currentScanDesc = scandesc; } @@ -375,9 +376,11 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); } /* ---------------------------------------------------------------- @@ -411,5 +414,6 @@ ExecSeqScanInitializeWorker(SeqScanState *node, pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? 
+ SO_HINT_REL_READ_ONLY : SO_NONE); } diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 084e4c6ec90..4a8fe91b2b3 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -246,7 +246,8 @@ TidRangeNext(TidRangeScanState *node) estate->es_snapshot, &node->trss_mintid, &node->trss_maxtid, - SO_NONE); + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); node->ss.ss_currentScanDesc = scandesc; } else @@ -461,7 +462,9 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan, SO_NONE); + pscan, + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); } /* ---------------------------------------------------------------- @@ -495,5 +498,7 @@ ExecTidRangeScanInitializeWorker(TidRangeScanState *node, pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, - pscan, SO_NONE); + pscan, + ScanRelIsReadOnly(&node->ss) ? + SO_HINT_REL_READ_ONLY : SO_NONE); } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index f8d1423b2d0..68ddabc171a 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -65,6 +65,9 @@ typedef enum ScanOptions /* unregister snapshot at scan end? 
*/ SO_TEMP_SNAPSHOT = 1 << 9, + + /* set if the query doesn't modify the relation */ + SO_HINT_REL_READ_ONLY = 1 << 10, } ScanOptions; /* diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 07f4b1f7490..7979a17e4ec 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -690,6 +690,8 @@ extern void ExecCreateScanSlotFromOuterPlan(EState *estate, extern bool ExecRelationIsTargetRelation(EState *estate, Index scanrelid); +extern bool ScanRelIsReadOnly(ScanState *ss); + extern Relation ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags); extern void ExecInitRangeTable(EState *estate, List *rangeTable, List *permInfos, -- 2.43.0
From e914b4834e613c59935df55a400a9290cc145b33 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Fri, 27 Feb 2026 16:33:40 -0500 Subject: [PATCH v48 5/6] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum and COPY FREEZE marked pages as all-visible or all-frozen. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. Setting the visibility map on-access can avoid write amplification caused by vacuum later needing to set the page all-visible, trigger a write and potentially FPI. It also allows more frequent index-only scans, since they require pages to be marked all-visible in the VM. Author: Melanie Plageman <[email protected]> Reviewed-by: Andres Freund <[email protected]> Reviewed-by: Kirill Reshke <[email protected]> Reviewed-by: Chao Li <[email protected]> Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- src/backend/access/heap/heapam.c | 3 +- src/backend/access/heap/heapam_handler.c | 6 ++- src/backend/access/heap/pruneheap.c | 55 ++++++++++++++++++------ src/backend/access/heap/vacuumlazy.c | 2 +- src/include/access/heapam.h | 3 +- 5 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index eb1f67f31cd..7012ee2c306 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -633,7 +633,8 @@ heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer, &scan->rs_vmbuffer); + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, &scan->rs_vmbuffer, + (sscan->rs_flags & SO_HINT_REL_READ_ONLY)); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 99280cd8159..3433ea93c11 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -149,7 +149,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, */ if (prev_buf != hscan->xs_cbuf) heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, - &hscan->xs_vmbuffer); + &hscan->xs_vmbuffer, + (hscan->xs_base.flags & SO_HINT_REL_READ_ONLY)); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2546,7 +2547,8 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_rd, buffer, &hscan->rs_vmbuffer); + heap_page_prune_opt(scan->rs_rd, buffer, &hscan->rs_vmbuffer, + scan->rs_flags & SO_HINT_REL_READ_ONLY); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 6693af8da7f..7fcfc844d20 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -44,6 +44,8 @@ typedef struct bool mark_unused_now; /* whether to attempt freezing tuples */ bool attempt_freeze; + /* whether to attempt setting the VM */ + bool attempt_set_vm; struct VacuumCutoffs *cutoffs; Relation relation; @@ -232,7 +234,8 @@ static void page_verify_redirects(Page page); static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune, PruneState *prstate); -static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason); +static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason, + bool 
do_prune, bool do_freeze); /* @@ -251,9 +254,21 @@ static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason); * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find * VM corruption during pruning, we will fix it. Caller is responsible for * unpinning *vmbuffer. + * + * rel_read_only is true if we determined at plan time that the query does not + * modify the relation. It is counterproductive to set the VM if the query + * will immediately clear it. + * + * As noted in ScanRelIsReadOnly(), INSERT ... SELECT on the same table will + * report the scan relation as read-only. This is usually harmless in + * practice. It is useful to set scanned pages all-visible that won't be + * inserted into. Pages we do insert to rarely meet the criteria for pruning, + * and those that do are likely to contain in-progress inserts which make the + * page not fully all-visible. */ void -heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer, + bool rel_read_only) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -336,6 +351,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) * current implementation. 
*/ params.options = HEAP_PAGE_PRUNE_ALLOW_FAST_PATH; + if (rel_read_only) + params.options |= HEAP_PAGE_PRUNE_SET_VM; heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc, NULL, NULL); @@ -392,6 +409,7 @@ prune_freeze_setup(PruneFreezeParams *params, /* cutoffs must be provided if we will attempt freezing */ Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs); prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0; + prstate->attempt_set_vm = (params->options & HEAP_PAGE_PRUNE_SET_VM) != 0; prstate->cutoffs = params->cutoffs; prstate->relation = params->relation; prstate->block = BufferGetBlockNumber(params->buffer); @@ -461,9 +479,8 @@ prune_freeze_setup(PruneFreezeParams *params, * We track whether the page will be all-visible/all-frozen at the end of * pruning and freezing. While examining tuple visibility, we'll set * set_all_visible to false if there are tuples on the page not visible to - * all running and future transactions. set_all_visible is always - * maintained but only VACUUM will set the VM if the page ends up being - * all-visible. + * all running and future transactions. If enabled for this scan, we will + * set the VM if the page ends up being all-visible. * * We also keep track of the newest live XID, which is used to calculate * the snapshot conflict horizon for a WAL record setting the VM. @@ -920,21 +937,35 @@ heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum, * This function does not actually set the VM bits or page-level visibility * hint, PD_ALL_VISIBLE. * + * This should be called only after do_freeze has been decided (and do_prune + * has been set), as these factor into our heuristic-based decision. + * * Returns true if one or both VM bits should be set and false otherwise. 
*/ static bool -heap_page_will_set_vm(PruneState *prstate, PruneReason reason) +heap_page_will_set_vm(PruneState *prstate, PruneReason reason, + bool do_prune, bool do_freeze) { - /* - * Though on-access pruning maintains prstate->set_all_visible, we don't - * set the VM on-access for now. - */ - if (reason == PRUNE_ON_ACCESS) + if (!prstate->attempt_set_vm) return false; if (!prstate->set_all_visible) return false; + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. + */ + if (reason == PRUNE_ON_ACCESS && !do_prune && !do_freeze && + (!BufferIsDirty(prstate->buffer) || XLogCheckBufferNeedsBackup(prstate->buffer))) + { + prstate->set_all_visible = false; + prstate->set_all_frozen = false; + return false; + } + prstate->new_vmbits = VISIBILITYMAP_ALL_VISIBLE; if (prstate->set_all_frozen) @@ -1167,7 +1198,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, Assert(!prstate.set_all_frozen || prstate.set_all_visible); Assert(!prstate.set_all_visible || (prstate.lpdead_items == 0)); - do_set_vm = heap_page_will_set_vm(&prstate, params->reason); + do_set_vm = heap_page_will_set_vm(&prstate, params->reason, do_prune, do_freeze); /* * new_vmbits should be 0 regardless of whether or not the page is diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f698c2d899b..24001b27387 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -2021,7 +2021,7 @@ lazy_scan_prune(LVRelState *vacrel, .buffer = buf, .vmbuffer = vmbuffer, .reason = PRUNE_VACUUM_SCAN, - .options = HEAP_PAGE_PRUNE_FREEZE, + .options = HEAP_PAGE_PRUNE_FREEZE | HEAP_PAGE_PRUNE_SET_VM, .vistest = vacrel->vistest, .cutoffs = &vacrel->cutoffs, }; diff --git a/src/include/access/heapam.h 
b/src/include/access/heapam.h index e2e07348f37..f2a009141be 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -43,6 +43,7 @@ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) #define HEAP_PAGE_PRUNE_ALLOW_FAST_PATH (1 << 2) +#define HEAP_PAGE_PRUNE_SET_VM (1 << 3) typedef struct BulkInsertStateData *BulkInsertState; typedef struct GlobalVisState GlobalVisState; @@ -431,7 +432,7 @@ extern TransactionId heap_index_delete_tuples(Relation rel, /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer, - Buffer *vmbuffer); + Buffer *vmbuffer, bool rel_read_only); extern void heap_page_prune_and_freeze(PruneFreezeParams *params, PruneFreezeResult *presult, OffsetNumber *off_loc, -- 2.43.0
From 13f3c314d760bce33ca48ea6d1cde606b62cad4c Mon Sep 17 00:00:00 2001 From: Melanie Plageman <[email protected]> Date: Tue, 29 Jul 2025 16:12:56 -0400 Subject: [PATCH v48 6/6] Set pd_prune_xid on insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that on-access pruning can update the visibility map (VM) during read-only queries, set the page’s pd_prune_xid hint during INSERT and on the new page during UPDATE. This allows heap_page_prune_and_freeze() to set the VM the first time a page is read after being filled with tuples. This may avoid I/O amplification by setting the page all-visible when it is still in shared buffers and allowing later vacuums to skip scanning the page. It also enables index-only scans of newly inserted data much sooner. As a side benefit, this addresses a long-standing note in heap_insert() and heap_multi_insert(): aborted inserts can now be pruned on-access rather than lingering until the next VACUUM. Author: Melanie Plageman <[email protected]> Reviewed-by: Andres Freund <[email protected]> Reviewed-by: Chao Li <[email protected]> Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- src/backend/access/heap/heapam.c | 39 +++++++++++++++++---------- src/backend/access/heap/heapam_xlog.c | 19 ++++++++++++- src/backend/access/heap/pruneheap.c | 18 ++++++------- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 7012ee2c306..3b020d910d7 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2154,6 +2154,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, TransactionId xid = GetCurrentTransactionId(); HeapTuple heaptup; Buffer buffer; + Page page; Buffer vmbuffer = InvalidBuffer; bool all_visible_cleared = false; @@ -2180,6 +2181,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, &vmbuffer, 
NULL, 0); + page = BufferGetPage(buffer); + /* * We're about to do the actual insert -- but check for conflict first, to * avoid possibly having to roll back work we've just done. @@ -2203,25 +2206,30 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, RelationPutHeapTuple(relation, buffer, heaptup, (options & HEAP_INSERT_SPECULATIVE) != 0); - if (PageIsAllVisible(BufferGetPage(buffer))) + if (PageIsAllVisible(page)) { all_visible_cleared = true; - PageClearAllVisible(BufferGetPage(buffer)); + PageClearAllVisible(page); visibilitymap_clear(relation, ItemPointerGetBlockNumber(&(heaptup->t_self)), vmbuffer, VISIBILITYMAP_VALID_BITS); } /* - * XXX Should we set PageSetPrunable on this page ? + * Set pd_prune_xid to trigger heap_page_prune_and_freeze() once the page + * is full so that we can set the page all-visible in the VM on the next + * page access. * - * The inserting transaction may eventually abort thus making this tuple - * DEAD and hence available for pruning. Though we don't want to optimize - * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the - * aborted tuple will never be pruned until next vacuum is triggered. + * Setting pd_prune_xid is also handy if the inserting transaction + * eventually aborts making this tuple DEAD and hence available for + * pruning. If no other tuple in this page is UPDATEd/DELETEd, the aborted + * tuple would never otherwise be pruned until next vacuum is triggered. * - * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + * Don't set it if we are in bootstrap mode or we are inserting a frozen + * tuple, as there is no further pruning/freezing needed in those cases. 
*/ + if (TransactionIdIsNormal(xid) && !(options & HEAP_INSERT_FROZEN)) + PageSetPrunable(page, xid); MarkBufferDirty(buffer); @@ -2231,7 +2239,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xl_heap_insert xlrec; xl_heap_header xlhdr; XLogRecPtr recptr; - Page page = BufferGetPage(buffer); uint8 info = XLOG_HEAP_INSERT; int bufflags = 0; @@ -2596,8 +2603,12 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, } /* - * XXX Should we set PageSetPrunable on this page ? See heap_insert() + * Set pd_prune_xid. See heap_insert() for more on why we do this when + * inserting tuples. This only makes sense if we aren't already + * setting the page frozen in the VM and we're not in bootstrap mode. */ + if (!all_frozen_set && TransactionIdIsNormal(xid)) + PageSetPrunable(page, xid); MarkBufferDirty(buffer); @@ -4139,12 +4150,12 @@ l2: * the subsequent page pruning will be a no-op and the hint will be * cleared. * - * XXX Should we set hint on newbuf as well? If the transaction aborts, - * there would be a prunable tuple in the newbuf; but for now we choose - * not to optimize for aborts. Note that heap_xlog_update must be kept in - * sync if this decision changes. + * We set the new page prunable as well. See heap_insert() for more on why + * we do this when inserting tuples. */ PageSetPrunable(page, xid); + if (newbuf != buffer) + PageSetPrunable(newpage, xid); if (use_hot_update) { diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 1302bb13e18..f3f419d3dc1 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -450,6 +450,14 @@ heap_xlog_insert(XLogReaderState *record) freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + /* + * Set the page prunable to trigger on-access pruning later, which may + * set the page all-visible in the VM. See comments in heap_insert(). 
+ */ + if (TransactionIdIsNormal(XLogRecGetXid(record)) && + !HeapTupleHeaderXminFrozen(htup)) + PageSetPrunable(page, XLogRecGetXid(record)); + PageSetLSN(page, lsn); if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) @@ -599,12 +607,19 @@ heap_xlog_multi_insert(XLogReaderState *record) if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); - /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + /* + * XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible. If + * we are not setting the page frozen, then set the page's prunable + * hint so that we trigger on-access pruning later, which may set the + * page all-visible in the VM. + */ if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) { PageSetAllVisible(page); PageClearPrunable(page); } + else + PageSetPrunable(page, XLogRecGetXid(record)); MarkBufferDirty(buffer); } @@ -921,6 +936,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) freespace = PageGetHeapFreeSpace(npage); PageSetLSN(npage, lsn); + /* See heap_insert() for why we set pd_prune_xid on insert */ + PageSetPrunable(npage, XLogRecGetXid(record)); MarkBufferDirty(nbuffer); } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 7fcfc844d20..fe9b1f16db4 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -286,7 +286,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer, /* * First check whether there's any chance there's something to prune, * determining the appropriate horizon is a waste if there's no prune_xid - * (i.e. no updates/deletes left potentially dead tuples around). + * (i.e. no updates/deletes left potentially dead tuples around and no + * inserts added new tuples that may be visible to all). 
*/ prune_xid = PageGetPruneXid(page); if (!TransactionIdIsValid(prune_xid)) @@ -1927,17 +1928,14 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum) prstate->set_all_visible = false; prstate->set_all_frozen = false; - /* The page should not be marked all-visible */ - if (PageIsAllVisible(page)) - heap_page_fix_vm_corruption(prstate, offnum, - VM_CORRUPT_TUPLE_VISIBILITY); - /* - * If we wanted to optimize for aborts, we might consider marking - * the page prunable when we see INSERT_IN_PROGRESS. But we - * don't. See related decisions about when to mark the page - * prunable in heapam.c. + * Though there is nothing "prunable" on the page, we maintain + * pd_prune_xid for inserts so that we have the opportunity to + * set the page all-visible during the next round of pruning. */ + heap_prune_record_prunable(prstate, + HeapTupleHeaderGetXmin(htup), + offnum); break; case HEAPTUPLE_DELETE_IN_PROGRESS: -- 2.43.0
