io_combine_limit 1, effective_io_concurrency 16, read_ahead_kb 16

On Fri, Feb 14, 2025 at 12:18 PM Tomas Vondra <to...@vondra.me> wrote:
>
> Based on off-list discussion with Melanie, I ran a modified version of
> the benchmark, with these changes:

Thanks! It looks like the worst offender is io_combine_limit 1 (128 kB),
effective_io_concurrency 16, read_ahead_kb 16. This is a combination that
people shouldn't run in practice, I think -- an io_combine_limit larger
than the read-ahead. But we do still see regressions with io_combine_limit
1, effective_io_concurrency 16 at other read_ahead_kb values. Therefore
I'd be interested to see that subset (ioc 1, eic 16, different
read_ahead_kb values) with the attached patch.

> FWIW this does not change anything in the detection of sequential access
> patterns, discussed nearby, because the benchmarks started before Andres
> looked into that. If needed, I can easily rerun these tests, I just need
> a patch to apply.

Attached is a patch that has all the commits squashed + removes the
sequential detection.

- Melanie
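
P.S. For the re-run, a minimal sketch of the configuration combination
discussed above -- the device name is only a placeholder for whatever backs
the data directory, and the ALTER SYSTEM calls are just one way to set the
GUCs:

    # OS read-ahead, in kB, for the device backing the data directory
    echo 16 | sudo tee /sys/block/nvme0n1/queue/read_ahead_kb

    # PostgreSQL side of the combination
    psql -c "ALTER SYSTEM SET io_combine_limit = 1;"
    psql -c "ALTER SYSTEM SET effective_io_concurrency = 16;"
    psql -c "SELECT pg_reload_conf();"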
From 0617aebfdd635ba75f3cc7bc44cc9e72431aa9c5 Mon Sep 17 00:00:00 2001 From: Melanie Plageman <melanieplage...@gmail.com> Date: Fri, 17 Jan 2025 16:42:25 -0500 Subject: [PATCH] Streaming BHS --- src/backend/access/gin/ginget.c | 39 +- src/backend/access/gin/ginscan.c | 2 +- src/backend/access/heap/heapam.c | 70 ++++ src/backend/access/heap/heapam_handler.c | 366 ++++++++++--------- src/backend/access/table/tableamapi.c | 2 - src/backend/executor/nodeBitmapHeapscan.c | 424 ++-------------------- src/backend/nodes/tidbitmap.c | 101 +++--- src/backend/optimizer/util/plancat.c | 2 +- src/backend/storage/aio/read_stream.c | 4 +- src/include/access/gin_private.h | 7 +- src/include/access/relscan.h | 8 +- src/include/access/tableam.h | 121 ++---- src/include/nodes/execnodes.h | 23 +- src/include/nodes/tidbitmap.h | 18 +- 14 files changed, 411 insertions(+), 776 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 63dd1f3679f..ac1b749ed98 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -332,10 +332,11 @@ restartScanEntry: entry->list = NULL; entry->nlist = 0; entry->matchBitmap = NULL; - entry->matchResult = NULL; + entry->matchResult.blockno = InvalidBlockNumber; entry->reduceResult = false; entry->predictNumberResult = 0; + /* * we should find entry, and begin scan of posting tree or just store * posting list in memory @@ -824,19 +825,19 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, { /* * If we've exhausted all items on this block, move to next block - * in the bitmap. + * in the bitmap. tbm_private_iterate() sets matchResult->blockno + * to InvalidBlockNumber when the bitmap is exhausted. */ - while (entry->matchResult == NULL || - (entry->matchResult->ntuples >= 0 && - entry->offset >= entry->matchResult->ntuples) || - entry->matchResult->blockno < advancePastBlk || + while ((!BlockNumberIsValid(entry->matchResult.blockno)) || + (entry->matchResult.ntuples >= 0 && + entry->offset >= entry->matchResult.ntuples) || + entry->matchResult.blockno < advancePastBlk || (ItemPointerIsLossyPage(&advancePast) && - entry->matchResult->blockno == advancePastBlk)) + entry->matchResult.blockno == advancePastBlk)) { - entry->matchResult = - tbm_private_iterate(entry->matchIterator); + tbm_private_iterate(entry->matchIterator, &entry->matchResult); - if (entry->matchResult == NULL) + if (!BlockNumberIsValid(entry->matchResult.blockno)) { ItemPointerSetInvalid(&entry->curItem); tbm_end_private_iterate(entry->matchIterator); @@ -847,7 +848,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, /* * Reset counter to the beginning of entry->matchResult. Note: - * entry->offset is still greater than matchResult->ntuples if + * entry->offset is still greater than matchResult.ntuples if * matchResult is lossy. So, on next call we will get next * result from TIDBitmap. */ @@ -860,10 +861,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * We're now on the first page after advancePast which has any * items on it. If it's a lossy result, return that. */ - if (entry->matchResult->ntuples < 0) + if (entry->matchResult.ntuples < 0) { ItemPointerSetLossyPage(&entry->curItem, - entry->matchResult->blockno); + entry->matchResult.blockno); /* * We might as well fall out of the loop; we could not @@ -877,27 +878,27 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * Not a lossy page. Skip over any offsets <= advancePast, and * return that. 
*/ - if (entry->matchResult->blockno == advancePastBlk) + if (entry->matchResult.blockno == advancePastBlk) { /* * First, do a quick check against the last offset on the * page. If that's > advancePast, so are all the other * offsets, so just go back to the top to get the next page. */ - if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) + if (entry->matchResult.offsets[entry->matchResult.ntuples - 1] <= advancePastOff) { - entry->offset = entry->matchResult->ntuples; + entry->offset = entry->matchResult.ntuples; continue; } /* Otherwise scan to find the first item > advancePast */ - while (entry->matchResult->offsets[entry->offset] <= advancePastOff) + while (entry->matchResult.offsets[entry->offset] <= advancePastOff) entry->offset++; } ItemPointerSet(&entry->curItem, - entry->matchResult->blockno, - entry->matchResult->offsets[entry->offset]); + entry->matchResult.blockno, + entry->matchResult.offsets[entry->offset]); entry->offset++; /* Done unless we need to reduce the result */ diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index 7d1e6615260..625140fdf25 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -106,7 +106,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, ItemPointerSetMin(&scanEntry->curItem); scanEntry->matchBitmap = NULL; scanEntry->matchIterator = NULL; - scanEntry->matchResult = NULL; + scanEntry->matchResult.blockno = InvalidBlockNumber; scanEntry->list = NULL; scanEntry->nlist = 0; scanEntry->offset = InvalidOffsetNumber; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ea0a12b39af..45f419f849f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -279,6 +279,62 @@ heap_scan_stream_read_next_serial(ReadStream *stream, return scan->rs_prefetch_block; } +/* + * Read stream API callback for bitmap heap scans. + * Returns the next block the caller wants from the read stream or + * InvalidBlockNumber when done. + */ +static BlockNumber +bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, + void *per_buffer_data) +{ + TBMIterateResult *tbmres = per_buffer_data; + BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data; + HeapScanDesc hscan = (HeapScanDesc) bscan; + TableScanDesc sscan = &hscan->rs_base; + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + tbm_iterate(&sscan->st.bitmap.rs_iterator, tbmres); + + /* no more entries in the bitmap */ + if (!BlockNumberIsValid(tbmres->blockno)) + return InvalidBlockNumber; + + /* + * Ignore any claimed entries past what we think is the end of the + * relation. It may have been extended after the start of our scan (we + * only hold an AccessShareLock, and it could be inserts from this + * backend). We don't take this optimization in SERIALIZABLE + * isolation though, as we need to examine all invisible tuples + * reachable by the index. + */ + if (!IsolationIsSerializable() && + tbmres->blockno >= hscan->rs_nblocks) + continue; + + /* + * We can skip fetching the heap page if we don't need any fields from + * the heap, the bitmap entries don't need rechecking, and all tuples + * on the page are visible to our transaction. 
+ */ + if (!(sscan->rs_flags & SO_NEED_TUPLES) && + !tbmres->recheck && + VM_ALL_VISIBLE(sscan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer)) + { + bscan->rs_empty_tuples_pending += tbmres->ntuples; + continue; + } + + return tbmres->blockno; + } + + /* not reachable */ + Assert(false); +} + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- @@ -1067,6 +1123,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_base.rs_flags = flags; scan->rs_base.rs_parallel = parallel_scan; scan->rs_strategy = NULL; /* set in initscan */ + scan->rs_cbuf = InvalidBuffer; /* * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. @@ -1146,6 +1203,16 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan, 0); } + else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) + { + scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT, + scan->rs_strategy, + scan->rs_base.rs_rd, + MAIN_FORKNUM, + bitmapheap_stream_read_next, + scan, + sizeof(TBMIterateResult)); + } return (TableScanDesc) scan; @@ -1180,7 +1247,10 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, * unpin scan buffers */ if (BufferIsValid(scan->rs_cbuf)) + { ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) { diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index c0bec014154..f6387f60e3e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -56,6 +56,11 @@ static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); +static bool BitmapHeapScanNextBlock(TableScanDesc scan, + bool *recheck, + uint64 *lossy_pages, uint64 *exact_pages); + + /* ------------------------------------------------------------------------ * Slot related callbacks for heap AM @@ -2116,199 +2121,44 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, */ static bool -heapam_scan_bitmap_next_block(TableScanDesc scan, - BlockNumber *blockno, bool *recheck, - uint64 *lossy_pages, uint64 *exact_pages) +heapam_scan_bitmap_next_tuple(TableScanDesc scan, + TupleTableSlot *slot, + bool *recheck, + uint64 *lossy_pages, + uint64 *exact_pages) { BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; HeapScanDesc hscan = (HeapScanDesc) bscan; - BlockNumber block; - Buffer buffer; - Snapshot snapshot; - int ntup; - TBMIterateResult *tbmres; - - Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); - - hscan->rs_cindex = 0; - hscan->rs_ntuples = 0; - - *blockno = InvalidBlockNumber; - *recheck = true; - - do - { - CHECK_FOR_INTERRUPTS(); - - tbmres = tbm_iterate(&scan->st.rs_tbmiterator); - - if (tbmres == NULL) - return false; - - /* - * Ignore any claimed entries past what we think is the end of the - * relation. It may have been extended after the start of our scan (we - * only hold an AccessShareLock, and it could be inserts from this - * backend). We don't take this optimization in SERIALIZABLE - * isolation though, as we need to examine all invisible tuples - * reachable by the index. 
- */ - } while (!IsolationIsSerializable() && - tbmres->blockno >= hscan->rs_nblocks); - - /* Got a valid block */ - *blockno = tbmres->blockno; - *recheck = tbmres->recheck; - - /* - * We can skip fetching the heap page if we don't need any fields from the - * heap, the bitmap entries don't need rechecking, and all tuples on the - * page are visible to our transaction. - */ - if (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmres->recheck && - VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer)) - { - /* can't be lossy in the skip_fetch case */ - Assert(tbmres->ntuples >= 0); - Assert(bscan->rs_empty_tuples_pending >= 0); - - bscan->rs_empty_tuples_pending += tbmres->ntuples; - - return true; - } - - block = tbmres->blockno; - - /* - * Acquire pin on the target heap page, trading in any pin we held before. - */ - hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, - scan->rs_rd, - block); - hscan->rs_cblock = block; - buffer = hscan->rs_cbuf; - snapshot = scan->rs_snapshot; - - ntup = 0; - - /* - * Prune and repair fragmentation for the whole page, if possible. - */ - heap_page_prune_opt(scan->rs_rd, buffer); - - /* - * We must hold share lock on the buffer content while examining tuple - * visibility. Afterwards, however, the tuples we have found to be - * visible are guaranteed good as long as we hold the buffer pin. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); + OffsetNumber targoffset; + Page page; + ItemId lp; /* - * We need two separate strategies for lossy and non-lossy cases. + * Out of range? If so, nothing more to look at on this page */ - if (tbmres->ntuples >= 0) - { - /* - * Bitmap is non-lossy, so we just look through the offsets listed in - * tbmres; but we have to follow any HOT chain starting at each such - * offset. - */ - int curslot; - - for (curslot = 0; curslot < tbmres->ntuples; curslot++) - { - OffsetNumber offnum = tbmres->offsets[curslot]; - ItemPointerData tid; - HeapTupleData heapTuple; - - ItemPointerSet(&tid, block, offnum); - if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); - } - } - else + while (hscan->rs_cindex >= hscan->rs_ntuples) { /* - * Bitmap is lossy, so we must examine each line pointer on the page. - * But we can ignore HOT chains, since we'll check each tuple anyway. + * Emit empty tuples before advancing to the next block */ - Page page = BufferGetPage(buffer); - OffsetNumber maxoff = PageGetMaxOffsetNumber(page); - OffsetNumber offnum; - - for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + if (bscan->rs_empty_tuples_pending > 0) { - ItemId lp; - HeapTupleData loctup; - bool valid; - - lp = PageGetItemId(page, offnum); - if (!ItemIdIsNormal(lp)) - continue; - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - loctup.t_len = ItemIdGetLength(lp); - loctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&loctup.t_self, block, offnum); - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - if (valid) - { - hscan->rs_vistuples[ntup++] = offnum; - PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, - HeapTupleHeaderGetXmin(loctup.t_data)); - } - HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, - buffer, snapshot); + /* + * If we don't have to fetch the tuple, just return nulls. 
+ */ + ExecStoreAllNullTuple(slot); + bscan->rs_empty_tuples_pending--; + return true; } - } - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - Assert(ntup <= MaxHeapTuplesPerPage); - hscan->rs_ntuples = ntup; - - if (tbmres->ntuples >= 0) - (*exact_pages)++; - else - (*lossy_pages)++; - - /* - * Return true to indicate that a valid block was found and the bitmap is - * not exhausted. If there are no visible tuples on this page, - * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will - * return false returning control to this function to advance to the next - * block in the bitmap. - */ - return true; -} - -static bool -heapam_scan_bitmap_next_tuple(TableScanDesc scan, - TupleTableSlot *slot) -{ - BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; - HeapScanDesc hscan = (HeapScanDesc) bscan; - OffsetNumber targoffset; - Page page; - ItemId lp; - if (bscan->rs_empty_tuples_pending > 0) - { /* - * If we don't have to fetch the tuple, just return nulls. + * Returns false if the bitmap is exhausted and there are no further + * blocks we need to scan. */ - ExecStoreAllNullTuple(slot); - bscan->rs_empty_tuples_pending--; - return true; + if (!BitmapHeapScanNextBlock(scan, recheck, lossy_pages, exact_pages)) + return false; } - /* - * Out of range? If so, nothing more to look at on this page - */ - if (hscan->rs_cindex >= hscan->rs_ntuples) - return false; - targoffset = hscan->rs_vistuples[hscan->rs_cindex]; page = BufferGetPage(hscan->rs_cbuf); lp = PageGetItemId(page, targoffset); @@ -2615,6 +2465,167 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +/* + * Helper function get the next block of a bitmap heap scan. Returns true when + * it got the next block and saved it in the scan descriptor and false when + * the bitmap and or relation are exhausted. + */ +static bool +BitmapHeapScanNextBlock(TableScanDesc scan, + bool *recheck, + uint64 *lossy_pages, uint64 *exact_pages) +{ + BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; + HeapScanDesc hscan = (HeapScanDesc) bscan; + BlockNumber block; + void *per_buffer_data; + Buffer buffer; + Snapshot snapshot; + int ntup; + TBMIterateResult *tbmres; + + Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); + Assert(hscan->rs_read_stream); + + hscan->rs_cindex = 0; + hscan->rs_ntuples = 0; + + /* Release buffer containing previous block. */ + if (BufferIsValid(hscan->rs_cbuf)) + { + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + } + + hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream, &per_buffer_data); + + if (BufferIsInvalid(hscan->rs_cbuf)) + { + if (BufferIsValid(bscan->rs_vmbuffer)) + { + ReleaseBuffer(bscan->rs_vmbuffer); + bscan->rs_vmbuffer = InvalidBuffer; + } + + /* + * Bitmap is exhausted. Time to emit empty tuples if relevant. We emit + * all empty tuples at the end instead of emitting them per block we + * skip fetching. This is necessary because the streaming read API + * will only return TBMIterateResults for blocks actually fetched. + * When we skip fetching a block, we keep track of how many empty + * tuples to emit at the end of the BitmapHeapScan. We do not recheck + * all NULL tuples. 
+ */ + *recheck = false; + return bscan->rs_empty_tuples_pending > 0; + } + + Assert(per_buffer_data); + + tbmres = per_buffer_data; + + Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno); + + *recheck = tbmres->recheck; + + hscan->rs_cblock = tbmres->blockno; + hscan->rs_ntuples = tbmres->ntuples; + + block = tbmres->blockno; + buffer = hscan->rs_cbuf; + snapshot = scan->rs_snapshot; + + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + /* + * We need two separate strategies for lossy and non-lossy cases. + */ + if (tbmres->ntuples >= 0) + { + /* + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. + */ + int curslot; + + for (curslot = 0; curslot < tbmres->ntuples; curslot++) + { + OffsetNumber offnum = tbmres->offsets[curslot]; + ItemPointerData tid; + HeapTupleData heapTuple; + + ItemPointerSet(&tid, block, offnum); + if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, + &heapTuple, NULL, true)) + hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { + /* + * Bitmap is lossy, so we must examine each line pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. + */ + Page page = BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber offnum; + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp; + HeapTupleData loctup; + bool valid; + + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + loctup.t_len = ItemIdGetLength(lp); + loctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&loctup.t_self, block, offnum); + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + if (valid) + { + hscan->rs_vistuples[ntup++] = offnum; + PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, + HeapTupleHeaderGetXmin(loctup.t_data)); + } + HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + buffer, snapshot); + } + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + hscan->rs_ntuples = ntup; + + if (tbmres->ntuples >= 0) + (*exact_pages)++; + else + (*lossy_pages)++; + + /* + * Return true to indicate that a valid block was found and the bitmap is + * not exhausted. If there are no visible tuples on this page, + * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will + * invoke this function again. + */ + return true; +} /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
@@ -2674,7 +2685,6 @@ static const TableAmRoutine heapam_methods = { .relation_estimate_size = heapam_estimate_rel_size, - .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, .scan_sample_next_tuple = heapam_scan_sample_next_tuple diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 760a36fd2a1..56378007a81 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -92,8 +92,6 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->relation_estimate_size != NULL); /* optional, but one callback implies presence of the other */ - Assert((routine->scan_bitmap_next_block == NULL) == - (routine->scan_bitmap_next_tuple == NULL)); Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index be0d24d901b..68499e3aa45 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -51,10 +51,6 @@ static void BitmapTableScanSetup(BitmapHeapScanState *node); static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); -static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node); -static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); -static inline void BitmapPrefetch(BitmapHeapScanState *node, - TableScanDesc scan); static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); @@ -62,14 +58,6 @@ static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); * Do the underlying index scan, build the bitmap, set up the parallel state * needed for parallel workers to iterate through the bitmap, and set up the * underlying table scan descriptor. - * - * For prefetching, we use *two* iterators, one for the pages we are actually - * scanning and another that runs ahead of the first for prefetching. - * node->prefetch_pages tracks exactly how many pages ahead the prefetch - * iterator is. Also, node->prefetch_target tracks the desired prefetch - * distance, which starts small and increases up to the - * node->prefetch_maximum. This is to avoid doing a lot of prefetching in a - * scan that stops after a few tuples because of a LIMIT. */ static void BitmapTableScanSetup(BitmapHeapScanState *node) @@ -102,14 +90,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(node->tbm); - } -#endif /* USE_PREFETCH */ - /* We have initialized the shared state so wake up others. */ BitmapDoneInitializingSharedState(pstate); } @@ -119,15 +99,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node) pstate->tbmiterator : InvalidDsaPointer); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - node->prefetch_iterator = - tbm_begin_iterate(node->tbm, dsa, - pstate ? - pstate->prefetch_iterator : - InvalidDsaPointer); -#endif /* USE_PREFETCH */ - /* * If this is the first scan of the underlying table, create the table * scan descriptor and begin the scan. 
@@ -149,16 +120,16 @@ BitmapTableScanSetup(BitmapHeapScanState *node) node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, + pstate, 0, NULL, need_tuples); } - node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; + node->ss.ss_currentScanDesc->st.bitmap.rs_iterator = tbmiterator; node->initialized = true; } - /* ---------------------------------------------------------------- * BitmapHeapNext * @@ -168,120 +139,44 @@ BitmapTableScanSetup(BitmapHeapScanState *node) static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node) { - ExprContext *econtext; - TableScanDesc scan; - TupleTableSlot *slot; - -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; -#endif - - /* - * extract necessary information from index scan node - */ - econtext = node->ss.ps.ps_ExprContext; - slot = node->ss.ss_ScanTupleSlot; - scan = node->ss.ss_currentScanDesc; + ExprContext *econtext = node->ss.ps.ps_ExprContext; + TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; /* * If we haven't yet performed the underlying index scan, do it, and begin * the iteration over the bitmap. */ if (!node->initialized) - { BitmapTableScanSetup(node); - scan = node->ss.ss_currentScanDesc; - goto new_page; - } - for (;;) + while (table_scan_bitmap_next_tuple(node->ss.ss_currentScanDesc, + slot, &node->recheck, + &node->stats.lossy_pages, + &node->stats.exact_pages)) { - while (table_scan_bitmap_next_tuple(scan, slot)) - { - /* - * Continuing in previously obtained page. - */ - - CHECK_FOR_INTERRUPTS(); - -#ifdef USE_PREFETCH - - /* - * Try to prefetch at least a few pages even before we get to the - * second page if we don't stop reading after the first tuple. - */ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ - - /* - * We issue prefetch requests *after* fetching the current page to - * try to avoid having prefetching interfere with the main I/O. - * Also, this should happen only when we have determined there is - * still something to do on the current page, else we may - * uselessly prefetch the same page we are just about to request - * for real. - */ - BitmapPrefetch(node, scan); - - /* - * If we are using lossy info, we have to recheck the qual - * conditions at every tuple. - */ - if (node->recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->bitmapqualorig, econtext)) - { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - ExecClearTuple(slot); - continue; - } - } - - /* OK to return this tuple */ - return slot; - } - -new_page: - - BitmapAdjustPrefetchIterator(node); - /* - * Returns false if the bitmap is exhausted and there are no further - * blocks we need to scan. + * Continuing in previously obtained page. */ - if (!table_scan_bitmap_next_block(scan, &node->blockno, - &node->recheck, - &node->stats.lossy_pages, - &node->stats.exact_pages)) - break; + CHECK_FOR_INTERRUPTS(); /* - * If serial, we can error out if the prefetch block doesn't stay - * ahead of the current block. + * If we are using lossy info, we have to recheck the qual conditions + * at every tuple. 
*/ - if (node->pstate == NULL && - !tbm_exhausted(&node->prefetch_iterator) && - node->prefetch_blockno < node->blockno) - elog(ERROR, - "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d", - node->prefetch_blockno, node->blockno); - - /* Adjust the prefetch target */ - BitmapAdjustPrefetchTarget(node); + if (node->recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->bitmapqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); + continue; + } + } + + /* OK to return this tuple */ + return slot; } /* @@ -305,226 +200,6 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) ConditionVariableBroadcast(&pstate->cv); } -/* - * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator - * - * We keep track of how far the prefetch iterator is ahead of the main - * iterator in prefetch_pages. For each block the main iterator returns, we - * decrement prefetch_pages. - */ -static inline void -BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - TBMIterateResult *tbmpre; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (!tbm_exhausted(prefetch_iterator)) - { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? tbmpre->blockno : - InvalidBlockNumber; - } - return; - } - - /* - * XXX: There is a known issue with keeping the prefetch and current block - * iterators in sync for parallel bitmap table scans. This can lead to - * prefetching blocks that have already been read. See the discussion - * here: - * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de - * Note that moving the call site of BitmapAdjustPrefetchIterator() - * exacerbates the effects of this bug. - */ - if (node->prefetch_maximum > 0) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); - - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (!tbm_exhausted(prefetch_iterator)) - { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? tbmpre->blockno : - InvalidBlockNumber; - } - } - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapAdjustPrefetchTarget - Adjust the prefetch target - * - * Increase prefetch target if it's not yet at the max. Note that - * we will increase it to zero after fetching the very first - * page/tuple, then to one after the second tuple is fetched, then - * it doubles as later pages are fetched. 
- */ -static inline void -BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. */ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target - */ -static inline void -BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (!tbm_exhausted(prefetch_iterator)) - { - while (node->prefetch_pages < node->prefetch_target) - { - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - bool skip_fetch; - - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_iterate(prefetch_iterator); - break; - } - node->prefetch_pages++; - node->prefetch_blockno = tbmpre->blockno; - - /* - * If we expect not to have to actually read this heap page, - * skip this prefetch call, but continue to run the prefetch - * logic normally. (Would it be better not to increment - * prefetch_pages?) - */ - skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } - } - - return; - } - - if (pstate->prefetch_pages < pstate->prefetch_target) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (!tbm_exhausted(prefetch_iterator)) - { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; - - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. 
- */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); - - if (!do_prefetch) - return; - - tbmpre = tbm_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_iterate(prefetch_iterator); - break; - } - - node->prefetch_blockno = tbmpre->blockno; - - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } - } - } -#endif /* USE_PREFETCH */ -} - /* * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual */ @@ -574,31 +249,19 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) * End iteration on iterators saved in scan descriptor if they have * not already been cleaned up. */ - if (!tbm_exhausted(&scan->st.rs_tbmiterator)) - tbm_end_iterate(&scan->st.rs_tbmiterator); + if (!tbm_exhausted(&scan->st.bitmap.rs_iterator)) + tbm_end_iterate(&scan->st.bitmap.rs_iterator); /* rescan to release any page pin */ table_rescan(node->ss.ss_currentScanDesc, NULL); } - /* If we did not already clean up the prefetch iterator, do so now. */ - if (!tbm_exhausted(&node->prefetch_iterator)) - tbm_end_iterate(&node->prefetch_iterator); - /* release bitmaps and buffers if any */ if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); node->tbm = NULL; node->initialized = false; - node->pvmbuffer = InvalidBuffer; node->recheck = true; - /* Only used for serial BHS */ - node->blockno = InvalidBlockNumber; - node->prefetch_blockno = InvalidBlockNumber; - node->prefetch_pages = 0; - node->prefetch_target = -1; ExecScanReScan(&node->ss); @@ -658,8 +321,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) * End iteration on iterators saved in scan descriptor if they have * not already been cleaned up. */ - if (!tbm_exhausted(&scanDesc->st.rs_tbmiterator)) - tbm_end_iterate(&scanDesc->st.rs_tbmiterator); + if (!tbm_exhausted(&scanDesc->st.bitmap.rs_iterator)) + tbm_end_iterate(&scanDesc->st.bitmap.rs_iterator); /* * close table scan @@ -667,17 +330,11 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) table_endscan(scanDesc); } - /* If we did not already clean up the prefetch iterator, do so now. 
*/ - if (!tbm_exhausted(&node->prefetch_iterator)) - tbm_end_iterate(&node->prefetch_iterator); - /* * release bitmaps and buffers if any */ if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); } /* ---------------------------------------------------------------- @@ -710,18 +367,13 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; scanstate->tbm = NULL; - scanstate->pvmbuffer = InvalidBuffer; /* Zero the statistics counters */ memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation)); - scanstate->prefetch_pages = 0; - scanstate->prefetch_target = -1; scanstate->initialized = false; scanstate->pstate = NULL; scanstate->recheck = true; - scanstate->blockno = InvalidBlockNumber; - scanstate->prefetch_blockno = InvalidBlockNumber; /* * Miscellaneous initialization @@ -761,13 +413,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->bitmapqualorig = ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); - /* - * Maximum number of prefetches for the tablespace if configured, - * otherwise the current value of the effective_io_concurrency GUC. - */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); - scanstate->ss.ss_currentRelation = currentRelation; /* @@ -871,12 +516,9 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, sinstrument = (SharedBitmapHeapInstrumentation *) ptr; pstate->tbmiterator = 0; - pstate->prefetch_iterator = 0; /* Initialize the mutex */ SpinLockInit(&pstate->mutex); - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; pstate->state = BM_INITIAL; ConditionVariableInit(&pstate->cv); @@ -913,17 +555,11 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, return; pstate->state = BM_INITIAL; - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; if (DsaPointerIsValid(pstate->tbmiterator)) tbm_free_shared_area(dsa, pstate->tbmiterator); - if (DsaPointerIsValid(pstate->prefetch_iterator)) - tbm_free_shared_area(dsa, pstate->prefetch_iterator); - pstate->tbmiterator = InvalidDsaPointer; - pstate->prefetch_iterator = InvalidDsaPointer; } /* ---------------------------------------------------------------- diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 66b3c387d53..b7e4b4d9e1c 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -40,7 +40,6 @@ #include <limits.h> -#include "access/htup_details.h" #include "common/hashfn.h" #include "common/int.h" #include "nodes/bitmapset.h" @@ -48,14 +47,6 @@ #include "storage/lwlock.h" #include "utils/dsa.h" -/* - * The maximum number of tuples per page is not large (typically 256 with - * 8K pages, or 1024 with 32K pages). So there's not much point in making - * the per-page bitmaps variable size. We just legislate that the size - * is this: - */ -#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage - /* * When we have to switch over to lossy storage, we use a data structure * with one bit per page, where all pages having the same number DIV @@ -67,7 +58,7 @@ * table, using identical data structures. (This is because the memory * management for hashtables doesn't easily/efficiently allow space to be * transferred easily from one hashtable to another.) 
Therefore it's best - * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not + * if PAGES_PER_CHUNK is the same as MaxHeapTuplesPerPage, or at least not * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to * avoid expensive integer remainder operations. So, define it like this: */ @@ -79,7 +70,7 @@ #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) /* number of active words for an exact page: */ -#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) +#define WORDS_PER_PAGE ((MaxHeapTuplesPerPage - 1) / BITS_PER_BITMAPWORD + 1) /* number of active words for a lossy chunk: */ #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) @@ -181,7 +172,6 @@ struct TBMPrivateIterator int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; /* @@ -222,7 +212,6 @@ struct TBMSharedIterator PTEntryArray *ptbase; /* pagetable element array */ PTIterationArray *ptpages; /* sorted exact page index list */ PTIterationArray *ptchunks; /* sorted lossy page index list */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; /* Local function prototypes */ @@ -390,7 +379,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bitnum; /* safety check to ensure we don't overrun bit array bounds */ - if (off < 1 || off > MAX_TUPLES_PER_PAGE) + if (off < 1 || off > MaxHeapTuplesPerPage) elog(ERROR, "tuple offset out of range: %u", off); /* @@ -696,9 +685,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm) * Create the TBMPrivateIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. */ - iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator) + - MAX_TUPLES_PER_PAGE * - sizeof(OffsetNumber)); + iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator)); iterator->tbm = tbm; /* @@ -959,20 +946,21 @@ tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) /* * tbm_private_iterate - scan through next page of a TIDBitmap * - * Returns a TBMIterateResult representing one page, or NULL if there are - * no more pages to scan. Pages are guaranteed to be delivered in numerical - * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to - * remember the exact tuples to look at on this page --- the caller must - * examine all tuples on the page and check if they meet the intended - * condition. If result->recheck is true, only the indicated tuples need - * be examined, but the condition must be rechecked anyway. (For ease of - * testing, recheck is always set true when ntuples < 0.) + * Caller must pass in a TBMIterateResult to be filled. + * + * Pages are guaranteed to be delivered in numerical order. tbmres->blockno is + * set to InvalidBlockNumber when there are no more pages to scan. If + * tbmres->ntuples < 0, then the bitmap is "lossy" and failed to remember the + * exact tuples to look at on this page --- the caller must examine all tuples + * on the page and check if they meet the intended condition. If + * tbmres->recheck is true, only the indicated tuples need be examined, but the + * condition must be rechecked anyway. (For ease of testing, recheck is always + * set true when ntuples < 0.) 
*/ -TBMIterateResult * -tbm_private_iterate(TBMPrivateIterator *iterator) +void +tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres) { TIDBitmap *tbm = iterator->tbm; - TBMIterateResult *output = &(iterator->output); Assert(tbm->iterating == TBM_ITERATING_PRIVATE); @@ -1010,11 +998,11 @@ tbm_private_iterate(TBMPrivateIterator *iterator) chunk_blockno < tbm->spages[iterator->spageptr]->blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; iterator->schunkbit++; - return output; + return; } } @@ -1030,16 +1018,16 @@ tbm_private_iterate(TBMPrivateIterator *iterator) page = tbm->spages[iterator->spageptr]; /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); - output->blockno = page->blockno; - output->ntuples = ntuples; - output->recheck = page->recheck; + ntuples = tbm_extract_page_tuple(page, tbmres); + tbmres->blockno = page->blockno; + tbmres->ntuples = ntuples; + tbmres->recheck = page->recheck; iterator->spageptr++; - return output; + return; } /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; } /* @@ -1049,10 +1037,9 @@ tbm_private_iterate(TBMPrivateIterator *iterator) * across multiple processes. We need to acquire the iterator LWLock, * before accessing the shared members. */ -TBMIterateResult * -tbm_shared_iterate(TBMSharedIterator *iterator) +void +tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres) { - TBMIterateResult *output = &iterator->output; TBMSharedIteratorState *istate = iterator->state; PagetableEntry *ptbase = NULL; int *idxpages = NULL; @@ -1103,13 +1090,13 @@ tbm_shared_iterate(TBMSharedIterator *iterator) chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; istate->schunkbit++; LWLockRelease(&istate->lock); - return output; + return; } } @@ -1119,21 +1106,21 @@ tbm_shared_iterate(TBMSharedIterator *iterator) int ntuples; /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); - output->blockno = page->blockno; - output->ntuples = ntuples; - output->recheck = page->recheck; + ntuples = tbm_extract_page_tuple(page, tbmres); + tbmres->blockno = page->blockno; + tbmres->ntuples = ntuples; + tbmres->recheck = page->recheck; istate->spageptr++; LWLockRelease(&istate->lock); - return output; + return; } LWLockRelease(&istate->lock); /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; } /* @@ -1468,8 +1455,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp) * Create the TBMSharedIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. */ - iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) + - MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator)); istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp); @@ -1610,13 +1596,14 @@ tbm_end_iterate(TBMIterator *iterator) /* * Get the next TBMIterateResult from the shared or private bitmap iterator. 
*/ -TBMIterateResult * -tbm_iterate(TBMIterator *iterator) +void +tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres) { Assert(iterator); + Assert(tbmres); if (iterator->shared) - return tbm_shared_iterate(iterator->i.shared_iterator); + return tbm_shared_iterate(iterator->i.shared_iterator, tbmres); else - return tbm_private_iterate(iterator->i.private_iterator); + return tbm_private_iterate(iterator->i.private_iterator, tbmres); } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 71abb01f655..0489ad36644 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -325,7 +325,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && - relation->rd_tableam->scan_bitmap_next_block != NULL; + relation->rd_tableam->scan_bitmap_next_tuple != NULL; info->amcanmarkpos = (amroutine->ammarkpos != NULL && amroutine->amrestrpos != NULL); info->amcostestimate = amroutine->amcostestimate; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index e4414b2e915..4dd58e96bf4 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -241,9 +241,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) * If advice hasn't been suppressed, this system supports it, and this * isn't a strictly sequential pattern, then we'll issue advice. */ - if (!suppress_advice && - stream->advice_enabled && - stream->pending_read_blocknum != stream->seq_blocknum) + if (!suppress_advice && stream->advice_enabled) flags = READ_BUFFERS_ISSUE_ADVICE; else flags = 0; diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index dcd1ae3fc34..478e2a8a377 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -353,7 +353,12 @@ typedef struct GinScanEntryData /* for a partial-match or full-scan query, we accumulate all TIDs here */ TIDBitmap *matchBitmap; TBMPrivateIterator *matchIterator; - TBMIterateResult *matchResult; + + /* + * If blockno is InvalidBlockNumber, all of the other fields in the + * matchResult are meaningless. + */ + TBMIterateResult matchResult; /* used for Posting list and one page in Posting tree */ ItemPointerData *list; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index dc6e0184284..934b667ad78 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -43,8 +43,12 @@ typedef struct TableScanDescData */ union { - /* Iterator for Bitmap Table Scans */ - TBMIterator rs_tbmiterator; + /* State for Bitmap Table Scans */ + struct + { + struct ParallelBitmapHeapState *rs_pstate; + TBMIterator rs_iterator; + } bitmap; /* * Range of ItemPointers for table_scan_getnextslot_tidrange() to diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 131c050c15f..4206a6d0ab1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -780,59 +780,23 @@ typedef struct TableAmRoutine */ /* - * Prepare to fetch / check / return tuples from `blockno` as part of a - * bitmap table scan. `scan` was started via table_beginscan_bm(). Return - * false if the bitmap is exhausted and true otherwise. 
- * - * This will typically read and pin the target block, and do the necessary - * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might - * make sense to perform tuple visibility checks at this time). - * - * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be - * incremented by the table AM to indicate whether or not the block's - * representation in the bitmap is lossy. - * - * `recheck` is set by the table AM to indicate whether or not the tuples - * from this block should be rechecked. Tuples from lossy pages will - * always need to be rechecked, but some non-lossy pages' tuples may also - * require recheck. - * - * `blockno` is the current block and is set by the table AM. The table AM - * is responsible for advancing the main iterator, but the bitmap table - * scan code still advances the prefetch iterator. `blockno` is used by - * bitmap table scan code to validate that the prefetch block stays ahead - * of the current block. + * Fetch the next tuple of a bitmap table scan into `slot` and return true + * if a visible tuple was found, false otherwise. * - * XXX: Currently this may only be implemented if the AM uses md.c as its - * storage manager, and uses ItemPointer->ip_blkid in a manner that maps - * blockids directly to the underlying storage. nodeBitmapHeapscan.c - * performs prefetching directly using that interface. This probably - * needs to be rectified at a later point. + * `lossy_pages` is incremented if the bitmap is lossy for the selected + * page; otherwise, `exact_pages` is incremented. This is tracked for + * display in EXPLAIN ANALYZE output. * - * XXX: Currently this may only be implemented if the AM uses the - * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to - * perform prefetching. This probably needs to be rectified at a later - * point. + * Prefetching additional data from the bitmap is left to the table AM. * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. + * This is an optional callback. */ - bool (*scan_bitmap_next_block) (TableScanDesc scan, - BlockNumber *blockno, + bool (*scan_bitmap_next_tuple) (TableScanDesc scan, + TupleTableSlot *slot, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); - /* - * Fetch the next tuple of a bitmap table scan into `slot` and return true - * if a visible tuple was found, false otherwise. - * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. - */ - bool (*scan_bitmap_next_tuple) (TableScanDesc scan, - TupleTableSlot *slot); - /* * Prepare to fetch tuples from the next block in a sample scan. Return * false if the sample scan is finished, true otherwise. `scan` was @@ -956,15 +920,19 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, + struct ParallelBitmapHeapState *pstate, int nkeys, struct ScanKeyData *key, bool need_tuple) { + TableScanDesc result; uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; if (need_tuple) flags |= SO_NEED_TUPLES; - return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, - NULL, flags); + result = rel->rd_tableam->scan_begin(rel, snapshot, nkeys, + key, NULL, flags); + result->st.bitmap.rs_pstate = pstate; + return result; } /* @@ -1955,56 +1923,24 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, */ /* - * Prepare to fetch / check / return tuples as part of a bitmap table scan. 
- * `scan` needs to have been started via table_beginscan_bm(). Returns false - * if there are no more blocks in the bitmap, true otherwise. - * - * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be - * incremented by the table AM to indicate whether or not the block's - * representation in the bitmap is lossy. + * Fetch / check / return tuples as part of a bitmap table scan. `scan` needs + * to have been started via table_beginscan_bm(). Fetch the next tuple of a + * bitmap table scan into `slot` and return true if a visible tuple was found, + * false otherwise. * - * `recheck` is set by the table AM to indicate whether or not the tuples - * from this block should be rechecked. + * `recheck` is set by the table AM to indicate whether or not the tuple in + * `slot` should be rechecked. Tuples from lossy pages will always need to be + * rechecked, but some non-lossy pages' tuples may also require recheck. * - * `blockno` is the current block and is set by the table AM and is used by - * bitmap table scan code to validate that the prefetch block stays ahead of - * the current block. - * - * Note, this is an optionally implemented function, therefore should only be - * used after verifying the presence (at plan time or such). + * `lossy_pages` is incremented if the block's representation in the bitmap is + * lossy; otherwise, `exact_pages` is incremented. */ static inline bool -table_scan_bitmap_next_block(TableScanDesc scan, - BlockNumber *blockno, +table_scan_bitmap_next_tuple(TableScanDesc scan, + TupleTableSlot *slot, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages) -{ - /* - * We don't expect direct calls to table_scan_bitmap_next_block with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); - - return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, - blockno, recheck, - lossy_pages, - exact_pages); -} - -/* - * Fetch the next tuple of a bitmap table scan into `slot` and return true if - * a visible tuple was found, false otherwise. - * table_scan_bitmap_next_block() needs to previously have selected a - * block (i.e. returned true), and no previous - * table_scan_bitmap_next_tuple() for the same block may have - * returned false. 
- */ -static inline bool -table_scan_bitmap_next_tuple(TableScanDesc scan, - TupleTableSlot *slot) { /* * We don't expect direct calls to table_scan_bitmap_next_tuple with valid @@ -2015,7 +1951,10 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, - slot); + slot, + recheck, + lossy_pages, + exact_pages); } /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index e2d1dc1e067..c08fa33ad83 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1822,11 +1822,7 @@ typedef enum /* ---------------- * ParallelBitmapHeapState information * tbmiterator iterator for scanning current pages - * prefetch_iterator iterator for prefetching ahead of current page - * mutex mutual exclusion for the prefetching variable - * and state - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance + * mutex mutual exclusion for state * state current state of the TIDBitmap * cv conditional wait variable * ---------------- @@ -1834,10 +1830,7 @@ typedef enum typedef struct ParallelBitmapHeapState { dsa_pointer tbmiterator; - dsa_pointer prefetch_iterator; slock_t mutex; - int prefetch_pages; - int prefetch_target; SharedBitmapState state; ConditionVariable cv; } ParallelBitmapHeapState; @@ -1861,18 +1854,11 @@ typedef struct SharedBitmapHeapInstrumentation * * bitmapqualorig execution state for bitmapqualorig expressions * tbm bitmap obtained from child index scan(s) - * pvmbuffer buffer for visibility-map lookups of prefetched pages * stats execution statistics - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance - * prefetch_maximum maximum value for prefetch_target * initialized is node is ready to iterate * pstate shared state for parallel bitmap scan * sinstrument statistics for parallel workers * recheck do current page's tuples need recheck - * blockno used to validate pf and current block stay in sync - * prefetch_blockno used to validate pf stays ahead of current block * ---------------- */ typedef struct BitmapHeapScanState @@ -1880,18 +1866,11 @@ typedef struct BitmapHeapScanState ScanState ss; /* its first field is NodeTag */ ExprState *bitmapqualorig; TIDBitmap *tbm; - Buffer pvmbuffer; BitmapHeapScanInstrumentation stats; - TBMIterator prefetch_iterator; - int prefetch_pages; - int prefetch_target; - int prefetch_maximum; bool initialized; ParallelBitmapHeapState *pstate; SharedBitmapHeapInstrumentation *sinstrument; bool recheck; - BlockNumber blockno; - BlockNumber prefetch_blockno; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index a6ffeac90be..bc4adca3809 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -22,6 +22,7 @@ #ifndef TIDBITMAP_H #define TIDBITMAP_H +#include "access/htup_details.h" #include "storage/itemptr.h" #include "utils/dsa.h" @@ -55,9 +56,16 @@ typedef struct TBMIterateResult { BlockNumber blockno; /* page number containing tuples */ int ntuples; /* -1 indicates lossy result */ - bool recheck; /* should the tuples be rechecked? */ /* Note: recheck is always true if ntuples < 0 */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; + bool recheck; /* should the tuples be rechecked? 
*/ + + /* + * The maximum number of tuples per page is not large (typically 256 with + * 8K pages, or 1024 with 32K pages). So there's not much point in making + * the per-page bitmaps variable size. We just legislate that the size is + * this: + */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; } TBMIterateResult; /* function prototypes in nodes/tidbitmap.c */ @@ -78,8 +86,8 @@ extern bool tbm_is_empty(const TIDBitmap *tbm); extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm); extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm); -extern TBMIterateResult *tbm_private_iterate(TBMPrivateIterator *iterator); -extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); +extern void tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres); +extern void tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres); extern void tbm_end_private_iterate(TBMPrivateIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, @@ -90,7 +98,7 @@ extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp); extern void tbm_end_iterate(TBMIterator *iterator); -extern TBMIterateResult *tbm_iterate(TBMIterator *iterator); +extern void tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres); static inline bool tbm_exhausted(TBMIterator *iterator) -- 2.34.1
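
To apply the squashed patch for such a re-run, the usual git workflow should
be enough; the file name below is hypothetical (use whatever the attachment
was saved as), and the build/test step assumes an autoconf tree:

    git am v1-0001-Streaming-BHS.patch    # or: patch -p1 < v1-0001-Streaming-BHS.patch
    make -j && make check                 # or the meson equivalent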