On Wed, Jul 15, 2015 at 3:07 AM, Sawada Masahiko <sawada.m...@gmail.com> wrote:
> On Wed, Jul 15, 2015 at 12:55 AM, Simon Riggs <si...@2ndquadrant.com> wrote:
>> On 10 July 2015 at 15:11, Sawada Masahiko <sawada.m...@gmail.com> wrote:
>>>
>>> Oops, I had forgotten to add the new file heapfuncs.c.
>>> Latest patch is attached.
>>
>> I think we've established the approach is desirable and defined the way
>> forwards for this, so this is looking good.
>
> If we want to move stuff like pg_stattuple, pg_freespacemap into core,
> we could move them into heapfuncs.c.
>
>> Some of my requests haven't been actioned yet, so I personally would not
>> commit this yet. I am happy to continue as reviewer/committer unless others
>> wish to take over.
>> The main missing item is pg_upgrade support, which won't happen by end of
>> CF1, so I am marking this as Returned With Feedback. Hopefully we can review
>> this again before CF2.
>
> I appreciate your reviewing.
> Yeah, the pg_upgrade support and the regression test for the VFM patch are
> almost done now; I will submit the patch this week after testing it.
Attached is the latest patch, v9. It adds:
- a regression test for the visibility map (visibilitymap.sql and visibilitymap.out files)
- pg_upgrade support (rewriting the vm file to a vfm file)
- a regression test for pg_upgrade

Please review it.

Regards,

--
Masahiko Sawada
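To make the pg_upgrade rewrite easier to review, here is a minimal standalone sketch (illustrative only, not part of the patch; the helper name vm_byte_to_vfm is made up) of the rule that rewrite_vm_to_vfm_table encodes: each old one-bit vm entry becomes a two-bit vfm entry whose all-visible bit is the old bit, while the new all-frozen bit starts out clear, since the old format carried no freeze information.

#include <stdio.h>
#include <stdint.h>

/*
 * Expand one old-format vm byte (8 heap pages, 1 bit each) into the
 * 16 bits of the new format (8 heap pages, 2 bits each).  Old bit i
 * becomes new bit 2*i (the all-visible bit); every all-frozen bit
 * (2*i + 1) is left clear, because the old format had no freeze info.
 */
static uint16_t
vm_byte_to_vfm(uint8_t vm)
{
	uint16_t	vfm = 0;
	int			i;

	for (i = 0; i < 8; i++)
		if (vm & (1 << i))
			vfm |= (uint16_t) 1 << (2 * i);
	return vfm;
}

int
main(void)
{
	/* spot-checks against rewrite_vm_to_vfm_table in the patch */
	printf("%u %u %u\n",
		   vm_byte_to_vfm(0x01),	/* expect 1 */
		   vm_byte_to_vfm(0x02),	/* expect 4 */
		   vm_byte_to_vfm(0xFF));	/* expect 21845 */
	return 0;
}

The patch uses the precomputed 256-entry lookup table instead of recomputing this bit expansion for every byte, which is a sensible trade-off inside a file-copy loop (note that the uint16 write in rewrite_vm_to_vfm emits each entry in host byte order).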
diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index 22c5f7a..b1b6a06 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -87,7 +87,7 @@ statapprox_heap(Relation rel, output_type *stat) * If the page has only visible tuples, then we can find out the free * space from the FSM and move on. */ - if (visibilitymap_test(rel, blkno, &vmbuffer)) + if (visibilitymap_test(rel, blkno, &vmbuffer, VISIBILITYMAP_ALL_VISIBLE)) { freespace = GetRecordedFreeSpace(rel, blkno); stat->tuple_len += BLCKSZ - freespace; diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index b83d496..806ce27 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -12,6 +12,7 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o visibilitymap.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o visibilitymap.o \ + heapfuncs.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 86a2e6b..796b76f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2131,8 +2131,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); /* - * Find buffer to insert this tuple into. If the page is all visible, - * this will also pin the requisite visibility map page. + * Find buffer to insert this tuple into. If the page is all visible + * or all frozen, this will also pin the requisite visibility map + * page. */ buffer = RelationGetBufferForTuple(relation, heaptup->t_len, InvalidBuffer, options, bistate, @@ -2147,7 +2148,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, if (PageIsAllVisible(BufferGetPage(buffer))) { all_visible_cleared = true; + + /* all-frozen information is also cleared at the same time */ PageClearAllVisible(BufferGetPage(buffer)); + PageClearAllFrozen(BufferGetPage(buffer)); + visibilitymap_clear(relation, ItemPointerGetBlockNumber(&(heaptup->t_self)), vmbuffer); @@ -2448,7 +2453,11 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, if (PageIsAllVisible(page)) { all_visible_cleared = true; + + /* all-frozen information is also cleared at the same time */ PageClearAllVisible(page); + PageClearAllFrozen(page); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), vmbuffer); @@ -2731,9 +2740,9 @@ heap_delete(Relation relation, ItemPointer tid, /* * If we didn't pin the visibility map page and the page has become all - * visible while we were busy locking the buffer, we'll have to unlock and - * re-lock, to avoid holding the buffer lock across an I/O. That's a bit - * unfortunate, but hopefully shouldn't happen often. + * visible or all frozen while we were busy locking the buffer, we'll + * have to unlock and re-lock, to avoid holding the buffer lock across an + * I/O. That's a bit unfortunate, but hopefully shouldn't happen often.
*/ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) { @@ -2925,10 +2934,15 @@ l1: */ PageSetPrunable(page, xid); + /* clear PD_ALL_VISIBLE and PD_ALL_FROZEN flags */ if (PageIsAllVisible(page)) { all_visible_cleared = true; + + /* all-frozen information is also cleared at the same time */ PageClearAllVisible(page); + PageClearAllFrozen(page); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), vmbuffer); } @@ -3207,7 +3221,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * in the middle of changing this, so we'll need to recheck after we have * the lock. */ - if (PageIsAllVisible(page)) + if (PageIsAllVisible(page) || PageIsAllFrozen(page)) visibilitymap_pin(relation, block, &vmbuffer); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -3801,14 +3815,22 @@ l2: if (PageIsAllVisible(BufferGetPage(buffer))) { all_visible_cleared = true; + + /* all-frozen information is also cleared at the same time */ PageClearAllVisible(BufferGetPage(buffer)); + PageClearAllFrozen(BufferGetPage(buffer)); + visibilitymap_clear(relation, BufferGetBlockNumber(buffer), vmbuffer); } if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) { all_visible_cleared_new = true; + + /* all-frozen information is also cleared at the same time */ PageClearAllVisible(BufferGetPage(newbuf)); + PageClearAllFrozen(BufferGetPage(newbuf)); + visibilitymap_clear(relation, BufferGetBlockNumber(newbuf), vmbuffer_new); } @@ -6893,7 +6915,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, */ XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, - TransactionId cutoff_xid) + TransactionId cutoff_xid, uint8 vmflags) { xl_heap_visible xlrec; XLogRecPtr recptr; @@ -6903,6 +6925,7 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, Assert(BufferIsValid(vm_buffer)); xlrec.cutoff_xid = cutoff_xid; + xlrec.flags = vmflags; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapVisible); @@ -7492,8 +7515,14 @@ heap_xlog_visible(XLogReaderState *record) * the subsequent update won't be replayed to clear the flag.
*/ page = BufferGetPage(buffer); - PageSetAllVisible(page); + + if (xlrec->flags & VISIBILITYMAP_ALL_VISIBLE) + PageSetAllVisible(page); + if (xlrec->flags & VISIBILITYMAP_ALL_FROZEN) + PageSetAllFrozen(page); + MarkBufferDirty(buffer); + } else if (action == BLK_RESTORED) { @@ -7544,7 +7573,7 @@ heap_xlog_visible(XLogReaderState *record) */ if (lsn > PageGetLSN(vmpage)) visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, - xlrec->cutoff_xid); + xlrec->cutoff_xid, xlrec->flags); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); @@ -7694,7 +7723,10 @@ heap_xlog_delete(XLogReaderState *record) PageSetPrunable(page, XLogRecGetXid(record)); if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { PageClearAllVisible(page); + PageClearAllFrozen(page); + } /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = target_tid; @@ -7798,7 +7830,10 @@ heap_xlog_insert(XLogReaderState *record) PageSetLSN(page, lsn); if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { PageClearAllVisible(page); + PageClearAllFrozen(page); + } MarkBufferDirty(buffer); } @@ -7937,7 +7972,10 @@ heap_xlog_multi_insert(XLogReaderState *record) PageSetLSN(page, lsn); if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { PageClearAllVisible(page); + PageClearAllFrozen(page); + } MarkBufferDirty(buffer); } @@ -8065,7 +8103,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) PageSetPrunable(page, XLogRecGetXid(record)); if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { PageClearAllVisible(page); + PageClearAllFrozen(page); + } PageSetLSN(page, lsn); MarkBufferDirty(obuffer); @@ -8200,7 +8241,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) elog(PANIC, "heap_update_redo: failed to add tuple"); if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { PageClearAllVisible(page); + PageClearAllFrozen(page); + } freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ diff --git a/src/backend/access/heap/heapfuncs.c b/src/backend/access/heap/heapfuncs.c new file mode 100644 index 0000000..6c3753b --- /dev/null +++ b/src/backend/access/heap/heapfuncs.c @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * heapfuncs.c + * Functions for accessing heap page information + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/heapfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/visibilitymap.h" +#include "funcapi.h" +#include "storage/freespace.h" +#include "storage/bufmgr.h" + +/* Functions for visibilitymap */ +extern Datum pg_is_all_visible(PG_FUNCTION_ARGS); +extern Datum pg_is_all_frozen(PG_FUNCTION_ARGS); + +static bool visibilitymap_test_internal(Oid relid, int64 blkno, uint8 flag); + +/* + * Return whether the page is all-visible, according to the visibility map. + */ +Datum +pg_is_all_visible(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 blkno = PG_GETARG_INT64(1); + bool all_visible; + + all_visible = visibilitymap_test_internal(relid, blkno, VISIBILITYMAP_ALL_VISIBLE); + + PG_RETURN_BOOL(all_visible); +} + +/* + * Return whether the page is all-frozen, according to the visibility map.
+ */ +Datum +pg_is_all_frozen(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 blkno = PG_GETARG_INT64(1); + bool all_frozen; + + all_frozen = visibilitymap_test_internal(relid, blkno, VISIBILITYMAP_ALL_FROZEN); + + PG_RETURN_BOOL(all_frozen); +} + +static bool +visibilitymap_test_internal(Oid relid, int64 blkno, uint8 flag) +{ + Relation rel; + Buffer vmbuffer = InvalidBuffer; + bool result; + + rel = relation_open(relid, AccessShareLock); + + if (blkno < 0 || blkno > MaxBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid block number"))); + + result = visibilitymap_test(rel, blkno, &vmbuffer, flag); + + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + relation_close(rel, AccessShareLock); + + return result; +} diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 7c38772..a284b85 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -21,33 +21,45 @@ * * NOTES * - * The visibility map is a bitmap with one bit per heap page. A set bit means - * that all tuples on the page are known visible to all transactions, and - * therefore the page doesn't need to be vacuumed. The map is conservative in - * the sense that we make sure that whenever a bit is set, we know the - * condition is true, but if a bit is not set, it might or might not be true. + * The visibility map is a bitmap with two bits (all-visible and all-frozen) + * per heap page. A set all-visible bit means that all tuples on the page are + * known visible to all transactions, and therefore the page doesn't need to + * be vacuumed. A set all-frozen bit means that all tuples on the page are + * completely frozen, and therefore the page doesn't need to be vacuumed even + * when a whole-table-scanning vacuum (e.g. an anti-wraparound vacuum) is + * required. An all-frozen bit may be set only when the page is already + * all-visible; that is, the all-frozen bit is only ever set together with + * the all-visible bit. + * + * The map is conservative in the sense that we make sure that whenever a bit + * is set, we know the condition is true, but if a bit is not set, it might or + * might not be true. * * Clearing a visibility map bit is not separately WAL-logged. The callers * must make sure that whenever a bit is cleared, the bit is cleared on WAL - * replay of the updating operation as well. + * replay of the updating operation as well. The all-frozen bit must be + * cleared whenever the all-visible bit is cleared. * * When we *set* a visibility map during VACUUM, we must write WAL. This may * seem counterintuitive, since the bit is basically a hint: if it is clear, - * it may still be the case that every tuple on the page is visible to all - * transactions; we just don't know that for certain. The difficulty is that - * there are two bits which are typically set together: the PD_ALL_VISIBLE bit - * on the page itself, and the visibility map bit. If a crash occurs after the - * visibility map page makes it to disk and before the updated heap page makes - * it to disk, redo must set the bit on the heap page. Otherwise, the next - * insert, update, or delete on the heap page will fail to realize that the - * visibility map bit must be cleared, possibly causing index-only scans to - * return wrong answers. + * it may still be the case that every tuple on the page is visible to all + * transactions, or frozen; we just don't know that for certain.
The difficulty is + * that there are two bits which are typically set together: the PD_ALL_VISIBLE + * or PD_ALL_FROZEN bit on the page itself, and the visibility map bit. If a + * crash occurs after the visibility map page makes it to disk and before the + * updated heap page makes it to disk, redo must set the bit on the heap page. + * Otherwise, the next insert, update, or delete on the heap page will fail to + * realize that the visibility map bit must be cleared, possibly causing index-only + * scans to return wrong answers. * * VACUUM will normally skip pages for which the visibility map bit is set; * such pages can't contain any dead tuples and therefore don't need vacuuming. - * The visibility map is not used for anti-wraparound vacuums, because + * Before 9.6, the visibility map was not used for anti-wraparound vacuums, because * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid * present in the table, even on pages that don't have any dead tuples. + * In 9.6 or later, the visibility map has an additional bit which indicates + * that all tuples on a page have been completely frozen, so the visibility + * map is also used for anti-wraparound vacuums. + * * * LOCKING * @@ -58,14 +70,14 @@ * section that logs the page modification. However, we don't want to hold * the buffer lock over any I/O that may be required to read in the visibility * map page. To avoid this, we examine the heap page before locking it; - * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map - * bit. Then, we lock the buffer. But this creates a race condition: there - * is a possibility that in the time it takes to lock the buffer, the - * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the - * buffer, pin the visibility map page, and relock the buffer. This shouldn't - * happen often, because only VACUUM currently sets visibility map bits, - * and the race will only occur if VACUUM processes a given page at almost - * exactly the same time that someone tries to further modify it. + * if the page-level PD_ALL_VISIBLE or PD_ALL_FROZEN bit is set, we pin the + * visibility map bit. Then, we lock the buffer. But this creates a race + * condition: there is a possibility that in the time it takes to lock the + * buffer, the PD_ALL_VISIBLE or PD_ALL_FROZEN bit gets set. If that happens, + * we have to unlock the buffer, pin the visibility map page, and relock the + * buffer. This shouldn't happen often, because only VACUUM currently sets + * visibility map bits, and the race will only occur if VACUUM processes a given + * page at almost exactly the same time that someone tries to further modify it. * * To set a bit, you need to hold a lock on the heap page. That prevents * the race condition where VACUUM sees that all tuples on the page are @@ -101,11 +113,14 @@ */ #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) -/* Number of bits allocated for each heap block. */ -#define BITS_PER_HEAPBLOCK 1 +/* + * Number of bits allocated for each heap block. + * One for all-visible, the other for all-frozen. + */ +#define BITS_PER_HEAPBLOCK 2 /* Number of heap blocks we can represent in one byte. */ -#define HEAPBLOCKS_PER_BYTE 8 +#define HEAPBLOCKS_PER_BYTE 4 /* Number of heap blocks we can represent in one visibility map page.
*/ #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) @@ -115,24 +130,42 @@ #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) #define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE) -/* table for fast counting of set bits */ -static const uint8 number_of_ones[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 +/* tables for fast counting of set bits for visible and freeze */ +static const uint8 number_of_ones_for_visible[256] = { + 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4, + 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4, + 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3, + 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4 +}; +static const uint8 number_of_ones_for_frozen[256] = { + 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, + 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, + 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4, + 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, + 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4, + 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4 }; /* prototypes for internal routines */ @@ -141,7 +174,7 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks); /* - * visibilitymap_clear - clear a bit in visibility map + * visibilitymap_clear - clear all bits in visibility map * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. 
This function doesn't do @@ -153,7 +186,8 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf) BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); int mapBit = HEAPBLK_TO_MAPBIT(heapBlk); - uint8 mask = 1 << mapBit; + uint8 mask = (VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN) << + (BITS_PER_HEAPBLOCK * mapBit); char *map; #ifdef TRACE_VISIBILITYMAP @@ -225,7 +259,7 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf) } /* - * visibilitymap_set - set a bit on a previously pinned page + * visibilitymap_set - set bit(s) on a previously pinned page * * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the @@ -234,10 +268,11 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf) * marked all-visible; it is needed for Hot Standby, and can be * InvalidTransactionId if the page contains no tuples. * - * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling - * this function. Except in recovery, caller should also pass the heap - * buffer. When checksums are enabled and we're not in recovery, we must add - * the heap buffer to the WAL chain to protect it from being torn. + * Caller is expected to set the heap page's PD_ALL_VISIBLE or PD_ALL_FROZEN + * bit before calling this function. Except in recovery, caller should also + * pass the heap buffer and the flags indicating which bit(s) to set. + * When checksums are enabled and we're not in recovery, we must add the heap + * buffer to the WAL chain to protect it from being torn. * * You must pass a buffer containing the correct map page to this function. * Call visibilitymap_pin first to pin the right one. This function doesn't do @@ -245,7 +280,8 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf) */ void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, - XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid) + XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, + uint8 flags) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); @@ -254,7 +290,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, char *map; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_set %s %d %u", RelationGetRelationName(rel), heapBlk, flags); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); @@ -272,11 +308,11 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, map = PageGetContents(page); LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); - if (!(map[mapByte] & (1 << mapBit))) + if ((map[mapByte] & (flags << (BITS_PER_HEAPBLOCK * mapBit))) != (flags << (BITS_PER_HEAPBLOCK * mapBit))) { START_CRIT_SECTION(); - map[mapByte] |= (1 << mapBit); + map[mapByte] |= (flags << (BITS_PER_HEAPBLOCK * mapBit)); MarkBufferDirty(vmBuf); if (RelationNeedsWAL(rel)) @@ -285,7 +321,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, { Assert(!InRecovery); recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf, - cutoff_xid); + cutoff_xid, flags); /* * If data checksums are enabled (or wal_log_hints=on), we @@ -295,11 +331,15 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, { Page heapPage = BufferGetPage(heapBuf); - /* caller is expected to set PD_ALL_VISIBLE first */ - Assert(PageIsAllVisible(heapPage)); + /* + * caller is expected to set PD_ALL_VISIBLE or + * PD_ALL_FROZEN first.
+ */ + Assert(PageIsAllVisible(heapPage) || PageIsAllFrozen(heapPage)); PageSetLSN(heapPage, recptr); } } + PageSetLSN(page, recptr); } @@ -310,15 +350,16 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, } /* - * visibilitymap_test - test if a bit is set + * visibilitymap_test - test if bit(s) are set * - * Are all tuples on heapBlk visible to all, according to the visibility map? + * Are all tuples on heapBlk visible to all, or all frozen, according to the + * visibility map? * * On entry, *buf should be InvalidBuffer or a valid buffer returned by an * earlier call to visibilitymap_pin or visibilitymap_test on the same * relation. On return, *buf is a valid buffer with the map page containing * the bit for heapBlk, or InvalidBuffer. The caller is responsible for - * releasing *buf after it's done testing and setting bits. + * releasing *buf after it's done testing and setting bits, and must pass the + * flags indicating which bit(s) to test. * * NOTE: This function is typically called without a lock on the heap page, * so somebody else could change the bit just after we look at it. In fact, @@ -328,7 +369,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * all concurrency issues! */ bool -visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) +visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf, uint8 flags) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); @@ -337,7 +378,7 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) char *map; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_test %s %d %u", RelationGetRelationName(rel), heapBlk, flags); #endif /* Reuse the old pinned buffer if possible */ @@ -360,11 +401,12 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) map = PageGetContents(BufferGetPage(*buf)); /* - * A single-bit read is atomic. There could be memory-ordering effects + * A single- or double-bit read is atomic. There could be memory-ordering effects * here, but for performance reasons we make it the caller's job to worry * about that. */ - result = (map[mapByte] & (1 << mapBit)) ? true : false; + result = (map[mapByte] & (flags << (BITS_PER_HEAPBLOCK * mapBit))) ? + true : false; return result; } @@ -374,10 +416,11 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) * * Note: we ignore the possibility of race conditions when the table is being * extended concurrently with the call. New pages added to the table aren't - * going to be marked all-visible, so they won't affect the result. + * going to be marked all-visible or all-frozen, so they won't affect the result. + * The caller must pass the flags indicating which bit(s) to count.
*/ BlockNumber -visibilitymap_count(Relation rel) +visibilitymap_count(Relation rel, uint8 flags) { BlockNumber result = 0; BlockNumber mapBlock; @@ -406,7 +449,10 @@ visibilitymap_count(Relation rel) for (i = 0; i < MAPSIZE; i++) { - result += number_of_ones[map[i]]; + if (flags & VISIBILITYMAP_ALL_VISIBLE) + result += number_of_ones_for_visible[map[i]]; + if (flags & VISIBILITYMAP_ALL_FROZEN) + result += number_of_ones_for_frozen[map[i]]; } ReleaseBuffer(mapBuffer); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 4246554..015bfb8 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1919,11 +1919,18 @@ index_update_stats(Relation rel, { BlockNumber relpages = RelationGetNumberOfBlocks(rel); BlockNumber relallvisible; + BlockNumber relallfrozen; if (rd_rel->relkind != RELKIND_INDEX) - relallvisible = visibilitymap_count(rel); + { + relallvisible = visibilitymap_count(rel, VISIBILITYMAP_ALL_VISIBLE); + relallfrozen = visibilitymap_count(rel, VISIBILITYMAP_ALL_FROZEN); + } else /* don't bother for indexes */ + { relallvisible = 0; + relallfrozen = 0; + } if (rd_rel->relpages != (int32) relpages) { @@ -1940,6 +1947,11 @@ index_update_stats(Relation rel, rd_rel->relallvisible = (int32) relallvisible; dirty = true; } + if (rd_rel->relallfrozen != (int32) relallfrozen) + { + rd_rel->relallfrozen = (int32) relallfrozen; + dirty = true; + } } /* diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 861048f..392c2a4 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -572,7 +572,8 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, vac_update_relstats(onerel, relpages, totalrows, - visibilitymap_count(onerel), + visibilitymap_count(onerel, VISIBILITYMAP_ALL_VISIBLE), + visibilitymap_count(onerel, VISIBILITYMAP_ALL_FROZEN), hasindex, InvalidTransactionId, InvalidMultiXactId, @@ -595,6 +596,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, RelationGetNumberOfBlocks(Irel[ind]), totalindexrows, 0, + 0, false, InvalidTransactionId, InvalidMultiXactId, diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 7ab4874..d3725dd 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -22,6 +22,7 @@ #include "access/rewriteheap.h" #include "access/transam.h" #include "access/tuptoaster.h" +#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index baf66f1..d68c7c4 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -744,6 +744,7 @@ void vac_update_relstats(Relation relation, BlockNumber num_pages, double num_tuples, BlockNumber num_all_visible_pages, + BlockNumber num_all_frozen_pages, bool hasindex, TransactionId frozenxid, MultiXactId minmulti, bool in_outer_xact) @@ -781,6 +782,11 @@ vac_update_relstats(Relation relation, pgcform->relallvisible = (int32) num_all_visible_pages; dirty = true; } + if (pgcform->relallfrozen != (int32) num_all_frozen_pages) + { + pgcform->relallfrozen = (int32) num_all_frozen_pages; + dirty = true; + } /* Apply DDL updates, but not inside an outer transaction (see above) */ diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index a01cfb4..120de63 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -106,6 +106,8 @@ typedef struct LVRelStats 
BlockNumber rel_pages; /* total number of pages */ BlockNumber scanned_pages; /* number of pages we examined */ BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ + BlockNumber vmskipped_frozen_pages; /* # of pages we skipped due to the all-frozen bit of the visibility map */ double scanned_tuples; /* counts only tuples on scanned pages */ double old_rel_tuples; /* previous value of pg_class.reltuples */ double new_rel_tuples; /* new estimated total # of tuples */ @@ -156,7 +158,7 @@ static void lazy_record_dead_tuple(LVRelStats *vacrelstats, static bool lazy_tid_reaped(ItemPointer itemptr, void *state); static int vac_cmp_itemptr(const void *left, const void *right); static bool heap_page_is_all_visible(Relation rel, Buffer buf, - TransactionId *visibility_cutoff_xid); + TransactionId *visibility_cutoff_xid, bool *all_frozen); /* @@ -188,7 +190,8 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, MultiXactId mxactFullScanLimit; BlockNumber new_rel_pages; double new_rel_tuples; - BlockNumber new_rel_allvisible; + BlockNumber new_rel_allvisible, + new_rel_allfrozen; double new_live_tuples; TransactionId new_frozen_xid; MultiXactId new_min_multi; @@ -222,6 +225,8 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, * than or equal to the requested Xid full-table scan limit; or if the * table's minimum MultiXactId is older than or equal to the requested * mxid full-table scan limit. + * Even when scan_all is set, we can still skip pages whose all-frozen + * bit is set in the visibility map. */ scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, xidFullScanLimit); @@ -253,7 +258,8 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, * NB: We need to check this before truncating the relation, because that * will change ->rel_pages. */ - if (vacrelstats->scanned_pages < vacrelstats->rel_pages) + if ((vacrelstats->scanned_pages + vacrelstats->vmskipped_frozen_pages) + < vacrelstats->rel_pages) { Assert(!scan_all); scanned_all = false; @@ -301,10 +307,14 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, new_rel_tuples = vacrelstats->old_rel_tuples; } - new_rel_allvisible = visibilitymap_count(onerel); + new_rel_allvisible = visibilitymap_count(onerel, VISIBILITYMAP_ALL_VISIBLE); if (new_rel_allvisible > new_rel_pages) new_rel_allvisible = new_rel_pages; + new_rel_allfrozen = visibilitymap_count(onerel, VISIBILITYMAP_ALL_FROZEN); + if (new_rel_allfrozen > new_rel_pages) + new_rel_allfrozen = new_rel_pages; + new_frozen_xid = scanned_all ? FreezeLimit : InvalidTransactionId; new_min_multi = scanned_all ?
MultiXactCutoff : InvalidMultiXactId; @@ -312,6 +322,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, new_rel_pages, new_rel_tuples, new_rel_allvisible, + new_rel_allfrozen, vacrelstats->hasindex, new_frozen_xid, new_min_multi, @@ -360,10 +371,11 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans); - appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins\n"), + appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped according to vm\n"), vacrelstats->pages_removed, vacrelstats->rel_pages, - vacrelstats->pinskipped_pages); + vacrelstats->pinskipped_pages, + vacrelstats->vmskipped_frozen_pages); appendStringInfo(&buf, _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable\n"), vacrelstats->tuples_deleted, @@ -486,9 +498,12 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. - * Also, skipping even a single page means that we can't update - * relfrozenxid, so we only want to do it if we can skip a goodly number - * of pages. + * Also, skipping even a single page according to the all-visible bit of + * the visibility map means that we can't update relfrozenxid, so we only + * want to do it if we can skip a goodly number of pages. On the other + * hand, we count both how many pages we skipped according to the + * all-frozen bit of the visibility map and how many pages we froze, so we + * can update relfrozenxid if the sum of the two covers every page of the + * table. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not @@ -515,7 +530,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, next_not_all_visible_block < nblocks; next_not_all_visible_block++) { - if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) + if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN)) break; vacuum_delay_point(); } @@ -533,7 +549,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, bool tupgone, hastup; int prev_dead_count; - int nfrozen; + int nfrozen; /* # of tuples we froze */ + int nalready_frozen; /* # of tuples already frozen */ + int ntotal_frozen; /* total # of frozen tuples on this page */ + int ntup_per_page; Size freespace; bool all_visible_according_to_vm; bool all_visible; @@ -548,7 +567,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, - &vmbuffer)) + &vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN)) break; vacuum_delay_point(); } @@ -566,9 +586,25 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, } else { - /* Current block is all-visible */ - if (skipping_all_visible_blocks && !scan_all) - continue; + /* + * This block is at least all-visible according to the visibility + * map. We also check whether the block is all-frozen, in which + * case we can skip vacuuming it even when a whole-table scan is + * required.
+ */ + if (scan_all) + { + if (visibilitymap_test(onerel, blkno, &vmbuffer, VISIBILITYMAP_ALL_FROZEN)) + { + vacrelstats->vmskipped_frozen_pages++; + continue; + } + } + else + { + if (skipping_all_visible_blocks) + continue; + } + all_visible_according_to_vm = true; } @@ -740,7 +776,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, PageSetAllVisible(page); visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId); + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE); END_CRIT_SECTION(); } @@ -764,6 +801,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, all_visible = true; has_dead_tuples = false; nfrozen = 0; + nalready_frozen = 0; + ntup_per_page = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); @@ -918,8 +957,13 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, else { num_tuples += 1; + ntup_per_page += 1; hastup = true; + /* Check whether this tuple is already frozen or not */ + if (HeapTupleHeaderXminFrozen(tuple.t_data)) + nalready_frozen += 1; + /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. @@ -931,9 +975,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, } /* scan along page */ /* - * If we froze any tuples, mark the buffer dirty, and write a WAL - * record recording the changes. We must log the changes to be - * crash-safe against future truncation of CLOG. + * If we froze any tuples, or any tuples were already frozen, + * mark the buffer dirty, and write a WAL record recording the changes. + * We must log the changes to be crash-safe against future truncation + * of CLOG. */ if (nfrozen > 0) { @@ -966,6 +1011,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, END_CRIT_SECTION(); } + /* Compute the total number of frozen tuples on this page */ + ntotal_frozen = nfrozen + nalready_frozen; + /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. @@ -988,26 +1036,47 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, freespace = PageGetHeapFreeSpace(page); - /* mark page all-visible, if appropriate */ - if (all_visible && !all_visible_according_to_vm) + /* This page is all-visible */ + if (all_visible) { - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set the both bits - * so that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid); + uint8 flags = 0; + + /* mark page all-visible, if appropriate */ + if (!all_visible_according_to_vm) + { + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set the both bits + * so that we get back in sync.
+ * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + flags |= VISIBILITYMAP_ALL_VISIBLE; + } + + /* mark page all-frozen, if all of its tuples are frozen */ + if ((ntotal_frozen == ntup_per_page) && + !visibilitymap_test(onerel, blkno, &vmbuffer, VISIBILITYMAP_ALL_FROZEN)) + { + Assert(PageIsAllVisible(page)); + + PageSetAllFrozen(page); + flags |= VISIBILITYMAP_ALL_FROZEN; + } + + if (flags) + { + MarkBufferDirty(buf); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid, flags); + } } /* @@ -1018,7 +1087,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * that something bad has happened. */ else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_test(onerel, blkno, &vmbuffer, VISIBILITYMAP_ALL_VISIBLE)) { elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", relname, blkno); @@ -1047,6 +1116,17 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, visibilitymap_clear(onerel, blkno, vmbuffer); } + /* + * If, as a result of scanning the page, all of its tuples turned out + * to be frozen, set the all-frozen bit in both the page header and + * the visibility map. + */ + if (ntotal_frozen == ntup_per_page) + { + PageSetAllFrozen(page); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, vmbuffer, + InvalidTransactionId, VISIBILITYMAP_ALL_FROZEN); + } + UnlockReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ @@ -1078,7 +1158,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, num_tuples); /* - * Release any remaining pin on visibility map page. + * Release any remaining pin on the visibility map page. */ if (BufferIsValid(vmbuffer)) { @@ -1115,6 +1195,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, tups_vacuumed, vacuumed_pages))); /* + * This information shows how effective the all-frozen bit of the + * visibility map was at allowing pages to be skipped. + */ + ereport(elevel, + (errmsg("skipped %u frozen pages according to visibility map", + vacrelstats->vmskipped_frozen_pages))); + + /* * This is pretty messy, but we split it up so that we can skip emitting * individual parts of the message when not applicable. */ @@ -1226,6 +1314,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, OffsetNumber unused[MaxOffsetNumber]; int uncnt = 0; TransactionId visibility_cutoff_xid; + bool all_frozen; START_CRIT_SECTION(); @@ -1277,19 +1366,31 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, * dirty, exclusively locked, and, if needed, a full page image has been * emitted in the log_heap_clean() above. */ - if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid)) + if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid, &all_frozen)) PageSetAllVisible(page); /* * All the changes to the heap page have been done. If the all-visible - * flag is now set, also set the VM bit. + * flag is now set, also set the VM all-visible bit. + * Also, if this page is all-frozen, set the VM all-frozen bit and the + * page flag.
*/ - if (PageIsAllVisible(page) && - !visibilitymap_test(onerel, blkno, vmbuffer)) + if (PageIsAllVisible(page)) { - Assert(BufferIsValid(*vmbuffer)); - visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer, - visibility_cutoff_xid); + uint8 flags = 0; + + if (!visibilitymap_test(onerel, blkno, vmbuffer, VISIBILITYMAP_ALL_VISIBLE)) + flags |= VISIBILITYMAP_ALL_VISIBLE; + + /* mark page all-frozen, and set VM all-frozen bit */ + if (all_frozen) + { + PageSetAllFrozen(page); + flags |= VISIBILITYMAP_ALL_FROZEN; + } + + if (flags) + visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer, + visibility_cutoff_xid, flags); } return tupindex; @@ -1408,6 +1509,7 @@ lazy_cleanup_index(Relation indrel, stats->num_pages, stats->num_index_tuples, 0, + 0, false, InvalidTransactionId, InvalidMultiXactId, @@ -1782,7 +1884,8 @@ vac_cmp_itemptr(const void *left, const void *right) * xmin amongst the visible tuples. */ static bool -heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid) +heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid, + bool *all_frozen) { Page page = BufferGetPage(buf); BlockNumber blockno = BufferGetBlockNumber(buf); @@ -1791,6 +1894,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut bool all_visible = true; *visibility_cutoff_xid = InvalidTransactionId; + *all_frozen = true; /* * This is a stripped down version of the line pointer scan in @@ -1814,7 +1918,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut /* * Dead line pointers can have index pointers pointing to them. So - * they can't be treated as visible + * they can't be treated as visible or frozen. */ if (ItemIdIsDead(itemid)) { @@ -1855,6 +1959,10 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) *visibility_cutoff_xid = xmin; + + /* Check whether this tuple is already frozen or not */ + if (!HeapTupleHeaderXminFrozen(tuple.t_data)) + *all_frozen = false; } break; @@ -1863,6 +1971,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cut case HEAPTUPLE_INSERT_IN_PROGRESS: case HEAPTUPLE_DELETE_IN_PROGRESS: all_visible = false; + *all_frozen = false; break; default: diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9f54c46..08df289 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -116,7 +116,7 @@ IndexOnlyNext(IndexOnlyScanState *node) */ if (!visibilitymap_test(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) + &node->ioss_VMBuffer, VISIBILITYMAP_ALL_VISIBLE)) { /* * Rats, we have to visit the heap to check visibility.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 874ca6a..376841a 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -127,7 +127,7 @@ ExecCheckPlanOutput(Relation resultRel, List *targetList) if (attno != resultDesc->natts) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("table row type and query-specified row type do not match"), + errmsg("table row type and query-specified row type do not match"), errdetail("Query has too few columns."))); } diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 79d9390..8fededc 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -10,6 +10,7 @@ #include "postgres_fe.h" #include "pg_upgrade.h" +#include "storage/bufpage.h" #include <fcntl.h> @@ -21,6 +22,27 @@ static int copy_file(const char *fromfile, const char *tofile, bool force); static int win32_pghardlink(const char *src, const char *dst); #endif +static int rewrite_vm_to_vfm(const char *fromfile, const char *tofile, bool force); + +/* table for fast rewriting vm file to vfm file */ +static const uint16 rewrite_vm_to_vfm_table[256] = { + 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85, + 256, 257, 260, 261, 272, 273, 276, 277, 320, 321, 324, 325, 336, 337, 340, 341, + 1024, 1025, 1028, 1029, 1040, 1041, 1044, 1045, 1088, 1089, 1092, 1093, 1104, 1105, 1108, 1109, + 1280, 1281, 1284, 1285, 1296, 1297, 1300, 1301, 1344, 1345, 1348, 1349, 1360, 1361, 1364, 1365, + 4096, 4097, 4100, 4101, 4112, 4113, 4116, 4117, 4160, 4161, 4164, 4165, 4176, 4177, 4180, 4181, + 4352, 4353, 4356, 4357, 4368, 4369, 4372, 4373, 4416, 4417, 4420, 4421, 4432, 4433, 4436, 4437, + 5120, 5121, 5124, 5125, 5136, 5137, 5140, 5141, 5184, 5185, 5188, 5189, 5200, 5201, 5204, 5205, + 5376, 5377, 5380, 5381, 5392, 5393, 5396, 5397, 5440, 5441, 5444, 5445, 5456, 5457, 5460, 5461, + 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405, 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469, + 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661, 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725, + 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429, 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493, + 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685, 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749, + 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501, 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565, + 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757, 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821, + 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525, 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589, + 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781, 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845 +}; /* * copyAndUpdateFile() @@ -30,11 +52,19 @@ static int win32_pghardlink(const char *src, const char *dst); */ const char * copyAndUpdateFile(pageCnvCtx *pageConverter, - const char *src, const char *dst, bool force) + const char *src, const char *dst, bool force, bool rewrite_vm) { + if (pageConverter == NULL) { - if (pg_copy_file(src, dst, force) == -1) + int ret; + + if (rewrite_vm) + ret = rewrite_vm_to_vfm(src, dst, force); + else + ret = pg_copy_file(src, dst, force); + + if (ret) return getErrorText(errno); else return NULL; @@ -99,7 +129,6 @@ copyAndUpdateFile(pageCnvCtx *pageConverter, } } - /* * linkAndUpdateFile() * @@ -201,6 +230,110 @@ copy_file(const char *srcfile, const char *dstfile, 
bool force) #endif +/* + * rewrite_vm_to_vfm() + * + * An additional bit, indicating that all tuples on the page are completely + * frozen, was added to the visibility map in PG 9.6, so the format of the + * visibility map has changed. + * Copies a visibility map file while inserting a cleared all-frozen bit + * alongside each existing all-visible bit. + */ +static int +rewrite_vm_to_vfm(const char *fromfile, const char *tofile, bool force) +{ +#define REWRITE_BUF_SIZE (50 * BLCKSZ) +#define BITS_PER_HEAPBLOCK 2 + + int src_fd, dst_fd; + uint16 vfm_bits; + ssize_t nbytes; + char *buffer; + int ret = 0; + int save_errno = 0; + + if ((fromfile == NULL) || (tofile == NULL)) + { + errno = EINVAL; + return -1; + } + + if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0) + return -1; + + if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0) + { + save_errno = errno; + if (src_fd != 0) + close(src_fd); + + errno = save_errno; + return -1; + } + + buffer = (char *) pg_malloc(REWRITE_BUF_SIZE); + + /* Copy page header data in advance */ + if ((nbytes = read(src_fd, buffer, MAXALIGN(SizeOfPageHeaderData))) <= 0) + { + save_errno = errno; + return -1; + } + + if (write(dst_fd, buffer, nbytes) != nbytes) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + save_errno = errno; + return -1; + } + + /* perform the data rewriting, i.e. read from the source, write to the destination */ + while (true) + { + ssize_t nbytes = read(src_fd, buffer, REWRITE_BUF_SIZE); + char *cur, *end; + + if (nbytes < 0) + { + ret = -1; + break; + } + + if (nbytes == 0) + break; + + cur = buffer; + end = buffer + nbytes; + + /* + * Rewrite one source byte at a time, writing BITS_PER_HEAPBLOCK + * bytes to dst_fd for each. + */ + while (end > cur) + { + /* Get the rewritten bits from the conversion table */ + vfm_bits = rewrite_vm_to_vfm_table[(uint8) *cur]; + + if (write(dst_fd, &vfm_bits, BITS_PER_HEAPBLOCK) != BITS_PER_HEAPBLOCK) + { + ret = -1; + break; + } + cur++; + } + } + + pg_free(buffer); + + if (src_fd != 0) + close(src_fd); + + if (dst_fd != 0) + close(dst_fd); + + return ret; +} void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 13aa891..d957581 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -112,6 +112,11 @@ extern char *output_files[]; #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 /* + * The format of the visibility map changed with this 9.6 commit, + * + */ +#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201507161 +/* * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85, * ("Improve concurrency of foreign key locking") which also updated catalog * version to this value.
pg_upgrade behavior depends on whether old and new @@ -397,7 +402,7 @@ typedef void *pageCnvCtx; #endif const char *copyAndUpdateFile(pageCnvCtx *pageConverter, const char *src, - const char *dst, bool force); + const char *dst, bool force, bool rewrite_vm); const char *linkAndUpdateFile(pageCnvCtx *pageConverter, const char *src, const char *dst); diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index c22df42..766a473 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -18,7 +18,7 @@ static void transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, - const char *suffix); + const char *type_old_suffix, const char *type_new_suffix); /* @@ -171,6 +171,7 @@ transfer_single_new_db(pageCnvCtx *pageConverter, { int mapnum; bool vm_crashsafe_match = true; + bool vm_rewrite_needed = false; /* * Do the old and new cluster disagree on the crash-safetiness of the vm @@ -180,13 +181,20 @@ transfer_single_new_db(pageCnvCtx *pageConverter, new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) vm_crashsafe_match = false; + /* + * Do we need to rewrite "vm" to "vfm". + */ + if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER && + new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) + vm_rewrite_needed = true; + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { /* transfer primary file */ - transfer_relfile(pageConverter, &maps[mapnum], ""); + transfer_relfile(pageConverter, &maps[mapnum], "", ""); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) @@ -194,9 +202,17 @@ transfer_single_new_db(pageCnvCtx *pageConverter, /* * Copy/link any fsm and vm files, if they exist */ - transfer_relfile(pageConverter, &maps[mapnum], "_fsm"); + transfer_relfile(pageConverter, &maps[mapnum], "_fsm", "_fsm"); if (vm_crashsafe_match) - transfer_relfile(pageConverter, &maps[mapnum], "_vm"); + { + /* + * vm file is changed to vfm file in PG 9.6. + */ + if (vm_rewrite_needed) + transfer_relfile(pageConverter, &maps[mapnum], "_vm", "_vfm"); + else + transfer_relfile(pageConverter, &maps[mapnum], "_vm", "_vm"); + } } } } @@ -210,7 +226,7 @@ transfer_single_new_db(pageCnvCtx *pageConverter, */ static void transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, - const char *type_suffix) + const char *type_old_suffix, const char *type_new_suffix) { const char *msg; char old_file[MAXPGPATH]; @@ -218,6 +234,7 @@ transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, int fd; int segno; char extent_suffix[65]; + bool rewrite_vm = false; /* * Now copy/link any related segments as well. Remember, PG breaks large @@ -236,18 +253,18 @@ transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, map->old_tablespace_suffix, map->old_db_oid, map->old_relfilenode, - type_suffix, + type_old_suffix, extent_suffix); snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", map->new_tablespace, map->new_tablespace_suffix, map->new_db_oid, map->new_relfilenode, - type_suffix, + type_new_suffix, extent_suffix); /* Is it an extent, fsm, or vm file? */ - if (type_suffix[0] != '\0' || segno != 0) + if (type_old_suffix[0] != '\0' || segno != 0) { /* Did file open fail? 
*/ if ((fd = open(old_file, O_RDONLY, 0)) == -1) @@ -276,7 +293,11 @@ { pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) + /* We need to rewrite the vm file to a vfm file. */ + if (strcmp(type_old_suffix, type_new_suffix) != 0) + rewrite_vm = true; + + if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true, rewrite_vm)) != NULL) pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); } diff --git a/src/bin/pg_upgrade/test.sh b/src/bin/pg_upgrade/test.sh index f4e5d9a..53b8b2f 100644 --- a/src/bin/pg_upgrade/test.sh +++ b/src/bin/pg_upgrade/test.sh @@ -171,6 +171,11 @@ if "$MAKE" -C "$oldsrc" installcheck; then mv "$temp_root"/dump1.sql "$temp_root"/dump1.sql.orig sed "s;$oldsrc;$newsrc;g" "$temp_root"/dump1.sql.orig >"$temp_root"/dump1.sql fi + + vm_sql="SELECT c.relname, c.relallvisible FROM pg_class as c, pg_namespace as n WHERE c.relnamespace = n.oid AND n.nspname NOT IN ('information_schema', 'pg_toast', 'pg_catalog') ORDER BY c.relname;" + # Test for rewriting visibility map + vacuumdb -d regression || visibilitymap_vacuum1_status=$? + psql -d regression -c "$vm_sql" > "$temp_root"/vm_test1.txt || visibilitymap_test1_status=$? else make_installcheck_status=$? fi @@ -185,6 +190,14 @@ if [ -n "$pg_dumpall1_status" ]; then echo "pg_dumpall of pre-upgrade database cluster failed" exit 1 fi +if [ -n "$visibilitymap_vacuum1_status" ]; then + echo "VACUUM of pre-upgrade database cluster for visibility map test failed" + exit 1 +fi +if [ -n "$visibilitymap_test1_status" ]; then + echo "SELECT of pre-upgrade database cluster for visibility map test failed" + exit 1 +fi PGDATA=$BASE_PGDATA @@ -200,6 +213,8 @@ case $testhost in esac pg_dumpall -f "$temp_root"/dump2.sql || pg_dumpall2_status=$? +vacuumdb -d regression || visibilitymap_vacuum2_status=$? +psql -d regression -c "$vm_sql" > "$temp_root"/vm_test2.txt || visibilitymap_test2_status=$? pg_ctl -m fast stop # no need to echo commands anymore @@ -211,11 +226,26 @@ if [ -n "$pg_dumpall2_status" ]; then exit 1 fi +if [ -n "$visibilitymap_vacuum2_status" ]; then + echo "VACUUM of post-upgrade database cluster for visibility map test failed" + exit 1 +fi + +if [ -n "$visibilitymap_test2_status" ]; then + echo "SELECT of post-upgrade database cluster for visibility map test failed" + exit 1 +fi + case $testhost in MINGW*) cmd /c delete_old_cluster.bat ;; *) sh ./delete_old_cluster.sh ;; esac +if ! diff "$temp_root"/vm_test1.txt "$temp_root"/vm_test2.txt >/dev/null; then + echo "Visibility map rewriting test failed" + exit 1 +fi + if diff "$temp_root"/dump1.sql "$temp_root"/dump2.sql >/dev/null; then echo PASSED exit 0 diff --git a/src/common/relpath.c b/src/common/relpath.c index 66dfef1..5898f1b 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -30,11 +30,14 @@ * If you add a new entry, remember to update the errhint in * forkname_to_number() below, and update the SGML documentation for * pg_relation_size(). + * In 9.6 or later, the visibility map fork name is changed from "vm" to + * "vfm" because the visibility map now contains not only all-visible + * information but also all-frozen information.
diff --git a/src/common/relpath.c b/src/common/relpath.c
index 66dfef1..5898f1b 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -30,11 +30,14 @@
  * If you add a new entry, remember to update the errhint in
  * forkname_to_number() below, and update the SGML documentation for
  * pg_relation_size().
+ * In 9.6 or later, the visibility map fork name is changed from "vm" to
+ * "vfm" because the visibility map now carries not only all-visible
+ * information but also all-frozen information.
  */
 const char *const forkNames[] = {
 	"main",						/* MAIN_FORKNUM */
 	"fsm",						/* FSM_FORKNUM */
-	"vm",						/* VISIBILITYMAP_FORKNUM */
+	"vfm",						/* VISIBILITYMAP_FORKNUM */
 	"init"						/* INIT_FORKNUM */
 };
 
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index caa0f14..93afb10 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -320,9 +320,10 @@ typedef struct xl_heap_freeze_page
 
 typedef struct xl_heap_visible
 {
 	TransactionId cutoff_xid;
+	uint8		flags;
 } xl_heap_visible;
 
-#define SizeOfHeapVisible (offsetof(xl_heap_visible, cutoff_xid) + sizeof(TransactionId))
+#define SizeOfHeapVisible (offsetof(xl_heap_visible, flags) + sizeof(uint8))
 
 typedef struct xl_heap_new_cid
 {
@@ -389,6 +390,6 @@ extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
 						  xl_heap_freeze_tuple *xlrec_tp);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
-				 Buffer vm_buffer, TransactionId cutoff_xid);
+				 Buffer vm_buffer, TransactionId cutoff_xid, uint8 flags);
 
 #endif   /* HEAPAM_XLOG_H */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 0c0e0ef..7270609 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -19,15 +19,20 @@
 #include "storage/buf.h"
 #include "utils/relcache.h"
 
-extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
-					Buffer vmbuf);
+/* Flags for the visibility map bits */
+#define VISIBILITYMAP_ALL_VISIBLE	0x01
+#define VISIBILITYMAP_ALL_FROZEN	0x02
+
+extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
 				  Buffer *vmbuf);
 extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
 extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
-				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid);
-extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
-extern BlockNumber visibilitymap_count(Relation rel);
+				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+				  uint8 flags);
+extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf,
+				   uint8 flags);
+extern BlockNumber visibilitymap_count(Relation rel, uint8 flags);
 extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks);
 
 #endif   /* VISIBILITYMAP_H */
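With the flags argument in place, call sites read roughly like the fragment
below. This is a hypothetical sketch, not code from the patch: the function
name mark_page_all_frozen is made up, and locking, critical sections, and
error handling are elided. Passing InvalidXLogRecPtr lets visibilitymap_set()
emit its own xl_heap_visible record carrying the flags.

#include "postgres.h"

#include "access/visibilitymap.h"
#include "access/xlogdefs.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical fragment: mark a heap page both all-visible and all-frozen,
 * the way a freeze vacuum would, keeping the page-header flags and the
 * visibility map bits in sync.
 */
static void
mark_page_all_frozen(Relation rel, BlockNumber blkno, Buffer heapbuf,
					 Buffer vmbuffer, TransactionId cutoff_xid)
{
	Page		page = BufferGetPage(heapbuf);

	PageSetAllVisible(page);
	PageSetAllFrozen(page);

	visibilitymap_set(rel, blkno, heapbuf, InvalidXLogRecPtr, vmbuffer,
					  cutoff_xid,
					  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
}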
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 44ce2b3..734df9d 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201507021
+#define CATALOG_VERSION_NO	201507161
 
 #endif
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index e526cd9..ea0f7c1 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -47,6 +47,8 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
 	float4		reltuples;		/* # of tuples (not always up-to-date) */
 	int32		relallvisible;	/* # of all-visible blocks (not always
 								 * up-to-date) */
+	int32		relallfrozen;	/* # of all-frozen blocks (not always
+								 * up-to-date) */
 	Oid			reltoastrelid;	/* OID of toast table; 0 if none */
 	bool		relhasindex;	/* T if has (or has had) any indexes */
 	bool		relisshared;	/* T if shared across databases */
@@ -95,7 +97,7 @@ typedef FormData_pg_class *Form_pg_class;
  * ----------------
 */
 
-#define Natts_pg_class					30
+#define Natts_pg_class					31
 #define Anum_pg_class_relname			1
 #define Anum_pg_class_relnamespace		2
 #define Anum_pg_class_reltype			3
@@ -107,25 +109,26 @@ typedef FormData_pg_class *Form_pg_class;
 #define Anum_pg_class_relpages			9
 #define Anum_pg_class_reltuples			10
 #define Anum_pg_class_relallvisible	11
-#define Anum_pg_class_reltoastrelid		12
-#define Anum_pg_class_relhasindex		13
-#define Anum_pg_class_relisshared		14
-#define Anum_pg_class_relpersistence	15
-#define Anum_pg_class_relkind			16
-#define Anum_pg_class_relnatts			17
-#define Anum_pg_class_relchecks		18
-#define Anum_pg_class_relhasoids		19
-#define Anum_pg_class_relhaspkey		20
-#define Anum_pg_class_relhasrules		21
-#define Anum_pg_class_relhastriggers	22
-#define Anum_pg_class_relhassubclass	23
-#define Anum_pg_class_relrowsecurity	24
-#define Anum_pg_class_relispopulated	25
-#define Anum_pg_class_relreplident		26
-#define Anum_pg_class_relfrozenxid		27
-#define Anum_pg_class_relminmxid		28
-#define Anum_pg_class_relacl			29
-#define Anum_pg_class_reloptions		30
+#define Anum_pg_class_relallfrozen		12
+#define Anum_pg_class_reltoastrelid		13
+#define Anum_pg_class_relhasindex		14
+#define Anum_pg_class_relisshared		15
+#define Anum_pg_class_relpersistence	16
+#define Anum_pg_class_relkind			17
+#define Anum_pg_class_relnatts			18
+#define Anum_pg_class_relchecks		19
+#define Anum_pg_class_relhasoids		20
+#define Anum_pg_class_relhaspkey		21
+#define Anum_pg_class_relhasrules		22
+#define Anum_pg_class_relhastriggers	23
+#define Anum_pg_class_relhassubclass	24
+#define Anum_pg_class_relrowsecurity	25
+#define Anum_pg_class_relispopulated	26
+#define Anum_pg_class_relreplident		27
+#define Anum_pg_class_relfrozenxid		28
+#define Anum_pg_class_relminmxid		29
+#define Anum_pg_class_relacl			30
+#define Anum_pg_class_reloptions		31
 
 /* ----------------
  *		initial contents of pg_class
@@ -140,13 +143,13 @@ typedef FormData_pg_class *Form_pg_class;
  * Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId;
 * similarly, "1" in relminmxid stands for FirstMultiXactId
 */
-DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f p r 30 0 t f f f f f t n 3 1 _null_ _null_ ));
+DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f f t n 3 1 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f p r 21 0 f f f f f f t n 3 1 _null_ _null_ ));
+DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f f t n 3 1 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f p r 28 0 t f f f f f t n 3 1 _null_ _null_ ));
+DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 28 0 t f f f f f t n 3 1 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f p r 30 0 t f f f f f t n 3 1 _null_ _null_ ));
+DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 31 0 t f f f f f t n 3 1 _null_ _null_ ));
 DESCR("");
 
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 6fd1278..7d9b93f 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -3213,6 +3213,12 @@ DESCR("sleep until the specified time");
 DATA(insert OID = 2971 (  text				PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "16" _null_ _null_ _null_ _null_ _null_ booltext _null_ _null_ _null_ ));
 DESCR("convert boolean to text");
 
+DATA(insert OID = 3298 (  pg_is_all_visible	PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 16 "2205 20" _null_ _null_ _null_ _null_ _null_ pg_is_all_visible _null_ _null_ _null_ ));
+DESCR("true if the page is all visible");
+DATA(insert OID = 3299 (  pg_is_all_frozen	PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 16 "2205 20" _null_ _null_ _null_ _null_ _null_ pg_is_all_frozen _null_ _null_ _null_ ));
+DESCR("true if the page is all frozen");
+
+
 /* Aggregates (moved here from pg_aggregate for 7.3) */
 
 DATA(insert OID = 2100 (  avg				PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 1700 "20" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
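These two pg_proc entries resolve to C functions in the new heapfuncs.c,
which this mail does not quote. Roughly, each is a thin SQL-callable wrapper
around visibilitymap_test(); the sketch below shows the expected shape, with
the details guessed rather than copied from heapfuncs.c.

#include "postgres.h"

#include "access/heapam.h"
#include "access/visibilitymap.h"
#include "fmgr.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Sketch only: the real implementation lives in heapfuncs.c. */
Datum
pg_is_all_frozen(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	int64		blkno = PG_GETARG_INT64(1);
	Relation	rel;
	Buffer		vmbuffer = InvalidBuffer;
	bool		result;

	rel = relation_open(relid, AccessShareLock);
	result = visibilitymap_test(rel, (BlockNumber) blkno, &vmbuffer,
								VISIBILITYMAP_ALL_FROZEN);
	if (BufferIsValid(vmbuffer))
		ReleaseBuffer(vmbuffer);
	relation_close(rel, AccessShareLock);

	PG_RETURN_BOOL(result);
}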
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index e3a31af..d2bae2d 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -172,6 +172,7 @@ extern void vac_update_relstats(Relation relation,
 					BlockNumber num_pages,
 					double num_tuples,
 					BlockNumber num_all_visible_pages,
+					BlockNumber num_all_frozen_pages,
 					bool hasindex,
 					TransactionId frozenxid,
 					MultiXactId minmulti,
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index a2f78ee..7bf2718 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -178,8 +178,10 @@ typedef PageHeaderData *PageHeader;
 										 * tuple? */
 #define PD_ALL_VISIBLE		0x0004		/* all tuples on page are visible to
 										 * everyone */
+#define PD_ALL_FROZEN		0x0008		/* all tuples on page are completely
+										 * frozen */
 
-#define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
+#define PD_VALID_FLAG_BITS	0x000F		/* OR of all valid pd_flags bits */
 
 /*
 * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -369,6 +371,13 @@ typedef PageHeaderData *PageHeader;
 #define PageClearAllVisible(page) \
 	(((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE)
 
+#define PageIsAllFrozen(page) \
+	(((PageHeader) (page))->pd_flags & PD_ALL_FROZEN)
+#define PageSetAllFrozen(page) \
+	(((PageHeader) (page))->pd_flags |= PD_ALL_FROZEN)
+#define PageClearAllFrozen(page) \
+	(((PageHeader) (page))->pd_flags &= ~PD_ALL_FROZEN)
+
 #define PageIsPrunable(page, oldestxmin) \
 ( \
 	AssertMacro(TransactionIdIsNormal(oldestxmin)), \
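One property worth keeping in mind when reading the page-flag changes: a page
can only be all-frozen if it is also all-visible, since a frozen tuple is by
definition visible to everyone. The heap_insert and heap_multi_insert hunks
clear the two flags together accordingly. A one-liner expressing this
presumed invariant (my reading of the patch, not an assertion the patch
itself adds):

#include "postgres.h"

#include "storage/bufpage.h"

/* Presumed invariant: PD_ALL_FROZEN is never set without PD_ALL_VISIBLE. */
static void
check_frozen_implies_visible(Page page)
{
	if (PageIsAllFrozen(page))
		Assert(PageIsAllVisible(page));
}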
diff --git a/src/test/regress/expected/visibilitymap.out b/src/test/regress/expected/visibilitymap.out
new file mode 100644
index 0000000..543eeaa
--- /dev/null
+++ b/src/test/regress/expected/visibilitymap.out
@@ -0,0 +1,75 @@
+--
+-- Visibility map
+--
+CREATE FUNCTION
+	pg_visibilitymap(rel regclass, blkno OUT bigint, all_visible OUT bool, all_frozen OUT bool)
+RETURNS SETOF RECORD
+AS $$
+	SELECT blkno, pg_is_all_visible($1, blkno) AS all_visible, pg_is_all_frozen($1, blkno) AS all_frozen
+	FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno;
+$$
+LANGUAGE SQL;
+CREATE TABLE vmtest (i INT primary key);
+INSERT INTO vmtest SELECT generate_series(1,10000);
+\set VERBOSITY terse
+-- All pages become all-visible
+VACUUM vmtest;
+SELECT count(all_visible) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_visible;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT relallvisible = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+ ?column?
+----------
+ t
+(1 row)
+
+VACUUM FREEZE vmtest;
+SELECT count(all_visible) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_visible
+	GROUP BY all_visible;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT count(all_frozen) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_frozen
+	GROUP BY all_frozen;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT relallvisible = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT relallfrozen = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+ ?column?
+----------
+ t
+(1 row)
+
+-- All pages are skipped according to the VM
+VACUUM FREEZE VERBOSE vmtest;
+INFO:  vacuuming "public.vmtest"
+INFO:  index "vmtest_pkey" now contains 10000 row versions in 30 pages
+INFO:  Skipped 45 frozen pages according to visibility map
+INFO:  "vmtest": found 0 removable, 0 nonremovable row versions in 0 out of 45 pages
+DROP FUNCTION pg_visibilitymap(regclass);
+DROP TABLE vmtest;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 4df15de..893d773 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -108,5 +108,8 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare without_oid c
 # event triggers cannot run concurrently with any test that runs DDL
 test: event_trigger
 
+# the visibility map test cannot run concurrently with any other test, since
+# concurrent transactions can keep pages from becoming all-visible
+test: visibilitymap
+
 # run stats by itself because its delay may be insufficient under heavy load
 test: stats
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 3a607cf..76dbff7 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -157,3 +157,4 @@ test: xml
 test: event_trigger
 test: stats
 test: tablesample
+test: visibilitymap
diff --git a/src/test/regress/sql/visibilitymap.sql b/src/test/regress/sql/visibilitymap.sql
new file mode 100644
index 0000000..11b552e
--- /dev/null
+++ b/src/test/regress/sql/visibilitymap.sql
@@ -0,0 +1,49 @@
+--
+-- Visibility map
+--
+
+CREATE FUNCTION
+	pg_visibilitymap(rel regclass, blkno OUT bigint, all_visible OUT bool, all_frozen OUT bool)
+RETURNS SETOF RECORD
+AS $$
+	SELECT blkno, pg_is_all_visible($1, blkno) AS all_visible, pg_is_all_frozen($1, blkno) AS all_frozen
+	FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno;
+$$
+LANGUAGE SQL;
+
+CREATE TABLE vmtest (i INT primary key);
+INSERT INTO vmtest SELECT generate_series(1,10000);
+
+\set VERBOSITY terse
+
+-- All pages become all-visible
+VACUUM vmtest;
+SELECT count(all_visible) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_visible;
+SELECT relallvisible = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+
+VACUUM FREEZE vmtest;
+SELECT count(all_visible) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_visible
+	GROUP BY all_visible;
+SELECT count(all_frozen) = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_visibilitymap('vmtest')
+	WHERE all_frozen
+	GROUP BY all_frozen;
+
+SELECT relallvisible = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+SELECT relallfrozen = (pg_relation_size('vmtest') / current_setting('block_size')::int)
+	FROM pg_class
+	WHERE relname = 'vmtest';
+
+-- All pages are skipped according to the VM
+VACUUM FREEZE VERBOSE vmtest;
+
+DROP FUNCTION pg_visibilitymap(regclass);
+DROP TABLE vmtest;