On Wed, Nov 19, 2014 at 2:09 PM, Peter Geoghegan <[email protected]> wrote:
> Attached is a revision of what I previously called btreecheck, which
> is now renamed to amcheck.
Whoops. I left in modifications to pg_config_manual.h to build with
Valgrind support. Here is a version without that.
--
Peter Geoghegan
diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
new file mode 100644
index ...07b18d8
*** a/contrib/amcheck/Makefile
--- b/contrib/amcheck/Makefile
***************
*** 0 ****
--- 1,18 ----
+ # contrib/amcheck/Makefile
+
+ MODULE_big = amcheck
+ OBJS = amcheck.o
+
+ EXTENSION = amcheck
+ DATA = amcheck--1.0.sql
+
+ ifdef USE_PGXS
+ PG_CONFIG = pg_config
+ PGXS := $(shell $(PG_CONFIG) --pgxs)
+ include $(PGXS)
+ else
+ subdir = contrib/amcheck
+ top_builddir = ../..
+ include $(top_builddir)/src/Makefile.global
+ include $(top_srcdir)/contrib/contrib-global.mk
+ endif
diff --git a/contrib/amcheck/amcheck--1.0.sql b/contrib/amcheck/amcheck--1.0.sql
new file mode 100644
index ...e25fad1
*** a/contrib/amcheck/amcheck--1.0.sql
--- b/contrib/amcheck/amcheck--1.0.sql
***************
*** 0 ****
--- 1,44 ----
+ /* contrib/amcheck/amcheck--1.0.sql */
+
+ -- complain if script is sourced in psql, rather than via CREATE EXTENSION
+ \echo Use "CREATE EXTENSION amcheck" to load this file. \quit
+
+ --
+ -- bt_page_verify()
+ --
+ CREATE FUNCTION bt_page_verify(relname text, blkno int4)
+ RETURNS VOID
+ AS 'MODULE_PATHNAME', 'bt_page_verify'
+ LANGUAGE C STRICT;
+
+ --
+ -- bt_parent_page_verify()
+ --
+ CREATE FUNCTION bt_parent_page_verify(relname text, blkno int4)
+ RETURNS VOID
+ AS 'MODULE_PATHNAME', 'bt_parent_page_verify'
+ LANGUAGE C STRICT;
+
+ --
+ -- bt_index_verify()
+ --
+ CREATE FUNCTION bt_index_verify(relname text)
+ RETURNS VOID
+ AS 'MODULE_PATHNAME', 'bt_index_verify'
+ LANGUAGE C STRICT;
+
+ --
+ -- bt_parent_index_verify()
+ --
+ CREATE FUNCTION bt_parent_index_verify(relname text)
+ RETURNS VOID
+ AS 'MODULE_PATHNAME', 'bt_parent_index_verify'
+ LANGUAGE C STRICT;
+
+ --
+ -- bt_leftright_verify()
+ --
+ CREATE FUNCTION bt_leftright_verify(relname text)
+ RETURNS VOID
+ AS 'MODULE_PATHNAME', 'bt_leftright_verify'
+ LANGUAGE C STRICT;
diff --git a/contrib/amcheck/amcheck.c b/contrib/amcheck/amcheck.c
new file mode 100644
index ...8776fbe
*** a/contrib/amcheck/amcheck.c
--- b/contrib/amcheck/amcheck.c
***************
*** 0 ****
--- 1,1009 ----
+ /*-------------------------------------------------------------------------
+ *
+ * amcheck.c
+ * Verifies the integrity of access methods based on invariants.
+ *
+ * Currently, only checks for the nbtree AM are provided. Provides
+ * SQL-callable functions for verifying that various invariants in the
+ * structure of nbtree indexes are respected. This includes for example the
+ * invariant that each page with a high key has data items bound by the high
+ * key. Some functions also check invariant conditions that must hold across
+ * multiple pages, such as the requirement each left link within a page (if
+ * any) should point to a page that has as its right link a pointer to the
+ * page.
+ *
+ *
+ * Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/amcheck/amcheck.c
+ *
+ *-------------------------------------------------------------------------
+ */
+ #include "postgres.h"
+
+ #include "access/nbtree.h"
+ #include "catalog/index.h"
+ #include "catalog/namespace.h"
+ #include "commands/tablecmds.h"
+ #include "funcapi.h"
+ #include "miscadmin.h"
+ #include "storage/lmgr.h"
+ #include "utils/builtins.h"
+ #include "utils/lsyscache.h"
+ #include "utils/rel.h"
+
+
+ PG_MODULE_MAGIC;
+
+ #define CHECK_SUPERUSER() { \
+ if (!superuser()) \
+ ereport(ERROR, \
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), \
+ (errmsg("must be superuser to use verification functions")))); }
+ #define CHECK_RELATION_BTREE(r) { \
+ if ((r)->rd_rel->relkind != RELKIND_INDEX || (r)->rd_rel->relam != BTREE_AM_OID) \
+ elog(ERROR, "relation \"%s\" is not a btree index", \
+ RelationGetRelationName(r)); }
+ /* reject attempts to read non-local temporary relations */
+ #define CHECK_RELATION_IS_NOT_OTHER_TEMP(rel) { \
+ if (RELATION_IS_OTHER_TEMP(rel)) \
+ ereport(ERROR, \
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \
+ errmsg("cannot access temporary tables of other sessions"))); }
+ /* note: BlockNumber is unsigned, hence can't be negative */
+ #define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \
+ if (blkno == 0) \
+ elog(ERROR, "block 0 is a meta page"); \
+ if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno) ) \
+ elog(ERROR, "block number out of range"); }
+
+ /*
+ * Callers to verification functions can reasonably expect to never receive a
+ * warning. Therefore, when using amcheck functions for stress testing it
+ * may be useful to temporally change CHCKELEVEL to PANIC, to immediately halt
+ * the server in the event of detecting an invariant condition violation. This
+ * may preserve more information about the nature of the underlying problem.
+ */
+ #define CHCKELEVEL WARNING
+
+ PG_FUNCTION_INFO_V1(bt_page_verify);
+ PG_FUNCTION_INFO_V1(bt_parent_page_verify);
+ PG_FUNCTION_INFO_V1(bt_index_verify);
+ PG_FUNCTION_INFO_V1(bt_parent_index_verify);
+ PG_FUNCTION_INFO_V1(bt_leftright_verify);
+
+ static void rangevar_callback_for_childcheck(const RangeVar *relation,
+ Oid relId, Oid oldRelId,
+ void *arg);
+ static void bt_do_page_verify(Relation rel, BlockNumber blockNum,
+ bool childCheck, bool singlepage);
+ static ScanKey first_item_right_page(Relation rel, BlockNumber rightblockNum,
+ Page *rpage);
+ static ScanKey verify_btree_items_le(Relation rel, Page childPage,
+ ScanKey downLinkParent, IndexTuple pItup);
+ static BlockNumber verify_leftright_level(Relation rel, BlockNumber leftMost);
+ static char *form_btree_data(IndexTuple itup);
+
+ /*
+ * Verify specified B-Tree block/page.
+ *
+ * Only acquires AccessShareLock on index relation.
+ */
+ Datum
+ bt_page_verify(PG_FUNCTION_ARGS)
+ {
+ text *relname = PG_GETARG_TEXT_P(0);
+ uint32 blkno = PG_GETARG_UINT32(1);
+ Relation rel;
+ RangeVar *relrv;
+
+ CHECK_SUPERUSER();
+
+ relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+ rel = relation_openrv(relrv, AccessShareLock);
+
+ CHECK_RELATION_BTREE(rel);
+
+ CHECK_RELATION_IS_NOT_OTHER_TEMP(rel);
+
+ CHECK_RELATION_BLOCK_RANGE(rel, blkno);
+
+ bt_do_page_verify(rel, blkno, false, true);
+
+ relation_close(rel, AccessShareLock);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Verify specified B-Tree parent block/page. Make sure that each child page
+ * has as its right link the page that the parent page has as the next
+ * down-link, and other checks between parent and child pages when examining a
+ * parent page.
+ *
+ * Acquires AccessExclusiveLock on index relation, and ShareLock on the
+ * associated heap relation, much like REINDEX.
+ */
+ Datum
+ bt_parent_page_verify(PG_FUNCTION_ARGS)
+ {
+ text *relname = PG_GETARG_TEXT_P(0);
+ uint32 blkno = PG_GETARG_UINT32(1);
+ Relation iRel, heapRel;
+ RangeVar *relrv;
+ Oid indexId, heapId = InvalidOid;
+
+ CHECK_SUPERUSER();
+
+ relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+
+ /*
+ * Open the target index relation and get an exclusive lock on it, to
+ * ensure that no one else is touching this particular index.
+ */
+ indexId = RangeVarGetRelidExtended(relrv, AccessExclusiveLock,
+ false, false,
+ rangevar_callback_for_childcheck,
+ (void *) &heapId);
+
+ /*
+ * Open the target index relations separately (like relation_openrv() but
+ * with heap relation locked first to prevent deadlocking)
+ */
+ heapRel = heap_open(heapId, NoLock);
+ iRel = index_open(indexId, NoLock);
+
+ /* check for active uses of the index in the current transaction */
+ CheckTableNotInUse(iRel, "bt_parent_page_verify");
+
+ CHECK_RELATION_BTREE(iRel);
+
+ CHECK_RELATION_IS_NOT_OTHER_TEMP(iRel);
+
+ CHECK_RELATION_BLOCK_RANGE(iRel, blkno);
+
+ bt_do_page_verify(iRel, blkno, true, true);
+
+ index_close(iRel, AccessExclusiveLock);
+ heap_close(heapRel, ShareLock);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Verify entire index, without looking at cross-page invariant conditions.
+ *
+ * Only acquires AccessShareLock on index relation.
+ */
+ Datum
+ bt_index_verify(PG_FUNCTION_ARGS)
+ {
+ text *relname = PG_GETARG_TEXT_P(0);
+ Relation rel;
+ RangeVar *relrv;
+ BlockNumber i;
+ BlockNumber nBlocks;
+
+ CHECK_SUPERUSER();
+
+ relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+ rel = relation_openrv(relrv, AccessShareLock);
+
+ CHECK_RELATION_BTREE(rel);
+
+ CHECK_RELATION_IS_NOT_OTHER_TEMP(rel);
+
+ nBlocks = RelationGetNumberOfBlocks(rel);
+
+ /* Verify every page */
+ for (i = BTREE_METAPAGE + 1; i < nBlocks; i++)
+ {
+ bt_do_page_verify(rel, i, false, false);
+ }
+
+ relation_close(rel, AccessShareLock);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Verify entire index, considering cross-page invariant conditions between
+ * parent and child pages.
+ *
+ * Acquires AccessExclusiveLock on index relation, and ShareLock on the
+ * associated heap relation, much like REINDEX.
+ */
+ Datum
+ bt_parent_index_verify(PG_FUNCTION_ARGS)
+ {
+ text *relname = PG_GETARG_TEXT_P(0);
+ RangeVar *relrv;
+ BlockNumber i;
+ BlockNumber nBlocks;
+ Relation iRel, heapRel;
+ Oid indexId, heapId = InvalidOid;
+
+ CHECK_SUPERUSER();
+
+ relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+
+ /*
+ * Open the target index relation and get an exclusive lock on it, to
+ * ensure that no one else is touching this particular index.
+ */
+ indexId = RangeVarGetRelidExtended(relrv, AccessExclusiveLock,
+ false, false,
+ rangevar_callback_for_childcheck,
+ (void *) &heapId);
+
+ /*
+ * Open the target index relations separately (like relation_openrv() but
+ * with heap relation locked first to prevent deadlocking)
+ */
+ heapRel = heap_open(heapId, NoLock);
+ iRel = index_open(indexId, NoLock);
+
+ /* check for active uses of the index in the current transaction */
+ CheckTableNotInUse(iRel, "bt_parent_index_verify");
+
+ CHECK_RELATION_BTREE(iRel);
+
+ CHECK_RELATION_IS_NOT_OTHER_TEMP(iRel);
+
+ nBlocks = RelationGetNumberOfBlocks(iRel);
+
+ /* Verify every page, and parent/child relationships */
+ for (i = BTREE_METAPAGE + 1; i < nBlocks; i++)
+ {
+ bt_do_page_verify(iRel, i, true, false);
+ }
+
+ index_close(iRel, AccessExclusiveLock);
+ heap_close(heapRel, ShareLock);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Verify left-right link consistency at each level, starting from the true
+ * root.
+ *
+ * Only acquires AccessShareLock on index relation.
+ */
+ Datum
+ bt_leftright_verify(PG_FUNCTION_ARGS)
+ {
+ text *relname = PG_GETARG_TEXT_P(0);
+ Relation rel;
+ RangeVar *relrv;
+ Buffer buffer;
+ Page page;
+ BlockNumber leftMost;
+ BTMetaPageData *metad;
+
+ CHECK_SUPERUSER();
+
+ relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+ rel = relation_openrv(relrv, AccessShareLock);
+
+ CHECK_RELATION_BTREE(rel);
+
+ CHECK_RELATION_IS_NOT_OTHER_TEMP(rel);
+
+ /* Get true root block from meta-page */
+ buffer = ReadBuffer(rel, BTREE_METAPAGE);
+ LockBuffer(buffer, BT_READ);
+
+ page = BufferGetPage(buffer);
+ metad = BTPageGetMeta(page);
+
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Verify every level, starting form true root's level. Move left to right.
+ * Process one level at a time.
+ */
+ leftMost = metad->btm_root;
+
+ while (leftMost != P_NONE)
+ {
+ leftMost = verify_leftright_level(rel, leftMost);
+ }
+
+ relation_close(rel, AccessShareLock);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Worker for bt_page_verify(), bt_index_verify() and "parent" variants of
+ * both.
+ *
+ * It is the caller's responsibility to handle heavyweight locking. Checking
+ * children from down-links has race conditions with only an AccessShareLock,
+ * since there is never an attempt to get a consistent view of multiple pages
+ * using multiple concurrent buffer locks (in general, we prefer to only lock
+ * one buffer at a time, and to only manipulate it in local memory).
+ */
+ static void
+ bt_do_page_verify(Relation rel, BlockNumber blockNum, bool childCheck,
+ bool singlepage)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber offset, maxoffset;
+ BTPageOpaque opaque;
+ int16 relnatts;
+
+ buffer = ReadBuffer(rel, blockNum);
+ LockBuffer(buffer, BT_READ);
+
+ /*
+ * We copy the page into local storage to avoid holding pin on the
+ * buffer longer than we must, and possibly failing to release it at
+ * all if the calling query doesn't fetch all rows.
+ */
+ page = palloc(BLCKSZ);
+ memcpy(page, BufferGetPage(buffer), BLCKSZ);
+
+ UnlockReleaseBuffer(buffer);
+
+ relnatts = rel->rd_rel->relnatts;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ maxoffset = PageGetMaxOffsetNumber(page);
+
+ for (offset = P_FIRSTDATAKEY(opaque); offset <= maxoffset; offset++)
+ {
+ ItemId itemId;
+ IndexTuple itup;
+ ScanKey skey;
+ int32 eqs_hikey, eqs_item;
+
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+
+ if (P_IGNORE(opaque))
+ {
+ elog(NOTICE, "page %u of index \"%s\" is ignored", blockNum,
+ RelationGetRelationName(rel));
+ break;
+ }
+
+ /*
+ * As noted in comments above _bt_compare(), there is special handling
+ * of the first data item on a non-leaf page. There is clearly no
+ * point in building a ScanKey of whatever data happens to be in this
+ * first data IndexTuple, since its contents are undefined. (This is
+ * because _bt_compare() is hard-coded to specially treat it as "minus
+ * infinity").
+ */
+ if (!P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque))
+ continue;
+
+ itemId = PageGetItemId(page, offset);
+
+ if (!ItemIdIsValid(itemId))
+ elog(Max(CHCKELEVEL, ERROR), "invalid itemId");
+
+ itup = (IndexTuple) PageGetItem(page, itemId);
+
+ /* Build ScanKey that relates to the current item/offset's value */
+ skey = _bt_mkscankey(rel, itup);
+
+ eqs_hikey = eqs_item = 0;
+
+ /* If there is a high key, compare this item to it */
+ if (!P_RIGHTMOST(opaque))
+ eqs_hikey = _bt_compare(rel, relnatts, skey, page, P_HIKEY);
+
+ /*
+ * Check that items are stored on page in logical order.
+ *
+ * In each iteration, check that next item is equal to or greater than
+ * current item, until there is no next item (i.e. don't do this on the
+ * last iteration).
+ */
+ if (offset < maxoffset)
+ {
+ /* Iteration before last case */
+ eqs_item = _bt_compare(rel, relnatts, skey, page, offset + 1);
+ }
+ else if (!P_RIGHTMOST(opaque))
+ {
+ ScanKey nLow;
+ Page rpage;
+
+ /* Last iteration (page item) */
+ Assert(offset == maxoffset);
+
+ /*
+ * There is no "next" item available to compare this final item to
+ * on this same page (there is only the page highkey). Look for
+ * one in right-link page, since one should be available.
+ *
+ * Get first item on right page by following right link, and verify
+ * that that item is less than or equal to this last item. Note
+ * that we always get the first real data item -- not the
+ * right-link's high key, and not the garbage first "real" item
+ * that internal pages have.
+ *
+ * It might be that there has been a page split and the right link
+ * is now not the same as the right link during the time that we
+ * took a local copy of the current page, but (even without
+ * heavyweight locking by caller) this shouldn't matter. It should
+ * be possible to treat whatever we find to the right (if anything)
+ * as an invariant that must hold. (Since pages split right, it
+ * might be the same page in a sense, which won't in itself have
+ * changed the first item, or it might be deleted.)
+ *
+ * XXX: In general there is a pretty good chance that this right
+ * page will be visited at some later juncture (if we're in the
+ * process of verifying the entire index, for example). It seems
+ * wasteful that there will be considerable redundant buffer
+ * locking and page copying. Revisit this.
+ *
+ * In general, a lot of checking performed by amcheck will probably
+ * turn out to be superfluous when the code is tightened up in the
+ * future.
+ */
+ nLow = first_item_right_page(rel, opaque->btpo_next, &rpage);
+
+ if (nLow)
+ {
+ int32 eqs_nextpage;
+
+ /*
+ * This is a bit different to what earlier iterations do. They
+ * user their item's scankey to compare to next item (so in
+ * this final iteration, we have just verified that second last
+ * item is less than or equal to this item -- we're now
+ * checking this item against items on next page).
+ *
+ * Since ScanKey comes from next page, and is compared to
+ * current offset, next-page-generated scankey should be
+ * greater than (or equal to) this current offset/item.
+ */
+ eqs_nextpage = _bt_compare(rel, relnatts, nLow, page, offset);
+
+ if (eqs_nextpage < 0)
+ elog(CHCKELEVEL, "page order invariant violated for index \"%s\" block %u across block and next page",
+ RelationGetRelationName(rel), offset);
+
+ pfree(nLow);
+ pfree(rpage);
+ }
+ }
+
+ /*
+ * It is expected that current iteration's ScanKey is always less than
+ * the high key on this page, and always less than the next
+ * item/offset.
+ */
+ if (eqs_hikey > 0 || eqs_item > 0)
+ {
+ char *hblk, *iblk, *niblk, *dump;
+
+ hblk = psprintf("(%u,%u)",
+ BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)),
+ itup->t_tid.ip_posid);
+
+ iblk = psprintf("(%u,%u)",
+ blockNum,
+ offset);
+
+ niblk = psprintf("(%u,%u)",
+ blockNum,
+ offset + 1);
+
+ dump = form_btree_data(itup);
+
+ if (eqs_hikey > 0)
+ elog(CHCKELEVEL, "high key invariant violated for index \"%s\" %s with data %s (points to: %s)",
+ RelationGetRelationName(rel), iblk, dump, hblk);
+ if (eqs_item > 0)
+ elog(CHCKELEVEL, "page order invariant violated for index \"%s\" (%s and %s) with data %s (points to: %s)",
+ RelationGetRelationName(rel), iblk, niblk, dump, hblk);
+ }
+
+ /*
+ * For internal pages, verify that child pages comport with the
+ * down-link in the parent page. This is safe against concurrent page
+ * splits because caller must have taken appropriate heavyweight locks
+ * for this case. If there is an old incomplete page split, detect
+ * that and handle it appropriately.
+ *
+ * This seems unlikely to be too wasteful in any particular scenario
+ * (other than the fact that it necessitates a heavy lmgr lock). After
+ * all, this only needs to occur with internal pages, that are almost
+ * always than 5% of the total, and often less than 1%.
+ *
+ * However, we should think about doing this in a separate pass with
+ * that heavier lock, or perhaps coming up with a smarter buffer
+ * locking protocol that obviates the need for a hwlock. For now, it
+ * seems far preferable to not be overly clever with buffer locks.
+ */
+ if (childCheck && !P_ISLEAF(opaque))
+ {
+ Page childPage;
+ BlockNumber childBlockNum;
+ Buffer childBuffer;
+ ScanKey childHkey;
+
+ childBlockNum = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ childBuffer = ReadBuffer(rel, childBlockNum);
+ LockBuffer(childBuffer, BT_READ);
+
+ /* Copy this page into local storage too */
+ childPage = palloc(BLCKSZ);
+ memcpy(childPage, BufferGetPage(childBuffer), BLCKSZ);
+
+ UnlockReleaseBuffer(childBuffer);
+
+ /*
+ * Verify child page that this down-link points to has the down-link
+ * key as a lower bound.
+ */
+ childHkey = verify_btree_items_le(rel, childPage, skey, itup);
+
+ if (childHkey && offset < maxoffset)
+ {
+ /* state for child page, and next data item/offset */
+ BlockNumber childRightLinkBlockNum,
+ parentNextOffsetBlockNum;
+ IndexTuple nIndTup;
+ ItemId nItemId;
+ BTPageOpaque childOpaque;
+ int32 res;
+
+ /*
+ * Since this isn't the last iteration, and there can't have
+ * been an incomplete page split on child page just inspected
+ * (since a child high key was returned), further check that
+ * next item on this page (the parent) is equal to child's high
+ * key.
+ */
+ childOpaque = (BTPageOpaque) PageGetSpecialPointer(childPage);
+ Assert(!P_RIGHTMOST(childOpaque));
+ Assert(!P_INCOMPLETE_SPLIT(childOpaque));
+
+ res = _bt_compare(rel, relnatts, childHkey, page, offset + 1);
+
+ if (res != 0)
+ {
+ char *iblk, *piblk;
+
+ /*
+ * Principally blame next item, not this item for problem,
+ * since it was the down-link that completed the split,
+ * which seems most plausible as a proximate cause.
+ */
+ iblk = psprintf("(%u,%u)",
+ blockNum,
+ offset + 1);
+ piblk = psprintf("(%u,%u)",
+ blockNum,
+ offset);
+
+ elog(CHCKELEVEL, "down-link in parent page in index \"%s\" %s is %s (not equal to) immediately prior item's %s child's (block %u) high key",
+ RelationGetRelationName(rel), iblk,
+ res < 0 ? "greater than":"less than",
+ piblk, childBlockNum);
+ }
+
+ /*
+ * Match block number of child's right link to next parent
+ * offset item's block number, too. This is similar to but
+ * slightly distinct from the child high key matching
+ * verification.
+ */
+ nItemId = PageGetItemId(page, offset + 1);
+
+ if (!ItemIdIsValid(nItemId))
+ elog(Max(CHCKELEVEL, ERROR), "invalid nItemId for next item");
+
+ childRightLinkBlockNum = childOpaque->btpo_next;
+ nIndTup = (IndexTuple) PageGetItem(page, nItemId);
+ parentNextOffsetBlockNum =
+ BlockIdGetBlockNumber(&(nIndTup ->t_tid.ip_blkid));
+
+ if (childRightLinkBlockNum != parentNextOffsetBlockNum)
+ elog(CHCKELEVEL, "next item/offset down-link (%u) in parent page does not match child (block %u) right link block number",
+ parentNextOffsetBlockNum, childBlockNum);
+
+ pfree(childHkey);
+ }
+ pfree(childPage);
+ }
+ else if (childCheck && singlepage && P_ISLEAF(opaque))
+ {
+ elog(CHCKELEVEL, "single leaf page was specified as target for checking against child");
+ }
+ pfree(skey);
+ }
+ pfree(page);
+ }
+
+ /*
+ * Lock the heap before the RangeVarGetRelidExtended takes the index lock, to
+ * avoid deadlocks.
+ *
+ * If there was ever a requirement to check permissions, this would be the
+ * place to do it. Currently, we just insist upon superuser.
+ */
+ static void
+ rangevar_callback_for_childcheck(const RangeVar *relation,
+ Oid relId, Oid oldRelId, void *arg)
+ {
+ Oid *heapOid = (Oid *) arg;
+
+ /*
+ * If we previously locked some other index's heap, and the name we're
+ * looking up no longer refers to that relation, release the now-useless
+ * lock.
+ */
+ if (relId != oldRelId && OidIsValid(oldRelId))
+ {
+ /* lock level here should match reindex_index() heap lock */
+ UnlockRelationOid(*heapOid, ShareLock);
+ *heapOid = InvalidOid;
+ }
+
+ /* If the relation does not exist, there's nothing more to do. */
+ if (!OidIsValid(relId))
+ return;
+
+ /* Lock heap before index to avoid deadlock. */
+ if (relId != oldRelId)
+ {
+ /*
+ * Lock level here should match elsewhere. If the OID isn't valid, it
+ * means the index was concurrently dropped, which is not a problem for
+ * us; just return normally.
+ */
+ *heapOid = IndexGetRelation(relId, true);
+ if (OidIsValid(*heapOid))
+ LockRelationOid(*heapOid, ShareLock);
+ }
+ }
+
+ /*
+ * Worker for bt_do_page_verify()
+ *
+ * Requires only that AccessShareLock has been acquired by caller on target
+ * btree index relation.
+ *
+ * Given an original non-rightmost page's right link block (that points to a
+ * page that may or may not itself be right-most), return a scanKey for the
+ * first real data item on the right page sufficient to check ordering
+ * invariant on last item in original page. Also, pass back right-link page,
+ * which caller manages memory of (since the ScanKey points into it).
+ *
+ * Handles concurrent page splits and deletions.
+ */
+ static ScanKey
+ first_item_right_page(Relation rel, BlockNumber rightblockNum, Page *rpage)
+ {
+ Buffer buffer;
+ BTPageOpaque opaque;
+ ItemId firstDataId;
+ IndexTuple firstDataTup;
+
+ /* copy the page into local storage */
+ *rpage = palloc(BLCKSZ);
+
+ for (;;)
+ {
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ReadBuffer(rel, rightblockNum);
+ LockBuffer(buffer, BT_READ);
+
+ memcpy(*rpage, BufferGetPage(buffer), BLCKSZ);
+
+ UnlockReleaseBuffer(buffer);
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(*rpage);
+
+ /* Ordinarily, we expect to not ignore the right linked page */
+ if (!P_IGNORE(opaque))
+ break;
+
+ /*
+ * Handle race -- find first non-ignore page until one is found, or at
+ * rightmost.
+ *
+ * Before freeing memory, check if this is rightmost (which means we're
+ * done, and couldn't generate a useful scankey for caller's target).
+ */
+ if (P_RIGHTMOST(opaque))
+ goto abort;
+
+ rightblockNum = opaque->btpo_next;
+ elog(NOTICE, "left-most page was found deleted or half dead");
+ }
+
+ firstDataId = PageGetItemId(*rpage, P_FIRSTDATAKEY(opaque));
+
+ if (!P_ISLEAF(opaque))
+ {
+ /*
+ * Since this is an internal page, the first "real" data item is
+ * actually a garbage "minus infinity" item. Don't return garbage item
+ * on caller's target's right page (our target page), though. This
+ * isn't a real data key for caller's purposes. (This is separately
+ * avoided by caller, when it iterates through its target page).
+ */
+ if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(*rpage))
+ firstDataId = PageGetItemId(*rpage, P_FIRSTDATAKEY(opaque) + 1);
+ else
+ goto abort;
+ }
+
+ if (!ItemIdIsValid(firstDataId))
+ elog(Max(CHCKELEVEL, ERROR), "invalid hKeyItemId");
+
+ /*
+ * Return scankey, and leave it to caller to free all memory (Page storage
+ * local memory).
+ */
+ firstDataTup = (IndexTuple) PageGetItem(*rpage, firstDataId);
+
+ return _bt_mkscankey(rel, firstDataTup);
+
+ abort:
+ pfree(*rpage);
+ rpage = NULL;
+ return NULL;
+ }
+
+ /*
+ * Verify that each item on page is greater than or equal to the down-link key
+ * in the parent. Don't assume that the items in the child page are in logical
+ * order as generally expected -- exhaustively check each and every data item,
+ * and not just the first.
+ *
+ * XXX: Presently, the checking of every item here is usually somewhat
+ * redundant. Revisit the question of how necessary this is in light of how
+ * the code is actually called from a higher level.
+ *
+ * Returns ScanKey that is initialized with high key value, that can be checked
+ * within parent against next offset/down-link. However, if there was an
+ * incomplete page split (and therefore no down-link in parent can reasonably
+ * be expected yet) or this is the right-most page (and therefore does not have
+ * a high key), just return NULL.
+ */
+ static ScanKey
+ verify_btree_items_le(Relation rel, Page childPage, ScanKey downLinkParent,
+ IndexTuple pItup)
+ {
+ OffsetNumber offset, maxoffset;
+ BTPageOpaque opaque;
+ int16 relnatts = rel->rd_rel->relnatts;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(childPage);
+ maxoffset = PageGetMaxOffsetNumber(childPage);
+
+ for (offset = P_FIRSTDATAKEY(opaque); offset <= maxoffset; offset++)
+ {
+ int32 res;
+
+ /*
+ * No point in checking first data key on non-leaf page, since it's
+ * logically "minus infinity". All items on child page should be
+ * greater than or equal to supplied down-link ScanKey (so the first
+ * data item on a non-leaf page is only "minus infinity" in a way that
+ * relies on the very invariant now being verified -- it's not "minus
+ * infinity" in any general sense. It's "minus infinity" with respect
+ * to values less than the first "real" data item, but greater than or
+ * equal to the page's left-link's high key, iff there is a left link).
+ *
+ * Leaving aside discussion of invariant conditions, clearly there is
+ * no point in doing anything with the first data key on a non-leaf
+ * page because its data is undefined, as noted in comments above
+ * _bt_compare().
+ */
+ if (!P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque))
+ continue;
+
+ res = _bt_compare(rel, relnatts, downLinkParent, childPage, offset);
+
+ /*
+ * Parent down-link is lower bound. Report when this invariant
+ * condition has been violated.
+ */
+ if (res > 0)
+ {
+ char *dump;
+
+ dump = form_btree_data(pItup);
+
+ elog(CHCKELEVEL, "down-link low key invariant violated for index \"%s\" with data %s (points to: %u)",
+ RelationGetRelationName(rel), dump,
+ BlockIdGetBlockNumber(&(pItup->t_tid.ip_blkid)));
+ }
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ {
+ elog(NOTICE, "incomplete split in child page (block %u) found when following down-link",
+ BlockIdGetBlockNumber(&(pItup->t_tid.ip_blkid)));
+ }
+ else if (!P_RIGHTMOST(opaque))
+ {
+ /*
+ * Useful ScanKey for child page high key exists -- return it to caller
+ * for further checks on parent page
+ */
+ ItemId hKeyItemId;
+ IndexTuple hKeyItup;
+
+ hKeyItemId = PageGetItemId(childPage, P_HIKEY);
+
+ if (!ItemIdIsValid(hKeyItemId))
+ elog(Max(CHCKELEVEL, ERROR), "invalid hKeyItemId");
+
+ hKeyItup = (IndexTuple) PageGetItem(childPage, hKeyItemId);
+
+ return _bt_mkscankey(rel, hKeyItup);
+ }
+
+ return NULL;
+ }
+
+ /*
+ * Given a left-most block at some level, move right, verifying that left/right
+ * sibling links match as pages are passed over. It is the caller's
+ * responsibility to acquire appropriate heavyweight locks on the index
+ * relation.
+ *
+ * Moves from left to right, so there is no need to worry about concurrent page
+ * splits, as after a page split the original/left page has the same left-link
+ * as before. (And, for what it's worth, the page just under consideration has
+ * the same right link as before, so logically nothing is skipped. Everything
+ * in the index moves right, and so is denied the opportunity to be skipped
+ * over). Page deletion receives no special consideration either, because the
+ * second stage of page deletion (where a half-dead leaf page is unlinked from
+ * its siblings) involves acquiring a lock on the left-sibling, on the deletion
+ * target itself, and on the right sibling, all concurrently and in that order.
+ * That's the only one of the two stages of deletion where side-links are
+ * updated.
+ *
+ * Returns left-most block number one level lower that should be passed on next
+ * level/call, or P_NONE when done. Caller should call with the true root page
+ * initially, and work their way down to level 0 (leaf page level).
+ *
+ * The return value may become stale (from, say, concurrent page deletion).
+ * When passed back here, this is handled in a similar though less invasive
+ * manner to _bt_moveright(). It may prove necessary to "move right" one or
+ * more times if the left-most page passed here is deleted or half dead, by
+ * telling caller about a new left-most page on the same level.
+ */
+ static BlockNumber
+ verify_leftright_level(Relation rel, BlockNumber leftMost)
+ {
+ Buffer buffer;
+ BTPageOpaque opaque;
+ Page page;
+ BlockNumber current, currentsLeft, next;
+
+ buffer = ReadBuffer(rel, leftMost);
+ LockBuffer(buffer, BT_READ);
+
+ /* as always, copy the page into local storage */
+ page = palloc(BLCKSZ);
+ memcpy(page, BufferGetPage(buffer), BLCKSZ);
+
+ UnlockReleaseBuffer(buffer);
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_IGNORE(opaque))
+ {
+ /*
+ * Handle race -- tell caller to try again with prior left-most's right
+ * sibling, in a manner not unlike _bt_moveright()
+ */
+ if (P_RIGHTMOST(opaque))
+ elog(Max(CHCKELEVEL, ERROR), "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ next = opaque->btpo_next;
+ pfree(page);
+ elog(NOTICE, "left-most page was found deleted or half dead");
+ return next;
+ }
+
+ if (!P_LEFTMOST(opaque))
+ elog(CHCKELEVEL, "page %u of index \"%s\" is not leftmost",
+ leftMost, RelationGetRelationName(rel));
+
+ /* remember down-link block to return */
+ if (P_ISLEAF(opaque))
+ {
+ next = P_NONE;
+ }
+ else
+ {
+ IndexTuple itup;
+ ItemId itemId;
+
+ itemId = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+
+ if (!ItemIdIsValid(itemId))
+ elog(Max(CHCKELEVEL, ERROR), "invalid itemId");
+
+ itup = (IndexTuple) PageGetItem(page, itemId);
+
+ next = ItemPointerGetBlockNumber(&(itup->t_tid));
+ }
+
+ currentsLeft = leftMost;
+ current = opaque->btpo_next;
+
+ while (!P_RIGHTMOST(opaque))
+ {
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ReadBuffer(rel, current);
+ LockBuffer(buffer, BT_READ);
+
+ memcpy(page, BufferGetPage(buffer), BLCKSZ);
+
+ UnlockReleaseBuffer(buffer);
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISDELETED(opaque))
+ elog(NOTICE, "page %u of index \"%s\" is deleted",
+ current, RelationGetRelationName(rel));
+
+ if (currentsLeft != opaque->btpo_prev)
+ {
+ if (!P_ISDELETED(opaque))
+ elog(CHCKELEVEL, "left link/right link pair don't comport at level %u, block %u, last: %u, current left: %u",
+ opaque->btpo.level, current, currentsLeft, opaque->btpo_prev);
+ else /* level unavailable */
+ elog(CHCKELEVEL, "left link/right link pair don't comport, deleted block %u, last: %u, current left: %u",
+ current, currentsLeft, opaque->btpo_prev);
+ }
+
+ currentsLeft = current;
+ current = opaque->btpo_next;
+ }
+
+ pfree(page);
+
+ return next;
+ }
+
+ /*
+ * Given an IndexTuple, form hex cstring of data for error reporting purposes
+ */
+ static char *
+ form_btree_data(IndexTuple itup)
+ {
+ char *dump, *odump;
+ char *ptr;
+ int off;
+ int dlen;
+
+ dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
+ ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+ dump = palloc0(dlen * 3 + 1);
+ odump = dump;
+
+ for (off = 0; off < dlen; off++)
+ {
+ if (off > 0)
+ *dump++ = ' ';
+ sprintf(dump, "%02x", *(ptr + off) & 0xff);
+ dump += 2;
+ }
+
+ return odump;
+ }
diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control
new file mode 100644
index ...ab4b1a3
*** a/contrib/amcheck/amcheck.control
--- b/contrib/amcheck/amcheck.control
***************
*** 0 ****
--- 1,5 ----
+ # amcheck extension
+ comment = 'verify the integrity of indexes at a low level'
+ default_version = '1.0'
+ module_pathname = '$libdir/amcheck'
+ relocatable = true
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers