On Tue, Dec 27, 2016 at 1:36 PM, Mithun Cy <mithun...@enterprisedb.com> wrote: Oops, the patch number should be 08; re-attaching the same patch after renaming.
-- Thanks and Regards Mithun C Y EnterpriseDB: http://www.enterprisedb.com
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 46df589..c45b3f0 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -32,19 +32,13 @@ _hash_doinsert(Relation rel, IndexTuple itup) Buffer bucket_buf; Buffer metabuf; HashMetaPage metap; - BlockNumber blkno; - BlockNumber oldblkno; - bool retry; + HashMetaPage usedmetap; Page metapage; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; - Bucket bucket; - uint32 maxbucket; - uint32 highmask; - uint32 lowmask; /* * Get the hash key for the item (it's stored in the index tuple itself). @@ -57,10 +51,15 @@ _hash_doinsert(Relation rel, IndexTuple itup) * need to be consistent */ restart_insert: - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + + /* + * Load the metapage. No need to lock as of now because we only access + * page header element pd_pagesize_version in HashMaxItemSize(), this + * element is constant and will not move while accessing. But we hold the + * pin so we can use the metabuf while writing into to it below. + */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); metapage = BufferGetPage(metabuf); - metap = HashPageGetMeta(metapage); /* * Check whether the item can fit on a hash page at all. (Eventually, we @@ -76,59 +75,7 @@ restart_insert: itemsz, HashMaxItemSize(metapage)), errhint("Values larger than a buffer page cannot be indexed."))); - oldblkno = InvalidBlockNumber; - retry = false; - - /* - * Loop until we get a lock on the correct target bucket. - */ - for (;;) - { - /* - * Compute the target bucket number, and convert to block number. 
- */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); - - blkno = BUCKET_TO_BLKNO(metap, bucket); - - /* - * Copy bucket mapping info now; refer the comment in - * _hash_expandtable where we copy this information before calling - * _hash_splitbucket to see why this is okay. - */ - maxbucket = metap->hashm_maxbucket; - highmask = metap->hashm_highmask; - lowmask = metap->hashm_lowmask; - - /* Release metapage lock, but keep pin. */ - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - - /* - * If the previous iteration of this loop locked the primary page of - * what is still the correct target bucket, we are done. Otherwise, - * drop any old lock before acquiring the new one. - */ - if (retry) - { - if (oldblkno == blkno) - break; - _hash_relbuf(rel, buf); - } - - /* Fetch and lock the primary bucket page for the target bucket */ - buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); - - /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. - */ - LockBuffer(metabuf, BUFFER_LOCK_SHARE); - oldblkno = blkno; - retry = true; - } + buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_WRITE, &usedmetap); /* remember the primary bucket buffer to release the pin on it at end. */ bucket_buf = buf; @@ -152,7 +99,9 @@ restart_insert: LockBuffer(buf, BUFFER_LOCK_UNLOCK); _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket, - maxbucket, highmask, lowmask); + usedmetap->hashm_maxbucket, + usedmetap->hashm_highmask, + usedmetap->hashm_lowmask); /* release the pin on old and meta buffer. retry for insert. 
*/ _hash_dropbuf(rel, buf); @@ -225,6 +174,7 @@ restart_insert: */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + metap = HashPageGetMeta(metapage); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 45e184c..c726909 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_prevblkno = InvalidBlockNumber; + + /* + * Setting hasho_prevblkno of bucket page with latest maxbucket number + * to indicate bucket has been initialized and need to reconstruct + * HashMetaCache if it is older. + */ + pageopaque->hasho_prevblkno = metap->hashm_maxbucket; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; @@ -845,6 +851,12 @@ _hash_splitbucket(Relation rel, */ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; + /* + * Setting hasho_prevblkno of bucket page with latest maxbucket number to + * indicate bucket has been split and need to reconstruct HashMetaCache. + * Below same is done for new bucket page. + */ + oopaque->hasho_prevblkno = maxbucket; npage = BufferGetPage(nbuf); /* @@ -852,7 +864,7 @@ _hash_splitbucket(Relation rel, * split is in progress. 
*/ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); - nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_prevblkno = maxbucket; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; @@ -1191,3 +1203,124 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, LockBuffer(obuf, BUFFER_LOCK_UNLOCK); hash_destroy(tidhtab); } + + +/* + * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given + * hashkey. + * + * Bucket Pages do not move or get removed once they are allocated. This give + * us an opportunity to use the previously saved metapage contents to reach + * the target bucket buffer, instead of every time reading from the metapage + * buffer. This saves one buffer access everytime we want to reach the target + * bucket buffer, which is very helpful savings in bufmgr traffic and + * contention. + * + * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the + * bucket buffer has to be locked for reading or writing. + * + * The out parameter cachedmetap is set with metapage contents used for + * hashkey to bucket buffer mapping. Some callers need this info to reach the + * old bucket in case of bucket split, see @_hash_doinsert(). + */ +Buffer +_hash_getbucketbuf_from_hashkey(uint32 hashkey, Relation rel, int access, + HashMetaPage *cachedmetap) +{ + HashMetaPage metap; + Buffer buf; + Buffer metabuf = InvalidBuffer; + Page page; + Bucket bucket; + BlockNumber blkno; + HashPageOpaque opaque; + + if (rel->rd_amcache != NULL) + { + metap = (HashMetaPage) rel->rd_amcache; + } + else + { + /* Read the metapage */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + /* Cache the metapage data for next time. 
*/ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(HashMetaPageData)); + memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData)); + metap = (HashMetaPage) rel->rd_amcache; + + /* Release metapage lock, but keep pin. */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + } + + /* + * Loop until we get a lock on the correct target bucket. + */ + for (;;) + { + /* + * Compute the target bucket number, and convert to block number. + */ + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); + + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + + /* + * Check if this bucket is split after we have cached the hash meta + * data. To do this we need to check whether cached maxbucket number + * is less than or equal to maxbucket number stored in bucket page, + * which was set with that times maxbucket number during bucket page + * splits. In case of upgrade hashno_prevblkno of old bucket page will + * be set with InvalidBlockNumber. And as of now maximum value the + * hashm_maxbucket can take is 1 less than InvalidBlockNumber (see + * _hash_expandtable). So an explicit check for InvalidBlockNumber in + * hasho_prevblkno will tell whether current bucket has been split + * after caching hash meta data. + */ + if (opaque->hasho_prevblkno == InvalidBlockNumber || + opaque->hasho_prevblkno <= metap->hashm_maxbucket) + { + /* Ok now we have the right bucket proceed to search in it. */ + break; + } + + /* First drop any locks held on bucket buffers. */ + _hash_relbuf(rel, buf); + + /* Cached meta data is old try again updating it. 
*/ + if (BufferIsInvalid(metabuf)) + metabuf = + _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + else + LockBuffer(metabuf, BUFFER_LOCK_SHARE); + + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* Cache the metapage data for next time. */ + memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData)); + metap = (HashMetaPage) rel->rd_amcache; + + /* Release metapage lock, but keep pin. */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + } + + /* done with the metapage */ + if (!BufferIsInvalid(metabuf)) + _hash_dropbuf(rel, metabuf); + + if (cachedmetap) + *cachedmetap = metap; + return buf; +} diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 913b87c..cdc54e1 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -152,6 +152,11 @@ _hash_readprev(IndexScanDesc scan, _hash_relbuf(rel, *bufp); *bufp = InvalidBuffer; + + /* If it is a bucket page there will not be a prevblkno. */ + if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE) + return; + /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); if (BlockNumberIsValid(blkno)) @@ -216,13 +221,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) uint32 hashkey; Bucket bucket; BlockNumber blkno; - BlockNumber oldblkno = InvalidBuffer; - bool retry = false; Buffer buf; - Buffer metabuf; Page page; HashPageOpaque opaque; - HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; @@ -277,56 +278,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) so->hashso_sk_hash = hashkey; - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - page = BufferGetPage(metabuf); - metap = HashPageGetMeta(page); - - /* - * Loop until we get a lock on the correct target bucket. - */ - for (;;) - { - /* - * Compute the target bucket number, and convert to block number. 
- */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); - - blkno = BUCKET_TO_BLKNO(metap, bucket); - - /* Release metapage lock, but keep pin. */ - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - - /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. - */ - if (retry) - { - if (oldblkno == blkno) - break; - _hash_relbuf(rel, buf); - } - - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); - - /* - * Reacquire metapage lock and check that no bucket split has taken - * place while we were awaiting the bucket lock. - */ - LockBuffer(metabuf, BUFFER_LOCK_SHARE); - oldblkno = blkno; - retry = true; - } - - /* done with the metapage */ - _hash_dropbuf(rel, metabuf); - + buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_READ, NULL); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index bc08f81..fe98157 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -60,6 +60,14 @@ typedef uint32 Bucket; typedef struct HashPageOpaqueData { + /* + * If this is an ovfl page this stores previous ovfl (or bucket) blkno. + * Else if this is a bucket page we use this for a special purpose. We + * store hashm_maxbucket value, whenever this page is initialized or + * split. So this helps us to know whether the bucket has been split after + * caching the some of the meta page data. See _hash_doinsert(), + * _hash_first() to know how to use same. 
+ */ BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */ BlockNumber hasho_nextblkno; /* next ovfl blkno */ Bucket hasho_bucket; /* bucket number this pg belongs to */ @@ -327,6 +335,10 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags); extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags); +extern Buffer +_hash_getbucketbuf_from_hashkey(uint32 hashkey, Relation rel, + int access, + HashMetaPage *cachedmetap); extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers