Antonin Houska <a...@cybertec.at> wrote:
> I'm still testing it, especially the concurrent access. There are probably
> bugs in the code, but it can help understand what I mean.

I've traced the most important cases of concurrent insertion into a bucket
whose split is in progress, and fixed a few related bugs. A tool to check the
index structure is still needed before performance testing makes sense; that
might include an enhancement of contrib/pageinspect.
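
To make the idea easier to follow, below is a minimal standalone sketch (plain
C, not code from the patch - the real implementation works with buffers, locks
and block numbers) of the chain layout the patch sets up during a split: the
last page of the "new" bucket carries LH_BUCKET_SPLIT_LAST and its next-link
points at the first not-yet-processed page of the "old" bucket, so scans of
the new bucket fall through to the old one, while insertions stop before that
redirection. The ModelPage struct and the two helper functions are
illustrative names only.

#include <stdio.h>

/* Flag values as introduced in hash.h by the patch. */
#define LH_OVERFLOW_PAGE      (1 << 0)
#define LH_BUCKET_PAGE        (1 << 1)
#define LH_BUCKET_SPLIT       (1 << 2)
#define LH_BUCKET_SPLIT_LAST  (1 << 3)

/* A drastically simplified "page": just flags and a next-link. */
typedef struct ModelPage
{
	unsigned int flags;
	struct ModelPage *next;		/* models hasho_nextblkno */
} ModelPage;

/* Scans follow the chain, including the redirection into the old bucket. */
static void
scan_chain(const ModelPage *p)
{
	for (; p != NULL; p = p->next)
		printf("scan visits page with flags 0x%x\n", p->flags);
}

/* Insertions stop at the SPLIT_LAST page instead of following its link. */
static const ModelPage *
find_insert_target(const ModelPage *p)
{
	while (p->next != NULL && !(p->flags & LH_BUCKET_SPLIT_LAST))
		p = p->next;
	return p;
}

int
main(void)
{
	ModelPage	old_primary = {LH_BUCKET_PAGE | LH_BUCKET_SPLIT, NULL};
	ModelPage	new_primary = {LH_BUCKET_PAGE | LH_BUCKET_SPLIT |
							   LH_BUCKET_SPLIT_LAST, &old_primary};

	scan_chain(&new_primary);	/* visits the new page, then the old one */
	printf("insertion would target the page with flags 0x%x\n",
		   find_insert_target(&new_primary)->flags);
	return 0;
}

Once _hash_splitbucket finishes moving tuples, the SPLIT_LAST link is
terminated and both buckets' flags are cleared, which is what the final
hashpage.c hunks below do.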

-- 
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 24b06a5..149bbcf
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
*************** loop_top:
*** 572,577 ****
--- 572,599 ----
  			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
  			Assert(opaque->hasho_bucket == cur_bucket);
  
+ 			/*
+ 			 * If the bucket participates in a split, give up.
+ 			 *
+ 			 * (Unlike the metapage copy, the flags at bucket level should
+ 			 * always be up-to-date.)
+ 			 *
+ 			 * TODO
+ 			 *
+ 			 * 1. Analyze whether skipping both buckets participating in the
+ 			 * split imposes too severe a restriction, and whether it makes
+ 			 * sense to introduce separate flags for the "old" and "new"
+ 			 * buckets. Also, would such a restricted VACUUM still make sense?
+ 			 *
+ 			 * 2. Consider how statistics should reflect the fact that some
+ 			 * buckets are skipped because of a split.
+ 			 */
+ 			if (opaque->hasho_flag & LH_BUCKET_SPLIT)
+ 			{
+ 				_hash_relbuf(rel, buf);
+ 				break;
+ 			}
+ 
  			/* Scan each tuple in page */
  			maxoffno = PageGetMaxOffsetNumber(page);
  			for (offno = FirstOffsetNumber;
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
new file mode 100644
index 63aaec9..de445c9
*** a/src/backend/access/hash/hashinsert.c
--- b/src/backend/access/hash/hashinsert.c
*************** _hash_doinsert(Relation rel, IndexTuple
*** 37,42 ****
--- 37,43 ----
  	Page		page;
  	HashPageOpaque pageopaque;
  	Size		itemsz;
+ 	uint16      buckets_total;
  	bool		do_expand;
  	uint32		hashkey;
  	Bucket		bucket;
*************** _hash_doinsert(Relation rel, IndexTuple
*** 123,129 ****
  		 */
  		BlockNumber nextblkno = pageopaque->hasho_nextblkno;
  
! 		if (BlockNumberIsValid(nextblkno))
  		{
  			/*
  			 * ovfl page exists; go get it.  if it doesn't have room, we'll
--- 124,131 ----
  		 */
  		BlockNumber nextblkno = pageopaque->hasho_nextblkno;
  
! 		if (BlockNumberIsValid(nextblkno) &&
! 			!(pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST))
  		{
  			/*
  			 * ovfl page exists; go get it.  if it doesn't have room, we'll
*************** _hash_doinsert(Relation rel, IndexTuple
*** 136,142 ****
  		else
  		{
  			/*
! 			 * we're at the end of the bucket chain and we haven't found a
  			 * page with enough room.  allocate a new overflow page.
  			 */
  
--- 138,145 ----
  		else
  		{
  			/*
! 			 * we're at the end of the bucket chain, or (during a split) right
! 			 * before redirection to the "old bucket", and we haven't found a
  			 * page with enough room.  allocate a new overflow page.
  			 */
  
*************** _hash_doinsert(Relation rel, IndexTuple
*** 151,157 ****
  			Assert(PageGetFreeSpace(page) >= itemsz);
  		}
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
! 		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
  		Assert(pageopaque->hasho_bucket == bucket);
  	}
  
--- 154,160 ----
  			Assert(PageGetFreeSpace(page) >= itemsz);
  		}
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
! 		Assert(pageopaque->hasho_flag & LH_OVERFLOW_PAGE);
  		Assert(pageopaque->hasho_bucket == bucket);
  	}
  
*************** _hash_doinsert(Relation rel, IndexTuple
*** 173,180 ****
  	metap->hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	do_expand = metap->hashm_ntuples >
! 		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
--- 176,184 ----
  	metap->hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count;
! 	do_expand = metap->hashm_split_count < HASH_MAX_SPLITS &&
! 		metap->hashm_ntuples > (double) metap->hashm_ffactor * buckets_total;
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index b775164..4345f29
*** a/src/backend/access/hash/hashovfl.c
--- b/src/backend/access/hash/hashovfl.c
***************
*** 21,27 ****
  #include "utils/rel.h"
  
  
- static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
  static uint32 _hash_firstfreebit(uint32 map);
  
  
--- 21,26 ----
*************** _hash_addovflpage(Relation rel, Buffer m
*** 127,133 ****
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
  		nextblkno = pageopaque->hasho_nextblkno;
  
! 		if (!BlockNumberIsValid(nextblkno))
  			break;
  
  		/* we assume we do not need to write the unmodified page */
--- 126,133 ----
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
  		nextblkno = pageopaque->hasho_nextblkno;
  
! 		if (!BlockNumberIsValid(nextblkno) ||
! 			(pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST))
  			break;
  
  		/* we assume we do not need to write the unmodified page */
*************** _hash_addovflpage(Relation rel, Buffer m
*** 140,150 ****
  	ovflpage = BufferGetPage(ovflbuf);
  	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
  	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
- 	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
  	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
  	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
  	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
  
  	MarkBufferDirty(ovflbuf);
  
  	/* logically chain overflow page to previous page */
--- 140,163 ----
  	ovflpage = BufferGetPage(ovflbuf);
  	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
  	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
  	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
  	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
  	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
  
+ 	if (pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)
+ 	{
+ 		/*
+ 		 * The bucket is being created, so it may point to part of the bucket
+ 		 * being split (but the link may already be terminated - we don't
+ 		 * care, just copy it.)
+ 		 */
+ 		ovflopaque->hasho_nextblkno = pageopaque->hasho_nextblkno;
+ 		ovflopaque->hasho_flag |= LH_BUCKET_SPLIT_LAST;
+ 		pageopaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST;
+ 	}
+ 	else
+ 		ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ 
  	MarkBufferDirty(ovflbuf);
  
  	/* logically chain overflow page to previous page */
*************** _hash_addovflpage(Relation rel, Buffer m
*** 164,170 ****
   * The caller must hold a pin, but no lock, on the metapage buffer.
   * That buffer is left in the same state at exit.
   */
! static Buffer
  _hash_getovflpage(Relation rel, Buffer metabuf)
  {
  	HashMetaPage metap;
--- 177,183 ----
   * The caller must hold a pin, but no lock, on the metapage buffer.
   * That buffer is left in the same state at exit.
   */
! Buffer
  _hash_getovflpage(Relation rel, Buffer metabuf)
  {
  	HashMetaPage metap;
*************** found:
*** 334,339 ****
--- 347,385 ----
  }
  
  /*
+  * Return a buffer containing the last page of a bucket being created (during
+  * a split). The search starts at the page contained in the 'start' buffer,
+  * which is expected to be pinned and write-locked (the caller can't unlock
+  * it). The result is also pinned and write-locked.
+  */
+ Buffer
+ _hash_get_new_bucket_end(Relation rel, Buffer start)
+ {
+ 	Buffer buf;
+ 	Page		page;
+ 	HashPageOpaque pageopaque;
+ 	BlockNumber blkno = InvalidBlockNumber;
+ 
+ 	buf = start;
+ 	for (;;)
+ 	{
+ 		if (BlockNumberIsValid(blkno))
+ 			buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE);
+ 		page = BufferGetPage(buf);
+ 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ 		if (pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)
+ 			return buf;
+ 
+ 		blkno = pageopaque->hasho_nextblkno;
+ 		/* If reached end of the old bucket, something is pretty bad. */
+ 		Assert(BlockNumberIsValid(blkno));
+ 
+ 		if (buf != start)
+ 			_hash_relbuf(rel, buf);
+ 	}
+ }
+ 
+ /*
   *	_hash_firstfreebit()
   *
   *	Return the number of the first bit that is not set in the word 'map'.
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
new file mode 100644
index 46c6c96..c169dcd
*** a/src/backend/access/hash/hashpage.c
--- b/src/backend/access/hash/hashpage.c
*************** _hash_metapinit(Relation rel, double num
*** 424,432 ****
--- 424,434 ----
  	 */
  	metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
  	metap->hashm_highmask = (num_buckets << 1) - 1;
+ 	metap->hashm_split_count = 0;
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+ 	MemSet(metap->hashm_splits, 0, sizeof(metap->hashm_splits));
  
  	/* Set up mapping for one spare page after the initial splitpoints */
  	metap->hashm_spares[log2_num_buckets] = 1;
*************** void
*** 498,511 ****
--- 500,521 ----
  _hash_expandtable(Relation rel, Buffer metabuf)
  {
  	HashMetaPage metap;
+ 	uint16      buckets_total;
  	Bucket		old_bucket;
  	Bucket		new_bucket;
+ 	HashSplit   new_split;
  	uint32		spare_ndx;
  	BlockNumber start_oblkno;
  	BlockNumber start_nblkno;
  	uint32		maxbucket;
  	uint32		highmask;
  	uint32		lowmask;
+ 	Buffer		obuf;
+ 	Page		opage, npage;
+ 	HashPageOpaque oopaque, nopaque;
+ 	uint16      i;
+ 	size_t      tail;
+ 	Buffer      nbuf;
  
  	/*
  	 * Write-lock the meta page.  It used to be necessary to acquire a
*************** _hash_expandtable(Relation rel, Buffer m
*** 517,529 ****
  	metap = HashPageGetMeta(BufferGetPage(metabuf));
  
  	/*
! 	 * Check to see if split is still needed; someone else might have already
! 	 * done one while we waited for the lock.
  	 *
  	 * Make sure this stays in sync with _hash_doinsert()
  	 */
! 	if (metap->hashm_ntuples <=
! 		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
  		goto fail;
  
  	/*
--- 527,542 ----
  	metap = HashPageGetMeta(BufferGetPage(metabuf));
  
  	/*
! 	 * Check to see if we can handle one more split and if it's still needed;
! 	 * someone else might have already done one or more splits while we waited
! 	 * for the lock. (At this point we assume that the splits in progress will
! 	 * all succeed.)
  	 *
  	 * Make sure this stays in sync with _hash_doinsert()
  	 */
! 	buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count;
! 	if (metap->hashm_split_count >= HASH_MAX_SPLITS ||
! 		metap->hashm_ntuples <= (double) metap->hashm_ffactor * buckets_total)
  		goto fail;
  
  	/*
*************** _hash_expandtable(Relation rel, Buffer m
*** 545,569 ****
  		goto fail;
  
  	/*
! 	 * Determine which bucket is to be split, and attempt to lock the old
! 	 * bucket.  If we can't get the lock, give up.
! 	 *
! 	 * The lock protects us against other backends, but not against our own
! 	 * backend.  Must check for active scans separately.
  	 */
  	new_bucket = metap->hashm_maxbucket + 1;
  
  	old_bucket = (new_bucket & metap->hashm_lowmask);
  
! 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
  
  	if (_hash_has_active_scan(rel, old_bucket))
  		goto fail;
  
  	if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
  		goto fail;
  
  	/*
  	 * Likewise lock the new bucket (should never fail).
  	 *
  	 * Note: it is safe to compute the new bucket's blkno here, even though we
--- 558,619 ----
  		goto fail;
  
  	/*
! 	 * Determine which bucket is to be split.
  	 */
  	new_bucket = metap->hashm_maxbucket + 1;
  
  	old_bucket = (new_bucket & metap->hashm_lowmask);
  
! 	/*
! 	 * If the masks changed too recently, another new_bucket might map to the
! 	 * same old_bucket.
! 	 */
! 	for (i = 0; i < metap->hashm_split_count; i++)
! 	{
! 		if (metap->hashm_splits[i].old_bucket == old_bucket)
! 			goto fail;
! 	}
  
+ 	/*
+ 	 * Lock the old bucket.  If we can't get the lock, give up (else we could
+ 	 * cause deadlock with backends looking for the bucket).
+ 	 *
+ 	 * The lock protects us against other backends, but not against our own
+ 	 * backend.  Must check for active scans separately.
+ 	 */
  	if (_hash_has_active_scan(rel, old_bucket))
  		goto fail;
  
+ 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
  	if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
  		goto fail;
  
  	/*
+ 	 * Make sure that backends in need of old_bucket during the split are
+ 	 * aware of the restriction that split-in-progress imposes.
+ 	 */
+ 	obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE);
+ 	opage = BufferGetPage(obuf);
+ 	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ 	/*
+ 	 * If hashm_splits indicates no split for this bucket, the bucket itself
+ 	 * shouldn't be marked either: when the split ends, these flags are unset
+ 	 * first, then the item is removed from hashm_splits.
+ 	 */
+ 	Assert(!(oopaque->hasho_flag & LH_BUCKET_SPLIT));
+ 	oopaque->hasho_flag |= LH_BUCKET_SPLIT;
+ 
+ 	/*
+ 	 * Do not drop the pin; we'll definitely need the buffer again soon.
+ 	 */
+ 	_hash_chgbufaccess(rel, obuf, HASH_WRITE, HASH_NOLOCK);
+ 	/*
+ 	 * It's okay to let other backends use the bucket now. LH_BUCKET_SPLIT
+ 	 * ensures that they are aware of the relevant restrictions.
+ 	 */
+ 	_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+ 
+ 	/*
  	 * Likewise lock the new bucket (should never fail).
  	 *
  	 * Note: it is safe to compute the new bucket's blkno here, even though we
*************** _hash_expandtable(Relation rel, Buffer m
*** 596,610 ****
  		 */
  		if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
  		{
! 			/* can't split due to BlockNumber overflow */
! 			_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
  			_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
  			goto fail;
  		}
  	}
  
  	/*
! 	 * Okay to proceed with split.  Update the metapage bucket mapping info.
  	 *
  	 * Since we are scribbling on the metapage data right in the shared
  	 * buffer, any failure in this next little bit leaves us with a big
--- 646,664 ----
  		 */
  		if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
  		{
! 			/*
! 			 * can't split due to BlockNumber overflow
! 			 *
! 			 * The meta page is not updated yet, so no other backend should
! 			 * be aware of the new bucket.
! 			 */
  			_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
  			goto fail;
  		}
  	}
  
  	/*
! 	 * Okay to initiate the split.  Update the metapage bucket mapping info.
  	 *
  	 * Since we are scribbling on the metapage data right in the shared
  	 * buffer, any failure in this next little bit leaves us with a big
*************** _hash_expandtable(Relation rel, Buffer m
*** 613,619 ****
  	 * establish a critical section.
  	 */
  	START_CRIT_SECTION();
- 
  	metap->hashm_maxbucket = new_bucket;
  
  	if (new_bucket > metap->hashm_highmask)
--- 667,672 ----
*************** _hash_expandtable(Relation rel, Buffer m
*** 635,640 ****
--- 688,700 ----
  		metap->hashm_ovflpoint = spare_ndx;
  	}
  
+ 	/*
+ 	 * Publish information about the split in progress.
+ 	 */
+ 	new_split = &metap->hashm_splits[i];
+ 	new_split->new_bucket = new_bucket;
+ 	new_split->old_bucket = old_bucket;
+ 	metap->hashm_split_count += 1;
  	/* Done mucking with metapage */
  	END_CRIT_SECTION();
  
*************** _hash_expandtable(Relation rel, Buffer m
*** 649,665 ****
  	highmask = metap->hashm_highmask;
  	lowmask = metap->hashm_lowmask;
  
! 	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
  
  	/* Relocate records to the new bucket */
  	_hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
! 					  start_oblkno, start_nblkno,
! 					  maxbucket, highmask, lowmask);
  
! 	/* Release bucket locks, allowing others to access them */
! 	_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
! 	_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
  
  	return;
  
--- 709,832 ----
  	highmask = metap->hashm_highmask;
  	lowmask = metap->hashm_lowmask;
  
! 	/*
! 	 * Initialize the new bucket's primary page while holding the meta page
! 	 * lock (_hash_getnewbuf should not be called concurrently).
! 	 *
! 	 * XXX This is rather a misuse of the metabuf content lock. Can we
! 	 * synchronize this call in any better way?
! 	 */
! 	nbuf = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
! 
! 	/*
! 	 * No need to hold the meta page lock while initializing page(s) of the
! 	 * new bucket.
! 	 *
! 	 * Note: As soon as the meta page lock is released, any concurrent backend
! 	 * can see the new bucket, but can't lock it. We hold the exclusive lock
! 	 * on the new bucket until it's ready to accept both reads and writes.
! 	 *
! 	 * TODO (WAL)
! 	 *
! 	 * Before WAL-replaying the above meta page changes, exclusive lock of the
! 	 * new bucket must be acquired (probably before even locking the metapage,
! 	 * to avoid deadlock with backends looking for the bucket; note that
! 	 * conditional lock acquisition does not help here, we really need the
! 	 * lock). The problem is that if a crash happens exactly after the meta
! 	 * page changes have been WAL-logged, recovery must not let other backends
! 	 * know about (and possibly lock) the new bucket until the replaying
! 	 * backend (startup process) has it locked exclusively (to complete its
! 	 * initial setup).
! 	 */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
  
+ 	/*
+ 	 * Neither initialization nor writing of the new page requires the metabuf
+ 	 * lock.
+ 	 *
+ 	 * The last overflow page in the chain must point to the first page of the
+ 	 * old bucket. During the split, this ensures that scans of the new bucket
+ 	 * will always succeed, even though the new bucket has not yet received
+ 	 * all the matching tuples from the old bucket.
+ 	 */
+ 	npage = BufferGetPage(nbuf);
+ 	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ 	nopaque->hasho_prevblkno = InvalidBlockNumber;
+ 	nopaque->hasho_nextblkno = start_oblkno;
+ 	nopaque->hasho_bucket = new_bucket;
+ 
+ 	/*
+ 	 * LH_BUCKET_SPLIT_LAST - initially the primary page is also the last page
+ 	 * of the bucket. This flag will be cleared as soon as the first overflow
+ 	 * page is added.
+ 	 */
+ 	nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_SPLIT |
+ 		LH_BUCKET_SPLIT_LAST;
+ 	nopaque->hasho_page_id = HASHO_PAGE_ID;
+ 	_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
+ 
+ 	/* Release lock of the new bucket, allowing others to use it. */
+ 	_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+ 
  	/* Relocate records to the new bucket */
  	_hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
! 								   start_oblkno, start_nblkno, maxbucket,
! 								   highmask, lowmask);
! 
! 	/*
! 	 * Mark both buckets as no longer participating in the split.
! 	 */
! 	_hash_chgbufaccess(rel, nbuf, HASH_NOLOCK, HASH_WRITE);
! 	nopaque->hasho_flag &= ~LH_BUCKET_SPLIT;
! 	_hash_wrtbuf(rel, nbuf);
  
! 
! 	_hash_chgbufaccess(rel, obuf, HASH_NOLOCK, HASH_WRITE);
! 	oopaque->hasho_flag &= ~LH_BUCKET_SPLIT;
! 	_hash_wrtbuf(rel, obuf);
! 
! 	/*
! 	 * Remove the "split record" from the metapage.
! 	 *
! 	 * (Use critical section for the same reasons as explained above.)
! 	 */
! 	START_CRIT_SECTION();
! 	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
! 	_hash_checkpage(rel, metabuf, LH_META_PAGE);
! 	metap = HashPageGetMeta(BufferGetPage(metabuf));
! 	Assert(metap->hashm_split_count > 0);
! 	for (i = 0; i < metap->hashm_split_count; i++)
! 	{
! 		if (metap->hashm_splits[i].old_bucket == old_bucket)
! 			break;
! 	}
! 	if (i == metap->hashm_split_count)
! 		/*
! 		 * This should not happen unless the meta page is corrupt.
! 		 */
! 		elog(ERROR, "could not find info on split of bucket %u", old_bucket);
! 
! 	/* Number of items following the one we're going to delete. */
! 	tail = metap->hashm_split_count - (i + 1);
! 	if (tail > 0)
! 	{
! 		HashSplit current, next;
! 
! 		/*
! 		 * The item being removed is not the last one. Shift the following
! 		 * items one position lower.
! 		 */
! 		current = metap->hashm_splits + i;
! 		next = current + 1;
! 		memmove(current, next, tail * sizeof(HashSplitData));
! 	}
! 	metap->hashm_split_count -= 1;
! 	END_CRIT_SECTION();
! 
! 	/*
! 	 * Write the meta page.
! 	 */
! 	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
  
  	return;
  
*************** _hash_alloc_buckets(Relation rel, BlockN
*** 718,723 ****
--- 885,949 ----
  	return true;
  }
  
+ /*
+  * Set the redirection from the end of the "new" bucket (i.e. the one being
+  * created) to the appropriate page of the "old" bucket (i.e. the one being
+  * split). The search for the last page of the "new" bucket starts at the
+  * page contained in 'buf'.
+  *
+  * 'buf' must be pinned and write-locked. 'target' points to the page of the
+  * "old" bucket. 'to_unlock' is a list of buffers the caller must unlock later.
+  */
+ static void
+ _hash_set_new_bucket_end_link(Relation rel, Buffer buf, BlockNumber target,
+ 							  List **to_unlock)
+ {
+ 	Page page;
+ 	HashPageOpaque opaque;
+ 
+ 	page = BufferGetPage(buf);
+ 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ 	if (!(opaque->hasho_flag & LH_BUCKET_SPLIT_LAST))
+ 	{
+ 		buf = _hash_get_new_bucket_end(rel, buf);
+ 		page = BufferGetPage(buf);
+ 		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ 		/* It will need to be unlocked too. */
+ 		*to_unlock = lappend_int(*to_unlock, buf);
+ 	}
+ 
+ 	Assert(opaque->hasho_flag & LH_BUCKET_SPLIT_LAST);
+ 	opaque->hasho_nextblkno = target;
+ 	/* The special case of terminating the new bucket. */
+ 	if (!BlockNumberIsValid(opaque->hasho_nextblkno))
+ 		opaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST;
+ }
+ 
+ 
+ /*
+  * Unlock buffers in the order we locked them. 'keep_pin', if valid, is a
+  * buffer that should only be written, but its pin should not be dropped.
+  */
+ static void
+ _hash_write_buffers(Relation rel, List **bufs, Buffer keep_pin)
+ {
+ 	ListCell *nc;
+ 
+ 	foreach(nc, *bufs)
+ 	{
+ 		Buffer buf;
+ 
+ 		buf = (Buffer) lfirst_int(nc);
+ 		if (!BufferIsInvalid(keep_pin) && buf == keep_pin)
+ 			_hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
+ 		else
+ 			_hash_wrtbuf(rel, buf);
+ 
+ 	}
+ 	list_free(*bufs);
+ 	*bufs = NIL;
+ }
+ 
  
  /*
   * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*************** _hash_alloc_buckets(Relation rel, BlockN
*** 727,734 ****
   * belong in the new bucket, and compress out any free space in the old
   * bucket.
   *
!  * The caller must hold exclusive locks on both buckets to ensure that
!  * no one else is trying to access them (see README).
   *
   * The caller must hold a pin, but no lock, on the metapage buffer.
   * The buffer is returned in the same state.  (The metapage is only
--- 953,961 ----
   * belong in the new bucket, and compress out any free space in the old
   * bucket.
   *
!  * Neither bucket needs to be locked. Any concurrent backend is expected to
!  * check the flags of the buckets' primary pages and follow the related
!  * restrictions.
   *
   * The caller must hold a pin, but no lock, on the metapage buffer.
   * The buffer is returned in the same state.  (The metapage is only
*************** _hash_splitbucket(Relation rel,
*** 748,796 ****
  	BlockNumber oblkno;
  	BlockNumber nblkno;
  	Buffer		obuf;
- 	Buffer		nbuf;
  	Page		opage;
- 	Page		npage;
  	HashPageOpaque oopaque;
! 	HashPageOpaque nopaque;
  
- 	/*
- 	 * It should be okay to simultaneously write-lock pages from each bucket,
- 	 * since no one else can be trying to acquire buffer lock on pages of
- 	 * either bucket.
- 	 */
  	oblkno = start_oblkno;
- 	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
- 	opage = BufferGetPage(obuf);
- 	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
- 
  	nblkno = start_nblkno;
! 	nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
! 	npage = BufferGetPage(nbuf);
! 
! 	/* initialize the new bucket's primary page */
! 	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
! 	nopaque->hasho_prevblkno = InvalidBlockNumber;
! 	nopaque->hasho_nextblkno = InvalidBlockNumber;
! 	nopaque->hasho_bucket = nbucket;
! 	nopaque->hasho_flag = LH_BUCKET_PAGE;
! 	nopaque->hasho_page_id = HASHO_PAGE_ID;
  
  	/*
! 	 * Partition the tuples in the old bucket between the old bucket and the
! 	 * new bucket, advancing along the old bucket's overflow bucket chain and
! 	 * adding overflow pages to the new bucket as needed.  Outer loop iterates
! 	 * once per page in old bucket.
  	 */
  	for (;;)
  	{
  		OffsetNumber ooffnum;
  		OffsetNumber omaxoffnum;
  		OffsetNumber deletable[MaxOffsetNumber];
  		int			ndeletable = 0;
  
  		/* Scan each tuple in old page */
  		omaxoffnum = PageGetMaxOffsetNumber(opage);
  		for (ooffnum = FirstOffsetNumber;
  			 ooffnum <= omaxoffnum;
  			 ooffnum = OffsetNumberNext(ooffnum))
--- 975,1032 ----
  	BlockNumber oblkno;
  	BlockNumber nblkno;
  	Buffer		obuf;
  	Page		opage;
  	HashPageOpaque oopaque;
! 	Page		npage = NULL;
! 	HashPageOpaque nopaque = NULL;
! 	char *tuples_new;
! 	Buffer nbuf = InvalidBuffer;
! 	int nbuf_inserts = 0;
  
  	oblkno = start_oblkno;
  	nblkno = start_nblkno;
! 	/* BLCKSZ is a safe size for any number of matching tuples. */
! 	tuples_new = (char *) palloc(BLCKSZ);
  
  	/*
! 	 * Copy matching tuples from the old to the new bucket, advancing along
! 	 * the old bucket's overflow bucket chain and adding overflow pages to the
! 	 * new bucket as needed. Backends scanning the old bucket during the split
! 	 * are aware that deletion of the copied tuples is not immediate, so they
! 	 * re-check each key before returning it.
! 	 *
! 	 * Outer loop iterates once per page in old bucket.
  	 */
  	for (;;)
  	{
  		OffsetNumber ooffnum;
  		OffsetNumber omaxoffnum;
+ 		BlockNumber oblkno_next;
  		OffsetNumber deletable[MaxOffsetNumber];
  		int			ndeletable = 0;
+ 		char *tuple_copy;
+ 		IndexTuple itup;
+ 		Size itemsz;
+ 		int item_count, i;
+ 		Buffer keep_pin = InvalidBuffer;
+ 		List *new_write = NIL;
+ 
+ 		if (!BlockNumberIsValid(oblkno))
+ 			break;
+ 
+ 		/*
+ 		 * 1. Copy the interesting tuples from the source page aside and
+ 		 * release its content lock.
+ 		 */
+ 		obuf = _hash_getbuf(rel, oblkno, HASH_READ,
+ 							LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+ 		opage = BufferGetPage(obuf);
+ 		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
  
  		/* Scan each tuple in old page */
  		omaxoffnum = PageGetMaxOffsetNumber(opage);
+ 		tuple_copy = tuples_new;
+ 		item_count = 0;
  		for (ooffnum = FirstOffsetNumber;
  			 ooffnum <= omaxoffnum;
  			 ooffnum = OffsetNumberNext(ooffnum))
*************** _hash_splitbucket(Relation rel,
*** 810,862 ****
  
  			if (bucket == nbucket)
  			{
- 				/*
- 				 * insert the tuple into the new bucket.  if it doesn't fit on
- 				 * the current page in the new bucket, we must allocate a new
- 				 * overflow page and place the tuple on that page instead.
- 				 */
  				itemsz = IndexTupleDSize(*itup);
  				itemsz = MAXALIGN(itemsz);
  
! 				if (PageGetFreeSpace(npage) < itemsz)
  				{
! 					/* write out nbuf and drop lock, but keep pin */
! 					_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
! 					/* chain to a new overflow page */
! 					nbuf = _hash_addovflpage(rel, metabuf, nbuf);
! 					npage = BufferGetPage(nbuf);
! 					/* we don't need nblkno or nopaque within the loop */
  				}
  
  				/*
! 				 * Insert tuple on new page, using _hash_pgaddtup to ensure
! 				 * correct ordering by hashkey.  This is a tad inefficient
! 				 * since we may have to shuffle itempointers repeatedly.
! 				 * Possible future improvement: accumulate all the items for
! 				 * the new page and qsort them before insertion.
  				 */
! 				(void) _hash_pgaddtup(rel, nbuf, itemsz, itup);
  
  				/*
  				 * Mark tuple for deletion from old page.
  				 */
  				deletable[ndeletable++] = ooffnum;
  			}
  			else
- 			{
- 				/*
- 				 * the tuple stays on this page, so nothing to do.
- 				 */
  				Assert(bucket == obucket);
- 			}
  		}
  
- 		oblkno = oopaque->hasho_nextblkno;
- 
- 		/*
- 		 * Done scanning this old page.  If we moved any tuples, delete them
- 		 * from the old page.
- 		 */
  		if (ndeletable > 0)
  		{
  			PageIndexMultiDelete(opage, deletable, ndeletable);
--- 1046,1305 ----
  
  			if (bucket == nbucket)
  			{
  				itemsz = IndexTupleDSize(*itup);
  				itemsz = MAXALIGN(itemsz);
  
! 				memcpy(tuple_copy, itup, itemsz);
! 				tuple_copy += itemsz;
! 				item_count++;
! 			}
! 			else
! 			{
! 				/*
! 				 * the tuple stays on this page, so nothing to do.
! 				 */
! 				Assert(bucket == obucket);
! 			}
! 		}
! 
! 		oblkno_next = oopaque->hasho_nextblkno;
! 		_hash_chgbufaccess(rel, obuf, HASH_READ, HASH_NOLOCK);
! 
! 		/*
! 		 * 2. Insert them into the new bucket.
! 		 */
! 		if (BufferIsInvalid(nbuf))
! 		{
! 			nbuf  = _hash_getbuf(rel, nblkno, HASH_WRITE, LH_BUCKET_PAGE);
! 			npage = BufferGetPage(nbuf);
! 			nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
! 		}
! 		else
! 			_hash_chgbufaccess(rel, nbuf, HASH_NOLOCK, HASH_WRITE);
! 
! 
! 		if (item_count == 0)
! 		{
! 			/*
! 			 * No matching tuples on this page.
! 			 *
! 			 * Write even though no items could be added to nbuf: the link to
! 			 * the old bucket will be updated.
! 			 */
! 			new_write = lappend_int(new_write, nbuf);
! 			/*
! 			 * If this is the last outer loop, oblkno_next is
! 			 * InvalidBlockNumber, so the link will be terminated. Otherwise
! 			 * it will point to the next page of the old bucket.
! 			 */
! 			_hash_set_new_bucket_end_link(rel, nbuf, oblkno_next, &new_write);
! 
! 			/*
! 			 * Write only nbuf this time, and - if distinct - the buffer
! 			 * containing the link to the old bucket. Keep the pin of nbuf iff
! 			 * at least one more outer loop iteration is expected.
! 			 */
! 			keep_pin = BlockNumberIsValid(oblkno_next) ? nbuf : InvalidBuffer;
! 			_hash_write_buffers(rel, &new_write, keep_pin);
! 
! 			_hash_dropbuf(rel, obuf);
! 			oblkno = oblkno_next;
! 			continue;
! 		}
! 
! 		tuple_copy = tuples_new;
! 		for (i = 0; i < item_count;)
! 		{
! 			itup = (IndexTuple) tuple_copy;
! 			itemsz = IndexTupleDSize(*itup);
! 			itemsz = MAXALIGN(itemsz);
! 
! 			/*
! 			 * insert the tuple into the new bucket.  if it doesn't fit on the
! 			 * current page in the new bucket, try to proceed to the next one
! 			 * - it's possible that an overflow page has already been added
! 			 * for concurrent insertions.
! 			 */
! 			if (PageGetFreeSpace(npage) < itemsz)
! 			{
! 				BlockNumber nblkno_next;
! 
! 				nblkno_next = nopaque->hasho_nextblkno;
! 
! 				/*
! 				 * The split must have been initialized by now. Thus, even if
! 				 * we are at the end of the new bucket, a valid link should be
! 				 * there (to the old bucket).
! 				 */
! 				Assert(BlockNumberIsValid(nblkno_next));
! 
! 				if (nbuf_inserts > 0)
  				{
! 					/*
! 					 * The page should be written, but not unlocked! Since the
! 					 * chain of the new bucket ends with a link to the current
! 					 * page of the old bucket, the tuples already copied from
! 					 * that page could be found twice - once on this page and
! 					 * once on the old one. (On the other hand, the link must
! 					 * not be redirected to oblkno_next yet; that would make
! 					 * readers miss some not-yet-copied tuples.)
! 					 *
! 					 * TODO The hash index seems to have no API to sync a
! 					 * buffer without unlocking it. Do nothing for now (the
! 					 * buffers will be written when processing of the current
! 					 * old page is done) and resolve this when implementing WAL.
! 					 */
! 
! 					/*
! 					 * Remember to write and unlock when the current old
! 					 * bucket page is processed.
! 					 */
! 					new_write = lappend_int(new_write, nbuf);
  				}
  
  				/*
! 				 * The current (already full) page links either to a page of
! 				 * the new bucket (probably created for concurrent insertions)
! 				 * or to one of the old bucket.
  				 */
! 				if (!(nopaque->hasho_flag & LH_BUCKET_SPLIT_LAST))
! 				{
! 					if (nbuf_inserts == 0)
! 					{
! 						/* No reason to prevent others from using the page. */
! 						_hash_relbuf(rel, nbuf);
! 					}
! 
! 					/* Follow the link. */
! 					nbuf = _hash_getbuf(rel, nblkno_next, HASH_WRITE,
! 									   LH_OVERFLOW_PAGE);
! 				}
! 				else
! 				{
! 					/*
! 					 * A link to the old bucket - add a new page before it.
! 					 */
! 					Buffer		ovflbuf;
! 					Page		ovflpage;
! 					HashPageOpaque ovflopaque;
! 
! 					/*
! 					 * _hash_addovflpage can't be used here because it expects
! 					 * the buffer unlocked. We can't afford unlocking as
! 					 * explained above.
! 					 */
! 					ovflbuf = _hash_getovflpage(rel, metabuf);
! 					ovflpage = BufferGetPage(ovflbuf);
! 					ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
! 					ovflopaque->hasho_bucket = nopaque->hasho_bucket;
! 					ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
! 					ovflopaque->hasho_page_id = HASHO_PAGE_ID;
! 					/* ovflpage takes the link over. */
! 					ovflopaque->hasho_nextblkno = nopaque->hasho_nextblkno;
! 					nopaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST;
! 					ovflopaque->hasho_flag |= LH_BUCKET_SPLIT_LAST;
! 					ovflopaque->hasho_prevblkno = nblkno;
! 					/* The current new page is no longer the last. */
! 					nopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
! 
! 					if (nbuf_inserts == 0)
! 					{
! 						/*
! 						 * No reason to prevent others from using the
! 						 * page. (Could not do it earlier as we needed some
! 						 * info from nopaque to set up the links.)
! 						 */
! 						_hash_relbuf(rel, nbuf);
! 					}
! 					nbuf = ovflbuf;
! 				}
! 				/* Keep nblkno always in-sync with nbuf. */
! 				nblkno = BufferGetBlockNumber(nbuf);
! 				npage = BufferGetPage(nbuf);
! 				nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
! 				nbuf_inserts = 0;
! 				continue;
! 			}
! 
! 			/*
! 			 * Insert tuple on new page, using _hash_pgaddtup to ensure
! 			 * correct ordering by hashkey.  This is a tad inefficient
! 			 * since we may have to shuffle itempointers repeatedly.
! 			 * Possible future improvement: accumulate all the items for
! 			 * the new page and qsort them before insertion.
! 			 */
! 			(void) _hash_pgaddtup(rel, nbuf, itemsz, itup);
! 
! 			nbuf_inserts++;
! 			tuple_copy += itemsz;
! 			i++;
! 		}
! 
! 		Assert(nbuf_inserts > 0);
! 		new_write = lappend_int(new_write, nbuf);
! 		/*
! 		 * If the next outer loop iteration adds more items to 'nbuf', they
! 		 * should trigger writing it again.
! 		 */
! 		nbuf_inserts = 0;
! 
! 		/*
! 		 * 3. Before the affected pages of the new bucket get unlocked,
! 		 * redirect scans of the new bucket to the first not-yet-processed
! 		 * page of the old bucket.
! 		 *
! 		 * If we're at the last page of the old bucket, this effectively
! 		 * terminates the new bucket.
! 		 */
! 		_hash_set_new_bucket_end_link(rel, nbuf, oblkno_next, &new_write);
! 
! 		/*
! 		 * Write all buffers modified for the current old bucket page.
! 		 *
! 		 * Modification means tuple insertion or update of the link to the old
! 		 * bucket.
! 		 *
! 		 * nbuf (the last buffer we inserted data into) might be needed in
! 		 * the next outer loop iteration. Keep its pin unless this is the
! 		 * last iteration. (However, the page that
! 		 * _hash_set_new_bucket_end_link might have added can be dropped now.)
! 		 */
! 		keep_pin = BlockNumberIsValid(oblkno_next) ? nbuf : InvalidBuffer;
! 		_hash_write_buffers(rel, &new_write, keep_pin);
! 
! 		/*
! 		 * 4. Now that the copying has succeeded, write-lock the page of the
! 		 * old bucket and delete the tuples just copied.
! 		 */
! 		_hash_chgbufaccess(rel, obuf, HASH_NOLOCK, HASH_WRITE);
! 		omaxoffnum = PageGetMaxOffsetNumber(opage);
! 		for (ooffnum = FirstOffsetNumber;
! 			 ooffnum <= omaxoffnum;
! 			 ooffnum = OffsetNumberNext(ooffnum))
! 		{
! 			IndexTuple	itup;
! 			Bucket		bucket;
! 
! 			/*
! 			 * Fetch the item's hash key (conveniently stored in the item) and
! 			 * determine which bucket it now belongs in.
! 			 */
! 			itup = (IndexTuple) PageGetItem(opage,
! 											PageGetItemId(opage, ooffnum));
! 			bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
! 										  maxbucket, highmask, lowmask);
  
+ 			if (bucket == nbucket)
+ 			{
  				/*
  				 * Mark tuple for deletion from old page.
  				 */
  				deletable[ndeletable++] = ooffnum;
  			}
  			else
  				Assert(bucket == obucket);
  		}
  
  		if (ndeletable > 0)
  		{
  			PageIndexMultiDelete(opage, deletable, ndeletable);
*************** _hash_splitbucket(Relation rel,
*** 865,887 ****
  		else
  			_hash_relbuf(rel, obuf);
  
! 		/* Exit loop if no more overflow pages in old bucket */
! 		if (!BlockNumberIsValid(oblkno))
! 			break;
! 
! 		/* Else, advance to next old page */
! 		obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
! 		opage = BufferGetPage(obuf);
! 		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
  	}
  
  	/*
! 	 * We're at the end of the old bucket chain, so we're done partitioning
! 	 * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
! 	 * tuples remaining in the old bucket (including the overflow pages) are
! 	 * packed as tightly as possible.  The new bucket is already tight.
  	 */
- 	_hash_wrtbuf(rel, nbuf);
- 
- 	_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
  }
--- 1308,1343 ----
  		else
  			_hash_relbuf(rel, obuf);
  
! 		/* Go for the next unprocessed page. */
! 		oblkno = oblkno_next;
  	}
  
+ 	pfree(tuples_new);
+ 
  	/*
! 	 * TODO Squeeze the old bucket
! 	 *
! 	 * Whether it should happen here or elsewhere, consider how to interact
! 	 * with concurrent calls of _hash_addovflpage. That function claims: "at
! 	 * least share lock on the bucket, to ensure that no one else tries to
! 	 * compact the bucket meanwhile. This guarantees that 'buf' won't stop
! 	 * being part of the bucket while it's unlocked."
! 	 *
! 	 * The problem is that we don't hold exclusive lock on either bucket
! 	 * during the split anymore (except for short time during the
! 	 * preparation), so the shared lock held by the caller of _hash_addovflpage
! 	 * is useless. We can either:
! 	 *
! 	 * 1. Squeeze the bucket here, but consider that LH_BUCKET_SPLIT is still
! 	 * set. That is, be "careful enough" not to disrupt any concurrent
! 	 * execution of _hash_addovflpage (during insertion). (And remove the
! 	 * mentioned comment about "at least share lock"). Does such a "tentative"
! 	 * squeezing still make sense?
! 	 *
! 	 * 2. Move the "squeeze" phase elsewhere (to VACUUM ?) and be careful
! 	 * about LH_BUCKET_SPLIT flag before doing anything.
! 	 *
! 	 * Note: whichever approach we take, VACUUM must also respect
! 	 * LH_BUCKET_SPLIT.
  	 */
  }
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
new file mode 100644
index 3d66216..2321890
*** a/src/backend/access/hash/hashutil.c
--- b/src/backend/access/hash/hashutil.c
*************** _hash_checkpage(Relation rel, Buffer buf
*** 185,192 ****
  	if (flags)
  	{
  		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
  
! 		if ((opaque->hasho_flag & flags) == 0)
  			ereport(ERROR,
  					(errcode(ERRCODE_INDEX_CORRUPTED),
  				   errmsg("index \"%s\" contains corrupted page at block %u",
--- 185,196 ----
  	if (flags)
  	{
  		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ 		uint16 oflags = opaque->hasho_flag;
  
! 		if (((oflags & flags) == 0) ||
! 			((oflags & LH_BUCKET_SPLIT) && !(oflags & LH_BUCKET_PAGE)) ||
! 			((oflags & LH_BUCKET_SPLIT_LAST) &&
! 			 !(oflags & (LH_BUCKET_PAGE | LH_OVERFLOW_PAGE))))
  			ereport(ERROR,
  					(errcode(ERRCODE_INDEX_CORRUPTED),
  				   errmsg("index \"%s\" contains corrupted page at block %u",
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
new file mode 100644
index fc3c7f4..956e5a2
*** a/src/include/access/hash.h
--- b/src/include/access/hash.h
*************** typedef uint32 Bucket;
*** 46,57 ****
   * available to other buckets by calling _hash_freeovflpage(). If all
   * the tuples are deleted from a bucket page, no additional action is
   * necessary.
   */
  #define LH_UNUSED_PAGE			(0)
  #define LH_OVERFLOW_PAGE		(1 << 0)
  #define LH_BUCKET_PAGE			(1 << 1)
! #define LH_BITMAP_PAGE			(1 << 2)
! #define LH_META_PAGE			(1 << 3)
  
  typedef struct HashPageOpaqueData
  {
--- 46,77 ----
   * available to other buckets by calling _hash_freeovflpage(). If all
   * the tuples are deleted from a bucket page, no additional action is
   * necessary.
+  *
+  * LH_BUCKET_SPLIT can only be set if LH_BUCKET_PAGE is already set. It
+  * indicates that the bucket participates in a split - either as the bucket
+  * being split or as a new one receiving tuples from an existing bucket. This
+  * flag compensates for the fact that no lock is held at the bucket level for
+  * the major part of the split. Whoever wants to move tuples across pages
+  * (VACUUM, _hash_squeezebucket) should check for this flag *in addition* to
+  * having acquired an exclusive lock on the bucket. (The exclusive lock
+  * itself is sufficient protection for running scans.)
+  *
+  * XXX Do we need 2 separate flags, one for the "old" and one for the "new"
+  * bucket?
+  *
+  * LH_BUCKET_SPLIT_LAST can be set either on the primary page of a bucket
+  * being created during a split (such a page already has LH_BUCKET_SPLIT
+  * set), or - if the bucket already contains overflow pages - on the last
+  * overflow page of such a bucket. It indicates that hasho_nextblkno of the
+  * page points to the first not-yet-processed page of the bucket being split.
   */
  #define LH_UNUSED_PAGE			(0)
  #define LH_OVERFLOW_PAGE		(1 << 0)
  #define LH_BUCKET_PAGE			(1 << 1)
! #define LH_BUCKET_SPLIT         (1 << 2)
! #define LH_BUCKET_SPLIT_LAST    (1 << 3)
! #define LH_BITMAP_PAGE			(1 << 4)
! #define LH_META_PAGE			(1 << 5)
  
  typedef struct HashPageOpaqueData
  {
*************** typedef struct HashScanOpaqueData
*** 88,94 ****
  	bool		hashso_bucket_valid;
  
  	/*
! 	 * If we have a share lock on the bucket, we record it here.  When
  	 * hashso_bucket_blkno is zero, we have no such lock.
  	 */
  	BlockNumber hashso_bucket_blkno;
--- 108,114 ----
  	bool		hashso_bucket_valid;
  
  	/*
! 	 * If we have a share lock on the bucket, we record it here. When
  	 * hashso_bucket_blkno is zero, we have no such lock.
  	 */
  	BlockNumber hashso_bucket_blkno;
*************** typedef struct HashScanOpaqueData
*** 111,116 ****
--- 131,152 ----
  typedef HashScanOpaqueData *HashScanOpaque;
  
  /*
+  * Details of a split in progress.
+  *
+  * old_bucket is the bucket being split; some of its tuples are moved into
+  * new_bucket.
+  *
+  * TODO Consider renaming to BucketSplitData.
+  */
+ typedef struct HashSplitData
+ {
+ 	Bucket      old_bucket;
+ 	Bucket      new_bucket;
+ } HashSplitData;
+ 
+ typedef HashSplitData *HashSplit;
+ 
+ /*
   * Definitions for metapage.
   */
  
*************** typedef HashScanOpaqueData *HashScanOpaq
*** 138,145 ****
--- 174,185 ----
   * There is no particular upper limit on the size of mapp[], other than
   * needing to fit into the metapage.  (With 8K block size, 128 bitmaps
   * limit us to 64 Gb of overflow space...)
+  *
+  * splits contains the list of bucket splits in progress. Concurrent scans
+  * and insertions need to be aware of them.
   */
  #define HASH_MAX_SPLITPOINTS		32
+ #define HASH_MAX_SPLITS             8 /* TODO How many can we afford? */
  #define HASH_MAX_BITMAPS			128
  
  typedef struct HashMetaPageData
*************** typedef struct HashMetaPageData
*** 153,158 ****
--- 193,199 ----
  								 * of 2 */
  	uint16		hashm_bmshift;	/* log2(bitmap array size in BITS) */
  	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */
+ 	uint16      hashm_split_count;	/* number of splits in progress */
  	uint32		hashm_highmask; /* mask to modulo into entire table */
  	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */
  	uint32		hashm_ovflpoint;/* splitpoint from which ovflpgs being
*************** typedef struct HashMetaPageData
*** 163,168 ****
--- 204,210 ----
  	uint32		hashm_spares[HASH_MAX_SPLITPOINTS];		/* spare pages before
  														 * each splitpoint */
  	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
+ 	HashSplitData hashm_splits[HASH_MAX_SPLITS]; /* splits in progress */
  } HashMetaPageData;
  
  typedef HashMetaPageData *HashMetaPage;
*************** extern OffsetNumber _hash_pgaddtup(Relat
*** 291,296 ****
--- 333,340 ----
  
  /* hashovfl.c */
  extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
+ extern Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
+ extern Buffer _hash_get_new_bucket_end(Relation rel, Buffer start);
  extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf,
  				   BufferAccessStrategy bstrategy);
  extern void _hash_initbitmap(Relation rel, HashMetaPage metap,