Re: [HACKERS] WIP: Split of hash index bucket

2015-03-27 Thread Antonin Houska
Antonin Houska a...@cybertec.at wrote:
 I'm still testing it, especially the concurrent access. There are probably
 bugs in the code, but it can help you understand what I mean.

I've traced the most important cases of concurrent insertion into a bucket
whose split is in progress. A few related bugs have been fixed. Some tool to check
the index structure is needed yet, before performance testing makes
sense. That might include enhancement of contrib/pageinspect.

-- 
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 24b06a5..149bbcf
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
*** loop_top:
*** 572,577 
--- 572,599 
  			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
  			Assert(opaque-hasho_bucket == cur_bucket);
  
+ 			/*
+ 			 * If the bucket participates in a split, give up.
+ 			 *
+ 			 * (Unlike the metapage copy, the flags at bucket level should
+ 			 * always be up-to-date.)
+ 			 *
+ 			 * TODO
+ 			 *
+ 			 * 1. Analyze if both buckets participating in the split impose
+ 			 * too severe restrictions, and if it makes sense to introduce
+ 			 * separate flags for old and new bucket. Also, would such a
+ 			 * restricted VACUUM still make sense?
+ 			 *
+ 			 * 2. Consider how statistics should reflect the fact that some
+ 			 * buckets are skipped because of split.
+ 			 */
+ 			if (opaque-hasho_flag  LH_BUCKET_SPLIT)
+ 			{
+ _hash_relbuf(rel, buf);
+ break;
+ 			}
+ 
  			/* Scan each tuple in page */
  			maxoffno = PageGetMaxOffsetNumber(page);
  			for (offno = FirstOffsetNumber;
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
new file mode 100644
index 63aaec9..de445c9
*** a/src/backend/access/hash/hashinsert.c
--- b/src/backend/access/hash/hashinsert.c
*** _hash_doinsert(Relation rel, IndexTuple
*** 37,42 
--- 37,43 
  	Page		page;
  	HashPageOpaque pageopaque;
  	Size		itemsz;
+ 	uint16  buckets_total;
  	bool		do_expand;
  	uint32		hashkey;
  	Bucket		bucket;
*** _hash_doinsert(Relation rel, IndexTuple
*** 123,129 
  		 */
  		BlockNumber nextblkno = pageopaque-hasho_nextblkno;
  
! 		if (BlockNumberIsValid(nextblkno))
  		{
  			/*
  			 * ovfl page exists; go get it.  if it doesn't have room, we'll
--- 124,131 
  		 */
  		BlockNumber nextblkno = pageopaque-hasho_nextblkno;
  
! 		if (BlockNumberIsValid(nextblkno) 
! 			!(pageopaque-hasho_flag  LH_BUCKET_SPLIT_LAST))
  		{
  			/*
  			 * ovfl page exists; go get it.  if it doesn't have room, we'll
*** _hash_doinsert(Relation rel, IndexTuple
*** 136,142 
  		else
  		{
  			/*
! 			 * we're at the end of the bucket chain and we haven't found a
  			 * page with enough room.  allocate a new overflow page.
  			 */
  
--- 138,145 
  		else
  		{
  			/*
! 			 * we're at the end of the bucket chain, or (during a split) right
! 			 * before redirection to the old bucket, and we haven't found a
  			 * page with enough room.  allocate a new overflow page.
  			 */
  
*** _hash_doinsert(Relation rel, IndexTuple
*** 151,157 
  			Assert(PageGetFreeSpace(page) = itemsz);
  		}
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
! 		Assert(pageopaque-hasho_flag == LH_OVERFLOW_PAGE);
  		Assert(pageopaque-hasho_bucket == bucket);
  	}
  
--- 154,160 
  			Assert(PageGetFreeSpace(page) = itemsz);
  		}
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
! 		Assert(pageopaque-hasho_flag  LH_OVERFLOW_PAGE);
  		Assert(pageopaque-hasho_bucket == bucket);
  	}
  
*** _hash_doinsert(Relation rel, IndexTuple
*** 173,180 
  	metap-hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	do_expand = metap-hashm_ntuples 
! 		(double) metap-hashm_ffactor * (metap-hashm_maxbucket + 1);
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
--- 176,184 
  	metap-hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	buckets_total = metap-hashm_maxbucket + 1 + metap-hashm_split_count;
! 	do_expand = metap-hashm_split_count  HASH_MAX_SPLITS 
! 		metap-hashm_ntuples  (double) metap-hashm_ffactor * buckets_total;
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index b775164..4345f29
*** a/src/backend/access/hash/hashovfl.c
--- b/src/backend/access/hash/hashovfl.c
***
*** 21,27 
  #include utils/rel.h
  
  
- static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
  static uint32 _hash_firstfreebit(uint32 

[HACKERS] WIP: Split of hash index bucket

2015-03-26 Thread Antonin Houska
I've read proposal [1] and also a major part of thread [2]. [1] encouraged me to
try an alternative approach, which does not require flags at tuple
level. After thinking about it for some time, I decided to code something -
see attachment of this mail. (I was not sure whether I should write some kind
of pseudocode, but was too eager to try whether my idea works :-) )

The idea is that new bucket is initialized as an empty primary page, whose
'hasho_nextblkno' points at the first page of the old bucket (the one being
split). Then, tuples belonging to the new bucket are copied there and the link
at the end of the new bucket is redirected to the 2nd page of the old
bucket. And so on. When the last page of the old bucket is processed, the link
from the new to the old bucket is broken.

Any bucket participating in a split (whether the original one or the one being
created) has a flag on its primary page, so that its split-in-progress status
does not require access to the index metapage.

This logic should ensure that the split can be performed in small steps, w/o
blocking scans and inserts at bucket level (of course, contention still exists
at page level).

I'm still testing it, especially the concurrent access. There are probably
bugs in the code, but it can help you understand what I mean.

If this split algorithm proves to be viable, an important question about the
role of bucket-level locks (implemented currently as heavyweight lock of the
bucket's primary page) remains.


(Note that squeeze bucket functionality is not implemented in this version.)


References:

[1] 
http://www.postgresql.org/message-id/ca+tgmozymojsrfxhxq06g8jhjxqcskvdihb_8z_7nc7hj7i...@mail.gmail.com
[2]
http://www.postgresql.org/message-id/ca+tgmoy4x7vkyc4dawujutuboyxe2qsjf9aybhwzjxxwoc6...@mail.gmail.com

-- 
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at


diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 24b06a5..149bbcf
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
*** loop_top:
*** 572,577 
--- 572,599 
  			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
  			Assert(opaque-hasho_bucket == cur_bucket);
  
+ 			/*
+ 			 * If the bucket participates in a split, give up.
+ 			 *
+ 			 * (Unlike the metapage copy, the flags at bucket level should
+ 			 * always be up-to-date.)
+ 			 *
+ 			 * TODO
+ 			 *
+ 			 * 1. Analyze if both buckets participating in the split impose
+ 			 * too severe restrictions, and if it makes sense to introduce
+ 			 * separate flags for old and new bucket. Also, would such a
+ 			 * restricted VACUUM still make sense?
+ 			 *
+ 			 * 2. Consider how statistics should reflect the fact that some
+ 			 * buckets are skipped because of split.
+ 			 */
+ 			if (opaque-hasho_flag  LH_BUCKET_SPLIT)
+ 			{
+ _hash_relbuf(rel, buf);
+ break;
+ 			}
+ 
  			/* Scan each tuple in page */
  			maxoffno = PageGetMaxOffsetNumber(page);
  			for (offno = FirstOffsetNumber;
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
new file mode 100644
index 63aaec9..4b372ae
*** a/src/backend/access/hash/hashinsert.c
--- b/src/backend/access/hash/hashinsert.c
*** _hash_doinsert(Relation rel, IndexTuple
*** 37,42 
--- 37,43 
  	Page		page;
  	HashPageOpaque pageopaque;
  	Size		itemsz;
+ 	uint16  buckets_total;
  	bool		do_expand;
  	uint32		hashkey;
  	Bucket		bucket;
*** _hash_doinsert(Relation rel, IndexTuple
*** 173,180 
  	metap-hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	do_expand = metap-hashm_ntuples 
! 		(double) metap-hashm_ffactor * (metap-hashm_maxbucket + 1);
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
--- 174,182 
  	metap-hashm_ntuples += 1;
  
  	/* Make sure this stays in sync with _hash_expandtable() */
! 	buckets_total = metap-hashm_maxbucket + 1 + metap-hashm_split_count;
! 	do_expand = metap-hashm_split_count  HASH_MAX_SPLITS 
! 		metap-hashm_ntuples  (double) metap-hashm_ffactor * buckets_total;
  
  	/* Write out the metapage and drop lock, but keep pin */
  	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index b775164..4345f29
*** a/src/backend/access/hash/hashovfl.c
--- b/src/backend/access/hash/hashovfl.c
***
*** 21,27 
  #include utils/rel.h
  
  
- static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
  static uint32 _hash_firstfreebit(uint32 map);
  
  
--- 21,26 
*** _hash_addovflpage(Relation rel, Buffer m
*** 127,133 
  		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
  		nextblkno = pageopaque-hasho_nextblkno;