Re: [HACKERS] WIP: Split of hash index bucket
Antonin Houska a...@cybertec.at wrote: I'm still testing it, especially the concurrent access. There are probably bugs in the code, but it can help understand what I mean. I've traced the most important cases of concurrent insertion into a bucket split of which is in progress. A few related bugs fixed. Some tool to check the index structure is needed yet, before performance testing makes sense. That might include enhancement of contrib/pageinspect. -- Antonin Houska Cybertec Schönig Schönig GmbH Gröhrmühlgasse 26 A-2700 Wiener Neustadt Web: http://www.postgresql-support.de, http://www.cybertec.at diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c new file mode 100644 index 24b06a5..149bbcf *** a/src/backend/access/hash/hash.c --- b/src/backend/access/hash/hash.c *** loop_top: *** 572,577 --- 572,599 opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); + /* + * If the bucket participates in a split, give up. + * + * (Unlike the metapage copy, the flags at bucket level should + * always be up-to-date.) + * + * TODO + * + * 1. Analyze if both buckets participating in the split impose + * too severe restrictions, and if it makes sense to introduce + * separate flags for old and new bucket. Also, would such a + * restricted VACUUM still make sense? + * + * 2. Consider how statistics should reflect the fact that some + * buckets are skipped because of split. 
+ */ + if (opaque->hasho_flag & LH_BUCKET_SPLIT) + { + _hash_relbuf(rel, buf); + break; + } + /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c new file mode 100644 index 63aaec9..de445c9 *** a/src/backend/access/hash/hashinsert.c --- b/src/backend/access/hash/hashinsert.c *** _hash_doinsert(Relation rel, IndexTuple *** 37,42 --- 37,43 Page page; HashPageOpaque pageopaque; Size itemsz; + uint16 buckets_total; bool do_expand; uint32 hashkey; Bucket bucket; *** _hash_doinsert(Relation rel, IndexTuple *** 123,129 */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; ! if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll --- 124,131 */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; ! if (BlockNumberIsValid(nextblkno) && ! !(pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll *** _hash_doinsert(Relation rel, IndexTuple *** 136,142 else { /* ! * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. */ --- 138,145 else { /* ! * we're at the end of the bucket chain, or (during a split) right ! * before redirection to the old bucket, and we haven't found a * page with enough room. allocate a new overflow page. */ *** _hash_doinsert(Relation rel, IndexTuple *** 151,157 Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); ! Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } --- 154,160 Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); ! 
Assert(pageopaque->hasho_flag & LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } *** _hash_doinsert(Relation rel, IndexTuple *** 173,180 metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! do_expand = metap->hashm_ntuples > ! (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); --- 176,184 metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count; ! do_expand = metap->hashm_split_count < HASH_MAX_SPLITS && ! metap->hashm_ntuples > (double) metap->hashm_ffactor * buckets_total; /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c new file mode 100644 index b775164..4345f29 *** a/src/backend/access/hash/hashovfl.c --- b/src/backend/access/hash/hashovfl.c *** *** 21,27 #include "utils/rel.h" - static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32
[HACKERS] WIP: Split of hash index bucket
I've read proposal [1] and also major part of thread [2]. [1] encouraged me to try an alternative approach, which does not require flags at tuple level. After thinking about it for some time, I decided to code something - see attachment of this mail. (I was not sure whether I should write some kind of pseudocode, but was too eager to try whether my idea works :-) ) The idea is that new bucket is initialized as an empty primary page, whose 'hasho_nextblkno' points at the first page of the old bucket (the one being split). Then, tuples belonging to the new bucket are copied there and the link at the end of the new bucket is redirected to the 2nd page of the old bucket. And so on. When the last page of the old bucket is processed, the link from the new to the old bucket is broken. Any bucket participating in a split (whether the original one or the one being created) has a flag on its primary page, so that its split-in-progress status does not require access to the index metapage. This logic should ensure that the split can be performed in small steps, w/o blocking scans and inserts at bucket level (of course, contention still exists at page level). I'm still testing it, especially the concurrent access. There are probably bugs in the code, but it can help understand what I mean. If this split algorithm proves to be viable, an important question about the role of bucket-level locks (implemented currently as heavyweight lock of the bucket's primary page) remains. (Note that squeeze bucket functionality is not implemented in this version.) 
References: [1] http://www.postgresql.org/message-id/ca+tgmozymojsrfxhxq06g8jhjxqcskvdihb_8z_7nc7hj7i...@mail.gmail.com [2] http://www.postgresql.org/message-id/ca+tgmoy4x7vkyc4dawujutuboyxe2qsjf9aybhwzjxxwoc6...@mail.gmail.com -- Antonin Houska Cybertec Schönig Schönig GmbH Gröhrmühlgasse 26 A-2700 Wiener Neustadt Web: http://www.postgresql-support.de, http://www.cybertec.at diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c new file mode 100644 index 24b06a5..149bbcf *** a/src/backend/access/hash/hash.c --- b/src/backend/access/hash/hash.c *** loop_top: *** 572,577 --- 572,599 opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); + /* + * If the bucket participates in a split, give up. + * + * (Unlike the metapage copy, the flags at bucket level should + * always be up-to-date.) + * + * TODO + * + * 1. Analyze if both buckets participating in the split impose + * too severe restrictions, and if it makes sense to introduce + * separate flags for old and new bucket. Also, would such a + * restricted VACUUM still make sense? + * + * 2. Consider how statistics should reflect the fact that some + * buckets are skipped because of split. + */ + if (opaque->hasho_flag & LH_BUCKET_SPLIT) + { + _hash_relbuf(rel, buf); + break; + } + /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c new file mode 100644 index 63aaec9..4b372ae *** a/src/backend/access/hash/hashinsert.c --- b/src/backend/access/hash/hashinsert.c *** _hash_doinsert(Relation rel, IndexTuple *** 37,42 --- 37,43 Page page; HashPageOpaque pageopaque; Size itemsz; + uint16 buckets_total; bool do_expand; uint32 hashkey; Bucket bucket; *** _hash_doinsert(Relation rel, IndexTuple *** 173,180 metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! do_expand = metap->hashm_ntuples > ! 
(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); --- 174,182 metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count; ! do_expand = metap->hashm_split_count < HASH_MAX_SPLITS && ! metap->hashm_ntuples > (double) metap->hashm_ffactor * buckets_total; /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c new file mode 100644 index b775164..4345f29 *** a/src/backend/access/hash/hashovfl.c --- b/src/backend/access/hash/hashovfl.c *** *** 21,27 #include "utils/rel.h" - static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); --- 21,26 *** _hash_addovflpage(Relation rel, Buffer m *** 127,133 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno;