There's minor change against the previous one( http://archives.postgresql.org/pgsql-hackers/2008-07/msg01183.php ). * merge branch master(Aug 16) into the patch * clean code and make some comment Performance result is here http://wiki.postgresql.org/wiki/Gsoc08-hashindex
It seems hash index is a little better on index creation and selection. But maybe it's in the range of noise, I'm not sure. I'd like to try it with a bigger dataset (e.g. table with 10GB) but there is not enough space in my computer. Anyone interest can make a test on a bigger data set. -- Best Regards, Xiao Meng DKERC, Harbin Institute of Technology, China Gtalk: [EMAIL PROTECTED] MSN: [EMAIL PROTECTED] http://xiaomeng.yo2.cn
*** a/src/backend/access/hash/hash.c --- b/src/backend/access/hash/hash.c *************** *** 129,135 **** hashbuildCallback(Relation index, IndexTuple itup; /* form an index tuple and point it at the heap tuple */ ! itup = index_form_tuple(RelationGetDescr(index), values, isnull); itup->t_tid = htup->t_self; /* Hash indexes don't index nulls, see notes in hashinsert */ --- 129,135 ---- IndexTuple itup; /* form an index tuple and point it at the heap tuple */ ! itup = _hash_form_tuple(index, values,isnull); itup->t_tid = htup->t_self; /* Hash indexes don't index nulls, see notes in hashinsert */ *************** *** 153,160 **** hashbuildCallback(Relation index, /* * hashinsert() -- insert an index tuple into a hash table. * ! * Hash on the index tuple's key, find the appropriate location ! * for the new tuple, and put it there. */ Datum hashinsert(PG_FUNCTION_ARGS) --- 153,160 ---- /* * hashinsert() -- insert an index tuple into a hash table. * ! * Hash on the heap tuple's key, form an index tuple with hash code. ! * Find the appropriate location for the new tuple, and put it there. */ Datum hashinsert(PG_FUNCTION_ARGS) *************** *** 171,177 **** hashinsert(PG_FUNCTION_ARGS) IndexTuple itup; /* generate an index tuple */ ! itup = index_form_tuple(RelationGetDescr(rel), values, isnull); itup->t_tid = *ht_ctid; /* --- 171,177 ---- IndexTuple itup; /* generate an index tuple */ ! itup = _hash_form_tuple(rel, values, isnull); itup->t_tid = *ht_ctid; /* *************** *** 211,218 **** hashgettuple(PG_FUNCTION_ARGS) OffsetNumber offnum; bool res; ! /* Hash indexes are never lossy (at the moment anyway) */ ! scan->xs_recheck = false; /* * We hold pin but not lock on current buffer while outside the hash AM. --- 211,218 ---- OffsetNumber offnum; bool res; ! /* Hash indexes maybe lossy since we store hash code only */ ! scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. *** a/src/backend/access/hash/hashinsert.c --- b/src/backend/access/hash/hashinsert.c *************** *** 44,60 **** _hash_doinsert(Relation rel, IndexTuple itup) uint32 hashkey; Bucket bucket; Datum datum; - bool isnull; /* ! * Compute the hash key for the item. We do this first so as not to need ! * to hold any locks while running the hash function. */ if (rel->rd_rel->relnatts != 1) elog(ERROR, "hash indexes support only one index key"); ! datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull); ! Assert(!isnull); ! hashkey = _hash_datum2hashkey(rel, datum); /* compute item size too */ itemsz = IndexTupleDSize(*itup); --- 44,58 ---- uint32 hashkey; Bucket bucket; Datum datum; /* ! * Get the hash key for the item. */ if (rel->rd_rel->relnatts != 1) elog(ERROR, "hash indexes support only one index key"); ! ! datum = _hash_get_datum(itup); ! hashkey = DatumGetUInt32(datum); /* compute item size too */ itemsz = IndexTupleDSize(*itup); *************** *** 197,207 **** _hash_pgaddtup(Relation rel, { OffsetNumber itup_off; Page page; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); ! itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", --- 195,210 ---- { OffsetNumber itup_off; Page page; + Datum datum; + uint32 hashkey; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); ! datum = _hash_get_datum(itup); ! hashkey = DatumGetUInt32(datum); ! itup_off = _hash_binsearch(page, hashkey); ! if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", *** a/src/backend/access/hash/hashpage.c --- b/src/backend/access/hash/hashpage.c *************** *** 348,358 **** _hash_metapinit(Relation rel, double num_tuples) * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it ! * exactly if the index datatype is fixed-width, but for var-width there's ! * some guessing involved. */ ! data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, ! RelationGetDescr(rel)->attrs[0]->atttypmod); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; --- 348,356 ---- * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it ! * exactly since the index datatype (i.e. hash code of uint32 ) is fixed-width. */ ! data_width = 4; item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; *************** *** 785,791 **** _hash_splitbucket(Relation rel, OffsetNumber omaxoffnum; Page opage; Page npage; ! TupleDesc itupdesc = RelationGetDescr(rel); /* * It should be okay to simultaneously write-lock pages from each bucket, --- 783,789 ---- OffsetNumber omaxoffnum; Page opage; Page npage; ! /* * It should be okay to simultaneously write-lock pages from each bucket, *************** *** 848,861 **** _hash_splitbucket(Relation rel, /* * Re-hash the tuple to determine which bucket it now belongs in. * ! * It is annoying to call the hash function while holding locks, but ! * releasing and relocking the page for each tuple is unappealing too. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); ! datum = index_getattr(itup, 1, itupdesc, &null); ! Assert(!null); ! ! bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), maxbucket, highmask, lowmask); if (bucket == nbucket) --- 846,856 ---- /* * Re-hash the tuple to determine which bucket it now belongs in. * ! * We needn't call hash function since we store hash code in the index tuple */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); ! datum = _hash_get_datum(itup); ! bucket = _hash_hashkey2bucket(DatumGetUInt32(datum), maxbucket, highmask, lowmask); if (bucket == nbucket) *** a/src/backend/access/hash/hashsearch.c --- b/src/backend/access/hash/hashsearch.c *************** *** 178,183 **** _hash_first(IndexScanDesc scan, ScanDirection dir) --- 178,184 ---- hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype); + so->hashso_sk_hash = hashkey; /* * Acquire shared split lock so we can compute the target bucket safely * (see README). *************** *** 289,370 **** _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. */ ! do ! { ! switch (dir) { ! case ForwardScanDirection: ! if (offnum != InvalidOffsetNumber) ! offnum = OffsetNumberNext(offnum); /* move forward */ ! else ! offnum = FirstOffsetNumber; /* new page */ ! ! while (offnum > maxoff) ! { ! /* ! * either this page is empty (maxoff == ! * InvalidOffsetNumber) or we ran off the end. ! */ ! _hash_readnext(rel, &buf, &page, &opaque); ! if (BufferIsValid(buf)) ! { ! maxoff = PageGetMaxOffsetNumber(page); ! offnum = FirstOffsetNumber; ! } ! else ! { ! /* end of bucket */ ! maxoff = offnum = InvalidOffsetNumber; ! break; /* exit while */ ! } ! } ! break; ! ! case BackwardScanDirection: ! if (offnum != InvalidOffsetNumber) ! offnum = OffsetNumberPrev(offnum); /* move back */ ! else ! offnum = maxoff; /* new page */ ! ! while (offnum < FirstOffsetNumber) ! { ! /* ! * either this page is empty (offnum == ! * InvalidOffsetNumber) or we ran off the end. ! */ ! _hash_readprev(rel, &buf, &page, &opaque); ! if (BufferIsValid(buf)) ! maxoff = offnum = PageGetMaxOffsetNumber(page); ! else ! { ! /* end of bucket */ ! maxoff = offnum = InvalidOffsetNumber; ! break; /* exit while */ ! } ! } ! break; ! ! default: ! /* NoMovementScanDirection */ ! /* this should not be reached */ ! break; } ! /* we ran off the end of the world without finding a match */ ! if (offnum == InvalidOffsetNumber) { ! *bufP = so->hashso_curbuf = InvalidBuffer; ! ItemPointerSetInvalid(current); ! return false; } ! ! /* get ready to check this tuple */ ! itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); ! } while (!_hash_checkqual(scan, itup)); ! ! /* if we made it to here, we've found a valid tuple */ ! blkno = BufferGetBlockNumber(buf); ! *bufP = so->hashso_curbuf = buf; ! ItemPointerSet(current, blkno, offnum); ! return true; } --- 290,340 ---- * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. */ ! for (;;) ! { ! if (offnum == InvalidOffsetNumber) ! { ! /* ! * This is the first time we're scanning this particular ! * page of the bucket, so jump to the right spot via ! * binary search. ! */ ! offnum = _hash_binsearch(page, so->hashso_sk_hash); ! } ! else { ! /* Advance to the next tuple */ ! offnum = OffsetNumberNext(offnum); } ! if (offnum <= maxoff) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); ! if (offnum <= maxoff && _hash_checkqual(scan, itup)) { ! /* Found a matching tuple */ ! *bufP = so->hashso_curbuf = buf; ! ItemPointerSet(current, BufferGetBlockNumber(buf), offnum); ! return true; } ! else ! { ! /* No more matches on this page, so go on to next page */ ! if (ScanDirectionIsForward(dir)) ! _hash_readnext(rel, &buf, &page, &opaque); ! else ! _hash_readprev(rel, &buf, &page, &opaque); ! ! if (BufferIsValid(buf)) ! { ! maxoff = PageGetMaxOffsetNumber(page); ! offnum = InvalidOffsetNumber; ! } ! else ! { ! /* Ran out of pages, so we're done */ ! *bufP = so->hashso_curbuf = InvalidBuffer; ! ItemPointerSetInvalid(current); ! return false; ! } ! } ! } } *** a/src/backend/access/hash/hashutil.c --- b/src/backend/access/hash/hashutil.c *************** *** 20,53 **** #include "executor/execdebug.h" #include "storage/bufmgr.h" #include "utils/lsyscache.h" /* * _hash_checkqual -- does the index tuple satisfy the scan conditions? */ bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup) { - TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); ScanKey key = scan->keyData; int scanKeySize = scan->numberOfKeys; IncrIndexProcessed(); while (scanKeySize > 0) { - Datum datum; - bool isNull; Datum test; ! ! datum = index_getattr(itup, ! key->sk_attno, ! tupdesc, ! &isNull); /* assume sk_func is strict */ - if (isNull) - return false; if (key->sk_flags & SK_ISNULL) return false; --- 20,59 ---- #include "executor/execdebug.h" #include "storage/bufmgr.h" #include "utils/lsyscache.h" + #include "utils/memutils.h" + #include "catalog/pg_type.h" /* + * hardcoded tuple descriptors. see include/access/hash.h + */ + static FormData_pg_attribute Desc_hash[1] = {Schema_Hash}; + /* * _hash_checkqual -- does the index tuple satisfy the scan conditions? */ bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup) { ScanKey key = scan->keyData; int scanKeySize = scan->numberOfKeys; + Datum datum; + HashScanOpaque so = scan->opaque; IncrIndexProcessed(); + + datum = _hash_get_datum(itup); + if( so->hashso_sk_hash != DatumGetUInt32(datum) ) + return false; + key++; + scanKeySize--; while (scanKeySize > 0) { Datum test; ! ! datum = _hash_get_datum(itup); /* assume sk_func is strict */ if (key->sk_flags & SK_ISNULL) return false; *************** *** 221,223 **** hashoptions(PG_FUNCTION_ARGS) --- 227,326 ---- PG_RETURN_BYTEA_P(result); PG_RETURN_NULL(); } + + /* + * _get_hash_desc - get the hash index tuple descriptor + * + * The hash index tuple descriptor is a hard-coded tuple descriptor with only int32 attribute. + */ + TupleDesc _get_hash_desc() + { + static TupleDesc hashdesc = NULL; + + /* Already done? */ + if (hashdesc == NULL){ + /* + * It's the same with BuildHardcodedDescriptor() in relcache.c to build + * a hard-coded tuple descriptor. + */ + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(CacheMemoryContext); + + hashdesc = CreateTemplateTupleDesc(1, false); + hashdesc->tdtypeid = INT4OID; + hashdesc->tdtypmod = -1; + memcpy(hashdesc->attrs[0], &Desc_hash[0], ATTRIBUTE_TUPLE_SIZE); + hashdesc->attrs[0]->attcacheoff = 0; + + MemoryContextSwitchTo(oldcxt); + } + + return hashdesc; + + } + + /* + * _hash_get_datum - get the hash index tuple's first column datum (i.e. hash code) + */ + Datum _hash_get_datum(IndexTuple itup) + { + return fetch_att( (char *) (itup) + IndexInfoFindDataOffset((itup)->t_info) , + true, + 4); + + } + /* + * _hash_form_tuple - form a tuple with hash code only + */ + IndexTuple _hash_form_tuple(Relation rel, Datum* values, bool* isnull) + { + TupleDesc hashdesc; + IndexTuple itup; + uint32 hashkey; + + hashdesc = _get_hash_desc(); + hashkey = _hash_datum2hashkey(rel, values[0]); + values[0] = UInt32GetDatum(hashkey); + itup = index_form_tuple(hashdesc, values, isnull); + return itup; + } + + /* + * _hash_binsearch - Return the offset number in the page where the specified hash value + * should be located. + * + * The return value might exceed the page's max offset + * if the hash value is greater than any hash in the page. + */ + OffsetNumber + _hash_binsearch(Page page, uint32 hash_value) + { + OffsetNumber upper; + OffsetNumber lower; + + upper = PageGetMaxOffsetNumber(page) + 1; + lower = FirstOffsetNumber; + + while (upper > lower) + { + IndexTuple pos; + OffsetNumber off; + uint32 hashkey; + Datum datum; + bool isNull; + + off = (upper + lower) / 2; + Assert(OffsetNumberIsValid(off)); + + pos = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + datum = _hash_get_datum(pos); + hashkey = DatumGetUInt32(datum); + if (hashkey < hash_value) + lower = off + 1; + else + upper = off; + } + + return upper; + } *** a/src/backend/utils/sort/tuplesort.c --- b/src/backend/utils/sort/tuplesort.c *************** *** 456,465 **** static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, --- 456,468 ---- static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); + static void copytup_index_hash(Tuplesortstate *state, SortTuple *stup, void *tup); static void writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup); static void readtup_index(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); + static void readtup_index_hash(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); static void reversedirection_index_btree(Tuplesortstate *state); static void reversedirection_index_hash(Tuplesortstate *state); static int comparetup_datum(const SortTuple *a, const SortTuple *b, *************** *** 682,690 **** tuplesort_begin_index_hash(Relation indexRel, state->nKeys = 1; /* Only one sort column, the hash code */ state->comparetup = comparetup_index_hash; ! state->copytup = copytup_index; state->writetup = writetup_index; ! state->readtup = readtup_index; state->reversedirection = reversedirection_index_hash; state->indexRel = indexRel; --- 685,693 ---- state->nKeys = 1; /* Only one sort column, the hash code */ state->comparetup = comparetup_index_hash; ! state->copytup = copytup_index_hash; state->writetup = writetup_index; ! state->readtup = readtup_index_hash; state->reversedirection = reversedirection_index_hash; state->indexRel = indexRel; *************** *** 2822,2830 **** comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { /* ! * It's slightly annoying to redo the hash function each time, although ! * most hash functions ought to be cheap. Is it worth having a variant ! * tuple storage format so we can store the hash code? */ uint32 hash1; uint32 hash2; --- 2825,2831 ---- Tuplesortstate *state) { /* ! * we needn't redo hash function each time since we've stored it. */ uint32 hash1; uint32 hash2; *************** *** 2836,2845 **** comparetup_index_hash(const SortTuple *a, const SortTuple *b, /* Compute hash codes and mask off bits we don't want to sort by */ Assert(!a->isnull1); ! hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1)) & state->hash_mask; Assert(!b->isnull1); ! hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1)) & state->hash_mask; if (hash1 > hash2) --- 2837,2846 ---- /* Compute hash codes and mask off bits we don't want to sort by */ Assert(!a->isnull1); ! hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; Assert(!b->isnull1); ! hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; if (hash1 > hash2) *************** *** 2894,2899 **** copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) --- 2895,2916 ---- } static void + copytup_index_hash(Tuplesortstate *state, SortTuple *stup, void *tup) + { + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) palloc(tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value(i.e. hash code) */ + stup->datum1 = _hash_get_datum(newtuple); + stup->isnull1 = false; + } + static void writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) { IndexTuple tuple = (IndexTuple) stup->tuple; *************** *** 2936,2941 **** readtup_index(Tuplesortstate *state, SortTuple *stup, --- 2953,2979 ---- } static void + readtup_index_hash(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) + { + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) palloc(tuplen); + + USEMEM(state, GetMemoryChunkSpace(tuple)); + if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple, + tuplen) != tuplen) + elog(ERROR, "unexpected end of data"); + if (state->randomAccess) /* need trailing length word? */ + if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "unexpected end of data"); + stup->tuple = (void *) tuple; + /* set up first-column key value(i.e. hash code) */ + stup->datum1 = _hash_get_datum(tuple); + stup->isnull1 = false; + } + + static void reversedirection_index_btree(Tuplesortstate *state) { ScanKey scanKey = state->indexScanKey; *** a/src/include/access/hash.h --- b/src/include/access/hash.h *************** *** 100,105 **** typedef struct HashScanOpaqueData --- 100,107 ---- /* Current and marked position of the scan */ ItemPointerData hashso_curpos; ItemPointerData hashso_mrkpos; + /* Hash value of the scan key */ + uint32 hashso_sk_hash; } HashScanOpaqueData; typedef HashScanOpaqueData *HashScanOpaque; *************** *** 227,233 **** typedef HashMetaPageData *HashMetaPage; */ #define HASHPROC 1 ! /* public routines */ extern Datum hashbuild(PG_FUNCTION_ARGS); --- 229,239 ---- */ #define HASHPROC 1 ! /* ! * hard-coded hash desc ! */ ! #define Schema_Hash \ ! { 0, {"hashcode"}, 23, -1, 4, 1, 0, -1, -1, true, 'p', 'i', false, false, false, true, 0 } /* public routines */ extern Datum hashbuild(PG_FUNCTION_ARGS); *************** *** 330,335 **** extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, --- 336,345 ---- uint32 highmask, uint32 lowmask); extern uint32 _hash_log2(uint32 num); extern void _hash_checkpage(Relation rel, Buffer buf, int flags); + extern TupleDesc _get_hash_desc(); + extern Datum _hash_get_datum(IndexTuple itup); + extern IndexTuple _hash_form_tuple(Relation rel, Datum* values, bool* isnull); + OffsetNumber _hash_binsearch(Page page, uint32 hash_value); /* hash.c */ extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
-- Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches