On 22.06.2013 19:19, Simon Riggs wrote:
So I think that (2) is the best route: Given that we know with much
better certainty the number of rows in the scanned-relation, we should
be able to examine our hash table after it has been built and decide
whether it would be cheaper to rebuild the hash table with the right
number of buckets, or continue processing with what we have now. Which
is roughly what Heikki proposed already, in January.
Back in January, I wrote a quick patch to experiment with rehashing when
the hash table becomes too full. It was too late to make it into 9.3 so
I didn't pursue it further back then, but IIRC it worked. If we have the
capability to rehash, the accuracy of the initial guess becomes much
less important.
- Heikki
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6a2f236..32aebb9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -223,6 +223,60 @@ ExecEndHash(HashState *node)
ExecEndNode(outerPlan);
}
+/*
+ * Rehash
+ *		Double the number of in-memory buckets and redistribute the tuples
+ *		already in the hash table over the enlarged bucket array.
+ *
+ * Bucket i splits into buckets i and i + nbuckets_old: a tuple moves to
+ * the new bucket iff the bit 'nbuckets_old' (a power of two) is set in
+ * its hash value.  Tuples are not copied; only the 'next' links and the
+ * bucket header array are updated.
+ */
+static void
+Rehash(HashJoinTable hashtable)
+{
+ int nbuckets_old = hashtable->nbuckets_inmem;
+ int nbuckets_new;
+ uint32 mask; /* hash bit that decides old bucket vs. new bucket */
+ int i;
+
+ /* refuse to grow past 2^31 buckets: doubling would overflow int */
+ if (nbuckets_old > (1<<30))
+ return;
+
+ /* enlarge the bucket array; zero only the newly-added second half */
+ nbuckets_new = nbuckets_old * 2;
+ hashtable->buckets = (HashJoinTuple *)
+ repalloc(hashtable->buckets, nbuckets_new * sizeof(HashJoinTuple));
+ memset(((char *) hashtable->buckets) + nbuckets_old * sizeof(HashJoinTuple),
+ 0, nbuckets_old * sizeof(HashJoinTuple));
+ hashtable->nbuckets_inmem = nbuckets_new;
+
+ /* nbuckets_old is a power of 2, so it is itself the new mask bit */
+ mask = nbuckets_old;
+
+ /* walk each old bucket chain, unlinking tuples whose mask bit is set */
+ for (i = 0; i < nbuckets_old; i++)
+ {
+ int newbucketno = i + nbuckets_old; /* bucket i's split partner */
+ HashJoinTuple prevtuple;
+ HashJoinTuple tuple;
+
+ prevtuple = NULL;
+ tuple = hashtable->buckets[i];
+
+ while (tuple != NULL)
+ {
+ /* save link in case we delete */
+ HashJoinTuple nexttuple = tuple->next;
+
+ if ((tuple->hashvalue & mask) != 0)
+ {
+ /* move to the new bucket: push onto its chain head */
+ tuple->next = hashtable->buckets[newbucketno];
+ hashtable->buckets[newbucketno] = tuple;
+
+ /* unlink from the old chain, keeping prevtuple unchanged */
+ if (prevtuple)
+ prevtuple->next = nexttuple;
+ else
+ hashtable->buckets[i] = nexttuple;
+ }
+ else
+ {
+ /* keep in this bucket */
+ prevtuple = tuple;
+ }
+ tuple = nexttuple;
+ }
+ }
+}
/* ----------------------------------------------------------------
* ExecHashTableCreate
@@ -271,6 +325,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_inmem = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
hashtable->buckets = NULL;
hashtable->keepNulls = keepNulls;
@@ -285,6 +340,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
+ hashtable->inmemTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
@@ -612,7 +668,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
*/
ninmemory = nfreed = 0;
- for (i = 0; i < hashtable->nbuckets; i++)
+ for (i = 0; i < hashtable->nbuckets_inmem; i++)
{
HashJoinTuple prevtuple;
HashJoinTuple tuple;
@@ -734,6 +790,10 @@ ExecHashTableInsert(HashJoinTable hashtable,
hashTuple->next = hashtable->buckets[bucketno];
hashtable->buckets[bucketno] = hashTuple;
+ hashtable->inmemTuples += 1;
+ if (hashtable->inmemTuples > hashtable->nbuckets_inmem * NTUP_PER_BUCKET * 2)
+ Rehash(hashtable);
+
/* Account for space used, and back off if we've used too much */
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
@@ -873,18 +933,18 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
int *bucketno,
int *batchno)
{
- uint32 nbuckets = (uint32) hashtable->nbuckets;
+ uint32 nbuckets_inmem = (uint32) hashtable->nbuckets_inmem;
uint32 nbatch = (uint32) hashtable->nbatch;
if (nbatch > 1)
{
/* we can do MOD by masking, DIV by shifting */
- *bucketno = hashvalue & (nbuckets - 1);
+ *bucketno = hashvalue & (nbuckets_inmem - 1);
*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
}
else
{
- *bucketno = hashvalue & (nbuckets - 1);
+ *bucketno = hashvalue & (nbuckets_inmem - 1);
*batchno = 0;
}
}
@@ -995,7 +1055,7 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
*/
if (hashTuple != NULL)
hashTuple = hashTuple->next;
- else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+ else if (hjstate->hj_CurBucketNo < hashtable->nbuckets_inmem)
{
hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
hjstate->hj_CurBucketNo++;
@@ -1052,7 +1112,7 @@ void
ExecHashTableReset(HashJoinTable hashtable)
{
MemoryContext oldcxt;
- int nbuckets = hashtable->nbuckets;
+ int nbuckets_inmem = hashtable->nbuckets_inmem;
/*
* Release all the hash buckets and tuples acquired in the prior pass, and
@@ -1063,9 +1123,10 @@ ExecHashTableReset(HashJoinTable hashtable)
/* Reallocate and reinitialize the hash bucket headers. */
hashtable->buckets = (HashJoinTuple *)
- palloc0(nbuckets * sizeof(HashJoinTuple));
+ palloc0(nbuckets_inmem * sizeof(HashJoinTuple));
hashtable->spaceUsed = 0;
+ hashtable->inmemTuples = 0;
MemoryContextSwitchTo(oldcxt);
}
@@ -1081,7 +1142,7 @@ ExecHashTableResetMatchFlags(HashJoinTable hashtable)
int i;
/* Reset all flags in the main table ... */
- for (i = 0; i < hashtable->nbuckets; i++)
+ for (i = 0; i < hashtable->nbuckets_inmem; i++)
{
for (tuple = hashtable->buckets[i]; tuple != NULL; tuple = tuple->next)
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 8654066..1f10863 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
typedef struct HashJoinTableData
{
int nbuckets; /* # buckets in the in-memory hash table */
+ int nbuckets_inmem; /* current # in-memory buckets; doubled by Rehash */
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
@@ -130,6 +131,8 @@ typedef struct HashJoinTableData
double totalTuples; /* # tuples obtained from inner plan */
+ uint64 inmemTuples; /* # tuples in current hash table */
+
/*
* These arrays are allocated for the life of the hash join, but only if
* nbatch > 1. A file is opened only when we first write a tuple into it
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers