Attached is v2 of the patch, with some cleanups / minor improvements:

* improved comments, whitespace fixes, TODOs etc.

* tracking initial # of buckets (similar to initial # of batches)

* adding info about buckets to EXPLAIN ANALYZE, similar to batches - I
didn't want to make it overly complex, so the info about the initial
bucket/batch count is only added if at least one of them was modified
(see the example below)

* modified the threshold triggering the growth, so that we get
NTUP_PER_BUCKET tuples per bucket on average (see the
NTUP_GROW_THRESHOLD comment in nodeHash.c, and the sketch below)

* there's a single FIXME, related to counting tuples in the hash table
(see the comment in ExecHashTableInsert)
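
For illustration, with the patch the text-format EXPLAIN ANALYZE line
looks like this whenever either count changed (the numbers here are
made up, just to show the format):

  Buckets: 8192 (originally 1024)  Batches: 4 (originally 2)  Memory Usage: 1024kB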

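To sanity-check the threshold math, here's a tiny standalone C program
(not part of the patch, just a sketch of the same arithmetic): it
simulates inserting tuples and doubles nbuckets whenever the average
load exceeds NTUP_GROW_THRESHOLD * NTUP_PER_BUCKET. The load then
oscillates between ~6.7 and ~13.3 tuples/bucket, i.e. NTUP_PER_BUCKET
on average.

  #include <stdio.h>

  #define NTUP_PER_BUCKET     10
  #define NTUP_GROW_THRESHOLD 1.333

  int
  main(void)
  {
      int     nbuckets = 1024;
      double  ntuples;

      for (ntuples = 0; ntuples < 1000000; ntuples += 1000)
      {
          if (ntuples / nbuckets > NTUP_GROW_THRESHOLD * NTUP_PER_BUCKET)
          {
              printf("%8.0f tuples: %5.2f per bucket -> %6d buckets (%5.2f per bucket)\n",
                     ntuples, ntuples / nbuckets,
                     nbuckets * 2, ntuples / (nbuckets * 2));
              nbuckets *= 2;
          }
      }
      return 0;
  }
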
One thing that's important to note is the difference between the # of
batches and the # of buckets. While the # of batches is "global", the
# of buckets applies "within a batch", so in theory each batch could
use a different number of buckets.

However, the value is reused between batches, so it can only grow. That
means this is possible:

  initial: 1024 buckets (before 1st batch)
  batch 1: 1024 buckets
  batch 2: 1024 buckets
  batch 3: 4096 buckets
  batch 4: 8192 buckets

while this is not:

  initial: 1024 buckets (before 1st batch)
  batch 1: 1024 buckets
  batch 2: 4096 buckets
  batch 3: 1024 buckets
  batch 4: 8192 buckets

However, in practice I expect the first batch to do all the work, and
the following batches will just reuse the same number of buckets. This
of course assumes the batches have similar tuple sizes etc.

So the first batch will do all the reshuffling of the table, and the
following batches will reuse the 'right' number of buckets from the
start.
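
Just to illustrate why the reshuffling is straightforward: with a
power-of-2 nbuckets, the bucket number is simply the low
log2_nbuckets bits of the hash value (the same masking that
ExecHashGetBucketAndBatch does), so after doubling, every tuple from
old bucket i lands either in bucket i or in bucket (i + oldnbuckets).
A minimal standalone sketch of that mapping (not patch code):

  #include <stdio.h>
  #include <stdint.h>

  /* low log2(nbuckets) bits of the hash; nbuckets must be a power of 2 */
  static int
  bucketno(uint32_t hashvalue, int nbuckets)
  {
      return hashvalue & (nbuckets - 1);
  }

  int
  main(void)
  {
      uint32_t hashes[] = {0x000012f3, 0x00005ab8, 0x00009c3f};
      int      i;

      for (i = 0; i < 3; i++)
          printf("hash 0x%08x: bucket %4d of 1024 -> bucket %4d of 2048\n",
                 (unsigned) hashes[i],
                 bucketno(hashes[i], 1024),
                 bucketno(hashes[i], 2048));
      return 0;
  }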

regards
Tomas
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0d9663c..db3a953 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1900,18 +1900,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 		if (es->format != EXPLAIN_FORMAT_TEXT)
 		{
 			ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+			ExplainPropertyLong("Original Hash Buckets",
+								hashtable->nbuckets_original, es);
 			ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
 			ExplainPropertyLong("Original Hash Batches",
 								hashtable->nbatch_original, es);
 			ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
 		}
-		else if (hashtable->nbatch_original != hashtable->nbatch)
+		else if ((hashtable->nbatch_original != hashtable->nbatch) ||
+				 (hashtable->nbuckets_original != hashtable->nbuckets))
 		{
 			appendStringInfoSpaces(es->str, es->indent * 2);
 			appendStringInfo(es->str,
-			"Buckets: %d  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
-							 hashtable->nbuckets, hashtable->nbatch,
-							 hashtable->nbatch_original, spacePeakKb);
+			"Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
+							 hashtable->nbuckets, hashtable->nbuckets_original,
+							 hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
 		}
 		else
 		{
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..879b336 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
 
 
 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
 					  int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -271,6 +272,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_original = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
@@ -386,6 +388,24 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 /* Target bucket loading (tuples per bucket) */
 #define NTUP_PER_BUCKET			10
 
+/*
+ * Multiple of NTUP_PER_BUCKET triggering the increase of nbuckets.
+ *
+ * Once we reach the threshold we double the number of buckets, and we
+ * want the load to average 1.0 (i.e. NTUP_PER_BUCKET tuples per
+ * bucket). That means these two equations should hold:
+ *
+ *   b = 2a         (growth)
+ *   (a + b)/2 = 1  (oscillate around NTUP_PER_BUCKET)
+ *
+ * which means b = 1.3333 (and a = b/2). If we wanted a higher
+ * threshold, we could grow to (4 * nbuckets), i.e. (b = 4a), leading
+ * to (b = 1.6). Or (b = 8a), giving b = 1.7777 etc.
+ *
+ * Let's start with doubling the bucket count, i.e. 1.333.
+ */
+#define NTUP_GROW_THRESHOLD     1.333
+
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						int *numbuckets,
@@ -682,6 +702,92 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 }
 
 /*
+ * ExecHashIncreaseNumBuckets
+ *		increase the original number of buckets in order to reduce
+ *		number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+	int			i;
+	int         ntuples = 0;
+	int			oldnbuckets = hashtable->nbuckets;
+	HashJoinTuple  *oldbuckets = hashtable->buckets;
+	MemoryContext   oldcxt;
+
+	/* XXX Not sure if we should update the info about used space here.
+	 * The code seems to ignore the space used for 'buckets' and we're not
+	 * allocating more space for tuples (just shuffling them to the new
+	 * buckets). And the amount of memory used for buckets is quite small
+	 * (just an array of pointers, thus ~8kB per 1k buckets on 64-bit). */
+
+	/* XXX Should we disable growth if (nbuckets * NTUP_PER_BUCKET)
+	 * reaches work_mem (or something like that)? We shouldn't really
+	 * get into such position (should be handled by increasing the
+	 * number of batches, which is called right before this). */
+
+	/* XXX Maybe adding info into hashjoin explain output (e.g. initial
+	 * nbuckets, time spent growing the table) would be appropriate. */
+
+	/* update the hashtable info, so that we can compute buckets etc. */
+	hashtable->log2_nbuckets += 1;
+	hashtable->nbuckets *= 2;
+
+	Assert(hashtable->nbuckets > 1);
+	Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+	printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+	/* TODO Maybe it'd be better to resize the buckets in place (should
+	 * be possible, but when I tried it I always ended up with a strange
+	 * infinite loop). */
+
+	/* allocate a new bucket list (use the batch context as before) */
+	oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
+
+	hashtable->buckets = (HashJoinTuple *)
+		palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+
+	MemoryContextSwitchTo(oldcxt);
+
+	/* walk through the old buckets, move the buckets into the new table */
+	for (i = 0; i < oldnbuckets; i++)
+	{
+		HashJoinTuple tuple = oldbuckets[i];
+
+		while (tuple != NULL)
+		{
+			/* save the link; tuple->next is about to be overwritten */
+			HashJoinTuple nexttuple = tuple->next;
+			int			bucketno;
+			int			batchno;
+
+			ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+									  &bucketno, &batchno);
+
+			/* move it to the correct bucket */
+			tuple->next = hashtable->buckets[bucketno];
+			hashtable->buckets[bucketno] = tuple;
+
+			/* process the next tuple */
+			tuple = nexttuple;
+
+			ntuples++;
+		}
+	}
+
+	pfree(oldbuckets);
+
+#ifdef HJDEBUG
+	printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+		   hashtable->nbuckets, (float)ntuples / hashtable->nbuckets);
+#endif
+}
+
+
+/*
  * ExecHashTableInsert
  *		insert a tuple into the hash table depending on the hash value
  *		it may just go to a temp file for later batches
@@ -740,6 +846,27 @@ ExecHashTableInsert(HashJoinTable hashtable,
 			hashtable->spacePeak = hashtable->spaceUsed;
 		if (hashtable->spaceUsed > hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
+
+		/*
+		 * Check the average number of tuples per bucket; grow the hash
+		 * table (to 2 * nbuckets) if needed.
+		 *
+		 * FIXME This estimates the number of tuples in the hashtable as
+		 * (spaceUsed / hashTupleSize), i.e. using the size of the current
+		 * tuple, which is not really accurate. Need to track the count
+		 * properly.
+		 */
+		if (hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets
+			> NTUP_GROW_THRESHOLD * NTUP_PER_BUCKET)
+		{
+#ifdef HJDEBUG
+			printf("Increasing nbuckets to %d because average per bucket = %d\n",
+				   hashtable->nbuckets * 2,
+				   (int) (hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets));
+#endif
+			ExecHashIncreaseNumBuckets(hashtable);
+		}
+
 	}
 	else
 	{
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 3beae40..03e91a6 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -106,6 +106,7 @@ typedef struct HashSkewBucket
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
+	int			nbuckets_original;	/* # buckets when starting the first hash */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
-- 