Hi,

On 2014-06-26 14:10, Pavel Stehule wrote:
Hello all,

today I had to work with one slow query - when I checked different
scenarios I found a dependency on work_mem size - but it is atypical -
bigger work_mem increased the query execution time to 31 minutes (with
600MB work_mem), versus 1 minute (with 1MB).

The problem described in Pavel's emails (illustrated by the awful
explain plans) is in this part:

-> Hash (cost=881801.58..881801.58 rows=61735 width=8) (actual time=9076.153..9076.153 rows=3310768 loops=1)

That is, the estimated number of rows is ~60k, but in reality it's ~3.3M. This leads to a hash table with a small number of buckets (8192), each holding a large number of tuples (~400 in this case) in a linked list, which significantly
slows down the lookups during the hash join.
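
To illustrate why that hurts, here is a toy sketch (standalone C, not the actual nodeHash.c code) of what a probe into a chained hash table has to do - walk the whole linked list for its bucket, so the work per probe grows with the average number of tuples per bucket (bucketmask = nbuckets - 1, nbuckets being a power of two, as in the executor). With ~3.3M tuples in 8192 buckets that's ~400 comparisons per probe instead of the handful the table was sized for:

#include <stddef.h>
#include <stdint.h>

/* toy sketch of a chained-bucket probe (not the real nodeHash.c code) */
typedef struct ToyTuple
{
	struct ToyTuple *next;		/* next tuple in the same bucket's list */
	uint32_t	hashvalue;		/* hash of the join key */
	int			key;			/* the join key itself */
} ToyTuple;

static ToyTuple *
toy_lookup(ToyTuple **buckets, uint32_t bucketmask, uint32_t hashvalue, int key)
{
	ToyTuple   *tuple;

	/* with N tuples spread over B buckets this loop runs ~N/B times per probe */
	for (tuple = buckets[hashvalue & bucketmask]; tuple != NULL; tuple = tuple->next)
	{
		if (tuple->hashvalue == hashvalue && tuple->key == key)
			return tuple;
	}
	return NULL;
}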

This issue is actually quite common - all it takes is a significant
underestimation of the hashed relation, either because it's a complex join (thus inherently difficult to estimate) or because the WHERE conditions are
not independent (see the example below).

The attached patch (early WIP, after ~30 minutes of hacking) addresses this by increasing the number of buckets whenever the average number of tuples per bucket
exceeds 1.5x NTUP_PER_BUCKET.
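
In isolation, the trigger amounts to something like this (a simplified, standalone sketch of the check the patch adds to ExecHashTableInsert() - the patch itself derives the tuple count as spaceUsed / hashTupleSize, and NTUP_PER_BUCKET is the existing constant in nodeHash.c, currently 10):

#include <stdbool.h>
#include <stddef.h>

#define NTUP_PER_BUCKET 10		/* target tuples per bucket, as in nodeHash.c */

/* grow (double) the bucket array once the average chain length gets too long */
static bool
needs_more_buckets(size_t ntuples, size_t nbuckets)
{
	return (double) ntuples / nbuckets > 1.5 * NTUP_PER_BUCKET;
}

So with the default NTUP_PER_BUCKET the growth kicks in once the buckets hold more than ~15 tuples on average.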


======= Example ========

create table table_a (id int, fk_id int);
create table table_b (fk_id int, val_a int, val_b int);

insert into table_a select i, i from generate_series(1,10000000) s(i);
insert into table_b select i, mod(i,1000), mod(i,1000) from generate_series(1,10000000) s(i);

analyze table_a;
analyze table_b;

explain analyze select count(*) from table_a join table_b on (table_a.id = table_b.fk_id) where val_a < 10 and val_b < 10;
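
(To see where the bad estimate comes from: mod(i,1000) < 10 matches 1% of the rows, i.e. ~100k out of 10M. The planner treats the two conditions as independent and multiplies the selectivities, 1% * 1% = 0.01%, which is roughly the rows=1359 estimate in the plans below. But val_a and val_b are identical, so the second condition filters out nothing and the hash actually receives 100k rows - a ~70x underestimate.)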


without the patch:

                                                            QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=385834.56..385834.57 rows=1 width=0) (actual time=49543.263..49543.264 rows=1 loops=1)
   ->  Hash Join  (cost=204069.89..385831.16 rows=1359 width=0) (actual time=923.751..49531.554 rows=100000 loops=1)
         Hash Cond: (table_a.id = table_b.fk_id)
         ->  Seq Scan on table_a  (cost=0.00..144247.77 rows=9999977 width=4) (actual time=0.104..967.090 rows=10000000 loops=1)
         ->  Hash  (cost=204052.90..204052.90 rows=1359 width=4) (actual time=923.615..923.615 rows=100000 loops=1)
               Buckets: 1024  Batches: 1  Memory Usage: 3516kB
               ->  Seq Scan on table_b  (cost=0.00..204052.90 rows=1359 width=4) (actual time=0.086..910.656 rows=100000 loops=1)
                     Filter: ((val_a < 10) AND (val_b < 10))
                     Rows Removed by Filter: 9900000
 Planning time: 0.271 ms
 Execution time: 49545.053 ms
(11 rows)


with the patch:

                                                            QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=385834.56..385834.57 rows=1 width=0) (actual time=9780.346..9780.347 rows=1 loops=1)
   ->  Hash Join  (cost=204069.89..385831.16 rows=1359 width=0) (actual time=939.297..9772.256 rows=100000 loops=1)
         Hash Cond: (table_a.id = table_b.fk_id)
         ->  Seq Scan on table_a  (cost=0.00..144247.77 rows=9999977 width=4) (actual time=0.103..962.446 rows=10000000 loops=1)
         ->  Hash  (cost=204052.90..204052.90 rows=1359 width=4) (actual time=939.172..939.172 rows=100000 loops=1)
               Buckets: 8192  Batches: 1  Memory Usage: 3516kB
               ->  Seq Scan on table_b  (cost=0.00..204052.90 rows=1359 width=4) (actual time=0.064..909.191 rows=100000 loops=1)
                     Filter: ((val_a < 10) AND (val_b < 10))
                     Rows Removed by Filter: 9900000
 Planning time: 0.276 ms
 Execution time: 9782.295 ms
(11 rows)

Time: 9784.392 ms

So the duration improved significantly - from ~50 seconds to ~10 seconds.
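
That also matches the bucket counts in the plans: without the patch, 100k tuples end up in 1024 buckets, i.e. ~98 tuples per bucket. With the patch, the growth trigger (1.5 x NTUP_PER_BUCKET = 15 tuples per bucket on average) fires repeatedly while the table is being built, doubling 1024 -> 2048 -> 4096 -> 8192, at which point the load drops to ~12 tuples per bucket - exactly the Buckets: 8192 reported in the patched plan.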

The patch is not perfect - it needs a bit more love, as illustrated by the FIXME/TODO items. Feel free to comment.

regards
Tomas

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 589b2f1..d71b8b9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
 
 
 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
 					  int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -682,6 +683,77 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 }
 
 /*
+ * ExecHashIncreaseNumBuckets
+ *		increase the original number of buckets in order to reduce
+ *		the number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+	int			oldnbuckets = hashtable->nbuckets;
+	int			i;
+	HashJoinTuple *oldbuckets = hashtable->buckets;
+
+	/* TODO Maybe it'd be better to resize the buckets in place (should be possible,
+	 * but when I tried it I always ended up with a strange infinite loop). */
+
+	MemoryContext oldcxt;
+
+	/* FIXME do nothing if we've decided to shut off bucket count growth */
+
+	/* FIXME safety checks here */
+
+	/* update the hashtable info, so that we can compute buckets etc. */
+	hashtable->log2_nbuckets += 1;	/* we've multiplied by 2 */
+	hashtable->nbuckets *= 2;
+
+	Assert(hashtable->nbuckets > 1);
+	Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+	/* grow the bucket list */
+	oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
+
+	hashtable->buckets = (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple));
+
+	MemoryContextSwitchTo(oldcxt);
+
+	/* walk through the old buckets, move each tuple to its new bucket */
+	for (i = 0; i < oldnbuckets; i++)
+	{
+		HashJoinTuple tuple;
+
+		tuple = oldbuckets[i];
+
+		while (tuple != NULL)
+		{
+			/* remember the next tuple before we re-link this one */
+			HashJoinTuple nexttuple = tuple->next;
+			int			bucketno;
+			int			batchno;
+
+			ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
+									  &bucketno, &batchno);
+
+			/* move it to the correct (possibly new) bucket */
+			tuple->next = hashtable->buckets[bucketno];
+			hashtable->buckets[bucketno] = tuple;
+
+			/* process the next tuple */
+			tuple = nexttuple;
+
+		}
+
+	}
+
+	pfree(oldbuckets);
+
+	/* TODO disable growth if nbuckets * NTUP_PER_BUCKET reaches work_mem (or something
+	 * like that) */
+
+}
+
+
+/*
  * ExecHashTableInsert
  *		insert a tuple into the hash table depending on the hash value
  *		it may just go to a temp file for later batches
@@ -740,6 +812,17 @@ ExecHashTableInsert(HashJoinTable hashtable,
 			hashtable->spacePeak = hashtable->spaceUsed;
 		if (hashtable->spaceUsed > hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
+
+		/* check average number of tuples per bucket, increase (2*nbuckets) if needed */
+		if (hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets > 1.5 * NTUP_PER_BUCKET)
+		{
+#ifdef HJDEBUG
+			printf("Increasing nbuckets to %d because average per bucket = %d\n",
+				   hashtable->nbuckets * 2,
+				   (int) (hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets));
+#endif
+			ExecHashIncreaseNumBuckets(hashtable);
+		}
 	}
 	else
 	{