Hi,

On 2020-06-11 11:14:02 -0700, Jeff Davis wrote:
> On Thu, 2020-06-11 at 10:45 -0700, Andres Freund wrote:
> > Did you run any performance tests?
>
> Yes, I reproduced your ~12% regression from V12, and this patch nearly
> eliminated it for me.

I spent a fair bit of time looking at the difference. Jeff had let me
know on chat that he was still seeing some difference, but couldn't
quite figure out where that was.

Trying it out myself, I observed that the patch helped, but not that
much. After a bit I found one major reason why:
LookupTupleHashEntryHash() assigned the hash to the pointer provided by
the caller before doing the insertion. That ended up causing a pipeline
stall (I assume it's store forwarding, but I'm not sure). Moving the
assignment to the caller's variable to after the insertion got rid of
that.
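
Roughly, the change is from the first ordering below to the second
(just a sketch to illustrate; compute_hash() and insert_entry() are
placeholder names, not the real execGrouping.c helpers):

/* before: write through the caller's pointer, then do the insertion */
TupleHashEntry
lookup_store_first(TupleHashTable ht, TupleTableSlot *slot,
				   bool *isnew, uint32 *hashp)
{
	uint32		hash = compute_hash(ht, slot);

	*hashp = hash;			/* store to caller's memory before the insert */
	return insert_entry(ht, slot, isnew, hash);
}

/* after: insert first, only then publish the hash to the caller */
TupleHashEntry
lookup_store_last(TupleHashTable ht, TupleTableSlot *slot,
				  bool *isnew, uint32 *hashp)
{
	uint32		hash = compute_hash(ht, slot);
	TupleHashEntry entry = insert_entry(ht, slot, isnew, hash);

	*hashp = hash;			/* store after the insert has been done */
	return entry;
}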

It got within 3-4% after that change. I did a number of small
micro-optimizations that each helped, but didn't quite get to the level
of v12.

Finally I figured out that the remaining difference is due to an issue
outside of nodeAgg.c itself:

commit 4cad2534da6d17067d98cf04be2dfc1bda8f2cd0
Author: Tomas Vondra <tomas.von...@postgresql.org>
Date:   2020-05-31 14:43:13 +0200

    Use CP_SMALL_TLIST for hash aggregate

Due to this change we end up with an additional projection in queries
like this:

postgres[212666][1]=# \d fewgroups_many_rows
         Table "public.fewgroups_many_rows"
┌────────┬─────────┬───────────┬──────────┬─────────┐
│ Column │  Type   │ Collation │ Nullable │ Default │
├────────┼─────────┼───────────┼──────────┼─────────┤
│ cat    │ integer │           │ not null │         │
│ val    │ integer │           │ not null │         │
└────────┴─────────┴───────────┴──────────┴─────────┘

postgres[212666][1]=# explain SELECT cat, count(*) FROM fewgroups_many_rows GROUP BY 1;
┌───────────────────────────────────────────────────────────────────────────────────────┐
│                                      QUERY PLAN                                        │
├───────────────────────────────────────────────────────────────────────────────────────┤
│ HashAggregate  (cost=1942478.48..1942478.53 rows=5 width=12)                           │
│   Group Key: cat                                                                       │
│   ->  Seq Scan on fewgroups_many_rows  (cost=0.00..1442478.32 rows=100000032 width=4)  │
└───────────────────────────────────────────────────────────────────────────────────────┘
(3 rows)

as 'val' is "projected away"..


After neutering the tlist change, Jeff's patch and my changes to it
yield performance *above* v12.


I don't see why it's ok to force an additional projection in the very
common case of hashaggs over a few rows. So I think we need to rethink
4cad2534da6.

Greetings,

Andres Freund
diff --git i/src/include/executor/executor.h w/src/include/executor/executor.h
index c7deeac662f..415e117407c 100644
--- i/src/include/executor/executor.h
+++ w/src/include/executor/executor.h
@@ -139,7 +139,7 @@ extern TupleHashTable BuildTupleHashTableExt(PlanState *parent,
 											 MemoryContext tempcxt, bool use_variable_hash_iv);
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   TupleTableSlot *slot,
-										   bool *isnew);
+										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
diff --git i/src/backend/executor/execGrouping.c w/src/backend/executor/execGrouping.c
index 8be36ca7634..1e582832ea0 100644
--- i/src/backend/executor/execGrouping.c
+++ w/src/backend/executor/execGrouping.c
@@ -22,11 +22,12 @@
 #include "utils/memutils.h"
 
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
-static uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
+static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 										  const MinimalTuple tuple);
-static TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
-													TupleTableSlot *slot,
-													bool *isnew, uint32 hash);
+static inline TupleHashEntry LookupTupleHashEntry_internal(
+	TupleHashTable hashtable,
+	TupleTableSlot *slot,
+	bool *isnew, uint32 hash);
 
 /*
  * Define parameters for tuple hash table code generation. The interface is
@@ -291,6 +292,9 @@ ResetTupleHashTable(TupleHashTable hashtable)
  * If isnew is NULL, we do not create new entries; we return NULL if no
  * match is found.
  *
+ * If hash is not NULL, we set it to the calculated hash value. This allows
+ * callers access to the hash value even if no entry is returned.
+ *
  * If isnew isn't NULL, then a new entry is created if no existing entry
  * matches.  On return, *isnew is true if the entry is newly created,
  * false if it existed already.  ->additional_data in the new entry has
@@ -298,11 +302,11 @@ ResetTupleHashTable(TupleHashTable hashtable)
  */
 TupleHashEntry
 LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
-					 bool *isnew)
+					 bool *isnew, uint32 *hash)
 {
 	TupleHashEntry entry;
 	MemoryContext oldContext;
-	uint32		hash;
+	uint32		local_hash;
 
 	/* Need to run the hash functions in short-lived context */
 	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
@@ -312,8 +316,14 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
-	hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
-	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
+	local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+
+	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
+
+	if (hash != NULL)
+		*hash = local_hash;
+
+	Assert(entry == NULL || entry->hash == local_hash);
 
 	MemoryContextSwitchTo(oldContext);
 
@@ -362,6 +372,7 @@ LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
 	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
+	Assert(entry == NULL || entry->hash == hash);
 
 	MemoryContextSwitchTo(oldContext);
 
@@ -480,7 +491,7 @@ TupleHashTableHash_internal(struct tuplehash_hash *tb,
  * NB: This function may or may not change the memory context. Caller is
  * expected to change it back.
  */
-static TupleHashEntry
+static inline TupleHashEntry
 LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 							  bool *isnew, uint32 hash)
 {
diff --git i/src/backend/executor/nodeAgg.c w/src/backend/executor/nodeAgg.c
index 331acee2814..1b44b9f00de 100644
--- i/src/backend/executor/nodeAgg.c
+++ w/src/backend/executor/nodeAgg.c
@@ -382,7 +382,9 @@ static void finalize_partialaggregate(AggState *aggstate,
 									  AggStatePerAgg peragg,
 									  AggStatePerGroup pergroupstate,
 									  Datum *resultVal, bool *resultIsNull);
-static void prepare_hash_slot(AggState *aggstate);
+static inline void prepare_hash_slot(AggStatePerHash perhash,
+									 TupleTableSlot *inputslot,
+									 TupleTableSlot *hashslot);
 static void prepare_projection_slot(AggState *aggstate,
 									TupleTableSlot *slot,
 									int currentSet);
@@ -403,8 +405,9 @@ static int	hash_choose_num_partitions(uint64 input_groups,
 									   double hashentrysize,
 									   int used_bits,
 									   int *log2_npartittions);
-static AggStatePerGroup lookup_hash_entry(AggState *aggstate, uint32 hash,
-										  bool *in_hash_table);
+static void initialize_hash_entry(AggState *aggstate,
+								  TupleHashTable hashtable,
+								  TupleHashEntry entry);
 static void lookup_hash_entries(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
@@ -1197,12 +1200,11 @@ finalize_partialaggregate(AggState *aggstate,
  * Extract the attributes that make up the grouping key into the
  * hashslot. This is necessary to compute the hash or perform a lookup.
  */
-static void
-prepare_hash_slot(AggState *aggstate)
+static inline void
+prepare_hash_slot(AggStatePerHash perhash,
+				  TupleTableSlot *inputslot,
+				  TupleTableSlot *hashslot)
 {
-	TupleTableSlot *inputslot = aggstate->tmpcontext->ecxt_outertuple;
-	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
-	TupleTableSlot *hashslot = perhash->hashslot;
 	int			i;
 
 	/* transfer just the needed columns into hashslot */
@@ -1979,75 +1981,39 @@ hash_choose_num_partitions(uint64 input_groups, double hashentrysize,
 }
 
 /*
- * Find or create a hashtable entry for the tuple group containing the current
- * tuple (already set in tmpcontext's outertuple slot), in the current grouping
- * set (which the caller must have selected - note that initialize_aggregate
- * depends on this).
- *
- * When called, CurrentMemoryContext should be the per-query context. The
- * already-calculated hash value for the tuple must be specified.
- *
- * If in "spill mode", then only find existing hashtable entries; don't create
- * new ones. If a tuple's group is not already present in the hash table for
- * the current grouping set, assign *in_hash_table=false and the caller will
- * spill it to disk.
+ * Initialize a freshly-created TupleHashEntry.
  */
-static AggStatePerGroup
-lookup_hash_entry(AggState *aggstate, uint32 hash, bool *in_hash_table)
+static void
+initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
+					  TupleHashEntry entry)
 {
-	AggStatePerHash perhash = &aggstate->perhash[aggstate->current_set];
-	TupleTableSlot *hashslot = perhash->hashslot;
-	TupleHashEntryData *entry;
-	bool		isnew = false;
-	bool	   *p_isnew;
+	AggStatePerGroup pergroup;
+	int			transno;
 
-	/* if hash table already spilled, don't create new entries */
-	p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
+	aggstate->hash_ngroups_current++;
+	hash_agg_check_limits(aggstate);
 
-	/* find or create the hashtable entry using the filtered tuple */
-	entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, p_isnew,
-									 hash);
+	/* no need to allocate or initialize per-group state */
+	if (aggstate->numtrans == 0)
+		return;
 
-	if (entry == NULL)
+	pergroup = (AggStatePerGroup)
+		MemoryContextAlloc(hashtable->tablecxt,
+						   sizeof(AggStatePerGroupData) * aggstate->numtrans);
+
+	entry->additional = pergroup;
+
+	/*
+	 * Initialize aggregates for new tuple group, lookup_hash_entries()
+	 * already has selected the relevant grouping set.
+	 */
+	for (transno = 0; transno < aggstate->numtrans; transno++)
 	{
-		*in_hash_table = false;
-		return NULL;
+		AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+		AggStatePerGroup pergroupstate = &pergroup[transno];
+
+		initialize_aggregate(aggstate, pertrans, pergroupstate);
 	}
-	else
-		*in_hash_table = true;
-
-	if (isnew)
-	{
-		AggStatePerGroup pergroup;
-		int			transno;
-
-		aggstate->hash_ngroups_current++;
-		hash_agg_check_limits(aggstate);
-
-		/* no need to allocate or initialize per-group state */
-		if (aggstate->numtrans == 0)
-			return NULL;
-
-		pergroup = (AggStatePerGroup)
-			MemoryContextAlloc(perhash->hashtable->tablecxt,
-							   sizeof(AggStatePerGroupData) * aggstate->numtrans);
-
-		entry->additional = pergroup;
-
-		/*
-		 * Initialize aggregates for new tuple group, lookup_hash_entries()
-		 * already has selected the relevant grouping set.
-		 */
-		for (transno = 0; transno < aggstate->numtrans; transno++)
-		{
-			AggStatePerTrans pertrans = &aggstate->pertrans[transno];
-			AggStatePerGroup pergroupstate = &pergroup[transno];
-
-			initialize_aggregate(aggstate, pertrans, pergroupstate);
-		}
-	}
-
-	return entry->additional;
 }
 
 /*
@@ -2072,21 +2038,37 @@ static void
 lookup_hash_entries(AggState *aggstate)
 {
 	AggStatePerGroup *pergroup = aggstate->hash_pergroup;
+	TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
 	int			setno;
 
 	for (setno = 0; setno < aggstate->num_hashes; setno++)
 	{
 		AggStatePerHash perhash = &aggstate->perhash[setno];
+		TupleHashTable hashtable = perhash->hashtable;
+		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashEntry entry;
 		uint32		hash;
-		bool		in_hash_table;
+		bool		isnew = false;
+		bool	   *p_isnew;
+
+		/* if hash table already spilled, don't create new entries */
+		p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		select_current_set(aggstate, setno, true);
-		prepare_hash_slot(aggstate);
-		hash = TupleHashTableHash(perhash->hashtable, perhash->hashslot);
-		pergroup[setno] = lookup_hash_entry(aggstate, hash, &in_hash_table);
+		prepare_hash_slot(perhash,
+						  outerslot,
+						  hashslot);
 
-		/* check to see if we need to spill the tuple for this grouping set */
-		if (!in_hash_table)
+		entry = LookupTupleHashEntry(hashtable, hashslot,
+									 p_isnew, &hash);
+
+		if (entry != NULL)
+		{
+			if (isnew)
+				initialize_hash_entry(aggstate, hashtable, entry);
+			pergroup[setno] = entry->additional;
+		}
+		else
 		{
 			HashAggSpill *spill = &aggstate->hash_spills[setno];
 			TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
@@ -2097,6 +2079,7 @@ lookup_hash_entries(AggState *aggstate)
 								   aggstate->hashentrysize);
 
 			hashagg_spill_tuple(spill, slot, hash);
+			pergroup[setno] = NULL;
 		}
 	}
 }
@@ -2554,6 +2537,7 @@ static bool
 agg_refill_hash_table(AggState *aggstate)
 {
 	HashAggBatch *batch;
+	AggStatePerHash perhash;
 	HashAggSpill spill;
 	HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
 	uint64		ngroups_estimate;
@@ -2605,6 +2589,8 @@ agg_refill_hash_table(AggState *aggstate)
 
 	select_current_set(aggstate, batch->setno, true);
 
+	perhash = &aggstate->perhash[aggstate->current_set];
+
 	/*
 	 * Spilled tuples are always read back as MinimalTuples, which may be
 	 * different from the outer plan, so recompile the aggregate expressions.
@@ -2618,10 +2604,13 @@ agg_refill_hash_table(AggState *aggstate)
 							 HASHAGG_READ_BUFFER_SIZE);
 	for (;;)
 	{
-		TupleTableSlot *slot = aggstate->hash_spill_slot;
-		MinimalTuple tuple;
-		uint32		hash;
-		bool		in_hash_table;
+		TupleTableSlot *spillslot = aggstate->hash_spill_slot;
+		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashEntry	entry;
+		MinimalTuple	tuple;
+		uint32			hash;
+		bool			isnew = false;
+		bool		   *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -2629,16 +2618,20 @@ agg_refill_hash_table(AggState *aggstate)
 		if (tuple == NULL)
 			break;
 
-		ExecStoreMinimalTuple(tuple, slot, true);
-		aggstate->tmpcontext->ecxt_outertuple = slot;
+		ExecStoreMinimalTuple(tuple, spillslot, true);
+		aggstate->tmpcontext->ecxt_outertuple = spillslot;
 
-		prepare_hash_slot(aggstate);
-		aggstate->hash_pergroup[batch->setno] =
-			lookup_hash_entry(aggstate, hash, &in_hash_table);
+		prepare_hash_slot(perhash,
+						  aggstate->tmpcontext->ecxt_outertuple,
+						  hashslot);
+		entry = LookupTupleHashEntryHash(
+			perhash->hashtable, hashslot, p_isnew, hash);
 
-		if (in_hash_table)
+		if (entry != NULL)
 		{
-			/* Advance the aggregates (or combine functions) */
+			if (isnew)
+				initialize_hash_entry(aggstate, perhash->hashtable, entry);
+			aggstate->hash_pergroup[batch->setno] = entry->additional;
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2654,7 +2647,9 @@ agg_refill_hash_table(AggState *aggstate)
 								   ngroups_estimate, aggstate->hashentrysize);
 			}
 			/* no memory for a new group, spill */
-			hashagg_spill_tuple(&spill, slot, hash);
+			hashagg_spill_tuple(&spill, spillslot, hash);
+
+			aggstate->hash_pergroup[batch->setno] = NULL;
 		}
 
 		/*
diff --git i/src/backend/executor/nodeRecursiveunion.c w/src/backend/executor/nodeRecursiveunion.c
index 620414a1edc..046242682f0 100644
--- i/src/backend/executor/nodeRecursiveunion.c
+++ w/src/backend/executor/nodeRecursiveunion.c
@@ -94,7 +94,7 @@ ExecRecursiveUnion(PlanState *pstate)
 			if (plan->numCols > 0)
 			{
 				/* Find or build hashtable entry for this tuple's group */
-				LookupTupleHashEntry(node->hashtable, slot, &isnew);
+				LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 				/* Must reset temp context after each hashtable lookup */
 				MemoryContextReset(node->tempContext);
 				/* Ignore tuple if already seen */
@@ -141,7 +141,7 @@ ExecRecursiveUnion(PlanState *pstate)
 		if (plan->numCols > 0)
 		{
 			/* Find or build hashtable entry for this tuple's group */
-			LookupTupleHashEntry(node->hashtable, slot, &isnew);
+			LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 			/* Must reset temp context after each hashtable lookup */
 			MemoryContextReset(node->tempContext);
 			/* Ignore tuple if already seen */
diff --git i/src/backend/executor/nodeSetOp.c w/src/backend/executor/nodeSetOp.c
index bfd148a41a2..8d4ccff19cc 100644
--- i/src/backend/executor/nodeSetOp.c
+++ w/src/backend/executor/nodeSetOp.c
@@ -381,7 +381,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 			/* Find or build hashtable entry for this tuple's group */
 			entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
-										 &isnew);
+										 &isnew, NULL);
 
 			/* If new tuple group, initialize counts */
 			if (isnew)
@@ -402,7 +402,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 			/* For tuples not seen previously, do not make hashtable entry */
 			entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
-										 NULL);
+										 NULL, NULL);
 
 			/* Advance the counts if entry is already present */
 			if (entry)
diff --git i/src/backend/executor/nodeSubplan.c w/src/backend/executor/nodeSubplan.c
index 298b7757f57..38c2fc0b50b 100644
--- i/src/backend/executor/nodeSubplan.c
+++ w/src/backend/executor/nodeSubplan.c
@@ -595,12 +595,12 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
 		 */
 		if (slotNoNulls(slot))
 		{
-			(void) LookupTupleHashEntry(node->hashtable, slot, &isnew);
+			(void) LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
 			node->havehashrows = true;
 		}
 		else if (node->hashnulls)
 		{
-			(void) LookupTupleHashEntry(node->hashnulls, slot, &isnew);
+			(void) LookupTupleHashEntry(node->hashnulls, slot, &isnew, NULL);
 			node->havenullrows = true;
 		}
 
diff --git i/src/backend/optimizer/plan/createplan.c w/src/backend/optimizer/plan/createplan.c
index eb9543f6add..0778ff29ab4 100644
--- i/src/backend/optimizer/plan/createplan.c
+++ w/src/backend/optimizer/plan/createplan.c
@@ -2124,9 +2124,11 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path)
 	 */
 	flags = CP_LABEL_TLIST;
 
+#if 0
 	/* ensure small tlist for hash aggregate */
 	if (best_path->aggstrategy == AGG_HASHED)
 		flags |= CP_SMALL_TLIST;
+#endif
 
 	subplan = create_plan_recurse(root, best_path->subpath, flags);
 
