From 066320bc566140122bc8e9b0aca1bd7b08236515 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Wed, 7 Aug 2024 16:56:48 +1200
Subject: [PATCH v4] Use ExprStates for hashing in GROUP BY and SubPlans

This speeds up obtaining hash values for GROUP BY and for hashed
SubPlan by using the ExprState support for hashing.  This allows JIT
compilation for hash value.
---
 src/backend/executor/execExpr.c     | 149 ++++++++++++++++++++++++++++
 src/backend/executor/execGrouping.c |  82 +++++++--------
 src/backend/executor/nodeSubplan.c  |  17 +++-
 src/include/executor/executor.h     |  10 +-
 src/include/nodes/execnodes.h       |  15 ++-
 5 files changed, 212 insertions(+), 61 deletions(-)

diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c
index 45954d979f..e9cc9935fc 100644
--- a/src/backend/executor/execExpr.c
+++ b/src/backend/executor/execExpr.c
@@ -3971,6 +3971,155 @@ ExecBuildAggTransCall(ExprState *state, AggState *aggstate,
 	}
 }
 
+/*
+ * Build an ExprState that calls the given hash function(s) on the attnums
+ * given by 'keyColIdx' .  When numCols > 1, the hash values returned by each
+ * hash function are combined to produce a single hash value.
+ *
+ * desc: tuple descriptor for the to-be-hashed columns
+ * ops: TupleTableSlotOps for the TupleDesc
+ * hashfunctions: FmgrInfos for each hash function to call, one per numCols.
+ * These are used directly in the returned ExprState so must remain allocated.
+ * collations: collation to use when calling the hash function.
+ * numCols: array length of hashfunctions, collations and keyColIdx.
+ * parent: PlanState node that the resulting ExprState will be evaluated at
+ * init_value: Normally 0, but can be set to other values to seed the hash
+ * with some other value.  Using non-zero is slightly less efficient but can
+ * be useful.
+ */
+ExprState *
+ExecBuildHash32FromAttrs(TupleDesc desc, const TupleTableSlotOps *ops,
+						 FmgrInfo *hashfunctions, Oid *collations,
+						 int numCols, AttrNumber *keyColIdx,
+						 PlanState *parent, uint32 init_value)
+{
+	ExprState  *state = makeNode(ExprState);
+	ExprEvalStep scratch = {0};
+	NullableDatum *iresult = NULL;
+	intptr_t	opcode;
+	AttrNumber	last_attnum = 0;
+
+	state->parent = parent;
+
+	/*
+	 * When hashing more than 1 column or if we have an init value, we need
+	 * somewhere to store the intermediate hash value so that it's available
+	 * to be combined with the result of subsequent hashing.
+	 */
+	if (numCols > 1 || init_value != 0)
+		iresult = palloc(sizeof(NullableDatum));
+
+	/* find the highest attnum so we deform the tuple to that point */
+	for (int i = 0; i < numCols; i++)
+		last_attnum = Max(last_attnum, keyColIdx[i]);
+
+	scratch.opcode = EEOP_INNER_FETCHSOME;
+	scratch.d.fetch.last_var = last_attnum;
+	scratch.d.fetch.fixed = false;
+	scratch.d.fetch.kind = NULL;
+	scratch.d.fetch.known_desc = NULL;
+	if (ExecComputeSlotInfo(state, &scratch))
+		ExprEvalPushStep(state, &scratch);
+
+	if (init_value == 0)
+	{
+		/*
+		 * No initial value, so we can assign the result of the hash function
+		 * for the first hash_expr without having to concern ourselves with
+		 * combining the result with any initial value.
+		 */
+		opcode = EEOP_HASHDATUM_FIRST;
+	}
+	else
+	{
+		/* Set up operation to set the initial value. */
+		scratch.opcode = EEOP_HASHDATUM_SET_INITVAL;
+		scratch.d.hashdatum_initvalue.init_value = UInt32GetDatum(init_value);
+		scratch.resvalue = &iresult->value;
+		scratch.resnull = &iresult->isnull;
+
+		ExprEvalPushStep(state, &scratch);
+
+		/*
+		 * When using an initial value use the NEXT32 ops as the FIRST ops
+		 * would overwrite the stored initial value.
+		 */
+		opcode = EEOP_HASHDATUM_NEXT32;
+	}
+
+	for (int i = 0; i < numCols; i++)
+	{
+		FmgrInfo   *finfo;
+		FunctionCallInfo fcinfo;
+		Oid			inputcollid = collations[i];
+		AttrNumber	attnum = keyColIdx[i] - 1;
+
+		finfo = &hashfunctions[i];
+		fcinfo = palloc0(SizeForFunctionCallInfo(1));
+
+		/* Initialize function call parameter structure too */
+		InitFunctionCallInfoData(*fcinfo, finfo, 1, inputcollid, NULL, NULL);
+
+		/*
+		 * Fetch inner Var for this attnum and store it in the 1st arg of the
+		 * hash func.
+		 */
+		scratch.opcode = EEOP_INNER_VAR;
+		scratch.resvalue = &fcinfo->args[0].value;
+		scratch.resnull = &fcinfo->args[0].isnull;
+		scratch.d.var.attnum = attnum;
+		scratch.d.var.vartype = TupleDescAttr(desc, attnum)->atttypid;
+
+		ExprEvalPushStep(state, &scratch);
+
+		/* Call the hash function */
+		scratch.opcode = opcode;
+
+		if (i == numCols - 1)
+		{
+			/*
+			 * The result for hashing the final column is stored in the
+			 * ExprState.
+			 */
+			scratch.resvalue = &state->resvalue;
+			scratch.resnull = &state->resnull;
+		}
+		else
+		{
+			Assert(iresult != NULL);
+
+			/* intermediate values are stored in an intermediate result */
+			scratch.resvalue = &iresult->value;
+			scratch.resnull = &iresult->isnull;
+		}
+
+		/*
+		 * NEXT32 opcodes need to look at the intermediate result.  We might
+		 * as well just set this for all ops.  FIRSTs won't look at it.
+		 */
+		scratch.d.hashdatum.iresult = iresult;
+
+		scratch.d.hashdatum.finfo = finfo;
+		scratch.d.hashdatum.fcinfo_data = fcinfo;
+		scratch.d.hashdatum.fn_addr = finfo->fn_addr;
+		scratch.d.hashdatum.jumpdone = -1;
+
+		ExprEvalPushStep(state, &scratch);
+
+		/* subsequent attnums must be combined with the previous */
+		opcode = EEOP_HASHDATUM_NEXT32;
+	}
+
+	scratch.resvalue = NULL;
+	scratch.resnull = NULL;
+	scratch.opcode = EEOP_DONE;
+	ExprEvalPushStep(state, &scratch);
+
+	ExecReadyExpr(state);
+
+	return state;
+}
+
 /*
  * Build an ExprState that calls the given hash function(s) on the given
  * 'hash_exprs'.  When multiple expressions are present, the hash values
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 774e4de882..73743847d6 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -169,6 +169,7 @@ BuildTupleHashTableExt(PlanState *parent,
 	Size		hash_mem_limit;
 	MemoryContext oldcontext;
 	bool		allow_jit;
+	uint32		hash_iv = 0;
 
 	Assert(nbuckets > 0);
 
@@ -183,14 +184,13 @@ BuildTupleHashTableExt(PlanState *parent,
 
 	hashtable->numCols = numCols;
 	hashtable->keyColIdx = keyColIdx;
-	hashtable->tab_hash_funcs = hashfunctions;
 	hashtable->tab_collations = collations;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
 	hashtable->entrysize = entrysize;
 	hashtable->tableslot = NULL;	/* will be made on first lookup */
 	hashtable->inputslot = NULL;
-	hashtable->in_hash_funcs = NULL;
+	hashtable->in_hash_expr = NULL;
 	hashtable->cur_eq_func = NULL;
 
 	/*
@@ -202,9 +202,7 @@ BuildTupleHashTableExt(PlanState *parent,
 	 * underestimated.
 	 */
 	if (use_variable_hash_iv)
-		hashtable->hash_iv = murmurhash32(ParallelWorkerNumber);
-	else
-		hashtable->hash_iv = 0;
+		hash_iv = murmurhash32(ParallelWorkerNumber);
 
 	hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
 
@@ -225,6 +223,15 @@ BuildTupleHashTableExt(PlanState *parent,
 	 */
 	allow_jit = metacxt != tablecxt;
 
+	hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
+														&TTSOpsMinimalTuple,
+														hashfunctions,
+														collations,
+														numCols,
+														keyColIdx,
+														allow_jit ? parent : NULL,
+														hash_iv);
+
 	/* build comparator for all columns */
 	/* XXX: should we support non-minimal tuples for the inputslot? */
 	hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc,
@@ -316,7 +323,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 
 	/* set up data needed by hash and match functions */
 	hashtable->inputslot = slot;
-	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+	hashtable->in_hash_expr = hashtable->tab_hash_expr;
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
 	local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
@@ -342,7 +349,7 @@ TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
 	uint32		hash;
 
 	hashtable->inputslot = slot;
-	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+	hashtable->in_hash_expr = hashtable->tab_hash_expr;
 
 	/* Need to run the hash functions in short-lived context */
 	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
@@ -370,7 +377,7 @@ LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
 
 	/* set up data needed by hash and match functions */
 	hashtable->inputslot = slot;
-	hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+	hashtable->in_hash_expr = hashtable->tab_hash_expr;
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
 	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
@@ -386,14 +393,14 @@ LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
  * created if there's not a match.  This is similar to the non-creating
  * case of LookupTupleHashEntry, except that it supports cross-type
  * comparisons, in which the given tuple is not of the same type as the
- * table entries.  The caller must provide the hash functions to use for
- * the input tuple, as well as the equality functions, since these may be
+ * table entries.  The caller must provide the hash ExprState to use for
+ * the input tuple, as well as the equality ExprState, since these may be
  * different from the table's internal functions.
  */
 TupleHashEntry
 FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 				   ExprState *eqcomp,
-				   FmgrInfo *hashfunctions)
+				   ExprState *hashexpr)
 {
 	TupleHashEntry entry;
 	MemoryContext oldContext;
@@ -404,7 +411,7 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 
 	/* Set up data needed by hash and match functions */
 	hashtable->inputslot = slot;
-	hashtable->in_hash_funcs = hashfunctions;
+	hashtable->in_hash_expr = hashexpr;
 	hashtable->cur_eq_func = eqcomp;
 
 	/* Search the hash table */
@@ -421,25 +428,24 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
  * copied into the table.
  *
  * Also, the caller must select an appropriate memory context for running
- * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.)
+ * the hash functions.
  */
 static uint32
 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 							const MinimalTuple tuple)
 {
 	TupleHashTable hashtable = (TupleHashTable) tb->private_data;
-	int			numCols = hashtable->numCols;
-	AttrNumber *keyColIdx = hashtable->keyColIdx;
-	uint32		hashkey = hashtable->hash_iv;
+	uint32		hashkey;
 	TupleTableSlot *slot;
-	FmgrInfo   *hashfunctions;
-	int			i;
+	bool		isnull;
 
 	if (tuple == NULL)
 	{
 		/* Process the current input tuple for the table */
-		slot = hashtable->inputslot;
-		hashfunctions = hashtable->in_hash_funcs;
+		hashtable->exprcontext->ecxt_innertuple = hashtable->inputslot;
+		hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->in_hash_expr,
+											  hashtable->exprcontext,
+											  &isnull));
 	}
 	else
 	{
@@ -449,38 +455,18 @@ TupleHashTableHash_internal(struct tuplehash_hash *tb,
 		 * (this case never actually occurs due to the way simplehash.h is
 		 * used, as the hash-value is stored in the entries)
 		 */
-		slot = hashtable->tableslot;
+		slot = hashtable->exprcontext->ecxt_innertuple = hashtable->tableslot;
 		ExecStoreMinimalTuple(tuple, slot, false);
-		hashfunctions = hashtable->tab_hash_funcs;
-	}
-
-	for (i = 0; i < numCols; i++)
-	{
-		AttrNumber	att = keyColIdx[i];
-		Datum		attr;
-		bool		isNull;
-
-		/* combine successive hashkeys by rotating */
-		hashkey = pg_rotate_left32(hashkey, 1);
-
-		attr = slot_getattr(slot, att, &isNull);
-
-		if (!isNull)			/* treat nulls as having hash key 0 */
-		{
-			uint32		hkey;
-
-			hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i],
-													hashtable->tab_collations[i],
-													attr));
-			hashkey ^= hkey;
-		}
+		hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->tab_hash_expr,
+											  hashtable->exprcontext,
+											  &isnull));
 	}
 
 	/*
-	 * The way hashes are combined above, among each other and with the IV,
-	 * doesn't lead to good bit perturbation. As the IV's goal is to lead to
-	 * achieve that, perform a round of hashing of the combined hash -
-	 * resulting in near perfect perturbation.
+	 * The way hashes are combined with the ExprState above, among each other
+	 * and with the IV, doesn't lead to good bit perturbation. As the IV's
+	 * goal is to lead to achieve that, perform a round of hashing of the
+	 * combined hash - resulting in near perfect perturbation.
 	 */
 	return murmurhash32(hashkey);
 }
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 236222d72a..40bae49267 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -160,7 +160,7 @@ ExecHashSubPlan(SubPlanState *node,
 			FindTupleHashEntry(node->hashtable,
 							   slot,
 							   node->cur_eq_comp,
-							   node->lhs_hash_funcs) != NULL)
+							   node->lhs_hash_expr) != NULL)
 		{
 			ExecClearTuple(slot);
 			return BoolGetDatum(true);
@@ -857,7 +857,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
 	sstate->tab_eq_funcoids = NULL;
 	sstate->tab_hash_funcs = NULL;
 	sstate->tab_collations = NULL;
-	sstate->lhs_hash_funcs = NULL;
 	sstate->cur_eq_funcs = NULL;
 
 	/*
@@ -897,6 +896,7 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
 		TupleDesc	tupDescRight;
 		Oid		   *cross_eq_funcoids;
 		TupleTableSlot *slot;
+		FmgrInfo   *lhs_hash_funcs;
 		List	   *oplist,
 				   *lefttlist,
 				   *righttlist;
@@ -953,7 +953,7 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
 		sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid));
 		sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid));
 		sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
-		sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+		lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
 		sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
 		/* we'll need the cross-type equality fns below, but not in sstate */
 		cross_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid));
@@ -1003,7 +1003,7 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
 									   &left_hashfn, &right_hashfn))
 				elog(ERROR, "could not find hash function for hash operator %u",
 					 opexpr->opno);
-			fmgr_info(left_hashfn, &sstate->lhs_hash_funcs[i - 1]);
+			fmgr_info(left_hashfn, &lhs_hash_funcs[i - 1]);
 			fmgr_info(right_hashfn, &sstate->tab_hash_funcs[i - 1]);
 
 			/* Set collation */
@@ -1039,6 +1039,15 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
 													sstate->planstate,
 													NULL);
 
+		sstate->lhs_hash_expr = ExecBuildHash32FromAttrs(tupDescLeft,
+														 &TTSOpsVirtual,
+														 lhs_hash_funcs,
+														 sstate->tab_collations,
+														 sstate->numCols,
+														 sstate->keyColIdx,
+														 parent,
+														 0);
+
 		/*
 		 * Create comparator for lookups of rows in the table (potentially
 		 * cross-type comparisons).
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 69c3ebff00..e77377ff9b 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -160,7 +160,7 @@ extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
 extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
 										 TupleTableSlot *slot,
 										 ExprState *eqcomp,
-										 FmgrInfo *hashfunctions);
+										 ExprState *hashexpr);
 extern void ResetTupleHashTable(TupleHashTable hashtable);
 
 /*
@@ -289,6 +289,14 @@ extern ExprState *ExecInitCheck(List *qual, PlanState *parent);
 extern List *ExecInitExprList(List *nodes, PlanState *parent);
 extern ExprState *ExecBuildAggTrans(AggState *aggstate, struct AggStatePerPhaseData *phase,
 									bool doSort, bool doHash, bool nullcheck);
+extern ExprState *ExecBuildHash32FromAttrs(TupleDesc desc,
+										   const TupleTableSlotOps *ops,
+										   FmgrInfo *hashfunctions,
+										   Oid *collations,
+										   int numCols,
+										   AttrNumber *keyColIdx,
+										   PlanState *parent,
+										   uint32 init_value);
 extern ExprState *ExecBuildHash32Expr(TupleDesc desc,
 									  const TupleTableSlotOps *ops,
 									  const Oid *hashfunc_oids,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 182a6956bb..616f7bf270 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -795,14 +795,14 @@ typedef struct ExecAuxRowMark
  *
  * All-in-memory tuple hash tables are used for a number of purposes.
  *
- * Note: tab_hash_funcs are for the key datatype(s) stored in the table,
+ * Note: tab_hash_expr are for the key datatype(s) stored in the table,
  * and tab_eq_func are non-cross-type equality operators for those types.
- * Normally these are the only functions used, but FindTupleHashEntry()
+ * Normally these are the only ExprStates used, but FindTupleHashEntry()
  * supports searching a hashtable using cross-data-type hashing.  For that,
  * the caller must supply hash functions for the LHS datatype as well as
- * the cross-type equality operators to use.  in_hash_funcs and cur_eq_func
+ * the cross-type equality operators to use.  in_hash_expr and cur_eq_func
  * are set to point to the caller's function arrays while doing such a search.
- * During LookupTupleHashEntry(), they point to tab_hash_funcs and
+ * During LookupTupleHashEntry(), they point to tab_hash_expr and
  * tab_eq_func respectively.
  * ----------------------------------------------------------------
  */
@@ -830,7 +830,7 @@ typedef struct TupleHashTableData
 	tuplehash_hash *hashtab;	/* underlying hash table */
 	int			numCols;		/* number of columns in lookup key */
 	AttrNumber *keyColIdx;		/* attr numbers of key columns */
-	FmgrInfo   *tab_hash_funcs; /* hash functions for table datatype(s) */
+	ExprState  *tab_hash_expr;	/* ExprState for hashing table datatype(s) */
 	ExprState  *tab_eq_func;	/* comparator for table datatype(s) */
 	Oid		   *tab_collations; /* collations for hash and comparison */
 	MemoryContext tablecxt;		/* memory context containing table */
@@ -839,9 +839,8 @@ typedef struct TupleHashTableData
 	TupleTableSlot *tableslot;	/* slot for referencing table entries */
 	/* The following fields are set transiently for each table search: */
 	TupleTableSlot *inputslot;	/* current input tuple's slot */
-	FmgrInfo   *in_hash_funcs;	/* hash functions for input datatype(s) */
+	ExprState  *in_hash_expr;	/* ExprState for hashing input datatype(s) */
 	ExprState  *cur_eq_func;	/* comparator for input vs. table */
-	uint32		hash_iv;		/* hash-function IV */
 	ExprContext *exprcontext;	/* expression context */
 }			TupleHashTableData;
 
@@ -994,7 +993,7 @@ typedef struct SubPlanState
 									 * datatype(s) */
 	Oid		   *tab_collations; /* collations for hash and comparison */
 	FmgrInfo   *tab_hash_funcs; /* hash functions for table datatype(s) */
-	FmgrInfo   *lhs_hash_funcs; /* hash functions for lefthand datatype(s) */
+	ExprState  *lhs_hash_expr;	/* hash expr for lefthand datatype(s) */
 	FmgrInfo   *cur_eq_funcs;	/* equality functions for LHS vs. table */
 	ExprState  *cur_eq_comp;	/* equality comparator for LHS vs. table */
 } SubPlanState;
-- 
2.34.1