Hi Ilia!
On 05.09.2025 16:03, David Geier wrote:
>>> I propose an optimization: when the column datatype supports
>>> ordering(i.e., has < and >), we can sort both MCV lists and apply
>>> mege-style algorithm to detect matches. This reduces runtime from
>>> O(N^2) to O(NlogN), where N is the number of MCV entries. The patch
>>> also applies the same optimization to semi-join clauses, which show
>>> similar performance behavior.
>
> Why do you sort both lists and then merge instead of putting the smaller
> list into a hash map and then doing hash lookups (if the type is hashable)?
I've tested your and my code with the following script:
CREATE TABLE foo(col TEXT);
CREATE TABLE bar(col TEXT);
SET default_statistics_target = 10000;
-- Generate MCV values. PostgreSQL doesn't store MCVs if the table has
-- only a single value or every value has exactly the same cardinality.
DO $$
BEGIN
FOR i IN 1..10000 LOOP
FOR j IN 1..least(i, 50) LOOP
INSERT INTO foo VALUES ('aaaaaaaaaaaaaaaaaaaa' || i::TEXT);
INSERT INTO bar VALUES ('aaaaaaaaaaaaaaaaaaaa' || (i + 100000)::TEXT);
END LOOP;
END LOOP;
END;
$$;
ANALYZE foo, bar;
\timing on
EXPLAIN SELECT * FROM foo f, bar b WHERE f.col = b.col;
Results are:
- master: 433 ms
- Order+Merge: 11 ms
- Hash map: 4 ms
I've attached my draft patch.
--
David GeierFrom 7e2cf9602511ada0284503eb929e202ce0d7d354 Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Mon, 8 Sep 2025 12:06:44 +0200
Subject: [PATCH] Optimize eqoinsel_inner with hash table
---
src/backend/utils/adt/selfuncs.c | 180 +++++++++++++++++++++++++++----
1 file changed, 159 insertions(+), 21 deletions(-)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1c480cfaaf7..c171d70eb47 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -143,6 +143,8 @@
#define DEFAULT_PAGE_CPU_MULTIPLIER 50.0
+struct McvHashTable_hash;
+
/* Hooks for plugins to get control when we ask for stats */
get_relation_stats_hook_type get_relation_stats_hook = NULL;
get_index_stats_hook_type get_index_stats_hook = NULL;
@@ -217,7 +219,9 @@ static bool get_actual_variable_endpoint(Relation heapRel,
static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
static double btcost_correlation(IndexOptInfo *index,
VariableStatData *vardata);
-
+static uint32 hash_msv(struct McvHashTable_hash *hashTable, Datum key);
+static bool are_mcvs_equal(struct McvHashTable_hash *hashTable, Datum value1, Datum value2);
+static Oid get_hash_func_oid(Oid operatorOid);
/*
* eqsel - Selectivity of "=" for any data types.
@@ -2269,6 +2273,132 @@ rowcomparesel(PlannerInfo *root,
return s1;
}
+typedef struct McvHashEntry
+{
+ Datum value;
+ uint32 index;
+ uint32 hash;
+ char status; // NOLINT
+} McvHashEntry;
+
+typedef struct McvHashContext
+{
+ FmgrInfo equal_proc;
+ FmgrInfo hash_proc;
+ Oid collation;
+} McvHashContext;
+
+#define SH_PREFIX McvHashTable
+#define SH_ELEMENT_TYPE McvHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value
+#define SH_HASH_KEY(mcvs, key) hash_msv(mcvs, key)
+#define SH_EQUAL(mcvs, key0, key1) are_mcvs_equal(mcvs, key0, key1)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(mcvs, key) key->hash
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static uint32
+hash_msv(struct McvHashTable_hash *hashTable, Datum key)
+{
+ McvHashContext *context = (McvHashContext *)hashTable->private_data;
+ return DatumGetUInt32(FunctionCall1Coll(&context->hash_proc, context->collation, key));
+}
+
+static bool
+are_mcvs_equal(struct McvHashTable_hash *hashTable, Datum value1, Datum value2)
+{
+
+ /*
+ * We can safely use FunctionCall2Coll() which requires the result to never be NULL,
+ * because MCV arrays from 'pg_statistic' don't contain 'NULL' values
+ */
+ McvHashContext *context = (McvHashContext *)hashTable->private_data;
+ return DatumGetBool(FunctionCall2Coll(&context->equal_proc, context->collation, value1, value2));
+}
+
+static Oid
+get_hash_func_oid(Oid operatorOid)
+{
+ Oid hashLeft = InvalidOid;
+ Oid hashRight = InvalidOid;
+ get_op_hash_functions(operatorOid, &hashLeft, &hashRight);
+ return OidIsValid(hashLeft) && hashLeft == hashRight ? hashLeft : InvalidOid;
+}
+
+/*
+ * eqjoinsel_inner_with_hashtable
+ *
+ * Optimizes inner equality join selectivity estimation by using an O(n) algorithm based on hashing.
+ * Returns whether or not all prerequisites are met and the operation was successful.
+ * The result is used to know if to fallback to the default implementation.
+ */
+static bool
+eqjoinsel_inner_with_hashtable(Oid operatorOid, Oid collation,
+ AttStatsSlot *statsSlot1, AttStatsSlot *statsSlot2,
+ FunctionCallInfo equalFunctionCallInfo,
+ /* Output parameters: */
+ double *matchProductFrequencies, int *nMatches, bool *hasMatch1, bool *hasMatch2)
+{
+ Oid hashFuncOid;
+ AttStatsSlot *statsInner = statsSlot2;
+ AttStatsSlot *statsOuter = statsSlot1;
+ bool *hasMatchInner = hasMatch2;
+ bool *hasMatchOuter = hasMatch1;
+ struct McvHashContext hashContext;
+ McvHashTable_hash *hashTable;
+
+ if (Min(statsSlot1->nvalues, statsSlot2->nvalues) == 1)
+ return false;
+
+ hashFuncOid = get_hash_func_oid(operatorOid);
+ if (!OidIsValid(hashFuncOid))
+ return false;
+
+ /* Make sure we build the hash table on the smaller array. */
+ if (statsSlot1->nvalues < statsSlot2->nvalues)
+ {
+ statsInner = statsSlot1;
+ statsOuter = statsSlot2;
+ hasMatchInner = hasMatch1;
+ hasMatchOuter = hasMatch2;
+ }
+
+ /* 1. Create hash table of smaller 'pg_statistic' array. That's O(n). */
+ fmgr_info(get_opcode(operatorOid), &hashContext.equal_proc);
+ fmgr_info(hashFuncOid, &hashContext.hash_proc);
+ hashContext.collation = collation;
+
+ hashTable = McvHashTable_create(CurrentMemoryContext, statsInner->nvalues, &hashContext);
+
+ for (int i = 0; i < statsInner->nvalues; i++)
+ {
+ bool found = false;
+ McvHashEntry *entry = McvHashTable_insert(hashTable, statsInner->values[i], &found);
+ Assert(!found);
+ entry->index = i;
+ }
+
+ /* 2. Look-up values from other 'pg_statistic' array against hash map to find matches. */
+ for (int i = 0; i < statsOuter->nvalues; i++)
+ {
+ McvHashEntry *entry = McvHashTable_lookup(hashTable, statsOuter->values[i]);
+ if (entry != NULL)
+ {
+ hasMatchInner[entry->index] = true;
+ hasMatchOuter[i] = true;
+ *matchProductFrequencies += statsInner->numbers[entry->index] * statsOuter->numbers[i];
+ (*nMatches)++;
+ }
+ }
+
+ McvHashTable_destroy(hashTable);
+ return true;
+}
+
/*
* eqjoinsel - Join selectivity of "="
*/
@@ -2350,7 +2480,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
}
/* We need to compute the inner-join selectivity in all cases */
- selec_inner = eqjoinsel_inner(opfuncoid, collation,
+ selec_inner = eqjoinsel_inner(operator, collation,
&vardata1, &vardata2,
nd1, nd2,
isdefault1, isdefault2,
@@ -2438,7 +2568,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
* that it's worth trying to distinguish them here.
*/
static double
-eqjoinsel_inner(Oid opfuncoid, Oid collation,
+eqjoinsel_inner(Oid operator, Oid collation,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
bool isdefault1, bool isdefault2,
@@ -2480,7 +2610,7 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
int i,
nmatches;
- fmgr_info(opfuncoid, &eqproc);
+ fmgr_info(get_opcode(operator), &eqproc);
/*
* Save a few cycles by setting up the fcinfo struct just once. Using
@@ -2504,30 +2634,38 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
*/
matchprodfreq = 0.0;
nmatches = 0;
- for (i = 0; i < sslot1->nvalues; i++)
- {
- int j;
- fcinfo->args[0].value = sslot1->values[i];
-
- for (j = 0; j < sslot2->nvalues; j++)
+ if (!eqjoinsel_inner_with_hashtable(operator, collation, sslot1, sslot2,
+ fcinfo, &matchprodfreq, &nmatches,
+ hasmatch1, hasmatch2))
+ {
+ /* Fallback to O(N^2) algorithm if hash based variant didn't succeed. */
+ for (i = 0; i < sslot1->nvalues; i++)
{
- Datum fresult;
+ int j;
- if (hasmatch2[j])
- continue;
- fcinfo->args[1].value = sslot2->values[j];
- fcinfo->isnull = false;
- fresult = FunctionCallInvoke(fcinfo);
- if (!fcinfo->isnull && DatumGetBool(fresult))
+ fcinfo->args[0].value = sslot1->values[i];
+
+ for (j = 0; j < sslot2->nvalues; j++)
{
- hasmatch1[i] = hasmatch2[j] = true;
- matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
- nmatches++;
- break;
+ Datum fresult;
+
+ if (hasmatch2[j])
+ continue;
+ fcinfo->args[1].value = sslot2->values[j];
+ fcinfo->isnull = false;
+ fresult = FunctionCallInvoke(fcinfo);
+ if (!fcinfo->isnull && DatumGetBool(fresult))
+ {
+ hasmatch1[i] = hasmatch2[j] = true;
+ matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
+ nmatches++;
+ break;
+ }
}
}
}
+
CLAMP_PROBABILITY(matchprodfreq);
/* Sum up frequencies of matched and unmatched MCVs */
matchfreq1 = unmatchfreq1 = 0.0;
--
2.43.0