Following up on my previous messages about optimizing eqjoinsel() and
eqjoinsel_semi() for Var1 = Var2 clauses, I’d like to share detailed
profiling results showing the effect of the patch on JOB for different
values of default_statistics_target.
The first table shows the total planner time (summed over all 113
queries) before and after applying the patch, along with the speedup
achieved:
default_statistics_target | Planner Speedup (×) | Planner Before (ms) |
Planner After (ms)
--------------------------+---------------------+---------------------+--------------------
100 | *1.00x* | 1828.433 | 1820.556
1000 | *1.12x* | 2194.282 | 1963.110
2500 | *2.15x* | 4606.705 | 2140.126
5000 | *6.37x* | 16661.581 | 2616.109
7500 | *11.76x* | 35988.569 |
3061.161
10000 | *19.01x* | 66616.620 |
3504.144
The second table shows the profiling of eqjoinsel() using *perf*,
demonstrating that the function, which dominates planning at high
statistics targets, becomes essentially negligible after the patch:
default_statistics_target | eqjoinsel() Before (perf) | eqjoinsel()
After (perf)
--------------------------+---------------------------+--------------------------
100 | 0.01%
| 0.04%
1000 | 6.23%
| 0.06%
2500 | 35.45%
| 0.23%
5000 | 66.14%
| 0.53%
7500 | 72.70%
| 0.97%
10000 | 75.42%
| 1.25%
I’ve attached v3 of the patch. This version adds a check for NULL values
when comparing MCV entries, ensuring correctness in edge cases.
--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC,
https://tantorlabs.com
From 65757822257e13b1e2a50bf86b9080ed8b86adec Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdoki...@tantorlabs.ru>
Date: Wed, 3 Sep 2025 19:40:50 +0300
Subject: [PATCH v3] Optimize selectivity estimation for Var = Var clauses
using merge-based MCV comparison
Previously, selectivity estimation for join and semi-join clauses of the form
Var = Var relied on a nested-loop comparison of MCV lists, resulting in O(N^2)
behavior with respect to the number of common values.
This patch introduces a merge-based strategy when the underlying type supports
BTREE ordering. In such cases, the MCVs are sorted using the datatype-specific
comparison operator, and selectivity is estimated via linear-time merge scan.
---
src/backend/utils/adt/selfuncs.c | 311 ++++++++++++++++++++++------
src/backend/utils/cache/lsyscache.c | 61 ++++++
src/include/utils/lsyscache.h | 1 +
src/include/utils/selfuncs.h | 8 +
4 files changed, 321 insertions(+), 60 deletions(-)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1c480cfaaf7..400c5fdcb9b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -217,6 +217,7 @@ static bool get_actual_variable_endpoint(Relation heapRel,
static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
static double btcost_correlation(IndexOptInfo *index,
VariableStatData *vardata);
+static int compare_mcv_items(const void *a, const void *b, void *arg);
/*
@@ -2463,7 +2464,6 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
* of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
*/
LOCAL_FCINFO(fcinfo, 2);
- FmgrInfo eqproc;
bool *hasmatch1;
bool *hasmatch2;
double nullfrac1 = stats1->stanullfrac;
@@ -2479,52 +2479,139 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
totalsel2;
int i,
nmatches;
-
- fmgr_info(opfuncoid, &eqproc);
-
- /*
- * Save a few cycles by setting up the fcinfo struct just once. Using
- * FunctionCallInvoke directly also avoids failure if the eqproc
- * returns NULL, though really equality functions should never do
- * that.
- */
- InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
- NULL, NULL);
- fcinfo->args[0].isnull = false;
- fcinfo->args[1].isnull = false;
+ Oid compare_func_oid = InvalidOid;
hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool));
/*
* Note we assume that each MCV will match at most one member of the
- * other MCV list. If the operator isn't really equality, there could
- * be multiple matches --- but we don't look for them, both for speed
- * and because the math wouldn't add up...
+ * other MCV list.
*/
matchprodfreq = 0.0;
nmatches = 0;
- for (i = 0; i < sslot1->nvalues; i++)
+ if (get_comparison_op_for_sort(opfuncoid, &compare_func_oid))
{
- int j;
+ LOCAL_FCINFO(merge_fcinfo, 2);
+ FmgrInfo cmpproc;
+ McvItem *mcvs1,
+ *mcvs2;
+ int ptr1,
+ ptr2;
+
+ fmgr_info(compare_func_oid, &cmpproc);
+
+ InitFunctionCallInfoData(*merge_fcinfo, &cmpproc, 2,
+ collation, NULL, NULL);
+
+ mcvs1 = (McvItem *) palloc(sslot1->nvalues * sizeof(McvItem));
+ mcvs2 = (McvItem *) palloc(sslot2->nvalues * sizeof(McvItem));
- fcinfo->args[0].value = sslot1->values[i];
+ /*
+ * Copy the MCV values and their frequencies into a temporary array
+ * so we can safely sort them without altering the original statistics.
+ */
+ for (i = 0; i < sslot1->nvalues; i++)
+ {
+ mcvs1[i].value = sslot1->values[i];
+ mcvs1[i].original_idx = i;
+ mcvs1[i].frequency = sslot1->numbers[i];
+ }
- for (j = 0; j < sslot2->nvalues; j++)
+ for (i = 0; i < sslot2->nvalues; i++)
{
- Datum fresult;
+ mcvs2[i].value = sslot2->values[i];
+ mcvs2[i].original_idx = i;
+ mcvs2[i].frequency = sslot2->numbers[i];
+ }
- if (hasmatch2[j])
- continue;
- fcinfo->args[1].value = sslot2->values[j];
- fcinfo->isnull = false;
- fresult = FunctionCallInvoke(fcinfo);
- if (!fcinfo->isnull && DatumGetBool(fresult))
+ qsort_arg(mcvs1, sslot1->nvalues, sizeof(McvItem), compare_mcv_items, &merge_fcinfo);
+ qsort_arg(mcvs2, sslot2->nvalues, sizeof(McvItem), compare_mcv_items, &merge_fcinfo);
+
+ ptr1 = ptr2 = 0;
+ while (ptr1 < sslot1->nvalues && ptr2 < sslot2->nvalues)
+ {
+ McvItem *item1 = &mcvs1[ptr1];
+ McvItem *item2 = &mcvs2[ptr2];
+ Datum result_datum;
+
+ merge_fcinfo->args[0].value = item1->value;
+ merge_fcinfo->args[1].value = item2->value;
+ merge_fcinfo->isnull = false;
+
+ result_datum = FunctionCallInvoke(merge_fcinfo);
+ if (!merge_fcinfo->isnull)
{
- hasmatch1[i] = hasmatch2[j] = true;
- matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
- nmatches++;
- break;
+ int32 cmp_result = DatumGetInt32(result_datum);
+ if (cmp_result == 0)
+ {
+ hasmatch1[item1->original_idx] = true;
+ hasmatch2[item2->original_idx] = true;
+
+ matchprodfreq += item1->frequency * item2->frequency;
+ nmatches++;
+
+ ptr1++;
+ ptr2++;
+ }
+ else if (cmp_result < 0)
+ ptr1++;
+ else
+ ptr2++;
+ }
+ else
+ {
+ ptr1++;
+ ptr2++;
+ }
+ }
+
+ pfree(mcvs1);
+ pfree(mcvs2);
+ }
+ else
+ {
+ FmgrInfo eqproc;
+
+ fmgr_info(opfuncoid, &eqproc);
+ /*
+ * Save a few cycles by setting up the fcinfo struct just once. Using
+ * FunctionCallInvoke directly also avoids failure if the eqproc
+ * returns NULL, though really equality functions should never do
+ * that.
+ */
+ InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
+ NULL, NULL);
+ fcinfo->args[0].isnull = false;
+ fcinfo->args[1].isnull = false;
+
+ /*
+ * If the operator isn't really equality, there could
+ * be multiple matches --- but we don't look for them, both for speed
+ * and because the math wouldn't add up...
+ */
+ for (i = 0; i < sslot1->nvalues; i++)
+ {
+ int j;
+
+ fcinfo->args[0].value = sslot1->values[i];
+
+ for (j = 0; j < sslot2->nvalues; j++)
+ {
+ Datum fresult;
+
+ if (hasmatch2[j])
+ continue;
+ fcinfo->args[1].value = sslot2->values[j];
+ fcinfo->isnull = false;
+ fresult = FunctionCallInvoke(fcinfo);
+ if (!fcinfo->isnull && DatumGetBool(fresult))
+ {
+ hasmatch1[i] = hasmatch2[j] = true;
+ matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
+ nmatches++;
+ break;
+ }
}
}
}
@@ -2690,7 +2777,6 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
* in a skewed distribution this gives us a big leg up in accuracy.
*/
LOCAL_FCINFO(fcinfo, 2);
- FmgrInfo eqproc;
bool *hasmatch1;
bool *hasmatch2;
double nullfrac1 = stats1->stanullfrac;
@@ -2700,6 +2786,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
int i,
nmatches,
clamped_nvalues2;
+ Oid compare_func_oid = InvalidOid;
/*
* The clamping above could have resulted in nd2 being less than
@@ -2710,19 +2797,6 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
*/
clamped_nvalues2 = Min(sslot2->nvalues, nd2);
- fmgr_info(opfuncoid, &eqproc);
-
- /*
- * Save a few cycles by setting up the fcinfo struct just once. Using
- * FunctionCallInvoke directly also avoids failure if the eqproc
- * returns NULL, though really equality functions should never do
- * that.
- */
- InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
- NULL, NULL);
- fcinfo->args[0].isnull = false;
- fcinfo->args[1].isnull = false;
-
hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
@@ -2733,26 +2807,120 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
* and because the math wouldn't add up...
*/
nmatches = 0;
- for (i = 0; i < sslot1->nvalues; i++)
+ if (get_comparison_op_for_sort(opfuncoid, &compare_func_oid))
{
- int j;
+ LOCAL_FCINFO(merge_fcinfo, 2);
+ FmgrInfo cmpproc;
+ McvItem *mcvs1,
+ *mcvs2;
+ int ptr1,
+ ptr2;
+
+ fmgr_info(compare_func_oid, &cmpproc);
+ InitFunctionCallInfoData(*merge_fcinfo, &cmpproc, 2,
+ collation, NULL, NULL);
- fcinfo->args[0].value = sslot1->values[i];
+ mcvs1 = (McvItem *) palloc(sslot1->nvalues * sizeof(McvItem));
+ mcvs2 = (McvItem *) palloc(clamped_nvalues2 * sizeof(McvItem));
- for (j = 0; j < clamped_nvalues2; j++)
+ /*
+ * Copy the MCV values and their frequencies into a temporary array
+ * so we can safely sort them without altering the original statistics.
+ */
+ for (i = 0; i < sslot1->nvalues; i++)
{
- Datum fresult;
+ mcvs1[i].value = sslot1->values[i];
+ mcvs1[i].original_idx = i;
+ mcvs1[i].frequency = sslot1->numbers[i];
+ }
- if (hasmatch2[j])
- continue;
- fcinfo->args[1].value = sslot2->values[j];
- fcinfo->isnull = false;
- fresult = FunctionCallInvoke(fcinfo);
- if (!fcinfo->isnull && DatumGetBool(fresult))
+ for (i = 0; i < clamped_nvalues2; i++)
+ {
+ mcvs2[i].value = sslot2->values[i];
+ mcvs2[i].original_idx = i;
+ mcvs2[i].frequency = sslot2->numbers[i];
+ }
+
+ qsort_arg(mcvs1, sslot1->nvalues, sizeof(McvItem), compare_mcv_items, &merge_fcinfo);
+ qsort_arg(mcvs2, clamped_nvalues2, sizeof(McvItem), compare_mcv_items, &merge_fcinfo);
+
+ ptr1 = ptr2 = 0;
+ while (ptr1 < sslot1->nvalues && ptr2 < clamped_nvalues2)
+ {
+ McvItem *item1 = &mcvs1[ptr1];
+ McvItem *item2 = &mcvs2[ptr2];
+ Datum result_datum;
+
+ merge_fcinfo->args[0].value = item1->value;
+ merge_fcinfo->args[1].value = item2->value;
+ merge_fcinfo->isnull = false;
+
+ result_datum = FunctionCallInvoke(merge_fcinfo);
+ if (!merge_fcinfo->isnull)
{
- hasmatch1[i] = hasmatch2[j] = true;
- nmatches++;
- break;
+ int32 cmp_result = DatumGetInt32(result_datum);
+ if (cmp_result == 0)
+ {
+ hasmatch1[item1->original_idx] = hasmatch2[item2->original_idx] = true;
+
+ nmatches++;
+
+ ptr1++;
+ ptr2++;
+ }
+ else if (cmp_result < 0)
+ ptr1++;
+ else
+ ptr2++;
+ }
+ else
+ {
+ ptr1++;
+ ptr2++;
+ }
+ }
+
+ pfree(mcvs1);
+ pfree(mcvs2);
+ }
+ else
+ {
+ FmgrInfo eqproc;
+
+ fmgr_info(opfuncoid, &eqproc);
+ InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
+ NULL, NULL);
+
+ fcinfo->args[0].isnull = false;
+ fcinfo->args[1].isnull = false;
+
+ /*
+ * Save a few cycles by setting up the fcinfo struct just once. Using
+ * FunctionCallInvoke directly also avoids failure if the eqproc
+ * returns NULL, though really equality functions should never do
+ * that.
+ */
+ for (i = 0; i < sslot1->nvalues; i++)
+ {
+ int j;
+
+ fcinfo->args[0].value = sslot1->values[i];
+
+ for (j = 0; j < clamped_nvalues2; j++)
+ {
+ Datum fresult;
+
+ if (hasmatch2[j])
+ continue;
+ fcinfo->args[1].value = sslot2->values[j];
+ fcinfo->isnull = false;
+ fresult = FunctionCallInvoke(fcinfo);
+ if (!fcinfo->isnull && DatumGetBool(fresult))
+ {
+ hasmatch1[i] = hasmatch2[j] = true;
+ nmatches++;
+ break;
+ }
}
}
}
@@ -8793,3 +8961,26 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
*indexPages = index->pages;
}
+
+/*
+ * Comparator for sorting MCV items
+ */
+static int
+compare_mcv_items(const void *a, const void *b, void *arg)
+{
+ FunctionCallInfo fcinfo_comp = *((FunctionCallInfo *) arg);
+ Datum result_datum;
+
+ const McvItem *item1 = (const McvItem *) a;
+ const McvItem *item2 = (const McvItem *) b;
+
+ fcinfo_comp->args[0].value = item1->value;
+ fcinfo_comp->args[0].isnull = false;
+
+ fcinfo_comp->args[1].value = item2->value;
+ fcinfo_comp->args[1].isnull = false;
+
+ result_datum = FunctionCallInvoke(fcinfo_comp);
+
+ return DatumGetInt32(result_datum);
+}
\ No newline at end of file
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index fa7cd7e06a7..8d868f3a3f3 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -17,6 +17,7 @@
#include "access/hash.h"
#include "access/htup_details.h"
+#include "access/nbtree.h"
#include "bootstrap/bootstrap.h"
#include "catalog/namespace.h"
#include "catalog/pg_am.h"
@@ -40,6 +41,7 @@
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
+#include "commands/defrem.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "utils/array.h"
@@ -316,6 +318,65 @@ get_ordering_op_properties(Oid opno,
return result;
}
+/*
+ * get_comparison_op_for_sort
+ * Get the OID of a datatype-specific comparison operator
+ * associated with a given equality operator.
+ *
+ * This is typically used in clauses like Var = Var, where both arguments
+ * are of the same type. We look up the corresponding BTORDER_PROC for
+ * the default B-tree opclass of the datatype.
+ *
+ * However, for special cases like array_eq and record_eq, a comparison
+ * operator may exist (btarraycmp, btrecordcmp), but sorting is only
+ * possible if all element or field types are themselves sortable.
+ * Rather than reimplement that logic here, we simply exclude such
+ * cases by checking for btarraycmp and btrecordcmp explicitly.
+ *
+ * Returns false if no suitable comparison operator is found.
+ */
+bool
+get_comparison_op_for_sort(Oid opno, Oid *cmpopno)
+{
+ bool result = false;
+ HeapTuple proctup;
+
+ proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(opno));
+ if (HeapTupleIsValid(proctup))
+ {
+ Form_pg_proc procform = (Form_pg_proc) GETSTRUCT(proctup);
+
+ /* Only consider binary operators where both arguments are of the same type. */
+ if (procform->pronargs == 2 &&
+ OidIsValid(procform->proargtypes.values[0]) &&
+ OidIsValid(procform->proargtypes.values[1]) &&
+ procform->proargtypes.values[0] == procform->proargtypes.values[1])
+ {
+ Oid type_oid = procform->proargtypes.values[0];
+ Oid opclass_oid = GetDefaultOpClass(type_oid, BTREE_AM_OID);
+
+ if (OidIsValid(opclass_oid))
+ {
+ Oid opfamily_oid = get_opclass_family(opclass_oid);
+
+ *cmpopno = get_opfamily_proc(opfamily_oid, type_oid, type_oid, BTORDER_PROC);
+
+ /*
+ * Skip array_eq and record_eq: their comparison functions exist,
+ * but sorting is only possible if all elements or fields are sortable.
+ */
+ if (OidIsValid(*cmpopno) &&
+ *cmpopno != F_BTARRAYCMP &&
+ *cmpopno != F_BTRECORDCMP)
+ result = true;
+ }
+ }
+ ReleaseSysCache(proctup);
+ }
+
+ return result;
+}
+
/*
* get_equality_op_for_ordering_op
* Get the OID of the datatype-specific equality operator
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index c65cee4f24c..7e1188c04dd 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -80,6 +80,7 @@ extern Oid get_opfamily_member_for_cmptype(Oid opfamily, Oid lefttype, Oid right
extern bool get_ordering_op_properties(Oid opno,
Oid *opfamily, Oid *opcintype, CompareType *cmptype);
extern Oid get_equality_op_for_ordering_op(Oid opno, bool *reverse);
+extern bool get_comparison_op_for_sort(Oid opno, Oid *cmpopno);
extern Oid get_ordering_op_for_equality_op(Oid opno, bool use_lhs_type);
extern List *get_mergejoin_opfamilies(Oid opno);
extern bool get_compatible_hash_operators(Oid opno,
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index fb4fa53363d..eb42f0df930 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -138,6 +138,14 @@ typedef struct
double num_sa_scans; /* # indexscans from ScalarArrayOpExprs */
} GenericCosts;
+/* Represents an MCV statistic entry */
+typedef struct McvItem
+{
+ Datum value;
+ int original_idx; /* The original index of the item in the array before sorting. */
+ double frequency; /* Frequency of value in statistics */
+} McvItem;
+
/* Hooks for plugins to get control when we ask for stats */
typedef bool (*get_relation_stats_hook_type) (PlannerInfo *root,
RangeTblEntry *rte,
--
2.34.1