Hi, thank you for the feedback!

On 9/7/2026 14:39, Ilia Evdokimov (<[email protected]>) wrote:

> 1. mcv_can_cap() reimplements logic already present in
> dependency_is_compatible_clause(). Shall we combine the two in order to
> avoid code duplication?
>
> It's true that some of the checks that are in `mcv_can_cap()` are already
in `dependency_is_compatible_clause()`. They both return the same value
when there's a pseudo-constant and the clause is `=`, `= TRUE` or `=
FALSE`. Rest of the cases are different (IS NULL, ANY/IN, OR).

We would need to add surrounding logic because even for the matching cases,
`mcv_can_cap()` doesn't need the pseudo-constant check that
`dependency_is_compatible_clause()` requires for the `=` branch, among
other checks.
We won't gain much from combining both: the number of removed lines is
compensated with the added surrounding logic; and things will be more
convoluted.

Moreover, the two functions will evolve independently because they serve
different stats, so coupling them would create undesirable
cross-dependency. I would prefer to maintain it as it is.

2. mcv_can_cap() runs unconditionally before the if (is_or) branch, but
> can_cap is only consumed in the else/AND branch. This means mcv_can_cap() -
> including get_oprrest, syscache lookups per clause - runs for every OR
> query as wasted work. Both can_cap and covered_attnums should be moved
> inside the else branch.
>
Makes sense. I've modified it, you can check v4-0001. It only made sense to
have it there if we implement the OR path, which might be done in a future
patch.

get_ndistinct_for_keys() reimplements the ndistinct item lookup already
> present in estimate_multivariate_ndistinct(). Both functions iterate over
> MVNDistinct->items match by attributes count.
>
Right! I've created a helper and used it in both places. You can check
v4-0002.

I've checked that the patch applies cleanly and pg-ci.yml passes.

Best regards,
Enrique.
From 2cd3db834e510a57b59d14ce09e67368e4e09d72 Mon Sep 17 00:00:00 2001
From: Enrique Sanchez Cardoso <[email protected]>
Date: Sun, 7 Jun 2026 15:20:37 +0200
Subject: [PATCH v4 2/2] Use ndistinct to cap non-MCV values

When no MCV matches and ndistinct is available, apply the uniform
distribution among non-MCV combinations as an upper bound:

        (1 - mcv_totalsel) / (ndistinct - mcv_nitems)
---
 src/backend/statistics/extended_stats.c       | 73 ++++++++++++++++++-
 src/backend/statistics/mcv.c                  |  4 +-
 src/backend/statistics/mvdistinct.c           | 46 ++++++++++++
 src/backend/utils/adt/selfuncs.c              | 39 +---------
 .../statistics/extended_stats_internal.h      |  3 +-
 src/include/statistics/statistics.h           |  3 +
 src/test/regress/expected/stats_ext.out       | 21 +++++-
 src/test/regress/sql/stats_ext.sql            | 14 +++-
 8 files changed, 157 insertions(+), 46 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 2f6fabe4589..f3541a8fa0e 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1768,6 +1768,48 @@ mcv_can_cap(StatisticExtInfo *stat, Bitmapset *covered_attnums, List *stat_claus
 	return true;
 }
 
+/*
+ * get_ndistinct_for_keys
+ *		Return the ndistinct estimate for the full set of columns identified by
+ *		keys, using a matching STATS_EXT_NDISTINCT object from the relation's
+ *		statlist.
+ *
+ * Accepts both exact-match and superset statistics objects.  Returns -1.0
+ * if no matching ndistinct statistics object or item is found.
+ */
+static double
+get_ndistinct_for_keys(List *statlist, Bitmapset *keys, bool inh)
+{
+	ListCell   *lc;
+
+	foreach(lc, statlist)
+	{
+		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+		MVNDistinct *mvnd;
+		MVNDistinctItem *item;
+
+		if (info->kind != STATS_EXT_NDISTINCT || info->inherit != inh)
+			continue;
+		if (!bms_is_subset(keys, info->keys))
+			continue;
+
+		mvnd = statext_ndistinct_load(info->statOid, inh);
+		item = mvndistinct_find_item(mvnd, keys, 0);
+
+		if (item)
+		{
+			double		ndistinct = item->ndistinct;
+
+			statext_ndistinct_free(mvnd);
+			return ndistinct;
+		}
+
+		statext_ndistinct_free(mvnd);
+	}
+
+	return -1.0;
+}
+
 /*
  * statext_mcv_clauselist_selectivity
  *		Estimate clauses using the best multi-column statistics.
@@ -2062,6 +2104,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 						mcv_totalsel,
 						mcv_cap,
 						stat_sel;
+			uint32		mcv_nitems;
 
 			can_cap = mcv_can_cap(stat, covered_attnums, stat_clauses);
 			bms_free(covered_attnums);
@@ -2082,7 +2125,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 varRelid, jointype, sjinfo,
 												 rel, &mcv_basesel,
 												 &mcv_totalsel,
-												 &mcv_cap);
+												 &mcv_cap,
+												 &mcv_nitems);
 
 			/* Combine the simple and multi-column estimates. */
 			stat_sel = mcv_combine_selectivities(simple_sel,
@@ -2090,9 +2134,30 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 mcv_basesel,
 												 mcv_totalsel);
 
-			/* Cap to the least common MCV item when no MCV items matched. */
-			if (can_cap && stat_sel > mcv_cap)
-				stat_sel = mcv_cap;
+			/* Cap when no MCV items matched (mcv_sel = 0.0). */
+			if (can_cap && mcv_sel == 0.0)
+			{
+				double		ndistinct;
+
+				/* Cap to the least common MCV item. */
+				if (stat_sel > mcv_cap)
+					stat_sel = mcv_cap;
+
+				ndistinct = get_ndistinct_for_keys(rel->statlist, stat->keys, rte->inh);
+
+				if (ndistinct > (double) mcv_nitems)
+				{
+					double		non_mcv_sel = (1.0 - mcv_totalsel) / (ndistinct - (double) mcv_nitems);
+
+					/*
+					 * Cap to uniform distribution among the non-MCV
+					 * combinations. This is similar to what var_eq_const()
+					 * does for single-column MCV stats.
+					 */
+					if (stat_sel > non_mcv_sel)
+						stat_sel = non_mcv_sel;
+				}
+			}
 
 			/* Factor this into the overall result */
 			sel *= stat_sel;
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 6617c297eab..04a2c430637 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -2048,7 +2048,7 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 						   JoinType jointype, SpecialJoinInfo *sjinfo,
 						   RelOptInfo *rel,
 						   Selectivity *basesel, Selectivity *totalsel,
-						   Selectivity *cap)
+						   Selectivity *cap, uint32 *nitems)
 {
 	int			i;
 	MCVList    *mcv;
@@ -2064,6 +2064,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 	/* load the MCV list stored in the statistics object */
 	mcv = statext_mcv_load(stat->statOid, rte->inh);
 
+	*nitems = mcv->nitems;
+
 	/* build a match bitmap for the clauses */
 	matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
 								   mcv, false);
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c
index 4f8f578a22f..85462470604 100644
--- a/src/backend/statistics/mvdistinct.c
+++ b/src/backend/statistics/mvdistinct.c
@@ -336,6 +336,52 @@ statext_ndistinct_free(MVNDistinct *ndistinct)
 	pfree(ndistinct);
 }
 
+/*
+ * mvndistinct_find_item
+ *		Search an MVNDistinct for the item whose attribute set exactly matches
+ *		the supplied keys bitmap.
+ *
+ * attnum_offset is added to each item attribute number before the bitmap
+ * membership check.  Pass 0 when keys contains plain attribute numbers.
+ * Pass the offset used when building keys when expression-based statistics
+ * are involved (as in estimate_multivariate_ndistinct).
+ *
+ * Returns a pointer to the matching MVNDistinctItem, or NULL if not found.
+ */
+MVNDistinctItem *
+mvndistinct_find_item(MVNDistinct *stats, Bitmapset *keys,
+					  AttrNumber attnum_offset)
+{
+	int			nkeys = bms_num_members(keys);
+
+	for (int i = 0; i < stats->nitems; i++)
+	{
+		MVNDistinctItem *item = &stats->items[i];
+		int			j;
+
+		if (item->nattributes != nkeys)
+			continue;
+
+		/* assume it's the right item */
+		for (j = 0; j < item->nattributes; j++)
+		{
+			AttrNumber	attnum = item->attributes[j] + attnum_offset;
+
+			if (!bms_is_member(attnum, keys))
+			{
+				/* nah, it's not this item */
+				item = NULL;
+				break;
+			}
+		}
+
+		if (item)
+			return item;
+	}
+
+	return NULL;
+}
+
 /*
  * Validate a set of MVNDistincts against the extended statistics object
  * definition.
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index d6efd07073a..834bafdf0a4 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4685,7 +4685,6 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 	 */
 	if (stats)
 	{
-		int			i;
 		List	   *newlist = NIL;
 		MVNDistinctItem *item = NULL;
 		ListCell   *lc2;
@@ -4775,43 +4774,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 		}
 
 		/* Find the specific item that exactly matches the combination */
-		for (i = 0; i < stats->nitems; i++)
-		{
-			int			j;
-			MVNDistinctItem *tmpitem = &stats->items[i];
-
-			if (tmpitem->nattributes != bms_num_members(matched))
-				continue;
-
-			/* assume it's the right item */
-			item = tmpitem;
-
-			/* check that all item attributes/expressions fit the match */
-			for (j = 0; j < tmpitem->nattributes; j++)
-			{
-				AttrNumber	attnum = tmpitem->attributes[j];
-
-				/*
-				 * Thanks to how we constructed the matched bitmap above, we
-				 * can just offset all attnums the same way.
-				 */
-				attnum = attnum + attnum_offset;
-
-				if (!bms_is_member(attnum, matched))
-				{
-					/* nah, it's not this item */
-					item = NULL;
-					break;
-				}
-			}
-
-			/*
-			 * If the item has all the matched attributes, we know it's the
-			 * right one - there can't be a better one. matching more.
-			 */
-			if (item)
-				break;
-		}
+		item = mvndistinct_find_item(stats, matched, attnum_offset);
 
 		/*
 		 * Make sure we found an item. There has to be one, because ndistinct
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 01b5f67b843..1114d2870b2 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -130,7 +130,8 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
 											  RelOptInfo *rel,
 											  Selectivity *basesel,
 											  Selectivity *totalsel,
-											  Selectivity *cap);
+											  Selectivity *cap,
+											  uint32 *nitems);
 
 extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 StatisticExtInfo *stat,
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 8f9b9d237fd..e1d03ee98d4 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -95,6 +95,9 @@ typedef struct MCVList
 } MCVList;
 
 extern MVNDistinct *statext_ndistinct_load(Oid mvoid, bool inh);
+extern MVNDistinctItem *mvndistinct_find_item(MVNDistinct *stats,
+											  Bitmapset *keys,
+											  AttrNumber attnum_offset);
 extern MVDependencies *statext_dependencies_load(Oid mvoid, bool inh);
 extern MCVList *statext_mcv_load(Oid mvoid, bool inh);
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index c87b2d9f9f5..30760efe47c 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -2945,7 +2945,7 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
       1219 |      0
 (1 row)
 
-CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+CREATE STATISTICS mcv_cap_stats_mcv (mcv) ON a, b, c, d FROM mcv_cap;
 ANALYZE mcv_cap;
 -- MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
@@ -2954,6 +2954,15 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
        100 |      0
 (1 row)
 
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+-- MCV + ndistinct
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+        50 |      0
+(1 row)
+
 -- When a value IS in the MCV list, no cap path runs
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
  estimated | actual 
@@ -2975,6 +2984,16 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
       2450 |      0
 (1 row)
 
+-- MCV + superset ndistinct
+DROP STATISTICS mcv_cap_stats_nd;
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d, e FROM mcv_cap;
+ANALYZE mcv_cap;
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+        50 |      0
+(1 row)
+
 DROP TABLE mcv_cap;
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index e6cf85aa6ab..ffbf64a98af 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1483,12 +1483,18 @@ ANALYZE mcv_cap;
 -- no MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
 
-CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+CREATE STATISTICS mcv_cap_stats_mcv (mcv) ON a, b, c, d FROM mcv_cap;
 ANALYZE mcv_cap;
 
 -- MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
 
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+
+-- MCV + ndistinct
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
 -- When a value IS in the MCV list, no cap path runs
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
 
@@ -1498,6 +1504,12 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a >= 0 AND b =
 -- Capping does not apply when the query does not cover all MCV columns
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE$$);
 
+-- MCV + superset ndistinct
+DROP STATISTICS mcv_cap_stats_nd;
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d, e FROM mcv_cap;
+ANALYZE mcv_cap;
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
 DROP TABLE mcv_cap;
 
 -- check the ability to use multiple MCV lists
-- 
2.43.0

From fe16908c387c7ce09ffcadb5769a74ccf168f8ab Mon Sep 17 00:00:00 2001
From: Enrique Sanchez Cardoso <[email protected]>
Date: Wed, 3 Jun 2026 23:12:16 +0200
Subject: [PATCH v4 1/2] Cap selectivity when values are not in multi-column
 mcv

Selectivity can't be > last MCV item (least common) selectivity when
they are AND clauses and cover all the MCV dimensions.
---
 src/backend/statistics/extended_stats.c       | 82 ++++++++++++++++++-
 src/backend/statistics/mcv.c                  |  9 +-
 .../statistics/extended_stats_internal.h      |  3 +-
 src/test/regress/expected/stats_ext.out       | 48 +++++++++++
 src/test/regress/sql/stats_ext.sql            | 35 ++++++++
 5 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 2b83355d26e..2f6fabe4589 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1705,6 +1705,69 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
 	return true;
 }
 
+/*
+ * mcv_can_cap
+ *		Determines whether the MCV selectivity estimate can be capped at the
+ *		frequency of the least common item in the MCV list.
+ *
+ * When a combination of values does not appear in the MCV list, its true
+ * selectivity must be lower than the frequency of the least common tracked
+ * combination.  We can exploit this to cap the combined selectivity estimate,
+ * but only when the following conditions are both satisfied:
+ *
+ * 1. The clauses cover all dimensions of the statistics object, i.e.
+ *    covered_attnums equals stat->keys exactly.  If any dimension is
+ *    unconstrained, the absence of a match in the MCV list does not bound
+ *    the selectivity of the full combination.
+ *
+ * 2. Every clause is an equality-like condition: either an equality operator,
+ *    an IS NULL test, or a bare boolean Var.  Range or inequality predicates
+ *    can match many values, so the per-combination argument no longer applies.
+ *
+ * Returns true if both conditions hold and capping is valid.
+ */
+static bool
+mcv_can_cap(StatisticExtInfo *stat, Bitmapset *covered_attnums, List *stat_clauses)
+{
+	ListCell   *lc;
+
+	/*
+	 * Expressions are not supported, they can match multiple rows. Also, the
+	 * clauses must cover all dimensions of the MCV list.
+	 */
+	if (stat->exprs != NULL || !bms_equal(covered_attnums, stat->keys))
+	{
+		return false;
+	}
+
+	foreach(lc, stat_clauses)
+	{
+		Node	   *clause = (Node *) lfirst(lc);
+
+		if (IsA(clause, RestrictInfo))
+			clause = (Node *) ((RestrictInfo *) clause)->clause;
+
+		/* = */
+		if (is_opclause(clause) && get_oprrest(((const OpExpr *) clause)->opno) == F_EQSEL)
+			continue;
+
+		/* IS NULL */
+		if (IsA(clause, NullTest) && ((const NullTest *) clause)->nulltesttype == IS_NULL)
+			continue;
+
+		/* = TRUE */
+		if (IsA(clause, Var))
+			continue;
+
+		/* = FALSE */
+		if (IsA(clause, BoolExpr) && ((const BoolExpr *) clause)->boolop == NOT_EXPR && IsA(linitial(((const BoolExpr *) clause)->args), Var))
+			continue;
+
+		return false;
+	}
+	return true;
+}
+
 /*
  * statext_mcv_clauselist_selectivity
  *		Estimate clauses using the best multi-column statistics.
@@ -1800,6 +1863,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 		StatisticExtInfo *stat;
 		List	   *stat_clauses;
 		Bitmapset  *simple_clauses;
+		Bitmapset  *covered_attnums;
 
 		/* find the best suited statistics object for these attnums */
 		stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, rte->inh,
@@ -1822,6 +1886,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 		/* record which clauses are simple (single column or expression) */
 		simple_clauses = NULL;
 
+		/* record all attnums to check if MCV covers all of them */
+		covered_attnums = NULL;
+
 		listidx = -1;
 		foreach(l, clauses)
 		{
@@ -1872,6 +1939,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 			stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
 			*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
 
+			if (!is_or)
+				covered_attnums = bms_add_members(covered_attnums, list_attnums[listidx]);
+
 			/*
 			 * Reset the pointers, so that choose_best_statistics knows this
 			 * clause was estimated and does not consider it again.
@@ -1985,12 +2055,17 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 		}
 		else					/* Implicitly-ANDed list of clauses */
 		{
+			bool		can_cap;
 			Selectivity simple_sel,
 						mcv_sel,
 						mcv_basesel,
 						mcv_totalsel,
+						mcv_cap,
 						stat_sel;
 
+			can_cap = mcv_can_cap(stat, covered_attnums, stat_clauses);
+			bms_free(covered_attnums);
+
 			/*
 			 * "Simple" selectivity, i.e. without any extended statistics,
 			 * essentially assuming independence of the columns/clauses.
@@ -2006,7 +2081,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 			mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses,
 												 varRelid, jointype, sjinfo,
 												 rel, &mcv_basesel,
-												 &mcv_totalsel);
+												 &mcv_totalsel,
+												 &mcv_cap);
 
 			/* Combine the simple and multi-column estimates. */
 			stat_sel = mcv_combine_selectivities(simple_sel,
@@ -2014,6 +2090,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 mcv_basesel,
 												 mcv_totalsel);
 
+			/* Cap to the least common MCV item when no MCV items matched. */
+			if (can_cap && stat_sel > mcv_cap)
+				stat_sel = mcv_cap;
+
 			/* Factor this into the overall result */
 			sel *= stat_sel;
 		}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 0b7da605a4c..6617c297eab 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -2047,7 +2047,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 						   List *clauses, int varRelid,
 						   JoinType jointype, SpecialJoinInfo *sjinfo,
 						   RelOptInfo *rel,
-						   Selectivity *basesel, Selectivity *totalsel)
+						   Selectivity *basesel, Selectivity *totalsel,
+						   Selectivity *cap)
 {
 	int			i;
 	MCVList    *mcv;
@@ -2057,6 +2058,9 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 	/* match/mismatch bitmap for each MCV item */
 	bool	   *matches = NULL;
 
+	/* default: no cap on selectivity */
+	*cap = 1.0;
+
 	/* load the MCV list stored in the statistics object */
 	mcv = statext_mcv_load(stat->statOid, rte->inh);
 
@@ -2078,6 +2082,9 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 		}
 	}
 
+	if (s == 0.0 && mcv->nitems > 0)
+		*cap = mcv->items[mcv->nitems - 1].frequency;
+
 	return s;
 }
 
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index c775442f2ee..01b5f67b843 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -129,7 +129,8 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
 											  SpecialJoinInfo *sjinfo,
 											  RelOptInfo *rel,
 											  Selectivity *basesel,
-											  Selectivity *totalsel);
+											  Selectivity *totalsel,
+											  Selectivity *cap);
 
 extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 StatisticExtInfo *stat,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 37070c1a896..c87b2d9f9f5 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -2928,6 +2928,54 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0
 (1 row)
 
 DROP TABLE mcv_lists_partial;
+-- After building MCV statistics the cap limits the combined estimate,
+-- eliminating most of the over-estimation.
+CREATE TABLE mcv_cap (a INT, b INT, c BOOL, d INTEGER[], e INT) WITH (autovacuum_enabled = off);
+INSERT INTO mcv_cap
+    SELECT 0, b, TRUE, '{}', 1 FROM generate_series(1, 99) b, generate_series(1, 100) r;
+INSERT INTO mcv_cap
+    SELECT a, 0, NULL, '{1, 2}', 2 FROM generate_series(1, 99) a, generate_series(1, 100) r;
+INSERT INTO mcv_cap
+    SELECT c, c, FALSE, '{1, 1}', 3 FROM generate_series(1, 100) c;
+ANALYZE mcv_cap;
+-- no MCV
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+      1219 |      0
+(1 row)
+
+CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+-- MCV
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+       100 |      0
+(1 row)
+
+-- When a value IS in the MCV list, no cap path runs
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+-- Capping does not apply when the query includes an inequality clause
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a >= 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+      2450 |      0
+(1 row)
+
+-- Capping does not apply when the query does not cover all MCV columns
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE$$);
+ estimated | actual 
+-----------+--------
+      2450 |      0
+(1 row)
+
+DROP TABLE mcv_cap;
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 3cc6012b822..e6cf85aa6ab 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1465,6 +1465,41 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0
 
 DROP TABLE mcv_lists_partial;
 
+-- After building MCV statistics the cap limits the combined estimate,
+-- eliminating most of the over-estimation.
+CREATE TABLE mcv_cap (a INT, b INT, c BOOL, d INTEGER[], e INT) WITH (autovacuum_enabled = off);
+
+INSERT INTO mcv_cap
+    SELECT 0, b, TRUE, '{}', 1 FROM generate_series(1, 99) b, generate_series(1, 100) r;
+
+INSERT INTO mcv_cap
+    SELECT a, 0, NULL, '{1, 2}', 2 FROM generate_series(1, 99) a, generate_series(1, 100) r;
+
+INSERT INTO mcv_cap
+    SELECT c, c, FALSE, '{1, 1}', 3 FROM generate_series(1, 100) c;
+
+ANALYZE mcv_cap;
+
+-- no MCV
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
+CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+
+-- MCV
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
+-- When a value IS in the MCV list, no cap path runs
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
+
+-- Capping does not apply when the query includes an inequality clause
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a >= 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
+-- Capping does not apply when the query does not cover all MCV columns
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE$$);
+
+DROP TABLE mcv_cap;
+
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
-- 
2.43.0

Reply via email to