Original thread is:
https://www.postgresql.org/message-id/flat/196f1e1a-5464-ed07-ab3c-0c9920564af7%40postgrespro.ru
Following Yugo's advice, I have splitted this patch into two:
1. Extending auto_explain extension to generate extended statistics in
case of bad selectivity estimation.
2. Taken in account extended statistics when computing join selectivity.
Now this thread will contain only patches for join selectivity estimation.
However,
IIUC, the clausesel patch uses only functional dependencies statistics for
improving join, so my question was about possibility to consider MCV in the
clausesel patch.
Sorry, do not have idea right now how to use MCV for better estimation
of join selectivity.
--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index d263ecf..5446ad5 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -25,6 +25,8 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
+#include "statistics/statistics.h"
+#include "catalog/pg_statistic_ext.h"
/*
* Data structure for accumulating info about possible range-query
@@ -56,6 +58,47 @@ static Selectivity clauselist_selectivity_or(PlannerInfo *root,
****************************************************************************/
/*
+ * Find functional dependency between attributes using multicolumn statistic.
+ * relid: index of relation to which all considered attributes belong
+ * var: variable which dependencies are inspected
+ * attnums: set of considered attributes included specified variables
+ * This function return degree of strongest dependency between some subset of this attributes
+ * and specified variable or 0.0 if on dependency is found.
+ */
+double
+find_var_dependency(PlannerInfo *root, Index relid, Var *var, Bitmapset *attnums)
+{
+ RelOptInfo* rel = find_base_rel(root, relid);
+ if (rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+ {
+ StatisticExtInfo *stat = choose_best_statistics(rel->statlist, STATS_EXT_DEPENDENCIES,
+ &attnums, 1);
+ if (stat != NULL)
+ {
+ MVDependencies *dependencies = statext_dependencies_load(stat->statOid);
+ MVDependency *strongest = NULL;
+ int i;
+ for (i = 0; i < dependencies->ndeps; i++)
+ {
+ MVDependency *dependency = dependencies->deps[i];
+ int n_dep_vars = dependency->nattributes - 1;
+ /* Dependency implies attribute */
+ if (var->varattno == dependency->attributes[n_dep_vars])
+ {
+ while (--n_dep_vars >= 0
+ && bms_is_member(dependency->attributes[n_dep_vars], attnums));
+ if (n_dep_vars < 0 && (!strongest || strongest->degree < dependency->degree))
+ strongest = dependency;
+ }
+ }
+ if (strongest)
+ return strongest->degree;
+ }
+ }
+ return 0.0;
+}
+
+/*
* clauselist_selectivity -
* Compute the selectivity of an implicitly-ANDed list of boolean
* expression clauses. The list can be empty, in which case 1.0
@@ -129,6 +172,9 @@ clauselist_selectivity_ext(PlannerInfo *root,
RangeQueryClause *rqlist = NULL;
ListCell *l;
int listidx;
+ Bitmapset *clauses_attnums = NULL;
+ int n_clauses_attnums = 0;
+ int innerRelid = varRelid;
/*
* If there's exactly one clause, just go directly to
@@ -139,6 +185,9 @@ clauselist_selectivity_ext(PlannerInfo *root,
varRelid, jointype, sjinfo,
use_extended_stats);
+ if (innerRelid == 0 && sjinfo)
+ bms_get_singleton_member(sjinfo->min_righthand, &innerRelid);
+
/*
* Determine if these clauses reference a single relation. If so, and if
* it has extended statistics, try to apply those.
@@ -171,7 +220,6 @@ clauselist_selectivity_ext(PlannerInfo *root,
Node *clause = (Node *) lfirst(l);
RestrictInfo *rinfo;
Selectivity s2;
-
listidx++;
/*
@@ -204,6 +252,7 @@ clauselist_selectivity_ext(PlannerInfo *root,
else
rinfo = NULL;
+
/*
* See if it looks like a restriction clause with a pseudoconstant on
* one side. (Anything more complicated than that might not behave in
@@ -215,6 +264,42 @@ clauselist_selectivity_ext(PlannerInfo *root,
OpExpr *expr = (OpExpr *) clause;
bool varonleft = true;
bool ok;
+ int oprrest = get_oprrest(expr->opno);
+
+ /* Try to take in account functional dependencies between attributes */
+ if (oprrest == F_EQSEL && rinfo != NULL && innerRelid != 0)
+ {
+ Var* var = (Var*)linitial(expr->args);
+ if (!IsA(var, Var) || var->varno != innerRelid)
+ {
+ var = (Var*)lsecond(expr->args);
+ }
+ if (IsA(var, Var) && var->varattno >= 0 && var->varno == innerRelid)
+ {
+ clauses_attnums = bms_add_member(clauses_attnums, var->varattno);
+ if (n_clauses_attnums++ != 0)
+ {
+ double dep = find_var_dependency(root, innerRelid, var, clauses_attnums);
+ if (dep != 0.0)
+ {
+ /*
+ * Replace s2 with the conditional probability s2 given s1, computed
+ * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
+ *
+ * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
+ *
+ * where P(a) = s1, the selectivity of the implying attributes, and
+ * P(b) = s2, the selectivity of the implied attribute.
+ */
+ if (s1 <= s2)
+ s1 *= dep + (1 - dep) * s2;
+ else
+ s1 *= dep * s2 / s1 + (1 - dep) * s2;
+ continue;
+ }
+ }
+ }
+ }
if (rinfo)
{
@@ -240,7 +325,7 @@ clauselist_selectivity_ext(PlannerInfo *root,
* selectivity in generically. But if it's the right oprrest,
* add the clause to rqlist for later processing.
*/
- switch (get_oprrest(expr->opno))
+ switch (oprrest)
{
case F_SCALARLTSEL:
case F_SCALARLESEL:
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index aab06c7..74ceaf5 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -4790,6 +4790,30 @@ get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
return nrows;
}
+
+/*
+ * Try to find dependency between variables.
+ * var: varaibles which dependencies are considered
+ * clause_vars: list of variables used in other clauses
+ * This functions return strongest dependency and some subset of variables from the same relation
+ * or 0.0 if no dependency was found.
+ */
+static double
+var_depends_on(PlannerInfo *root, Var* var, List* clause_vars)
+{
+ ListCell* lc;
+ Bitmapset *attnums = NULL;
+ Index relid = var->varno;
+
+ foreach (lc, clause_vars)
+ {
+ Var* join_var = (Var*)lfirst(lc);
+ if (join_var->varno == relid && join_var->varattno >= 0)
+ attnums = bms_add_member(attnums, join_var->varattno);
+ }
+ return attnums ? find_var_dependency(root, relid, var, bms_add_member(attnums, var->varattno)) : 0.0;
+}
+
/*
* calc_joinrel_size_estimate
* Workhorse for set_joinrel_size_estimates and
@@ -4886,6 +4910,39 @@ calc_joinrel_size_estimate(PlannerInfo *root,
pselec = 0.0; /* not used, keep compiler quiet */
}
+ /* Try to take in account functional dependencies between attributes of clauses pushed-down to joined relations and
+ * retstrictlist clause. Right now we consider only case of restrictlist consists of one clause.
+ */
+ if (list_length(restrictlist) == 1)
+ {
+ RestrictInfo* rinfo = linitial(restrictlist);
+ Expr* clause = rinfo->clause;
+
+ Assert(IsA(rinfo, RestrictInfo));
+
+ if (is_opclause(clause))
+ {
+ OpExpr *expr = (OpExpr *) clause;
+ ListCell* lc;
+ List* join_vars = NULL;
+
+ /* Get list of all attributes in pushed-down clauses */
+ foreach (lc, outer_rel->baserestrictinfo)
+ join_vars = list_concat(join_vars, pull_vars_of_level((Node*)((RestrictInfo*)lfirst(lc))->clause, 0));
+ foreach (lc, inner_rel->baserestrictinfo)
+ join_vars = list_concat(join_vars, pull_vars_of_level((Node*)((RestrictInfo*)lfirst(lc))->clause, 0));
+
+ foreach (lc, expr->args)
+ {
+ Var *var = (Var*) lfirst(lc);
+ if (IsA(var, Var) && var->varattno >= 0)
+ {
+ double dep = var_depends_on(root, var, join_vars);
+ jselec = jselec*(1.0 - dep) + dep;
+ }
+ }
+ }
+ }
/*
* Basically, we multiply size of Cartesian product by selectivity.
*
diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h
index 0673887..f7d91e7 100644
--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@@ -53,4 +53,6 @@ extern void CommuteOpExpr(OpExpr *clause);
extern Query *inline_set_returning_function(PlannerInfo *root,
RangeTblEntry *rte);
+extern double find_var_dependency(PlannerInfo *root, Index relid, Var *var, Bitmapset *attnums);
+
#endif /* CLAUSES_H */