Smarter version of join selectivity patch handling cases like this:
explain select * from outer_tab join inner_tab using(x,y) where x=1;
QUERY PLAN
------------------------------------------------------------------------------------------------
Nested Loop (cost=0.42..1815.47 rows=10 width=12)
Join Filter: (outer_tab.y = inner_tab.y)
-> Seq Scan on outer_tab (cost=0.00..1791.00 rows=1 width=12)
Filter: (x = 1)
-> Index Only Scan using inner_tab_x_y_idx on inner_tab
(cost=0.42..24.35 rows=10 width=8)
Index Cond: (x = 1)
(6 rows)
--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index a9536c2..915a204 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -13,12 +13,25 @@
#include "postgres.h"
#include <limits.h>
+#include <math.h>
#include "access/parallel.h"
#include "commands/explain.h"
+#include "commands/defrem.h"
#include "executor/instrument.h"
#include "jit/jit.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
+#include "optimizer/planmain.h"
+#include "parser/parsetree.h"
+#include "storage/ipc.h"
+#include "statistics/statistics.h"
#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
PG_MODULE_MAGIC;
@@ -33,7 +46,9 @@ static bool auto_explain_log_settings = false;
static int auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
static int auto_explain_log_level = LOG;
static bool auto_explain_log_nested_statements = false;
+static bool auto_explain_suggest_only = false;
static double auto_explain_sample_rate = 1;
+static double auto_explain_add_statistics_threshold = 0.0;
static const struct config_enum_entry format_options[] = {
{"text", EXPLAIN_FORMAT_TEXT, false},
@@ -218,6 +233,30 @@ _PG_init(void)
NULL,
NULL);
+ DefineCustomRealVariable("auto_explain.add_statistics_threshold",
+ "Sets the threshold for actual/estimated #rows ratio triggering creation of multicolumn statistic for the related columns.",
+ "Zero disables implicit creation of multicolumn statistic.",
+ &auto_explain_add_statistics_threshold,
+ 0.0,
+ 0.0,
+ INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("auto_explain.suggest_only",
+ "Do not create statistic but just record in WAL suggested create statistics statement.",
+ NULL,
+ &auto_explain_suggest_only,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
EmitWarningsOnPlaceholders("auto_explain");
/* Install hooks. */
@@ -353,6 +392,230 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
PG_END_TRY();
}
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+static void
+AddMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ AddMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+static void
+AddMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ AddMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char* va = strVal((Value *) linitial(((ColumnRef *)lfirst(a))->fields));
+ char* vb = strVal((Value *) linitial(((ColumnRef *)lfirst(b))->fields));
+ return strcmp(va, vb);
+}
+
+static void
+AddMultiColumnStatisticsForQual(void* qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell* lc;
+ foreach (lc, qual)
+ {
+ Node* node = (Node*)lfirst(lc);
+ if (IsA(node, RestrictInfo))
+ node = (Node*)((RestrictInfo*)node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index varno = 0;
+ Bitmapset* colmap = NULL;
+
+ foreach (cell, vars)
+ {
+ Node* node = (Node *) lfirst(cell);
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ if (cols == NULL || var->varnoold == varno)
+ {
+ varno = var->varnoold;
+ if (var->varattno > 0 &&
+ !bms_is_member(var->varattno, colmap) &&
+ varno >= 1 &&
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, var->varattno);
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, var->varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+ if (list_length(cols) >= 2)
+ {
+ CreateStatsStmt* stats = makeNode(CreateStatsStmt);
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+ char *rel_namespace = get_namespace_name(get_rel_namespace(rte->relid));
+ char *rel_name = get_rel_name(rte->relid);
+ RangeVar* rel = makeRangeVar(rel_namespace, rel_name, 0);
+ char* stat_name = rel_name;
+ char* create_stat_stmt = (char*)"";
+ char const* sep = "ON";
+
+ list_sort(cols, vars_list_comparator);
+ /* Construct name for statistic by concatenating relation name with all columns */
+ foreach (cell, cols)
+ {
+ char* col_name = strVal((Value *) linitial(((ColumnRef *)lfirst(cell))->fields));
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+ /*
+ * Check if multicolumn statistic object with such name already exists.
+ * Most likely if was already created by auto_explain, but either ANALYZE was not performed since
+ * this time, either presence of this multicolumn statistic doesn't help to provide more precise estimation.
+ * Despite to the fact that we create statistics with "if_not_exist" option, presence of such check
+ * allows to eliminate notice message that statistics object already exists.
+ */
+ if (!SearchSysCacheExists2(STATEXTNAMENSP,
+ CStringGetDatum(stat_name),
+ ObjectIdGetDatum(get_rel_namespace(rte->relid))))
+ {
+ if (auto_explain_suggest_only)
+ {
+ elog(NOTICE, "Auto_explain suggestion: CREATE STATISTICS %s %s FROM %s", stat_name, create_stat_stmt, rel_name);
+ }
+ else
+ {
+ elog(LOG, "Add statistics %s", stat_name);
+ stats->defnames = list_make2(makeString(rel_namespace), makeString(stat_name));
+ stats->if_not_exists = true;
+ stats->relations = list_make1(rel);
+ stats->exprs = cols;
+ CreateStatistics(stats);
+ }
+ }
+ }
+ }
+}
+
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ if (planstate->instrument && plan->plan_rows != 0)
+ {
+ if (auto_explain_add_statistics_threshold != 0
+ && planstate->instrument->ntuples / plan->plan_rows >= auto_explain_add_statistics_threshold)
+ {
+ elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s", plan->plan_rows, planstate->instrument->ntuples, planstate->instrument->ntuples / plan->plan_rows, nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ AddMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ AddMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ AddMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ AddMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ AddMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ AddMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ AddMultiColumnStatisticsForQual(plan->qual, es);
+ }
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ AddMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ AddMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ AddMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_ModifyTable:
+ AddMultiColumnStatisticsForMemberNodes(((ModifyTableState *) planstate)->mt_plans,
+ ((ModifyTableState *) planstate)->mt_nplans,
+ es);
+ break;
+ case T_Append:
+ AddMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ AddMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ AddMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ AddMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ AddMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
+
/*
* ExecutorEnd hook: log results if needed
*/
@@ -392,6 +655,10 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
ExplainPrintJITSummary(es, queryDesc);
ExplainEndOutput(es);
+ /* Add multicolumn statistic if requested */
+ if (auto_explain_add_statistics_threshold && !IsParallelWorker())
+ AddMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
/* Remove last line break */
if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n')
es->str->data[--es->str->len] = '\0';
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index a9536c2..915a204 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -13,12 +13,25 @@
#include "postgres.h"
#include <limits.h>
+#include <math.h>
#include "access/parallel.h"
#include "commands/explain.h"
+#include "commands/defrem.h"
#include "executor/instrument.h"
#include "jit/jit.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
+#include "optimizer/planmain.h"
+#include "parser/parsetree.h"
+#include "storage/ipc.h"
+#include "statistics/statistics.h"
#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
PG_MODULE_MAGIC;
@@ -33,7 +46,9 @@ static bool auto_explain_log_settings = false;
static int auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
static int auto_explain_log_level = LOG;
static bool auto_explain_log_nested_statements = false;
+static bool auto_explain_suggest_only = false;
static double auto_explain_sample_rate = 1;
+static double auto_explain_add_statistics_threshold = 0.0;
static const struct config_enum_entry format_options[] = {
{"text", EXPLAIN_FORMAT_TEXT, false},
@@ -218,6 +233,30 @@ _PG_init(void)
NULL,
NULL);
+ DefineCustomRealVariable("auto_explain.add_statistics_threshold",
+ "Sets the threshold for actual/estimated #rows ratio triggering creation of multicolumn statistic for the related columns.",
+ "Zero disables implicit creation of multicolumn statistic.",
+ &auto_explain_add_statistics_threshold,
+ 0.0,
+ 0.0,
+ INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("auto_explain.suggest_only",
+ "Do not create statistic but just record in WAL suggested create statistics statement.",
+ NULL,
+ &auto_explain_suggest_only,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
EmitWarningsOnPlaceholders("auto_explain");
/* Install hooks. */
@@ -353,6 +392,230 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
PG_END_TRY();
}
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+static void
+AddMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ AddMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+static void
+AddMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ AddMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char* va = strVal((Value *) linitial(((ColumnRef *)lfirst(a))->fields));
+ char* vb = strVal((Value *) linitial(((ColumnRef *)lfirst(b))->fields));
+ return strcmp(va, vb);
+}
+
+static void
+AddMultiColumnStatisticsForQual(void* qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell* lc;
+ foreach (lc, qual)
+ {
+ Node* node = (Node*)lfirst(lc);
+ if (IsA(node, RestrictInfo))
+ node = (Node*)((RestrictInfo*)node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index varno = 0;
+ Bitmapset* colmap = NULL;
+
+ foreach (cell, vars)
+ {
+ Node* node = (Node *) lfirst(cell);
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ if (cols == NULL || var->varnoold == varno)
+ {
+ varno = var->varnoold;
+ if (var->varattno > 0 &&
+ !bms_is_member(var->varattno, colmap) &&
+ varno >= 1 &&
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, var->varattno);
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, var->varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+ if (list_length(cols) >= 2)
+ {
+ CreateStatsStmt* stats = makeNode(CreateStatsStmt);
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+ char *rel_namespace = get_namespace_name(get_rel_namespace(rte->relid));
+ char *rel_name = get_rel_name(rte->relid);
+ RangeVar* rel = makeRangeVar(rel_namespace, rel_name, 0);
+ char* stat_name = rel_name;
+ char* create_stat_stmt = (char*)"";
+ char const* sep = "ON";
+
+ list_sort(cols, vars_list_comparator);
+ /* Construct name for statistic by concatenating relation name with all columns */
+ foreach (cell, cols)
+ {
+ char* col_name = strVal((Value *) linitial(((ColumnRef *)lfirst(cell))->fields));
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+ /*
+ * Check if multicolumn statistic object with such name already exists.
+ * Most likely if was already created by auto_explain, but either ANALYZE was not performed since
+ * this time, either presence of this multicolumn statistic doesn't help to provide more precise estimation.
+ * Despite to the fact that we create statistics with "if_not_exist" option, presence of such check
+ * allows to eliminate notice message that statistics object already exists.
+ */
+ if (!SearchSysCacheExists2(STATEXTNAMENSP,
+ CStringGetDatum(stat_name),
+ ObjectIdGetDatum(get_rel_namespace(rte->relid))))
+ {
+ if (auto_explain_suggest_only)
+ {
+ elog(NOTICE, "Auto_explain suggestion: CREATE STATISTICS %s %s FROM %s", stat_name, create_stat_stmt, rel_name);
+ }
+ else
+ {
+ elog(LOG, "Add statistics %s", stat_name);
+ stats->defnames = list_make2(makeString(rel_namespace), makeString(stat_name));
+ stats->if_not_exists = true;
+ stats->relations = list_make1(rel);
+ stats->exprs = cols;
+ CreateStatistics(stats);
+ }
+ }
+ }
+ }
+}
+
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ if (planstate->instrument && plan->plan_rows != 0)
+ {
+ if (auto_explain_add_statistics_threshold != 0
+ && planstate->instrument->ntuples / plan->plan_rows >= auto_explain_add_statistics_threshold)
+ {
+ elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s", plan->plan_rows, planstate->instrument->ntuples, planstate->instrument->ntuples / plan->plan_rows, nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ AddMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ AddMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ AddMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ AddMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ AddMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ AddMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ AddMultiColumnStatisticsForQual(plan->qual, es);
+ }
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ AddMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ AddMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ AddMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_ModifyTable:
+ AddMultiColumnStatisticsForMemberNodes(((ModifyTableState *) planstate)->mt_plans,
+ ((ModifyTableState *) planstate)->mt_nplans,
+ es);
+ break;
+ case T_Append:
+ AddMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ AddMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ AddMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ AddMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ AddMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
+
/*
* ExecutorEnd hook: log results if needed
*/
@@ -392,6 +655,10 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
ExplainPrintJITSummary(es, queryDesc);
ExplainEndOutput(es);
+ /* Add multicolumn statistic if requested */
+ if (auto_explain_add_statistics_threshold && !IsParallelWorker())
+ AddMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
/* Remove last line break */
if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n')
es->str->data[--es->str->len] = '\0';
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 4bf777d..085defe 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -25,6 +25,7 @@
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
#include "statistics/statistics.h"
+#include "catalog/pg_statistic_ext.h"
/*
@@ -106,6 +107,47 @@ clauselist_selectivity(PlannerInfo *root,
}
/*
+ * Find functional dependency between attributes using multicolumn statistic.
+ * relid: index of relation to which all considered attributes belong
+ * var: variable which dependencies are inspected
+ * attnums: set of considered attributes included specified variables
+ * This function return degree of strongest dependency between some subset of this attributes
+ * and specified variable or 0.0 if on dependency is found.
+ */
+double
+find_var_dependency(PlannerInfo *root, Index relid, Var *var, Bitmapset *attnums)
+{
+ RelOptInfo* rel = find_base_rel(root, relid);
+ if (rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+ {
+ StatisticExtInfo *stat = choose_best_statistics(rel->statlist, attnums,
+ STATS_EXT_DEPENDENCIES);
+ if (stat != NULL)
+ {
+ MVDependencies *dependencies = statext_dependencies_load(stat->statOid);
+ MVDependency *strongest = NULL;
+ int i;
+ for (i = 0; i < dependencies->ndeps; i++)
+ {
+ MVDependency *dependency = dependencies->deps[i];
+ int n_dep_vars = dependency->nattributes - 1;
+ /* Dependency implies attribute */
+ if (var->varattno == dependency->attributes[n_dep_vars])
+ {
+ while (--n_dep_vars >= 0
+ && bms_is_member(dependency->attributes[n_dep_vars], attnums));
+ if (n_dep_vars < 0 && (!strongest || strongest->degree < dependency->degree))
+ strongest = dependency;
+ }
+ }
+ if (strongest)
+ return strongest->degree;
+ }
+ }
+ return 0.0;
+}
+
+/*
* clauselist_selectivity_simple -
* Compute the selectivity of an implicitly-ANDed list of boolean
* expression clauses. The list can be empty, in which case 1.0
@@ -159,6 +201,9 @@ clauselist_selectivity_simple(PlannerInfo *root,
RangeQueryClause *rqlist = NULL;
ListCell *l;
int listidx;
+ Bitmapset *clauses_attnums = NULL;
+ int n_clauses_attnums = 0;
+ int innerRelid = varRelid;
/*
* If there's exactly one clause (and it was not estimated yet), just go
@@ -170,6 +215,9 @@ clauselist_selectivity_simple(PlannerInfo *root,
return clause_selectivity(root, (Node *) linitial(clauses),
varRelid, jointype, sjinfo);
+ if (innerRelid == 0 && sjinfo)
+ bms_get_singleton_member(sjinfo->min_righthand, &innerRelid);
+
/*
* Anything that doesn't look like a potential rangequery clause gets
* multiplied into s1 and forgotten. Anything that does gets inserted into
@@ -181,7 +229,6 @@ clauselist_selectivity_simple(PlannerInfo *root,
Node *clause = (Node *) lfirst(l);
RestrictInfo *rinfo;
Selectivity s2;
-
listidx++;
/*
@@ -213,6 +260,7 @@ clauselist_selectivity_simple(PlannerInfo *root,
else
rinfo = NULL;
+
/*
* See if it looks like a restriction clause with a pseudoconstant on
* one side. (Anything more complicated than that might not behave in
@@ -224,6 +272,30 @@ clauselist_selectivity_simple(PlannerInfo *root,
OpExpr *expr = (OpExpr *) clause;
bool varonleft = true;
bool ok;
+ int oprrest = get_oprrest(expr->opno);
+
+ /* Try to take in account functional dependencies between attributes */
+ if (oprrest == F_EQSEL && rinfo != NULL && innerRelid != 0)
+ {
+ Var* var = (Var*)linitial(expr->args);
+ if (!IsA(var, Var) || var->varnoold != innerRelid)
+ {
+ var = (Var*)lsecond(expr->args);
+ }
+ if (IsA(var, Var) && var->varattno >= 0 && var->varnoold == innerRelid)
+ {
+ clauses_attnums = bms_add_member(clauses_attnums, var->varattno);
+ if (n_clauses_attnums++ != 0)
+ {
+ double dep = find_var_dependency(root, innerRelid, var, clauses_attnums);
+ if (dep != 0.0)
+ {
+ s1 *= dep + (1 - dep) * s2;
+ continue;
+ }
+ }
+ }
+ }
if (rinfo)
{
@@ -249,7 +321,7 @@ clauselist_selectivity_simple(PlannerInfo *root,
* selectivity in generically. But if it's the right oprrest,
* add the clause to rqlist for later processing.
*/
- switch (get_oprrest(expr->opno))
+ switch (oprrest)
{
case F_SCALARLTSEL:
case F_SCALARLESEL:
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c5f6593..f6714e4 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -4543,6 +4543,30 @@ get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
return nrows;
}
+
+/*
+ * Try to find dependency between variables.
+ * var: varaibles which dependencies are considered
+ * join_vars: list of variables used in other clauses
+ * This functions return strongest dependency and some subset of variables from the same relation
+ * or 0.0 if no dependency was found.
+ */
+static double
+var_depends_on(PlannerInfo *root, Var* var, List* clause_vars)
+{
+ ListCell* lc;
+ Bitmapset *attnums = NULL;
+ Index relid = var->varnoold;
+
+ foreach (lc, clause_vars)
+ {
+ Var* join_var = (Var*)lfirst(lc);
+ if (join_var->varnoold == relid && join_var->varattno >= 0)
+ attnums = bms_add_member(attnums, join_var->varattno);
+ }
+ return attnums ? find_var_dependency(root, relid, var, bms_add_member(attnums, var->varattno)) : 0.0;
+}
+
/*
* calc_joinrel_size_estimate
* Workhorse for set_joinrel_size_estimates and
@@ -4639,6 +4663,39 @@ calc_joinrel_size_estimate(PlannerInfo *root,
pselec = 0.0; /* not used, keep compiler quiet */
}
+ /* Try to take in account functional dependencies between attributes of clauses pushed-down to joined relations and
+ * retstrictlist clause. Right now we consider only case of restrictlist consists of one clause.
+ */
+ if (list_length(restrictlist) == 1)
+ {
+ RestrictInfo* rinfo = linitial(restrictlist);
+ Expr* clause = rinfo->clause;
+
+ Assert(IsA(rinfo, RestrictInfo));
+
+ if (is_opclause(clause))
+ {
+ OpExpr *expr = (OpExpr *) clause;
+ ListCell* lc;
+ List* join_vars = NULL;
+
+ /* Get list of all attributes in pushed-down clauses */
+ foreach (lc, outer_rel->baserestrictinfo)
+ join_vars = list_concat(join_vars, pull_vars_of_level((Node*)((RestrictInfo*)lfirst(lc))->clause, 0));
+ foreach (lc, inner_rel->baserestrictinfo)
+ join_vars = list_concat(join_vars, pull_vars_of_level((Node*)((RestrictInfo*)lfirst(lc))->clause, 0));
+
+ foreach (lc, expr->args)
+ {
+ Var *var = (Var*) lfirst(lc);
+ if (IsA(var, Var) && var->varattno >= 0)
+ {
+ double dep = var_depends_on(root, var, join_vars);
+ jselec = jselec*(1.0 - dep) + dep;
+ }
+ }
+ }
+ }
/*
* Basically, we multiply size of Cartesian product by selectivity.
*
diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h
index 2f9aeec..09ff0c4 100644
--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@@ -55,4 +55,6 @@ extern void CommuteOpExpr(OpExpr *clause);
extern Query *inline_set_returning_function(PlannerInfo *root,
RangeTblEntry *rte);
+extern double find_var_dependency(PlannerInfo *root, Index relid, Var *var, Bitmapset *attnums);
+
#endif /* CLAUSES_H */