On 15.10.2019 1:20, legrand legrand wrote:
Hello Konstantin,

What you have proposed regarding join_selectivity and multicolumn statistics
is a very good new !

Regarding your auto_explain modification, maybe an "advisor" mode would also
be helpfull (with auto_explain_add_statistics_threshold=-1 for exemple).
This would allow to track which missing statistic should be tested (manually
or in an other environment).

In my point of view this advice should be an option of the EXPLAIN command,
that should also permit
auto_explain module to propose "learning" phase.
Thank you for good suggestion. Advisor mode is really good idea.
I have added "auto_explain.suggest_only" GUC.
When it is switched on, suggested CREATE STATISTICS statement is just printed inĀ  log but not actually created.


--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 4bf777d..a356df9 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -25,6 +25,7 @@
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 #include "statistics/statistics.h"
+#include "catalog/pg_statistic_ext.h"
 
 
 /*
@@ -159,6 +160,9 @@ clauselist_selectivity_simple(PlannerInfo *root,
 	RangeQueryClause *rqlist = NULL;
 	ListCell   *l;
 	int			listidx;
+	Bitmapset  *clauses_attnums = NULL;
+	int			n_clauses_attnums = 0;
+	int         innerRelid = varRelid;
 
 	/*
 	 * If there's exactly one clause (and it was not estimated yet), just go
@@ -170,6 +174,9 @@ clauselist_selectivity_simple(PlannerInfo *root,
 		return clause_selectivity(root, (Node *) linitial(clauses),
 								  varRelid, jointype, sjinfo);
 
+	if (innerRelid == 0 && sjinfo)
+		bms_get_singleton_member(sjinfo->min_righthand, &innerRelid);
+
 	/*
 	 * Anything that doesn't look like a potential rangequery clause gets
 	 * multiplied into s1 and forgotten. Anything that does gets inserted into
@@ -181,7 +188,6 @@ clauselist_selectivity_simple(PlannerInfo *root,
 		Node	   *clause = (Node *) lfirst(l);
 		RestrictInfo *rinfo;
 		Selectivity s2;
-
 		listidx++;
 
 		/*
@@ -213,6 +219,7 @@ clauselist_selectivity_simple(PlannerInfo *root,
 		else
 			rinfo = NULL;
 
+
 		/*
 		 * See if it looks like a restriction clause with a pseudoconstant on
 		 * one side.  (Anything more complicated than that might not behave in
@@ -224,6 +231,55 @@ clauselist_selectivity_simple(PlannerInfo *root,
 			OpExpr	   *expr = (OpExpr *) clause;
 			bool		varonleft = true;
 			bool		ok;
+			int         oprrest = get_oprrest(expr->opno);
+
+			/* Try to take in account functional dependencies between attributes */
+			if (oprrest == F_EQSEL && rinfo != NULL && innerRelid != 0)
+			{
+				Var* var = (Var*)linitial(expr->args);
+				if (!IsA(var, Var) || var->varno != innerRelid)
+				{
+					var = (Var*)lsecond(expr->args);
+				}
+				if (IsA(var, Var) && var->varno == innerRelid)
+				{
+					clauses_attnums = bms_add_member(clauses_attnums, var->varattno);
+					if (n_clauses_attnums++ != 0)
+					{
+						RelOptInfo* rel = find_base_rel(root, innerRelid);
+						if (rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+						{
+							StatisticExtInfo *stat = choose_best_statistics(rel->statlist, clauses_attnums,
+																			STATS_EXT_DEPENDENCIES);
+							if (stat != NULL)
+							{
+								MVDependencies *dependencies = statext_dependencies_load(stat->statOid);
+								MVDependency *strongest = NULL;
+								int i;
+								for (i = 0; i < dependencies->ndeps; i++)
+								{
+									MVDependency *dependency = dependencies->deps[i];
+									int n_dep_vars = dependency->nattributes - 1;
+									/* Dependency implies attribute */
+									if (var->varattno == dependency->attributes[n_dep_vars])
+									{
+										while (--n_dep_vars >= 0
+											   && bms_is_member(dependency->attributes[n_dep_vars], clauses_attnums));
+										if (n_dep_vars < 0 && (!strongest || strongest->degree < dependency->degree))
+											strongest = dependency;
+									}
+								}
+								if (strongest)
+								{
+									Selectivity dep_sel = (strongest->degree + (1 - strongest->degree) * s1);
+									s1 = Min(dep_sel, s2);
+									continue;
+								}
+							}
+						}
+					}
+				}
+			}
 
 			if (rinfo)
 			{
@@ -249,7 +305,7 @@ clauselist_selectivity_simple(PlannerInfo *root,
 				 * selectivity in generically.  But if it's the right oprrest,
 				 * add the clause to rqlist for later processing.
 				 */
-				switch (get_oprrest(expr->opno))
+				switch (oprrest)
 				{
 					case F_SCALARLTSEL:
 					case F_SCALARLESEL:
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index a9536c2..915a204 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -13,12 +13,25 @@
 #include "postgres.h"
 
 #include <limits.h>
+#include <math.h>
 
 #include "access/parallel.h"
 #include "commands/explain.h"
+#include "commands/defrem.h"
 #include "executor/instrument.h"
 #include "jit/jit.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
+#include "optimizer/planmain.h"
+#include "parser/parsetree.h"
+#include "storage/ipc.h"
+#include "statistics/statistics.h"
 #include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
 
 PG_MODULE_MAGIC;
 
@@ -33,7 +46,9 @@ static bool auto_explain_log_settings = false;
 static int	auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
 static int	auto_explain_log_level = LOG;
 static bool auto_explain_log_nested_statements = false;
+static bool auto_explain_suggest_only = false;
 static double auto_explain_sample_rate = 1;
+static double auto_explain_add_statistics_threshold = 0.0;
 
 static const struct config_enum_entry format_options[] = {
 	{"text", EXPLAIN_FORMAT_TEXT, false},
@@ -218,6 +233,30 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
+	DefineCustomRealVariable("auto_explain.add_statistics_threshold",
+							 "Sets the threshold for actual/estimated #rows ratio triggering creation of multicolumn statistic for the related columns.",
+							 "Zero disables implicit creation of multicolumn statistic.",
+							 &auto_explain_add_statistics_threshold,
+							 0.0,
+							 0.0,
+							 INT_MAX,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	DefineCustomBoolVariable("auto_explain.suggest_only",
+							 "Do not create statistic but just record in WAL suggested create statistics statement.",
+							 NULL,
+							 &auto_explain_suggest_only,
+							 false,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
 	EmitWarningsOnPlaceholders("auto_explain");
 
 	/* Install hooks. */
@@ -353,6 +392,230 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
 	PG_END_TRY();
 }
 
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+static void
+AddMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+	ListCell   *lst;
+
+	foreach(lst, plans)
+	{
+		SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+		AddMultiColumnStatisticsForNode(sps->planstate, es);
+	}
+}
+
+static void
+AddMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+									ExplainState *es)
+{
+	int			j;
+
+	for (j = 0; j < nsubnodes; j++)
+		AddMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+	char* va = strVal((Value *) linitial(((ColumnRef *)lfirst(a))->fields));
+	char* vb = strVal((Value *) linitial(((ColumnRef *)lfirst(b))->fields));
+	return strcmp(va, vb);
+}
+
+static void
+AddMultiColumnStatisticsForQual(void* qual, ExplainState *es)
+{
+	List *vars = NULL;
+	ListCell* lc;
+	foreach (lc, qual)
+	{
+		Node* node = (Node*)lfirst(lc);
+		if (IsA(node, RestrictInfo))
+			node = (Node*)((RestrictInfo*)node)->clause;
+		vars = list_concat(vars, pull_vars_of_level(node, 0));
+	}
+	while (vars != NULL)
+	{
+		ListCell *cell;
+		List *cols = NULL;
+		Index varno = 0;
+		Bitmapset* colmap = NULL;
+
+		foreach (cell, vars)
+		{
+			Node* node = (Node *) lfirst(cell);
+			if (IsA(node, Var))
+			{
+				Var *var = (Var *) node;
+				if (cols == NULL || var->varnoold == varno)
+				{
+					varno = var->varnoold;
+					if (var->varattno > 0 &&
+						!bms_is_member(var->varattno, colmap) &&
+						varno >= 1 &&
+						varno <= list_length(es->rtable) &&
+						list_length(cols) < STATS_MAX_DIMENSIONS)
+					{
+						RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+						if (rte->rtekind == RTE_RELATION)
+						{
+							ColumnRef  *col = makeNode(ColumnRef);
+							char *colname = get_rte_attribute_name(rte, var->varattno);
+							col->fields = list_make1(makeString(colname));
+							cols = lappend(cols, col);
+							colmap = bms_add_member(colmap, var->varattno);
+						}
+					}
+				}
+				else
+				{
+					continue;
+				}
+			}
+			vars = foreach_delete_current(vars, cell);
+		}
+		if (list_length(cols) >= 2)
+		{
+			CreateStatsStmt* stats = makeNode(CreateStatsStmt);
+			RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+			char *rel_namespace = get_namespace_name(get_rel_namespace(rte->relid));
+			char *rel_name = get_rel_name(rte->relid);
+			RangeVar* rel = makeRangeVar(rel_namespace, rel_name, 0);
+			char* stat_name = rel_name;
+			char* create_stat_stmt = (char*)"";
+			char const* sep = "ON";
+
+			list_sort(cols, vars_list_comparator);
+			/* Construct name for statistic by concatenating relation name with all columns */
+			foreach (cell, cols)
+			{
+				char* col_name = strVal((Value *) linitial(((ColumnRef *)lfirst(cell))->fields));
+				stat_name = psprintf("%s_%s", stat_name, col_name);
+				create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+				sep = ",";
+			}
+			/*
+			 * Check if multicolumn statistic object with such name already exists.
+			 * Most likely if was already created by auto_explain, but either ANALYZE was not performed since
+			 * this time, either presence of this multicolumn statistic doesn't help to provide more precise estimation.
+			 * Despite to the fact that we create statistics with "if_not_exist" option, presence of such check
+			 * allows to eliminate notice message that statistics object already exists.
+			 */
+			if (!SearchSysCacheExists2(STATEXTNAMENSP,
+									   CStringGetDatum(stat_name),
+									   ObjectIdGetDatum(get_rel_namespace(rte->relid))))
+			{
+				if (auto_explain_suggest_only)
+				{
+					elog(NOTICE, "Auto_explain suggestion: CREATE STATISTICS %s %s FROM %s", stat_name, create_stat_stmt, rel_name);
+				}
+				else
+				{
+					elog(LOG, "Add statistics %s", stat_name);
+					stats->defnames = list_make2(makeString(rel_namespace), makeString(stat_name));
+					stats->if_not_exists = true;
+					stats->relations = list_make1(rel);
+					stats->exprs = cols;
+					CreateStatistics(stats);
+				}
+			}
+		}
+	}
+}
+
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+	Plan	   *plan = planstate->plan;
+
+	if (planstate->instrument && plan->plan_rows != 0)
+	{
+		if (auto_explain_add_statistics_threshold != 0
+			&& planstate->instrument->ntuples / plan->plan_rows >= auto_explain_add_statistics_threshold)
+		{
+			elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s", plan->plan_rows, planstate->instrument->ntuples, planstate->instrument->ntuples / plan->plan_rows, nodeToString(plan));
+			/* quals, sort keys, etc */
+			switch (nodeTag(plan))
+			{
+			  case T_IndexScan:
+				AddMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+				break;
+			  case T_IndexOnlyScan:
+				AddMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+				break;
+			  case T_BitmapIndexScan:
+				AddMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+				break;
+			  case T_NestLoop:
+				AddMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+				break;
+			  case T_MergeJoin:
+				AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+				AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+				break;
+			  case T_HashJoin:
+				AddMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+				AddMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+				break;
+			  default:
+				break;
+			}
+			AddMultiColumnStatisticsForQual(plan->qual, es);
+		}
+	}
+
+	/* initPlan-s */
+	if (planstate->initPlan)
+		AddMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+	/* lefttree */
+	if (outerPlanState(planstate))
+		AddMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+	/* righttree */
+	if (innerPlanState(planstate))
+		AddMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+	/* special child plans */
+	switch (nodeTag(plan))
+	{
+		case T_ModifyTable:
+			AddMultiColumnStatisticsForMemberNodes(((ModifyTableState *) planstate)->mt_plans,
+												   ((ModifyTableState *) planstate)->mt_nplans,
+												   es);
+			break;
+		case T_Append:
+			AddMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+												   ((AppendState *) planstate)->as_nplans,
+												   es);
+			break;
+		case T_MergeAppend:
+			AddMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+												   ((MergeAppendState *) planstate)->ms_nplans,
+												   es);
+			break;
+		case T_BitmapAnd:
+			AddMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+												   ((BitmapAndState *) planstate)->nplans,
+												   es);
+			break;
+		case T_BitmapOr:
+			AddMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+												   ((BitmapOrState *) planstate)->nplans,
+												   es);
+			break;
+		case T_SubqueryScan:
+			AddMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+			break;
+		default:
+			break;
+	}
+}
+
 /*
  * ExecutorEnd hook: log results if needed
  */
@@ -392,6 +655,10 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
 				ExplainPrintJITSummary(es, queryDesc);
 			ExplainEndOutput(es);
 
+			/* Add multicolumn statistic if requested */
+			if (auto_explain_add_statistics_threshold && !IsParallelWorker())
+				AddMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
 			/* Remove last line break */
 			if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n')
 				es->str->data[--es->str->len] = '\0';

Reply via email to