On 19.03.2021 12:17, Yugo NAGATA wrote:
On Wed, 10 Mar 2021 03:00:25 +0100
Tomas Vondra <tomas.von...@enterprisedb.com> wrote:

What is being proposed here - an extension suggesting which statistics
to create (and possibly creating them automatically) is certainly
useful, but I'm not sure I'd call it "adaptive query optimization". I
think "adaptive" means the extension directly modifies the estimates
based on past executions. So I propose calling it maybe "statistics
advisor" or something like that.
I am also agree with the idea to implement this feature as a new
extension for statistics advisor.

BTW Why is "qual" in

   static void
   AddMultiColumnStatisticsForQual(void* qual, ExplainState *es)

declared as "void *"? Shouldn't that be "List *"?
When I tested this extension using TPC-H queries, it raised segmentation
fault in this function. I think the cause would be around this argument.

Regards,
Yugo Nagata

Attached please find new version of the patch with AddMultiColumnStatisticsForQual parameter type fix and one more fix related with handling synthetic attributes. I can not reproduce the crash on TPC-H queries, so if the problem persists, can you please send me stack trace and may be some other information helping to understand the reason of SIGSEGV?

Thanks in advance,
Konstantin

diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index faa6231d87..3fefc47e0b 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -13,12 +13,32 @@
 #include "postgres.h"
 
 #include <limits.h>
+#include <math.h>
 
+#include "access/hash.h"
 #include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "catalog/pg_statistic_ext.h"
 #include "commands/explain.h"
+#include "commands/defrem.h"
 #include "executor/instrument.h"
 #include "jit/jit.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
+#include "optimizer/planmain.h"
+#include "parser/parsetree.h"
+#include "storage/ipc.h"
+#include "statistics/statistics.h"
+#include "utils/fmgroids.h"
 #include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/ruleutils.h"
 
 PG_MODULE_MAGIC;
 
@@ -34,7 +54,9 @@ static bool auto_explain_log_settings = false;
 static int	auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
 static int	auto_explain_log_level = LOG;
 static bool auto_explain_log_nested_statements = false;
+static bool auto_explain_add_statistics_suggest_only = false;
 static double auto_explain_sample_rate = 1;
+static double auto_explain_add_statistics_threshold = 0.0;
 
 static const struct config_enum_entry format_options[] = {
 	{"text", EXPLAIN_FORMAT_TEXT, false},
@@ -85,6 +107,7 @@ static void explain_ExecutorRun(QueryDesc *queryDesc,
 static void explain_ExecutorFinish(QueryDesc *queryDesc);
 static void explain_ExecutorEnd(QueryDesc *queryDesc);
 
+static void AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
 
 /*
  * Module load callback
@@ -230,6 +253,30 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
+	DefineCustomRealVariable("auto_explain.add_statistics_threshold",
+							 "Sets the threshold for actual/estimated #rows ratio triggering creation of multicolumn statistic for the related columns.",
+							 "Zero disables implicit creation of multicolumn statistic.",
+							 &auto_explain_add_statistics_threshold,
+							 0.0,
+							 0.0,
+							 INT_MAX,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	DefineCustomBoolVariable("auto_explain.add_statistics_suggest_only",
+							 "Do not create statistic but just record in WAL suggested create statistics statement.",
+							 NULL,
+							 &auto_explain_add_statistics_suggest_only,
+							 false,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
 	EmitWarningsOnPlaceholders("auto_explain");
 
 	/* Install hooks. */
@@ -363,6 +410,283 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
 	PG_END_TRY();
 }
 
+/**
+ * Try to add multicolumn statistics for specified subplans.
+ */
+static void
+AddMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+	ListCell   *lst;
+
+	foreach(lst, plans)
+	{
+		SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+		AddMultiColumnStatisticsForNode(sps->planstate, es);
+	}
+}
+
+/**
+ * Try to add multicolumn statistics for plan subnodes.
+ */
+static void
+AddMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+									   ExplainState *es)
+{
+	int			j;
+
+	for (j = 0; j < nsubnodes; j++)
+		AddMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+	char* va = strVal((Value *) linitial(((ColumnRef *)lfirst(a))->fields));
+	char* vb = strVal((Value *) linitial(((ColumnRef *)lfirst(b))->fields));
+	return strcmp(va, vb);
+}
+
+/**
+ * Try to add multicolumn statistics for qual
+ */
+static void
+AddMultiColumnStatisticsForQual(List* qual, ExplainState *es)
+{
+	List *vars = NULL;
+	ListCell* lc;
+
+	/* Extract vars from all quals */
+	foreach (lc, qual)
+	{
+		Node* node = (Node*)lfirst(lc);
+		if (IsA(node, RestrictInfo))
+			node = (Node*)((RestrictInfo*)node)->clause;
+		vars = list_concat(vars, pull_vars_of_level(node, 0));
+	}
+
+	/* Loop until we considered all vars */
+	while (vars != NULL)
+	{
+		ListCell *cell;
+		List *cols = NULL;
+		Index relno = 0;
+		Bitmapset* colmap = NULL;
+
+		/* Contruct list of unique vars */
+		foreach (cell, vars)
+		{
+			Node* node = (Node *) lfirst(cell);
+			if (IsA(node, Var))
+			{
+				Var *var = (Var *) node;
+				int varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+				if (cols == NULL || varno == relno)
+				{
+					int varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+					relno = varno;
+					if (var->varattno > 0 &&
+						!bms_is_member(varattno, colmap) &&
+						varno >= 1 && /* not synthetic var */
+						varno <= list_length(es->rtable) &&
+						list_length(cols) < STATS_MAX_DIMENSIONS)
+					{
+						RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+						if (rte->rtekind == RTE_RELATION)
+						{
+							ColumnRef  *col = makeNode(ColumnRef);
+							char *colname = get_rte_attribute_name(rte, varattno);
+							col->fields = list_make1(makeString(colname));
+							cols = lappend(cols, col);
+							colmap = bms_add_member(colmap, varattno);
+						}
+					}
+				}
+				else
+				{
+					continue;
+				}
+			}
+			vars = foreach_delete_current(vars, cell);
+		}
+		/* To create multicolumn statitics we need to have at least 2 columns */
+		if (list_length(cols) >= 2)
+		{
+			RangeTblEntry *rte = rt_fetch(relno, es->rtable);
+			CreateStatsStmt* stats = makeNode(CreateStatsStmt);
+			char *rel_namespace = get_namespace_name(get_rel_namespace(rte->relid));
+			char *rel_name = get_rel_name(rte->relid);
+			RangeVar* rel = makeRangeVar(rel_namespace, rel_name, 0);
+			char* stat_name = rel_name;
+			char* create_stat_stmt = (char*)"";
+			char const* sep = "ON";
+			ScanKeyData entry[2];
+			TableScanDesc scan;
+			Relation stat_rel;
+			size_t name_len;
+			TupleTableSlot *slot;
+
+			/* Sort variables by name */
+			list_sort(cols, vars_list_comparator);
+
+			/* Construct name for statistic by concatenating relation name with all columns */
+			foreach (cell, cols)
+			{
+				char* col_name = strVal((Value *) linitial(((ColumnRef *)lfirst(cell))->fields));
+				stat_name = psprintf("%s_%s", stat_name, col_name);
+				create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+				sep = ",";
+			}
+
+			name_len = strlen(stat_name);
+			/* Truncate name if it doesn't fit in NameData */
+			if (name_len >= NAMEDATALEN)
+				stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned)hash_any((uint8*)stat_name, name_len));
+
+			ScanKeyInit(&entry[0],
+						Anum_pg_statistic_ext_stxname,
+						BTEqualStrategyNumber, F_NAMEEQ,
+						CStringGetDatum(stat_name));
+			ScanKeyInit(&entry[1],
+						Anum_pg_statistic_ext_stxnamespace,
+						BTEqualStrategyNumber, F_OIDEQ,
+						ObjectIdGetDatum(get_rel_namespace(rte->relid)));
+
+			/*
+			 * Prevent concurrent access to extended statistic table
+			 */
+			stat_rel = table_open(StatisticExtRelationId, AccessExclusiveLock);
+			slot = table_slot_create(stat_rel, NULL);
+			scan = table_beginscan_catalog(stat_rel, 2, entry);
+
+			/*
+			 * Check if multicolumn statistic object with such name already exists.
+			 * Most likely if was already created by auto_explain, but either ANALYZE was not performed since
+			 * this time, either presence of this multicolumn statistic doesn't help to provide more precise estimation.
+			 * Despite to the fact that we create statistics with "if_not_exist" option, presence of such check
+			 * allows to eliminate notice message that statistics object already exists.
+			 */
+			if (!table_scan_getnextslot(scan, ForwardScanDirection, slot))
+			{
+				if (auto_explain_add_statistics_suggest_only)
+				{
+					ereport(NOTICE, (errmsg("Auto_explain suggestion: CREATE STATISTICS %s %s FROM %s",
+											stat_name, create_stat_stmt, rel_name),
+									 errhidestmt(true)));
+				}
+				else
+				{
+					ereport(LOG, (errmsg("Add statistics %s", stat_name),
+								  errhidestmt(true)));
+					stats->defnames = list_make2(makeString(rel_namespace), makeString(stat_name));
+					stats->if_not_exists = true;
+					stats->relations = list_make1(rel);
+					stats->exprs = cols;
+					CreateStatistics(stats);
+				}
+			}
+			table_endscan(scan);
+			ExecDropSingleTupleTableSlot(slot);
+			table_close(stat_rel, AccessExclusiveLock);
+		}
+	}
+}
+
+/**
+ * Try to add multicolumn statistics for node
+ */
+static void
+AddMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+	Plan	   *plan = planstate->plan;
+
+	if (planstate->instrument && plan->plan_rows != 0)
+	{
+		if (auto_explain_add_statistics_threshold != 0
+			&& planstate->instrument->ntuples / plan->plan_rows >= auto_explain_add_statistics_threshold)
+		{
+			elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s", plan->plan_rows, planstate->instrument->ntuples, planstate->instrument->ntuples / plan->plan_rows, nodeToString(plan));
+			/* quals, sort keys, etc */
+			switch (nodeTag(plan))
+			{
+			  case T_IndexScan:
+				AddMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+				break;
+			  case T_IndexOnlyScan:
+				AddMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+				break;
+			  case T_BitmapIndexScan:
+				AddMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+				break;
+			  case T_NestLoop:
+				AddMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+				break;
+			  case T_MergeJoin:
+				AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+				AddMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+				break;
+			  case T_HashJoin:
+				AddMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+				AddMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+				break;
+			  default:
+				break;
+			}
+			AddMultiColumnStatisticsForQual(plan->qual, es);
+		}
+	}
+
+	/* initPlan-s */
+	if (planstate->initPlan)
+		AddMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+	/* lefttree */
+	if (outerPlanState(planstate))
+		AddMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+	/* righttree */
+	if (innerPlanState(planstate))
+		AddMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+	/* special child plans */
+	switch (nodeTag(plan))
+	{
+		case T_ModifyTable:
+			AddMultiColumnStatisticsForMemberNodes(((ModifyTableState *) planstate)->mt_plans,
+												   ((ModifyTableState *) planstate)->mt_nplans,
+												   es);
+			break;
+		case T_Append:
+			AddMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+												   ((AppendState *) planstate)->as_nplans,
+												   es);
+			break;
+		case T_MergeAppend:
+			AddMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+												   ((MergeAppendState *) planstate)->ms_nplans,
+												   es);
+			break;
+		case T_BitmapAnd:
+			AddMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+												   ((BitmapAndState *) planstate)->nplans,
+												   es);
+			break;
+		case T_BitmapOr:
+			AddMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+												   ((BitmapOrState *) planstate)->nplans,
+												   es);
+			break;
+		case T_SubqueryScan:
+			AddMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+			break;
+		default:
+			break;
+	}
+}
+
 /*
  * ExecutorEnd hook: log results if needed
  */
@@ -403,6 +727,10 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
 				ExplainPrintJITSummary(es, queryDesc);
 			ExplainEndOutput(es);
 
+			/* Add multicolumn statistic if requested */
+			if (auto_explain_add_statistics_threshold && !IsParallelWorker())
+				AddMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
 			/* Remove last line break */
 			if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n')
 				es->str->data[--es->str->len] = '\0';
diff --git a/doc/src/sgml/auto-explain.sgml b/doc/src/sgml/auto-explain.sgml
index 30e35a714a..29dfd7a8fb 100644
--- a/doc/src/sgml/auto-explain.sgml
+++ b/doc/src/sgml/auto-explain.sgml
@@ -280,6 +280,45 @@ LOAD 'auto_explain';
      </para>
     </listitem>
    </varlistentry>
+
+   <varlistentry>
+    <term>
+     <varname>auto_explain.auto_explain.add_statistics_threshold</varname> (<type>real</type>)
+     <indexterm>
+      <primary><varname>auto_explain.add_statistics_threshold</varname> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+       <varname>auto_explain.add_statistics_threshold</varname> sets the threshold for
+       actual/estimated #rows ratio triggering creation of multicolumn statistic
+       for the related columns. It can be used for adpative query optimization.
+       If there is large gap between real and estimated number of tuples for the
+       concrete plan node, then multicolumn statistic is created for involved
+       attributes. Zero value (default) disables implicit creation of multicolumn statistic.
+       Please notice, this parameter has no effect unless <varname>auto_explain.log_analyze</varname>
+       is enabled and that statistics are created only when the excution time exceeds
+       <varname>auto_explain.log_min_duration</varname>. Also additional statistics are created only if
+       number of rows is over-estimated and not if it is under-estimated.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <varname>auto_explain.auto_explain.add_statistics_suggest_only</varname> (<type>boolean</type>)
+     <indexterm>
+      <primary><varname>auto_explain.add_statistics_suggest_only</varname> configuration parameter</primary>
+     </indexterm>
+    </term>
+    <listitem>
+     <para>
+       <varname>auto_explain.add_statistics_suggest_only</varname> disables creation
+       of multicolumn statistic even if <varname>auto_explain.add_statistics_threshold</varname>
+       contains non zero value.Only log message will be reported in case of bad estimation.
+     </para>
+    </listitem>
+   </varlistentry>
   </variablelist>
 
   <para>

Reply via email to