Hi, Attached is a patch to allow estimation of (Var op Var) clauses using extended statistics. Currently we only use extended stats to estimate (Var op Const) clauses, which is sufficient for most cases, but it's not very hard to support this second type of clauses.
This is not an entirely new patch - I've originally included it in the patch series in [1] but it's probably better to discuss it separately, so that it does not get buried in that discussion. [1] https://www.postgresql.org/message-id/flat/20200113230008.g67iyk4cs3xbnjju@development To illustrate the purpose of this patch, consider this: db=# create table t (a int, b int); CREATE TABLE db=# insert into t select mod(i,10), mod(i,10)+1 from generate_series(1,100000) s(i); INSERT 0 100000 db=# analyze t; ANALYZE db=# explain select * from t where a < b; QUERY PLAN -------------------------------------------------------- Seq Scan on t (cost=0.00..1693.00 rows=33333 width=8) Filter: (a < b) (2 rows) db=# explain select * from t where a > b; QUERY PLAN -------------------------------------------------------- Seq Scan on t (cost=0.00..1693.00 rows=33333 width=8) Filter: (a > b) (2 rows) db=# create statistics s (mcv) on a,b from t; CREATE STATISTICS db=# analyze t; ANALYZE db=# explain select * from t where a < b; QUERY PLAN --------------------------------------------------------- Seq Scan on t (cost=0.00..1693.00 rows=100000 width=8) Filter: (a < b) (2 rows) db=# explain select * from t where a > b; QUERY PLAN ---------------------------------------------------- Seq Scan on t (cost=0.00..1693.00 rows=1 width=8) Filter: (a > b) (2 rows) I'm not entirely convinced this patch (on it's own) is very useful, for a couple of reasons: (a) Clauses of this form are not particularly common, at least compared to the Var op Const clauses. (I don't recall slow-query reports from any of our mailing lists that might be attributed to such clauses.) (b) For known cases of such queries (e.g. several TPC-H queries do use clauses like "l_commitdate < l_receiptdate" etc.) this is somewhat hindered by extended statistics only supporting MCV lists, which may not work particularly well for high-cardinality columns like dates etc. But despite that it seems like a useful feature / building block, and those limitations may be addressed in some other way (e.g. we may add multi-dimensional histograms to address the second one). regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
>From 439d39db128b9cbc06063a862283d663510d0fcc Mon Sep 17 00:00:00 2001 From: Tomas Vondra <to...@2ndquadrant.com> Date: Fri, 13 Nov 2020 01:46:50 +0100 Subject: [PATCH] Support estimation of clauses of the form Var op Var Until now, extended statistics were used only to estimate clauses of the form (Var op Const). This patch extends the estimation to also handle clauses (Var op Var). --- src/backend/statistics/extended_stats.c | 67 +++++++++---- src/backend/statistics/mcv.c | 71 +++++++++++++- .../statistics/extended_stats_internal.h | 2 +- src/test/regress/expected/stats_ext.out | 96 +++++++++++++++++++ src/test/regress/sql/stats_ext.sql | 26 +++++ 5 files changed, 243 insertions(+), 19 deletions(-) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 36326927c6..1a6e7a3fc0 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -987,14 +987,18 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, { RangeTblEntry *rte = root->simple_rte_array[relid]; OpExpr *expr = (OpExpr *) clause; - Var *var; + Var *var, + *var2; /* Only expressions with two arguments are considered compatible. */ if (list_length(expr->args) != 2) return false; - /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + /* + * Check if the expression has the right shape (one Var and one Const, + * or two Vars). + */ + if (!examine_clause_args(expr->args, &var, &var2, NULL, NULL)) return false; /* @@ -1034,7 +1038,20 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, !get_func_leakproof(get_opcode(expr->opno))) return false; - return statext_is_compatible_clause_internal(root, (Node *) var, + /* + * Check compatibility of the first Var - we get this one for both + * types of supported expressions (Var op Const) and (Var op Var). + */ + if (!statext_is_compatible_clause_internal(root, (Node *) var, + relid, attnums)) + return false; + + /* For (Var op Const) we don't get the second Var, and we're done. */ + if (!var2) + return true; + + /* For (Var op Var) check compatibility of the second Var. */ + return statext_is_compatible_clause_internal(root, (Node *) var2, relid, attnums); } @@ -1050,7 +1067,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return false; /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + if (!examine_clause_args(expr->args, &var, NULL, NULL, NULL)) return false; /* @@ -1446,22 +1463,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, } /* - * examine_opclause_expression + * examine_clause_args * Split expression into Var and Const parts. * - * Attempts to match the arguments to either (Var op Const) or (Const op Var), - * possibly with a RelabelType on top. When the expression matches this form, - * returns true, otherwise returns false. + * Attempts to match the arguments to either (Var op Const) or (Const op Var) + * or (Var op Var), possibly with a RelabelType on top. When the expression + * matches this form, returns true, otherwise returns false. * * Optionally returns pointers to the extracted Var/Const nodes, when passed * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies * on which side of the operator we found the Var node. */ bool -examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) +examine_clause_args(List *args, Var **var1p, Var **var2p, Const **cstp, + bool *varonleftp) { - Var *var; - Const *cst; + Var *var1 = NULL; + Var *var2 = NULL; + Const *cst = NULL; bool varonleft; Node *leftop, *rightop; @@ -1481,22 +1500,38 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) if (IsA(leftop, Var) && IsA(rightop, Const)) { - var = (Var *) leftop; + var1 = (Var *) leftop; cst = (Const *) rightop; varonleft = true; } else if (IsA(leftop, Const) && IsA(rightop, Var)) { - var = (Var *) rightop; + var1 = (Var *) rightop; cst = (Const *) leftop; varonleft = false; } + else if (IsA(leftop, Var) && IsA(rightop, Var)) + { + var1 = (Var *) leftop; + var2 = (Var *) rightop; + varonleft = false; + + /* + * Both variables have to be for the same relation (otherwise it's + * a join clause, and we don't deal with those yet. + */ + if (var1->varno != var2->varno) + return false; + } else return false; /* return pointers to the extracted parts if requested */ - if (varp) - *varp = var; + if (var1p) + *var1p = var1; + + if (var2p) + *var2p = var2; if (cstp) *cstp = cst; diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 6a262f1543..ac9a8020b8 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -1583,13 +1583,17 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, /* valid only after examine_clause_args returns true */ Var *var; + Var *var2; Const *cst; bool varonleft; fmgr_info(get_opcode(expr->opno), &opproc); /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) + if (!examine_clause_args(expr->args, &var, &var2, &cst, &varonleft)) + continue; + + if (cst) /* Var op Const */ { int idx; @@ -1653,6 +1657,68 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, matches[i] = RESULT_MERGE(matches[i], is_or, match); } } + else /* Var op Var */ + { + int idx; + int idx2; + + Assert(var2); + + /* match the attribute to a dimension of the statistic */ + idx = bms_member_index(keys, var->varattno); + idx2 = bms_member_index(keys, var2->varattno); + + /* + * Walk through the MCV items and evaluate the current clause. + * We can skip items that were already ruled out, and + * terminate if there are no remaining MCV items that might + * possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + bool match = true; + MCVItem *item = &mcvlist->items[i]; + + /* + * When either of the MCV items is NULL we can treat this + * as a mismatch. We must not call the operator because + * of strictness. + */ + if (item->isnull[idx] || item->isnull[idx2]) + { + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; + } + + /* + * Skip MCV items that can't change result in the bitmap. + * Once the value gets false for AND-lists, or true for + * OR-lists, we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + /* + * First check whether the constant is below the lower + * boundary (in that case we can skip the bucket, because + * there's no overlap). + * + * We don't store collations used to build the statistics, + * but we can use the collation for the attribute itself, + * as stored in varcollid. We do reset the statistics after + * a type change (including collation change), so this is + * OK. We may need to relax this after allowing extended + * statistics on expressions. + */ + match = DatumGetBool(FunctionCall2Coll(&opproc, + var->varcollid, + item->values[idx], + item->values[idx2])); + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } } else if (IsA(clause, ScalarArrayOpExpr)) { @@ -1667,7 +1733,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, fmgr_info(get_opcode(expr->opno), &opproc); /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) + if (examine_clause_args(expr->args, &var, NULL, &cst, &varonleft)) { int idx; @@ -1681,6 +1747,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, /* ScalarArrayOpExpr has the Var always on the left */ Assert(varonleft); + Assert(cst); if (!cst->constisnull) { diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index 61e69696cf..8f0bf2df5c 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -96,7 +96,7 @@ extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, MultiSortSupport mss, int numattrs, AttrNumber *attnums); -extern bool examine_clause_args(List *args, Var **varp, +extern bool examine_clause_args(List *args, Var **var1p, Var **var2p, Const **cstp, bool *varonleftp); extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 4c3edd213f..f77d1d5469 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -978,6 +978,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' 343 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c'); + estimated | actual +-----------+-------- + 1667 | 3750 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c'); + estimated | actual +-----------+-------- + 1667 | 0 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); estimated | actual -----------+-------- @@ -1113,6 +1125,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c'); + estimated | actual +-----------+-------- + 3750 | 3750 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); estimated | actual -----------+-------- @@ -1323,6 +1347,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND 3750 | 2500 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d'); + estimated | actual +-----------+-------- + 25 | 2500 +(1 row) + -- create statistics CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, d FROM mcv_lists; ANALYZE mcv_lists; @@ -1356,6 +1386,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND 2500 | 2500 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d'); + estimated | actual +-----------+-------- + 2500 | 2500 +(1 row) + -- mcv with pass-by-ref fixlen types, e.g. uuid CREATE TABLE mcv_lists_uuid ( a UUID, @@ -1450,6 +1486,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND 1094 | 0 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b'); + estimated | actual +-----------+-------- + 9950 | 2500 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c'); + estimated | actual +-----------+-------- + 50 | 2500 +(1 row) + CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c FROM mcv_lists_bool; ANALYZE mcv_lists_bool; @@ -1477,6 +1525,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND 1 | 0 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b'); + estimated | actual +-----------+-------- + 2500 | 2500 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c'); + estimated | actual +-----------+-------- + 2500 | 2500 +(1 row) + -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, @@ -1512,6 +1572,24 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AN 4 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d'); + estimated | actual +-----------+-------- + 1 | 5000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b'); + estimated | actual +-----------+-------- + 1667 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b'); + estimated | actual +-----------+-------- + 1667 | 5000 +(1 row) + -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; @@ -1534,6 +1612,24 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AN 143 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d'); + estimated | actual +-----------+-------- + 5000 | 5000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b'); + estimated | actual +-----------+-------- + 5000 | 5000 +(1 row) + DROP TABLE mcv_lists_multi; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 9781e590a3..7ae02397a9 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -512,6 +512,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c'); + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)'); @@ -561,6 +565,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c'); + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)'); @@ -671,6 +679,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND (b = ''x'' OR d = ''x'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d'); + -- create statistics CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, d FROM mcv_lists; @@ -689,6 +699,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND (b = ''x'' OR d = ''x'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d'); + -- mcv with pass-by-ref fixlen types, e.g. uuid CREATE TABLE mcv_lists_uuid ( a UUID, @@ -764,6 +776,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c'); + CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c FROM mcv_lists_bool; @@ -777,6 +793,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c'); + -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, @@ -800,6 +820,9 @@ ANALYZE mcv_lists_multi; SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b'); -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; @@ -810,6 +833,9 @@ ANALYZE mcv_lists_multi; SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b'); DROP TABLE mcv_lists_multi; -- 2.26.2