On Sat, 14 Mar 2020 at 18:45, Tomas Vondra <tomas.von...@2ndquadrant.com> wrote: > > I realized there's one more thing that probably needs discussing. > Essentially, these two clause types are the same: > > a IN (1, 2, 3) > > (a = 1 OR a = 2 OR a = 3) > > but with 8f321bd1 we only recognize the first one as compatible with > functional dependencies. It was always the case that we estimated those > two clauses a bit differently, but the differences were usually small. > But now that we recognize IN as compatible with dependencies, the > difference may be much larger, which bugs me a bit ... > > So I wonder if we should recognize the special form of an OR clause, > with all Vars referencing to the same attribute etc. and treat this as > supported by functional dependencies - the attached patch does that. > MCV lists there's already no difference because OR clauses are > supported. >
Makes sense, and the patch looks straightforward enough. > The question is whether we want to do this, and whether we should also > teach the per-column estimates to recognize this special case of IN > clause. I'm not convinced about that second part though. I'd say that recognising the OR clause for functional dependencies is sufficient to prevent the large differences in estimates relative to the equivalent IN clauses. The small differences between the way that OR and IN clauses are handled have always been there, and I think that changing that is out of scope for this work. The other thing that I'm still concerned about is the possibility of returning estimates with P(a,b) > P(a) or P(b). I think that such a thing becomes much more likely with the new types of clause supported here, because they now allow multiple values from each column, where before we only allowed one. I took another look at the patch I posted on the other thread, and I've convinced myself that it's correct. Attached is an updated version, with some cosmetic tidying up and now with some additional regression tests. Regards, Dean
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c new file mode 100644 index 72dc1cd..e0cc2b9 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -30,6 +30,7 @@ #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -73,8 +74,6 @@ static double dependency_degree(int numr AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); -static bool dependency_implies_attribute(MVDependency *dependency, - AttrNumber attnum); static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum); static MVDependency *find_strongest_dependency(MVDependencies **dependencies, @@ -614,19 +613,6 @@ dependency_is_fully_matched(MVDependency } /* - * dependency_implies_attribute - * check that the attnum matches is implied by the functional dependency - */ -static bool -dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum) -{ - if (attnum == dependency->attributes[dependency->nattributes - 1]) - return true; - - return false; -} - -/* * statext_dependencies_load * Load the functional dependencies for the indicated pg_statistic_ext tuple */ @@ -966,7 +952,7 @@ find_strongest_dependency(MVDependencies * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected * dependency, we then combine the per-clause selectivities using the formula * - * P(a,b) = P(a) * [f + (1-f)*P(b)] + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) * * where 'f' is the degree of the dependency. * @@ -974,7 +960,7 @@ find_strongest_dependency(MVDependencies * recursively, starting with the widest/strongest dependencies. For example * P(a,b,c) is first split like this: * - * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] + * P(a,b,c) = f * Min(P(a,b), P(c)) + (1-f) * P(a,b) * P(c) * * assuming (a,b=>c) is the strongest dependency. */ @@ -987,20 +973,27 @@ dependencies_clauselist_selectivity(Plan RelOptInfo *rel, Bitmapset **estimatedclauses) { - Selectivity s1 = 1.0; + Selectivity s1; ListCell *l; - Bitmapset *clauses_attnums = NULL; - Bitmapset **list_attnums; + Bitmapset *clauses_attnums; + AttrNumber *list_attnums; int listidx; - MVDependencies **dependencies = NULL; - int ndependencies = 0; + MVDependencies **dependencies; + int ndependencies; + int total_ndeps; + MVDependency **dependencies_used; + int ndependencies_used; + Bitmapset *attrs_used; + int nattrs_used; + Selectivity *attr_sel; int i; + int attidx; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) return 1.0; - list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * list_length(clauses)); /* @@ -1014,6 +1007,7 @@ dependencies_clauselist_selectivity(Plan * We also skip clauses that we already estimated using different types of * statistics (we treat them as incompatible). */ + clauses_attnums = NULL; listidx = 0; foreach(l, clauses) { @@ -1023,11 +1017,11 @@ dependencies_clauselist_selectivity(Plan if (!bms_is_member(listidx, *estimatedclauses) && dependency_is_compatible_clause(clause, rel->relid, &attnum)) { - list_attnums[listidx] = bms_make_singleton(attnum); + list_attnums[listidx] = attnum; clauses_attnums = bms_add_member(clauses_attnums, attnum); } else - list_attnums[listidx] = NULL; + list_attnums[listidx] = InvalidAttrNumber; listidx++; } @@ -1039,6 +1033,7 @@ dependencies_clauselist_selectivity(Plan */ if (bms_num_members(clauses_attnums) < 2) { + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } @@ -1055,6 +1050,7 @@ dependencies_clauselist_selectivity(Plan * to moving the freed chunks to freelists etc. */ ndependencies = 0; + total_ndeps = 0; dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * list_length(rel->statlist)); @@ -1076,104 +1072,185 @@ dependencies_clauselist_selectivity(Plan if (num_matched < 2) continue; - dependencies[ndependencies++] + dependencies[ndependencies] = statext_dependencies_load(stat->statOid); + + total_ndeps += dependencies[ndependencies]->ndeps; + ndependencies++; } /* if no matching stats could be found then we've nothing to do */ if (!ndependencies) { + pfree(dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } /* - * Apply the dependencies recursively, starting with the widest/strongest - * ones, and proceeding to the smaller/weaker ones. At the end of each - * round we factor in the selectivity of clauses on the implied attribute, - * and remove the clauses from the list. + * Work out which dependencies we can apply, starting with the + * widest/stongest ones, and proceeding to smaller/weaker ones. */ + ndependencies_used = 0; + dependencies_used = (MVDependency **) palloc(sizeof(MVDependency *) * + total_ndeps); + + attrs_used = NULL; + while (true) { - Selectivity s2 = 1.0; MVDependency *dependency; + AttrNumber attnum; - /* the widest/strongest dependency, fully matched by clauses */ + /* The widest/strongest dependency, fully matched by clauses */ dependency = find_strongest_dependency(dependencies, ndependencies, clauses_attnums); - - /* if no suitable dependency was found, we're done */ if (!dependency) break; + dependencies_used[ndependencies_used++] = dependency; + + /* Track all attributes used */ + for (i = 0; i < dependency->nattributes; i++) + { + attnum = dependency->attributes[i]; + attrs_used = bms_add_member(attrs_used, attnum); + } + /* - * We found an applicable dependency, so find all the clauses on the - * implied attribute - with dependency (a,b => c) we look for clauses - * on 'c'. + * Ignore dependencies involving the implied attribute in later loops. */ + clauses_attnums = bms_del_member(clauses_attnums, attnum); + } + + if (!ndependencies_used) + { + bms_free(attrs_used); + pfree(dependencies_used); + pfree(dependencies); + bms_free(clauses_attnums); + pfree(list_attnums); + return 1.0; + } + + /* + * For each attribute used in the dependencies to be applied, compute the + * single-column selectivity of all the clauses using that attribute, and + * mark all those clauses as having been estimated. + */ + nattrs_used = bms_num_members(attrs_used); + attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs_used); + + attidx = 0; + i = -1; + while ((i = bms_next_member(attrs_used, i)) >= 0) + { + List *attr_clauses = NIL; + Selectivity simple_sel; + listidx = -1; foreach(l, clauses) { - Node *clause; - AttrNumber attnum; + Node *clause = (Node *) lfirst(l); listidx++; - - /* - * Skip incompatible clauses, and ones we've already estimated on. - */ - if (!list_attnums[listidx]) - continue; - - /* - * We expect the bitmaps ton contain a single attribute number. - */ - attnum = bms_singleton_member(list_attnums[listidx]); - - /* - * Technically we could find more than one clause for a given - * attnum. Since these clauses must be equality clauses, we choose - * to only take the selectivity estimate from the final clause in - * the list for this attnum. If the attnum happens to be compared - * to a different Const in another clause then no rows will match - * anyway. If it happens to be compared to the same Const, then - * ignoring the additional clause is just the thing to do. - */ - if (dependency_implies_attribute(dependency, attnum)) + if (list_attnums[listidx] == i) { - clause = (Node *) lfirst(l); + attr_clauses = lappend(attr_clauses, clause); + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + } + } - s2 = clause_selectivity(root, clause, varRelid, jointype, - sjinfo); + simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid, + jointype, sjinfo, NULL); + attr_sel[attidx++] = simple_sel; + } - /* mark this one as done, so we don't touch it again. */ - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + /* + * Now combine these selectivities using the dependency information. For + * chains of dependencies such as a -> b -> c, the b -> c dependency will + * come before the a -> b dependency in the array, so we traverse the + * array backwards to ensure such chains are computed in the right order. + * + * Pairs of selectivities for attributes with dependencies are combined + * using the formula + * + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) + * + * where 'f' is the degree of validity of the dependency. This ensures + * that the combined selectivity is never greater than either individual + * selectivity. + * + * Where multiple dependencies apply (e.g., a -> b -> c), we use + * conditional probabilities to compute the overall result as follows: + * + * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a) + * + * so we replace the probabilities of all implied attributes with + * conditional probabilities, conditional on all their implying + * attributes. The probabilities of all other non-implied attributes are + * left as they are. + */ + for (i = ndependencies_used-1; i >= 0; i--) + { + MVDependency *dependency = dependencies_used[i]; + int j; + AttrNumber attnum; + Selectivity s2; + double f; - /* - * Mark that we've got and used the dependency on this clause. - * We'll want to ignore this when looking for the next - * strongest dependency above. - */ - clauses_attnums = bms_del_member(clauses_attnums, attnum); - } + /* Selectivity of all the implying attributes */ + s1 = 1.0; + for (j = 0; j < dependency->nattributes - 1; j++) + { + attnum = dependency->attributes[j]; + attidx = bms_member_index(attrs_used, attnum); + s1 *= attr_sel[attidx]; } + /* Original selectivity of the implied attribute */ + attnum = dependency->attributes[j]; + attidx = bms_member_index(attrs_used, attnum); + s2 = attr_sel[attidx]; + /* - * Now factor in the selectivity for all the "implied" clauses into - * the final one, using this formula: + * Replace s2 with the conditional probability s2 given s1, computed + * using the formula P(b|a) = P(a,b) / P(a), which simplifies to * - * P(a,b) = P(a) * (f + (1-f) * P(b)) + * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b) * - * where 'f' is the degree of validity of the dependency. + * where P(a) = s1, the selectivity of the implying attributes, and + * P(b) = s2, the selectivity of the implied attribute. */ - s1 *= (dependency->degree + (1 - dependency->degree) * s2); + f = dependency->degree; + + if (s1 <= s2) + attr_sel[attidx] = f + (1 - f) * s2; + else + attr_sel[attidx] = f * s2 / s1 + (1 -f) * s2; } + /* + * The overall selectivity of all the clauses on all these attributes is + * then the product of all the original (non-implied) and conditional + * (implied) probabilities. + */ + s1 = 1.0; + for (i = 0; i < nattrs_used; i++) + s1 *= attr_sel[i]; + + CLAMP_PROBABILITY(s1); + /* free deserialized functional dependencies (and then the array) */ for (i = 0; i < ndependencies; i++) pfree(dependencies[i]); + pfree(attr_sel); + bms_free(attrs_used); + pfree(dependencies_used); pfree(dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return s1; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out new file mode 100644 index dbc2532..f96e651 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -440,6 +440,12 @@ SELECT * FROM check_estimated_rows('SELE 8 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 4 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -574,6 +580,12 @@ SELECT * FROM check_estimated_rows('SELE 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -667,12 +679,14 @@ SELECT * FROM check_estimated_rows('SELE 1 | 0 (1 row) --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); estimated | actual -----------+-------- - 50 | 50 + 25 | 50 (1 row) ANALYZE functional_dependencies; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql new file mode 100644 index 8beb042..203d38d --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -280,6 +280,8 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -332,6 +334,8 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -365,7 +369,9 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])'); --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');