Re: [HACKERS] improving GROUP BY estimation

Mark Dilger Thu, 25 Feb 2016 15:17:07 -0800

> On Feb 23, 2016, at 5:12 PM, Tomas Vondra <tomas.von...@2ndquadrant.com> 
> wrote:
> 
> <snip>
> 
> So much better. Clearly, there are cases where this will over-estimate the 
> cardinality - for example when the values are somehow correlated.
>


I applied your patch, which caused a few regression tests to fail.  Attached
is a patch that includes the necessary changes to the expected test results.

It is not hard to write test cases where your patched version overestimates
the number of rows by a very similar factor as the old code underestimates
them.  My very first test, which was not specifically designed to demonstrate
this, happens to be one such example:


CREATE TABLE t (a INT, b int);
INSERT INTO t SELECT sqrt(gs)::int, gs FROM generate_series(1,10000000) gs;
ANALYZE t;
EXPLAIN SELECT a FROM t WHERE b < 1000 GROUP BY a;
                          QUERY PLAN
---------------------------------------------------------------
 HashAggregate  (cost=169250.21..169258.71 rows=850 width=4)
   Group Key: a
   ->  Seq Scan on t  (cost=0.00..169247.71 rows=1000 width=4)
         Filter: (b < 1000)
(4 rows)

SELECT COUNT(*) FROM (SELECT a FROM t WHERE b < 1000 GROUP BY a) AS ss;
 count 
-------                     
    32
(1 row)



So, it estimates 850 rows where only 32 are returned .  Without applying your 
patch,
it estimates just 1 row where 32 are returned.  That's an overestimate of 
roughly 26 times,
rather than an underestimate of 32 times.

As a patch review, I'd say that your patch does what you claim it does, and it 
applies
cleanly, and passes the regression tests with my included modifications.  I 
think there
needs to be some discussion on the list about whether the patch is a good idea.

Mark Dilger

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 46c95b0..82a7f7f 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3438,9 +3438,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, 
double input_rows,
                                reldistinct = clamp;
 
                        /*
-                        * Multiply by restriction selectivity.
+                        * Estimate number of distinct values expected in given 
number of rows.
                         */
-                       reldistinct *= rel->rows / rel->tuples;
+                       reldistinct *= (1 - powl((reldistinct - 1) / 
reldistinct, rel->rows));
 
                        /*
                         * Update estimate of total distinct groups.
diff --git a/src/test/regress/expected/join.out 
b/src/test/regress/expected/join.out
index 59d7877..d9dd5ca 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3951,17 +3951,17 @@ select d.* from d left join (select * from b group by 
b.id, b.c_id) s
   on d.a = s.id;
               QUERY PLAN               
 ---------------------------------------
- Merge Left Join
-   Merge Cond: (d.a = s.id)
-   ->  Sort
-         Sort Key: d.a
-         ->  Seq Scan on d
+ Merge Right Join
+   Merge Cond: (s.id = d.a)
    ->  Sort
          Sort Key: s.id
          ->  Subquery Scan on s
                ->  HashAggregate
                      Group Key: b.id
                      ->  Seq Scan on b
+   ->  Sort
+         Sort Key: d.a
+         ->  Seq Scan on d
 (11 rows)
 
 -- similarly, but keying off a DISTINCT clause
@@ -3970,17 +3970,17 @@ select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
                  QUERY PLAN                  
 ---------------------------------------------
- Merge Left Join
-   Merge Cond: (d.a = s.id)
-   ->  Sort
-         Sort Key: d.a
-         ->  Seq Scan on d
+ Merge Right Join
+   Merge Cond: (s.id = d.a)
    ->  Sort
          Sort Key: s.id
          ->  Subquery Scan on s
                ->  HashAggregate
                      Group Key: b.id, b.c_id
                      ->  Seq Scan on b
+   ->  Sort
+         Sort Key: d.a
+         ->  Seq Scan on d
 (11 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
diff --git a/src/test/regress/expected/subselect.out 
b/src/test/regress/expected/subselect.out
index de64ca7..0fc93d9 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -807,27 +807,24 @@ select * from int4_tbl where
 explain (verbose, costs off)
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
-                              QUERY PLAN                              
-----------------------------------------------------------------------
- Hash Join
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Hash Semi Join
    Output: o.f1
    Hash Cond: (o.f1 = "ANY_subquery".f1)
    ->  Seq Scan on public.int4_tbl o
          Output: o.f1
    ->  Hash
          Output: "ANY_subquery".f1, "ANY_subquery".g
-         ->  HashAggregate
+         ->  Subquery Scan on "ANY_subquery"
                Output: "ANY_subquery".f1, "ANY_subquery".g
-               Group Key: "ANY_subquery".f1, "ANY_subquery".g
-               ->  Subquery Scan on "ANY_subquery"
-                     Output: "ANY_subquery".f1, "ANY_subquery".g
-                     Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
-                     ->  HashAggregate
-                           Output: i.f1, (generate_series(1, 2) / 10)
-                           Group Key: i.f1
-                           ->  Seq Scan on public.int4_tbl i
-                                 Output: i.f1
-(18 rows)
+               Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
+               ->  HashAggregate
+                     Output: i.f1, (generate_series(1, 2) / 10)
+                     Group Key: i.f1
+                     ->  Seq Scan on public.int4_tbl i
+                           Output: i.f1
+(15 rows)
 
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
diff --git a/src/test/regress/expected/union.out 
b/src/test/regress/expected/union.out
index 016571b..f2e297e 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -263,16 +263,16 @@ ORDER BY 1;
 SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl;
         q2        
 ------------------
- 4567890123456789
               123
+ 4567890123456789
 (2 rows)
 
 SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
         q2        
 ------------------
+              123
  4567890123456789
  4567890123456789
-              123
 (3 rows)
 
 SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
@@ -305,16 +305,16 @@ SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl;
 SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl;
         q1        
 ------------------
- 4567890123456789
               123
+ 4567890123456789
 (2 rows)
 
 SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl;
         q1        
 ------------------
+              123
  4567890123456789
  4567890123456789
-              123
 (3 rows)
 
 SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl FOR NO KEY UPDATE;
@@ -343,8 +343,8 @@ SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl 
ORDER BY 1;
 SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 
FROM int8_tbl;
         q1         
 -------------------
-  4567890123456789
                123
+  4567890123456789
                456
   4567890123456789
                123
@@ -355,15 +355,15 @@ SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl 
UNION ALL SELECT q2 FR
 SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT 
q2 FROM int8_tbl)));
         q1        
 ------------------
- 4567890123456789
               123
+ 4567890123456789
 (2 rows)
 
 (((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL 
SELECT q2 FROM int8_tbl;
         q1         
 -------------------
-  4567890123456789
                123
+  4567890123456789
                456
   4567890123456789
                123
@@ -419,8 +419,8 @@ HINT:  There is a column named "q2" in table "*SELECT* 2", 
but it cannot be refe
 SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 
1)));
         q1        
 ------------------
- 4567890123456789
               123
+ 4567890123456789
 (2 rows)
 
 --

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] improving GROUP BY estimation

Reply via email to