Re: Eager aggregation, take 3

Matheus Alcantara Thu, 02 Apr 2026 05:19:12 -0700

On Mon Mar 30, 2026 at 12:17 AM -03, Richard Guo wrote:
> On Thu, Oct 9, 2025 at 5:07 PM Richard Guo <[email protected]> wrote:
>> I noticed an unnecessary header include in initsplan.c.  Will fix that
>> as well.
>
> I noticed a couple of issues that can lead to unexpected results.
> I've attached two patches to fix them.
>
> 1. Eager aggregation was incorrectly checking the data type's default
> collation rather than the expression's actual collation.  This allowed
> columns with non-deterministic collations to be pushed down, resulting
> in incorrect grouping.  Fixed by 0001.
>
> 2. Pushing aggregates containing volatile functions below a join
> alters their execution count.  Fixed by 0002.
>
> (As briefly discussed on Discord, this non-deterministic collation
> issue also exists in our long-existing logic for pushing HAVING down
> to WHERE.  But let's fix it for the eager aggregation first.)
>


Hi Richard,

The patches looks good to me and are working as expected. It seems very
straightforward, so I don't have any major comments.

I'm attaching some new tests that I've added to collate.icu.utf8 and
eager_aggregate regression tests during my review, fell free to include
any of them if it could be helpful or none.

--
Matheus Alcantara
EDB: https://www.enterprisedb.com

diff --git a/src/test/regress/expected/collate.icu.utf8.out 
b/src/test/regress/expected/collate.icu.utf8.out
index fbcdb7eb58c..a2dd8a34da4 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -2726,6 +2726,95 @@ GROUP BY t1.id;
 
 DROP TABLE eager_agg_t1;
 DROP TABLE eager_agg_t2;
+--
+-- Test for eager aggregation with multiple columns having different collations
+--
+CREATE TABLE eager_agg_t3 (
+    id int,
+    val1 text COLLATE case_insensitive,
+    val2 text COLLATE "C"
+);
+CREATE TABLE eager_agg_t4 (
+    val1 text COLLATE case_insensitive,
+    val2 text COLLATE "C"
+);
+INSERT INTO eager_agg_t3 SELECT 1, 'a', 'x' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t3 SELECT 1, 'A', 'x' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t4 VALUES ('A', 'x');
+ANALYZE eager_agg_t3;
+ANALYZE eager_agg_t4;
+-- Ensure that eager aggregation is not used when grouping by a column with
+-- non-deterministic collation, even when other grouping columns have
+-- deterministic collations.
+EXPLAIN (COSTS OFF)
+SELECT t1.id, t1.val1, count(*)
+  FROM eager_agg_t3 t1
+  JOIN eager_agg_t4 t2 ON t1.val1 = t2.val1 COLLATE "C" AND t1.val2 = t2.val2
+GROUP BY t1.id, t1.val1;
+                                     QUERY PLAN                                
     
+------------------------------------------------------------------------------------
+ HashAggregate
+   Group Key: t1.id, t1.val1
+   ->  Nested Loop
+         Join Filter: (((t1.val1)::text = (t2.val1)::text) AND (t1.val2 = 
t2.val2))
+         ->  Seq Scan on eager_agg_t4 t2
+         ->  Seq Scan on eager_agg_t3 t1
+(6 rows)
+
+-- Verify correct results (should return 1 row with count = 50)
+SELECT t1.id, t1.val1, count(*)
+  FROM eager_agg_t3 t1
+  JOIN eager_agg_t4 t2 ON t1.val1 = t2.val1 COLLATE "C" AND t1.val2 = t2.val2
+GROUP BY t1.id, t1.val1;
+ id | val1 | count 
+----+------+-------
+  1 | A    |    50
+(1 row)
+
+DROP TABLE eager_agg_t3;
+DROP TABLE eager_agg_t4;
+--
+-- Test for eager aggregation with explicit COLLATE on grouping expression
+--
+CREATE TABLE eager_agg_t5 (id int, val text COLLATE "C");
+CREATE TABLE eager_agg_t6 (val text COLLATE "C");
+INSERT INTO eager_agg_t5 SELECT 1, 'a' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t5 SELECT 1, 'A' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t6 VALUES ('A');
+ANALYZE eager_agg_t5;
+ANALYZE eager_agg_t6;
+-- When grouping by an expression with explicit non-deterministic COLLATE,
+-- eager aggregation should not be used even if the column's native collation
+-- is deterministic.
+EXPLAIN (COSTS OFF)
+SELECT t1.id, t1.val COLLATE case_insensitive, count(*)
+  FROM eager_agg_t5 t1
+  JOIN eager_agg_t6 t2 ON t1.val = t2.val
+GROUP BY t1.id, t1.val COLLATE case_insensitive;
+                  QUERY PLAN                   
+-----------------------------------------------
+ HashAggregate
+   Group Key: t1.id, (t1.val)::text
+   ->  Hash Join
+         Hash Cond: (t1.val = t2.val)
+         ->  Seq Scan on eager_agg_t5 t1
+         ->  Hash
+               ->  Seq Scan on eager_agg_t6 t2
+(7 rows)
+
+-- Verify correct results (should return 1 row with count = 100, since 'a' and
+-- 'A' are equal under case_insensitive collation)
+SELECT t1.id, t1.val COLLATE case_insensitive, count(*)
+  FROM eager_agg_t5 t1
+  JOIN eager_agg_t6 t2 ON t1.val = t2.val
+GROUP BY t1.id, t1.val COLLATE case_insensitive;
+ id | val | count 
+----+-----+-------
+  1 | A   |    50
+(1 row)
+
+DROP TABLE eager_agg_t5;
+DROP TABLE eager_agg_t6;
 -- virtual generated columns
 CREATE TABLE t5 (
     a int,
diff --git a/src/test/regress/expected/eager_aggregate.out 
b/src/test/regress/expected/eager_aggregate.out
index d1b86be3a62..2bf983d12cb 100644
--- a/src/test/regress/expected/eager_aggregate.out
+++ b/src/test/regress/expected/eager_aggregate.out
@@ -448,6 +448,26 @@ GROUP BY t1.a ORDER BY t1.a;
                      ->  Seq Scan on eager_agg_t1 t1
 (9 rows)
 
+-- Ensure eager aggregation is not applied when FILTER clause contains
+-- volatile function
+EXPLAIN (COSTS OFF)
+SELECT t1.a, avg(t2.c) FILTER (WHERE random() > 0.5)
+  FROM eager_agg_t1 t1
+  JOIN eager_agg_t2 t2 ON t1.b = t2.b
+GROUP BY t1.a ORDER BY t1.a;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ GroupAggregate
+   Group Key: t1.a
+   ->  Sort
+         Sort Key: t1.a
+         ->  Hash Join
+               Hash Cond: (t2.b = t1.b)
+               ->  Seq Scan on eager_agg_t2 t2
+               ->  Hash
+                     ->  Seq Scan on eager_agg_t1 t1
+(9 rows)
+
 DROP TABLE eager_agg_t1;
 DROP TABLE eager_agg_t2;
 DROP TABLE eager_agg_t3;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql 
b/src/test/regress/sql/collate.icu.utf8.sql
index 0e6b76b11b8..93c22b37727 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -1021,6 +1021,76 @@ GROUP BY t1.id;
 DROP TABLE eager_agg_t1;
 DROP TABLE eager_agg_t2;
 
+--
+-- Test for eager aggregation with multiple columns having different collations
+--
+CREATE TABLE eager_agg_t3 (
+    id int,
+    val1 text COLLATE case_insensitive,
+    val2 text COLLATE "C"
+);
+CREATE TABLE eager_agg_t4 (
+    val1 text COLLATE case_insensitive,
+    val2 text COLLATE "C"
+);
+
+INSERT INTO eager_agg_t3 SELECT 1, 'a', 'x' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t3 SELECT 1, 'A', 'x' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t4 VALUES ('A', 'x');
+
+ANALYZE eager_agg_t3;
+ANALYZE eager_agg_t4;
+
+-- Ensure that eager aggregation is not used when grouping by a column with
+-- non-deterministic collation, even when other grouping columns have
+-- deterministic collations.
+EXPLAIN (COSTS OFF)
+SELECT t1.id, t1.val1, count(*)
+  FROM eager_agg_t3 t1
+  JOIN eager_agg_t4 t2 ON t1.val1 = t2.val1 COLLATE "C" AND t1.val2 = t2.val2
+GROUP BY t1.id, t1.val1;
+
+-- Verify correct results (should return 1 row with count = 50)
+SELECT t1.id, t1.val1, count(*)
+  FROM eager_agg_t3 t1
+  JOIN eager_agg_t4 t2 ON t1.val1 = t2.val1 COLLATE "C" AND t1.val2 = t2.val2
+GROUP BY t1.id, t1.val1;
+
+DROP TABLE eager_agg_t3;
+DROP TABLE eager_agg_t4;
+
+--
+-- Test for eager aggregation with explicit COLLATE on grouping expression
+--
+CREATE TABLE eager_agg_t5 (id int, val text COLLATE "C");
+CREATE TABLE eager_agg_t6 (val text COLLATE "C");
+
+INSERT INTO eager_agg_t5 SELECT 1, 'a' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t5 SELECT 1, 'A' FROM generate_series(1, 50);
+INSERT INTO eager_agg_t6 VALUES ('A');
+
+ANALYZE eager_agg_t5;
+ANALYZE eager_agg_t6;
+
+-- When grouping by an expression with explicit non-deterministic COLLATE,
+-- eager aggregation should not be used even if the column's native collation
+-- is deterministic.
+EXPLAIN (COSTS OFF)
+SELECT t1.id, t1.val COLLATE case_insensitive, count(*)
+  FROM eager_agg_t5 t1
+  JOIN eager_agg_t6 t2 ON t1.val = t2.val
+GROUP BY t1.id, t1.val COLLATE case_insensitive;
+
+-- Verify correct results (should return 1 row with count = 100, since 'a' and
+-- 'A' are equal under case_insensitive collation)
+SELECT t1.id, t1.val COLLATE case_insensitive, count(*)
+  FROM eager_agg_t5 t1
+  JOIN eager_agg_t6 t2 ON t1.val = t2.val
+GROUP BY t1.id, t1.val COLLATE case_insensitive;
+
+DROP TABLE eager_agg_t5;
+DROP TABLE eager_agg_t6;
+
 -- virtual generated columns
 CREATE TABLE t5 (
     a int,
diff --git a/src/test/regress/sql/eager_aggregate.sql 
b/src/test/regress/sql/eager_aggregate.sql
index 97e10dd7cf4..9c935ef0633 100644
--- a/src/test/regress/sql/eager_aggregate.sql
+++ b/src/test/regress/sql/eager_aggregate.sql
@@ -171,6 +171,14 @@ SELECT t1.a, avg(t2.c + random())
   JOIN eager_agg_t2 t2 ON t1.b = t2.b
 GROUP BY t1.a ORDER BY t1.a;
 
+-- Ensure eager aggregation is not applied when FILTER clause contains
+-- volatile function
+EXPLAIN (COSTS OFF)
+SELECT t1.a, avg(t2.c) FILTER (WHERE random() > 0.5)
+  FROM eager_agg_t1 t1
+  JOIN eager_agg_t2 t2 ON t1.b = t2.b
+GROUP BY t1.a ORDER BY t1.a;
+
 DROP TABLE eager_agg_t1;
 DROP TABLE eager_agg_t2;
 DROP TABLE eager_agg_t3;

Re: Eager aggregation, take 3

Reply via email to