Hi,

Attached is an updated version of the patchset. The main changes are:
- address most of Tomas' feedback
- address regression test output changes by adding explicit ORDER BYs,
  in a separate commit
- fix issues around hash tables sized up to PG_UINT32_MAX
- fix a bug in iteration with "concurrent" deletions
> > We didn't really for dynahash (as it basically assumed a fillfactor of 1
> > all the time), not sure if changing it won't also cause issues.
>
> That's a case of glass half-full vs. half-empty, I guess. If we assumed load
> factor 1.0, then I see it as accounting for load factor (while you see it as
> not accounting of it).

Well, that load factor is almost never achieved, because we'd have grown
since... I added a comment about disregarding fill factor and growth policy
to estimate_hashagg_tablesize, which is actually the place where it'd
seemingly make sense to handle it.

Tomas, did you have time to run a benchmark?

I think this is starting to look good.

Regards,

Andres
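To make the sizing behavior concrete, here is a minimal standalone sketch
(mine, not part of the patches) of how SH_CREATE in patch 3 derives the
bucket count from the requested element count; SH_FILLFACTOR is 0.8 there,
and the result is rounded up to a power of two by SH_COMPUTE_PARAMETERS:

#include <stdint.h>
#include <stdio.h>

#define SH_FILLFACTOR (0.8)

/* round up to the next power of 2, as SH_COMPUTE_PARAMETERS does */
static uint64_t
sh_pow2(uint64_t num)
{
	uint64_t	limit;

	for (limit = 1; limit < num; limit <<= 1)
		;
	return limit;
}

int
main(void)
{
	uint32_t	nelements = 1000;	/* hypothetical expected member count */
	uint64_t	size;

	/* grow the request by the fill factor, then round up */
	size = sh_pow2((uint64_t) (nelements / SH_FILLFACTOR));
	printf("%u elements -> %llu buckets\n",
		   nelements, (unsigned long long) size);	/* 1000 -> 2048 */
	return 0;
}

So the allocated table is noticeably larger than the estimate of distinct
members, which is why the hashagg size estimate shouldn't also scale by the
fill factor.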
>From 0cf78f283af702d2e938e249de8c09733d2f5f77 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 3 Jul 2016 15:05:18 -0700
Subject: [PATCH 1/5] Add likely/unlikely() branch hint macros.

These are useful for hot code paths. Because it's easy to guess wrongly
about likelihood, and because such likelihoods change over time, they
should be used sparingly.

Discussion: <20160727004333.r3e2k2y6fvk2n...@alap3.anarazel.de>
---
 src/include/c.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/include/c.h b/src/include/c.h
index 4ab3f80..3a77107 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -939,6 +939,22 @@ typedef NameData *Name;
 #endif
 
 
+/*
+ * Hints to the compiler about the likelihood of a branch. Both likely() and
+ * unlikely() return the boolean value of the contained expression.
+ *
+ * These should only be used sparingly, in very hot code paths. It's very easy
+ * to mis-estimate likelihoods.
+ */
+#if __GNUC__ >= 3
+#define likely(x)	__builtin_expect((x) != 0, 1)
+#define unlikely(x)	__builtin_expect((x) != 0, 0)
+#else
+#define likely(x)	((x) != 0)
+#define unlikely(x)	((x) != 0)
+#endif
+
+
 /* ----------------------------------------------------------------
  *				Section 8:	random stuff
  * ----------------------------------------------------------------
-- 
2.9.3
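As a usage illustration (mine, not from the patch): the hint belongs on
branches that are overwhelmingly one-sided, like rare-input checks inside a
hot loop. A self-contained sketch, with the same fallback definitions as
the patch so it compiles standalone:

#include <stdio.h>

#if defined(__GNUC__) && __GNUC__ >= 3
#define likely(x)	__builtin_expect((x) != 0, 1)
#define unlikely(x)	__builtin_expect((x) != 0, 0)
#else
#define likely(x)	((x) != 0)
#define unlikely(x)	((x) != 0)
#endif

static int
sum_positive(const int *values, int nvalues)
{
	int			sum = 0;

	for (int i = 0; i < nvalues; i++)
	{
		/*
		 * Negative input is assumed rare in this hypothetical workload,
		 * so tell the compiler to optimize the fall-through path.
		 */
		if (unlikely(values[i] < 0))
			continue;
		sum += values[i];
	}
	return sum;
}

int
main(void)
{
	int			v[] = {1, 2, -3, 4};

	printf("%d\n", sum_positive(v, 4));		/* prints 7 */
	return 0;
}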
>From b9bee24439cd26cf7d3ccd5d02ff52ac579646f5 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 9 Oct 2016 15:34:13 -0700
Subject: [PATCH 2/5] Make regression tests less dependent on hash table
 order.

Upcoming changes to the hash table code used, among others, for grouping
and set operations will change the output order for a few queries. To make
it less likely that actual bugs are hidden by ordering changes, and to make
the tests robust against platform dependent ordering, add ORDER BYs
guaranteeing the output order.

Discussion: <20160727004333.r3e2k2y6fvk2n...@alap3.anarazel.de>
---
 src/test/regress/expected/matview.out |   6 +-
 src/test/regress/expected/psql.out    |   2 +-
 src/test/regress/expected/tsrf.out    |   8 +-
 src/test/regress/expected/union.out   | 133 ++++++++++++++++++----------------
 src/test/regress/expected/with.out    |   8 +-
 src/test/regress/sql/matview.sql      |   4 +-
 src/test/regress/sql/psql.sql         |   2 +-
 src/test/regress/sql/tsrf.sql         |   2 +-
 src/test/regress/sql/union.sql        |  63 ++++++++--------
 src/test/regress/sql/with.sql         |   8 +-
 10 files changed, 125 insertions(+), 111 deletions(-)

diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
index e7d0ad1..08cffcf 100644
--- a/src/test/regress/expected/matview.out
+++ b/src/test/regress/expected/matview.out
@@ -33,7 +33,7 @@ SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass;
  f
 (1 row)
 
-SELECT * FROM mvtest_tm;
+SELECT * FROM mvtest_tm ORDER BY type;
 ERROR:  materialized view "mvtest_tm" has not been populated
 HINT:  Use the REFRESH MATERIALIZED VIEW command.
 REFRESH MATERIALIZED VIEW mvtest_tm;
@@ -44,12 +44,12 @@ SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass;
 (1 row)
 
 CREATE UNIQUE INDEX mvtest_tm_type ON mvtest_tm (type);
-SELECT * FROM mvtest_tm;
+SELECT * FROM mvtest_tm ORDER BY type;
  type | totamt 
 ------+--------
+ x    |      5
  y    |     12
  z    |     11
- x    |      5
 (3 rows)
 
 -- create various views
diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out
index 017b79e..464436a 100644
--- a/src/test/regress/expected/psql.out
+++ b/src/test/regress/expected/psql.out
@@ -123,7 +123,7 @@ unicode_header_linestyle single
 prepare q as select array_to_string(array_agg(repeat('x',2*n)),E'\n') as "ab
 c", array_to_string(array_agg(repeat('y',20-2*n)),E'\n') as "a
-bc" from generate_series(1,10) as n(n) group by n>1 ;
+bc" from generate_series(1,10) as n(n) group by n>1 order by n>1;
 \pset linestyle ascii
 \pset expanded off
 \pset columns 40
diff --git a/src/test/regress/expected/tsrf.out b/src/test/regress/expected/tsrf.out
index d9a5f13..8c54f71 100644
--- a/src/test/regress/expected/tsrf.out
+++ b/src/test/regress/expected/tsrf.out
@@ -187,15 +187,15 @@ SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_se
 (3 rows)
 
 -- sorting + grouping
-SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5;
+SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5, 1;
  dataa | count | min | max | generate_series 
 -------+-------+-----+-----+-----------------
- b     |     1 |   3 |   3 |               1
 a     |     2 |   1 |   2 |               1
- b     |     1 |   3 |   3 |               2
+ b     |     1 |   3 |   3 |               1
 a     |     2 |   1 |   2 |               2
- b     |     1 |   3 |   3 |               3
+ b     |     1 |   3 |   3 |               2
 a     |     2 |   1 |   2 |               3
+ b     |     1 |   3 |   3 |               3
 (6 rows)
 
 -- grouping sets are a bit special, they produce NULLs in columns not actually NULL
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 016571b..67f5fc4 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -2,14 +2,14 @@
 -- UNION (also INTERSECT, EXCEPT)
 --
 -- Simple UNION constructs
-SELECT 1 AS two UNION SELECT 2;
+SELECT 1 AS two UNION SELECT 2 ORDER BY 1;
  two 
-----
   1
   2
 (2 rows)
 
-SELECT 1 AS one UNION SELECT 1;
+SELECT 1 AS one UNION SELECT 1 ORDER BY 1;
  one 
-----
   1
@@ -29,7 +29,7 @@ SELECT 1 AS two UNION ALL SELECT 1;
   1
 (2 rows)
 
-SELECT 1 AS three UNION SELECT 2 UNION SELECT 3;
+SELECT 1 AS three UNION SELECT 2 UNION SELECT 3 ORDER BY 1;
  three 
-------
     1
@@ -37,14 +37,14 @@ SELECT 1 AS three UNION SELECT 2 UNION SELECT 3;
     3
 (3 rows)
 
-SELECT 1 AS two UNION SELECT 2 UNION SELECT 2;
+SELECT 1 AS two UNION SELECT 2 UNION SELECT 2 ORDER BY 1;
  two 
-----
   1
   2
 (2 rows)
 
-SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2;
+SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1;
  three 
-------
     1
@@ -52,7 +52,7 @@ SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2;
     2
 (3 rows)
 
-SELECT 1.1 AS two UNION SELECT 2.2;
+SELECT 1.1 AS two UNION SELECT 2.2 ORDER BY 1;
  two 
-----
  1.1
@@ -60,41 +60,41 @@ SELECT 1.1 AS two UNION SELECT 2.2;
 (2 rows)
 
 -- Mixed types
-SELECT 1.1 AS two UNION SELECT 2;
+SELECT 1.1 AS two UNION SELECT 2 ORDER BY 1;
  two 
-----
  1.1
    2
 (2 rows)
 
-SELECT 1 AS two UNION SELECT 2.2;
+SELECT 1 AS two UNION SELECT 2.2 ORDER BY 1;
  two 
-----
    1
  2.2
 (2 rows)
 
-SELECT 1 AS one UNION SELECT 1.0::float8;
+SELECT 1 AS one UNION SELECT 1.0::float8 ORDER BY 1;
  one 
-----
    1
 (1 row)
 
-SELECT 1.1 AS two UNION ALL SELECT 2;
+SELECT 1.1 AS two UNION ALL SELECT 2 ORDER BY 1;
  two 
-----
  1.1
    2
 (2 rows)
 
-SELECT 1.0::float8 AS two UNION ALL SELECT 1;
+SELECT 1.0::float8 AS two UNION ALL SELECT 1 ORDER BY 1;
  two 
-----
    1
    1
 (2 rows)
 
-SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3;
+SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3 ORDER BY 1;
  three 
-------
   1.1
@@ -109,7 +109,7 @@ SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8 ORDER BY 1;
    2
 (2 rows)
 
-SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2;
+SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1;
  three 
-------
   1.1
@@ -117,7 +117,7 @@ SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2;
    2
 (3 rows)
 
-SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2);
+SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2) ORDER BY 1;
  two 
-----
  1.1
@@ -195,7 +195,8 @@ SELECT f1 AS five FROM FLOAT8_TBL
   WHERE f1 BETWEEN -1e6 AND 1e6
 UNION
 SELECT f1 FROM INT4_TBL
-  WHERE f1 BETWEEN 0 AND 1000000;
+  WHERE f1 BETWEEN 0 AND 1000000
+ORDER BY 1;
          five          
-----------------------
               -1004.3
@@ -260,19 +261,19 @@ ORDER BY 1;
 --
 -- INTERSECT and EXCEPT
 --
-SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl ORDER BY 1;
         q2        
------------------
- 4567890123456789
              123
+ 4567890123456789
 (2 rows)
 
-SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
         q2        
------------------
- 4567890123456789
- 4567890123456789
              123
+ 4567890123456789
+ 4567890123456789
 (3 rows)
 
 SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
@@ -297,24 +298,24 @@ SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;
  4567890123456789
 (3 rows)
 
-SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY 1;
  q1 
----
 (0 rows)
 
-SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl ORDER BY 1;
         q1        
------------------
- 4567890123456789
              123
+ 4567890123456789
 (2 rows)
 
-SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl ORDER BY 1;
         q1        
------------------
- 4567890123456789
- 4567890123456789
              123
+ 4567890123456789
+ 4567890123456789
 (3 rows)
 
 SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl FOR NO KEY UPDATE;
@@ -322,7 +323,7 @@ ERROR:  FOR NO KEY UPDATE is not allowed with UNION/INTERSECT/EXCEPT
 --
 -- Mixed types
 --
-SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl ORDER BY 1;
  f1 
----
   0
@@ -340,30 +341,30 @@ SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
 --
 -- Operator precedence and (((((extra))))) parentheses
 --
-SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1;
         q1         
-------------------
- 4567890123456789
+ -4567890123456789
+               123
               123
               456
  4567890123456789
-              123
  4567890123456789
- -4567890123456789
+ 4567890123456789
 (7 rows)
 
-SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) ORDER BY 1;
         q1        
------------------
- 4567890123456789
              123
+ 4567890123456789
 (2 rows)
 
-(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl;
         q1         
-------------------
- 4567890123456789
              123
+ 4567890123456789
              456
  4567890123456789
              123
@@ -416,11 +417,11 @@ LINE 1: ... int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1...
                                                              ^
 HINT:  There is a column named "q2" in table "*SELECT* 2", but it cannot be referenced from this part of the query.
 -- But this should work:
-SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1)));
+SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1;
         q1        
------------------
- 4567890123456789
              123
+ 4567890123456789
 (2 rows)
 
 --
@@ -593,23 +594,27 @@
 SELECT * FROM
   (SELECT 1 AS t, 2 AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x < 4;
-                 QUERY PLAN                 
---------------------------------------------
- Unique
-   ->  Sort
-         Sort Key: (1), (2)
-         ->  Append
-               ->  Result
-               ->  Result
-                     One-Time Filter: false
-(7 rows)
+WHERE x < 4
+ORDER BY x;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Sort
+   Sort Key: (2)
+   ->  Unique
+         ->  Sort
+               Sort Key: (1), (2)
+               ->  Append
+                     ->  Result
+                     ->  Result
+                           One-Time Filter: false
+(9 rows)
 
 SELECT * FROM
   (SELECT 1 AS t, 2 AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x < 4;
+WHERE x < 4
+ORDER BY x;
  t | x 
---+---
 1 | 2
@@ -653,24 +658,28 @@
 SELECT * FROM
   (SELECT 1 AS t, (random()*3)::int AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x > 3;
-                                  QUERY PLAN                                  
-------------------------------------------------------------------------------
- Subquery Scan on ss
-   Filter: (ss.x > 3)
-   ->  Unique
-         ->  Sort
-               Sort Key: (1), (((random() * '3'::double precision))::integer)
-               ->  Append
-                     ->  Result
-                     ->  Result
-(8 rows)
+WHERE x > 3
+ORDER BY x;
+                                      QUERY PLAN                                      
+------------------------------------------------------------------------------------
+ Sort
+   Sort Key: ss.x
+   ->  Subquery Scan on ss
+         Filter: (ss.x > 3)
+         ->  Unique
+               ->  Sort
+                     Sort Key: (1), (((random() * '3'::double precision))::integer)
+                     ->  Append
+                           ->  Result
+                           ->  Result
+(10 rows)
 
 SELECT * FROM
   (SELECT 1 AS t, (random()*3)::int AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x > 3;
+WHERE x > 3
+ORDER BY x;
  t | x 
---+---
 2 | 4
diff --git a/src/test/regress/expected/with.out b/src/test/regress/expected/with.out
index 137420d..1b7f57b 100644
--- a/src/test/regress/expected/with.out
+++ b/src/test/regress/expected/with.out
@@ -1244,7 +1244,7 @@ WITH outermost(x) AS (
     SELECT * FROM innermost
     UNION SELECT 3)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
  x 
---
 1
@@ -1258,7 +1258,7 @@ WITH outermost(x) AS (
   SELECT * FROM outermost  -- fail
   UNION SELECT * FROM innermost)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 ERROR:  relation "outermost" does not exist
 LINE 4:   SELECT * FROM outermost
                         ^
@@ -1270,7 +1270,7 @@ WITH RECURSIVE outermost(x) AS (
   SELECT * FROM outermost
   UNION SELECT * FROM innermost)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
  x 
---
 1
@@ -1282,7 +1282,7 @@ WITH RECURSIVE outermost(x) AS (
     SELECT * FROM innermost
   UNION SELECT * from outermost
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 ERROR:  recursive reference to query "outermost" must not appear within a subquery
 LINE 2:   WITH innermost as (SELECT 2 FROM outermost)
                                            ^
diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql
index 5f3269d..65a743c 100644
--- a/src/test/regress/sql/matview.sql
+++ b/src/test/regress/sql/matview.sql
@@ -16,11 +16,11 @@ EXPLAIN (costs off)
 CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA;
 CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA;
 SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass;
-SELECT * FROM mvtest_tm;
+SELECT * FROM mvtest_tm ORDER BY type;
 REFRESH MATERIALIZED VIEW mvtest_tm;
 SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass;
 CREATE UNIQUE INDEX mvtest_tm_type ON mvtest_tm (type);
-SELECT * FROM mvtest_tm;
+SELECT * FROM mvtest_tm ORDER BY type;
 
 -- create various views
 EXPLAIN (costs off)
diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql
index 4dc0745..900aa7e 100644
--- a/src/test/regress/sql/psql.sql
+++ b/src/test/regress/sql/psql.sql
@@ -67,7 +67,7 @@ select 'drop table gexec_test', 'select ''2000-01-01''::date as party_over'
 prepare q as select array_to_string(array_agg(repeat('x',2*n)),E'\n') as "ab
 c", array_to_string(array_agg(repeat('y',20-2*n)),E'\n') as "a
-bc" from generate_series(1,10) as n(n) group by n>1 ;
+bc" from generate_series(1,10) as n(n) group by n>1 order by n>1;
 
 \pset linestyle ascii
diff --git a/src/test/regress/sql/tsrf.sql b/src/test/regress/sql/tsrf.sql
index 4f854c8..cf2fbe3 100644
--- a/src/test/regress/sql/tsrf.sql
+++ b/src/test/regress/sql/tsrf.sql
@@ -56,7 +56,7 @@ SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few;
 SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g;
 
 -- sorting + grouping
-SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5;
+SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5, 1;
 
 -- grouping sets are a bit special, they produce NULLs in columns not actually NULL
 SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab);
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 9ff1551..debd99e 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -4,41 +4,41 @@
 
 -- Simple UNION constructs
 
-SELECT 1 AS two UNION SELECT 2;
+SELECT 1 AS two UNION SELECT 2 ORDER BY 1;
 
-SELECT 1 AS one UNION SELECT 1;
+SELECT 1 AS one UNION SELECT 1 ORDER BY 1;
 
 SELECT 1 AS two UNION ALL SELECT 2;
 
 SELECT 1 AS two UNION ALL SELECT 1;
 
-SELECT 1 AS three UNION SELECT 2 UNION SELECT 3;
+SELECT 1 AS three UNION SELECT 2 UNION SELECT 3 ORDER BY 1;
 
-SELECT 1 AS two UNION SELECT 2 UNION SELECT 2;
+SELECT 1 AS two UNION SELECT 2 UNION SELECT 2 ORDER BY 1;
 
-SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2;
+SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1;
 
-SELECT 1.1 AS two UNION SELECT 2.2;
+SELECT 1.1 AS two UNION SELECT 2.2 ORDER BY 1;
 
 -- Mixed types
 
-SELECT 1.1 AS two UNION SELECT 2;
+SELECT 1.1 AS two UNION SELECT 2 ORDER BY 1;
 
-SELECT 1 AS two UNION SELECT 2.2;
+SELECT 1 AS two UNION SELECT 2.2 ORDER BY 1;
 
-SELECT 1 AS one UNION SELECT 1.0::float8;
+SELECT 1 AS one UNION SELECT 1.0::float8 ORDER BY 1;
 
-SELECT 1.1 AS two UNION ALL SELECT 2;
+SELECT 1.1 AS two UNION ALL SELECT 2 ORDER BY 1;
 
-SELECT 1.0::float8 AS two UNION ALL SELECT 1;
+SELECT 1.0::float8 AS two UNION ALL SELECT 1 ORDER BY 1;
 
-SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3;
+SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3 ORDER BY 1;
 
 SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8 ORDER BY 1;
 
-SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2;
+SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1;
 
-SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2);
+SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2) ORDER BY 1;
 
 --
 -- Try testing from tables...
@@ -66,7 +66,8 @@ SELECT f1 AS five FROM FLOAT8_TBL
   WHERE f1 BETWEEN -1e6 AND 1e6
 UNION
 SELECT f1 FROM INT4_TBL
-  WHERE f1 BETWEEN 0 AND 1000000;
+  WHERE f1 BETWEEN 0 AND 1000000
+ORDER BY 1;
 
 SELECT CAST(f1 AS char(4)) AS three FROM VARCHAR_TBL
 UNION
@@ -93,9 +94,9 @@ ORDER BY 1;
 -- INTERSECT and EXCEPT
 --
 
-SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl ORDER BY 1;
 
-SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
 
 SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
 
@@ -103,11 +104,11 @@ SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
 
 SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;
 
-SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY 1;
 
-SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl ORDER BY 1;
 
-SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl ORDER BY 1;
 
 SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl FOR NO KEY UPDATE;
 
@@ -115,7 +116,7 @@ SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl FOR NO KEY UPDATE;
 -- Mixed types
 --
 
-SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl ORDER BY 1;
 
 SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
 
@@ -123,11 +124,11 @@ SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
 -- Operator precedence and (((((extra))))) parentheses
 --
 
-SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl;
+SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1;
 
-SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) ORDER BY 1;
 
-(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl;
 
 SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
 
@@ -147,7 +148,7 @@ ORDER BY q2,q1;
 SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1;
 
 -- But this should work:
-SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1)));
+SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1;
 
 --
 -- New syntaxes (7.1) permit new tests
@@ -261,13 +262,15 @@ SELECT * FROM
   (SELECT 1 AS t, 2 AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x < 4;
+WHERE x < 4
+ORDER BY x;
 
 SELECT * FROM
   (SELECT 1 AS t, 2 AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x < 4;
+WHERE x < 4
+ORDER BY x;
 
 explain (costs off)
 SELECT * FROM
@@ -289,13 +292,15 @@ SELECT * FROM
   (SELECT 1 AS t, (random()*3)::int AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x > 3;
+WHERE x > 3
+ORDER BY x;
 
 SELECT * FROM
   (SELECT 1 AS t, (random()*3)::int AS x
    UNION
    SELECT 2 AS t, 4 AS x) ss
-WHERE x > 3;
+WHERE x > 3
+ORDER BY x;
 
 -- Test proper handling of parameterized appendrel paths when the
 -- potential join qual is expensive
diff --git a/src/test/regress/sql/with.sql b/src/test/regress/sql/with.sql
index 133ff0b..7ee32ba 100644
--- a/src/test/regress/sql/with.sql
+++ b/src/test/regress/sql/with.sql
@@ -575,7 +575,7 @@ WITH outermost(x) AS (
     SELECT * FROM innermost
     UNION SELECT 3)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 
 WITH outermost(x) AS (
   SELECT 1
@@ -583,7 +583,7 @@ WITH outermost(x) AS (
   SELECT * FROM outermost  -- fail
   UNION SELECT * FROM innermost)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 
 WITH RECURSIVE outermost(x) AS (
   SELECT 1
@@ -591,14 +591,14 @@ WITH RECURSIVE outermost(x) AS (
   SELECT * FROM outermost
   UNION SELECT * FROM innermost)
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 
 WITH RECURSIVE outermost(x) AS (
   WITH innermost as (SELECT 2 FROM outermost) -- fail
     SELECT * FROM innermost
   UNION SELECT * from outermost
 )
-SELECT * FROM outermost;
+SELECT * FROM outermost ORDER BY 1;
 
 --
 -- This test will fail with the old implementation of PARAM_EXEC parameter
-- 
2.9.3
>From 4d7227bf2ef7f48609bd37a667dc8d7934985581 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 19 Jul 2016 14:10:28 -0700
Subject: [PATCH 3/5] Add a macro customizable hashtable.

dynahash.c hash tables aren't quite fast enough for some use-cases. There
are several reasons for lacking performance:
- the use of chaining for collision handling makes them cache inefficient,
  that's especially an issue when the tables get bigger.
- as the element sizes for dynahash are only determined at runtime, offset
  computations are somewhat expensive
- hash and element comparisons are indirect function calls, causing
  unnecessary pipeline stalls
- its two-level structure has some benefits (somewhat natural
  partitioning), but increases the number of indirections

To fix several of these, the hash tables have to be adjusted to the
individual use-case at compile-time. C unfortunately doesn't provide a
good way to do compile-time code generation (like e.g. C++'s templates,
for all their weaknesses, do). Thus the somewhat ugly approach taken here
is to allow for code generation using a macro-templatized header file,
which generates functions and types based on a prefix and other
parameters.

Later patches use this infrastructure to use such hash tables for
tidbitmap.c (bitmap scans) and execGrouping.c (hash aggregation, ...). In
queries where these use up a large fraction of the time, this has been
measured to lead to performance improvements of over 100%. There are
other cases where this could be useful (e.g. catcache.c).

The hash table design chosen is a variant of linear open-addressing. The
biggest disadvantages of simple linear addressing schemes are highly
variable lookup times due to clustering, and deletions leaving a lot of
tombstones around. To address these issues a variant of "robin hood"
hashing is employed. Robin hood hashing optimizes chaining lengths by
moving elements close to their optimal bucket ("rich" elements) out of
the way if a to-be-inserted element is further away from its optimal
position (i.e. it's "poor"). While that can make insertions slower, the
average lookup performance is a lot better, and higher fill factors can
be used in a still performant manner. To avoid tombstones - which
normally solve the issue that a deleted node's presence is relevant to
determine whether a lookup needs to continue looking or is done - buckets
following a deleted element are shifted backwards, unless they're empty
or already at their optimal position.

Reviewed-By: Tomas Vondra
Discussion: <20160727004333.r3e2k2y6fvk2n...@alap3.anarazel.de>
---
 src/include/lib/simplehash.h | 877 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 877 insertions(+)
 create mode 100644 src/include/lib/simplehash.h

diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
new file mode 100644
index 0000000..0b28a58
--- /dev/null
+++ b/src/include/lib/simplehash.h
@@ -0,0 +1,877 @@
+/*
+ * simplehash.h
+ *
+ *	  Hash table implementation which will be specialized to user-defined
+ *	  types, by including this file to generate the required code. It's
+ *	  probably not worthwhile to do so for hash tables that aren't
+ *	  performance or space sensitive.
+ *
+ * Usage notes:
+ *
+ * To generate a hash-table and associated functions for a use case several
+ * macros have to be #define'ed before this file is included. Including
+ * the file #undef's all those, so a new hash table can be generated
+ * afterwards.
+ * The relevant parameters are:
+ * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
+ *   will result in hash table type 'foo_hash' and functions like
+ *   'foo_insert'/'foo_lookup' and so forth.
+ * - SH_ELEMENT_TYPE - type of the contained elements
+ * - SH_KEY_TYPE - type of the hashtable's key
+ * - SH_DECLARE - if defined function prototypes and type declarations are
+ *   generated
+ * - SH_DEFINE - if defined function definitions are generated
+ * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
+ *   declarations reside
+ * The following parameters are only relevant when SH_DEFINE is defined:
+ * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
+ * - SH_EQUAL(table, a, b) - compare two table keys
+ * - SH_HASH_KEY(table, key) - generate hash for the key
+ * - SH_STORE_HASH - if defined the hash is stored in the elements
+ * - SH_GET_HASH(tb, a) - return the field to store the hash in
+ *
+ * For examples of usage look at tidbitmap.c (file local definition) and
+ * execnodes.h/execGrouping.c (exposed declaration, file local
+ * implementation).
+ *
+ * Hash table design:
+ *
+ * The hash table design chosen is a variant of linear open-addressing. The
+ * reason for doing so is that linear addressing is CPU cache & pipeline
+ * friendly. The biggest disadvantages of simple linear addressing schemes
+ * are highly variable lookup times due to clustering, and deletions
+ * leaving a lot of tombstones around. To address these issues a variant
+ * of "robin hood" hashing is employed. Robin hood hashing optimizes
+ * chaining lengths by moving elements close to their optimal bucket
+ * ("rich" elements) out of the way if a to-be-inserted element is further
+ * away from its optimal position (i.e. it's "poor"). While that can make
+ * insertions slower, the average lookup performance is a lot better, and
+ * higher fill factors can be used in a still performant manner. To avoid
+ * tombstones - which normally solve the issue that a deleted node's
+ * presence is relevant to determine whether a lookup needs to continue
+ * looking or is done - buckets following a deleted element are shifted
+ * backwards, unless they're empty or already at their optimal position.
+ */
+
+/* helpers */
+#define SH_MAKE_PREFIX(a) CppConcat(a,_)
+#define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
+#define SH_MAKE_NAME_(a,b) CppConcat(a,b)
+
+/* name macros for: */
+
+/* type declarations */
+#define SH_TYPE SH_MAKE_NAME(hash)
+#define SH_STATUS SH_MAKE_NAME(status)
+#define SH_STATUS_EMPTY SH_MAKE_NAME(EMPTY)
+#define SH_STATUS_IN_USE SH_MAKE_NAME(IN_USE)
+#define SH_ITERATOR SH_MAKE_NAME(iterator)
+
+/* function declarations */
+#define SH_CREATE SH_MAKE_NAME(create)
+#define SH_DESTROY SH_MAKE_NAME(destroy)
+#define SH_INSERT SH_MAKE_NAME(insert)
+#define SH_DELETE SH_MAKE_NAME(delete)
+#define SH_LOOKUP SH_MAKE_NAME(lookup)
+#define SH_GROW SH_MAKE_NAME(grow)
+#define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
+#define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
+#define SH_ITERATE SH_MAKE_NAME(iterate)
+#define SH_STAT SH_MAKE_NAME(stat)
+
+/* internal helper functions (no externally visible prototypes) */
+#define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
+#define SH_NEXT SH_MAKE_NAME(next)
+#define SH_PREV SH_MAKE_NAME(prev)
+#define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
+#define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
+#define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
+
+/* generate forward declarations necessary to use the hash table */
+#ifdef SH_DECLARE
+
+/* type definitions */
+typedef struct SH_TYPE
+{
+	/*
+	 * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
+	 * tables. Note that the maximum number of elements is lower
+	 * (SH_MAX_FILLFACTOR)
+	 */
+	uint64		size;
+
+	/* how many elements have valid contents */
+	uint32		members;
+
+	/* mask for bucket and size calculations, based on size */
+	uint32		sizemask;
+
+	/* boundary after which to grow hashtable */
+	uint32		grow_threshold;
+
+	/* hash buckets */
+	SH_ELEMENT_TYPE *data;
+
+	/* memory context to use for allocations */
+	MemoryContext ctx;
+
+	/* user defined data, useful for callbacks */
+	void	   *private;
+} SH_TYPE;
+
+typedef enum SH_STATUS
+{
+	SH_STATUS_EMPTY = 0x00,
+	SH_STATUS_IN_USE = 0x01
+} SH_STATUS;
+
+typedef struct SH_ITERATOR
+{
+	uint32		cur;			/* current element */
+	uint32		end;
+	bool		done;			/* iterator exhausted? */
+} SH_ITERATOR;
+
+/* externally visible function prototypes */
+SH_SCOPE SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements);
+SH_SCOPE void SH_DESTROY(SH_TYPE *tb);
+SH_SCOPE void SH_GROW(SH_TYPE *tb, uint32 newsize);
+SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE *tb, SH_KEY_TYPE key, bool *found);
+SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE *tb, SH_KEY_TYPE key);
+SH_SCOPE bool SH_DELETE(SH_TYPE *tb, SH_KEY_TYPE key);
+SH_SCOPE void SH_START_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter);
+SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE *tb, SH_ITERATOR *iter, uint32 at);
+SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter);
+SH_SCOPE void SH_STAT(SH_TYPE *tb);
+
+#endif   /* SH_DECLARE */
+
+
+/* generate implementation of the hash table */
+#ifdef SH_DEFINE
+
+#include "utils/memutils.h"
+
+/* conservative fillfactor for a robin hood table, might want to adjust */
+#define SH_FILLFACTOR (0.8)
+/* increase fillfactor if we otherwise would error out */
+#define SH_MAX_FILLFACTOR (0.95)
+/* max data array size, we allow up to PG_UINT32_MAX buckets, including 0 */
+#define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
+
+#ifdef SH_STORE_HASH
+#define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
+#else
+#define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey))
+#endif
+
+/* FIXME: can we move these to a central location? */
+
+/* calculate ceil(log base 2) of num */
+static inline uint64
+sh_log2(uint64 num)
+{
+	int			i;
+	uint64		limit;
+
+	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
+		;
+	return i;
+}
+
+/* calculate first power of 2 >= num */
+static inline uint64
+sh_pow2(uint64 num)
+{
+	return ((uint64) 1) << sh_log2(num);
+}
+
+/*
+ * Compute sizing parameters for hashtable. Called when creating and growing
+ * the hashtable.
+ */
+static inline void
+SH_COMPUTE_PARAMETERS(SH_TYPE *tb, uint32 newsize)
+{
+	uint64		size;
+
+	/* supporting zero sized hashes would complicate matters */
+	size = Max(newsize, 2);
+
+	/* round up size to the next power of 2, that's how the bucketing works */
+	size = sh_pow2(size);
+	Assert(size <= SH_MAX_SIZE);
+
+	/*
+	 * Verify that allocation of ->data is possible on this platform, without
+	 * overflowing Size.
+	 */
+	if ((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= MaxAllocHugeSize)
+		elog(ERROR, "hash table too large");
+
+	/* now set size */
+	tb->size = size;
+
+	if (tb->size == SH_MAX_SIZE)
+		tb->sizemask = 0;
+	else
+		tb->sizemask = tb->size - 1;
+
+	/*
+	 * Compute growth threshold here and after growing the table, to make
+	 * computations during insert cheaper.
+	 */
+	if (tb->size == SH_MAX_SIZE)
+		tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
+	else
+		tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
+}
+
+/* return the optimal bucket for the hash */
+static inline uint32
+SH_INITIAL_BUCKET(SH_TYPE *tb, uint32 hash)
+{
+	return hash & tb->sizemask;
+}
+
+/* return next bucket after the current, handling wraparound */
+static inline uint32
+SH_NEXT(SH_TYPE *tb, uint32 curelem, uint32 startelem)
+{
+	curelem = (curelem + 1) & tb->sizemask;
+
+	Assert(curelem != startelem);
+
+	return curelem;
+}
+
+/* return bucket before the current, handling wraparound */
+static inline uint32
+SH_PREV(SH_TYPE *tb, uint32 curelem, uint32 startelem)
+{
+	curelem = (curelem - 1) & tb->sizemask;
+
+	Assert(curelem != startelem);
+
+	return curelem;
+}
+
+/* return distance between bucket and its optimal position */
+static inline uint32
+SH_DISTANCE_FROM_OPTIMAL(SH_TYPE *tb, uint32 optimal, uint32 bucket)
+{
+	if (optimal <= bucket)
+		return bucket - optimal;
+	else
+		return (tb->size + bucket) - optimal;
+}
+
+static inline uint32
+SH_ENTRY_HASH(SH_TYPE *tb, SH_ELEMENT_TYPE *entry)
+{
+#ifdef SH_STORE_HASH
+	return SH_GET_HASH(tb, entry);
+#else
+	return SH_HASH_KEY(tb, entry->SH_KEY);
+#endif
+}
+
+/*
+ * Create a hash table with enough space for `nelements` distinct members,
+ * allocating required memory in the passed-in context.
+ */
+SH_SCOPE SH_TYPE *
+SH_CREATE(MemoryContext ctx, uint32 nelements)
+{
+	SH_TYPE    *tb;
+	uint64		size;
+
+	tb = MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
+	tb->ctx = ctx;
+
+	/* increase nelements by fillfactor, want to store nelements elements */
+	size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
+
+	SH_COMPUTE_PARAMETERS(tb, size);
+
+	tb->data = MemoryContextAllocExtended(
+					tb->ctx, sizeof(SH_ELEMENT_TYPE) * tb->size,
+					MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+	return tb;
+}
+
+/* destroy a previously created hash table */
+SH_SCOPE void
+SH_DESTROY(SH_TYPE *tb)
+{
+	pfree(tb->data);
+	pfree(tb);
+}
+
+/*
+ * Grow a hash table to at least `newsize` buckets.
+ *
+ * Usually this will automatically be called by insertions/deletions, when
+ * necessary. But resizing to the exact input size can be advantageous
+ * performance-wise, when known at some point.
+ */
+SH_SCOPE void __attribute__((noinline))
+SH_GROW(SH_TYPE *tb, uint32 newsize)
+{
+	uint64		oldsize = tb->size;
+	SH_ELEMENT_TYPE *olddata = tb->data;
+	SH_ELEMENT_TYPE *newdata;
+	uint32		i;
+	uint32		startelem = 0;
+	uint32		copyelem;
+
+	Assert(oldsize == sh_pow2(oldsize));
+	Assert(oldsize != SH_MAX_SIZE);
+	Assert(oldsize < newsize);
+
+	/* compute parameters for new table */
+	SH_COMPUTE_PARAMETERS(tb, newsize);
+
+	tb->data = MemoryContextAllocExtended(
+					tb->ctx, sizeof(SH_ELEMENT_TYPE) * tb->size,
+					MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+	newdata = tb->data;
+
+	/*
+	 * Copy entries from the old data to newdata. We theoretically could use
+	 * SH_INSERT here, to avoid code duplication, but that's more general
+	 * than we need. We neither want tb->members increased, nor do we need
+	 * to deal with deleted elements, nor do we need to compare keys. So a
+	 * special-cased implementation is a lot faster. As resizing can be time
+	 * consuming and frequent, that's worthwhile to optimize.
+	 *
+	 * To be able to simply move entries over, we have to start not at the
+	 * first bucket (i.e. olddata[0]), but find the first bucket that's
+	 * either empty, or is occupied by an entry at its optimal position.
+	 * Such a bucket has to exist in any table with a load factor under 1,
+	 * as not all buckets are occupied, i.e. there always has to be an
+	 * empty bucket. By starting at such a bucket we can move the entries
+	 * to the larger table, without having to deal with conflicts.
+	 */
+
+	/* search for the first element in the hash that's not wrapped around */
+	for (i = 0; i < oldsize; i++)
+	{
+		SH_ELEMENT_TYPE *oldentry = &olddata[i];
+		uint32		hash;
+		uint32		optimal;
+
+		if (oldentry->status != SH_STATUS_IN_USE)
+		{
+			startelem = i;
+			break;
+		}
+
+		hash = SH_ENTRY_HASH(tb, oldentry);
+		optimal = SH_INITIAL_BUCKET(tb, hash);
+
+		if (optimal == i)
+		{
+			startelem = i;
+			break;
+		}
+	}
+
+	/* and copy all elements in the old table */
+	copyelem = startelem;
+	for (i = 0; i < oldsize; i++)
+	{
+		SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
+
+		if (oldentry->status == SH_STATUS_IN_USE)
+		{
+			uint32		hash;
+			uint32		startelem;
+			uint32		curelem;
+			SH_ELEMENT_TYPE *newentry;
+
+			hash = SH_ENTRY_HASH(tb, oldentry);
+			startelem = SH_INITIAL_BUCKET(tb, hash);
+			curelem = startelem;
+
+			/* find empty element to put data into */
+			while (true)
+			{
+				newentry = &newdata[curelem];
+
+				if (newentry->status == SH_STATUS_EMPTY)
+				{
+					break;
+				}
+
+				curelem = SH_NEXT(tb, curelem, startelem);
+			}
+
+			/* copy entry to new slot */
+			memcpy(newentry, oldentry, sizeof(SH_ELEMENT_TYPE));
+		}
+
+		/* can't use SH_NEXT here, would use new size */
+		copyelem++;
+		if (copyelem >= oldsize)
+		{
+			copyelem = 0;
+		}
+	}
+
+	pfree(olddata);
+}
+
+/*
+ * Insert the key into the hash-table, set *found to true if the key
+ * already exists, false otherwise. Returns the hash-table entry in either
+ * case.
+ */
+SH_SCOPE SH_ELEMENT_TYPE *
+SH_INSERT(SH_TYPE *tb, SH_KEY_TYPE key, bool *found)
+{
+	uint32		hash = SH_HASH_KEY(tb, key);
+	uint32		startelem;
+	uint32		curelem;
+	SH_ELEMENT_TYPE *data;
+	uint32		insertdist = 0;
+
+	/*
+	 * We do the grow check even if the key is actually present, to avoid
+	 * doing the check inside the loop. This also lets us avoid having to
+	 * re-find our position in the hashtable after resizing.
+	 */
+	if (unlikely(tb->members >= tb->grow_threshold))
+	{
+		if (tb->size == SH_MAX_SIZE)
+		{
+			elog(ERROR, "hash table size exceeded");
+		}
+
+		/*
+		 * When optimizing, it can be very useful to print these out.
+		 */
+		/* SH_STAT(tb); */
+		SH_GROW(tb, tb->size * 2);
+		/* SH_STAT(tb); */
+	}
+
+	/* perform insert, start bucket search at optimal location */
+	data = tb->data;
+	startelem = SH_INITIAL_BUCKET(tb, hash);
+	curelem = startelem;
+	while (true)
+	{
+		uint32		curdist;
+		uint32		curhash;
+		uint32		curoptimal;
+		SH_ELEMENT_TYPE *entry = &data[curelem];
+
+		/* any empty bucket can directly be used */
+		if (entry->status == SH_STATUS_EMPTY)
+		{
+			tb->members++;
+			entry->SH_KEY = key;
+#ifdef SH_STORE_HASH
+			SH_GET_HASH(tb, entry) = hash;
+#endif
+			entry->status = SH_STATUS_IN_USE;
+			*found = false;
+			return entry;
+		}
+
+		/*
+		 * If the bucket is not empty, we either found a match (in which
+		 * case we're done), or we have to decide whether to skip over or
+		 * move the colliding entry. When the colliding element's distance
+		 * to its optimal position is smaller than the to-be-inserted
+		 * entry's, we shift the colliding entry (and its followers)
+		 * forward by one.
+		 */
+
+		if (SH_COMPARE_KEYS(tb, hash, key, entry))
+		{
+			Assert(entry->status == SH_STATUS_IN_USE);
+			*found = true;
+			return entry;
+		}
+
+		curhash = SH_ENTRY_HASH(tb, entry);
+		curoptimal = SH_INITIAL_BUCKET(tb, curhash);
+		curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem);
+
+		if (insertdist > curdist)
+		{
+			SH_ELEMENT_TYPE *lastentry = entry;
+			uint32		emptyelem = curelem;
+			uint32		moveelem;
+
+			/* find next empty bucket */
+			while (true)
+			{
+				SH_ELEMENT_TYPE *emptyentry;
+
+				emptyelem = SH_NEXT(tb, emptyelem, startelem);
+				emptyentry = &data[emptyelem];
+
+				if (emptyentry->status == SH_STATUS_EMPTY)
+				{
+					lastentry = emptyentry;
+					break;
+				}
+			}
+
+			/* shift forward, starting at last occupied element */
+
+			/*
+			 * TODO: This could be optimized to be one memcpy in many cases,
+			 * excepting wrapping around at the end of ->data. Hasn't shown
+			 * up in profiles so far though.
+			 */
+			moveelem = emptyelem;
+			while (moveelem != curelem)
+			{
+				SH_ELEMENT_TYPE *moveentry;
+
+				moveelem = SH_PREV(tb, moveelem, startelem);
+				moveentry = &data[moveelem];
+
+				memcpy(lastentry, moveentry, sizeof(SH_ELEMENT_TYPE));
+				lastentry = moveentry;
+			}
+
+			/* and fill the now empty spot */
+			tb->members++;
+
+			entry->SH_KEY = key;
+#ifdef SH_STORE_HASH
+			SH_GET_HASH(tb, entry) = hash;
+#endif
+			entry->status = SH_STATUS_IN_USE;
+			*found = false;
+			return entry;
+		}
+
+		curelem = SH_NEXT(tb, curelem, startelem);
+		insertdist++;
+	}
+}
+
+/*
+ * Look up entry in hash table. Returns NULL if key not present.
+ */
+SH_SCOPE SH_ELEMENT_TYPE *
+SH_LOOKUP(SH_TYPE *tb, SH_KEY_TYPE key)
+{
+	uint32		hash = SH_HASH_KEY(tb, key);
+	const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
+	uint32		curelem = startelem;
+
+	while (true)
+	{
+		SH_ELEMENT_TYPE *entry = &tb->data[curelem];
+
+		if (entry->status == SH_STATUS_EMPTY)
+		{
+			return NULL;
+		}
+
+		Assert(entry->status == SH_STATUS_IN_USE);
+
+		if (SH_COMPARE_KEYS(tb, hash, key, entry))
+			return entry;
+
+		/*
+		 * TODO: we could stop the search based on distance. If the current
+		 * bucket's distance-from-optimal is smaller than what we've skipped
+		 * already, the entry doesn't exist. Probably only do so if
+		 * SH_STORE_HASH is defined, to avoid re-computing hashes?
+		 */
+
+		curelem = SH_NEXT(tb, curelem, startelem);
+	}
+}
+
+/*
+ * Delete entry from hash table. Returns whether the to-be-deleted key was
+ * present.
+ */
+SH_SCOPE bool
+SH_DELETE(SH_TYPE *tb, SH_KEY_TYPE key)
+{
+	uint32		hash = SH_HASH_KEY(tb, key);
+	uint32		startelem = SH_INITIAL_BUCKET(tb, hash);
+	uint32		curelem = startelem;
+
+	while (true)
+	{
+		SH_ELEMENT_TYPE *entry = &tb->data[curelem];
+
+		if (entry->status == SH_STATUS_EMPTY)
+			return false;
+
+		if (entry->status == SH_STATUS_IN_USE &&
+			SH_COMPARE_KEYS(tb, hash, key, entry))
+		{
+			SH_ELEMENT_TYPE *lastentry = entry;
+
+			tb->members--;
+
+			/*
+			 * Backward shift following elements till either:
+			 * a) an empty element
+			 * b) an element at its optimal position
+			 * is encountered.
+			 *
+			 * While that sounds expensive, the average chain length is
+			 * short, and deletions would otherwise require tombstones.
+			 */
+			while (true)
+			{
+				SH_ELEMENT_TYPE *curentry;
+				uint32		curhash;
+				uint32		curoptimal;
+
+				curelem = SH_NEXT(tb, curelem, startelem);
+				curentry = &tb->data[curelem];
+
+				if (curentry->status != SH_STATUS_IN_USE)
+				{
+					lastentry->status = SH_STATUS_EMPTY;
+					break;
+				}
+
+				curhash = SH_ENTRY_HASH(tb, curentry);
+				curoptimal = SH_INITIAL_BUCKET(tb, curhash);
+
+				/* current is at optimal position, done */
+				if (curoptimal == curelem)
+				{
+					lastentry->status = SH_STATUS_EMPTY;
+					break;
+				}
+
+				/* shift */
+				memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
+
+				lastentry = curentry;
+			}
+
+			return true;
+		}
+
+		/* TODO: return false; if distance too big */
+
+		curelem = SH_NEXT(tb, curelem, startelem);
+	}
+}
+
+/*
+ * Initialize iterator.
+ */
+SH_SCOPE void
+SH_START_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter)
+{
+	int			i;
+	uint32		startelem = 0;
+
+	/*
+	 * Search for the first empty element. As deletions during iterations
+	 * are supported, we want to start/end at an element that cannot be
+	 * affected by elements being shifted.
+	 */
+	for (i = 0; i < tb->size; i++)
+	{
+		SH_ELEMENT_TYPE *entry = &tb->data[i];
+
+		if (entry->status != SH_STATUS_IN_USE)
+		{
+			startelem = i;
+			break;
+		}
+	}
+
+	/*
+	 * Iterate backwards; that allows the current element to be deleted,
+	 * even if there are backward shifts.
+	 */
+	iter->cur = startelem;
+	iter->end = iter->cur;
+	iter->done = false;
+}
+
+/*
+ * Initialize iterator to a specific bucket. That's really only useful for
+ * cases where callers are partially iterating over the hashspace, and that
+ * iteration deletes and inserts elements based on visited entries. Doing
+ * that repeatedly could lead to an unbalanced keyspace when always starting
+ * at the same position.
+ */
+SH_SCOPE void
+SH_START_ITERATE_AT(SH_TYPE *tb, SH_ITERATOR *iter, uint32 at)
+{
+	/*
+	 * Iterate backwards; that allows the current element to be deleted,
+	 * even if there are backward shifts.
+	 */
+	iter->cur = at & tb->sizemask;		/* ensure at is within a valid range */
+	iter->end = iter->cur;
+	iter->done = false;
+}
+
+/*
+ * Iterate over all entries in the hash-table. Return the next occupied
+ * entry, or NULL if done.
+ *
+ * During iteration the current entry in the hash table may be deleted,
+ * without leading to elements being skipped or returned twice. Additionally
+ * the rest of the table may be modified (i.e. there can be insertions or
+ * deletions), but if so, there's neither a guarantee that all nodes are
+ * visited at least once, nor a guarantee that a node is visited at most
+ * once.
+ */
+SH_SCOPE SH_ELEMENT_TYPE *
+SH_ITERATE(SH_TYPE *tb, SH_ITERATOR *iter)
+{
+	while (!iter->done)
+	{
+		SH_ELEMENT_TYPE *elem;
+
+		elem = &tb->data[iter->cur];
+
+		/* next element in backward direction */
+		iter->cur = (iter->cur - 1) & tb->sizemask;
+
+		if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
+			iter->done = true;
+		if (elem->status == SH_STATUS_IN_USE)
+		{
+			return elem;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Report some statistics about the state of the hashtable. For
+ * debugging/profiling purposes only.
+ */
+SH_SCOPE void
+SH_STAT(SH_TYPE *tb)
+{
+	uint32		max_chain_length = 0;
+	uint32		total_chain_length = 0;
+	double		avg_chain_length;
+	double		fillfactor;
+	uint32		i;
+
+	uint32	   *collisions = palloc0(tb->size * sizeof(uint32));
+	uint32		total_collisions = 0;
+	uint32		max_collisions = 0;
+	double		avg_collisions;
+
+	for (i = 0; i < tb->size; i++)
+	{
+		uint32		hash;
+		uint32		optimal;
+		uint32		dist;
+		SH_ELEMENT_TYPE *elem;
+
+		elem = &tb->data[i];
+
+		if (elem->status != SH_STATUS_IN_USE)
+			continue;
+
+		hash = SH_ENTRY_HASH(tb, elem);
+		optimal = SH_INITIAL_BUCKET(tb, hash);
+		dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
+
+		if (dist > max_chain_length)
+			max_chain_length = dist;
+		total_chain_length += dist;
+
+		collisions[optimal]++;
+	}
+
+	for (i = 0; i < tb->size; i++)
+	{
+		uint32		curcoll = collisions[i];
+
+		if (curcoll == 0)
+			continue;
+
+		/* single contained element is not a collision */
+		curcoll--;
+		total_collisions += curcoll;
+		if (curcoll > max_collisions)
+			max_collisions = curcoll;
+	}
+
+	if (tb->members > 0)
+	{
+		fillfactor = tb->members / ((double) tb->size);
+		avg_chain_length = ((double) total_chain_length) / tb->members;
+		avg_collisions = ((double) total_collisions) / tb->members;
+	}
+	else
+	{
+		fillfactor = 0;
+		avg_chain_length = 0;
+		avg_collisions = 0;
+	}
+
+	elog(LOG, "size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %u, avg_collisions: %f",
+		 tb->size, tb->members, fillfactor, total_chain_length,
+		 max_chain_length, avg_chain_length,
+		 total_collisions, max_collisions, avg_collisions);
+}
+
+#endif   /* SH_DEFINE */
+
+
+/* undefine external parameters, so the next hash table can be defined */
+#undef SH_PREFIX
+#undef SH_KEY_TYPE
+#undef SH_KEY
+#undef SH_ELEMENT_TYPE
+#undef SH_HASH_KEY
+#undef SH_SCOPE
+#undef SH_DECLARE
+#undef SH_DEFINE
+#undef SH_GET_HASH
+#undef SH_STORE_HASH
+
+/* undefine locally declared macros */
+#undef SH_MAKE_PREFIX
+#undef SH_MAKE_NAME
+#undef SH_MAKE_NAME_
+#undef SH_FILLFACTOR
+#undef SH_MAX_FILLFACTOR
+#undef SH_MAX_SIZE
+
+/* types */
+#undef SH_TYPE
+#undef SH_STATUS
+#undef SH_STATUS_EMPTY
+#undef SH_STATUS_IN_USE
+#undef SH_ITERATOR
+
+/* external function names */
+#undef SH_CREATE
+#undef SH_DESTROY
+#undef SH_INSERT
+#undef SH_DELETE
+#undef SH_LOOKUP
+#undef SH_GROW
+#undef SH_START_ITERATE
+#undef SH_START_ITERATE_AT
+#undef SH_ITERATE
+#undef SH_STAT
+
+/* internal function names */
+#undef SH_COMPUTE_PARAMETERS
+#undef SH_COMPARE_KEYS
+#undef SH_INITIAL_BUCKET
+#undef SH_NEXT
+#undef SH_PREV
+#undef SH_DISTANCE_FROM_OPTIMAL
+#undef SH_ENTRY_HASH
-- 
2.9.3
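As a usage sketch (my illustration, not part of the patch): a file-local
table mapping an int32 key to a counter, following the parameter list in
simplehash.h's header comment. The names inttable, IntTableEntry and
hash_int32 are made up; note that the element type must provide the status
field simplehash uses for bucket state (patch 4 does the same for
PagetableEntry):

typedef struct IntTableEntry
{
	int32		key;			/* hash key, named via SH_KEY below */
	int64		count;			/* user payload */
	char		status;			/* bucket state, required by simplehash */
} IntTableEntry;

/* finalizer-style mixer, same shape as patch 4's hash_blockno() */
static inline uint32
hash_int32(uint32 h)
{
	h ^= h >> 16;
	h *= 0x85ebca6b;
	h ^= h >> 13;
	h *= 0xc2b2ae35;
	h ^= h >> 16;
	return h;
}

#define SH_PREFIX inttable
#define SH_ELEMENT_TYPE IntTableEntry
#define SH_KEY_TYPE int32
#define SH_KEY key
#define SH_HASH_KEY(tb, key) hash_int32((uint32) key)
#define SH_EQUAL(tb, a, b) a == b
#define SH_SCOPE static inline
#define SH_DECLARE
#define SH_DEFINE
#include "lib/simplehash.h"

/* ... which generates inttable_create(), inttable_insert() and friends: */
static void
count_value(inttable_hash *tb, int32 value)
{
	bool		found;
	IntTableEntry *entry = inttable_insert(tb, value, &found);

	if (!found)
		entry->count = 0;		/* initialize payload for a new entry */
	entry->count++;
}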
>From 0aa179e159a2be6528a749051b334b3f8d63f26e Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 7 Jul 2016 14:47:19 -0700 Subject: [PATCH 4/5] Use more efficient hashtable for tidbitmap.c to speed up bitmap scans. Use the new simplehash.h to speed up tidbitmap.c uses. For bitmap scan heavy queries speedups of over 100% have been measured. Both lossy and exact scans benefit, but the wins are bigger for mostly exact scans. The conversion is mostly trivial, except that tbm_lossify() now restarts lossifying at the point it previously stopped. Otherwise the hash table becomes unbalanced because the scan in done in hash-order, leaving the end of the hashtable more densely filled then the beginning. That caused performance issues with dynahash as well, but due to the open chaining they were less pronounced than with the linear adressing from simplehash.h. Reviewed-By: Tomas Vondra Discussion: <20160727004333.r3e2k2y6fvk2n...@alap3.anarazel.de> --- src/backend/nodes/tidbitmap.c | 129 ++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 60 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index dfeb7d5..0026304 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -43,7 +43,6 @@ #include "access/htup_details.h" #include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" -#include "utils/hsearch.h" /* * The maximum number of tuples per page is not large (typically 256 with @@ -97,6 +96,7 @@ typedef struct PagetableEntry { BlockNumber blockno; /* page number (hashtable key) */ + char status; bool ischunk; /* T = lossy storage, F = exact */ bool recheck; /* should the tuples be rechecked? */ bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; @@ -128,12 +128,13 @@ struct TIDBitmap NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ TBMStatus status; /* see codes above */ - HTAB *pagetable; /* hash table of PagetableEntry's */ + struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ int nchunks; /* number of lossy entries in pagetable */ bool iterating; /* tbm_begin_iterate called? */ + uint32 lossify_start; /* offset to start lossifying hashtable */ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* these are valid when iterating is true: */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ @@ -168,6 +169,29 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); static void tbm_lossify(TIDBitmap *tbm); static int tbm_comparator(const void *left, const void *right); +static inline uint32 +hash_blockno(BlockNumber b) +{ + uint32 h = b; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + +#define SH_PREFIX pagetable +#define SH_ELEMENT_TYPE PagetableEntry +#define SH_KEY_TYPE BlockNumber +#define SH_KEY blockno +#define SH_HASH_KEY(tb, key) hash_blockno(key) +#define SH_EQUAL(tb, a, b) a == b +#define SH_SCOPE extern +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + /* * tbm_create - create an initially-empty bitmap @@ -190,17 +214,15 @@ tbm_create(long maxbytes) /* * Estimate number of hashtable entries we can have within maxbytes. 
This - * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a - * pointer per hash entry, which is crude but good enough for our purpose. - * Also count an extra Pointer per entry for the arrays created during - * iteration readout. + * estimates the hash cost as sizeof(PagetableEntry), which is good enough + * for our purpose. Also count an extra Pointer per entry for the arrays + * created during iteration readout. */ - nbuckets = maxbytes / - (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) - + sizeof(Pointer) + sizeof(Pointer)); + nbuckets = maxbytes / (sizeof(PagetableEntry) + sizeof(Pointer)); nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ nbuckets = Max(nbuckets, 16); /* sanity limit */ tbm->maxentries = (int) nbuckets; + tbm->lossify_start = 0; return tbm; } @@ -212,32 +234,24 @@ tbm_create(long maxbytes) static void tbm_create_pagetable(TIDBitmap *tbm) { - HASHCTL hash_ctl; - Assert(tbm->status != TBM_HASH); Assert(tbm->pagetable == NULL); - /* Create the hashtable proper */ - MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(BlockNumber); - hash_ctl.entrysize = sizeof(PagetableEntry); - hash_ctl.hcxt = tbm->mcxt; - tbm->pagetable = hash_create("TIDBitmap", - 128, /* start small and extend */ - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + tbm->pagetable = pagetable_create(tbm->mcxt, 128); - /* If entry1 is valid, push it into the hashtable */ if (tbm->status == TBM_ONE_PAGE) { PagetableEntry *page; bool found; + char oldstatus; - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &tbm->entry1.blockno, - HASH_ENTER, &found); + page = pagetable_insert(tbm->pagetable, + tbm->entry1.blockno, + &found); Assert(!found); + oldstatus = page->status; memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); + page->status = oldstatus; } tbm->status = TBM_HASH; @@ -250,7 +264,7 @@ void tbm_free(TIDBitmap *tbm) { if (tbm->pagetable) - hash_destroy(tbm->pagetable); + pagetable_destroy(tbm->pagetable); if (tbm->spages) pfree(tbm->spages); if (tbm->schunks) @@ -357,12 +371,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b) tbm_union_page(a, &b->entry1); else { - HASH_SEQ_STATUS status; + pagetable_iterator i; PagetableEntry *bpage; Assert(b->status == TBM_HASH); - hash_seq_init(&status, b->pagetable); - while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + pagetable_start_iterate(b->pagetable, &i); + while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) tbm_union_page(a, bpage); } } @@ -449,12 +463,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) } else { - HASH_SEQ_STATUS status; + pagetable_iterator i; PagetableEntry *apage; Assert(a->status == TBM_HASH); - hash_seq_init(&status, a->pagetable); - while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + pagetable_start_iterate(a->pagetable, &i); + while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) { if (tbm_intersect_page(a, apage, b)) { @@ -464,9 +478,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) else a->npages--; a->nentries--; - if (hash_search(a->pagetable, - (void *) &apage->blockno, - HASH_REMOVE, NULL) == NULL) + if (!pagetable_delete(a->pagetable, apage->blockno)) elog(ERROR, "hash table corrupted"); } } @@ -606,7 +618,7 @@ tbm_begin_iterate(TIDBitmap *tbm) */ if (tbm->status == TBM_HASH && !tbm->iterating) { - HASH_SEQ_STATUS status; + pagetable_iterator i; PagetableEntry *page; int npages; int nchunks; @@ -620,9 +632,9 @@ tbm_begin_iterate(TIDBitmap *tbm) MemoryContextAlloc(tbm->mcxt, 
tbm->nchunks * sizeof(PagetableEntry *)); - hash_seq_init(&status, tbm->pagetable); npages = nchunks = 0; - while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + pagetable_start_iterate(tbm->pagetable, &i); + while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) { if (page->ischunk) tbm->schunks[nchunks++] = page; @@ -791,9 +803,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) return page; } - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &pageno, - HASH_FIND, NULL); + page = pagetable_lookup(tbm->pagetable, pageno); if (page == NULL) return NULL; if (page->ischunk) @@ -833,16 +843,15 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) tbm_create_pagetable(tbm); } - /* Look up or create an entry */ - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &pageno, - HASH_ENTER, &found); + page = pagetable_insert(tbm->pagetable, pageno, &found); } /* Initialize it if not present before */ if (!found) { + char oldstatus = page->status; MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; page->blockno = pageno; /* must count it too */ tbm->nentries++; @@ -869,9 +878,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &chunk_pageno, - HASH_FIND, NULL); + + page = pagetable_lookup(tbm->pagetable, chunk_pageno); + if (page != NULL && page->ischunk) { int wordnum = WORDNUM(bitno); @@ -912,9 +921,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) */ if (bitno != 0) { - if (hash_search(tbm->pagetable, - (void *) &pageno, - HASH_REMOVE, NULL) != NULL) + if (pagetable_delete(tbm->pagetable, pageno)) { /* It was present, so adjust counts */ tbm->nentries--; @@ -922,15 +929,14 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) } } - /* Look up or create entry for chunk-header page */ - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &chunk_pageno, - HASH_ENTER, &found); + page = pagetable_insert(tbm->pagetable, chunk_pageno, &found); /* Initialize it if not present before */ if (!found) { + char oldstatus = page->status; MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; page->blockno = chunk_pageno; page->ischunk = true; /* must count it too */ @@ -939,8 +945,10 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) } else if (!page->ischunk) { + char oldstatus = page->status; /* chunk header page was formerly non-lossy, make it lossy */ MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; page->blockno = chunk_pageno; page->ischunk = true; /* we assume it had some tuple bit(s) set, so mark it lossy */ @@ -962,7 +970,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) static void tbm_lossify(TIDBitmap *tbm) { - HASH_SEQ_STATUS status; + pagetable_iterator i; PagetableEntry *page; /* @@ -977,8 +985,8 @@ tbm_lossify(TIDBitmap *tbm) Assert(!tbm->iterating); Assert(tbm->status == TBM_HASH); - hash_seq_init(&status, tbm->pagetable); - while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start); + while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) { if (page->ischunk) continue; /* already a chunk header */ @@ -996,14 +1004,15 @@ tbm_lossify(TIDBitmap *tbm) if (tbm->nentries <= tbm->maxentries / 2) { /* we have done enough */ - hash_seq_term(&status); + tbm->lossify_start = i.cur; break; } /* * Note: 
tbm_mark_page_lossy may have inserted a lossy chunk into the
-	 * hashtable.  We can continue the same seq_search scan since we do
-	 * not care whether we visit lossy chunks or not.
+	 * hashtable and may have deleted the non-lossy chunk.  We can
+	 * continue the same hash table scan, since failure to visit one
+	 * element or visiting the newly inserted element isn't fatal.
 	 */
 	}
 }
-- 
2.9.3
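
A note on the status-byte dance visible in several of the hunks above: simplehash.h keeps its per-entry bookkeeping (the "status" byte) inside the element itself, so any caller that overwrites a whole entry has to save and restore that field. Here is a minimal sketch of the idiom, with an illustrative helper name (the patch open-codes this at each call site):

static void
pagetable_reset_entry(PagetableEntry *page, BlockNumber blockno)
{
	char		oldstatus = page->status;	/* owned by simplehash.h */

	/* wipe the caller-visible fields of the entry... */
	MemSet(page, 0, sizeof(PagetableEntry));

	/* ...but restore the table's bookkeeping, lest the bucket appear free */
	page->status = oldstatus;
	page->blockno = blockno;
}

Clobbering the status byte would corrupt the table's notion of which buckets are in use; that's why tbm_create_pagetable(), tbm_get_pageentry() and tbm_mark_page_lossy() all restore it after their memcpy()/MemSet() calls.
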
>From 548834baff66b90dc73e705e9b107f5f6df68969 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 21 Jul 2016 12:51:14 -0700
Subject: [PATCH 5/5] Use more efficient hashtable for execGrouping.c to speed up hash aggregation.

The more efficient hashtable speeds up hash-aggregations with more than
a few hundred groups significantly. Improvements of over 120% have been
measured.

Due to the different hash table, queries whose results are not fully
determined (e.g. GROUP BY without ORDER BY) may change their result
order.

The conversion is largely straightforward, except that, due to the
static element types of simplehash.h hash tables, the additional data
some users store in elements (e.g. the per-group working data for hash
aggregates) is now stored in TupleHashEntryData->additional. The
meaning of BuildTupleHashTable's entrysize (renamed to additionalsize)
has been changed to only be about the additionally stored size. That
size is only used for the initial sizing of the hash-table.

Reviewed-By: Tomas Vondra
Discussion: <20160727004333.r3e2k2y6fvk2n...@alap3.anarazel.de>
---
 src/backend/executor/execGrouping.c       | 147 +++++++++++------------------
 src/backend/executor/nodeAgg.c            |  61 +++++--------
 src/backend/executor/nodeRecursiveunion.c |  13 +--
 src/backend/executor/nodeSetOp.c          |  43 ++++-----
 src/backend/executor/nodeSubplan.c        |   6 +-
 src/backend/optimizer/plan/planner.c      |   6 ++
 src/include/executor/executor.h           |   2 +-
 src/include/nodes/execnodes.h             |  37 +++++---
 8 files changed, 131 insertions(+), 184 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 808275a..0fa0502 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,12 +23,25 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 
+static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple);
+static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 
-static TupleHashTable CurTupleHashTable = NULL;
-
-static uint32 TupleHashTableHash(const void *key, Size keysize);
-static int TupleHashTableMatch(const void *key1, const void *key2,
-					Size keysize);
+/*
+ * Define parameters for tuple hash table code generation. The interface is
+ * *also* declared in execnodes.h (to generate the types, which are externally
+ * visible).
+ */
+#define SH_PREFIX tuplehash
+#define SH_ELEMENT_TYPE TupleHashEntryData
+#define SH_KEY_TYPE MinimalTuple
+#define SH_KEY firstTuple
+#define SH_HASH_KEY(tb, key) TupleHashTableHash(tb, key)
+#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
+#define SH_SCOPE extern
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a) a->hash
+#define SH_DEFINE
+#include "lib/simplehash.h"
 
 
 /*****************************************************************************
@@ -260,7 +273,7 @@ execTuplesHashPrepare(int numCols,
  * eqfunctions: equality comparison functions to use
  * hashfunctions: datatype-specific hashing functions to use
  * nbuckets: initial estimate of hashtable size
- * entrysize: size of each entry (at least sizeof(TupleHashEntryData))
+ * additionalsize: size of data stored in ->additional
  * tablecxt: memory context in which to store table and table entries
  * tempcxt: short-lived context for evaluation hash and comparison functions
  *
@@ -275,20 +288,18 @@ TupleHashTable
 BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 					FmgrInfo *eqfunctions,
 					FmgrInfo *hashfunctions,
-					long nbuckets, Size entrysize,
+					long nbuckets, Size additionalsize,
 					MemoryContext tablecxt, MemoryContext tempcxt)
 {
 	TupleHashTable hashtable;
-	HASHCTL		hash_ctl;
-
+	Size		entrysize = sizeof(TupleHashEntryData) + additionalsize;
 
 	Assert(nbuckets > 0);
-	Assert(entrysize >= sizeof(TupleHashEntryData));
 
 	/* Limit initial table size request to not more than work_mem */
 	nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize));
 
-	hashtable = (TupleHashTable) MemoryContextAlloc(tablecxt,
-												 sizeof(TupleHashTableData));
+	hashtable = (TupleHashTable)
+		MemoryContextAlloc(tablecxt, sizeof(TupleHashTableData));
 
 	hashtable->numCols = numCols;
 	hashtable->keyColIdx = keyColIdx;
@@ -302,15 +313,8 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	hashtable->in_hash_funcs = NULL;
 	hashtable->cur_eq_funcs = NULL;
 
-	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(TupleHashEntryData);
-	hash_ctl.entrysize = entrysize;
-	hash_ctl.hash = TupleHashTableHash;
-	hash_ctl.match = TupleHashTableMatch;
-	hash_ctl.hcxt = tablecxt;
-	hashtable->hashtab = hash_create("TupleHashTable", nbuckets,
-									 &hash_ctl,
-					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
+	hashtable->hashtab = tuplehash_create(tablecxt, nbuckets);
+	hashtable->hashtab->private = hashtable;
 
 	return hashtable;
 }
@@ -324,18 +328,17 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
  *
  * If isnew isn't NULL, then a new entry is created if no existing entry
  * matches.  On return, *isnew is true if the entry is newly created,
- * false if it existed already.  Any extra space in a new entry has been
- * zeroed.
+ * false if it existed already.  ->additional in the new entry has
+ * been zeroed.
  */
 TupleHashEntry
 LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 					 bool *isnew)
 {
-	TupleHashEntry entry;
+	TupleHashEntryData *entry;
 	MemoryContext oldContext;
-	TupleHashTable saveCurHT;
-	TupleHashEntryData dummy;
 	bool		found;
+	MinimalTuple key;
 
 	/* If first time through, clone the input slot to make table slot */
 	if (hashtable->tableslot == NULL)
@@ -358,26 +361,17 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 
 	/*
 	 * Set up data needed by hash and match functions
-	 *
-	 * We save and restore CurTupleHashTable just in case someone manages to
-	 * invoke this code re-entrantly.
*/ hashtable->inputslot = slot; hashtable->in_hash_funcs = hashtable->tab_hash_funcs; hashtable->cur_eq_funcs = hashtable->tab_eq_funcs; - saveCurHT = CurTupleHashTable; - CurTupleHashTable = hashtable; - - /* Search the hash table */ - dummy.firstTuple = NULL; /* flag to reference inputslot */ - entry = (TupleHashEntry) hash_search(hashtable->hashtab, - &dummy, - isnew ? HASH_ENTER : HASH_FIND, - &found); + key = NULL; if (isnew) { + entry = tuplehash_insert(hashtable->hashtab, key, &found); + if (found) { /* found pre-existing entry */ @@ -385,24 +379,20 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, } else { - /* - * created new entry - * - * Zero any caller-requested space in the entry. (This zaps the - * "key data" dynahash.c copied into the new entry, but we don't - * care since we're about to overwrite it anyway.) - */ - MemSet(entry, 0, hashtable->entrysize); - - /* Copy the first tuple into the table context */ - MemoryContextSwitchTo(hashtable->tablecxt); - entry->firstTuple = ExecCopySlotMinimalTuple(slot); - + /* created new entry */ *isnew = true; + /* zero caller data */ + entry->additional = NULL; + MemoryContextSwitchTo(hashtable->tablecxt); + /* Copy the first tuple into the table context */ + entry->firstTuple = ExecCopySlotMinimalTuple(slot); } } + else + { + entry = tuplehash_lookup(hashtable->hashtab, key); + } - CurTupleHashTable = saveCurHT; MemoryContextSwitchTo(oldContext); @@ -425,34 +415,19 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, { TupleHashEntry entry; MemoryContext oldContext; - TupleHashTable saveCurHT; - TupleHashEntryData dummy; + MinimalTuple key; /* Need to run the hash functions in short-lived context */ oldContext = MemoryContextSwitchTo(hashtable->tempcxt); - /* - * Set up data needed by hash and match functions - * - * We save and restore CurTupleHashTable just in case someone manages to - * invoke this code re-entrantly. - */ + /* Set up data needed by hash and match functions */ hashtable->inputslot = slot; hashtable->in_hash_funcs = hashfunctions; hashtable->cur_eq_funcs = eqfunctions; - saveCurHT = CurTupleHashTable; - CurTupleHashTable = hashtable; - /* Search the hash table */ - dummy.firstTuple = NULL; /* flag to reference inputslot */ - entry = (TupleHashEntry) hash_search(hashtable->hashtab, - &dummy, - HASH_FIND, - NULL); - - CurTupleHashTable = saveCurHT; - + key = NULL; /* flag to reference inputslot */ + entry = tuplehash_lookup(hashtable->hashtab, key); MemoryContextSwitchTo(oldContext); return entry; @@ -468,22 +443,18 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, * This convention avoids the need to materialize virtual input tuples unless * they actually need to get copied into the table. * - * CurTupleHashTable must be set before calling this, since dynahash.c - * doesn't provide any API that would let us get at the hashtable otherwise. - * * Also, the caller must select an appropriate memory context for running * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) 
 */
 static uint32
-TupleHashTableHash(const void *key, Size keysize)
+TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 {
-	MinimalTuple tuple = ((const TupleHashEntryData *) key)->firstTuple;
-	TupleTableSlot *slot;
-	TupleHashTable hashtable = CurTupleHashTable;
+	TupleHashTable hashtable = (TupleHashTable) tb->private;
 	int			numCols = hashtable->numCols;
 	AttrNumber *keyColIdx = hashtable->keyColIdx;
-	FmgrInfo   *hashfunctions;
 	uint32		hashkey = 0;
+	TupleTableSlot *slot;
+	FmgrInfo   *hashfunctions;
 	int			i;
 
 	if (tuple == NULL)
@@ -495,7 +466,6 @@ TupleHashTableHash(const void *key, Size keysize)
 	else
 	{
 		/* Process a tuple already stored in the table */
-		/* (this case never actually occurs in current dynahash.c code) */
 		slot = hashtable->tableslot;
 		ExecStoreMinimalTuple(tuple, slot, false);
 		hashfunctions = hashtable->tab_hash_funcs;
@@ -530,30 +500,23 @@
  *
  * As above, the passed pointers are pointers to TupleHashEntryData.
  *
- * CurTupleHashTable must be set before calling this, since dynahash.c
- * doesn't provide any API that would let us get at the hashtable otherwise.
- *
  * Also, the caller must select an appropriate memory context for running
 * the compare functions. (dynahash.c doesn't change CurrentMemoryContext.)
 */
 static int
-TupleHashTableMatch(const void *key1, const void *key2, Size keysize)
+TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
 {
-	MinimalTuple tuple1 = ((const TupleHashEntryData *) key1)->firstTuple;
-
-#ifdef USE_ASSERT_CHECKING
-	MinimalTuple tuple2 = ((const TupleHashEntryData *) key2)->firstTuple;
-#endif
 	TupleTableSlot *slot1;
 	TupleTableSlot *slot2;
-	TupleHashTable hashtable = CurTupleHashTable;
+	TupleHashTable hashtable = (TupleHashTable) tb->private;
 
 	/*
-	 * We assume that dynahash.c will only ever call us with the first
+	 * We assume that simplehash.h will only ever call us with the first
 	 * argument being an actual table entry, and the second argument being
 	 * LookupTupleHashEntry's dummy TupleHashEntryData.  The other direction
-	 * could be supported too, but is not currently used by dynahash.c.
+	 * could be supported too, but is not currently required.
 	 */
+	Assert(tuple1 != NULL);
 	slot1 = hashtable->tableslot;
 	ExecStoreMinimalTuple(tuple1, slot1, false);
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index ce2fc28..951f0a7 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -434,20 +434,6 @@ typedef struct AggStatePerPhaseData
 	Sort	   *sortnode;		/* Sort node for input ordering for phase */
 }	AggStatePerPhaseData;
 
-/*
- * To implement hashed aggregation, we need a hashtable that stores a
- * representative tuple and an array of AggStatePerGroup structs for each
- * distinct set of GROUP BY column values.  We compute the hash key from
- * the GROUP BY columns.
- */
-typedef struct AggHashEntryData *AggHashEntry;
-
-typedef struct AggHashEntryData
-{
-	TupleHashEntryData shared;	/* common header for hash table entries */
-	/* per-aggregate transition status array */
-	AggStatePerGroupData pergroup[FLEXIBLE_ARRAY_MEMBER];
-}	AggHashEntryData;
 
 static void initialize_phase(AggState *aggstate, int newphase);
 static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
@@ -487,7 +473,7 @@ static TupleTableSlot *project_aggregates(AggState *aggstate);
 static Bitmapset *find_unaggregated_cols(AggState *aggstate);
 static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos);
 static void build_hash_table(AggState *aggstate);
-static AggHashEntry lookup_hash_entry(AggState *aggstate,
+static TupleHashEntryData *lookup_hash_entry(AggState *aggstate,
 				  TupleTableSlot *inputslot);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
@@ -1653,20 +1639,19 @@ build_hash_table(AggState *aggstate)
 {
 	Agg		   *node = (Agg *) aggstate->ss.ps.plan;
 	MemoryContext tmpmem = aggstate->tmpcontext->ecxt_per_tuple_memory;
-	Size		entrysize;
+	Size		additionalsize;
 
 	Assert(node->aggstrategy == AGG_HASHED);
 	Assert(node->numGroups > 0);
 
-	entrysize = offsetof(AggHashEntryData, pergroup) +
-		aggstate->numaggs * sizeof(AggStatePerGroupData);
+	additionalsize = aggstate->numaggs * sizeof(AggStatePerGroupData);
 
 	aggstate->hashtable = BuildTupleHashTable(node->numCols,
 											  node->grpColIdx,
 											  aggstate->phase->eqfunctions,
 											  aggstate->hashfunctions,
 											  node->numGroups,
-											  entrysize,
+											  additionalsize,
 							 aggstate->aggcontexts[0]->ecxt_per_tuple_memory,
 											  tmpmem);
 }
@@ -1723,6 +1708,8 @@ find_hash_columns(AggState *aggstate)
 *
 * Note that the estimate does not include space for pass-by-reference
 * transition data values, nor for the representative tuple of each group.
+ * Nor does this account for the target fill-factor and growth policy of the
+ * hash table.
 */
 Size
 hash_agg_entry_size(int numAggs)
 {
 	Size		entrysize;
 
 	/* This must match build_hash_table */
-	entrysize = offsetof(AggHashEntryData, pergroup) +
+	entrysize = sizeof(TupleHashEntryData) +
 		numAggs * sizeof(AggStatePerGroupData);
 	entrysize = MAXALIGN(entrysize);
-	/* Account for hashtable overhead (assuming fill factor = 1) */
-	entrysize += 3 * sizeof(void *);
+
 	return entrysize;
 }
 
@@ -1744,18 +1730,21 @@ hash_agg_entry_size(int numAggs)
 *
 * When called, CurrentMemoryContext should be the per-query context.
*/ -static AggHashEntry +static TupleHashEntryData * lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot) { TupleTableSlot *hashslot = aggstate->hashslot; ListCell *l; - AggHashEntry entry; + TupleHashEntryData *entry; bool isnew; /* if first time through, initialize hashslot by cloning input slot */ if (hashslot->tts_tupleDescriptor == NULL) { - ExecSetSlotDescriptor(hashslot, inputslot->tts_tupleDescriptor); + MemoryContext oldContext = MemoryContextSwitchTo(hashslot->tts_mcxt); + /* get rid of constraints */ + ExecSetSlotDescriptor(hashslot, CreateTupleDescCopy(inputslot->tts_tupleDescriptor)); + MemoryContextSwitchTo(oldContext); /* Make sure all unused columns are NULLs */ ExecStoreAllNullTuple(hashslot); } @@ -1771,14 +1760,14 @@ lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot) } /* find or create the hashtable entry using the filtered tuple */ - entry = (AggHashEntry) LookupTupleHashEntry(aggstate->hashtable, - hashslot, - &isnew); + entry = LookupTupleHashEntry(aggstate->hashtable, hashslot, &isnew); if (isnew) { + entry->additional = MemoryContextAlloc(aggstate->hashtable->tablecxt, + sizeof(AggStatePerGroupData) * aggstate->numtrans); /* initialize aggregates for new tuple group */ - initialize_aggregates(aggstate, entry->pergroup, 0); + initialize_aggregates(aggstate, entry->additional, 0); } return entry; @@ -2176,7 +2165,7 @@ static void agg_fill_hash_table(AggState *aggstate) { ExprContext *tmpcontext; - AggHashEntry entry; + TupleHashEntryData *entry; TupleTableSlot *outerslot; /* @@ -2203,9 +2192,9 @@ agg_fill_hash_table(AggState *aggstate) /* Advance the aggregates */ if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit)) - combine_aggregates(aggstate, entry->pergroup); + combine_aggregates(aggstate, entry->additional); else - advance_aggregates(aggstate, entry->pergroup); + advance_aggregates(aggstate, entry->additional); /* Reset per-input-tuple context after each tuple */ ResetExprContext(tmpcontext); @@ -2225,7 +2214,7 @@ agg_retrieve_hash_table(AggState *aggstate) ExprContext *econtext; AggStatePerAgg peragg; AggStatePerGroup pergroup; - AggHashEntry entry; + TupleHashEntryData *entry; TupleTableSlot *firstSlot; TupleTableSlot *result; @@ -2246,7 +2235,7 @@ agg_retrieve_hash_table(AggState *aggstate) /* * Find the next entry in the hash table */ - entry = (AggHashEntry) ScanTupleHashTable(&aggstate->hashiter); + entry = ScanTupleHashTable(aggstate->hashtable, &aggstate->hashiter); if (entry == NULL) { /* No more entries in hashtable, so done */ @@ -2267,11 +2256,11 @@ agg_retrieve_hash_table(AggState *aggstate) * Store the copied first input tuple in the tuple table slot reserved * for it, so that it can be used in ExecProject. */ - ExecStoreMinimalTuple(entry->shared.firstTuple, + ExecStoreMinimalTuple(entry->firstTuple, firstSlot, false); - pergroup = entry->pergroup; + pergroup = entry->additional; finalize_aggregates(aggstate, peragg, pergroup, 0); diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c index 39be191..6e7d9d2 100644 --- a/src/backend/executor/nodeRecursiveunion.c +++ b/src/backend/executor/nodeRecursiveunion.c @@ -20,17 +20,6 @@ #include "utils/memutils.h" -/* - * To implement UNION (without ALL), we need a hashtable that stores tuples - * already seen. The hash key is computed from the grouping columns. 
- */ -typedef struct RUHashEntryData *RUHashEntry; - -typedef struct RUHashEntryData -{ - TupleHashEntryData shared; /* common header for hash table entries */ -} RUHashEntryData; - /* * Initialize the hash table to empty. @@ -48,7 +37,7 @@ build_hash_table(RecursiveUnionState *rustate) rustate->eqfunctions, rustate->hashfunctions, node->numGroups, - sizeof(RUHashEntryData), + 0, rustate->tableContext, rustate->tempContext); } diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index 633580b..37d0477 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -66,19 +66,6 @@ typedef struct SetOpStatePerGroupData long numRight; /* number of right-input dups in group */ } SetOpStatePerGroupData; -/* - * To implement hashed mode, we need a hashtable that stores a - * representative tuple and the duplicate counts for each distinct set - * of grouping columns. We compute the hash key from the grouping columns. - */ -typedef struct SetOpHashEntryData *SetOpHashEntry; - -typedef struct SetOpHashEntryData -{ - TupleHashEntryData shared; /* common header for hash table entries */ - SetOpStatePerGroupData pergroup; -} SetOpHashEntryData; - static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate); static void setop_fill_hash_table(SetOpState *setopstate); @@ -141,7 +128,7 @@ build_hash_table(SetOpState *setopstate) setopstate->eqfunctions, setopstate->hashfunctions, node->numGroups, - sizeof(SetOpHashEntryData), + 0, setopstate->tableContext, setopstate->tempContext); } @@ -367,7 +354,7 @@ setop_fill_hash_table(SetOpState *setopstate) { TupleTableSlot *outerslot; int flag; - SetOpHashEntry entry; + TupleHashEntryData *entry; bool isnew; outerslot = ExecProcNode(outerPlan); @@ -383,15 +370,19 @@ setop_fill_hash_table(SetOpState *setopstate) Assert(in_first_rel); /* Find or build hashtable entry for this tuple's group */ - entry = (SetOpHashEntry) - LookupTupleHashEntry(setopstate->hashtable, outerslot, &isnew); + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + &isnew); /* If new tuple group, initialize counts */ if (isnew) - initialize_counts(&entry->pergroup); + { + entry->additional = MemoryContextAlloc(setopstate->hashtable->tablecxt, + sizeof(SetOpStatePerGroupData)); + initialize_counts(entry->additional); + } /* Advance the counts */ - advance_counts(&entry->pergroup, flag); + advance_counts(entry->additional, flag); } else { @@ -399,12 +390,12 @@ setop_fill_hash_table(SetOpState *setopstate) in_first_rel = false; /* For tuples not seen previously, do not make hashtable entry */ - entry = (SetOpHashEntry) - LookupTupleHashEntry(setopstate->hashtable, outerslot, NULL); + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + NULL); /* Advance the counts if entry is already present */ if (entry) - advance_counts(&entry->pergroup, flag); + advance_counts(entry->additional, flag); } /* Must reset temp context after each hashtable lookup */ @@ -422,7 +413,7 @@ setop_fill_hash_table(SetOpState *setopstate) static TupleTableSlot * setop_retrieve_hash_table(SetOpState *setopstate) { - SetOpHashEntry entry; + TupleHashEntryData *entry; TupleTableSlot *resultTupleSlot; /* @@ -438,7 +429,7 @@ setop_retrieve_hash_table(SetOpState *setopstate) /* * Find the next entry in the hash table */ - entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter); + entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter); if (entry == NULL) { /* No more entries in hashtable, so done */ @@ -450,12 +441,12 
@@ setop_retrieve_hash_table(SetOpState *setopstate)
 	 * See if we should emit any copies of this tuple, and if so return
 	 * the first copy.
 	 */
-	set_output_count(setopstate, &entry->pergroup);
+	set_output_count(setopstate, entry->additional);
 
 	if (setopstate->numOutput > 0)
 	{
 		setopstate->numOutput--;
-		return ExecStoreMinimalTuple(entry->shared.firstTuple,
+		return ExecStoreMinimalTuple(entry->firstTuple,
 									 resultTupleSlot,
 									 false);
 	}
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 2cf169f..8ca8fc4 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -508,7 +508,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
 										  node->tab_eq_funcs,
 										  node->tab_hash_funcs,
 										  nbuckets,
-										  sizeof(TupleHashEntryData),
+										  0,
 										  node->hashtablecxt,
 										  node->hashtempcxt);
 
@@ -527,7 +527,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
 											  node->tab_eq_funcs,
 											  node->tab_hash_funcs,
 											  nbuckets,
-											  sizeof(TupleHashEntryData),
+											  0,
 											  node->hashtablecxt,
 											  node->hashtempcxt);
 }
@@ -626,7 +626,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
 	TupleHashEntry entry;
 
 	InitTupleHashIterator(hashtable, &hashiter);
-	while ((entry = ScanTupleHashTable(&hashiter)) != NULL)
+	while ((entry = ScanTupleHashTable(hashtable, &hashiter)) != NULL)
 	{
 		ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
 		if (!execTuplesUnequal(slot, hashtable->tableslot,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f657ffc..644b8b6 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3292,6 +3292,12 @@ estimate_hashagg_tablesize(Path *path, const AggClauseCosts *agg_costs,
 	/* plus the per-hash-entry overhead */
 	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);
 
+	/*
+	 * Note that this disregards the effect of fill-factor and growth policy
+	 * of the hash-table. That's probably ok, given that the default
+	 * fill-factor is relatively high. It'd be hard to meaningfully factor in
+	 * "double-in-size" growth policies here.
+	 */
 	return hashentrysize * dNumGroups;
 }
 
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 39521ed..136276b 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -140,7 +140,7 @@ extern void execTuplesHashPrepare(int numCols,
 extern TupleHashTable BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 					FmgrInfo *eqfunctions,
 					FmgrInfo *hashfunctions,
-					long nbuckets, Size entrysize,
+					long nbuckets, Size additionalsize,
 					MemoryContext tablecxt,
 					MemoryContext tempcxt);
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 4fa3661..5c09b31 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -499,14 +499,28 @@ typedef struct TupleHashTableData *TupleHashTable;
 
 typedef struct TupleHashEntryData
 {
-	/* firstTuple must be the first field in this struct! */
 	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	/* there may be additional data beyond the end of this struct */
-} TupleHashEntryData;			/* VARIABLE LENGTH STRUCT */
+	void	   *additional;		/* user data */
+	uint32		status;			/* hash status */
+	uint32		hash;			/* hash value (cached) */
+} TupleHashEntryData;
+
+/*
+ * Define parameters necessary to generate the tuple hash table interface
+ * functions.
+ */ +#define SH_PREFIX tuplehash +#define SH_ELEMENT_TYPE TupleHashEntryData +#define SH_KEY_TYPE MinimalTuple +#define SH_SCOPE extern +#define SH_DECLARE +#include "lib/simplehash.h" + +typedef tuplehash_iterator TupleHashIterator; typedef struct TupleHashTableData { - HTAB *hashtab; /* underlying dynahash table */ + struct tuplehash_hash *hashtab; /* underlying hash table */ int numCols; /* number of columns in lookup key */ AttrNumber *keyColIdx; /* attr numbers of key columns */ FmgrInfo *tab_hash_funcs; /* hash functions for table datatype(s) */ @@ -521,24 +535,19 @@ typedef struct TupleHashTableData FmgrInfo *cur_eq_funcs; /* equality functions for input vs. table */ } TupleHashTableData; -typedef HASH_SEQ_STATUS TupleHashIterator; - /* * Use InitTupleHashIterator/TermTupleHashIterator for a read/write scan. * Use ResetTupleHashIterator if the table can be frozen (in this case no * explicit scan termination is needed). */ #define InitTupleHashIterator(htable, iter) \ - hash_seq_init(iter, (htable)->hashtab) + tuplehash_start_iterate(htable->hashtab, iter) #define TermTupleHashIterator(iter) \ - hash_seq_term(iter) + (void)0 #define ResetTupleHashIterator(htable, iter) \ - do { \ - hash_freeze((htable)->hashtab); \ - hash_seq_init(iter, (htable)->hashtab); \ - } while (0) -#define ScanTupleHashTable(iter) \ - ((TupleHashEntry) hash_seq_search(iter)) + InitTupleHashIterator(htable, iter) +#define ScanTupleHashTable(htable, iter) \ + tuplehash_iterate(htable->hashtab, iter) /* ---------------------------------------------------------------- -- 2.9.3
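
For readers who haven't looked at simplehash.h yet: SH_DECLARE (used in execnodes.h above, with SH_SCOPE extern) generates the externally visible types and prototypes, while SH_DEFINE (in execGrouping.c) generates the function bodies. As a self-contained illustration, a minimal single-file instantiation looks roughly like the following; the int4hash prefix, the entry layout, and the mixer function are made up for the example, and only the generated-API shape actually used in the patches above (prefix_create/_insert/_lookup) is assumed:

#include "postgres.h"

typedef struct Int4Entry
{
	int32		key;			/* hash key */
	int64		count;			/* caller's payload */
	char		status;			/* entry state, required by simplehash.h */
} Int4Entry;

static inline uint32
int4hash_mix(int32 k)
{
	uint32		h = (uint32) k;

	/* any reasonable integer mixer will do for an example */
	h ^= h >> 16;
	h *= 0x85ebca6b;
	h ^= h >> 13;
	return h;
}

#define SH_PREFIX int4hash
#define SH_ELEMENT_TYPE Int4Entry
#define SH_KEY_TYPE int32
#define SH_KEY key
#define SH_HASH_KEY(tb, key) int4hash_mix(key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DECLARE
#define SH_DEFINE
#include "lib/simplehash.h"

static void
count_value(struct int4hash_hash *tb, int32 value)
{
	bool		found;
	Int4Entry  *entry = int4hash_insert(tb, value, &found);

	if (!found)
		entry->count = 0;		/* fresh entry: initialize our payload */
	entry->count++;
}

As in LookupTupleHashEntry above, the caller initializes its own payload when *found comes back false; the generated code only fills in the key and its status byte (plus the cached hash when SH_STORE_HASH is defined, as tuplehash does).
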