On Wed, Jan 9, 2019 at 10:50 PM Amit Kapila <amit.kapil...@gmail.com> wrote:
> Thanks, Mithun for performance testing, it really helps us to choose
> the right strategy here.  Once John provides next version, it would be
> good to see the results of regular pgbench (read-write) runs (say at
> 50 and 300 scale factor) and the results of large copy.  I don't think
> there will be any problem, but we should just double check that.

Attached is v12 using the alternating-page strategy. I've updated the
comments and README as needed. In addition, I've

- handled a possible stat() call failure during pg_upgrade
- added one more assertion
- moved the new README material into a separate paragraph
- added a comment to FSMClearLocalMap() about transaction abort
- corrected an outdated comment that erroneously referred to extension
  rather than creation
- fleshed out the draft commit messages
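
To make the search order concrete, here is a minimal standalone sketch of
the alternating-page bookkeeping (simplified from fsm_local_set() and
fsm_local_search() in the attached patch: it ignores the incremental-update
and cached-target-block handling, and any names not in the patch are mine):

#include <stdio.h>

#define HEAP_FSM_CREATION_THRESHOLD 4

static char local_map[HEAP_FSM_CREATION_THRESHOLD];	/* 1 = available to try */
static unsigned map_nblocks;

/* Mark every other block, counting back from the last block in the heap. */
static void
local_set(unsigned curr_nblocks)
{
	unsigned	blkno = curr_nblocks - 1;

	for (;;)
	{
		local_map[blkno] = 1;
		if (blkno >= 2)
			blkno -= 2;
		else
			break;
	}
	map_nblocks = curr_nblocks;
}

/* Return the highest block still flagged, or -1 to signal extension. */
static int
local_search(void)
{
	for (int blkno = (int) map_nblocks - 1; blkno >= 0; blkno--)
		if (local_map[blkno])
			return blkno;
	return -1;
}

int
main(void)
{
	int			blkno;

	local_set(4);				/* 4-page heap: blocks 3 and 1 get flagged */
	while ((blkno = local_search()) >= 0)
	{
		printf("try block %d\n", blkno);	/* prints 3, then 1 */
		local_map[blkno] = 0;	/* pretend the page had no room */
	}
	printf("extend the relation\n");
	return 0;
}

For a 4-page heap this probes blocks 3 and 1 before extending, which is
where the "we visit at most two pages" claim in the commit message comes
from.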
From 854ba27740ed26e0d3e89f6228186f2b0914b21e Mon Sep 17 00:00:00 2001
From: John Naylor <jcnay...@gmail.com>
Date: Thu, 10 Jan 2019 16:37:57 -0500
Subject: [PATCH v12 1/2] Avoid creation of the free space map for small heap
 relations.

Previously, all heaps had FSMs. For very small tables, this means that
the FSM took up more space than the heap did. This is wasteful, so now
we refrain from creating the FSM for heaps with 4 pages or fewer. If the
last known target block has insufficient space, we still try to insert
into some other page before giving up and extending the relation,
since doing otherwise leads to table bloat. Testing showed that trying
every page penalized performance slightly, so we compromise and try
every other page. This way, we visit at most two pages. Any pages with
wasted free space become visible at next relation extension, so we still
control table bloat. As a bonus, directly attempting one or two pages
can even be faster than consulting the FSM would have been.

If a heap with a FSM is truncated back to below the threshold, the FSM
stays around and can be used as usual.
---
 contrib/pageinspect/expected/page.out     |  77 +++----
 contrib/pageinspect/sql/page.sql          |  33 +--
 doc/src/sgml/storage.sgml                 |  13 +-
 src/backend/access/brin/brin.c            |   2 +-
 src/backend/access/brin/brin_pageops.c    |  10 +-
 src/backend/access/heap/hio.c             |  47 ++--
 src/backend/access/transam/xact.c         |  14 ++
 src/backend/commands/vacuumlazy.c         |  17 +-
 src/backend/storage/freespace/README      |  13 +-
 src/backend/storage/freespace/freespace.c | 262 +++++++++++++++++++++-
 src/backend/storage/freespace/indexfsm.c  |   6 +-
 src/include/storage/freespace.h           |   9 +-
 src/test/regress/expected/fsm.out         |  62 +++++
 src/test/regress/parallel_schedule        |   6 +
 src/test/regress/serial_schedule          |   1 +
 src/test/regress/sql/fsm.sql              |  46 ++++
 16 files changed, 516 insertions(+), 102 deletions(-)
 create mode 100644 src/test/regress/expected/fsm.out
 create mode 100644 src/test/regress/sql/fsm.sql

diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out
index 3fcd9fbe6d..83e5910453 100644
--- a/contrib/pageinspect/expected/page.out
+++ b/contrib/pageinspect/expected/page.out
@@ -1,48 +1,69 @@
 CREATE EXTENSION pageinspect;
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
-VACUUM test1;  -- set up FSM
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT g FROM generate_series(1,10000) g;
+-- set up FSM and VM
+VACUUM test_rel_forks;
 -- The page contents can vary, so just test that it can be read
 -- successfully, but don't keep the output.
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
  main_0 
 --------
    8192
 (1 row)
 
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
-ERROR:  block number 1 is out of range for relation "test1"
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
+ERROR:  block number 100 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
  fsm_0 
 -------
   8192
 (1 row)
 
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
- fsm_1 
--------
-  8192
-(1 row)
-
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
+ERROR:  block number 10 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
  vm_0 
 ------
  8192
 (1 row)
 
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
-ERROR:  block number 1 is out of range for relation "test1"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
+ERROR:  block number 1 is out of range for relation "test_rel_forks"
 SELECT octet_length(get_raw_page('xxx', 'main', 0));
 ERROR:  relation "xxx" does not exist
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
 ERROR:  invalid fork name
 HINT:  Valid fork names are "main", "fsm", "vm", and "init".
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+ fsm_page_contents 
+-------------------
+ 0: 192           +
+ 1: 192           +
+ 3: 192           +
+ 7: 192           +
+ 15: 192          +
+ 31: 192          +
+ 63: 192          +
+ 127: 192         +
+ 255: 192         +
+ 511: 192         +
+ 1023: 192        +
+ 2047: 192        +
+ 4095: 192        +
+ fp_next_slot: 0  +
+ 
+(1 row)
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
  ?column? 
 ----------
  t
 (1 row)
 
+DROP TABLE test_rel_forks;
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
 SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
  pagesize | version 
 ----------+---------
@@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi
  {"\\x01000001","\\x00020200"}
 (1 row)
 
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
- fsm_page_contents 
--------------------
- 0: 254           +
- 1: 254           +
- 3: 254           +
- 7: 254           +
- 15: 254          +
- 31: 254          +
- 63: 254          +
- 127: 254         +
- 255: 254         +
- 511: 254         +
- 1023: 254        +
- 2047: 254        +
- 4095: 254        +
- fp_next_slot: 0  +
- 
-(1 row)
-
 DROP TABLE test1;
 -- check that using any of these functions with a partitioned table or index
 -- would fail
diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql
index 8ac9991837..ee811759d5 100644
--- a/contrib/pageinspect/sql/page.sql
+++ b/contrib/pageinspect/sql/page.sql
@@ -1,26 +1,35 @@
 CREATE EXTENSION pageinspect;
 
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT g FROM generate_series(1,10000) g;
 
-VACUUM test1;  -- set up FSM
+-- set up FSM and VM
+VACUUM test_rel_forks;
 
 -- The page contents can vary, so just test that it can be read
 -- successfully, but don't keep the output.
 
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
 
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
 
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
 
 SELECT octet_length(get_raw_page('xxx', 'main', 0));
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
+
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
 
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+DROP TABLE test_rel_forks;
+
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
 
 SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
 
@@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_
 SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
     FROM heap_page_items(get_raw_page('test1', 0));
 
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
-
 DROP TABLE test1;
 
 -- check that using any of these functions with a partitioned table or index
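
An aside on reading the fsm_page_contents output above: each FSM slot
stores a one-byte category rather than an exact byte count.  Assuming I
have the mapping right (FSM_CAT_STEP is BLCKSZ / 256 in freespace.c, with
the top category special-cased), a quick sketch of what the categories in
the old and new expected output represent:

#include <stdio.h>

#define BLCKSZ			8192
#define FSM_CATEGORIES	256
#define FSM_CAT_STEP	(BLCKSZ / FSM_CATEGORIES)	/* 32 bytes per step */

int
main(void)
{
	/* Categories seen in the new and old expected output, respectively. */
	int			cats[] = {192, 254};

	for (int i = 0; i < 2; i++)
		printf("category %d => roughly %d bytes free\n",
			   cats[i], cats[i] * FSM_CAT_STEP);
	return 0;
}

So the 192 in the new output corresponds to roughly 6144 bytes free
(presumably the last, partially filled page), versus 254 = 8128 bytes for
the old nearly-empty single-row table.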
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 8ef2ac8010..cbdad0c3fb 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -590,12 +590,13 @@ tuple would otherwise be too big.
 <indexterm><primary>FSM</primary><see>Free Space Map</see></indexterm>
 
 <para>
-Each heap and index relation, except for hash indexes, has a Free Space Map
-(FSM) to keep track of available space in the relation. It's stored
-alongside the main relation data in a separate relation fork, named after the
-filenode number of the relation, plus a <literal>_fsm</literal> suffix. For example,
-if the filenode of a relation is 12345, the FSM is stored in a file called
-<filename>12345_fsm</filename>, in the same directory as the main relation file.
+Each heap relation, unless it is very small, and each index relation, except
+for hash indexes, has a Free Space Map (FSM) to keep track of available
+space in the relation. It's stored alongside the main relation data in a
+separate relation fork, named after the filenode number of the relation, plus
+a <literal>_fsm</literal> suffix. For example, if the filenode of a relation
+is 12345, the FSM is stored in a file called <filename>12345_fsm</filename>,
+in the same directory as the main relation file.
 </para>
 
 <para>
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index e95fbbcea7..7c5b1af764 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1148,7 +1148,7 @@ terminate_brin_buildstate(BrinBuildState *state)
 		freespace = PageGetFreeSpace(page);
 		blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
 		ReleaseBuffer(state->bs_currentInsertBuf);
-		RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+		RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber);
 		FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
 	}
 
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index 040cb62e55..cd10458ddc 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -310,7 +310,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 
 		if (extended)
 		{
-			RecordPageWithFreeSpace(idxrel, newblk, freespace);
+			RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber);
 			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
 		}
 
@@ -461,7 +461,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 
 	if (extended)
 	{
-		RecordPageWithFreeSpace(idxrel, blk, freespace);
+		RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber);
 		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
 	}
 
@@ -654,7 +654,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
 
 	/* Measure free space and record it */
 	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
-							br_page_get_freespace(page));
+							br_page_get_freespace(page), InvalidBlockNumber);
 }
 
 /*
@@ -703,7 +703,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 	/* Choose initial target page, re-using existing target if known */
 	newblk = RelationGetTargetBlock(irel);
 	if (newblk == InvalidBlockNumber)
-		newblk = GetPageWithFreeSpace(irel, itemsz);
+		newblk = GetPageWithFreeSpace(irel, itemsz, true);
 
 	/*
 	 * Loop until we find a page with sufficient free space.  By the time we
@@ -895,7 +895,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
 	 * pages whose FSM records were forgotten in a crash.
 	 */
 	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
-							br_page_get_freespace(page));
+							br_page_get_freespace(page), InvalidBlockNumber);
 }
 
 
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index b8b5871559..e59568e47e 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -239,8 +239,14 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
 		 * Immediately update the bottom level of the FSM.  This has a good
 		 * chance of making this page visible to other concurrently inserting
 		 * backends, and we want that to happen without delay.
+		 *
+		 * Since we know the table will end up with extraBlocks additional
+		 * pages, we pass the final number to avoid possible unnecessary
+		 * system calls and to make sure the FSM is created when we add
+		 * the first new page.
 		 */
-		RecordPageWithFreeSpace(relation, blockNum, freespace);
+		RecordPageWithFreeSpace(relation, blockNum, freespace,
+								firstBlock + extraBlocks);
 	}
 	while (--extraBlocks > 0);
 
@@ -377,20 +383,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
 		 * We have no cached target page, so ask the FSM for an initial
 		 * target.
 		 */
-		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
-		/*
-		 * If the FSM knows nothing of the rel, try the last page before we
-		 * give up and extend.  This avoids one-tuple-per-page syndrome during
-		 * bootstrapping or in a recently-started system.
-		 */
-		if (targetBlock == InvalidBlockNumber)
-		{
-			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
-
-			if (nblocks > 0)
-				targetBlock = nblocks - 1;
-		}
+		targetBlock = GetPageWithFreeSpace(relation,
+										   len + saveFreeSpace,
+										   false);
 	}
 
 loop:
@@ -484,6 +479,14 @@ loop:
 		{
 			/* use this page as future insert target, too */
 			RelationSetTargetBlock(relation, targetBlock);
+
+			/*
+			 * In case we used an in-memory map of available blocks, reset
+			 * it for next use.
+			 */
+			if (targetBlock < HEAP_FSM_CREATION_THRESHOLD)
+				FSMClearLocalMap();
+
 			return buffer;
 		}
 
@@ -543,9 +546,12 @@ loop:
 
 			/*
 			 * Check if some other backend has extended a block for us while
-			 * we were waiting on the lock.
+			 * we were waiting on the lock.  We only check the FSM -- if there
+			 * isn't one we don't recheck the number of blocks.
 			 */
-			targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+			targetBlock = GetPageWithFreeSpace(relation,
+											   len + saveFreeSpace,
+											   true);
 
 			/*
 			 * If some other waiter has already extended the relation, we
@@ -625,5 +631,12 @@ loop:
 	 */
 	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
 
+	/*
+	 * In case we used an in-memory map of available blocks, reset
+	 * it for next use.  We do this unconditionally since after relation
+	 * extension we can't skip this based on the targetBlock.
+	 */
+	FSMClearLocalMap();
+
 	return buffer;
 }
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index d967400384..fa0620b301 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -48,6 +48,7 @@
 #include "replication/walsender.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
+#include "storage/freespace.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -2488,6 +2489,12 @@ AbortTransaction(void)
 	pgstat_report_wait_end();
 	pgstat_progress_end_command();
 
+	/*
+	 * In case we aborted during RelationGetBufferForTuple(),
+	 * clear the local map of heap pages.
+	 */
+	FSMClearLocalMap();
+
 	/* Clean up buffer I/O and buffer context locks, too */
 	AbortBufferIO();
 	UnlockBuffers();
@@ -4709,6 +4716,13 @@ AbortSubTransaction(void)
 
 	pgstat_report_wait_end();
 	pgstat_progress_end_command();
+
+	/*
+	 * In case we aborted during RelationGetBufferForTuple(),
+	 * clear the local map of heap pages.
+	 */
+	FSMClearLocalMap();
+
 	AbortBufferIO();
 	UnlockBuffers();
 
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 8134c52253..d2818dfad4 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -154,7 +154,7 @@ static BufferAccessStrategy vac_strategy;
 static void lazy_scan_heap(Relation onerel, int options,
 			   LVRelStats *vacrelstats, Relation *Irel, int nindexes,
 			   bool aggressive);
-static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
+static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks);
 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
 static void lazy_vacuum_index(Relation indrel,
 				  IndexBulkDeleteResult **stats,
@@ -759,7 +759,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 			pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
 
 			/* Remove tuples from heap */
-			lazy_vacuum_heap(onerel, vacrelstats);
+			lazy_vacuum_heap(onerel, vacrelstats, nblocks);
 
 			/*
 			 * Forget the now-vacuumed tuples, and press on, but be careful
@@ -897,7 +897,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 			MarkBufferDirty(buf);
 			UnlockReleaseBuffer(buf);
 
-			RecordPageWithFreeSpace(onerel, blkno, freespace);
+			RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
 			continue;
 		}
 
@@ -936,7 +936,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 			}
 
 			UnlockReleaseBuffer(buf);
-			RecordPageWithFreeSpace(onerel, blkno, freespace);
+			RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
 			continue;
 		}
 
@@ -1333,7 +1333,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 		 * taken if there are no indexes.)
 		 */
 		if (vacrelstats->num_dead_tuples == prev_dead_count)
-			RecordPageWithFreeSpace(onerel, blkno, freespace);
+			RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
 	}
 
 	/* report that everything is scanned and vacuumed */
@@ -1395,7 +1395,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 		/* Remove tuples from heap */
 		pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
 									 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
-		lazy_vacuum_heap(onerel, vacrelstats);
+		lazy_vacuum_heap(onerel, vacrelstats, nblocks);
 		vacrelstats->num_index_scans++;
 	}
 
@@ -1466,9 +1466,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
  * Note: the reason for doing this as a second pass is we cannot remove
  * the tuples until we've removed their index entries, and we want to
  * process index entry removal in batches as large as possible.
+ * Note: nblocks is passed as an optimization for RecordPageWithFreeSpace().
  */
 static void
-lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
+lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks)
 {
 	int			tupindex;
 	int			npages;
@@ -1505,7 +1506,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 		freespace = PageGetHeapFreeSpace(page);
 
 		UnlockReleaseBuffer(buf);
-		RecordPageWithFreeSpace(onerel, tblk, freespace);
+		RecordPageWithFreeSpace(onerel, tblk, freespace, nblocks);
 		npages++;
 	}
 
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b76f..77e4992728 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -8,7 +8,16 @@ free space to hold a tuple to be stored; or to determine that no such page
 exists and the relation must be extended by one page.  As of PostgreSQL 8.4
 each relation has its own, extensible free space map stored in a separate
 "fork" of its relation.  This eliminates the disadvantages of the former
-fixed-size FSM.
+fixed-size FSM.  There are two exceptions:
+
+1. Hash indexes never have a FSM.
+2. For very small heap relations, the FSM would be relatively large and
+wasteful, so as of PostgreSQL 12 we refrain from creating the FSM for
+heaps with HEAP_FSM_CREATION_THRESHOLD pages or fewer, both to save space
+and to improve performance.  To locate free space in this case, we simply
+iterate over the heap, trying every other page in turn.  There may be some
+wasted free space on skipped pages, but it becomes visible again upon the
+next relation extension.
 
 It is important to keep the map small so that it can be searched rapidly.
 Therefore, we don't attempt to record the exact free space on a page.
@@ -192,5 +201,3 @@ TODO
 ----
 
 - fastroot to avoid traversing upper nodes with just 1 child
-- use a different system for tables that fit into one FSM page, with a
-  mechanism to switch to the real thing as it grows.
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 7c4ad1c449..8edb2841ae 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -76,6 +76,14 @@
 #define FSM_ROOT_LEVEL	(FSM_TREE_DEPTH - 1)
 #define FSM_BOTTOM_LEVEL 0
 
+/* Status codes for the local map. */
+
+/* Either already tried, or beyond the end of the relation */
+#define FSM_LOCAL_NOT_AVAIL 0x00
+
+/* Available to try */
+#define FSM_LOCAL_AVAIL		0x01
+
 /*
  * The internal FSM routines work on a logical addressing scheme. Each
  * level of the tree can be thought of as a separately addressable file.
@@ -89,6 +97,15 @@ typedef struct
 /* Address of the root page. */
 static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
 
+/* Local map of block numbers for small heaps with no FSM. */
+typedef struct
+{
+	BlockNumber nblocks;
+	char		map[HEAP_FSM_CREATION_THRESHOLD];
+} FSMLocalMap;
+
+static FSMLocalMap fsm_local_map = {0, {FSM_LOCAL_NOT_AVAIL}};
+
 /* functions to navigate the tree */
 static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
 static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
@@ -107,10 +124,14 @@ static Size fsm_space_cat_to_avail(uint8 cat);
 /* workhorse functions for various operations */
 static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
 				   uint8 newValue, uint8 minValue);
+static void fsm_local_set(Relation rel, BlockNumber curr_nblocks);
 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static BlockNumber fsm_local_search(void);
 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
 				BlockNumber start, BlockNumber end,
 				bool *eof);
+static bool fsm_allow_writes(Relation rel, BlockNumber heapblk,
+				BlockNumber nblocks, BlockNumber *get_nblocks);
 
 
 /******** Public API ********/
@@ -127,13 +148,45 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
  * amount of free space available on that page and then try again (see
  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
  * extend the relation.
+ *
+ * For very small heap relations that don't have a FSM, we try every other
+ * page before extending the relation.  To keep track of which pages have
+ * been tried, we maintain a local in-memory map of pages.
  */
 BlockNumber
-GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded, bool check_fsm_only)
 {
 	uint8		min_cat = fsm_space_needed_to_cat(spaceNeeded);
+	BlockNumber target_block,
+				nblocks;
+
+	/* First try the FSM, if it exists. */
+	target_block = fsm_search(rel, min_cat);
+
+	if (target_block == InvalidBlockNumber &&
+		rel->rd_rel->relkind == RELKIND_RELATION &&
+		!check_fsm_only)
+	{
+		nblocks = RelationGetNumberOfBlocks(rel);
+
+		if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+		{
+			/*
+			 * If the FSM knows nothing of the rel, try the last page before
+			 * we give up and extend.  This avoids one-tuple-per-page syndrome
+			 * during bootstrapping or in a recently-started system.
+			 */
+			target_block = nblocks - 1;
+		}
+		else if (nblocks > 0)
+		{
+			/* Create or update local map and get first candidate block. */
+			fsm_local_set(rel, nblocks);
+			target_block = fsm_local_search();
+		}
+	}
 
-	return fsm_search(rel, min_cat);
+	return target_block;
 }
 
 /*
@@ -144,16 +197,46 @@ GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
  * also some effort to return a page close to the old page; if there's a
  * page with enough free space on the same FSM page where the old one page
  * is located, it is preferred.
+ *
+ * For very small heap relations that don't have a FSM, we update the local
+ * map to indicate we have tried a page, and return the next page to try.
  */
 BlockNumber
 RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
 							  Size oldSpaceAvail, Size spaceNeeded)
 {
-	int			old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
-	int			search_cat = fsm_space_needed_to_cat(spaceNeeded);
+	int			old_cat;
+	int			search_cat;
 	FSMAddress	addr;
 	uint16		slot;
 	int			search_slot;
+	BlockNumber nblocks = InvalidBlockNumber;
+
+	/* First try the local map, if it exists. */
+	if (oldPage < fsm_local_map.nblocks)
+	{
+		Assert(rel->rd_rel->relkind == RELKIND_RELATION &&
+			   fsm_local_map.map[oldPage] == FSM_LOCAL_AVAIL);
+
+		fsm_local_map.map[oldPage] = FSM_LOCAL_NOT_AVAIL;
+		return fsm_local_search();
+	}
+
+	if (!fsm_allow_writes(rel, oldPage, InvalidBlockNumber, &nblocks))
+	{
+		/*
+		 * If we have neither a local map nor a FSM, we probably just
+		 * tried the target block in the smgr relation entry and failed,
+		 * so we'll need to create the local map.
+		 */
+		fsm_local_set(rel, nblocks);
+		return fsm_local_search();
+	}
+
+	/* Normal FSM logic follows */
+
+	old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+	search_cat = fsm_space_needed_to_cat(spaceNeeded);
 
 	/* Get the location of the FSM byte representing the heap block */
 	addr = fsm_get_location(oldPage, &slot);
@@ -176,20 +259,40 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
  * Note that if the new spaceAvail value is higher than the old value stored
  * in the FSM, the space might not become visible to searchers until the next
  * FreeSpaceMapVacuum call, which updates the upper level pages.
+ *
+ * Callers have no need for a local map.
  */
 void
-RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+						Size spaceAvail, BlockNumber nblocks)
 {
-	int			new_cat = fsm_space_avail_to_cat(spaceAvail);
+	int			new_cat;
 	FSMAddress	addr;
 	uint16		slot;
+	BlockNumber dummy;
+
+	if (!fsm_allow_writes(rel, heapBlk, nblocks, &dummy))
+		/* No FSM to update and no local map either */
+		return;
 
 	/* Get the location of the FSM byte representing the heap block */
 	addr = fsm_get_location(heapBlk, &slot);
 
+	new_cat = fsm_space_avail_to_cat(spaceAvail);
 	fsm_set_and_search(rel, addr, slot, new_cat, 0);
 }
 
+/*
+ * Clear the local map.  We must call this when we have found a block with
+ * enough free space, when we extend the relation, or on transaction abort.
+ */
+void
+FSMClearLocalMap(void)
+{
+	fsm_local_map.nblocks = 0;
+	memset(&fsm_local_map.map, 0, sizeof(fsm_local_map.map));
+}
+
 /*
  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
  *		WAL replay
@@ -204,6 +307,30 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
 	BlockNumber blkno;
 	Buffer		buf;
 	Page		page;
+	bool		write_to_fsm;
+
+	/* This is meant to mirror the logic in fsm_allow_writes() */
+	if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD)
+		write_to_fsm = true;
+	else
+	{
+		/* Open the relation at smgr level */
+		SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+
+		if (smgrexists(smgr, FSM_FORKNUM))
+			write_to_fsm = true;
+		else
+		{
+			BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM);
+			if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+				write_to_fsm = true;
+			else
+				write_to_fsm = false;
+		}
+	}
+
+	if (!write_to_fsm)
+		return;
 
 	/* Get the location of the FSM byte representing the heap block */
 	addr = fsm_get_location(heapBlk, &slot);
@@ -904,3 +1031,126 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
 
 	return max_avail;
 }
+
+/*
+ * For heaps we prevent creation of the FSM unless the number of pages
+ * exceeds HEAP_FSM_CREATION_THRESHOLD.  For tables that don't already have
+ * a FSM, this will save an inode and a few kB of space.
+ *
+ * XXX The API is a little awkward -- if the caller passes a valid nblocks
+ * value, it can avoid invoking a system call.  If the caller passes
+ * InvalidBlockNumber and receives a false return value, it can get an
+ * up-to-date relation size from get_nblocks.
+ */
+static bool
+fsm_allow_writes(Relation rel, BlockNumber heapblk,
+				 BlockNumber nblocks, BlockNumber *get_nblocks)
+{
+	bool		skip_get_nblocks;
+
+	if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
+		return true;
+
+	/* Non-heap rels can always create a FSM. */
+	if (rel->rd_rel->relkind != RELKIND_RELATION)
+		return true;
+
+	/*
+	 * If the caller knows nblocks, we can avoid a system call later.
+	 * If it doesn't, maybe we have relpages from a previous VACUUM.
+	 * Since the table may have been extended since then, we still have to
+	 * count the pages later if we can't return now.
+	 */
+	if (nblocks != InvalidBlockNumber)
+	{
+		if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+			return true;
+		else
+			skip_get_nblocks = true;
+	}
+	else
+	{
+		if (rel->rd_rel->relpages != InvalidBlockNumber &&
+			rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD)
+			return true;
+		else
+			skip_get_nblocks = false;
+	}
+
+	RelationOpenSmgr(rel);
+	if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
+		return true;
+
+	if (skip_get_nblocks)
+		return false;
+
+	/* last resort */
+	*get_nblocks = RelationGetNumberOfBlocks(rel);
+	if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+		return true;
+	else
+		return false;
+}
+
+/*
+ * Initialize or update the local map of blocks to try, for when there is
+ * no FSM.
+ *
+ * When we initialize the map, the whole heap is potentially available to
+ * try.  If a caller were to reset the map after another backend extends
+ * the relation, this would only flag the new blocks as available.  No
+ * callers do this currently, however.
+ */
+static void
+fsm_local_set(Relation rel, BlockNumber curr_nblocks)
+{
+	BlockNumber blkno,
+				cached_target_block;
+
+	/*
+	 * Starting at the current last block in the relation and working
+	 * backwards, mark alternating blocks as available, ending just after
+	 * the last block we have mapped.
+	 */
+	blkno = curr_nblocks - 1;
+	for (;;)
+	{
+		fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL;
+		if (blkno >= fsm_local_map.nblocks + 2)
+			blkno -= 2;
+		else
+			break;
+	}
+
+	/* Cache the number of blocks. */
+	fsm_local_map.nblocks = curr_nblocks;
+
+	/* Set the status of the cached target block to 'unavailable'. */
+	cached_target_block = RelationGetTargetBlock(rel);
+	if (cached_target_block != InvalidBlockNumber &&
+		cached_target_block < curr_nblocks)
+		fsm_local_map.map[cached_target_block] = FSM_LOCAL_NOT_AVAIL;
+}
+
+/*
+ * Search the local map for an available block to try, in descending order.
+ *
+ * For use when there is no FSM.
+ */
+static BlockNumber
+fsm_local_search(void)
+{
+	BlockNumber		target_block = fsm_local_map.nblocks;
+
+	Assert(target_block > 0);
+
+	do
+	{
+		target_block--;
+		if (fsm_local_map.map[target_block] == FSM_LOCAL_AVAIL)
+			return target_block;
+	}
+	while (target_block > 0);
+
+	return InvalidBlockNumber;
+}
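
To restate the decision order of fsm_allow_writes() in isolation, here is a
simplified standalone sketch (the relcache and smgr lookups are stubbed out
as plain parameters, so this is an illustration of the logic, not the patch
code itself):

#include <stdbool.h>
#include <stdio.h>

#define HEAP_FSM_CREATION_THRESHOLD 4
#define InvalidBlockNumber ((unsigned) 0xFFFFFFFF)

static bool
allow_writes(unsigned heapblk, bool is_heap, unsigned nblocks_hint,
			 bool fsm_fork_exists, unsigned actual_nblocks)
{
	/* Blocks at or beyond the threshold always go to the FSM. */
	if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
		return true;
	/* Non-heap relations always get a FSM. */
	if (!is_heap)
		return true;
	/* A caller-supplied size above the threshold settles it, no syscall. */
	if (nblocks_hint != InvalidBlockNumber &&
		nblocks_hint > HEAP_FSM_CREATION_THRESHOLD)
		return true;
	/* An already-existing FSM fork is always used. */
	if (fsm_fork_exists)
		return true;
	/* The caller told us the heap is small, so skip the page count. */
	if (nblocks_hint != InvalidBlockNumber)
		return false;
	/* Last resort: count the pages. */
	return actual_nblocks > HEAP_FSM_CREATION_THRESHOLD;
}

int
main(void)
{
	/* Block 2 of a 3-page heap with no FSM fork: use the local map. */
	printf("%d\n", allow_writes(2, true, InvalidBlockNumber, false, 3));
	return 0;
}

(The real function also consults rel->rd_rel->relpages as a second-chance
hint before touching smgr; that step is omitted above for brevity.)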
diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c
index e21047b96f..88a71ae548 100644
--- a/src/backend/storage/freespace/indexfsm.c
+++ b/src/backend/storage/freespace/indexfsm.c
@@ -37,7 +37,7 @@
 BlockNumber
 GetFreeIndexPage(Relation rel)
 {
-	BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2);
+	BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2, true);
 
 	if (blkno != InvalidBlockNumber)
 		RecordUsedIndexPage(rel, blkno);
@@ -51,7 +51,7 @@ GetFreeIndexPage(Relation rel)
 void
 RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
 {
-	RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+	RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1, InvalidBlockNumber);
 }
 
 
@@ -61,7 +61,7 @@ RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
 void
 RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
 {
-	RecordPageWithFreeSpace(rel, usedBlock, 0);
+	RecordPageWithFreeSpace(rel, usedBlock, 0, InvalidBlockNumber);
 }
 
 /*
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
index 726eb30fb8..b6f5b86ee5 100644
--- a/src/include/storage/freespace.h
+++ b/src/include/storage/freespace.h
@@ -18,15 +18,20 @@
 #include "storage/relfilenode.h"
 #include "utils/relcache.h"
 
+/* Only create the FSM if the heap has more than this many blocks */
+#define HEAP_FSM_CREATION_THRESHOLD 4
+
 /* prototypes for public functions in freespace.c */
 extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
-extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded,
+							bool check_fsm_only);
 extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
 							  BlockNumber oldPage,
 							  Size oldSpaceAvail,
 							  Size spaceNeeded);
 extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
-						Size spaceAvail);
+						Size spaceAvail, BlockNumber nblocks);
+extern void FSMClearLocalMap(void);
 extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
 							Size spaceAvail);
 
diff --git a/src/test/regress/expected/fsm.out b/src/test/regress/expected/fsm.out
new file mode 100644
index 0000000000..bf590bfa58
--- /dev/null
+++ b/src/test/regress/expected/fsm.out
@@ -0,0 +1,62 @@
+--
+-- Free Space Map test
+--
+CREATE TABLE fsm_check_size (num int, str text);
+-- Fill 3 blocks with as many large records as will fit
+-- No FSM
+INSERT INTO fsm_check_size SELECT g, rpad('', 1024, 'a')
+FROM generate_series(1,7*3) g;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ heap_size | fsm_size 
+-----------+----------
+     24576 |        0
+(1 row)
+
+-- Clear some space on block 0
+DELETE FROM fsm_check_size WHERE num <= 5;
+VACUUM fsm_check_size;
+-- Insert small record in block 2 to set the cached smgr targetBlock
+INSERT INTO fsm_check_size VALUES (99, 'b');
+-- Insert large record and make sure it goes in block 0 rather than
+-- causing the relation to extend
+INSERT INTO fsm_check_size VALUES (101, rpad('', 1024, 'a'));
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ heap_size | fsm_size 
+-----------+----------
+     24576 |        0
+(1 row)
+
+-- Extend table with enough blocks to exceed the FSM threshold
+-- FSM is created and extended to 3 blocks
+INSERT INTO fsm_check_size SELECT g, 'c' FROM generate_series(200,3000) g;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size 
+----------
+    24576
+(1 row)
+
+-- Truncate heap to 1 block
+-- No change in FSM
+DELETE FROM fsm_check_size WHERE num > 7;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size 
+----------
+    24576
+(1 row)
+
+-- Truncate heap to 0 blocks
+-- FSM now truncated to 2 blocks
+DELETE FROM fsm_check_size;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size 
+----------
+    16384
+(1 row)
+
+DROP TABLE fsm_check_size;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index cc0bbf5db9..4051a4ad4e 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -68,6 +68,12 @@ test: create_aggregate create_function_3 create_cast constraints triggers inheri
 # ----------
 test: sanity_check
 
+# ----------
+# fsm does a delete followed by vacuum, and running it in parallel can prevent
+# removal of rows.
+# ----------
+test: fsm
+
 # ----------
 # Believe it or not, select creates a table, subsequent
 # tests need.
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 0c10c7100c..ac1ea622d6 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -80,6 +80,7 @@ test: roleattributes
 test: create_am
 test: hash_func
 test: sanity_check
+test: fsm
 test: errors
 test: select
 test: select_into
diff --git a/src/test/regress/sql/fsm.sql b/src/test/regress/sql/fsm.sql
new file mode 100644
index 0000000000..a864c7fd88
--- /dev/null
+++ b/src/test/regress/sql/fsm.sql
@@ -0,0 +1,46 @@
+--
+-- Free Space Map test
+--
+
+CREATE TABLE fsm_check_size (num int, str text);
+
+-- Fill 3 blocks with as many large records as will fit
+-- No FSM
+INSERT INTO fsm_check_size SELECT g, rpad('', 1024, 'a')
+FROM generate_series(1,7*3) g;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Clear some space on block 0
+DELETE FROM fsm_check_size WHERE num <= 5;
+VACUUM fsm_check_size;
+
+-- Insert small record in block 2 to set the cached smgr targetBlock
+INSERT INTO fsm_check_size VALUES (99, 'b');
+
+-- Insert large record and make sure it goes in block 0 rather than
+-- causing the relation to extend
+INSERT INTO fsm_check_size VALUES (101, rpad('', 1024, 'a'));
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Extend table with enough blocks to exceed the FSM threshold
+-- FSM is created and extended to 3 blocks
+INSERT INTO fsm_check_size SELECT g, 'c' FROM generate_series(200,3000) g;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Truncate heap to 1 block
+-- No change in FSM
+DELETE FROM fsm_check_size WHERE num > 7;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Truncate heap to 0 blocks
+-- FSM now truncated to 2 blocks
+DELETE FROM fsm_check_size;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+DROP TABLE fsm_check_size;
-- 
2.17.1

From 66910dd20b72f13dace6907f7be2a06a86592a89 Mon Sep 17 00:00:00 2001
From: John Naylor <jcnay...@gmail.com>
Date: Thu, 10 Jan 2019 16:38:17 -0500
Subject: [PATCH v12 2/2] During pg_upgrade, conditionally skip transfer of
 FSMs.

If a heap in the old cluster has 4 pages or fewer, don't copy or
link the FSM. This will reduce space usage for installations with
large numbers of small tables.
---
 src/bin/pg_upgrade/info.c        | 11 +++++--
 src/bin/pg_upgrade/pg_upgrade.h  |  6 +++-
 src/bin/pg_upgrade/relfilenode.c | 56 ++++++++++++++++++--------------
 3 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index fd0b44c3ce..31a24e0bec 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -200,6 +200,7 @@ create_rel_filename_map(const char *old_data, const char *new_data,
 
 	map->old_db_oid = old_db->db_oid;
 	map->new_db_oid = new_db->db_oid;
+	map->relkind = old_rel->relkind;
 
 	/*
 	 * old_relfilenode might differ from pg_class.oid (and hence
@@ -418,6 +419,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
 	char	   *nspname = NULL;
 	char	   *relname = NULL;
 	char	   *tablespace = NULL;
+	char	   *relkind = NULL;
 	int			i_spclocation,
 				i_nspname,
 				i_relname,
@@ -425,7 +427,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
 				i_indtable,
 				i_toastheap,
 				i_relfilenode,
-				i_reltablespace;
+				i_reltablespace,
+				i_relkind;
 	char		query[QUERY_ALLOC];
 	char	   *last_namespace = NULL,
 			   *last_tablespace = NULL;
@@ -497,7 +500,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
 	 */
 	snprintf(query + strlen(query), sizeof(query) - strlen(query),
 			 "SELECT all_rels.*, n.nspname, c.relname, "
-			 "  c.relfilenode, c.reltablespace, %s "
+			 "  c.relfilenode, c.reltablespace, c.relkind, %s "
 			 "FROM (SELECT * FROM regular_heap "
 			 "      UNION ALL "
 			 "      SELECT * FROM toast_heap "
@@ -529,6 +532,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
 	i_relfilenode = PQfnumber(res, "relfilenode");
 	i_reltablespace = PQfnumber(res, "reltablespace");
 	i_spclocation = PQfnumber(res, "spclocation");
+	i_relkind = PQfnumber(res, "relkind");
 
 	for (relnum = 0; relnum < ntups; relnum++)
 	{
@@ -558,6 +562,9 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
 		relname = PQgetvalue(res, relnum, i_relname);
 		curr->relname = pg_strdup(relname);
 
+		relkind = PQgetvalue(res, relnum, i_relkind);
+		curr->relkind = relkind[0];
+
 		curr->relfilenode = atooid(PQgetvalue(res, relnum, i_relfilenode));
 		curr->tblsp_alloc = false;
 
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 51bd211d46..8eda9c1f4a 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -147,6 +147,7 @@ typedef struct
 	char	   *tablespace;		/* tablespace path; "" for cluster default */
 	bool		nsp_alloc;		/* should nspname be freed? */
 	bool		tblsp_alloc;	/* should tablespace be freed? */
+	char		relkind;		/* relation relkind -- see pg_class.h */
 } RelInfo;
 
 typedef struct
@@ -173,9 +174,12 @@ typedef struct
 	 */
 	Oid			old_relfilenode;
 	Oid			new_relfilenode;
-	/* the rest are used only for logging and error reporting */
+
+	/* These are used only for logging and error reporting. */
 	char	   *nspname;		/* namespaces */
 	char	   *relname;
+
+	char		relkind;		/* relation relkind -- see pg_class.h */
 } FileNameMap;
 
 /*
diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c
index 3b16c92a02..cc723ef1e2 100644
--- a/src/bin/pg_upgrade/relfilenode.c
+++ b/src/bin/pg_upgrade/relfilenode.c
@@ -14,10 +14,11 @@
 #include <sys/stat.h>
 #include "catalog/pg_class_d.h"
 #include "access/transam.h"
+#include "storage/freespace.h"
 
 
 static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
-static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
+static Size transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
 
 
 /*
@@ -144,6 +145,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 	int			mapnum;
 	bool		vm_crashsafe_match = true;
 	bool		vm_must_add_frozenbit = false;
+	Size		first_seg_size = 0;
 
 	/*
 	 * Do the old and new cluster disagree on the crash-safetiness of the vm
@@ -165,18 +167,22 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 		if (old_tablespace == NULL ||
 			strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
 		{
-			/* transfer primary file */
-			transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
+			/* Transfer main fork and return size of the first segment. */
+			first_seg_size = transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
 
 			/* fsm/vm files added in PG 8.4 */
 			if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
 			{
 				/*
-				 * Copy/link any fsm and vm files, if they exist
+				 * Copy/link any fsm and vm files, if they exist and if they would
+				 * be created in the new cluster.
 				 */
-				transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
+				if (maps[mapnum].relkind != RELKIND_RELATION ||
+					first_seg_size > HEAP_FSM_CREATION_THRESHOLD * BLCKSZ ||
+					GET_MAJOR_VERSION(new_cluster.major_version) <= 1100)
+					(void) transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
 				if (vm_crashsafe_match)
-					transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
+					(void) transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
 			}
 		}
 	}
@@ -190,7 +196,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
  * is true, visibility map forks are converted and rewritten, even in link
  * mode.
  */
-static void
+static Size
 transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
 {
 	char		old_file[MAXPGPATH];
@@ -198,6 +204,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
 	int			segno;
 	char		extent_suffix[65];
 	struct stat statbuf;
+	Size		first_seg_size = 0;
 
 	/*
 	 * Now copy/link any related segments as well. Remember, PG breaks large
@@ -226,26 +233,27 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
 				 type_suffix,
 				 extent_suffix);
 
-		/* Is it an extent, fsm, or vm file? */
-		if (type_suffix[0] != '\0' || segno != 0)
+		/* Did stat() fail? */
+		if (stat(old_file, &statbuf) != 0)
 		{
-			/* Did file open fail? */
-			if (stat(old_file, &statbuf) != 0)
-			{
-				/* File does not exist?  That's OK, just return */
-				if (errno == ENOENT)
-					return;
-				else
-					pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
-							 map->nspname, map->relname, old_file, new_file,
-							 strerror(errno));
-			}
-
-			/* If file is empty, just return */
-			if (statbuf.st_size == 0)
-				return;
+			/* Extent, fsm, or vm does not exist?  That's OK, just return */
+			if (errno == ENOENT &&
+				(type_suffix[0] != '\0' || segno != 0))
+				return first_seg_size;
+			else
+				pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
+						 map->nspname, map->relname, old_file, new_file,
+						 strerror(errno));
 		}
 
+		/* Save the size of the first segment of the main fork. */
+		else if (type_suffix[0] == '\0' && segno == 0)
+			first_seg_size = statbuf.st_size;
+
+		/* If extent, fsm, or vm is empty, just return */
+		else if (statbuf.st_size == 0)
+			return first_seg_size;
+
 		unlink(new_file);
 
 		/* Copying files might take some time, so give feedback. */
-- 
2.17.1

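For reference, the new fsm-transfer condition in relfilenode.c reduces to a
small predicate.  A standalone sketch (the wrapper function name is mine;
the constants and the 1100 major-version cutoff are taken from the patch,
where GET_MAJOR_VERSION() yields e.g. 1100 for v11):

#include <stdbool.h>
#include <stdio.h>

#define BLCKSZ						8192
#define HEAP_FSM_CREATION_THRESHOLD	4
#define RELKIND_RELATION			'r'

/*
 * Transfer the old _fsm fork only when the new cluster would create one:
 * always for non-heap relations, for heaps whose first main-fork segment
 * exceeds the threshold, and always when upgrading to a pre-v12 cluster.
 */
static bool
should_transfer_fsm(char relkind, long first_seg_size, int new_major)
{
	return relkind != RELKIND_RELATION ||
		first_seg_size > (long) HEAP_FSM_CREATION_THRESHOLD * BLCKSZ ||
		new_major <= 1100;
}

int
main(void)
{
	/* A 4-page heap moving to v12: the FSM is skipped (prints 0). */
	printf("%d\n", should_transfer_fsm(RELKIND_RELATION, 4L * BLCKSZ, 1200));
	/* An index always keeps its FSM (prints 1). */
	printf("%d\n", should_transfer_fsm('i', BLCKSZ, 1200));
	return 0;
}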