From 0e7b7f2a4189cb98722e0a6b0d4e76e18d980900 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Wed, 10 Aug 2022 20:36:25 +0200
Subject: [PATCH v2] Avoid reltuples distortion in very small tables.

It is possible that a small subset of pages contains the overwhelming
majority of a table's tuples. When such dense, bloated pages are vacuumed
on their own, the gathered statistics become skewed: the 2% barrier is not
reached, so the tuples on those pages never contribute to the tuple count
estimate.

This commit fixes (part of) that issue by always updating the tuple count
estimate whenever vacuum encounters more tuples than the current estimate
accounts for.
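
For illustration, here is a minimal standalone sketch of the guard this
patch adds around the early returns.  The function name and the main()
driver below are illustrative only and do not appear in vacuum.c; the
real code operates on the old_rel_tuples and scanned_tuples arguments
of vac_estimate_reltuples().

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * The previous reltuples value may only be kept when it is unknown
     * (-1) or not contradicted by this scan: a scan that counted more
     * tuples than the stored estimate proves the estimate too low, so
     * it must be recomputed.
     */
    static bool
    may_keep_old_estimate(double old_rel_tuples, double scanned_tuples)
    {
        return old_rel_tuples < 0 || old_rel_tuples > scanned_tuples;
    }

    int
    main(void)
    {
        printf("%d\n", may_keep_old_estimate(1000.0, 30.0)); /* 1: old may stand */
        printf("%d\n", may_keep_old_estimate(100.0, 450.0)); /* 0: must update */
        printf("%d\n", may_keep_old_estimate(-1.0, 450.0));  /* 1: no estimate yet */
        return 0;
    }

In the actual patch, the pre-existing 2% and single-page early returns
remain, but are only reachable when this guard holds.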
---
 src/backend/commands/vacuum.c | 45 ++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index dbdfe8bd2d..3ae7a89549 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1235,24 +1235,37 @@ vac_estimate_reltuples(Relation relation,
 		return scanned_tuples;
 
 	/*
-	 * When successive VACUUM commands scan the same few pages again and
-	 * again, without anything from the table really changing, there is a risk
-	 * that our beliefs about tuple density will gradually become distorted.
-	 * This might be caused by vacuumlazy.c implementation details, such as
-	 * its tendency to always scan the last heap page.  Handle that here.
+	 * The following block may decide not to update the estimated
+	 * tuple count of the table; most such cases stem from skew in
+	 * which blocks get scanned in which situations.
 	 *
-	 * If the relation is _exactly_ the same size according to the existing
-	 * pg_class entry, and only a few of its pages (less than 2%) were
-	 * scanned, keep the existing value of reltuples.  Also keep the existing
-	 * value when only a subset of rel's pages <= a single page were scanned.
-	 *
-	 * (Note: we might be returning -1 here.)
+	 * However, whenever we have a previous estimate and the count
+	 * in scanned_tuples is strictly larger, we must update our
+	 * estimate: the tuples we just counted are a hard lower bound,
+	 * so the previous estimate is strictly worse than the new count.
 	 */
-	if (old_rel_pages == total_pages &&
-		scanned_pages < (double) total_pages * 0.02)
-		return old_rel_tuples;
-	if (scanned_pages <= 1)
-		return old_rel_tuples;
+	if (old_rel_tuples > scanned_tuples || old_rel_tuples < 0)
+	{
+		/*
+		 * When successive VACUUM commands scan the same few pages again and
+		 * again, without anything from the table really changing, there is a risk
+		 * that our beliefs about tuple density will gradually become distorted.
+		 * This might be caused by vacuumlazy.c implementation details, such as
+		 * its tendency to always scan the last heap page.  Handle that here.
+		 *
+		 * If the relation is _exactly_ the same size according to the existing
+		 * pg_class entry, and only a few of its pages (less than 2%) were
+		 * scanned, keep the existing value of reltuples.  Also keep the existing
+		 * value when only a subset of rel's pages <= a single page were scanned.
+		 *
+		 * (Note: we might be returning -1 here.)
+		 */
+		if (old_rel_pages == total_pages &&
+			scanned_pages < (double) total_pages * 0.02)
+			return old_rel_tuples;
+		if (scanned_pages <= 1)
+			return old_rel_tuples;
+	}
 
 	/*
 	 * If old density is unknown, we can't do much except scale up
-- 
2.30.2

