Re: VM corruption on standby

Aleksander Alekseev Thu, 07 Aug 2025 07:10:23 -0700

Hi,

> This is a tricky bug. Do you also have a proposal of a particular fix?


If my understanding is correct, we should make a WAL record with the
XLH_LOCK_ALL_FROZEN_CLEARED flag *before* we modify the VM but within
the same critical section (in order to avoid race conditions within
the same backend).

A draft patch is attached. It makes the test pass and doesn't seem to
break any other tests.

Thoughts?

From 2909bfd3483b4796cb671c722520853d3b6ce61b Mon Sep 17 00:00:00 2001
From: Andrey Borodin <[email protected]>
Date: Sun, 27 Jul 2025 11:37:55 +0500
Subject: [PATCH v2 1/2] Corrupt VM on standby

---
 src/backend/access/heap/heapam.c              |   3 +
 src/backend/access/heap/heapam_xlog.c         |   1 +
 src/test/modules/test_slru/Makefile           |   2 +-
 src/test/modules/test_slru/t/001_multixact.pl | 172 +++++++++---------
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  40 ++++
 5 files changed, 128 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0dcd6ee817e..e5da051aaf0 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2123,6 +2123,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	 */
 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
 
+	INJECTION_POINT_LOAD("heap-insert:visibilitymap-clear");
+
 	/* NO EREPORT(ERROR) from here till changes are logged */
 	START_CRIT_SECTION();
 
@@ -2136,6 +2138,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		visibilitymap_clear(relation,
 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
 							vmbuffer, VISIBILITYMAP_VALID_BITS);
+		INJECTION_POINT_CACHED("heap-insert:visibilitymap-clear", NULL);
 	}
 
 	/*
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index eb4bd3d6ae3..978030048b6 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -21,6 +21,7 @@
 #include "access/xlogutils.h"
 #include "storage/freespace.h"
 #include "storage/standby.h"
+#include "utils/injection_point.h"
 
 
 /*
diff --git a/src/test/modules/test_slru/Makefile b/src/test/modules/test_slru/Makefile
index fe4e7390437..b6ad41ec804 100644
--- a/src/test/modules/test_slru/Makefile
+++ b/src/test/modules/test_slru/Makefile
@@ -7,7 +7,7 @@ OBJS = \
 	test_multixact.o
 PGFILEDESC = "test_slru - test module for SLRUs"
 
-EXTRA_INSTALL=src/test/modules/injection_points
+EXTRA_INSTALL=src/test/modules/injection_points contrib/pg_visibility
 export enable_injection_points
 TAP_TESTS = 1
 
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..0c347a5157d 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -1,11 +1,6 @@
 # Copyright (c) 2024-2025, PostgreSQL Global Development Group
 
-# This test verifies edge case of reading a multixact:
-# when we have multixact that is followed by exactly one another multixact,
-# and another multixact have no offset yet, we must wait until this offset
-# becomes observable. Previously we used to wait for 1ms in a loop in this
-# case, but now we use CV for this. This test is exercising such a sleep.
-
+# This test verifies edge case of visibilitymap_clear
 use strict;
 use warnings FATAL => 'all';
 
@@ -18,106 +13,105 @@ if ($ENV{enable_injection_points} ne 'yes')
 {
 	plan skip_all => 'Injection points not supported by this build';
 }
+if ($windows_os)
+{
+	plan skip_all => 'Kill9 works unpredicatably on Windows';
+}
 
 my ($node, $result);
 
 $node = PostgreSQL::Test::Cluster->new('mike');
-$node->init;
+$node->init(allows_streaming => 1);
 $node->append_conf('postgresql.conf',
 	"shared_preload_libraries = 'test_slru,injection_points'");
 $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+$node->safe_psql('postgres', q(CREATE EXTENSION pg_visibility));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
-
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $backup_name = 'my_backup';
+$node->backup($backup_name);
+my $standby = PostgreSQL::Test::Cluster->new('standby');
+$standby->init_from_backup($node, $backup_name, has_streaming => 1);
+$standby->start;
+
+my $bg_psql = $node->background_psql('postgres');
+
+# prepare the table, index and freeze some rows.
+# index is here only to demonstrate selecting deleted data on standby
+my $multi = $bg_psql->query_safe(
+	q(
+	CREATE TABLE x(i int);
+	CREATE INDEX on x(i);
+	INSERT INTO x VALUES (1);
+	VACUUM FREEZE x;
+	));
+
+# next backand clearing VM in heap_insert() will hang after releasing buffer lock
 $node->safe_psql('postgres',
-	q{SELECT injection_points_attach('multixact-create-from-members','wait');}
+	q{SELECT injection_points_attach('heap-insert:visibilitymap-clear','wait');}
 );
 
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-	qr/start/, q{
-	\echo start
-	SELECT test_create_multixact();
-});
-
-$node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, test-multixact-read',
-	"matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, multixact-get-members-cv-sleep',
-	"matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
+# we expect this backend to hang forever
+$bg_psql->query_until(
+	qr/deploying lost VM bit/, q(
+\echo deploying lost VM bit
+	INSERT INTO x VALUES (1);
+));
 
+$node->wait_for_event('client backend', 'heap-insert:visibilitymap-clear');
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
+	q{SELECT injection_points_detach('heap-insert:visibilitymap-clear')});
+
+# now we want VM on-disk. Checkpoint will hang too, hence another background session.
+my $bg_psql1 = $node->background_psql('postgres');
+$bg_psql1->query_until(
+	qr/flush data to disk/, q(
+\echo flush data to disk
+	checkpoint;
+));
+
+sleep(1);
+# All set and done, it's time for hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
+$bg_psql1->{run}->finish;
+
+# Now VMs are different on primary and standby. Insert and some data to demonstrate problems.
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
-
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+	q{
+	INSERT INTO x VALUES (1);
+	INSERT INTO x VALUES (2);
+	DELETE FROM x WHERE i = 2;
+	});
+
+$node->wait_for_catchup($standby);
+
+# corruptino tests
+my $corrupted_all_frozen = $standby->safe_psql('postgres',
+	qq{SELECT pg_check_frozen('x')});
+my $corrupted_all_visible = $standby->safe_psql('postgres',
+	qq{SELECT pg_check_visible('x')});
+my $observed_deleted_data = $standby->safe_psql('postgres',
+	qq{set enable_seqscan to false; SELECT * FROM x WHERE i = 2;});
+
+# debug printing
+print("\n");
+print($corrupted_all_frozen);
+print("\n\n");
+print($corrupted_all_visible);
+print("\n\n");
+print($observed_deleted_data);
+print("\n\n");
+
+# fail the test if any corruption is reported
+is($corrupted_all_frozen, '', 'pg_check_frozen() observes corruption');
+is($corrupted_all_visible, '', 'pg_check_visible() observes corruption');
+is($observed_deleted_data, '', 'deleted data returned by select');
 
 $node->stop;
+$standby->stop;
 
-# If we reached this point - everything is OK.
-ok(1);
 done_testing();
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..e810f123f93 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1191,6 +1191,46 @@ sub start
 
 =pod
 
+=item $node->poll_start() => success_or_failure
+
+Polls start after kill9
+
+We may need retries to start a new postmaster.  Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+	my ($self) = @_;
+
+	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		$self->start(fail_ok => 1) && return 1;
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		# Clean up in case the start attempt just timed out or some such.
+		$self->stop('fast', fail_ok => 1);
+
+		$attempts++;
+	}
+
+	# Try one last time without fail_ok, which will BAIL_OUT unless it
+	# succeeds.
+	$self->start && return 1;
+	return 0;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
-- 
2.43.0

From 4617d97e2953fcb5b20bb14750bcc971ed504286 Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <[email protected]>
Date: Thu, 7 Aug 2025 17:04:30 +0300
Subject: [PATCH v2 2/2] Bugfix TODO FIXME write a better message

---
 src/backend/access/heap/heapam.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e5da051aaf0..6e38c9b4d37 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2135,10 +2135,13 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	{
 		all_visible_cleared = true;
 		PageClearAllVisible(BufferGetPage(buffer));
-		visibilitymap_clear(relation,
-							ItemPointerGetBlockNumber(&(heaptup->t_self)),
-							vmbuffer, VISIBILITYMAP_VALID_BITS);
-		INJECTION_POINT_CACHED("heap-insert:visibilitymap-clear", NULL);
+
+		/*
+		 * Visibility map should be updated *after* logging the change
+		 * within the same critical section.
+		 *
+		 * AALEKSEEV TODO clarify the reason
+		 */
 	}
 
 	/*
@@ -2233,6 +2236,14 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		PageSetLSN(page, recptr);
 	}
 
+	if (all_visible_cleared)
+	{
+		visibilitymap_clear(relation,
+							ItemPointerGetBlockNumber(&(heaptup->t_self)),
+							vmbuffer, VISIBILITYMAP_VALID_BITS);
+		INJECTION_POINT_CACHED("heap-insert:visibilitymap-clear", NULL);
+	}
+
 	END_CRIT_SECTION();
 
 	UnlockReleaseBuffer(buffer);
-- 
2.43.0

Re: VM corruption on standby

Reply via email to