Hi,
I had another fresh look at datachecksum_state.c while rebasing my
"Interrupts vs signals" patch set, and spotted a few minor things that I
think should be cleaned up. See commit messages for details.
Unless I'm missing something, the last patch fixes a bug, albeit a very
theoretical one. The crux is that when the launcher exits, the worker
might be left running; launcher_exit sends it SIGTERM but it might not
exit instantly. If the launcher is restarted, and it launches a new
worker while the old one is still running, the old worker might set
the worker's result field in shared memory, misleading the launcher to
believe that the *new* worker succeeded.
That race condition would be really hard to hit in practice - I didn't
even try to write a test - but it'd be nice to fix it. The patch adds a
unique ID to each worker invocation to distinguish the old and new
worker if both are running at the same time, ensuring that the old
worker doesn't mess with the new worker's state.
- Heikki
From db1171e6585e41319b3f365bcc988dd8c42b758e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 21:05:57 +0300
Subject: [PATCH 1/6] Move DataChecksumsWorkerResult enum to the .c file
It's not used anywhere else. Commit 07009121c2 removed the injection
point test code that the comment referred to.
---
src/backend/postmaster/datachecksum_state.c | 9 +++++++++
src/include/postmaster/datachecksum_state.h | 12 ------------
2 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index 04f1a268845..6fceb0b349c 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -276,6 +276,15 @@ static const ChecksumBarrierCondition checksum_barriers[9] =
{PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_OFF},
};
+/* Possible states for a database entry which has been processed */
+typedef enum
+{
+ DATACHECKSUMSWORKER_SUCCESSFUL = 0,
+ DATACHECKSUMSWORKER_ABORTED,
+ DATACHECKSUMSWORKER_FAILED,
+ DATACHECKSUMSWORKER_DROPDB,
+} DataChecksumsWorkerResult;
+
/*
* Signaling between backends calling pg_enable/disable_data_checksums, the
* checksums launcher process, and the checksums worker process.
diff --git a/src/include/postmaster/datachecksum_state.h b/src/include/postmaster/datachecksum_state.h
index 2a1ae10d55d..dbe25f5461f 100644
--- a/src/include/postmaster/datachecksum_state.h
+++ b/src/include/postmaster/datachecksum_state.h
@@ -24,18 +24,6 @@ typedef enum DataChecksumsWorkerOperation
DISABLE_DATACHECKSUMS,
} DataChecksumsWorkerOperation;
-/*
- * Possible states for a database entry which has been processed. Exported
- * here since we want to be able to reference this from injection point tests.
- */
-typedef enum
-{
- DATACHECKSUMSWORKER_SUCCESSFUL = 0,
- DATACHECKSUMSWORKER_ABORTED,
- DATACHECKSUMSWORKER_FAILED,
- DATACHECKSUMSWORKER_DROPDB,
-} DataChecksumsWorkerResult;
-
/* Prototypes for data checksum state manipulation */
bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier);
void EmitAndWaitDataChecksumsBarrier(uint32 state);
--
2.47.3
From 92d92feb622384f9b80f1f8cb0a5deb079ab7c46 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 23:29:51 +0300
Subject: [PATCH 2/6] Clarify StartDataChecksumsWorkerLauncher() function
Mark StartDataChecksumsWorkerLauncher() as static, since it's not
called from outside the .c file. The DataChecksumsWorkerOperation
enum can be moved into the .c file too.
I found the "Main entry point for datachecksumsworker launcher
process" description misleading. That description would be a better
fit for DataChecksumsWorkerLauncherMain(), which is the process's
"main" function, rather than StartDataChecksumsWorkerLauncher().
Reword the comment.
---
src/backend/postmaster/datachecksum_state.c | 18 ++++++++++++++----
src/include/postmaster/datachecksum_state.h | 14 --------------
2 files changed, 14 insertions(+), 18 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index 6fceb0b349c..91fff22d70d 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -276,6 +276,13 @@ static const ChecksumBarrierCondition checksum_barriers[9] =
{PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_OFF},
};
+/* Possible operations the DataChecksumsWorker can perform */
+typedef enum DataChecksumsWorkerOperation
+{
+ ENABLE_DATACHECKSUMS,
+ DISABLE_DATACHECKSUMS,
+} DataChecksumsWorkerOperation;
+
/* Possible states for a database entry which has been processed */
typedef enum
{
@@ -364,6 +371,9 @@ static volatile sig_atomic_t launcher_running = false;
static DataChecksumsWorkerOperation operation;
/* Prototypes */
+static void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+ int cost_delay,
+ int cost_limit);
static void DataChecksumsShmemRequest(void *arg);
static bool DatabaseExists(Oid dboid);
static List *BuildDatabaseList(void);
@@ -564,12 +574,12 @@ enable_data_checksums(PG_FUNCTION_ARGS)
/*
* StartDataChecksumsWorkerLauncher
- * Main entry point for datachecksumsworker launcher process
+ * Start the datachecksumsworker launcher process, if not running yet
*
- * The main entrypoint for starting data checksums processing for enabling as
- * well as disabling.
+ * This is called to start data checksums processing for enabling as well as
+ * disabling.
*/
-void
+static void
StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
int cost_delay,
int cost_limit)
diff --git a/src/include/postmaster/datachecksum_state.h b/src/include/postmaster/datachecksum_state.h
index dbe25f5461f..f34db0c09e0 100644
--- a/src/include/postmaster/datachecksum_state.h
+++ b/src/include/postmaster/datachecksum_state.h
@@ -17,24 +17,10 @@
#include "storage/procsignal.h"
-/* Possible operations the DataChecksumsWorker can perform */
-typedef enum DataChecksumsWorkerOperation
-{
- ENABLE_DATACHECKSUMS,
- DISABLE_DATACHECKSUMS,
-} DataChecksumsWorkerOperation;
-
/* Prototypes for data checksum state manipulation */
bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier);
void EmitAndWaitDataChecksumsBarrier(uint32 state);
-/* Prototypes for data checksum background worker */
-
-/* Start the background processes for enabling or disabling checksums */
-void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
- int cost_delay,
- int cost_limit);
-
/* Background worker entrypoints */
void DataChecksumsWorkerLauncherMain(Datum arg);
void DataChecksumsWorkerMain(Datum arg);
--
2.47.3
From 409926ec65f6b27020ee454903aba5b9fbe242db Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 23:29:59 +0300
Subject: [PATCH 3/6] Avoid leaving DataChecksumState->worker_pid to an old
value
It might be left set to an old value if the launcher was terminated while
a worker was running. launcher_exit() sends SIGTERM to the worker, but
did not clear 'worker_pid'. Clear it, to be tidy.
Also clear it in ProcessDatabase() before starting a new datachecksums
worker, to be sure we start from a clean slate. The codepath where
WaitForBackgroundWorkerStartup() returns BGWH_STOPPED but
worker_result != DATACHECKSUMSWORKER_SUCCESSFUL didn't clear it, while
all other codepaths did clear or set it.
---
src/backend/postmaster/datachecksum_state.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index 91fff22d70d..6b44e4ee9ea 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -817,6 +817,7 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;
+ DataChecksumState->worker_pid = InvalidPid;
LWLockRelease(DataChecksumsWorkerLock);
memset(&bgw, 0, sizeof(bgw));
@@ -856,9 +857,6 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
{
LWLockRelease(DataChecksumsWorkerLock);
pgstat_report_activity(STATE_IDLE, NULL);
- LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->worker_pid = InvalidPid;
- LWLockRelease(DataChecksumsWorkerLock);
return DataChecksumState->success;
}
LWLockRelease(DataChecksumsWorkerLock);
@@ -951,6 +949,7 @@ launcher_exit(int code, Datum arg)
ereport(LOG,
errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
kill(DataChecksumState->worker_pid, SIGTERM);
+ DataChecksumState->worker_pid = InvalidPid;
}
LWLockRelease(DataChecksumsWorkerLock);
}
--
2.47.3
From a297e5e6374e079d96e30450fe12888e22a0cbc4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 23:30:04 +0300
Subject: [PATCH 4/6] Minor cleanup around checking datachecksum worker result
Rename the 'success' field in DataChecksumState to 'worker_result'.
That's more appropriate when it's not a simple boolean.
Don't access the field after releasing the lock in ProcessDatabase().
No other process should be modifying it, but if we bother to do any
locking in the first place, let's do it right.
---
src/backend/postmaster/datachecksum_state.c | 35 +++++++++++----------
1 file changed, 19 insertions(+), 16 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index 6b44e4ee9ea..c7aff4a38da 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -340,7 +340,7 @@ typedef struct DataChecksumsStateStruct
*/
/* result, set by worker before exiting */
- DataChecksumsWorkerResult success;
+ DataChecksumsWorkerResult worker_result;
/*
* Tells the worker process whether it should also process the shared
@@ -814,9 +814,14 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
BgwHandleStatus status;
pid_t pid;
char activity[NAMEDATALEN + 64];
+ DataChecksumsWorkerResult result;
+ /*
+ * Initialize result to FAILED. The worker will change it to SUCCESSFUL
+ * if it completes successfully.
+ */
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_FAILED;
DataChecksumState->worker_pid = InvalidPid;
LWLockRelease(DataChecksumsWorkerLock);
@@ -853,11 +858,11 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
* for it we can see a STOPPED status here without it being a failure.
*/
LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
- if (DataChecksumState->success == DATACHECKSUMSWORKER_SUCCESSFUL)
+ if (DataChecksumState->worker_result == DATACHECKSUMSWORKER_SUCCESSFUL)
{
LWLockRelease(DataChecksumsWorkerLock);
pgstat_report_activity(STATE_IDLE, NULL);
- return DataChecksumState->success;
+ return DATACHECKSUMSWORKER_SUCCESSFUL;
}
LWLockRelease(DataChecksumsWorkerLock);
@@ -911,19 +916,17 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
db->dbname),
errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
- LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
- if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED)
- ereport(LOG,
- errmsg("data checksums processing was aborted in database \"%s\"",
- db->dbname));
- LWLockRelease(DataChecksumsWorkerLock);
-
- pgstat_report_activity(STATE_IDLE, NULL);
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ result = DataChecksumState->worker_result;
DataChecksumState->worker_pid = InvalidPid;
LWLockRelease(DataChecksumsWorkerLock);
- return DataChecksumState->success;
+ if (result == DATACHECKSUMSWORKER_ABORTED)
+ ereport(LOG,
+ errmsg("data checksums processing was aborted in database \"%s\"",
+ db->dbname));
+ pgstat_report_activity(STATE_IDLE, NULL);
+ return result;
}
/*
@@ -1649,7 +1652,7 @@ DataChecksumsWorkerMain(Datum arg)
if (aborted || abort_requested)
{
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
LWLockRelease(DataChecksumsWorkerLock);
ereport(DEBUG1,
errmsg("data checksum processing aborted in database OID %u",
@@ -1721,7 +1724,7 @@ DataChecksumsWorkerMain(Datum arg)
if (aborted || abort_requested)
{
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
LWLockRelease(DataChecksumsWorkerLock);
ereport(LOG,
errmsg("data checksum processing aborted in database OID %u",
@@ -1736,6 +1739,6 @@ DataChecksumsWorkerMain(Datum arg)
pgstat_progress_end_command();
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL;
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_SUCCESSFUL;
LWLockRelease(DataChecksumsWorkerLock);
}
--
2.47.3
From daf3b889f0b7b688db5d59ce586e2eb3c1e081c8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 23:30:09 +0300
Subject: [PATCH 5/6] Fix comment on WaitForAllTransactionsToFinish() on
postmaster death
The comment claimed that it sets "the abort flag" on postmaster death,
but it actually just errors out. In passing, improve the comment
to explain why it doesn't just use WL_EXIT_ON_PM_DEATH.
---
src/backend/postmaster/datachecksum_state.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index c7aff4a38da..1ad8fea93f0 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -1000,9 +1000,7 @@ launcher_cancel_handler(SIGNAL_ARGS)
* Blocks awaiting all current transactions to finish
*
* Returns when all transactions which are active at the call of the function
- * have ended, or if the postmaster dies while waiting. If the postmaster dies
- * the abort flag will be set to indicate that the caller of this shouldn't
- * proceed.
+ * have ended.
*
* NB: this will return early, if aborted by SIGINT or if the target state
* is changed while we're running.
@@ -1036,8 +1034,8 @@ WaitForAllTransactionsToFinish(void)
WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);
/*
- * If the postmaster died we won't be able to enable checksums
- * cluster-wide so abort and hope to continue when restarted.
+ * If the postmaster died, bail out. But first print a log message to
+ * note that the checksumming didn't complete.
*/
if (rc & WL_POSTMASTER_DEATH)
ereport(FATAL,
--
2.47.3
From e8d55bde15b1779c40c7bbdb9e7d5f29a68d49a8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Tue, 23 Jun 2026 23:30:17 +0300
Subject: [PATCH 6/6] Distinguish datachecksums worker invocations more reliably
In some corner cases, a new datachecksums worker could be launched
while an old one was still running. If you're really unlucky, the old
worker could set the worker_result in shared memory and mislead the
launcher to think that a newer worker invocation completed
successfully, even though it failed for some reason. That's highly
unlikely to happen in practice as it requires several race conditions
with workers and launchers starting, failing and succeeding at the
right moments. Nevertheless, better to tighten it up.
To distinguish different worker invocations, assign a unique
'worker_invocation' number every time a new worker is launched. In the
worker, check that the invocation number matches before setting the
worker result. This ensures that the result always belongs to the
latest invocation.
---
src/backend/postmaster/datachecksum_state.c | 99 +++++++++++++++------
1 file changed, 73 insertions(+), 26 deletions(-)
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
index 1ad8fea93f0..68557c16cb9 100644
--- a/src/backend/postmaster/datachecksum_state.c
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -315,11 +315,18 @@ typedef struct DataChecksumsStateStruct
bool launcher_running;
/*
- * PID of the worker process, if it's currently running, of InvalidPid if
- * none. This is set by the worker launcher when it starts waiting for a
- * worker process to finish.
+ * Every time a new worker is launched, it's assigned a unique invocation
+ * number by incrementing this counter.
*/
- pid_t worker_pid;
+ uint64 worker_invocation_counter;
+
+ /*
+ * Information about the current worker, if it's currently running. These
+ * are set by the worker launcher.
+ */
+ uint64 worker_invocation; /* unique invocation number */
+ Oid database_oid; /* database it's processing */
+ pid_t worker_pid; /* worker process's PID */
/*
* These fields indicate the target state that the worker is currently
@@ -361,6 +368,8 @@ typedef struct DataChecksumsWorkerDatabase
/* Flag set by the interrupt handler */
static volatile sig_atomic_t abort_requested = false;
+static uint64 worker_invocation;
+
/*
* Have we set the DataChecksumsStateStruct->launcher_running flag?
* If we have, we need to clear it before exiting!
@@ -389,10 +398,21 @@ const ShmemCallbacks DataChecksumsShmemCallbacks = {
.request_fn = DataChecksumsShmemRequest,
};
-#define CHECK_FOR_ABORT_REQUEST() \
+#define CHECK_FOR_LAUNCHER_ABORT_REQUEST() \
+ do { \
+ Assert(MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER); \
+ LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); \
+ if (DataChecksumState->launch_operation != operation) \
+ abort_requested = true; \
+ LWLockRelease(DataChecksumsWorkerLock); \
+ } while (0)
+
+#define CHECK_FOR_WORKER_ABORT_REQUEST() \
do { \
+ Assert(MyBackendType == B_DATACHECKSUMSWORKER_WORKER); \
LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); \
- if (DataChecksumState->launch_operation != operation) \
+ if (DataChecksumState->worker_invocation != worker_invocation || \
+ DataChecksumState->launch_operation != operation) \
abort_requested = true; \
LWLockRelease(DataChecksumsWorkerLock); \
} while (0)
@@ -726,11 +746,7 @@ ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrateg
/* Check if we are asked to abort, the abortion will bubble up. */
Assert(operation == ENABLE_DATACHECKSUMS);
- LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
- if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS)
- abort_requested = true;
- LWLockRelease(DataChecksumsWorkerLock);
-
+ CHECK_FOR_WORKER_ABORT_REQUEST();
if (abort_requested)
return false;
@@ -813,16 +829,23 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
BackgroundWorkerHandle *bgw_handle;
BgwHandleStatus status;
pid_t pid;
+ uint64 invocation;
char activity[NAMEDATALEN + 64];
DataChecksumsWorkerResult result;
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
/*
* Initialize result to FAILED. The worker will change it to SUCCESSFUL
* if it completes successfully.
*/
- LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
DataChecksumState->worker_result = DATACHECKSUMSWORKER_FAILED;
DataChecksumState->worker_pid = InvalidPid;
+
+ invocation = ++DataChecksumState->worker_invocation_counter;
+ DataChecksumState->worker_invocation = invocation;
+ DataChecksumState->database_oid = db->dboid;
+
LWLockRelease(DataChecksumsWorkerLock);
memset(&bgw, 0, sizeof(bgw));
@@ -834,7 +857,8 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksums worker");
bgw.bgw_restart_time = BGW_NEVER_RESTART;
bgw.bgw_notify_pid = MyProcPid;
- bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
+ /* pass the invocation number to the worker process */
+ bgw.bgw_main_arg = UInt64GetDatum(invocation);
/*
* If there are no worker slots available, there is little we can do. If
@@ -858,6 +882,7 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
* for it we can see a STOPPED status here without it being a failure.
*/
LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+ Assert(DataChecksumState->worker_invocation == invocation);
if (DataChecksumState->worker_result == DATACHECKSUMSWORKER_SUCCESSFUL)
{
LWLockRelease(DataChecksumsWorkerLock);
@@ -901,6 +926,7 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
/* Save the pid of the worker so we can signal it later */
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ Assert(DataChecksumState->worker_invocation == invocation);
DataChecksumState->worker_pid = pid;
LWLockRelease(DataChecksumsWorkerLock);
@@ -917,6 +943,7 @@ ProcessDatabase(DataChecksumsWorkerDatabase *db)
errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ Assert(DataChecksumState->worker_invocation == invocation);
result = DataChecksumState->worker_result;
DataChecksumState->worker_pid = InvalidPid;
LWLockRelease(DataChecksumsWorkerLock);
@@ -1044,7 +1071,7 @@ WaitForAllTransactionsToFinish(void)
errhint("Data checksums processing must be restarted manually after cluster restart."));
CHECK_FOR_INTERRUPTS();
- CHECK_FOR_ABORT_REQUEST();
+ CHECK_FOR_LAUNCHER_ABORT_REQUEST();
if (abort_requested)
break;
@@ -1145,13 +1172,9 @@ again:
* If the target state changed during processing then it's not a
* failure, so restart processing instead.
*/
- LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- if (DataChecksumState->launch_operation != operation)
- {
- LWLockRelease(DataChecksumsWorkerLock);
+ CHECK_FOR_LAUNCHER_ABORT_REQUEST();
+ if (abort_requested)
goto done;
- }
- LWLockRelease(DataChecksumsWorkerLock);
ereport(ERROR,
errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("unable to enable data checksums in cluster"));
@@ -1520,7 +1543,7 @@ BuildRelationList(bool temp_relations, bool include_shared)
void
DataChecksumsWorkerMain(Datum arg)
{
- Oid dboid = DatumGetObjectId(arg);
+ Oid dboid;
List *RelationList = NIL;
List *InitialTempTableList = NIL;
BufferAccessStrategy strategy;
@@ -1531,6 +1554,8 @@ DataChecksumsWorkerMain(Datum arg)
bool retried = false;
#endif
+ worker_invocation = DatumGetUInt64(arg);
+
operation = ENABLE_DATACHECKSUMS;
pqsignal(SIGTERM, die);
@@ -1541,6 +1566,15 @@ DataChecksumsWorkerMain(Datum arg)
MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
init_ps_display(NULL);
+ LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+ if (DataChecksumState->worker_invocation != worker_invocation)
+ {
+ LWLockRelease(DataChecksumsWorkerLock);
+ return;
+ }
+ dboid = DataChecksumState->database_oid;
+ LWLockRelease(DataChecksumsWorkerLock);
+
BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
BGWORKER_BYPASS_ALLOWCONN);
@@ -1556,6 +1590,11 @@ DataChecksumsWorkerMain(Datum arg)
*/
InitialTempTableList = BuildRelationList(true, false);
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumState->worker_invocation != worker_invocation)
+ {
+ LWLockRelease(DataChecksumsWorkerLock);
+ return;
+ }
process_shared = DataChecksumState->process_shared_catalogs;
/*
@@ -1611,7 +1650,7 @@ DataChecksumsWorkerMain(Datum arg)
pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
++rels_done);
CHECK_FOR_INTERRUPTS();
- CHECK_FOR_ABORT_REQUEST();
+ CHECK_FOR_WORKER_ABORT_REQUEST();
if (abort_requested)
break;
@@ -1622,6 +1661,11 @@ DataChecksumsWorkerMain(Datum arg)
* to be refreshed.
*/
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumState->worker_invocation != worker_invocation)
+ {
+ LWLockRelease(DataChecksumsWorkerLock);
+ break;
+ }
if ((DataChecksumState->launch_cost_delay != DataChecksumState->cost_delay)
|| (DataChecksumState->launch_cost_limit != DataChecksumState->cost_limit))
{
@@ -1650,7 +1694,8 @@ DataChecksumsWorkerMain(Datum arg)
if (aborted || abort_requested)
{
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
+ if (DataChecksumState->worker_invocation == worker_invocation)
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
LWLockRelease(DataChecksumsWorkerLock);
ereport(DEBUG1,
errmsg("data checksum processing aborted in database OID %u",
@@ -1717,12 +1762,13 @@ DataChecksumsWorkerMain(Datum arg)
WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);
CHECK_FOR_INTERRUPTS();
- CHECK_FOR_ABORT_REQUEST();
+ CHECK_FOR_WORKER_ABORT_REQUEST();
if (aborted || abort_requested)
{
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
+ if (DataChecksumState->worker_invocation == worker_invocation)
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_ABORTED;
LWLockRelease(DataChecksumsWorkerLock);
ereport(LOG,
errmsg("data checksum processing aborted in database OID %u",
@@ -1737,6 +1783,7 @@ DataChecksumsWorkerMain(Datum arg)
pgstat_progress_end_command();
LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
- DataChecksumState->worker_result = DATACHECKSUMSWORKER_SUCCESSFUL;
+ if (DataChecksumState->worker_invocation == worker_invocation)
+ DataChecksumState->worker_result = DATACHECKSUMSWORKER_SUCCESSFUL;
LWLockRelease(DataChecksumsWorkerLock);
}
--
2.47.3