Hello Andres,
11.02.2022 05:22, Andres Freund wrote:
> Over in another thread I made some wild unsubstantiated guesses that the
> windows issues could have been made much more likely by a somewhat odd bit of
> code in PQisBusy():
>
> https://postgr.es/m/1959196.1644544971%40sss.pgh.pa.us
>
> Alexander, any chance you'd try if that changes the likelihood of the problem
> occurring, without any other fixes / reverts applied?
Unfortunately I haven't seen an improvement for the test in question.
With the PQisBusy-fix.patch from [1] and without any other changes on
the master branch (52377bb8) it still fails (on iterations 13, 5, 2, 2
for me).
The diagnostic logging (in attachment) added:
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_receive: PQgetCopyData
returned 0
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_receive: PQgetCopyData
2 returned -1
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_receive:
end-of-streaming or error: -1
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_PQgetResult:
streamConn->asyncStatus: 1 && streamConn->status: 0
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_receive
libpqrcv_PQgetResult returned 10551584, 1
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_receive
libpqrcv_PQgetResult PGRES_COMMAND_OK
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_PQgetResult:
streamConn->asyncStatus: 1 && streamConn->status: 0
2022-02-12 01:04:16.341 PST [4912] LOG:  libpqrcv_PQgetResult loop
before WaitLatchOrSocket
2022-02-12 01:04:16.341 PST [4912] LOG:  WSAEventSelect event->fd: 948,
flags: 21
2022-02-12 01:04:16.341 PST [4912] LOG:  WaitLatchOrSocket before
WaitEventSetWait
2022-02-12 01:04:16.341 PST [4912] LOG:  WaitEventSetWait before
WaitEventSetWaitBlock
2022-02-12 01:04:16.341 PST [4912] LOG:  WaitEventSetWaitBlock before
WaitForMultipleObjects: 3
...
shows that before the doomed WaitForMultipleObjects() call the field
conn->status is 0 (CONNECTION_OK).

[1] https://www.postgresql.org/message-id/2187263.1644616494%40sss.pgh.pa.us

Best regards,
Alexander
diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
index 0d89db4e6a..0776885306 100644
--- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
+++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
@@ -34,6 +34,9 @@
 #include "utils/pg_lsn.h"
 #include "utils/tuplestore.h"
 
+#undef ENABLE_THREAD_SAFETY
+#include "libpq-int.h"
+
 PG_MODULE_MAGIC;
 
 void		_PG_init(void);
@@ -696,10 +699,13 @@ libpqrcv_PQgetResult(PGconn *streamConn)
 	 * Collect data until PQgetResult is ready to get the result without
 	 * blocking.
 	 */
+elog(LOG, "libpqrcv_PQgetResult: streamConn->asyncStatus: %d && streamConn->status: %d", streamConn->asyncStatus, streamConn->status);
+
 	while (PQisBusy(streamConn))
 	{
 		int			rc;
 
+elog(LOG, "libpqrcv_PQgetResult loop before WaitLatchOrSocket");
 		/*
 		 * We don't need to break down the sleep into smaller increments,
 		 * since we'll get interrupted by signals and can handle any
@@ -711,6 +717,7 @@ libpqrcv_PQgetResult(PGconn *streamConn)
 							   PQsocket(streamConn),
 							   0,
 							   WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
+elog(LOG, "libpqrcv_PQgetResult loop after WaitLatchOrSocket: %d");
 
 		/* Interrupted? */
 		if (rc & WL_LATCH_SET)
@@ -771,6 +778,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 
 	/* Try to receive a CopyData message */
 	rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+elog(LOG, "libpqrcv_receive: PQgetCopyData returned %d", rawlen);
 	if (rawlen == 0)
 	{
 		/* Try consuming some data. */
@@ -782,6 +790,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 
 		/* Now that we've consumed some input, try again */
 		rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+elog(LOG, "libpqrcv_receive: PQgetCopyData 2 returned %d", rawlen);
 		if (rawlen == 0)
 		{
 			/* Tell caller to try again when our socket is ready. */
@@ -791,15 +800,20 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 	}
 	if (rawlen == -1)			/* end-of-streaming or error */
 	{
+elog(LOG, "libpqrcv_receive: end-of-streaming or error: %d", rawlen);
+
 		PGresult   *res;
 
 		res = libpqrcv_PQgetResult(conn->streamConn);
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult returned %d, %d", res, PQresultStatus(res));
 		if (PQresultStatus(res) == PGRES_COMMAND_OK)
 		{
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult PGRES_COMMAND_OK");
 			PQclear(res);
 
 			/* Verify that there are no more results. */
 			res = libpqrcv_PQgetResult(conn->streamConn);
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult 2 returned %p", res);
 			if (res != NULL)
 			{
 				PQclear(res);
@@ -809,6 +823,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 				 * we'd seen an error, or PGRES_COPY_IN) don't report an error
 				 * here, but let callers deal with it.
 				 */
+elog(LOG, "libpqrcv_receive PQstatus(conn->streamConn) returned %d", PQstatus(conn->streamConn));
 				if (PQstatus(conn->streamConn) == CONNECTION_BAD)
 					return -1;
 
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index b39fce8c23..6bbbf67344 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -425,6 +425,7 @@ WalReceiverMain(void)
 				bool		endofwal = false;
 				pgsocket	wait_fd = PGINVALID_SOCKET;
 				int			rc;
+elog(LOG, "!!!wal receiver loop");
 
 				/*
 				 * Exit walreceiver if we're not in recovery. This should not
@@ -447,6 +448,8 @@ WalReceiverMain(void)
 
 				/* See if we can read data immediately */
 				len = walrcv_receive(wrconn, &buf, &wait_fd);
+elog(LOG, "!!!walrcv_receive returned %d", len);
+
 				if (len != 0)
 				{
 					/*
@@ -514,6 +517,7 @@ WalReceiverMain(void)
 									   wait_fd,
 									   NAPTIME_PER_CYCLE,
 									   WAIT_EVENT_WAL_RECEIVER_MAIN);
+elog(LOG, "!!!WaitLatchOrSocket returned %d", rc);
 				if (rc & WL_LATCH_SET)
 				{
 					ResetLatch(MyLatch);
@@ -583,6 +587,7 @@ WalReceiverMain(void)
 					XLogWalRcvSendHSFeedback(false);
 				}
 			}
+elog(LOG, "!!!wal receiver loop finished");
 
 			/*
 			 * The backend finished streaming. Exit streaming COPY-mode from
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 5bb609b368..11afc03072 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -529,12 +529,12 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
 	if (wakeEvents & WL_SOCKET_MASK)
 	{
 		int			ev;
-
 		ev = wakeEvents & WL_SOCKET_MASK;
 		AddWaitEventToSet(set, ev, sock, NULL, NULL);
 	}
-
+elog(LOG, "WaitLatchOrSocket before WaitEventSetWait");
 	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+elog(LOG, "WaitLatchOrSocket after WaitEventSetWait");
 
 	if (rc == 0)
 		ret |= WL_TIMEOUT;
@@ -1280,6 +1280,7 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
 				elog(ERROR, "failed to create event for socket: error code %d",
 					 WSAGetLastError());
 		}
+elog(LOG, "WSAEventSelect event->fd: %d, flags: %x", event->fd, flags);
 		if (WSAEventSelect(event->fd, *handle, flags) != 0)
 			elog(ERROR, "failed to set up event for socket: error code %d",
 				 WSAGetLastError());
@@ -1392,8 +1393,10 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
 		 * this file. If -1 is returned, a timeout has occurred, if 0 we have
 		 * to retry, everything >= 1 is the number of returned events.
 		 */
+elog(LOG, "WaitEventSetWait before WaitEventSetWaitBlock");
 		rc = WaitEventSetWaitBlock(set, cur_timeout,
 								   occurred_events, nevents);
+elog(LOG, "WaitEventSetWait after WaitEventSetWaitBlock");
 
 		if (set->latch)
 		{
@@ -1888,9 +1891,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	 *
 	 * Need to wait for ->nevents + 1, because signal handle is in [0].
 	 */
+elog(LOG, "WaitEventSetWaitBlock before WaitForMultipleObjects: %d", set->nevents);
 	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
 								cur_timeout);
 
+elog(LOG, "WaitEventSetWaitBlock after WaitForMultipleObjects");
 	/* Check return code */
 	if (rc == WAIT_FAILED)
 		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",

Reply via email to