Hello, sorry for the dazed reply in the previous mail.

I made revised patch for this issue.

Attached patches are following,

- 0001_Revise_socket_emulation_for_win32_backend.patch

  Revises socket emulation on win32 backend so that each socket
  can have its own blocking mode state.

- 0002_Allow_backend_termination_during_write_blocking.patch

  The patch to solve the issue. This patch depends on the 0001_
  patch.

==========

> > I'm marking this as Waiting on Author in the commitfest app, because:
> > 1. the protocol violation needs to be avoided one way or another, and
> > 2. the behavior needs to be consistent so that a single
> > pg_terminate_backend() is enough to always kill the connection.

- Preventing protocol violation.

  To prevent protocol violation, secure_write sets
  ClientConnectionLost when SIGTERM detected, then
  internal_flush() and ProcessInterrupts() follow the
  instruction.

- Single pg_terminate_backend surely kills the backend.

  secure_raw_write() uses non-blocking socket and a loop of
  select() with timeout to surely detects received
  signal(SIGTERM).

  To avoid frequent switching of blocking mode, the bare socket
  for Port is put to non-blocking mode from the first in
  StreamConnection() and blocking mode is controlled only by
  Port->noblock in secure_raw_read/write().

  To make the code mentioned above (Patch 0002) tidy, rewrite the
  socket emulation code for win32 backends so that each socket
  can have its own non-blocking state. (patch 0001)

Some concern about this patch,

- This patch allows the number of non-blocking socket to be below
  64 (FD_SETSIZE) on win32 backend but it seems to be sufficient.

- This patch introduced redundant socket emulation for win32
  backend but win32 bare socket for Port is already nonblocking
  as described so it donsn't seem to be a serious problem on
  performance. Addition to it, since I don't know the reason why
  win32/socket.c provides the blocking-mode socket emulation, I
  decided to preserve win32/socket.c to have blocking socket
  emulation. Possibly it can be removed.

Any suggestions?

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 605d891..c92851e 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -795,10 +795,6 @@ pq_set_nonblocking(bool nonblocking)
 	if (MyProcPort->noblock == nonblocking)
 		return;
 
-#ifdef WIN32
-	pgwin32_noblock = nonblocking ? 1 : 0;
-#else
-
 	/*
 	 * Use COMMERROR on failure, because ERROR would try to send the error to
 	 * the client, which might require changing the mode again, leading to
@@ -816,7 +812,7 @@ pq_set_nonblocking(bool nonblocking)
 			ereport(COMMERROR,
 					(errmsg("could not set socket to blocking mode: %m")));
 	}
-#endif
+
 	MyProcPort->noblock = nonblocking;
 }
 
diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c
index c981169..f0ff3e7 100644
--- a/src/backend/port/win32/socket.c
+++ b/src/backend/port/win32/socket.c
@@ -21,11 +21,8 @@
  * non-blocking mode in order to be able to deliver signals, we must
  * specify this in a separate flag if we actually need non-blocking
  * operation.
- *
- * This flag changes the behaviour *globally* for all socket operations,
- * so it should only be set for very short periods of time.
  */
-int			pgwin32_noblock = 0;
+static fd_set		nonblockset;
 
 #undef socket
 #undef accept
@@ -33,6 +30,7 @@ int			pgwin32_noblock = 0;
 #undef select
 #undef recv
 #undef send
+#undef closesocket
 
 /*
  * Blocking socket functions implemented so they listen on both
@@ -40,6 +38,34 @@ int			pgwin32_noblock = 0;
  */
 
 /*
+ * Set blocking mode for each socket
+ */
+void
+pgwin32_set_socket_nonblock(SOCKET s, int nonblock)
+{
+	if (nonblock)
+		FD_SET(s, &nonblockset);
+	else
+		FD_CLR(s, &nonblockset);
+
+	/*
+	 * fd_set cannot have more than FD_SETSIZE entries. It's not likey to come
+	 * close to this limit but if it goes above the limit, non blocking state
+	 * of some existing sockets will be discarded.
+	 */
+	if (nonblockset.fd_count >= FD_SETSIZE)
+		elog(FATAL, "Too many sockets requested to be nonblocking mode.");
+}
+
+void
+pgwin32_nonblockset_init()
+{
+	FD_ZERO(&nonblockset);
+}
+
+#define socket_is_nonblocking(s) FD_ISSET((s), &nonblockset)
+
+/*
  * Convert the last socket error code into errno
  */
 static void
@@ -256,6 +282,10 @@ pgwin32_socket(int af, int type, int protocol)
 		TranslateSocketError();
 		return INVALID_SOCKET;
 	}
+
+	/* newly cerated socket should be in blocking mode  */
+	pgwin32_set_socket_nonblock(s, false);
+
 	errno = 0;
 
 	return s;
@@ -334,7 +364,7 @@ pgwin32_recv(SOCKET s, char *buf, int len, int f)
 		return -1;
 	}
 
-	if (pgwin32_noblock)
+	if (socket_is_nonblocking(s))
 	{
 		/*
 		 * No data received, and we are in "emulated non-blocking mode", so
@@ -420,7 +450,7 @@ pgwin32_send(SOCKET s, const void *buf, int len, int flags)
 			return -1;
 		}
 
-		if (pgwin32_noblock)
+		if (socket_is_nonblocking(s))
 		{
 			/*
 			 * No data sent, and we are in "emulated non-blocking mode", so
@@ -645,6 +675,15 @@ pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, c
 
 
 /*
+ * Unused entry in nonblockset needs to be removed when closing socket.
+ */
+int pgwin32_closesocket(SOCKET s)
+{
+	pgwin32_set_socket_nonblock(s, false);
+	return closesocket(s);
+}
+
+/*
  * Return win32 error string, since strerror can't
  * handle winsock codes
  */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index c7f41a5..72e0576 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3255,22 +3255,10 @@ PgstatCollectorMain(int argc, char *argv[])
 			/*
 			 * Try to receive and process a message.  This will not block,
 			 * since the socket is set to non-blocking mode.
-			 *
-			 * XXX On Windows, we have to force pgwin32_recv to cooperate,
-			 * despite the previous use of pg_set_noblock() on the socket.
-			 * This is extremely broken and should be fixed someday.
 			 */
-#ifdef WIN32
-			pgwin32_noblock = 1;
-#endif
-
 			len = recv(pgStatSock, (char *) &msg,
 					   sizeof(PgStat_Msg), 0);
 
-#ifdef WIN32
-			pgwin32_noblock = 0;
-#endif
-
 			if (len < 0)
 			{
 				if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b190cf5..5d32de6 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -896,6 +896,10 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	InitializeMaxBackends();
 
+#ifdef WIN32	
+	pgwin32_nonblockset_init();
+#endif
+
 	/*
 	 * Establish input sockets.
 	 */
diff --git a/src/include/port/win32.h b/src/include/port/win32.h
index 550c3ec..b0df45e 100644
--- a/src/include/port/win32.h
+++ b/src/include/port/win32.h
@@ -368,6 +368,7 @@ void		pg_queue_signal(int signum);
 #define select(n, r, w, e, timeout) pgwin32_select(n, r, w, e, timeout)
 #define recv(s, buf, len, flags) pgwin32_recv(s, buf, len, flags)
 #define send(s, buf, len, flags) pgwin32_send(s, buf, len, flags)
+#define closesocket(s) pgwin32_closesocket(s)
 
 SOCKET		pgwin32_socket(int af, int type, int protocol);
 SOCKET		pgwin32_accept(SOCKET s, struct sockaddr * addr, int *addrlen);
@@ -375,11 +376,12 @@ int			pgwin32_connect(SOCKET s, const struct sockaddr * name, int namelen);
 int			pgwin32_select(int nfds, fd_set *readfs, fd_set *writefds, fd_set *exceptfds, const struct timeval * timeout);
 int			pgwin32_recv(SOCKET s, char *buf, int len, int flags);
 int			pgwin32_send(SOCKET s, const void *buf, int len, int flags);
+int			pgwin32_closesocket(SOCKET s);
 
 const char *pgwin32_socket_strerror(int err);
 int			pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout);
-
-extern int	pgwin32_noblock;
+void		pgwin32_set_socket_nonblock(SOCKET s, int nonblock);
+void		pgwin32_nonblockset_init();
 
 /* in backend/port/win32/security.c */
 extern int	pgwin32_is_admin(void);
diff --git a/src/port/noblock.c b/src/port/noblock.c
index 1da0339..d6cc6a2 100644
--- a/src/port/noblock.c
+++ b/src/port/noblock.c
@@ -25,9 +25,18 @@ pg_set_noblock(pgsocket sock)
 #else
 	unsigned long ioctlsocket_ret = 1;
 
+#ifndef FRONTEND
+	/*
+	 * sockets on non-frontend processes on win32 is wrapped and blocking mode
+	 * is controlled there. See socket.c for the details.
+	 */
+	pgwin32_set_socket_nonblock(sock, true);
+	return 1;
+#else
 	/* Returns non-0 on failure, while fcntl() returns -1 on failure */
 	return (ioctlsocket(sock, FIONBIO, &ioctlsocket_ret) == 0);
-#endif
+#endif /* FRONTEND */
+#endif /* !WIN32   */
 }
 
 
@@ -41,10 +50,16 @@ pg_set_block(pgsocket sock)
 	if (flags < 0 || fcntl(sock, F_SETFL, (long) (flags & ~O_NONBLOCK)))
 		return false;
 	return true;
-#else
+#else /* !WIN32   */
 	unsigned long ioctlsocket_ret = 0;
 
+#ifndef FRONTEND
+	/*  See pg_set_noblock */
+	pgwin32_set_socket_nonblock(sock, false);
+	return 1;
+#else
 	/* Returns non-0 on failure, while fcntl() returns -1 on failure */
 	return (ioctlsocket(sock, FIONBIO, &ioctlsocket_ret) == 0);
-#endif
+#endif /* FRONTEND */
+#endif /* !WIN32   */
 }
diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 41ec1ad..fbb4c47 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -34,7 +34,7 @@
 #include "libpq/libpq.h"
 #include "tcop/tcopprot.h"
 #include "utils/memutils.h"
-
+#include "miscadmin.h"
 
 char	   *ssl_cert_file;
 char	   *ssl_key_file;
@@ -140,6 +140,10 @@ secure_read(Port *port, void *ptr, size_t len)
 	return n;
 }
 
+/*
+ *  Read data from socket.
+ *  This emulates blocking behavior using non-blocking sockets.
+ */
 ssize_t
 secure_raw_read(Port *port, void *ptr, size_t len)
 {
@@ -147,8 +151,34 @@ secure_raw_read(Port *port, void *ptr, size_t len)
 
 	prepare_for_client_read();
 
-	n = recv(port->sock, ptr, len, 0);
+	if (port->noblock)
+		n = recv(port->sock, ptr, len, 0);
+	else
+	{
+		do
+		{
+			fd_set rfds;
+
+			FD_ZERO(&rfds);
+			FD_SET(port->sock, &rfds);
 
+			/*
+			 * In contrast to secure_raw_write, this section runs with
+			 * ImmediateInterruptOK = true so we can wait forever in
+			 * select.
+			 */
+			n = select(port->sock + 1, &rfds, NULL, NULL, NULL);
+			if (n < 0) break;
+
+			n = recv(port->sock, ptr, len, 0);
+
+			/*
+			 * We should have something to read here so EAGAIN/EWOULDBLOCK is
+			 * likey not to be seen. But we check them here not to return
+			 * these error numbers for blocking sockets for the caller.
+			 */
+		} while (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK));
+	}
 	client_read_ended();
 
 	return n;
@@ -178,5 +208,77 @@ secure_write(Port *port, void *ptr, size_t len)
 ssize_t
 secure_raw_write(Port *port, const void *ptr, size_t len)
 {
-	return send(port->sock, ptr, len, 0);
+	int ret = 0;
+
+	/*
+	 * Port socket is always in non-blocking mode. See StreamConnection for
+	 * the details.
+	 */
+	ret = send(port->sock, ptr, len, 0);
+
+	/* We can return here regardless of blocking mode in the most cases */
+	if (port->noblock || ret > 0 || len == 0)
+		return ret;
+
+	/* Here, we shold block waiting for the room in send buffer. */
+	while(ret < 1 && !ProcDiePending)
+	{
+		fd_set wfds;
+		struct timeval tv;
+		int i = 0;
+
+		FD_ZERO(&wfds);
+		tv.tv_usec = 0;
+
+		/*
+		 * We may get terminate signal (SIGTERM) during write blocking. If we
+		 * check ProcDiePending then wait by select indefinitely, SIGTERM
+		 * comes after the check and before the select will be pending and we
+		 * should wait the second SIGTERM. So we periodically wake up to check
+		 * ProcDiePending in order to catch the signal surely.  The timeout
+		 * for the select is the maximum delay of handling the signal. 1
+		 * seconds groundlessly seems to be appropreate.
+		 */
+		do
+		{
+			FD_SET(port->sock, &wfds);
+			tv.tv_sec = 1;
+			tv.tv_usec = 0;
+
+			ret = select(port->sock + 1, NULL, &wfds, NULL, &tv);
+		} while (!ProcDiePending && ret == 0);
+
+		if (ProcDiePending || ret < 0)
+			break;
+
+		ret = send(port->sock, ptr, len, 0);
+		if (ProcDiePending)
+			break;
+		if (ret < 0)
+		{
+			if (errno != EAGAIN && errno != EWOULDBLOCK)
+				break;
+
+			/*
+			 * This loop might run a busy loop if send(2) returned EAGAIN or
+			 * EWOULDBLOCK after select(2) returned normally. Sleep expressly
+			 * to avoid the busy loop.
+			 */
+			pg_usleep(200000L); /* 200 ms */
+			ret = 0;
+		}
+	}
+
+	if (ProcDiePending)
+	{
+		/*
+		 * Allow to terminate this backend. ClientConnectionLost prevents any
+		 * more bytes including error messages from being sent to
+		 * client. errno is set in order to teach ssl layer not to retry.
+		 */
+		ClientConnectionLost = 1;
+		errno = ECONNRESET;
+	}
+	
+	return ret;
 }
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index c92851e..8387d6a 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -718,6 +718,17 @@ StreamConnection(pgsocket server_fd, Port *port)
 		(void) pq_setkeepalivescount(tcp_keepalives_count, port);
 	}
 
+	/*
+	 * Put this socket to non-blocking mode. Blocking behavior is emulated in
+	 * secure_write() and secure_read().
+	 * Use COMMERROR on failure, because ERROR would try to send the error to
+	 * the client, which might require changing the mode again, leading to
+	 * infinite recursion.
+	 */
+	if (!pg_set_noblock(port->sock))
+		ereport(COMMERROR,
+				(errmsg("could not set socket to nonblocking mode: %m")));
+
 	return STATUS_OK;
 }
 
@@ -792,27 +803,6 @@ TouchSocketFiles(void)
 static void
 pq_set_nonblocking(bool nonblocking)
 {
-	if (MyProcPort->noblock == nonblocking)
-		return;
-
-	/*
-	 * Use COMMERROR on failure, because ERROR would try to send the error to
-	 * the client, which might require changing the mode again, leading to
-	 * infinite recursion.
-	 */
-	if (nonblocking)
-	{
-		if (!pg_set_noblock(MyProcPort->sock))
-			ereport(COMMERROR,
-					(errmsg("could not set socket to nonblocking mode: %m")));
-	}
-	else
-	{
-		if (!pg_set_block(MyProcPort->sock))
-			ereport(COMMERROR,
-					(errmsg("could not set socket to blocking mode: %m")));
-	}
-
 	MyProcPort->noblock = nonblocking;
 }
 
@@ -1249,34 +1239,38 @@ internal_flush(void)
 
 		if (r <= 0)
 		{
-			if (errno == EINTR)
-				continue;		/* Ok if we were interrupted */
-
-			/*
-			 * Ok if no data writable without blocking, and the socket is in
-			 * non-blocking mode.
-			 */
-			if (errno == EAGAIN ||
-				errno == EWOULDBLOCK)
+			if (!ClientConnectionLost)
 			{
+				if (errno == EINTR)
+					continue;		/* Ok if we were interrupted */
+
+				/*
+				 * Ok if no data writable without blocking, and the socket is in
+				 * non-blocking mode.
+				 */
+				if (errno == EAGAIN ||
+					errno == EWOULDBLOCK)
+				{
 				return 0;
-			}
-
-			/*
-			 * Careful: an ereport() that tries to write to the client would
-			 * cause recursion to here, leading to stack overflow and core
-			 * dump!  This message must go *only* to the postmaster log.
-			 *
-			 * If a client disconnects while we're in the midst of output, we
-			 * might write quite a bit of data before we get to a safe query
-			 * abort point.  So, suppress duplicate log messages.
-			 */
-			if (errno != last_reported_send_errno)
-			{
-				last_reported_send_errno = errno;
-				ereport(COMMERROR,
-						(errcode_for_socket_access(),
-						 errmsg("could not send data to client: %m")));
+				}
+
+				/*
+				 * Careful: an ereport() that tries to write to the client
+				 * would cause recursion to here, leading to stack overflow
+				 * and core dump!  This message must go *only* to the
+				 * postmaster log.
+				 *
+				 * If a client disconnects while we're in the midst of output,
+				 * we might write quite a bit of data before we get to a safe
+				 * query abort point.  So, suppress duplicate log messages.
+				 */
+				if (errno != last_reported_send_errno)
+				{
+					last_reported_send_errno = errno;
+					ereport(COMMERROR,
+							(errcode_for_socket_access(),
+							 errmsg("could not send data to client: %m")));
+				}
 			}
 
 			/*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5d32de6..d979191 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -5789,6 +5789,13 @@ read_inheritable_socket(SOCKET *dest, InheritableSocket *src)
 		*dest = s;
 
 		/*
+		 * We didn't inherit emulated blocking mode but port socket should be
+		 * always in nonblocking mode. pg_set_noblock() on win32 backend won't
+		 * return error.
+		 */
+		pg_set_noblock(s);
+
+		/*
 		 * To make sure we don't get two references to the same socket, close
 		 * the original one. (This would happen when inheritance actually
 		 * works..
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 7b5480f..1d252e7 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2840,8 +2840,16 @@ ProcessInterrupts(void)
 		ImmediateInterruptOK = false;	/* not idle anymore */
 		DisableNotifyInterrupt();
 		DisableCatchupInterrupt();
-		/* As in quickdie, don't risk sending to client during auth */
-		if (ClientAuthInProgress && whereToSendOutput == DestRemote)
+		/*
+		 *  As in quickdie, don't risk sending to client during auth. In
+		 *  addition to that, don't try to send any more to client if current
+		 *  connection is marked as ClientConnectionLost. It will lead to
+		 *  protocol violation if the truth is that the connection is living
+		 *  and amid sending data. Such case will occur if this backend was
+		 *  terminated during waiting for query result to be sent.
+		 */
+		if ((ClientAuthInProgress && whereToSendOutput == DestRemote) ||
+			ClientConnectionLost)
 			whereToSendOutput = DestNone;
 		if (IsAutoVacuumWorkerProcess())
 			ereport(FATAL,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to