Hello, sorry for the dazed reply in the previous mail. I made revised patch for this issue.
Attached patches are following, - 0001_Revise_socket_emulation_for_win32_backend.patch Revises socket emulation on win32 backend so that each socket can have its own blocking mode state. - 0002_Allow_backend_termination_during_write_blocking.patch The patch to solve the issue. This patch depends on the 0001_ patch. ========== > > I'm marking this as Waiting on Author in the commitfest app, because: > > 1. the protocol violation needs to be avoided one way or another, and > > 2. the behavior needs to be consistent so that a single > > pg_terminate_backend() is enough to always kill the connection. - Preventing protocol violation. To prevent protocol violation, secure_write sets ClientConnectionLost when SIGTERM detected, then internal_flush() and ProcessInterrupts() follow the instruction. - Single pg_terminate_backend surely kills the backend. secure_raw_write() uses non-blocking socket and a loop of select() with timeout to surely detects received signal(SIGTERM). To avoid frequent switching of blocking mode, the bare socket for Port is put to non-blocking mode from the first in StreamConnection() and blocking mode is controlled only by Port->noblock in secure_raw_read/write(). To make the code mentioned above (Patch 0002) tidy, rewrite the socket emulation code for win32 backends so that each socket can have its own non-blocking state. (patch 0001) Some concern about this patch, - This patch allows the number of non-blocking socket to be below 64 (FD_SETSIZE) on win32 backend but it seems to be sufficient. - This patch introduced redundant socket emulation for win32 backend but win32 bare socket for Port is already nonblocking as described so it donsn't seem to be a serious problem on performance. Addition to it, since I don't know the reason why win32/socket.c provides the blocking-mode socket emulation, I decided to preserve win32/socket.c to have blocking socket emulation. Possibly it can be removed. Any suggestions? regards, -- Kyotaro Horiguchi NTT Open Source Software Center
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 605d891..c92851e 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -795,10 +795,6 @@ pq_set_nonblocking(bool nonblocking) if (MyProcPort->noblock == nonblocking) return; -#ifdef WIN32 - pgwin32_noblock = nonblocking ? 1 : 0; -#else - /* * Use COMMERROR on failure, because ERROR would try to send the error to * the client, which might require changing the mode again, leading to @@ -816,7 +812,7 @@ pq_set_nonblocking(bool nonblocking) ereport(COMMERROR, (errmsg("could not set socket to blocking mode: %m"))); } -#endif + MyProcPort->noblock = nonblocking; } diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c index c981169..f0ff3e7 100644 --- a/src/backend/port/win32/socket.c +++ b/src/backend/port/win32/socket.c @@ -21,11 +21,8 @@ * non-blocking mode in order to be able to deliver signals, we must * specify this in a separate flag if we actually need non-blocking * operation. - * - * This flag changes the behaviour *globally* for all socket operations, - * so it should only be set for very short periods of time. */ -int pgwin32_noblock = 0; +static fd_set nonblockset; #undef socket #undef accept @@ -33,6 +30,7 @@ int pgwin32_noblock = 0; #undef select #undef recv #undef send +#undef closesocket /* * Blocking socket functions implemented so they listen on both @@ -40,6 +38,34 @@ int pgwin32_noblock = 0; */ /* + * Set blocking mode for each socket + */ +void +pgwin32_set_socket_nonblock(SOCKET s, int nonblock) +{ + if (nonblock) + FD_SET(s, &nonblockset); + else + FD_CLR(s, &nonblockset); + + /* + * fd_set cannot have more than FD_SETSIZE entries. It's not likey to come + * close to this limit but if it goes above the limit, non blocking state + * of some existing sockets will be discarded. + */ + if (nonblockset.fd_count >= FD_SETSIZE) + elog(FATAL, "Too many sockets requested to be nonblocking mode."); +} + +void +pgwin32_nonblockset_init() +{ + FD_ZERO(&nonblockset); +} + +#define socket_is_nonblocking(s) FD_ISSET((s), &nonblockset) + +/* * Convert the last socket error code into errno */ static void @@ -256,6 +282,10 @@ pgwin32_socket(int af, int type, int protocol) TranslateSocketError(); return INVALID_SOCKET; } + + /* newly cerated socket should be in blocking mode */ + pgwin32_set_socket_nonblock(s, false); + errno = 0; return s; @@ -334,7 +364,7 @@ pgwin32_recv(SOCKET s, char *buf, int len, int f) return -1; } - if (pgwin32_noblock) + if (socket_is_nonblocking(s)) { /* * No data received, and we are in "emulated non-blocking mode", so @@ -420,7 +450,7 @@ pgwin32_send(SOCKET s, const void *buf, int len, int flags) return -1; } - if (pgwin32_noblock) + if (socket_is_nonblocking(s)) { /* * No data sent, and we are in "emulated non-blocking mode", so @@ -645,6 +675,15 @@ pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, c /* + * Unused entry in nonblockset needs to be removed when closing socket. + */ +int pgwin32_closesocket(SOCKET s) +{ + pgwin32_set_socket_nonblock(s, false); + return closesocket(s); +} + +/* * Return win32 error string, since strerror can't * handle winsock codes */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index c7f41a5..72e0576 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3255,22 +3255,10 @@ PgstatCollectorMain(int argc, char *argv[]) /* * Try to receive and process a message. This will not block, * since the socket is set to non-blocking mode. - * - * XXX On Windows, we have to force pgwin32_recv to cooperate, - * despite the previous use of pg_set_noblock() on the socket. - * This is extremely broken and should be fixed someday. */ -#ifdef WIN32 - pgwin32_noblock = 1; -#endif - len = recv(pgStatSock, (char *) &msg, sizeof(PgStat_Msg), 0); -#ifdef WIN32 - pgwin32_noblock = 0; -#endif - if (len < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b190cf5..5d32de6 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -896,6 +896,10 @@ PostmasterMain(int argc, char *argv[]) */ InitializeMaxBackends(); +#ifdef WIN32 + pgwin32_nonblockset_init(); +#endif + /* * Establish input sockets. */ diff --git a/src/include/port/win32.h b/src/include/port/win32.h index 550c3ec..b0df45e 100644 --- a/src/include/port/win32.h +++ b/src/include/port/win32.h @@ -368,6 +368,7 @@ void pg_queue_signal(int signum); #define select(n, r, w, e, timeout) pgwin32_select(n, r, w, e, timeout) #define recv(s, buf, len, flags) pgwin32_recv(s, buf, len, flags) #define send(s, buf, len, flags) pgwin32_send(s, buf, len, flags) +#define closesocket(s) pgwin32_closesocket(s) SOCKET pgwin32_socket(int af, int type, int protocol); SOCKET pgwin32_accept(SOCKET s, struct sockaddr * addr, int *addrlen); @@ -375,11 +376,12 @@ int pgwin32_connect(SOCKET s, const struct sockaddr * name, int namelen); int pgwin32_select(int nfds, fd_set *readfs, fd_set *writefds, fd_set *exceptfds, const struct timeval * timeout); int pgwin32_recv(SOCKET s, char *buf, int len, int flags); int pgwin32_send(SOCKET s, const void *buf, int len, int flags); +int pgwin32_closesocket(SOCKET s); const char *pgwin32_socket_strerror(int err); int pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout); - -extern int pgwin32_noblock; +void pgwin32_set_socket_nonblock(SOCKET s, int nonblock); +void pgwin32_nonblockset_init(); /* in backend/port/win32/security.c */ extern int pgwin32_is_admin(void); diff --git a/src/port/noblock.c b/src/port/noblock.c index 1da0339..d6cc6a2 100644 --- a/src/port/noblock.c +++ b/src/port/noblock.c @@ -25,9 +25,18 @@ pg_set_noblock(pgsocket sock) #else unsigned long ioctlsocket_ret = 1; +#ifndef FRONTEND + /* + * sockets on non-frontend processes on win32 is wrapped and blocking mode + * is controlled there. See socket.c for the details. + */ + pgwin32_set_socket_nonblock(sock, true); + return 1; +#else /* Returns non-0 on failure, while fcntl() returns -1 on failure */ return (ioctlsocket(sock, FIONBIO, &ioctlsocket_ret) == 0); -#endif +#endif /* FRONTEND */ +#endif /* !WIN32 */ } @@ -41,10 +50,16 @@ pg_set_block(pgsocket sock) if (flags < 0 || fcntl(sock, F_SETFL, (long) (flags & ~O_NONBLOCK))) return false; return true; -#else +#else /* !WIN32 */ unsigned long ioctlsocket_ret = 0; +#ifndef FRONTEND + /* See pg_set_noblock */ + pgwin32_set_socket_nonblock(sock, false); + return 1; +#else /* Returns non-0 on failure, while fcntl() returns -1 on failure */ return (ioctlsocket(sock, FIONBIO, &ioctlsocket_ret) == 0); -#endif +#endif /* FRONTEND */ +#endif /* !WIN32 */ }
diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index 41ec1ad..fbb4c47 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -34,7 +34,7 @@ #include "libpq/libpq.h" #include "tcop/tcopprot.h" #include "utils/memutils.h" - +#include "miscadmin.h" char *ssl_cert_file; char *ssl_key_file; @@ -140,6 +140,10 @@ secure_read(Port *port, void *ptr, size_t len) return n; } +/* + * Read data from socket. + * This emulates blocking behavior using non-blocking sockets. + */ ssize_t secure_raw_read(Port *port, void *ptr, size_t len) { @@ -147,8 +151,34 @@ secure_raw_read(Port *port, void *ptr, size_t len) prepare_for_client_read(); - n = recv(port->sock, ptr, len, 0); + if (port->noblock) + n = recv(port->sock, ptr, len, 0); + else + { + do + { + fd_set rfds; + + FD_ZERO(&rfds); + FD_SET(port->sock, &rfds); + /* + * In contrast to secure_raw_write, this section runs with + * ImmediateInterruptOK = true so we can wait forever in + * select. + */ + n = select(port->sock + 1, &rfds, NULL, NULL, NULL); + if (n < 0) break; + + n = recv(port->sock, ptr, len, 0); + + /* + * We should have something to read here so EAGAIN/EWOULDBLOCK is + * likey not to be seen. But we check them here not to return + * these error numbers for blocking sockets for the caller. + */ + } while (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + } client_read_ended(); return n; @@ -178,5 +208,77 @@ secure_write(Port *port, void *ptr, size_t len) ssize_t secure_raw_write(Port *port, const void *ptr, size_t len) { - return send(port->sock, ptr, len, 0); + int ret = 0; + + /* + * Port socket is always in non-blocking mode. See StreamConnection for + * the details. + */ + ret = send(port->sock, ptr, len, 0); + + /* We can return here regardless of blocking mode in the most cases */ + if (port->noblock || ret > 0 || len == 0) + return ret; + + /* Here, we shold block waiting for the room in send buffer. */ + while(ret < 1 && !ProcDiePending) + { + fd_set wfds; + struct timeval tv; + int i = 0; + + FD_ZERO(&wfds); + tv.tv_usec = 0; + + /* + * We may get terminate signal (SIGTERM) during write blocking. If we + * check ProcDiePending then wait by select indefinitely, SIGTERM + * comes after the check and before the select will be pending and we + * should wait the second SIGTERM. So we periodically wake up to check + * ProcDiePending in order to catch the signal surely. The timeout + * for the select is the maximum delay of handling the signal. 1 + * seconds groundlessly seems to be appropreate. + */ + do + { + FD_SET(port->sock, &wfds); + tv.tv_sec = 1; + tv.tv_usec = 0; + + ret = select(port->sock + 1, NULL, &wfds, NULL, &tv); + } while (!ProcDiePending && ret == 0); + + if (ProcDiePending || ret < 0) + break; + + ret = send(port->sock, ptr, len, 0); + if (ProcDiePending) + break; + if (ret < 0) + { + if (errno != EAGAIN && errno != EWOULDBLOCK) + break; + + /* + * This loop might run a busy loop if send(2) returned EAGAIN or + * EWOULDBLOCK after select(2) returned normally. Sleep expressly + * to avoid the busy loop. + */ + pg_usleep(200000L); /* 200 ms */ + ret = 0; + } + } + + if (ProcDiePending) + { + /* + * Allow to terminate this backend. ClientConnectionLost prevents any + * more bytes including error messages from being sent to + * client. errno is set in order to teach ssl layer not to retry. + */ + ClientConnectionLost = 1; + errno = ECONNRESET; + } + + return ret; } diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index c92851e..8387d6a 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -718,6 +718,17 @@ StreamConnection(pgsocket server_fd, Port *port) (void) pq_setkeepalivescount(tcp_keepalives_count, port); } + /* + * Put this socket to non-blocking mode. Blocking behavior is emulated in + * secure_write() and secure_read(). + * Use COMMERROR on failure, because ERROR would try to send the error to + * the client, which might require changing the mode again, leading to + * infinite recursion. + */ + if (!pg_set_noblock(port->sock)) + ereport(COMMERROR, + (errmsg("could not set socket to nonblocking mode: %m"))); + return STATUS_OK; } @@ -792,27 +803,6 @@ TouchSocketFiles(void) static void pq_set_nonblocking(bool nonblocking) { - if (MyProcPort->noblock == nonblocking) - return; - - /* - * Use COMMERROR on failure, because ERROR would try to send the error to - * the client, which might require changing the mode again, leading to - * infinite recursion. - */ - if (nonblocking) - { - if (!pg_set_noblock(MyProcPort->sock)) - ereport(COMMERROR, - (errmsg("could not set socket to nonblocking mode: %m"))); - } - else - { - if (!pg_set_block(MyProcPort->sock)) - ereport(COMMERROR, - (errmsg("could not set socket to blocking mode: %m"))); - } - MyProcPort->noblock = nonblocking; } @@ -1249,34 +1239,38 @@ internal_flush(void) if (r <= 0) { - if (errno == EINTR) - continue; /* Ok if we were interrupted */ - - /* - * Ok if no data writable without blocking, and the socket is in - * non-blocking mode. - */ - if (errno == EAGAIN || - errno == EWOULDBLOCK) + if (!ClientConnectionLost) { + if (errno == EINTR) + continue; /* Ok if we were interrupted */ + + /* + * Ok if no data writable without blocking, and the socket is in + * non-blocking mode. + */ + if (errno == EAGAIN || + errno == EWOULDBLOCK) + { return 0; - } - - /* - * Careful: an ereport() that tries to write to the client would - * cause recursion to here, leading to stack overflow and core - * dump! This message must go *only* to the postmaster log. - * - * If a client disconnects while we're in the midst of output, we - * might write quite a bit of data before we get to a safe query - * abort point. So, suppress duplicate log messages. - */ - if (errno != last_reported_send_errno) - { - last_reported_send_errno = errno; - ereport(COMMERROR, - (errcode_for_socket_access(), - errmsg("could not send data to client: %m"))); + } + + /* + * Careful: an ereport() that tries to write to the client + * would cause recursion to here, leading to stack overflow + * and core dump! This message must go *only* to the + * postmaster log. + * + * If a client disconnects while we're in the midst of output, + * we might write quite a bit of data before we get to a safe + * query abort point. So, suppress duplicate log messages. + */ + if (errno != last_reported_send_errno) + { + last_reported_send_errno = errno; + ereport(COMMERROR, + (errcode_for_socket_access(), + errmsg("could not send data to client: %m"))); + } } /* diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5d32de6..d979191 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -5789,6 +5789,13 @@ read_inheritable_socket(SOCKET *dest, InheritableSocket *src) *dest = s; /* + * We didn't inherit emulated blocking mode but port socket should be + * always in nonblocking mode. pg_set_noblock() on win32 backend won't + * return error. + */ + pg_set_noblock(s); + + /* * To make sure we don't get two references to the same socket, close * the original one. (This would happen when inheritance actually * works.. diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7b5480f..1d252e7 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2840,8 +2840,16 @@ ProcessInterrupts(void) ImmediateInterruptOK = false; /* not idle anymore */ DisableNotifyInterrupt(); DisableCatchupInterrupt(); - /* As in quickdie, don't risk sending to client during auth */ - if (ClientAuthInProgress && whereToSendOutput == DestRemote) + /* + * As in quickdie, don't risk sending to client during auth. In + * addition to that, don't try to send any more to client if current + * connection is marked as ClientConnectionLost. It will lead to + * protocol violation if the truth is that the connection is living + * and amid sending data. Such case will occur if this backend was + * terminated during waiting for query result to be sent. + */ + if ((ClientAuthInProgress && whereToSendOutput == DestRemote) || + ClientConnectionLost) whereToSendOutput = DestNone; if (IsAutoVacuumWorkerProcess()) ereport(FATAL,
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers