Hi, the current implementation of PostmasterIsAlive() uses a pipe to monitor the existence of the postmaster process. One end of the pipe is held open in the postmaster, while the other end is inherited to all the auxiliary and background processes when they fork. This leads to multiple processes calling select(2), poll(2) and read(2) on the same end of the pipe. While this is technically perfectly ok, it has the unfortunate side effect that it triggers an inefficient behaviour[0] in the select/poll implementation on some operating systems[1]: The kernel can only keep track of one pid per select address and thus has no other choice than to wakeup(9) every process that is waiting on select/poll.
In our case the system had to wakeup ~3000 idle ssh processes every time postgresql did call PostmasterIsAlive. WalReceiver did run trigger with a rate of ~400 calls per second. With the result that the system performs very badly, being mostly busy scheduling idle processs. Attached patch avoids the select contention by using a separate pipe for each auxiliary and background process. Since the postmaster has three different ways to create new processes, the patch got a bit more complicated than I anticipated :) For auxiliary processes, pgstat, pgarch and the autovacuum launcher get a preallocated pipe each. The pipes are held in: postmaster_alive_fds_own[NUM_AUXPROCTYPES]; postmaster_alive_fds_watch[NUM_AUXPROCTYPES]; Just before we fork a new process we set postmaster_alive_fd for each process type: postmaster_alive_fd = postmaster_alive_fds_watch[type]; Since there can be multiple backend processes, BackendStarup() allocates a pipe on-demand and keeps the reference in the Backend structure. And is closed when the backend terminates. The patch was developed and tested under OpenBSD using the REL9_4_STABLE branch. I've merged it to current, compile tested and ran make check on Ubuntu 14.04. Marco [0] http://man.openbsd.org/OpenBSD-5.9/man2/select.2?manpath=OpenBSD-5.9 BUGS [...] "Internally to the kernel, select() and pselect() work poorly if multiple processes wait on the same file descriptor. Given that, it is rather surprising to see that many daemons are written that way." [1] At least OpenBSD and NetBSD are affected, FreeBSD rewrote their select implementation in 8.0.
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1a92ca1..a3b4497 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -369,6 +369,13 @@ StartAutoVacLauncher(void) { pid_t AutoVacPID; +#ifndef WIN32 + /* + * Set the monitoring pipe for PostmasterIsAlive check + */ + postmaster_alive_fd = postmaster_alive_fds_watch[AutoVacLauncherProcess]; +#endif + #ifdef EXEC_BACKEND switch ((AutoVacPID = avlauncher_forkexec())) #else @@ -386,7 +393,7 @@ StartAutoVacLauncher(void) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, true); AutoVacLauncherMain(0, NULL); break; @@ -1447,7 +1454,7 @@ StartAutoVacWorker(void) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); AutoVacWorkerMain(0, NULL); break; diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 1aa6466..3ff3479 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -138,6 +138,13 @@ pgarch_start(void) return 0; last_pgarch_start_time = curtime; +#ifndef WIN32 + /* + * Set the monitoring pipe for PostmasterIsAlive check + */ + postmaster_alive_fd = postmaster_alive_fds_watch[PgArchiverProcess]; +#endif + #ifdef EXEC_BACKEND switch ((pgArchPid = pgarch_forkexec())) #else @@ -155,7 +162,7 @@ pgarch_start(void) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, true); /* Drop our connection to postmaster's shared memory, as well */ dsm_detach_all(); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8a2ce91..600fb14 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -676,6 +676,13 @@ pgstat_start(void) return 0; last_pgstat_start_time = curtime; +#ifndef WIN32 + /* + * Set the monitoring pipe for PostmasterIsAlive check + */ + postmaster_alive_fd = postmaster_alive_fds_watch[PgStatProcess]; +#endif + /* * Okay, fork off the collector. */ @@ -696,7 +703,7 @@ pgstat_start(void) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, true); /* Drop our connection to postmaster's shared memory, as well */ dsm_detach_all(); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index eaf3f61..6eb8f48 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -176,6 +176,7 @@ typedef struct bkend bool dead_end; /* is it going to send an error and quit? */ bool bgworker_notify; /* gets bgworker start/stop notifications */ dlist_node elem; /* list link in BackendList */ + int postmaster_alive_fd_own; } Backend; static dlist_head BackendList = DLIST_STATIC_INIT(BackendList); @@ -416,6 +417,7 @@ static void maybe_start_bgworker(void); static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); static pid_t StartChildProcess(AuxProcType type); static void StartAutovacuumWorker(void); +static void CreatePostmasterDeathWatchPipe(int *ownfd, int *watchfd); static void InitPostmasterDeathWatchHandle(void); /* @@ -507,7 +509,7 @@ typedef struct HANDLE initial_signal_pipe; HANDLE syslogPipe[2]; #else - int postmaster_alive_fds[2]; + int postmaster_alive_fd; int syslogPipe[2]; #endif char my_exec_path[MAXPGPATH]; @@ -542,10 +544,13 @@ static void ShmemBackendArrayRemove(Backend *bn); #ifndef WIN32 /* - * File descriptors for pipe used to monitor if postmaster is alive. - * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN. + * File descriptors for pipes used to monitor if postmaster is alive. + * Every process gets it's own pipe. + * Contains the postmaster end of each pipe. */ -int postmaster_alive_fds[2] = {-1, -1}; +int postmaster_alive_fds_own[NUM_AUXPROCTYPES]; +int postmaster_alive_fds_watch[NUM_AUXPROCTYPES]; +int postmaster_alive_fd = -1; /* set per child */ #else /* Process handle of postmaster used for the same purpose on Windows */ HANDLE PostmasterHandle; @@ -2384,11 +2389,13 @@ ConnFree(Port *conn) * that are not needed by that child process. The postmaster still has * them open, of course. * + * monitor_pm is set for childs that run the PostmasterIsAlive check + * * Note: we pass am_syslogger as a boolean because we don't want to set * the global variable yet when this is called. */ void -ClosePostmasterPorts(bool am_syslogger) +ClosePostmasterPorts(bool am_syslogger, bool monitor_pm) { int i; @@ -2398,12 +2405,32 @@ ClosePostmasterPorts(bool am_syslogger) * Close the write end of postmaster death watch pipe. It's important to * do this as early as possible, so that if postmaster dies, others won't * think that it's still running because we're holding the pipe open. + * + * For child processes that don't monitor the postmaster, all the + * monitoring pipes are closed. */ - if (close(postmaster_alive_fds[POSTMASTER_FD_OWN])) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); - postmaster_alive_fds[POSTMASTER_FD_OWN] = -1; + for (i = 0; i < NUM_AUXPROCTYPES; i++) { + if (monitor_pm) + { + if (postmaster_alive_fds_watch[i] != postmaster_alive_fd) + { + close(postmaster_alive_fds_watch[i]); + postmaster_alive_fds_watch[i] = -1; + } + if (close(postmaster_alive_fds_own[i])) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); + postmaster_alive_fds_own[i] = -1; + } + else + { + close(postmaster_alive_fds_own[i]); + postmaster_alive_fds_own[i] = -1; + close(postmaster_alive_fds_watch[i]); + postmaster_alive_fds_watch[i] = -1; + } + } #endif /* Close the listen sockets */ @@ -3175,6 +3202,7 @@ CleanupBackend(int pid, BackgroundWorkerStopNotifications(bp->pid); } dlist_delete(iter.cur); + close(bp->postmaster_alive_fd_own); free(bp); break; } @@ -3278,6 +3306,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) #endif } dlist_delete(iter.cur); + close(bp->postmaster_alive_fd_own); free(bp); /* Keep looping so we can signal remaining backends */ } @@ -3913,19 +3942,36 @@ BackendStartup(Port *port) /* Hasn't asked to be notified about any bgworkers yet */ bn->bgworker_notify = false; +#ifndef WIN32 + CreatePostmasterDeathWatchPipe( + &bn->postmaster_alive_fd_own, + &postmaster_alive_fd); +#else + bn->postmaster_alive_fd_own = -1; +#endif + #ifdef EXEC_BACKEND pid = backend_forkexec(port); #else /* !EXEC_BACKEND */ pid = fork_process(); if (pid == 0) /* child */ { - free(bn); + dlist_iter iter; /* Detangle from postmaster */ InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); + close(bn->postmaster_alive_fd_own); + + dlist_foreach(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + close(bp->postmaster_alive_fd_own); + } + + free(bn); /* Perform additional initialization and collect startup packet */ BackendInitialize(port); @@ -3942,6 +3988,8 @@ BackendStartup(Port *port) if (!bn->dead_end) (void) ReleasePostmasterChildSlot(bn->child_slot); + close(postmaster_alive_fd); + close(bn->postmaster_alive_fd_own); free(bn); errno = save_errno; ereport(LOG, @@ -3959,6 +4007,8 @@ BackendStartup(Port *port) * Everything's been successful, it's safe to add this backend to our list * of backends. */ + close(postmaster_alive_fd); + postmaster_alive_fd = -1; bn->pid = pid; bn->bkend_type = BACKEND_TYPE_NORMAL; /* Can change later to WALSND */ dlist_push_head(&BackendList, &bn->elem); @@ -4701,7 +4751,7 @@ SubPostmasterMain(int argc, char *argv[]) Assert(argc == 3); /* shouldn't be any more args */ /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* * Need to reinitialize the SSL library in the backend, since the @@ -4752,7 +4802,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkboot") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4768,7 +4818,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkavlauncher") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4784,7 +4834,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkavworker") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4805,7 +4855,7 @@ SubPostmasterMain(int argc, char *argv[]) IsBackgroundWorker = true; /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4825,7 +4875,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkarch") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Do not want to attach to shared memory */ @@ -4834,7 +4884,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkcol") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* Do not want to attach to shared memory */ @@ -4843,7 +4893,7 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forklog") == 0) { /* Close the postmaster's sockets */ - ClosePostmasterPorts(true); + ClosePostmasterPorts(true, false); /* Do not want to attach to shared memory */ @@ -5207,6 +5257,8 @@ StartChildProcess(AuxProcType type) av[ac] = NULL; Assert(ac < lengthof(av)); + postmaster_alive_fd = postmaster_alive_fds_watch[type]; + #ifdef EXEC_BACKEND pid = postmaster_forkexec(ac, av); #else /* !EXEC_BACKEND */ @@ -5217,7 +5269,7 @@ StartChildProcess(AuxProcType type) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, true); /* Release postmaster's working memory context */ MemoryContextSwitchTo(TopMemoryContext); @@ -5275,6 +5327,7 @@ StartChildProcess(AuxProcType type) /* * in parent, successful fork */ + postmaster_alive_fd = -1; return pid; } @@ -5523,7 +5576,7 @@ do_start_bgworker(RegisteredBgWorker *rw) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); + ClosePostmasterPorts(false, false); /* * Before blowing away PostmasterContext, save this bgworker's @@ -5841,8 +5894,7 @@ save_backend_variables(BackendParameters *param, Port *port, childProcess)) return false; #else - memcpy(¶m->postmaster_alive_fds, &postmaster_alive_fds, - sizeof(postmaster_alive_fds)); + param->postmaster_alive_fd = postmaster_alive_fd; #endif memcpy(¶m->syslogPipe, &syslogPipe, sizeof(syslogPipe)); @@ -6070,8 +6122,8 @@ restore_backend_variables(BackendParameters *param, Port *port) PostmasterHandle = param->PostmasterHandle; pgwin32_initial_signal_pipe = param->initial_signal_pipe; #else - memcpy(&postmaster_alive_fds, ¶m->postmaster_alive_fds, - sizeof(postmaster_alive_fds)); + memcpy(&postmaster_alive_fds_watch, ¶m->postmaster_alive_fds_watch, + sizeof(postmaster_alive_fds_watch)); #endif memcpy(&syslogPipe, ¶m->syslogPipe, sizeof(syslogPipe)); @@ -6197,8 +6249,29 @@ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) } #endif /* WIN32 */ +static void +CreatePostmasterDeathWatchPipe(int *ownfd, int *watchfd) +{ + int tmp_pipe[2]; + + if (pipe(tmp_pipe)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + /* + * Set O_NONBLOCK to allow testing for the fd's presence with a read() + * call. + */ + if (fcntl(tmp_pipe[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK)) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + *ownfd = tmp_pipe[POSTMASTER_FD_OWN]; + *watchfd = tmp_pipe[POSTMASTER_FD_WATCH]; +} + /* - * Initialize one and only handle for monitoring postmaster death. + * Initialize one per process handle for monitoring postmaster death. * * Called once in the postmaster, so that child processes can subsequently * monitor if their parent is dead. @@ -6208,9 +6281,12 @@ InitPostmasterDeathWatchHandle(void) { #ifndef WIN32 + int i; + /* - * Create a pipe. Postmaster holds the write end of the pipe open - * (POSTMASTER_FD_OWN), and children hold the read end. Children can pass + * Create a pipe for each Auxiliary Proc. + * Postmaster holds the write end of the pipe open + * (postmaster_alive_fds_own[]), and children hold the read end. Children can pass * the read file descriptor to select() to wake up in case postmaster * dies, or check for postmaster death with a (read() == 0). Children must * close the write end as soon as possible after forking, because EOF @@ -6218,19 +6294,12 @@ InitPostmasterDeathWatchHandle(void) * write fd. That is taken care of in ClosePostmasterPorts(). */ Assert(MyProcPid == PostmasterPid); - if (pipe(postmaster_alive_fds)) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + for (i = 0; i < NUM_AUXPROCTYPES; i++) { + CreatePostmasterDeathWatchPipe( + &postmaster_alive_fds_own[i], + &postmaster_alive_fds_watch[i]); + } - /* - * Set O_NONBLOCK to allow testing for the fd's presence with a read() - * call. - */ - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK)) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); #else /* diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index e7e488a..3e3e81e 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -590,7 +590,7 @@ SysLogger_Start(void) InitPostmasterChild(); /* Close the postmaster's sockets */ - ClosePostmasterPorts(true); + ClosePostmasterPorts(true, false); /* Drop our connection to postmaster's shared memory, as well */ dsm_detach_all(); diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 9def8a1..f5198df 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -652,7 +652,7 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, else if (events == WL_POSTMASTER_DEATH) { #ifndef WIN32 - event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH]; + event->fd = postmaster_alive_fd; #endif } diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c index 7ae622c..c0ef6e5 100644 --- a/src/backend/storage/ipc/pmsignal.c +++ b/src/backend/storage/ipc/pmsignal.c @@ -275,7 +275,7 @@ PostmasterIsAlive(void) char c; ssize_t rc; - rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1); + rc = read(postmaster_alive_fd, &c, 1); if (rc < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 78545da..6cc0364 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -392,6 +392,14 @@ typedef enum CheckpointerProcess, WalWriterProcess, WalReceiverProcess, + /* + * The types below are not auxiliary processes. + * We abuse the identifiers as a reference for + * the per-process postmaster monitoring pipe + */ + PgArchiverProcess, + AutoVacLauncherProcess, + PgStatProcess, NUM_AUXPROCTYPES /* Must be last! */ } AuxProcType; diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index b2d7776..ceeccb3 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -13,6 +13,8 @@ #ifndef _POSTMASTER_H #define _POSTMASTER_H +#include "miscadmin.h" + /* GUC options */ extern bool EnableSSL; extern int ReservedBackends; @@ -33,7 +35,8 @@ extern bool restart_after_crash; #ifdef WIN32 extern HANDLE PostmasterHandle; #else -extern int postmaster_alive_fds[2]; +extern int postmaster_alive_fds_watch[NUM_AUXPROCTYPES]; +extern int postmaster_alive_fd; /* * Constants that represent which of postmaster_alive_fds is held by @@ -47,7 +50,7 @@ extern int postmaster_alive_fds[2]; extern const char *progname; extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); -extern void ClosePostmasterPorts(bool am_syslogger); +extern void ClosePostmasterPorts(bool am_syslogger, bool monitor_pm); extern int MaxLivePostmasterChildren(void);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers