When amfnd terminates a huge number of components at once (around 800 components), it catches the SIGCHLD signal from the components' processes in a signal handler and calls write() to notify amfnd's threads to proceed with the component termination. As a result, multiple blocking write() calls are observed to be blocked, because the thread that calls read() is busy with waitpid() even though waitpid() is used in nohang mode.
The read() thread is slow because it scans through all pids while so many child processes are being terminated at the same time. This patch makes the socketpair non-blocking so that write() does not block. It also uses poll() to avoid hogging the CPU in the read() thread. --- src/base/sysf_exc_scr.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/base/sysf_exc_scr.c b/src/base/sysf_exc_scr.c index 378b1eeab..6348985cb 100644 --- a/src/base/sysf_exc_scr.c +++ b/src/base/sysf_exc_scr.c @@ -33,10 +33,11 @@ #include "base/sysf_exc_scr.h" #include "base/ncssysf_def.h" +#include <poll.h> #include <sched.h> SYSF_EXECUTE_MODULE_CB module_cb; - +static struct pollfd fds[1]; /***************************************************************************** PROCEDURE : ncs_exc_mdl_start_timer @@ -169,8 +170,20 @@ void ncs_exec_mod_hdlr(void) SYSF_PID_LIST *exec_pid = NULL; int status = -1; int pid = -1; + int polltmo = -1; + + fds[0].fd = module_cb.read_fd; + fds[0].events = POLLIN; while (1) { + int pollretval = poll(fds, 1, polltmo); + if (pollretval == -1) { + if (errno == EINTR) continue; + LOG_ER("ncs_exec_mod_hdlr: poll FAILED - %s", + strerror(errno)); + break; + } + if ((fds[0].revents & POLLIN) == false) continue; while ((ret_val = read( module_cb.read_fd, (((uint8_t *)&info) + count), (maxsize - count))) != (maxsize - count)) { @@ -430,7 +443,7 @@ uint32_t start_exec_mod_cb(void) return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE); } - if (0 != socketpair(AF_UNIX, SOCK_DGRAM, 0, spair)) { + if (0 != socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, spair)) { perror("init_exec_mod_cb: socketpair: "); return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE); } -- 2.20.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel