When amfnd terminates a huge number of components at once (around
800 components), it catches the SIGCHLD signal from the components'
processes in its signal handler and calls write() to notify amfnd's
threads to proceed with the component termination. As a result,
multiple write() calls are observed to block, because the thread
that calls read() is busy in waitpid() even though waitpid() is
called with WNOHANG.

The read() thread is slow because it scans through all pids while a
large number of child processes are being terminated at the same
time.

This patch makes the socketpair non-blocking so that write() can no
longer block. It also uses poll() to wait for input, to avoid hogging
the CPU in the read() thread.
---
 src/base/sysf_exc_scr.c | 51 +++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/base/sysf_exc_scr.c b/src/base/sysf_exc_scr.c
index 378b1eeab..ed4123a8c 100644
--- a/src/base/sysf_exc_scr.c
+++ b/src/base/sysf_exc_scr.c
@@ -33,10 +33,11 @@
 #include "base/sysf_exc_scr.h"
 #include "base/ncssysf_def.h"
 
+#include <poll.h>
 #include <sched.h>
 
 SYSF_EXECUTE_MODULE_CB module_cb;
-
+static struct pollfd fds[1];
 /*****************************************************************************
 
   PROCEDURE        : ncs_exc_mdl_start_timer
@@ -108,8 +109,18 @@ void ncs_exec_module_signal_hdlr(int signal)
 
                /*  printf("\n In  SIGCHLD Handler \n"); */
 
-               if (-1 == write(module_cb.write_fd, (const void *)&info,
+               while (-1 == write(module_cb.write_fd, (const void *)&info,
                                sizeof(EXEC_MOD_INFO))) {
+                       /* Only continue if the error is EINTR which may be
+                        * caused by the signal interrupt, and do not try again
+                        * with EAGAIN and EWOULDBLOCK since that will become
+                        * the reason to cause the threads hanging with
+                        * BLOCKING socketpair
+                        */
+                       if (errno == EINTR)
+                               continue;
+
+                       break;
                        perror("ncs_exec_module_signal_hdlr: write");
                }
        }
@@ -138,9 +149,19 @@ void ncs_exec_module_timer_hdlr(void *uarg)
                              .status = 0,
                              .type = SYSF_EXEC_INFO_TIME_OUT};
 
-       if (-1 == write(module_cb.write_fd, (const void *)&info,
-                       sizeof(EXEC_MOD_INFO))) {
-               perror("ncs_exec_module_timer_hdlr: write");
+               while (-1 == write(module_cb.write_fd, (const void *)&info,
+                               sizeof(EXEC_MOD_INFO))) {
+                       /* Only continue if the error is EINTR which may be
+                        * caused by the signal interrupt, and do not try again
+                        * with EAGAIN and EWOULDBLOCK since that will become
+                        * the reason to cause the threads hanging with
+                        * BLOCKING socketpair
+                        */
+                       if (errno == EINTR)
+                               continue;
+
+                       break;
+                       perror("ncs_exec_module_timer_hdlr: write");
        }
 }
 
@@ -169,8 +190,25 @@ void ncs_exec_mod_hdlr(void)
        SYSF_PID_LIST *exec_pid = NULL;
        int status = -1;
        int pid = -1;
+       int polltmo = -1;
+
+       fds[0].fd = module_cb.read_fd;
+       fds[0].events = POLLIN;
 
        while (1) {
+               int pollretval = poll(fds, 1, polltmo);
+
+               if (pollretval == -1) {
+                       if (errno == EINTR)
+                               continue;
+
+                       LOG_ER("ncs_exec_mod_hdlr: poll FAILED - %s",
+                               strerror(errno));
+                       break;
+               }
+               if ((fds[0].revents & POLLIN) == false)
+                       continue;
+
                while ((ret_val = read(
                            module_cb.read_fd, (((uint8_t *)&info) + count),
                            (maxsize - count))) != (maxsize - count)) {
@@ -178,7 +216,6 @@ void ncs_exec_mod_hdlr(void)
                                if (errno == EBADF)
                                        return;
 
-                               perror("ncs_exec_mod_hdlr: read fail:");
                                continue;
                        }
                        count += ret_val;
@@ -430,7 +467,7 @@ uint32_t start_exec_mod_cb(void)
                return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE);
        }
 
-       if (0 != socketpair(AF_UNIX, SOCK_DGRAM, 0, spair)) {
+       if (0 != socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, spair)) {
                perror("init_exec_mod_cb: socketpair: ");
                return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE);
        }
-- 
2.20.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to