Yes, I'll update the 00-README.conf before pushing.

Praveen, is it ok to push this patch? We have run large clusters with the OSAF_MDS_WATCHDOG active for amfd and immd

and it shows that both services perform well regarding receive throughput and latency. So e.g. the earlier improvements related to

receive logic and doing MDS logging in a separate process have improved the performance in large clusters considerably.

/Regards Hans



On 09/06/2017 02:18 PM, Anders Widell wrote:
Ack. Maybe mention OSAF_MDS_WATCHDOG in 00-README.conf?

regards,

Anders Widell


On 09/05/2017 11:55 AM, Hans Nordeback wrote:
---
  src/mds/mds_dt_tipc.c | 35 +++++++++++++++++++++++++++++++++--
  1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 37745e7f0..94310ba59 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -40,6 +40,10 @@
  #include <sys/types.h>
  #include <unistd.h>
  #include <fcntl.h>
+
+#include <signal.h>
+#include <sys/timerfd.h>
+
  #include "mds_dt_tipc.h"
  #include "mds_dt_tcp_disc.h"
  #include "mds_core.h"
@@ -708,6 +712,10 @@ ssize_t recvfrom_connectionless(int sd, void *buf, size_t nbytes, int flags,
      }
  }
  +static void osaf_sigalrm_handler(int signo) {
+    raise(SIGABRT);
+}
+
  /*********************************************************
      Function NAME: mdtm_process_recv_events
@@ -722,7 +730,7 @@ ssize_t recvfrom_connectionless(int sd, void *buf, size_t nbytes, int flags,
   *********************************************************/
  static uint32_t mdtm_process_recv_events(void)
  {
-    enum { FD_DSOCK = 0, FD_BSRSOCK, FD_TMRFD, NUM_FDS };
+    enum { FD_DSOCK = 0, FD_BSRSOCK, FD_TMRFD, FD_TMRWD, NUM_FDS };
        /*
         STEP 1: Poll on the BSRsock and Dsock to get the events
@@ -730,6 +738,20 @@ static uint32_t mdtm_process_recv_events(void)
         if discovery events are received , process the discovery events
       */
  +    int timerfd = -1;
+    if (getenv("OSAF_MDS_WATCHDOG") != NULL) {
+        signal(SIGALRM, osaf_sigalrm_handler);
+ timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
+        struct itimerspec spec;
+        spec.it_interval.tv_sec = 0;
+        spec.it_interval.tv_nsec = 100000000;
+        spec.it_value.tv_sec = 0;
+        spec.it_value.tv_nsec = 100000000;
+        timerfd_settime(timerfd, 0, &spec, NULL);
+        alarm(4);
+        LOG_NO("Started MDS watchdog");
+    }
+
      while (true) {
          int pollres;
  @@ -743,9 +765,11 @@ static uint32_t mdtm_process_recv_events(void)
          pfd[FD_BSRSOCK].events = POLLIN;
          pfd[FD_TMRFD].fd = tipc_cb.tmr_fd;
          pfd[FD_TMRFD].events = POLLIN;
+        pfd[FD_TMRWD].fd = timerfd;
+        pfd[FD_TMRWD].events = POLLIN;
            pfd[FD_DSOCK].revents = pfd[FD_BSRSOCK].revents =
-            pfd[FD_TMRFD].revents = 0;
+            pfd[FD_TMRFD].revents = pfd[FD_TMRWD].revents = 0;
            pollres = poll(pfd, NUM_FDS, MDTM_TIPC_POLL_TIMEOUT);
  @@ -1099,6 +1123,13 @@ static uint32_t mdtm_process_recv_events(void)
                                    */
                  }
              }
+
+            if (pfd[FD_TMRWD].revents == POLLIN) {
+                alarm(4);
+                uint64_t expirations = 0;
+                if (read(timerfd, &expirations, 8) != 8) LOG_ER("MDS error reading timerfd value");
+                if (expirations != 1) LOG_NO("MDS timerfd expired %" PRIu64 " times", expirations);
+            }
osaf_mutex_unlock_ordie(&gl_mds_library_mutex);
          } /* if pollres */
      }     /* while */



------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to