Re: [devel] [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451]

2017-09-29 Thread Hans Nordebäck
Hi,

I've checked Zoran's suggestion attached to the ticket. I think it handles
safe cluster reboot better than the other three patches.
/Regards Hans

-Original Message-
From: Hans Nordebäck 
Sent: den 27 september 2017 13:26
To: Anders Widell ; praveen.malv...@oracle.com
Cc: opensaf-devel@lists.sourceforge.net; Hans Nordebäck 

Subject: [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451]

---
 00-README.conf  | 10 +-
 src/base/osaf_utility.c | 44 
 src/base/osaf_utility.h |  5 +
 src/clm/clmnd/main.c|  1 +
 src/nid/nodeinit.cc |  2 ++
 5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/00-README.conf b/00-README.conf index f64aa031c..5b06b15d5 100644
--- a/00-README.conf
+++ b/00-README.conf
@@ -610,4 +610,12 @@ A message will be written if the latency is > 0.1 second, 
example below shows a
 
 messages.1:Sep 12 13:09:26 SC-1 osafimmd[26732]: NO MDS timerfd expired 10 
times
 
-If the latency exceeds 4 seconds a sigalrm will be sent and the process will 
be aborted.
\ No newline at end of file
+If the latency exceeds 4 seconds a sigalrm will be sent and the process will 
be aborted.
+
+If the clm admin command for cluster reboot is issued, the environment variable
+OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC can be set in the opensafd script to
+specify how long the nodes, except for the active node, wait before starting.
+The default is two seconds. A file, "clm_cluster_reboot_in_progress", is
+created on each node, except on the active node. This file indicates
+that a cluster reboot is in progress and that those nodes need to delay their
+start, in order to give the active node a lead.
diff --git a/src/base/osaf_utility.c b/src/base/osaf_utility.c index 
230cd7e0f..6ee6c3d8f 100644
--- a/src/base/osaf_utility.c
+++ b/src/base/osaf_utility.c
@@ -23,9 +23,53 @@
 #include 
 #include 
 #include 
+
+#include 
+#include 
+#include 
+#include 
 #include "base/ncssysf_def.h"
 #include "osaf/configmake.h"
 
+void osaf_wait_for_active_to_start(void)
+{
+   struct stat statbuf;
+   static char file[NAME_MAX];
+   const char *wait_time_str = NULL;
+   unsigned int wait_time = kDfltClusterRebootWaitTimeSec;
+
+   if ((wait_time_str = getenv("OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC")) != 
NULL) {
+   wait_time = strtol(wait_time_str, NULL, 0);
+   }
+   snprintf(file, sizeof(file), PKGLOGDIR "/%s", 
+kClmClusterRebootInProgress);
+
+   if (stat(file, &statbuf) != 0) {
+   syslog(LOG_NOTICE, "Reboot file %s not found, startup 
continue...", file);
+   return;
+   }
+
+   syslog(LOG_NOTICE, "Cluster reboot in progress, this node will start 
+in %u second(s)", wait_time);
+
+   sleep(wait_time);
+
+   if (unlink(file) == -1) {
+   syslog(LOG_ERR, "cannot remove file %s: %s", file, 
strerror(errno));
+   }
+}
+
+void osaf_create_cluster_reboot_in_progress_file(void)
+{
+   static char file[NAME_MAX];
+   snprintf(file, sizeof(file), PKGLOGDIR "/%s", 
kClmClusterRebootInProgress);
+   int fd;
+
+   if ((fd = open(file, O_RDWR | O_CREAT, 0644)) < 0) {
+   syslog(LOG_ERR, "Open %s failed, %s", file, strerror(errno));
+   return;
+   }
+   close(fd);
+}
+
 void osaf_abort(long i_cause)
 {
syslog(LOG_ERR, "osaf_abort(%ld) called from %p with errno=%d", 
i_cause, diff --git a/src/base/osaf_utility.h b/src/base/osaf_utility.h index 
b935c5003..f7b5a07b3 100644
--- a/src/base/osaf_utility.h
+++ b/src/base/osaf_utility.h
@@ -30,6 +30,8 @@
 extern "C" {
 #endif
 
+#define kClmClusterRebootInProgress "clm_cluster_reboot_in_progress"
+enum { kDfltClusterRebootWaitTimeSec = 2 };
 enum { kOsafUseSafeReboot = 1 };
 
 /**
@@ -71,6 +73,9 @@ extern void osaf_abort(long i_cause) __attribute__((
 
 extern void osaf_safe_reboot(void) __attribute__((nothrow));
 
+extern void osaf_wait_for_active_to_start(void);
+extern void osaf_create_cluster_reboot_in_progress_file(void);
+
 static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) {
   int result = pthread_mutex_lock(io_mutex);
   if (result != 0) osaf_abort(result);
diff --git a/src/clm/clmnd/main.c b/src/clm/clmnd/main.c index 
3a8479600..2801c218f 100644
--- a/src/clm/clmnd/main.c
+++ b/src/clm/clmnd/main.c
@@ -122,6 +122,7 @@ static uint32_t clmna_mds_dec(struct ncsmds_callback_info 
*info)
// Reboot will be performed by CLMS for this node.
if (clmna_cb->node_info.node_id !=
msg->info.reboot_info.node_id) {
+   osaf_create_cluster_reboot_in_progress_file();
osaf_safe_reboot();
}
break;
diff --git a/src/nid/nodeinit.cc b/src/nid/nodeinit.cc index 
9eddd743d..5a4b73cc6 100644
--- a/src/nid/nodeinit.cc
+++ b/src/nid/nodeinit.cc
@@ -1625,6 +1625,8 @@ int 

[devel] [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451]

2017-09-27 Thread Hans Nordeback
---
 00-README.conf  | 10 +-
 src/base/osaf_utility.c | 44 
 src/base/osaf_utility.h |  5 +
 src/clm/clmnd/main.c|  1 +
 src/nid/nodeinit.cc |  2 ++
 5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/00-README.conf b/00-README.conf
index f64aa031c..5b06b15d5 100644
--- a/00-README.conf
+++ b/00-README.conf
@@ -610,4 +610,12 @@ A message will be written if the latency is > 0.1 second, 
example below shows a
 
 messages.1:Sep 12 13:09:26 SC-1 osafimmd[26732]: NO MDS timerfd expired 10 
times
 
-If the latency exceeds 4 seconds a sigalrm will be sent and the process will 
be aborted.
\ No newline at end of file
+If the latency exceeds 4 seconds a sigalrm will be sent and the process will 
be aborted.
+
+If the clm admin command for cluster reboot is issued, the environment variable
+OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC can be set in the opensafd script to
+specify how long the nodes, except for the active node, wait before starting.
+The default is two seconds. A file, "clm_cluster_reboot_in_progress", is
+created on each node, except on the active node. This file indicates
+that a cluster reboot is in progress and that those nodes need to delay their
+start, in order to give the active node a lead.
diff --git a/src/base/osaf_utility.c b/src/base/osaf_utility.c
index 230cd7e0f..6ee6c3d8f 100644
--- a/src/base/osaf_utility.c
+++ b/src/base/osaf_utility.c
@@ -23,9 +23,53 @@
 #include 
 #include 
 #include 
+
+#include 
+#include 
+#include 
+#include 
 #include "base/ncssysf_def.h"
 #include "osaf/configmake.h"
 
+void osaf_wait_for_active_to_start(void)
+{
+   struct stat statbuf;
+   static char file[NAME_MAX];
+   const char *wait_time_str = NULL;
+   unsigned int wait_time = kDfltClusterRebootWaitTimeSec;
+
+   if ((wait_time_str = getenv("OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC")) != 
NULL) {
+   wait_time = strtol(wait_time_str, NULL, 0);
+   }
+   snprintf(file, sizeof(file), PKGLOGDIR "/%s", 
kClmClusterRebootInProgress);
+
+   if (stat(file, &statbuf) != 0) {
+   syslog(LOG_NOTICE, "Reboot file %s not found, startup 
continue...", file);
+   return;
+   }
+
+   syslog(LOG_NOTICE, "Cluster reboot in progress, this node will start in 
%u second(s)", wait_time);
+
+   sleep(wait_time);
+
+   if (unlink(file) == -1) {
+   syslog(LOG_ERR, "cannot remove file %s: %s", file, 
strerror(errno));
+   }
+}
+
+void osaf_create_cluster_reboot_in_progress_file(void)
+{
+   static char file[NAME_MAX];
+   snprintf(file, sizeof(file), PKGLOGDIR "/%s", 
kClmClusterRebootInProgress);
+   int fd;
+
+   if ((fd = open(file, O_RDWR | O_CREAT, 0644)) < 0) {
+   syslog(LOG_ERR, "Open %s failed, %s", file, strerror(errno));
+   return;
+   }
+   close(fd);
+}
+
 void osaf_abort(long i_cause)
 {
syslog(LOG_ERR, "osaf_abort(%ld) called from %p with errno=%d", i_cause,
diff --git a/src/base/osaf_utility.h b/src/base/osaf_utility.h
index b935c5003..f7b5a07b3 100644
--- a/src/base/osaf_utility.h
+++ b/src/base/osaf_utility.h
@@ -30,6 +30,8 @@
 extern "C" {
 #endif
 
+#define kClmClusterRebootInProgress "clm_cluster_reboot_in_progress"
+enum { kDfltClusterRebootWaitTimeSec = 2 };
 enum { kOsafUseSafeReboot = 1 };
 
 /**
@@ -71,6 +73,9 @@ extern void osaf_abort(long i_cause) __attribute__((
 
 extern void osaf_safe_reboot(void) __attribute__((nothrow));
 
+extern void osaf_wait_for_active_to_start(void);
+extern void osaf_create_cluster_reboot_in_progress_file(void);
+
 static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) {
   int result = pthread_mutex_lock(io_mutex);
   if (result != 0) osaf_abort(result);
diff --git a/src/clm/clmnd/main.c b/src/clm/clmnd/main.c
index 3a8479600..2801c218f 100644
--- a/src/clm/clmnd/main.c
+++ b/src/clm/clmnd/main.c
@@ -122,6 +122,7 @@ static uint32_t clmna_mds_dec(struct ncsmds_callback_info 
*info)
// Reboot will be performed by CLMS for this node.
if (clmna_cb->node_info.node_id !=
msg->info.reboot_info.node_id) {
+   osaf_create_cluster_reboot_in_progress_file();
osaf_safe_reboot();
}
break;
diff --git a/src/nid/nodeinit.cc b/src/nid/nodeinit.cc
index 9eddd743d..5a4b73cc6 100644
--- a/src/nid/nodeinit.cc
+++ b/src/nid/nodeinit.cc
@@ -1625,6 +1625,8 @@ int main(int argc, char *argv[]) {
 
   TRACE_ENTER();
 
+  osaf_wait_for_active_to_start();
+
 #ifdef RLIMIT_RTPRIO
   struct rlimit mylimit;
   mylimit.rlim_max = mylimit.rlim_cur = sched_get_priority_max(SCHED_RR);
-- 
2.14.1


--
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
___
Opensaf-devel mailing list