Re: [devel] [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451]
Hi, I've checked Zoran's suggestion attached to the ticket. I think it handles the safe cluster reboot in the most preferable way compared to the other three patches. /Regards Hans -----Original Message----- From: Hans Nordebäck Sent: den 27 september 2017 13:26 To: Anders Widell; praveen.malv...@oracle.com Cc: opensaf-devel@lists.sourceforge.net; Hans Nordebäck Subject: [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451] --- 00-README.conf | 10 +- src/base/osaf_utility.c | 44 src/base/osaf_utility.h | 5 + src/clm/clmnd/main.c| 1 + src/nid/nodeinit.cc | 2 ++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/00-README.conf b/00-README.conf index f64aa031c..5b06b15d5 100644 --- a/00-README.conf +++ b/00-README.conf @@ -610,4 +610,12 @@ A message will be written if the latency is > 0.1 second, example below shows a messages.1:Sep 12 13:09:26 SC-1 osafimmd[26732]: NO MDS timerfd expired 10 times -If the latency exceeds 4 seconds a sigalrm will be sent and the process will be aborted. \ No newline at end of file +If the latency exceeds 4 seconds a sigalrm will be sent and the process will be aborted. + +If clm adm command for cluster reboot is issued an environment variable +OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC can be set in opensafd script to +specify the time to wait for nodes to be started, except for the active node. +Default is two seconds. A file, "clm_cluster_reboot_in_progress", is +created on each node, except on the active node. This file indicates +that a cluster reboot is in progress and all nodes needs to delay their +start, this to give the active a lead. 
diff --git a/src/base/osaf_utility.c b/src/base/osaf_utility.c index 230cd7e0f..6ee6c3d8f 100644 --- a/src/base/osaf_utility.c +++ b/src/base/osaf_utility.c @@ -23,9 +23,53 @@ #include #include #include + +#include +#include +#include +#include #include "base/ncssysf_def.h" #include "osaf/configmake.h" +void osaf_wait_for_active_to_start(void) +{ + struct stat statbuf; + static char file[NAME_MAX]; + const char *wait_time_str = NULL; + unsigned int wait_time = kDfltClusterRebootWaitTimeSec; + + if ((wait_time_str = getenv("OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC")) != NULL) { + wait_time = strtol(wait_time_str, NULL, 0); + } + snprintf(file, sizeof(file), PKGLOGDIR "/%s", +kClmClusterRebootInProgress); + + if (stat(file, &statbuf) != 0) { + syslog(LOG_NOTICE, "Reboot file %s not found, startup continue...", file); + return; + } + + syslog(LOG_NOTICE, "Cluster reboot in progress, this node will start +in %u second(s)", wait_time); + + sleep(wait_time); + + if (unlink(file) == -1) { + syslog(LOG_ERR, "cannot remove file %s: %s", file, strerror(errno)); + } +} + +void osaf_create_cluster_reboot_in_progress_file(void) +{ + static char file[NAME_MAX]; + snprintf(file, sizeof(file), PKGLOGDIR "/%s", kClmClusterRebootInProgress); + int fd; + + if ((fd = open(file, O_RDWR | O_CREAT, 0644)) < 0) { + syslog(LOG_ERR, "Open %s failed, %s", file, strerror(errno)); + return; + } + close(fd); +} + void osaf_abort(long i_cause) { syslog(LOG_ERR, "osaf_abort(%ld) called from %p with errno=%d", i_cause, diff --git a/src/base/osaf_utility.h b/src/base/osaf_utility.h index b935c5003..f7b5a07b3 100644 --- a/src/base/osaf_utility.h +++ b/src/base/osaf_utility.h @@ -30,6 +30,8 @@ extern "C" { #endif +#define kClmClusterRebootInProgress "clm_cluster_reboot_in_progress" +enum { kDfltClusterRebootWaitTimeSec = 2 }; enum { kOsafUseSafeReboot = 1 }; /** @@ -71,6 +73,9 @@ extern void osaf_abort(long i_cause) __attribute__(( extern void osaf_safe_reboot(void) __attribute__((nothrow)); +extern void 
osaf_wait_for_active_to_start(void); +extern void osaf_create_cluster_reboot_in_progress_file(void); + static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) { int result = pthread_mutex_lock(io_mutex); if (result != 0) osaf_abort(result); diff --git a/src/clm/clmnd/main.c b/src/clm/clmnd/main.c index 3a8479600..2801c218f 100644 --- a/src/clm/clmnd/main.c +++ b/src/clm/clmnd/main.c @@ -122,6 +122,7 @@ static uint32_t clmna_mds_dec(struct ncsmds_callback_info *info) // Reboot will be performed by CLMS for this node. if (clmna_cb->node_info.node_id != msg->info.reboot_info.node_id) { + osaf_create_cluster_reboot_in_progress_file(); osaf_safe_reboot(); } break; diff --git a/src/nid/nodeinit.cc b/src/nid/nodeinit.cc index 9eddd743d..5a4b73cc6 100644 --- a/src/nid/nodeinit.cc +++ b/src/nid/nodeinit.cc @@ -1625,6 +1625,8 @@ int
[devel] [PATCH 1/1] clm: Make the cluster reset admin op safe V3 [#2451]
--- 00-README.conf | 10 +- src/base/osaf_utility.c | 44 src/base/osaf_utility.h | 5 + src/clm/clmnd/main.c| 1 + src/nid/nodeinit.cc | 2 ++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/00-README.conf b/00-README.conf index f64aa031c..5b06b15d5 100644 --- a/00-README.conf +++ b/00-README.conf @@ -610,4 +610,12 @@ A message will be written if the latency is > 0.1 second, example below shows a messages.1:Sep 12 13:09:26 SC-1 osafimmd[26732]: NO MDS timerfd expired 10 times -If the latency exceeds 4 seconds a sigalrm will be sent and the process will be aborted. \ No newline at end of file +If the latency exceeds 4 seconds a sigalrm will be sent and the process will be aborted. + +If clm adm command for cluster reboot is issued an environment variable +OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC can be set in opensafd script to specify +the time to wait for nodes to be started, except for the active node. +Default is two seconds. A file, "clm_cluster_reboot_in_progress", is created +on each node, except on the active node. This file indicates that a cluster +reboot is in progress and all nodes needs to delay their start, this to give +the active a lead. 
diff --git a/src/base/osaf_utility.c b/src/base/osaf_utility.c index 230cd7e0f..6ee6c3d8f 100644 --- a/src/base/osaf_utility.c +++ b/src/base/osaf_utility.c @@ -23,9 +23,53 @@ #include #include #include + +#include +#include +#include +#include #include "base/ncssysf_def.h" #include "osaf/configmake.h" +void osaf_wait_for_active_to_start(void) +{ + struct stat statbuf; + static char file[NAME_MAX]; + const char *wait_time_str = NULL; + unsigned int wait_time = kDfltClusterRebootWaitTimeSec; + + if ((wait_time_str = getenv("OPENSAF_CLUSTER_REBOOT_WAIT_TIME_SEC")) != NULL) { + wait_time = strtol(wait_time_str, NULL, 0); + } + snprintf(file, sizeof(file), PKGLOGDIR "/%s", kClmClusterRebootInProgress); + + if (stat(file, &statbuf) != 0) { + syslog(LOG_NOTICE, "Reboot file %s not found, startup continue...", file); + return; + } + + syslog(LOG_NOTICE, "Cluster reboot in progress, this node will start in %u second(s)", wait_time); + + sleep(wait_time); + + if (unlink(file) == -1) { + syslog(LOG_ERR, "cannot remove file %s: %s", file, strerror(errno)); + } +} + +void osaf_create_cluster_reboot_in_progress_file(void) +{ + static char file[NAME_MAX]; + snprintf(file, sizeof(file), PKGLOGDIR "/%s", kClmClusterRebootInProgress); + int fd; + + if ((fd = open(file, O_RDWR | O_CREAT, 0644)) < 0) { + syslog(LOG_ERR, "Open %s failed, %s", file, strerror(errno)); + return; + } + close(fd); +} + void osaf_abort(long i_cause) { syslog(LOG_ERR, "osaf_abort(%ld) called from %p with errno=%d", i_cause, diff --git a/src/base/osaf_utility.h b/src/base/osaf_utility.h index b935c5003..f7b5a07b3 100644 --- a/src/base/osaf_utility.h +++ b/src/base/osaf_utility.h @@ -30,6 +30,8 @@ extern "C" { #endif +#define kClmClusterRebootInProgress "clm_cluster_reboot_in_progress" +enum { kDfltClusterRebootWaitTimeSec = 2 }; enum { kOsafUseSafeReboot = 1 }; /** @@ -71,6 +73,9 @@ extern void osaf_abort(long i_cause) __attribute__(( extern void osaf_safe_reboot(void) __attribute__((nothrow)); +extern void 
osaf_wait_for_active_to_start(void); +extern void osaf_create_cluster_reboot_in_progress_file(void); + static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) { int result = pthread_mutex_lock(io_mutex); if (result != 0) osaf_abort(result); diff --git a/src/clm/clmnd/main.c b/src/clm/clmnd/main.c index 3a8479600..2801c218f 100644 --- a/src/clm/clmnd/main.c +++ b/src/clm/clmnd/main.c @@ -122,6 +122,7 @@ static uint32_t clmna_mds_dec(struct ncsmds_callback_info *info) // Reboot will be performed by CLMS for this node. if (clmna_cb->node_info.node_id != msg->info.reboot_info.node_id) { + osaf_create_cluster_reboot_in_progress_file(); osaf_safe_reboot(); } break; diff --git a/src/nid/nodeinit.cc b/src/nid/nodeinit.cc index 9eddd743d..5a4b73cc6 100644 --- a/src/nid/nodeinit.cc +++ b/src/nid/nodeinit.cc @@ -1625,6 +1625,8 @@ int main(int argc, char *argv[]) { TRACE_ENTER(); + osaf_wait_for_active_to_start(); + #ifdef RLIMIT_RTPRIO struct rlimit mylimit; mylimit.rlim_max = mylimit.rlim_cur = sched_get_priority_max(SCHED_RR); -- 2.14.1 -- Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot ___ Opensaf-devel mailing list