Hi Mathi, see comments inline below. Do you think we can push this with AndersW comments incorporated?
/Thanks HansN On 09/30/2016 11:26 AM, Mathivanan Naickan Palanivelu wrote: > Thanks for the details and I agree. > I was actually enquiring about > - why a new "opensaf_safe_reboot" option in the opensaf_reboot script and > - Why a osaf_safe_reboot() is added in osaf_utility.c instead of using the > opensaf_reboot() function itself. [HansN] after CLM has requested safe_reboot, processing should continue normally, i.e. not stop in e.g. the use_fallback loop in opensaf_reboot, as then AMF probably will start error handling/recovery etc. > > For the first question, I think the obvious guess would be that we want to > use 'shutdown -r' such that rc scripts are invoked when doing the reboot. [HansN] yes, we have to separate between shutdown and reboot -f. > > Thanks, > Mathi. > >> -----Original Message----- >> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com] >> Sent: Friday, September 30, 2016 2:14 PM >> To: Mathivanan Naickan Palanivelu; anders.wid...@ericsson.com >> Cc: opensaf-devel@lists.sourceforge.net >> Subject: Re: [PATCH 1 of 1] clm: add support for cluster reboot V3 [#2053] >> >> Hi Mathi, >> >> ordering a cluster reboots are done at several places, both outside OpenSAF >> and inside OpenSAF e.g. SMF. >> >> There are several different solutions to order a cluster reboot and in many >> cases they don't work well, so it would be >> >> good if OpenSAF could provide one way to perform a cluster reboot. AMF >> may also use this for implementing SA_AMF_CLUSTER_RESET. >> >> One common use case is upgrade with a following cluster reboot. The >> payloads are normally PXE booted and the DHCP >> >> is stopped at the controllers before ordering a cluster reboot so the >> payloads >> will not start until the controllers has rebooted and DHCP is started. >> >> Then each node is rebooted by in sequence do ssh to each node and order a >> reboot. This may take time and if CLM can broadcast this reboot request the >> >> reboot will be considerable faster. So therefor in the first version of the >> cluster reboot support I don't think we need to consider implementing >> phases, this can >> >> be added later, with e.g. a flag in CLM as AndersW suggested. >> >> /Thanks HansN >> >> >> On 09/30/2016 09:43 AM, Mathivanan Naickan Palanivelu wrote: >>> Hi Hans, >>> >>> Could you provide some background on the need for >> opensaf_safe_reboot(). >>> What would be the need for this? >>> >>> Thanks, >>> Mathi. >>> >>> >>>> -----Original Message----- >>>> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com] >>>> Sent: Wednesday, September 28, 2016 5:26 PM >>>> To: anders.wid...@ericsson.com; Mathivanan Naickan Palanivelu >>>> Cc: opensaf-devel@lists.sourceforge.net >>>> Subject: [PATCH 1 of 1] clm: add support for cluster reboot V3 >>>> [#2053] >>>> >>>> osaf/libs/common/clmsv/include/clmsv_msg.h | 6 +++ >>>> osaf/libs/core/common/include/osaf_utility.h | 5 +++ >>>> osaf/libs/core/common/osaf_utility.c | 22 +++++++++++++ >>>> osaf/services/saf/clmsv/clms/clms.h | 3 +- >>>> osaf/services/saf/clmsv/clms/clms_imm.c | 18 ++++++++++ >>>> osaf/services/saf/clmsv/clms/clms_mds.c | 46 >>>> +++++++++++++++++++++++++++- >>>> osaf/services/saf/clmsv/clms/clms_util.c | 12 +++++++ >>>> osaf/services/saf/clmsv/nodeagent/main.c | 12 +++++++ >>>> scripts/opensaf_reboot | 22 ++++++++++--- >>>> 9 files changed, 139 insertions(+), 7 deletions(-) >>>> >>>> >>>> Admin command to request cluster reboot: >>>> immadm -o 1 safCluster=myClmCluster >>>> >>>> diff --git a/osaf/libs/common/clmsv/include/clmsv_msg.h >>>> b/osaf/libs/common/clmsv/include/clmsv_msg.h >>>> --- a/osaf/libs/common/clmsv/include/clmsv_msg.h >>>> +++ b/osaf/libs/common/clmsv/include/clmsv_msg.h >>>> @@ -23,6 +23,7 @@ typedef enum clms_msg_type { >>>> CLMSV_CLMS_TO_CLMA_CBK_MSG, >>>> CLMSV_CLMS_TO_CLMA_API_RESP_MSG, >>>> CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG, >>>> + CLMSV_CLMS_TO_CLMNA_REBOOT_MSG, >>>> CLMSV_MSG_MAX >>>> } CLMSV_MSG_TYPE; >>>> >>>> @@ -174,6 +175,10 @@ typedef struct clmsv_is_member_info_t { >>>> SaUint32T client_id; >>>> }CLMSV_IS_MEMBER_INFO; >>>> >>>> +typedef struct clmsv_reboot_info_t { >>>> + SaClmNodeIdT node_id; >>>> +} CLMSV_REBOOT_INFO; >>>> + >>>> /* Top Level CLMSv MDS message structure for use between CLMS-> >>>> CLMA && CLMA -> CLMS */ typedef struct clmsv_msg_t { >>>> struct clmsv_msg_t *next; /* Mailbox processing */ >>>> @@ -183,6 +188,7 @@ typedef struct clmsv_msg_t { >>>> CLMSV_CBK_INFO cbk_info; /* Callback Messages from CLMS to >> CLA >>>> */ >>>> CLMSV_API_RESP_INFO api_resp_info; /* Response Messages from >>>> CLMS to CLA */ >>>> CLMSV_IS_MEMBER_INFO is_member_info; /*Is node member or >> not >>>> Message from CLMS to CLA*/ >>>> + CLMSV_REBOOT_INFO reboot_info; /* Reboot request from >>>> CLMS to CLMNA */ >>>> } info; >>>> } CLMSV_MSG; >>>> >>>> diff --git a/osaf/libs/core/common/include/osaf_utility.h >>>> b/osaf/libs/core/common/include/osaf_utility.h >>>> --- a/osaf/libs/core/common/include/osaf_utility.h >>>> +++ b/osaf/libs/core/common/include/osaf_utility.h >>>> @@ -24,6 +24,8 @@ >>>> #ifndef OPENSAF_CORE_OSAF_UTILITY_H_ >>>> #define OPENSAF_CORE_OSAF_UTILITY_H_ >>>> >>>> +#define USE_SAFE_REBOOT 1 >>>> + >>>> #include <pthread.h> >>>> >>>> #ifdef __cplusplus >>>> @@ -68,6 +70,9 @@ extern void osaf_abort(long i_cause) #endif >>>> nothrow, noreturn)); >>>> >>>> +extern void osaf_safe_reboot() >>>> + __attribute__ ((nothrow)); >>>> + >>>> static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) { >>>> int result = pthread_mutex_lock(io_mutex); >>>> if (result != 0) osaf_abort(result); diff --git >>>> a/osaf/libs/core/common/osaf_utility.c >>>> b/osaf/libs/core/common/osaf_utility.c >>>> --- a/osaf/libs/core/common/osaf_utility.c >>>> +++ b/osaf/libs/core/common/osaf_utility.c >>>> @@ -16,9 +16,12 @@ >>>> */ >>>> >>>> #include "osaf_utility.h" >>>> +#include "ncssysf_def.h" >>>> +#include "configmake.h" >>>> #include <stdlib.h> >>>> #include <errno.h> >>>> #include <syslog.h> >>>> +#include <stdio.h> >>>> >>>> void osaf_abort(long i_cause) >>>> { >>>> @@ -26,3 +29,22 @@ void osaf_abort(long i_cause) >>>> i_cause, __builtin_return_address(0), errno); >>>> abort(); >>>> } >>>> + >>>> +void osaf_safe_reboot() >>>> +{ >>>> + char str[256]; >>>> + >>>> + snprintf(str, sizeof(str), PKGLIBDIR "/opensaf_reboot %u %s %u", 0, >>>> "not_used", USE_SAFE_REBOOT); >>>> + syslog(LOG_NOTICE, "Reboot ordered using command: %s", str); >>>> + >>>> + int rc = system(str); >>>> + if (rc < 0) { >>>> + syslog(LOG_CRIT, "Node reboot failure: exit code %d", >>>> WEXITSTATUS(rc)); >>>> + } else { >>>> + if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0) { >>>> + syslog(LOG_NOTICE, "Command: %s successfully >>>> executed", str); >>>> + } else { >>>> + syslog(LOG_CRIT, "Command: %s failed with exit >>>> code %d", str, WEXITSTATUS(rc)); >>>> + } >>>> + } >>>> +} >>>> diff --git a/osaf/services/saf/clmsv/clms/clms.h >>>> b/osaf/services/saf/clmsv/clms/clms.h >>>> --- a/osaf/services/saf/clmsv/clms/clms.h >>>> +++ b/osaf/services/saf/clmsv/clms/clms.h >>>> @@ -99,6 +99,7 @@ extern uint32_t clms_mds_msg_send(CLMS_C >>>> MDS_DEST *dest, >>>> MDS_SYNC_SND_CTXT *mds_ctxt, >>>> MDS_SEND_PRIORITY_TYPE prio, NCSMDS_SVC_ID svc_id); >>>> >>>> +extern uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG >>>> *bcast_msg); >>>> extern SaAisErrorT clms_imm_activate(CLMS_CB * cb); extern >>>> uint32_t clms_node_trackresplist_empty(CLMS_CLUSTER_NODE * >> op_node); >>>> extern uint32_t clms_send_cbk_start_sub(CLMS_CB * cb, >>>> CLMS_CLUSTER_NODE * node); @@ -125,5 +126,5 @@ extern void >>>> clms_cb_dump(void); extern uint32_t >> clms_send_is_member_info(CLMS_CB >>>> * cb, SaClmNodeIdT node_id, SaBoolT member, SaBoolT is_configured); >>>> extern void clm_imm_reinit_bg(CLMS_CB * cb); extern void >>>> proc_downs_during_rolechange (void); >>>> - >>>> +extern void clms_cluster_reboot(); >>>> #endif /* ifndef CLMS_H */ >>>> diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c >>>> b/osaf/services/saf/clmsv/clms/clms_imm.c >>>> --- a/osaf/services/saf/clmsv/clms/clms_imm.c >>>> +++ b/osaf/services/saf/clmsv/clms/clms_imm.c >>>> @@ -19,6 +19,7 @@ >>>> >>>> #include "clms.h" >>>> #include "osaf_extended_name.h" >>>> +#include "osaf_utility.h" >>>> >>>> extern struct ImmutilWrapperProfile immutilWrapperProfile; >>>> >>>> @@ -886,6 +887,23 @@ static void clms_imm_admin_op_callback(S >>>> >>>> TRACE_ENTER2("Admin callback for nodename:%s, opId:%llu", >>>> objectName->value, opId); >>>> >>>> + // E.g. immadm -o 1 safCluster=myClmCluster >>>> + if (strncmp(osaf_extended_name_borrow(objectName), >>>> + osaf_extended_name_borrow(&osaf_cluster->name), >>>> + osaf_extended_name_length(objectName)) == 0) { >>>> + if (opId == 1) { >>>> + LOG_WA("Cluster reboot requested. Ordering >>>> cluster reboot"); >>>> + // MDS broadcast/multi cast call is synchronous >>>> + clms_cluster_reboot(); >>>> + sleep(1); >>>> + osaf_safe_reboot(); >>>> + } else { >>>> + LOG_ER("Admin Operation not supported for %s", >>>> osaf_extended_name_borrow(objectName)); >>>> + >>>> immutil_saImmOiAdminOperationResult(immOiHandle, invocation, >>>> SA_AIS_ERR_INVALID_PARAM); >>>> + } >>>> + goto done; >>>> + } >>>> + >>>> /*Lookup by the node_name and get the cluster node for CLM >> Admin >>>> oper */ >>>> nodeop = clms_node_get_by_name(objectName); >>>> if (nodeop == NULL) { >>>> diff --git a/osaf/services/saf/clmsv/clms/clms_mds.c >>>> b/osaf/services/saf/clmsv/clms/clms_mds.c >>>> --- a/osaf/services/saf/clmsv/clms/clms_mds.c >>>> +++ b/osaf/services/saf/clmsv/clms/clms_mds.c >>>> @@ -659,7 +659,17 @@ uint32_t clms_mds_enc(struct ncsmds_call >>>> ncs_enc_claim_space(uba, 4); >>>> total_bytes += 4; >>>> >>>> - if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) { >>>> + if (CLMSV_CLMS_TO_CLMNA_REBOOT_MSG == msg->evt_type) { >>>> + /* encode the reboot msg **/ >>>> + p8 = ncs_enc_reserve_space(uba, 4); >>>> + if (!p8) { >>>> + TRACE("ncs_enc_reserve_space failed"); >>>> + goto err; >>>> + } >>>> + ncs_encode_32bit(&p8, msg->info.reboot_info.node_id); >>>> + ncs_enc_claim_space(uba, 4); >>>> + total_bytes += 4; >>>> + } else if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg- >>>>> evt_type) { >>>> /** encode the API RSP msg subtype **/ >>>> p8 = ncs_enc_reserve_space(uba, 4); >>>> if (!p8) { >>>> @@ -1517,3 +1527,37 @@ uint32_t clms_mds_msg_send(CLMS_CB * cb, >>>> TRACE_LEAVE(); >>>> return rc; >>>> } >>>> + >>>> >> +/********************************************************* >>>> ******************* >>>> + Name : clms_mds_msg_bcast >>>> + >>>> + Description : This routine sends a broadcast message to CLMNA. >>>> + >>>> + Arguments : cb - ptr to the CLMA CB >>>> + bcast_msg - ptr to the CLMSv broadcast message >>>> + >>>> + Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE >>>> + >>>> + Notes : None. >>>> >> +********************************************************* >>>> ************** >>>> +*******/ uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG >>>> *bcast_msg) >>>> +{ >>>> + NCSMDS_INFO snd_mds = {0}; >>>> + uint32_t rc; >>>> + >>>> + snd_mds.i_mds_hdl = cb->mds_hdl; >>>> + snd_mds.i_svc_id = NCSMDS_SVC_ID_CLMS; >>>> + snd_mds.i_op = MDS_SEND; >>>> + snd_mds.info.svc_send.i_msg = (NCSCONTEXT)bcast_msg; >>>> + snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMNA; >>>> + snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH; >>>> + snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_BCAST; >>>> + snd_mds.info.svc_send.info.bcast.i_bcast_scope = >>>> NCSMDS_SCOPE_NONE; >>>> + >>>> + if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) { >>>> + LOG_ER("%s: ncsmds_api MDS_SEND failed %u", >>>> __FUNCTION__ ,rc); >>>> + return rc; >>>> + } >>>> + >>>> + return NCSCC_RC_SUCCESS; >>>> +} >>>> \ No newline at end of file >>>> diff --git a/osaf/services/saf/clmsv/clms/clms_util.c >>>> b/osaf/services/saf/clmsv/clms/clms_util.c >>>> --- a/osaf/services/saf/clmsv/clms/clms_util.c >>>> +++ b/osaf/services/saf/clmsv/clms/clms_util.c >>>> @@ -1200,3 +1200,15 @@ bool ip_matched(uint16_t family1, uint8_ >>>> return true; >>>> } >>>> >>>> +// >>>> +void clms_cluster_reboot() >>>> +{ >>>> + CLMSV_MSG bcast_msg; >>>> + bcast_msg.evt_type = CLMSV_CLMS_TO_CLMNA_REBOOT_MSG; >>>> + bcast_msg.info.reboot_info.node_id = clms_cb->node_id; >>>> + if (clms_mds_msg_bcast(clms_cb, &bcast_msg) == >>>> NCSCC_RC_SUCCESS) { >>>> + LOG_NO("Sending cluster reboot broadcast message >>>> succeeded"); >>>> + } else { >>>> + LOG_ER("Sending cluster reboot broadcast message failed"); >>>> + } >>>> +} >>>> diff --git a/osaf/services/saf/clmsv/nodeagent/main.c >>>> b/osaf/services/saf/clmsv/nodeagent/main.c >>>> --- a/osaf/services/saf/clmsv/nodeagent/main.c >>>> +++ b/osaf/services/saf/clmsv/nodeagent/main.c >>>> @@ -114,6 +114,18 @@ static uint32_t clmna_mds_dec(struct ncs >>>> total_bytes += 4; >>>> >>>> switch (msg->evt_type) { >>>> + case CLMSV_CLMS_TO_CLMNA_REBOOT_MSG: >>>> + { >>>> + p8 = ncs_dec_flatten_space(uba, local_data, 4); >>>> + msg->info.reboot_info.node_id = >>>> ncs_decode_32bit(&p8); >>>> + ncs_dec_skip_space(uba, 4); >>>> + total_bytes += 4; >>>> + // Reboot will be performed by CLMS for this node. >>>> + if (clmna_cb->node_info.node_id != msg- >>>>> info.reboot_info.node_id) { >>>> + osaf_safe_reboot(); >>>> + } >>>> + break; >>>> + } >>>> case CLMSV_CLMS_TO_CLMA_API_RESP_MSG: >>>> { >>>> p8 = ncs_dec_flatten_space(uba, local_data, 8); >>>> diff - >> -git >>>> a/scripts/opensaf_reboot b/scripts/opensaf_reboot >>>> --- a/scripts/opensaf_reboot >>>> +++ b/scripts/opensaf_reboot >>>> @@ -40,10 +40,17 @@ NODE_ID_FILE=$pkglocalstatedir/node_id >>>> >>>> node_id=$1 >>>> ee_name=$2 >>>> +safe_reboot=$3 >>>> >>>> # Run commands through sudo when not superuser test $(id -u) -ne 0 >>>> && icmd=$(which sudo 2> /dev/null) >>>> >>>> +opensaf_safe_reboot() >>>> +{ >>>> + logger -t "opensaf_reboot" "Rebooting local node using shutdown" >>>> + $icmd /sbin/shutdown -r now >>>> +} >>>> + >>>> ## Use stonith for remote fencing >>>> opensaf_reboot_with_remote_fencing() >>>> { >>>> @@ -91,8 +98,12 @@ temp_node_id=`cat "$NODE_ID_FILE"` >>>> temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e >>>> 's:^:0x:'` self_node_id=`printf "%d" $temp_node_id` >>>> >>>> -# A node ID of zero(0) means an order to reboot the local node -if [ >>>> "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then >>>> + >>>> +if [ "$safe_reboot" = 1 ]; then >>>> + opensaf_safe_reboot >>>> +else >>>> + # A node ID of zero(0) means an order to reboot the local node >>>> + if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then >>>> # uncomment the following line if debugging errors that keep >>>> restarting the node >>>> # exit 0 >>>> >>>> @@ -114,8 +125,8 @@ if [ "$self_node_id" = "$node_id" ] || [ >>>> >>>> # Reboot (not shutdown) system WITH file system sync >>>> $icmd /sbin/reboot -f >>>> -else >>>> - if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then >>>> + else >>>> + if [ "$FMS_USE_REMOTE_FENCING" = 1 ]; then >>>> opensaf_reboot_with_remote_fencing >>>> else >>>> if [ ":$ee_name" != ":" ]; then >>>> @@ -133,4 +144,5 @@ else >>>> logger -t "opensaf_reboot" "Rebooting remote >>>> node >> in the >>>> absence of PLM is outside the scope of OpenSAF" >>>> fi >>>> fi >>>> -fi >>>> + fi >>>> +fi >>>> \ No newline at end of file ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel