Hi Mathi,

cluster reboots are ordered in several places, both outside OpenSAF and
inside OpenSAF, e.g. in SMF.

There are several different solutions for ordering a cluster reboot, and
in many cases they don't work well, so it would be good if OpenSAF could
provide one common way to perform a cluster reboot. AMF may also use
this for implementing SA_AMF_CLUSTER_RESET.

One common use case is an upgrade followed by a cluster reboot. The
payloads are normally PXE booted, and DHCP is stopped on the controllers
before the cluster reboot is ordered, so that the payloads will not start
until the controllers have rebooted and DHCP has been started again.

Then each node is rebooted by connecting to the nodes in sequence with
ssh and ordering a reboot on each one. This may take time, and if CLM can
broadcast the reboot request the reboot will be considerably faster.
Therefore, in the first version of the cluster reboot support I don't
think we need to consider implementing phases; this can be added later,
with e.g. a flag in CLM as AndersW suggested.

/Thanks HansN


On 09/30/2016 09:43 AM, Mathivanan Naickan Palanivelu wrote:
> Hi Hans,
>
> Could you provide some background on the need for opensaf_safe_reboot().
> What would be the need for this?
>
> Thanks,
> Mathi.
>
>
>> -----Original Message-----
>> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com]
>> Sent: Wednesday, September 28, 2016 5:26 PM
>> To: anders.wid...@ericsson.com; Mathivanan Naickan Palanivelu
>> Cc: opensaf-devel@lists.sourceforge.net
>> Subject: [PATCH 1 of 1] clm: add support for cluster reboot V3 [#2053]
>>
>>   osaf/libs/common/clmsv/include/clmsv_msg.h   |   6 +++
>>   osaf/libs/core/common/include/osaf_utility.h |   5 +++
>>   osaf/libs/core/common/osaf_utility.c         |  22 +++++++++++++
>>   osaf/services/saf/clmsv/clms/clms.h          |   3 +-
>>   osaf/services/saf/clmsv/clms/clms_imm.c      |  18 ++++++++++
>>   osaf/services/saf/clmsv/clms/clms_mds.c      |  46
>> +++++++++++++++++++++++++++-
>>   osaf/services/saf/clmsv/clms/clms_util.c     |  12 +++++++
>>   osaf/services/saf/clmsv/nodeagent/main.c     |  12 +++++++
>>   scripts/opensaf_reboot                       |  22 ++++++++++---
>>   9 files changed, 139 insertions(+), 7 deletions(-)
>>
>>
>> Admin command to request cluster reboot:
>> immadm -o 1 safCluster=myClmCluster
>>
>> diff --git a/osaf/libs/common/clmsv/include/clmsv_msg.h
>> b/osaf/libs/common/clmsv/include/clmsv_msg.h
>> --- a/osaf/libs/common/clmsv/include/clmsv_msg.h
>> +++ b/osaf/libs/common/clmsv/include/clmsv_msg.h
>> @@ -23,6 +23,7 @@ typedef enum clms_msg_type {
>>     CLMSV_CLMS_TO_CLMA_CBK_MSG,
>>     CLMSV_CLMS_TO_CLMA_API_RESP_MSG,
>>     CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG,
>> +  CLMSV_CLMS_TO_CLMNA_REBOOT_MSG,
>>     CLMSV_MSG_MAX
>>   } CLMSV_MSG_TYPE;
>>
>> @@ -174,6 +175,10 @@ typedef struct clmsv_is_member_info_t {
>>     SaUint32T client_id;
>>   }CLMSV_IS_MEMBER_INFO;
>>
>> +typedef struct clmsv_reboot_info_t {
>> +  SaClmNodeIdT node_id;
>> +} CLMSV_REBOOT_INFO;
>> +
>>   /* Top Level CLMSv MDS message structure for use between CLMS-> CLMA
>> && CLMA -> CLMS */  typedef struct clmsv_msg_t {
>>     struct clmsv_msg_t *next;       /* Mailbox processing */
>> @@ -183,6 +188,7 @@ typedef struct clmsv_msg_t {
>>       CLMSV_CBK_INFO cbk_info;        /* Callback Messages from CLMS to CLA
>> */
>>       CLMSV_API_RESP_INFO api_resp_info;      /* Response Messages from
>> CLMS to CLA */
>>       CLMSV_IS_MEMBER_INFO is_member_info;    /*Is node member or not
>> Message from CLMS to CLA*/
>> +    CLMSV_REBOOT_INFO reboot_info;  /* Reboot request from
>> CLMS to CLMNA */
>>     } info;
>>   } CLMSV_MSG;
>>
>> diff --git a/osaf/libs/core/common/include/osaf_utility.h
>> b/osaf/libs/core/common/include/osaf_utility.h
>> --- a/osaf/libs/core/common/include/osaf_utility.h
>> +++ b/osaf/libs/core/common/include/osaf_utility.h
>> @@ -24,6 +24,8 @@
>>   #ifndef OPENSAF_CORE_OSAF_UTILITY_H_
>>   #define OPENSAF_CORE_OSAF_UTILITY_H_
>>
>> +#define USE_SAFE_REBOOT 1
>> +
>>   #include <pthread.h>
>>
>>   #ifdef  __cplusplus
>> @@ -68,6 +70,9 @@ extern void osaf_abort(long i_cause)  #endif
>>           nothrow, noreturn));
>>
>> +extern void osaf_safe_reboot()
>> +    __attribute__ ((nothrow));
>> +
>>   static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) {
>>     int result = pthread_mutex_lock(io_mutex);
>>     if (result != 0) osaf_abort(result);
>> diff --git a/osaf/libs/core/common/osaf_utility.c
>> b/osaf/libs/core/common/osaf_utility.c
>> --- a/osaf/libs/core/common/osaf_utility.c
>> +++ b/osaf/libs/core/common/osaf_utility.c
>> @@ -16,9 +16,12 @@
>>    */
>>
>>   #include "osaf_utility.h"
>> +#include "ncssysf_def.h"
>> +#include "configmake.h"
>>   #include <stdlib.h>
>>   #include <errno.h>
>>   #include <syslog.h>
>> +#include <stdio.h>
>>
>>   void osaf_abort(long i_cause)
>>   {
>> @@ -26,3 +29,22 @@ void osaf_abort(long i_cause)
>>              i_cause, __builtin_return_address(0), errno);
>>      abort();
>>   }
>> +
>> +void osaf_safe_reboot()
>> +{
>> +    char str[256];
>> +
>> +    snprintf(str, sizeof(str), PKGLIBDIR "/opensaf_reboot %u %s %u", 0,
>> "not_used", USE_SAFE_REBOOT);
>> +    syslog(LOG_NOTICE, "Reboot ordered using command: %s", str);
>> +
>> +    int rc = system(str);
>> +    if (rc < 0) {
>> +            syslog(LOG_CRIT, "Node reboot failure: exit code %d",
>> WEXITSTATUS(rc));
>> +    } else {
>> +             if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0) {
>> +                    syslog(LOG_NOTICE, "Command: %s successfully
>> executed", str);
>> +            } else {
>> +                    syslog(LOG_CRIT, "Command: %s failed with exit
>> code %d", str, WEXITSTATUS(rc));
>> +            }
>> +    }
>> +}
>> diff --git a/osaf/services/saf/clmsv/clms/clms.h
>> b/osaf/services/saf/clmsv/clms/clms.h
>> --- a/osaf/services/saf/clmsv/clms/clms.h
>> +++ b/osaf/services/saf/clmsv/clms/clms.h
>> @@ -99,6 +99,7 @@ extern uint32_t clms_mds_msg_send(CLMS_C
>>                                     MDS_DEST *dest,
>>                                     MDS_SYNC_SND_CTXT *mds_ctxt,
>> MDS_SEND_PRIORITY_TYPE prio, NCSMDS_SVC_ID svc_id);
>>
>> +extern uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG
>> *bcast_msg);
>>   extern SaAisErrorT clms_imm_activate(CLMS_CB * cb);  extern uint32_t
>> clms_node_trackresplist_empty(CLMS_CLUSTER_NODE * op_node);  extern
>> uint32_t clms_send_cbk_start_sub(CLMS_CB * cb, CLMS_CLUSTER_NODE *
>> node); @@ -125,5 +126,5 @@ extern void clms_cb_dump(void);  extern
>> uint32_t clms_send_is_member_info(CLMS_CB * cb, SaClmNodeIdT
>> node_id,  SaBoolT member, SaBoolT is_configured);  extern void
>> clm_imm_reinit_bg(CLMS_CB * cb);  extern void
>> proc_downs_during_rolechange (void);
>> -
>> +extern void clms_cluster_reboot();
>>   #endif   /* ifndef CLMS_H */
>> diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c
>> b/osaf/services/saf/clmsv/clms/clms_imm.c
>> --- a/osaf/services/saf/clmsv/clms/clms_imm.c
>> +++ b/osaf/services/saf/clmsv/clms/clms_imm.c
>> @@ -19,6 +19,7 @@
>>
>>   #include "clms.h"
>>   #include "osaf_extended_name.h"
>> +#include "osaf_utility.h"
>>
>>   extern struct ImmutilWrapperProfile immutilWrapperProfile;
>>
>> @@ -886,6 +887,23 @@ static void clms_imm_admin_op_callback(S
>>
>>      TRACE_ENTER2("Admin callback for nodename:%s, opId:%llu",
>> objectName->value, opId);
>>
>> +    // E.g. immadm -o 1 safCluster=myClmCluster
>> +    if (strncmp(osaf_extended_name_borrow(objectName),
>> +                  osaf_extended_name_borrow(&osaf_cluster->name),
>> +                  osaf_extended_name_length(objectName)) == 0) {
>> +            if (opId == 1) {
>> +                    LOG_WA("Cluster reboot requested. Ordering
>> cluster reboot");
>> +                    // MDS broadcast/multi cast call is synchronous
>> +                    clms_cluster_reboot();
>> +                    sleep(1);
>> +                    osaf_safe_reboot();
>> +            } else {
>> +                    LOG_ER("Admin Operation not supported for %s",
>> osaf_extended_name_borrow(objectName));
>> +
>>      immutil_saImmOiAdminOperationResult(immOiHandle, invocation,
>> SA_AIS_ERR_INVALID_PARAM);
>> +            }
>> +            goto done;
>> +    }
>> +
>>      /*Lookup by the node_name and get the cluster node for CLM
>> Admin oper */
>>      nodeop = clms_node_get_by_name(objectName);
>>      if (nodeop == NULL) {
>> diff --git a/osaf/services/saf/clmsv/clms/clms_mds.c
>> b/osaf/services/saf/clmsv/clms/clms_mds.c
>> --- a/osaf/services/saf/clmsv/clms/clms_mds.c
>> +++ b/osaf/services/saf/clmsv/clms/clms_mds.c
>> @@ -659,7 +659,17 @@ uint32_t clms_mds_enc(struct ncsmds_call
>>      ncs_enc_claim_space(uba, 4);
>>      total_bytes += 4;
>>
>> -    if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) {
>> +    if (CLMSV_CLMS_TO_CLMNA_REBOOT_MSG == msg->evt_type) {
>> +            /* encode the reboot msg **/
>> +            p8 = ncs_enc_reserve_space(uba, 4);
>> +            if (!p8) {
>> +                    TRACE("ncs_enc_reserve_space failed");
>> +                    goto err;
>> +            }
>> +            ncs_encode_32bit(&p8, msg->info.reboot_info.node_id);
>> +            ncs_enc_claim_space(uba, 4);
>> +            total_bytes += 4;
>> +    } else if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg-
>>> evt_type) {
>>      /** encode the API RSP msg subtype **/
>>              p8 = ncs_enc_reserve_space(uba, 4);
>>              if (!p8) {
>> @@ -1517,3 +1527,37 @@ uint32_t clms_mds_msg_send(CLMS_CB * cb,
>>      TRACE_LEAVE();
>>      return rc;
>>   }
>> +
>> +/*********************************************************
>> *******************
>> +  Name          : clms_mds_msg_bcast
>> +
>> +  Description   : This routine sends a broadcast message to CLMNA.
>> +
>> +  Arguments     : cb  - ptr to the CLMA CB
>> +                  bcast_msg - ptr to the CLMSv broadcast message
>> +
>> +  Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE
>> +
>> +  Notes         : None.
>> +*********************************************************
>> **************
>> +*******/ uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG
>> *bcast_msg)
>> +{
>> +    NCSMDS_INFO snd_mds = {0};
>> +    uint32_t rc;
>> +
>> +    snd_mds.i_mds_hdl = cb->mds_hdl;
>> +    snd_mds.i_svc_id = NCSMDS_SVC_ID_CLMS;
>> +    snd_mds.i_op = MDS_SEND;
>> +    snd_mds.info.svc_send.i_msg = (NCSCONTEXT)bcast_msg;
>> +    snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMNA;
>> +    snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH;
>> +    snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_BCAST;
>> +    snd_mds.info.svc_send.info.bcast.i_bcast_scope =
>> NCSMDS_SCOPE_NONE;
>> +
>> +    if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) {
>> +            LOG_ER("%s: ncsmds_api MDS_SEND failed %u",
>> __FUNCTION__ ,rc);
>> +            return rc;
>> +    }
>> +
>> +    return NCSCC_RC_SUCCESS;
>> +}
>> \ No newline at end of file
>> diff --git a/osaf/services/saf/clmsv/clms/clms_util.c
>> b/osaf/services/saf/clmsv/clms/clms_util.c
>> --- a/osaf/services/saf/clmsv/clms/clms_util.c
>> +++ b/osaf/services/saf/clmsv/clms/clms_util.c
>> @@ -1200,3 +1200,15 @@ bool ip_matched(uint16_t family1, uint8_
>>      return true;
>>   }
>>
>> +//
>> +void clms_cluster_reboot()
>> +{
>> +    CLMSV_MSG bcast_msg;
>> +    bcast_msg.evt_type = CLMSV_CLMS_TO_CLMNA_REBOOT_MSG;
>> +    bcast_msg.info.reboot_info.node_id = clms_cb->node_id;
>> +    if (clms_mds_msg_bcast(clms_cb, &bcast_msg) ==
>> NCSCC_RC_SUCCESS) {
>> +            LOG_NO("Sending cluster reboot broadcast message
>> succeeded");
>> +    } else {
>> +            LOG_ER("Sending cluster reboot broadcast message failed");
>> +    }
>> +}
>> diff --git a/osaf/services/saf/clmsv/nodeagent/main.c
>> b/osaf/services/saf/clmsv/nodeagent/main.c
>> --- a/osaf/services/saf/clmsv/nodeagent/main.c
>> +++ b/osaf/services/saf/clmsv/nodeagent/main.c
>> @@ -114,6 +114,18 @@ static uint32_t clmna_mds_dec(struct ncs
>>      total_bytes += 4;
>>
>>      switch (msg->evt_type) {
>> +    case    CLMSV_CLMS_TO_CLMNA_REBOOT_MSG:
>> +            {
>> +                    p8 = ncs_dec_flatten_space(uba, local_data, 4);
>> +                    msg->info.reboot_info.node_id =
>> ncs_decode_32bit(&p8);
>> +                    ncs_dec_skip_space(uba, 4);
>> +                    total_bytes += 4;
>> +                    // Reboot will be performed by CLMS for this node.
>> +                    if (clmna_cb->node_info.node_id != msg-
>>> info.reboot_info.node_id) {
>> +                            osaf_safe_reboot();
>> +                    }
>> +                    break;
>> +            }
>>      case CLMSV_CLMS_TO_CLMA_API_RESP_MSG:
>>              {
>>                      p8 = ncs_dec_flatten_space(uba, local_data, 8); diff -
>> -git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
>> --- a/scripts/opensaf_reboot
>> +++ b/scripts/opensaf_reboot
>> @@ -40,10 +40,17 @@ NODE_ID_FILE=$pkglocalstatedir/node_id
>>
>>   node_id=$1
>>   ee_name=$2
>> +safe_reboot=$3
>>
>>   # Run commands through sudo when not superuser  test $(id -u) -ne 0 &&
>> icmd=$(which sudo 2> /dev/null)
>>
>> +opensaf_safe_reboot()
>> +{
>> +    logger -t "opensaf_reboot" "Rebooting local node using shutdown"
>> +    $icmd /sbin/shutdown -r now
>> +}
>> +
>>   ## Use stonith for remote fencing
>>   opensaf_reboot_with_remote_fencing()
>>   {
>> @@ -91,8 +98,12 @@ temp_node_id=`cat "$NODE_ID_FILE"`
>> temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e
>> 's:^:0x:'`  self_node_id=`printf "%d" $temp_node_id`
>>
>> -# A node ID of zero(0) means an order to reboot the local node -if [
>> "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then
>> +
>> +if [ "$safe_reboot" = 1 ]; then
>> +    opensaf_safe_reboot
>> +else
>> +    # A node ID of zero(0) means an order to reboot the local node
>> +    if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then
>>      # uncomment the following line if debugging errors that keep
>> restarting the node
>>      # exit 0
>>
>> @@ -114,8 +125,8 @@ if [ "$self_node_id" = "$node_id" ] || [
>>
>>      # Reboot (not shutdown) system WITH file system sync
>>      $icmd /sbin/reboot -f
>> -else
>> -    if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then
>> +    else
>> +    if [ "$FMS_USE_REMOTE_FENCING" = 1 ]; then
>>              opensaf_reboot_with_remote_fencing
>>      else
>>              if [ ":$ee_name" != ":" ]; then
>> @@ -133,4 +144,5 @@ else
>>                      logger -t "opensaf_reboot" "Rebooting remote node in the
>> absence of PLM is outside the scope of OpenSAF"
>>              fi
>>      fi
>> -fi
>> +    fi
>> +fi
>> \ No newline at end of file


------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to