Hi Praveen

Ack (reviewed and regression tests run)

Thanks

-----Original Message-----
From: <[email protected]>
Date: Wednesday, 15 February 2017 at 7:39 pm
To: <[email protected]>, <[email protected]>, gary 
<[email protected]>, <[email protected]>
Cc: <[email protected]>
Subject: [PATCH 1 of 1] amfd: support for cluster reset recovery[#2065]

     src/amf/amfd/comp.cc     |   5 ++-
     src/amf/amfd/comptype.cc |   5 ++-
     src/amf/amfd/sgproc.cc   |  49 ++++++++++++++++++++++++++++++++++++
     src/amf/amfd/util.cc     |  40 ++++++++++++++++++++++++++++++
     src/amf/amfd/util.h      |   2 +
     src/amf/amfnd/cpm.cc     |   3 +-
     src/amf/amfnd/err.cc     |  64 +++++++++++++++++++++++++++++++++++------------
     7 files changed, 146 insertions(+), 22 deletions(-)
    
    
    Add support for the cluster reset recovery action (SA_AMF_CLUSTER_RESET = 7)
    described in the AMF B.04.01 specification, section 3.11.1.3.4 "Cluster Reset
    Recovery Action".
    
    -Use this by setting saAmfCompRecoveryOnError or saAmfCtDefRecoveryOnError
     in the application configuration, or
    -pass it as an argument to one of these APIs:
     saAmfPmStart(), saAmfComponentErrorReport(), saAmfPmStart_3(),
     saAmfHealthcheckStart(), and saAmfComponentErrorReport_4().
    
    TODO: AMFD still needs to raise an alarm for cluster reset.
    
    diff --git a/src/amf/amfd/comp.cc b/src/amf/amfd/comp.cc
    --- a/src/amf/amfd/comp.cc
    +++ b/src/amf/amfd/comp.cc
    @@ -2,6 +2,7 @@
      *
      * (C) Copyright 2008 The OpenSAF Foundation
      * (C) Copyright 2017 Ericsson AB - All Rights Reserved.
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -368,7 +369,7 @@ static int is_config_valid(const std::st
     
        rc = 
immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCompRecoveryOnError"), 
attributes, 0, &value);
        if (rc == SA_AIS_OK) {
    -           if ((value < SA_AMF_NO_RECOMMENDATION) || (value > 
SA_AMF_NODE_FAILFAST)) {
    +           if ((value < SA_AMF_NO_RECOMMENDATION) || (value > 
SA_AMF_CLUSTER_RESET)) {
                        report_ccb_validation_error(opdata, 
"Illegal/unsupported saAmfCompRecoveryOnError value %u for '%s'",
                                   value, dn.c_str());
                        return 0;
    @@ -1186,7 +1187,7 @@ static SaAisErrorT ccb_completed_modify_
                        if (value_is_deleted == true)
                                continue;
                        uint32_t recovery = *((SaUint32T *)value);
    -                   if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery 
> SA_AMF_CONTAINER_RESTART )) {
    +                   if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery 
> SA_AMF_CLUSTER_RESET)) {
                                report_ccb_validation_error(opdata, 
"Modification of saAmfCompRecoveryOnError Fail,"
                                                " Invalid recovery 
=%d",recovery);
                                goto done;
    diff --git a/src/amf/amfd/comptype.cc b/src/amf/amfd/comptype.cc
    --- a/src/amf/amfd/comptype.cc
    +++ b/src/amf/amfd/comptype.cc
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -335,7 +336,7 @@ static bool config_is_valid(const std::s
        rc = 
immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCtDefRecoveryOnError"), 
attributes, 0, &value);
        osafassert(rc == SA_AIS_OK);
     
    -   if ((value < SA_AMF_NO_RECOMMENDATION) || (value > 
SA_AMF_NODE_FAILFAST)) {
    +   if ((value < SA_AMF_NO_RECOMMENDATION) || (value > 
SA_AMF_CLUSTER_RESET)) {
                report_ccb_validation_error(opdata, "Illegal/unsupported 
saAmfCtDefRecoveryOnError value %u for '%s'",
                                value, dn.c_str());
                return false;
    @@ -646,7 +647,7 @@ static SaAisErrorT ccb_completed_modify_
                                goto done;
                        }
                        uint32_t value = *((SaUint32T 
*)mod->modAttr.attrValues[0]);
    -                   if ((value < SA_AMF_COMPONENT_RESTART) || (value > 
SA_AMF_NODE_FAILFAST)) {
    +                   if ((value < SA_AMF_COMPONENT_RESTART) || (value > 
SA_AMF_CLUSTER_RESET)) {
                                report_ccb_validation_error(opdata,
                                        "Invalid saAmfCtDefRecoveryOnError for 
'%s'", dn);
                                rc = SA_AIS_ERR_BAD_OPERATION;
    diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc
    --- a/src/amf/amfd/sgproc.cc
    +++ b/src/amf/amfd/sgproc.cc
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -626,6 +627,46 @@ static void perform_nodeswitchover_recov
     done:
        TRACE_LEAVE();
     }
    +
    +/**
    + * @brief       Performs Cluster reset recovery.
    + **/
    +static void perform_cluster_reset_recovery() {
    +  TRACE_ENTER();
    +  uint32_t rc = NCSCC_RC_SUCCESS;
    +  AVD_AVND *node = nullptr;
    +  for (std::map<uint32_t, AVD_AVND *>::const_iterator it = 
node_id_db->begin();
    +    it != node_id_db->end(); it++) {
    +    node = it->second;
    +    //First reboot payloads.
    +    if ((node->node_info.nodeId == avd_cb->node_id_avd) ||
    +               (node->node_info.nodeId == avd_cb->node_id_avd_other))
    +      continue;
    +    TRACE_1("node:'%s', nodeId:%x", node->name.c_str(), 
node->node_info.nodeId);
    +    rc = avd_send_reboot_msg_directly(node);
    +    if (rc != NCSCC_RC_SUCCESS)
    +      TRACE_1("Send failed fpr Reboot msg to payload.");
    +  }
    +
    +  //Send for standby.
    +  node = nullptr;
    +  node = avd_node_find_nodeid(avd_cb->node_id_avd_other);
    +  if (node != nullptr) {
    +    rc = avd_send_reboot_msg_directly(node);
    +    if (rc != NCSCC_RC_SUCCESS)
    +      TRACE_1("Send failed for Reboot msg to standby.");
    +  }
    +
    +  //Send for self.
    +  node = nullptr;
    +  node = avd_node_find_nodeid(avd_cb->node_id_avd);
    +  osafassert(node != nullptr);
    +  rc = avd_send_reboot_msg_directly(node);
    +  if (rc != NCSCC_RC_SUCCESS)
    +    TRACE_1("Send failed for Reboot msg to active.");
    +
    +  TRACE_LEAVE();
    +}
     
/*****************************************************************************
      * Function: avd_su_oper_state_func
      *
    @@ -692,6 +733,8 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
        } else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf == 
SA_AMF_NODE_FAILOVER) {
                saflog(LOG_NOTICE, amfSvcUsrName, "Node Fail-Over requested by 
'%s'",
                           node->name.c_str());
    +   } else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf == 
SA_AMF_CLUSTER_RESET) {
    +           saflog(LOG_NOTICE, amfSvcUsrName, "Cluster reset requested by 
'%s'", node->name.c_str());
        }
     
        /* Verify that the SU and node oper state is diabled and rcvr is 
failfast */
    @@ -819,6 +862,12 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
                                        
perform_nodeswitchover_recovery(su->su_on_node);
                                        goto done;
                                        break;
    +                           case SA_AMF_CLUSTER_RESET:
    +                                   perform_cluster_reset_recovery();
    +                                   LOG_WA("Wait for reboot");
    +                                   for (;;) 
    +                                           sleep(1);
    +                                   break;
                                default :
                                        break;
                                }
    diff --git a/src/amf/amfd/util.cc b/src/amf/amfd/util.cc
    --- a/src/amf/amfd/util.cc
    +++ b/src/amf/amfd/util.cc
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -2125,3 +2126,42 @@ void avd_association_namet_init(const st
       child.erase(std::remove(child.begin(), child.end(), '\\'), child.end());
     }
     
    +/**
    + * @brief  Sends reboot msg to node directly without queueing it up in
    + *    AMFD message queue.
    + * @param  ptr to AVD_AVND.
    + * @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. 
    + **/
    +uint32_t avd_send_reboot_msg_directly(AVD_AVND *node) {
    +  NCSMDS_INFO snd_mds = {0};
    +  uint32_t rc = NCSCC_RC_SUCCESS;
    +  AVD_DND_MSG *d2n_msg = new AVD_DND_MSG();
    +
    +  d2n_msg->msg_type = AVSV_D2N_REBOOT_MSG;
    +  d2n_msg->msg_info.d2n_reboot_info.node_id = node->node_info.nodeId;
    +
    +  if (node->adest == 0) {
    +    LOG_WA("Invalid adest for %x, msg type %u",
    +      node->node_info.nodeId, d2n_msg->msg_type);
    +    rc = NCSCC_RC_FAILURE;
    +    goto done;
    +  }
    +  d2n_msg->msg_info.d2n_reboot_info.msg_id = ++(node->snd_msg_id);
    +
    +  TRACE("Sending REBOOT MSG to %x", node->node_info.nodeId);
    +
    +  snd_mds.i_mds_hdl = avd_cb->adest_hdl;
    +  snd_mds.i_svc_id = NCSMDS_SVC_ID_AVD;
    +  snd_mds.i_op = MDS_SEND;
    +  snd_mds.info.svc_send.i_msg = (NCSCONTEXT)d2n_msg;
    +  snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_AVND;
    +  snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH;
    +  snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
    +  snd_mds.info.svc_send.info.snd.i_to_dest = node->adest;
    +  if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) {
    +    LOG_ER("ncsmds_api failed %u", rc);
    +  }
    +done:
    +  delete d2n_msg;
    +  return rc;
    +}
    diff --git a/src/amf/amfd/util.h b/src/amf/amfd/util.h
    --- a/src/amf/amfd/util.h
    +++ b/src/amf/amfd/util.h
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -104,5 +105,6 @@ extern const char *admin_op_name(SaAmfAd
     int compare_sanamet(const std::string& lhs, const std::string& rhs);
     uint32_t avd_snd_compcsi_msg(AVD_COMP *comp, AVD_CSI *csi,
                avd_comp_csi_rel_tag *compcsi, AVSV_COMPCSI_ACT act);
    +uint32_t avd_send_reboot_msg_directly(AVD_AVND *node);
     
     #endif  // AMF_AMFD_UTIL_H_
    diff --git a/src/amf/amfnd/cpm.cc b/src/amf/amfnd/cpm.cc
    --- a/src/amf/amfnd/cpm.cc
    +++ b/src/amf/amfnd/cpm.cc
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -556,7 +557,7 @@ void avnd_comp_pm_param_val(AVND_CB *cb,
                                *o_amf_rc = SA_AIS_ERR_NOT_EXIST;
                                return;
                        }
    -                   if ((pm_start->rec_rcvr.saf_amf >= 
SA_AMF_CLUSTER_RESET) && 
    +                   if ((pm_start->rec_rcvr.saf_amf > SA_AMF_CLUSTER_RESET) 
&& 
                                        (pm_start->rec_rcvr.saf_amf <= 
SA_AMF_CONTAINER_RESTART)) {
                                *o_amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
                                return;
    diff --git a/src/amf/amfnd/err.cc b/src/amf/amfnd/err.cc
    --- a/src/amf/amfnd/err.cc
    +++ b/src/amf/amfnd/err.cc
    @@ -1,6 +1,7 @@
     /*      -*- OpenSAF  -*-
      *
      * (C) Copyright 2008 The OpenSAF Foundation
    + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
      *
      * This program is distributed in the hope that it will be useful, but
      * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
    @@ -189,11 +190,15 @@ uint32_t avnd_evt_ava_err_rep_evh(AVND_C
                        amf_rc = SA_AIS_ERR_INVALID_PARAM;
        }
     
    -   if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET) || 
    -                   (err_rep->rec_rcvr.saf_amf == 
SA_AMF_APPLICATION_RESTART)|| 
    +   if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_APPLICATION_RESTART)|| 
                        (err_rep->rec_rcvr.saf_amf == 
SA_AMF_CONTAINER_RESTART)))
                amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
     
    +   if (comp && (comp->su->is_ncs == true) &&
    +                   (err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET)) {
    +           LOG_NO("Cluster Reset recovery not supported for MW components 
'%s'", comp->name.c_str());
    +           amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
    +   }
        /* send the response back to AvA */
        rc = avnd_amf_resp_send(cb, AVSV_AMF_ERR_REP, amf_rc, 0, 
&api_info->dest, &evt->mds_ctxt, comp, msg_from_avnd);
     
    @@ -296,6 +301,45 @@ uint32_t avnd_evt_ava_err_clear_evh(AVND
        return rc;
     }
     
    +/**
    + * @brief Performs cluster reset recovery action.
    + *
    + * @param cb: ptr to AvND control block.
    + * @param su: ptr to the SU which contains the failed component.
    + * @param comp: ptr to failed component.
    + *
    + * @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
    + */
    +static uint32_t avnd_err_rcvr_cluster_reset(AVND_CB *cb, AVND_SU 
*failed_su, AVND_COMP *failed_comp) {
    +  uint32_t rc = NCSCC_RC_SUCCESS;
    +  TRACE_ENTER();
    +
    +  m_AVND_COMP_FAILED_SET(failed_comp);
    +  m_AVND_SU_FAILED_SET(failed_su);
    +
    +  m_AVND_COMP_OPER_STATE_SET(failed_comp, SA_AMF_OPERATIONAL_DISABLED);
    +  rc = avnd_comp_oper_state_avd_sync(cb, failed_comp);
    +  if (NCSCC_RC_SUCCESS != rc)
    +    goto done;
    +
    +  rc = avnd_comp_curr_info_del(cb, failed_comp);
    +  if (NCSCC_RC_SUCCESS != rc)
    +    goto done;
    +
    +  //AMFD will not send any assignments, so clean up PI/NPI comp.
    +  rc = avnd_comp_clc_fsm_run(cb, failed_comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
    +  if (NCSCC_RC_SUCCESS != rc)
    +    goto done;
    +
    +  cb->oper_state = SA_AMF_OPERATIONAL_DISABLED;
    +  m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED);
    +  rc = avnd_di_oper_send(cb, failed_su, SA_AMF_CLUSTER_RESET);
    +
    +done:
    +  TRACE_LEAVE2("%u", rc);
    +  return rc;
    +}
    +
     
/****************************************************************************
       Name          : avnd_err_process
      
    @@ -532,20 +576,6 @@ uint32_t avnd_err_recover(AVND_CB *cb, A
                return rc;
        }
     
    -   /* if we are already inst-failed,  do nothing */
    -   if ((su->pres == SA_AMF_PRESENCE_INSTANTIATION_FAILED) &&
    -       (comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr != 
SA_AMF_NODE_FAILOVER)
    -       && (rcvr != SA_AMF_NODE_FAILFAST)) {
    -           rc = avnd_comp_clc_fsm_run(cb, comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
    -           return rc;
    -   }
    -
    -   /* if we are already terminating do nothing */
    -   if ((comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr == 
SA_AMF_COMPONENT_RESTART)) {
    -           rc = avnd_comp_clc_fsm_run(cb, comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
    -           return rc;
    -   }
    -
        /* When SU is in TERMINATING state, higher level recovery 
(SA_AMF_NODE_FAILOVER, 
           SA_AMF_NODE_FAILFAST and SA_AMF_NODE_SWITCHOVER) should be processed 
because higher 
           level recovery will terminate the component. If the faulted 
component has recovery 
    @@ -595,7 +625,7 @@ uint32_t avnd_err_recover(AVND_CB *cb, A
                break;
     
        case SA_AMF_CLUSTER_RESET:
    -           /* not supported */
    +           rc = avnd_err_rcvr_cluster_reset(cb, su, comp);
                break;
     
        case AVSV_ERR_RCVR_SU_RESTART:
    



------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to