Hi Praveen,

A question: in the stop function of the opensafd script, amfnd is sent a 
SIGTERM signal and starts to terminate its components.
Despite the timeout retry logic, final_clean seems to be called even 
if amfnd has not finished.
Shouldn't there be a pkill -9 osaf* in the case where amfnd has not 
finished in time? As it looks now, amfnd
is still running when final_clean is called to run all the clc_cli 
scripts, e.g. killing amfd.
amfnd should not be running at this time, and this leads to the "ER AMF 
director unexpectedly crashed" error.

/Thanks  HansN


On 12/16/2015 12:53 PM, [email protected] wrote:
>   osaf/services/saf/amf/amfnd/clc.cc  |  12 ++++++
>   osaf/services/saf/amf/amfnd/sidb.cc |   8 ++++
>   osaf/services/saf/amf/amfnd/susm.cc |  72 
> +++++++++++++++++++++++++++++++++++++
>   3 files changed, 92 insertions(+), 0 deletions(-)
>
>
> In the reported problem, opensaf shutdown got stuck when one of the components
> of an NPI SU faulted.
>
> During opensaf shutdown, amfnd started removing the assignment of the lower rank
> SI2 assigned to SU2. During this time, a comp of NPI SU1, having a csi from the 
> higher rank SI1,
> faulted. AMFND successfully cleaned up the failed component. Then, when all the 
> assignments
> from SU2 had been removed, amfnd started removing assignments from SU1. Since SU1 
> had only one
> component, which had already moved to UNINSTANTIATED state after clean up, no further 
> clean up
> was required. Since no further comp was cleaned up in SU1, the SU got stuck in 
> TERMINATING state.
>
> In a healthy condition, when the last CSI is removed from an NPI SU, the SU will move to
> UNINSTANTIATED state. After this, if no further lower rank SIs are
> available for application SUs, amfnd will launch clean up of all the 
> components.
> But since SU1 got stuck in TERMINATING state, AMFND could not launch clean up 
> of all the
> comps, and after 60 seconds NID rebooted the node.
>
> The patch fixes the problem by resuming the SU FSM of the failed component and 
> removing further
> CSIs. If no CSI is available, it will mark the SU UNINSTANTIATED and continue 
> with
> the normal sequence of removal and then clean up of all comps.
>
> diff --git a/osaf/services/saf/amf/amfnd/clc.cc 
> b/osaf/services/saf/amf/amfnd/clc.cc
> --- a/osaf/services/saf/amf/amfnd/clc.cc
> +++ b/osaf/services/saf/amf/amfnd/clc.cc
> @@ -2204,6 +2204,18 @@ uint32_t avnd_comp_clc_terming_cleansucc
>                       }
>               }
>   
> +             if ((!comp->su->is_ncs) && (comp->csi_list.n_nodes > 0) &&
> +                             (!m_AVND_SU_IS_PREINSTANTIABLE(comp->su))) {
> +                     AVND_COMP_CSI_REC *csi = nullptr;
> +                     /*
> +                        Explantion written above for PI SU case is valid 
> here also.
> +                        However for a NPI comp in NPI SU, mark it REMOVED 
> instead of
> +                        generating remove done indication.
> +                      */
> +                     csi = 
> m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&comp->csi_list));
> +                     if (csi != nullptr)
> +                             m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi, 
> AVND_COMP_CSI_ASSIGN_STATE_REMOVED);
> +             }
>               if (all_comps_terminated()) {
>                       LOG_NO("Terminated all AMF components");
>                       LOG_NO("Shutdown completed, exiting");
> diff --git a/osaf/services/saf/amf/amfnd/sidb.cc 
> b/osaf/services/saf/amf/amfnd/sidb.cc
> --- a/osaf/services/saf/amf/amfnd/sidb.cc
> +++ b/osaf/services/saf/amf/amfnd/sidb.cc
> @@ -185,6 +185,7 @@ AVND_SU_SI_REC *avnd_su_si_rec_add(AVND_
>       /*
>        * Update the rest of the parameters with default values.
>        */
> +     TRACE("Marking curr assigned state of '%s' 
> unassigned.",si_rec->name.value);
>       m_AVND_SU_SI_CURR_ASSIGN_STATE_SET(si_rec, 
> AVND_SU_SI_ASSIGN_STATE_UNASSIGNED);
>   
>       /*
> @@ -394,6 +395,7 @@ AVND_COMP_CSI_REC *avnd_su_si_csi_rec_ad
>       /*
>        * Update the rest of the parameters with default values.
>        */
> +     TRACE("Marking curr assigned state of '%s' 
> unassigned.",csi_rec->name.value);
>       m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi_rec, 
> AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED);
>       m_AVND_COMP_CSI_PRV_ASSIGN_STATE_SET(csi_rec, 
> AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED);
>   
> @@ -477,6 +479,7 @@ AVND_SU_SI_REC *avnd_su_si_rec_modify(AV
>   
>       /* store the prv assign-state & update the new assign-state */
>       si_rec->prv_assign_state = si_rec->curr_assign_state;
> +     TRACE_1("Marking curr assigned state of '%s' 
> unassigned.",si_rec->name.value);
>       m_AVND_SU_SI_CURR_ASSIGN_STATE_SET(si_rec, 
> AVND_SU_SI_ASSIGN_STATE_UNASSIGNED);
>   
>       m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, si_rec, AVND_CKPT_SU_SI_REC);
> @@ -518,6 +521,7 @@ uint32_t avnd_su_si_csi_rec_modify(AVND_
>       TRACE_ENTER2("%p", param);
>       /* pick up all the csis belonging to the si & modify them */
>       if (!param) {
> +             TRACE_1("Marking curr assigned state of all CSIs of '%s' 
> unassigned.",si_rec->name.value);
>               for (curr_csi = (AVND_COMP_CSI_REC 
> *)m_NCS_DBLIST_FIND_FIRST(&si_rec->csi_list);
>                    curr_csi; curr_csi = (AVND_COMP_CSI_REC 
> *)m_NCS_DBLIST_FIND_NEXT(&curr_csi->si_dll_node)) {
>                       /* store the prv assign-state & update the new 
> assign-state */
> @@ -544,6 +548,7 @@ uint32_t avnd_su_si_csi_rec_modify(AVND_
>   
>               /* store the prv assign-state & update the new assign-state */
>               curr_csi->prv_assign_state = curr_csi->curr_assign_state;
> +             TRACE("Marking curr assigned state of '%s' 
> unassigned.",curr_csi->name.value);
>               m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(curr_csi, 
> AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, curr_csi, 
> AVND_CKPT_CSI_REC);
>       }                       /* for */
> @@ -574,6 +579,7 @@ uint32_t avnd_su_si_all_modify(AVND_CB *
>   
>       TRACE_ENTER2();
>       /* modify all the si records */
> +     TRACE("Marking curr assigned state all SIs in '%s' 
> unassigned.",su->name.value);
>       for (curr_si = (AVND_SU_SI_REC *)m_NCS_DBLIST_FIND_FIRST(&su->si_list);
>            curr_si; curr_si = (AVND_SU_SI_REC 
> *)m_NCS_DBLIST_FIND_NEXT(&curr_si->su_dll_node)) {
>               /* store the prv state & update the new state */
> @@ -624,6 +630,7 @@ uint32_t avnd_su_si_csi_all_modify(AVND_
>       TRACE_ENTER2("%p", param);
>       /* pick up all the csis belonging to all the sis & modify them */
>       if (!param) {
> +             TRACE("Marking curr assigned state all CSIs in SIs of '%s' 
> unassigned.",su->name.value);
>               for (curr_si = (AVND_SU_SI_REC 
> *)m_NCS_DBLIST_FIND_FIRST(&su->si_list);
>                    curr_si; curr_si = (AVND_SU_SI_REC 
> *)m_NCS_DBLIST_FIND_NEXT(&curr_si->su_dll_node)) {
>                       for (curr_csi = (AVND_COMP_CSI_REC 
> *)m_NCS_DBLIST_FIND_FIRST(&curr_si->csi_list);
> @@ -658,6 +665,7 @@ uint32_t avnd_su_si_csi_all_modify(AVND_
>               }
>               if (false == curr_comp->assigned_flag) {
>                       /* modify all the csi-records */
> +                     TRACE("Marking curr assigned state all CSIs assigned to 
> '%s' unassigned.",curr_comp->name.value);
>                       for (curr_csi = 
> m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&curr_comp->csi_list));
>                                       curr_csi;
>                                       curr_csi = 
> m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&curr_csi->comp_dll_node)))
> diff --git a/osaf/services/saf/amf/amfnd/susm.cc 
> b/osaf/services/saf/amf/amfnd/susm.cc
> --- a/osaf/services/saf/amf/amfnd/susm.cc
> +++ b/osaf/services/saf/amf/amfnd/susm.cc
> @@ -1917,6 +1917,19 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
>                        */
>                       avnd_di_uns32_upd_send(AVSV_SA_AMF_SU, 
> saAmfSUOperState_ID, &su->name, su->oper);
>               }
> +             
> +             if ((prv_st == SA_AMF_PRESENCE_INSTANTIATED) &&
> +                                (final_st == SA_AMF_PRESENCE_UNINSTANTIATED) 
> &&
> +                             (cb->term_state == 
> AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) {
> +                     /*
> +                        During shutdown phase, all comps of SU may fault. In 
> that case,
> +                        SU FSM marks SU in TERMINAIING state and finally 
> moves it to
> +                        UNINSTANTIATED state. So generated the assignment 
> done indication
> +                        so that removal of lower rank SI can proceed.
> +                      */
> +                     rc = avnd_su_si_oper_done(cb, su, si);
> +                     m_AVND_SU_ALL_SI_RESET(su);
> +             }
>       }
>   
>    done:
> @@ -2264,6 +2277,21 @@ uint32_t avnd_su_pres_insting_compinstfa
>       return rc;
>   }
>   
> +/**
> + * @brief  Returns first assigned csi traversing from end.
> + * @return Ptr to csi_rec.
> + */
> +static AVND_COMP_CSI_REC *get_next_assigned_csi_from_end(const 
> AVND_SU_SI_REC *si)
> +{
> +     for (AVND_COMP_CSI_REC *csi = (AVND_COMP_CSI_REC 
> *)m_NCS_DBLIST_FIND_LAST(&si->csi_list);
> +                     (csi != nullptr);
> +                     csi = (AVND_COMP_CSI_REC 
> *)m_NCS_DBLIST_FIND_PREV(&csi->si_dll_node)) {
> +             if (m_AVND_COMP_CSI_CURR_ASSIGN_STATE_IS_ASSIGNED(csi) && 
> ((csi->comp != nullptr)
> +                                     && (csi->comp->pres == 
> SA_AMF_PRESENCE_INSTANTIATED)))
> +                     return csi;
> +     }
> +     return nullptr;
> +}
>   
> /****************************************************************************
>     Name          : avnd_su_pres_inst_suterm_hdler
>    
> @@ -2334,6 +2362,32 @@ uint32_t avnd_su_pres_inst_suterm_hdler(
>                                          AVND_COMP_CLC_PRES_FSM_EV_CLEANUP : 
> AVND_COMP_CLC_PRES_FSM_EV_TERM);
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> +
> +             /*
> +                During shutdown phase if a component faults, it will be 
> cleaned up by AMFND
> +                irrespective of recovery policy. This component will move to 
> UNINSTANTIATED
> +                after successful clean up. When amfnd starts removing SI 
> from SU of this comp,
> +                it will have to skip the CSI of cleaned up component.
> +             */
> +             if ((csi->comp->pres == SA_AMF_PRESENCE_UNINSTANTIATED) &&
> +                     (cb->term_state == 
> AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) {
> +                     m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi, 
> AVND_COMP_CSI_ASSIGN_STATE_REMOVED);
> +                     avnd_su_pres_state_set(su, SA_AMF_PRESENCE_TERMINATING);
> +                     AVND_COMP_CSI_REC *assigned_csi = 
> get_next_assigned_csi_from_end(si);
> +                     if (assigned_csi == nullptr) {
> +                             //Components of all the CSIs in SI are cleaned 
> up.
> +                             avnd_su_pres_state_set(su, 
> SA_AMF_PRESENCE_UNINSTANTIATED);
> +                             goto done;
> +                     } else {
> +                             //One CSI is still assigned.
> +                             
> m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(assigned_csi,
> +                                             
> AVND_COMP_CSI_ASSIGN_STATE_REMOVING);
> +                             rc = avnd_comp_clc_fsm_trigger(cb, 
> assigned_csi->comp,
> +                                        
> (m_AVND_COMP_IS_FAILED(assigned_csi->comp)) ?
> +                                        AVND_COMP_CLC_PRES_FSM_EV_CLEANUP :
> +                                        AVND_COMP_CLC_PRES_FSM_EV_TERM);
> +                     }
> +             }
>       }
>   
>       /* transition to terminating state */
> @@ -2937,6 +2991,24 @@ uint32_t avnd_su_pres_terming_compuninst
>               if (all_csis_in_assigned_state(su) || 
> all_csis_in_removed_state(su)) {
>                       TRACE("SI Assignment done");
>                       avnd_su_pres_state_set(su, 
> SA_AMF_PRESENCE_UNINSTANTIATED);
> +                     goto done;
> +             }
> +
> +             /*
> +                During shutdown phase if a component faults, it will be 
> cleaned up by AMFND
> +                irrespective of recovery policy. This component will move to 
> UNINSTANTIATED
> +                after successful clean up. When amfnd starts removing SI 
> from SU of this comp,
> +                it will have to skip the CSI of cleaned up component.
> +             */
> +             if ((curr_csi != NULL) && (curr_csi->comp->pres == 
> SA_AMF_PRESENCE_UNINSTANTIATED) &&
> +                             (cb->term_state == 
> AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) {
> +                     m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(curr_csi, 
> AVND_COMP_CSI_ASSIGN_STATE_REMOVED);
> +                     AVND_COMP_CSI_REC *assigned_csi = 
> get_next_assigned_csi_from_end(curr_csi->si);
> +                     m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(assigned_csi, 
> AVND_COMP_CSI_ASSIGN_STATE_REMOVING);
> +                     rc = avnd_comp_clc_fsm_trigger(cb, assigned_csi->comp,
> +                                     
> (m_AVND_COMP_IS_FAILED(assigned_csi->comp)) ?
> +                                     AVND_COMP_CLC_PRES_FSM_EV_CLEANUP :
> +                                     AVND_COMP_CLC_PRES_FSM_EV_TERM);
>               }
>       }
>   


------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to