Hi Praveen, A question: in the stop function of the opensafd script, amfnd is sent a SIGTERM signal and starts to terminate its components. Despite the timeout retry logic, final_clean seems to be called even if amfnd has not finished. Shouldn't there be a pkill -9 osaf* in the case where amfnd has not finished in time? As it looks now, amfnd is still running when final_clean is called to run all the clc_cli scripts, e.g. killing amfd. amfnd should not be running at this time; when it is, this leads to the "ER AMF director unexpectedly crashed" error.
/Thanks HansN On 12/16/2015 12:53 PM, [email protected] wrote: > osaf/services/saf/amf/amfnd/clc.cc | 12 ++++++ > osaf/services/saf/amf/amfnd/sidb.cc | 8 ++++ > osaf/services/saf/amf/amfnd/susm.cc | 72 > +++++++++++++++++++++++++++++++++++++ > 3 files changed, 92 insertions(+), 0 deletions(-) > > > In the reported problem, opensaf shutdown got stuck when one of the components > of NPI su faulted. > > During opensaf shutdown, amfnd started removing assignment from lower rank > SI2 assigned to SU2. During this time, comp of a NPI SU1 having a csi from a > higher rank SI1 > faulted. AMFND successfully cleaned up the failed component. Now when all the > assignments > from SU2 got removed, amfnd started removing assignments from SU1. Since SU1 > had only one > component which was moved to UNINSTANTIATED state after clean up, no further > clean up > was required. Since no further comp was cleaned up, SU1 got stuck in > TERMINATING state. > > In healthy condition when last CSI is removed from a NPI SU, SU will move to > UNINSTANTIATED state. After this if no further lower rank SIs are > available for application SUs, amfnd will launch clean up of all the > components. > But since SU1 got stuck in TERMINATING state AMFND could not launch clean up > of all the > comps and after 60 seconds NID rebooted the node. > > Patch fixes the problem by resuming the SU FSM of failed component and > removes further > CSIs. If no CSI is available it will mark the SU UNINSTANTIATED and will go > with > the normal sequence of removal and then clean up of all comps. 
> > diff --git a/osaf/services/saf/amf/amfnd/clc.cc > b/osaf/services/saf/amf/amfnd/clc.cc > --- a/osaf/services/saf/amf/amfnd/clc.cc > +++ b/osaf/services/saf/amf/amfnd/clc.cc > @@ -2204,6 +2204,18 @@ uint32_t avnd_comp_clc_terming_cleansucc > } > } > > + if ((!comp->su->is_ncs) && (comp->csi_list.n_nodes > 0) && > + (!m_AVND_SU_IS_PREINSTANTIABLE(comp->su))) { > + AVND_COMP_CSI_REC *csi = nullptr; > + /* > + Explantion written above for PI SU case is valid > here also. > + However for a NPI comp in NPI SU, mark it REMOVED > instead of > + generating remove done indication. > + */ > + csi = > m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&comp->csi_list)); > + if (csi != nullptr) > + m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi, > AVND_COMP_CSI_ASSIGN_STATE_REMOVED); > + } > if (all_comps_terminated()) { > LOG_NO("Terminated all AMF components"); > LOG_NO("Shutdown completed, exiting"); > diff --git a/osaf/services/saf/amf/amfnd/sidb.cc > b/osaf/services/saf/amf/amfnd/sidb.cc > --- a/osaf/services/saf/amf/amfnd/sidb.cc > +++ b/osaf/services/saf/amf/amfnd/sidb.cc > @@ -185,6 +185,7 @@ AVND_SU_SI_REC *avnd_su_si_rec_add(AVND_ > /* > * Update the rest of the parameters with default values. > */ > + TRACE("Marking curr assigned state of '%s' > unassigned.",si_rec->name.value); > m_AVND_SU_SI_CURR_ASSIGN_STATE_SET(si_rec, > AVND_SU_SI_ASSIGN_STATE_UNASSIGNED); > > /* > @@ -394,6 +395,7 @@ AVND_COMP_CSI_REC *avnd_su_si_csi_rec_ad > /* > * Update the rest of the parameters with default values. 
> */ > + TRACE("Marking curr assigned state of '%s' > unassigned.",csi_rec->name.value); > m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi_rec, > AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED); > m_AVND_COMP_CSI_PRV_ASSIGN_STATE_SET(csi_rec, > AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED); > > @@ -477,6 +479,7 @@ AVND_SU_SI_REC *avnd_su_si_rec_modify(AV > > /* store the prv assign-state & update the new assign-state */ > si_rec->prv_assign_state = si_rec->curr_assign_state; > + TRACE_1("Marking curr assigned state of '%s' > unassigned.",si_rec->name.value); > m_AVND_SU_SI_CURR_ASSIGN_STATE_SET(si_rec, > AVND_SU_SI_ASSIGN_STATE_UNASSIGNED); > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, si_rec, AVND_CKPT_SU_SI_REC); > @@ -518,6 +521,7 @@ uint32_t avnd_su_si_csi_rec_modify(AVND_ > TRACE_ENTER2("%p", param); > /* pick up all the csis belonging to the si & modify them */ > if (!param) { > + TRACE_1("Marking curr assigned state of all CSIs of '%s' > unassigned.",si_rec->name.value); > for (curr_csi = (AVND_COMP_CSI_REC > *)m_NCS_DBLIST_FIND_FIRST(&si_rec->csi_list); > curr_csi; curr_csi = (AVND_COMP_CSI_REC > *)m_NCS_DBLIST_FIND_NEXT(&curr_csi->si_dll_node)) { > /* store the prv assign-state & update the new > assign-state */ > @@ -544,6 +548,7 @@ uint32_t avnd_su_si_csi_rec_modify(AVND_ > > /* store the prv assign-state & update the new assign-state */ > curr_csi->prv_assign_state = curr_csi->curr_assign_state; > + TRACE("Marking curr assigned state of '%s' > unassigned.",curr_csi->name.value); > m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(curr_csi, > AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED); > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, curr_csi, > AVND_CKPT_CSI_REC); > } /* for */ > @@ -574,6 +579,7 @@ uint32_t avnd_su_si_all_modify(AVND_CB * > > TRACE_ENTER2(); > /* modify all the si records */ > + TRACE("Marking curr assigned state all SIs in '%s' > unassigned.",su->name.value); > for (curr_si = (AVND_SU_SI_REC *)m_NCS_DBLIST_FIND_FIRST(&su->si_list); > curr_si; curr_si = (AVND_SU_SI_REC > 
*)m_NCS_DBLIST_FIND_NEXT(&curr_si->su_dll_node)) { > /* store the prv state & update the new state */ > @@ -624,6 +630,7 @@ uint32_t avnd_su_si_csi_all_modify(AVND_ > TRACE_ENTER2("%p", param); > /* pick up all the csis belonging to all the sis & modify them */ > if (!param) { > + TRACE("Marking curr assigned state all CSIs in SIs of '%s' > unassigned.",su->name.value); > for (curr_si = (AVND_SU_SI_REC > *)m_NCS_DBLIST_FIND_FIRST(&su->si_list); > curr_si; curr_si = (AVND_SU_SI_REC > *)m_NCS_DBLIST_FIND_NEXT(&curr_si->su_dll_node)) { > for (curr_csi = (AVND_COMP_CSI_REC > *)m_NCS_DBLIST_FIND_FIRST(&curr_si->csi_list); > @@ -658,6 +665,7 @@ uint32_t avnd_su_si_csi_all_modify(AVND_ > } > if (false == curr_comp->assigned_flag) { > /* modify all the csi-records */ > + TRACE("Marking curr assigned state all CSIs assigned to > '%s' unassigned.",curr_comp->name.value); > for (curr_csi = > m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&curr_comp->csi_list)); > curr_csi; > curr_csi = > m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&curr_csi->comp_dll_node))) > diff --git a/osaf/services/saf/amf/amfnd/susm.cc > b/osaf/services/saf/amf/amfnd/susm.cc > --- a/osaf/services/saf/amf/amfnd/susm.cc > +++ b/osaf/services/saf/amf/amfnd/susm.cc > @@ -1917,6 +1917,19 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C > */ > avnd_di_uns32_upd_send(AVSV_SA_AMF_SU, > saAmfSUOperState_ID, &su->name, su->oper); > } > + > + if ((prv_st == SA_AMF_PRESENCE_INSTANTIATED) && > + (final_st == SA_AMF_PRESENCE_UNINSTANTIATED) > && > + (cb->term_state == > AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) { > + /* > + During shutdown phase, all comps of SU may fault. In > that case, > + SU FSM marks SU in TERMINAIING state and finally > moves it to > + UNINSTANTIATED state. So generated the assignment > done indication > + so that removal of lower rank SI can proceed. 
> + */ > + rc = avnd_su_si_oper_done(cb, su, si); > + m_AVND_SU_ALL_SI_RESET(su); > + } > } > > done: > @@ -2264,6 +2277,21 @@ uint32_t avnd_su_pres_insting_compinstfa > return rc; > } > > +/** > + * @brief Returns first assigned csi traversing from end. > + * @return Ptr to csi_rec. > + */ > +static AVND_COMP_CSI_REC *get_next_assigned_csi_from_end(const > AVND_SU_SI_REC *si) > +{ > + for (AVND_COMP_CSI_REC *csi = (AVND_COMP_CSI_REC > *)m_NCS_DBLIST_FIND_LAST(&si->csi_list); > + (csi != nullptr); > + csi = (AVND_COMP_CSI_REC > *)m_NCS_DBLIST_FIND_PREV(&csi->si_dll_node)) { > + if (m_AVND_COMP_CSI_CURR_ASSIGN_STATE_IS_ASSIGNED(csi) && > ((csi->comp != nullptr) > + && (csi->comp->pres == > SA_AMF_PRESENCE_INSTANTIATED))) > + return csi; > + } > + return nullptr; > +} > > /**************************************************************************** > Name : avnd_su_pres_inst_suterm_hdler > > @@ -2334,6 +2362,32 @@ uint32_t avnd_su_pres_inst_suterm_hdler( > AVND_COMP_CLC_PRES_FSM_EV_CLEANUP : > AVND_COMP_CLC_PRES_FSM_EV_TERM); > if (NCSCC_RC_SUCCESS != rc) > goto done; > + > + /* > + During shutdown phase if a component faults, it will be > cleaned up by AMFND > + irrespective of recovery policy. This component will move to > UNINSTANTIATED > + after successful clean up. When amfnd starts removing SI > from SU of this comp, > + it will have to skip the CSI of cleaned up component. > + */ > + if ((csi->comp->pres == SA_AMF_PRESENCE_UNINSTANTIATED) && > + (cb->term_state == > AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) { > + m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(csi, > AVND_COMP_CSI_ASSIGN_STATE_REMOVED); > + avnd_su_pres_state_set(su, SA_AMF_PRESENCE_TERMINATING); > + AVND_COMP_CSI_REC *assigned_csi = > get_next_assigned_csi_from_end(si); > + if (assigned_csi == nullptr) { > + //Components of all the CSIs in SI are cleaned > up. > + avnd_su_pres_state_set(su, > SA_AMF_PRESENCE_UNINSTANTIATED); > + goto done; > + } else { > + //One CSI is still assigned. 
> + > m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(assigned_csi, > + > AVND_COMP_CSI_ASSIGN_STATE_REMOVING); > + rc = avnd_comp_clc_fsm_trigger(cb, > assigned_csi->comp, > + > (m_AVND_COMP_IS_FAILED(assigned_csi->comp)) ? > + AVND_COMP_CLC_PRES_FSM_EV_CLEANUP : > + AVND_COMP_CLC_PRES_FSM_EV_TERM); > + } > + } > } > > /* transition to terminating state */ > @@ -2937,6 +2991,24 @@ uint32_t avnd_su_pres_terming_compuninst > if (all_csis_in_assigned_state(su) || > all_csis_in_removed_state(su)) { > TRACE("SI Assignment done"); > avnd_su_pres_state_set(su, > SA_AMF_PRESENCE_UNINSTANTIATED); > + goto done; > + } > + > + /* > + During shutdown phase if a component faults, it will be > cleaned up by AMFND > + irrespective of recovery policy. This component will move to > UNINSTANTIATED > + after successful clean up. When amfnd starts removing SI > from SU of this comp, > + it will have to skip the CSI of cleaned up component. > + */ > + if ((curr_csi != NULL) && (curr_csi->comp->pres == > SA_AMF_PRESENCE_UNINSTANTIATED) && > + (cb->term_state == > AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED)) { > + m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(curr_csi, > AVND_COMP_CSI_ASSIGN_STATE_REMOVED); > + AVND_COMP_CSI_REC *assigned_csi = > get_next_assigned_csi_from_end(curr_csi->si); > + m_AVND_COMP_CSI_CURR_ASSIGN_STATE_SET(assigned_csi, > AVND_COMP_CSI_ASSIGN_STATE_REMOVING); > + rc = avnd_comp_clc_fsm_trigger(cb, assigned_csi->comp, > + > (m_AVND_COMP_IS_FAILED(assigned_csi->comp)) ? > + AVND_COMP_CLC_PRES_FSM_EV_CLEANUP : > + AVND_COMP_CLC_PRES_FSM_EV_TERM); > } > } > ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
