My patch does not touch the code path below; does that mean this is a new bug? Is it reproducible? Do you have traces?
Around line 1760 in comp_csi_remove_done csi is a variable local to the loop but it is also a function parameter. I believe this should be OK in C++ but maybe you have an older compiler or so? Thanks, Hans > -----Original Message----- > From: praveen malviya [mailto:[email protected]] > Sent: den 17 januari 2014 08:19 > To: Hans Feldt > Cc: [email protected] > Subject: Re: [devel] [PATCH 1 of 1] amfnd: escalate comp errors in quiesced > HA state [#601] > > While testing #601 getting crash. > case: Node shutdown and FAILED_OPERATION in quiescing callback. Crash > does not come without patch. > (gdb) bt > #0 0x00000039ef679b60 in strlen () from /lib64/libc.so.6 > #1 0x00000039ef646cb9 in vfprintf () from /lib64/libc.so.6 > #2 0x00000039ef6e72e8 in __vsnprintf_chk () from /lib64/libc.so.6 > #3 0x00007fb0ad51029f in output (file=0x44326b "comp.cc", line=1218, > priority=7, category=10, > format=0x44334b "%s: comp: '%s' : csi: '%p'", ap=0x7fff69bc8e10) at > logtrace.c:100 > #4 0x00007fb0ad51047e in _logtrace_trace (file=0x3a <Address 0x3a out > of bounds>, line=9, category=1634099571, > format=0x8 <Address 0x8 out of bounds>) at logtrace.c:168 > #5 0x000000000041d369 in avnd_comp_csi_remove (cb=0x658240, comp=0x0, > csi=0x0) at comp.cc:1218 > #6 0x000000000041cfb7 in avnd_comp_csi_remove_done (cb=0x658240, > comp=0x714ce0, csi=<value optimized out>) > at comp.cc:1760 > #7 0x000000000040a230 in avnd_evt_ava_resp_evh (cb=0x658240, > evt=0x70b9b0) at cbq.cc:482 > #8 0x0000000000429c2b in avnd_evt_process (evt=0x70b9b0) at main.cc:660 > #9 0x000000000042a608 in avnd_main_process () at main.cc:604 > #10 0x000000000042a7ad in main (argc=2, argv=0x7fff69bc9308) at main.cc:178 > > > Thanks, > Praveen > On 10-Jan-14 2:44 PM, Hans Feldt wrote: > > osaf/services/saf/amf/amfnd/comp.cc | 23 > > +++++++++++++++++++++-- > > osaf/services/saf/amf/amfnd/err.cc | 15 +++++++++++---- > > osaf/services/saf/amf/amfnd/include/avnd_comp.h | 1 + > > osaf/services/saf/amf/amfnd/su.cc | 1 + > > 
osaf/services/saf/amf/amfnd/util.cc | 5 ++--- > > 5 files changed, 36 insertions(+), 9 deletions(-) > > > > > > When a node or SU is locked, AMF assigns the QUIESCED HA state to ACTIVE > > sa-aware components. This state is intended to transfer state to a peer > > component assigned STANDBY. If the ACTIVE component fails in the QUIESCED > > state, AMF will restart it and reassign it QUIESCED. This is an invalid > > state > > transition according to picture 3 in the B.04 AMF spec and makes no sense > > since > > the state is anyway gone (because the component failed). > > > > The problem was likely reintroduced when fixing #3083, before that special > > handling for comp errors in QUIESCED state was there. However the solution > > had > > other problems and was removed. So in the current code there is no special > > handling for errors in QUIESCED state which results in re-assignment of the > > QUIESCED HA state after component restart. > > > > This patch changes the error escalation logic so that instead of component > > restart, component failover is performed as recovery action. 
> > > > diff --git a/osaf/services/saf/amf/amfnd/comp.cc > > b/osaf/services/saf/amf/amfnd/comp.cc > > --- a/osaf/services/saf/amf/amfnd/comp.cc > > +++ b/osaf/services/saf/amf/amfnd/comp.cc > > @@ -32,9 +32,7 @@ > > > > ****************************************************************************** > > */ > > > > -#include <stdbool.h> > > #include "avnd.h" > > -#include <stdbool.h> > > #include <immutil.h> > > > > /*** Static function declarations ***/ > > @@ -2766,3 +2764,24 @@ void avnd_comp_pres_state_set(AVND_COMP > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, comp, AVND_CKPT_COMP_PRES_STATE); > > } > > > > +/** > > + * Returns true if the HA state for any CSI assignment is > > QUIESCED/QUIESCING > > + * @param su > > + */ > > +bool comp_has_quiesced_assignment(const AVND_COMP *comp) > > +{ > > + const AVND_COMP_CSI_REC *csi; > > + > > + for (csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET( > > + m_NCS_DBLIST_FIND_FIRST(&comp->csi_list)); > > + csi != NULL; > > + csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET( > > + m_NCS_DBLIST_FIND_NEXT(&csi->comp_dll_node))) { > > + > > + if ((csi->si->curr_state == SA_AMF_HA_QUIESCED) || > > + (csi->si->curr_state == SA_AMF_HA_QUIESCING)) > > + return true; > > + } > > + > > + return false; > > +} > > diff --git a/osaf/services/saf/amf/amfnd/err.cc > > b/osaf/services/saf/amf/amfnd/err.cc > > --- a/osaf/services/saf/amf/amfnd/err.cc > > +++ b/osaf/services/saf/amf/amfnd/err.cc > > @@ -411,10 +411,17 @@ uint32_t avnd_err_escalate(AVND_CB *cb, > > if (*io_esc_rcvr == SA_AMF_NO_RECOMMENDATION) > > *io_esc_rcvr = comp->err_info.def_rec; > > > > - /* disallow comp-restart if it's disabled */ > > - if ((SA_AMF_COMPONENT_RESTART == *io_esc_rcvr) && > > m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) { > > - LOG_NO("saAmfCompDisableRestart is true for > > '%s'",comp->name.value); > > - *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > > + if (*io_esc_rcvr == SA_AMF_COMPONENT_RESTART) { > > + if (m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) { 
> > + LOG_NO("saAmfCompDisableRestart is true for > > '%s'",comp->name.value); > > + LOG_NO("recovery action 'comp restart' escalated to > > 'comp failover'"); > > + *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > > + } else if (comp_has_quiesced_assignment(comp) == true) { > > + /* Cannot re-assign QUIESCED, escalate to failover */ > > + LOG_NO("component with QUIESCED/QUIESCING assignment > > failed"); > > + LOG_NO("recovery action 'comp restart' escalated to > > 'comp failover'"); > > + *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > > + } > > } > > > > if ((SA_AMF_COMPONENT_FAILOVER== *io_esc_rcvr) && (su->sufailover) && > > (!su->is_ncs)) { > > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_comp.h > > b/osaf/services/saf/amf/amfnd/include/avnd_comp.h > > --- a/osaf/services/saf/amf/amfnd/include/avnd_comp.h > > +++ b/osaf/services/saf/amf/amfnd/include/avnd_comp.h > > @@ -876,6 +876,7 @@ extern unsigned int avnd_comp_config_get > > extern int avnd_comp_config_reinit(AVND_COMP *comp); > > extern void avnd_comp_delete(AVND_COMP *comp); > > extern void avnd_comp_pres_state_set(AVND_COMP *comp, SaAmfPresenceStateT > > newstate); > > +bool comp_has_quiesced_assignment(const AVND_COMP *comp); > > > > /** > > * Initiate restart of a component. > > diff --git a/osaf/services/saf/amf/amfnd/su.cc > > b/osaf/services/saf/amf/amfnd/su.cc > > --- a/osaf/services/saf/amf/amfnd/su.cc > > +++ b/osaf/services/saf/amf/amfnd/su.cc > > @@ -481,6 +481,7 @@ uint32_t avnd_evt_su_admin_op_req(AVND_C > > AVND_COMP *comp; > > > > /* SU has been repaired. Reset states and update AMF director > > accordingly. 
*/ > > + LOG_NO("Repair request for '%s'", su->name.value); > > > > for (comp = > > m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list)); > > comp; > > diff --git a/osaf/services/saf/amf/amfnd/util.cc > > b/osaf/services/saf/amf/amfnd/util.cc > > --- a/osaf/services/saf/amf/amfnd/util.cc > > +++ b/osaf/services/saf/amf/amfnd/util.cc > > @@ -236,9 +236,8 @@ void avnd_failed_state_file_create(void) > > */ > > void avnd_failed_state_file_delete(void) > > { > > - if (unlink(failed_state_file_name) == -1) > > - LOG_ER("cannot unlink failed state file %s: %s", > > - failed_state_file_name, strerror(errno)); > > + // file might not exist in some cases, ignore errors > > + (void) unlink(failed_state_file_name); > > } > > > > /** > > > ------------------------------------------------------------------------------ > CenturyLink Cloud: The Leader in Enterprise Cloud Services. > Learn Why More Businesses Are Choosing CenturyLink Cloud For > Critical Workloads, Development Environments & Everything In Between. > Get a Quote or Start a Free Trial Today. > http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk > _______________________________________________ > Opensaf-devel mailing list > [email protected] > https://lists.sourceforge.net/lists/listinfo/opensaf-devel ------------------------------------------------------------------------------ CenturyLink Cloud: The Leader in Enterprise Cloud Services. Learn Why More Businesses Are Choosing CenturyLink Cloud For Critical Workloads, Development Environments & Everything In Between. Get a Quote or Start a Free Trial Today. http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
