Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
Hi Thuan, ack from me. Thanks Minh On 9/3/20 5:08 pm, thuan.tran wrote: - Split-brain recovery in headless enable, IMMND may expected restart. If AMFND not wait IMMND restart but reinit CLM, CLM callback trigger, clm_to_amf_node() is called then AMFND stuck in init IMM OM causes delay restart IMMND, delay resend node_up then AMFD will order reboot node. - Only call clm_to_amf_node() if amf node name is empty. --- src/amf/amfnd/clm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc index 06eb229c7..73c8ff83c 100644 --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -250,7 +250,7 @@ static void clm_track_cb( memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode), sizeof(SaClmClusterNodeT_4)); /*get the amf node from clm node name */ - clm_to_amf_node(); + if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node(); avnd_send_node_up_msg(); avnd_cb->first_time_up = false; } ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
Hi Thuan, Ack from me (not tested). B.R/Thang -----Original Message----- From: Thuan Tran Sent: Monday, March 9, 2020 1:08 PM To: Thang Duc Nguyen ; Minh Hon Chau ; Gary Lee Cc: opensaf-devel@lists.sourceforge.net; Thuan Tran Subject: [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162] - Split-brain recovery in headless enable, IMMND may expected restart. If AMFND not wait IMMND restart but reinit CLM, CLM callback trigger, clm_to_amf_node() is called then AMFND stuck in init IMM OM causes delay restart IMMND, delay resend node_up then AMFD will order reboot node. - Only call clm_to_amf_node() if amf node name is empty. --- src/amf/amfnd/clm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc index 06eb229c7..73c8ff83c 100644 --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -250,7 +250,7 @@ static void clm_track_cb( memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode), sizeof(SaClmClusterNodeT_4)); /*get the amf node from clm node name */ - clm_to_amf_node(); + if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node(); avnd_send_node_up_msg(); avnd_cb->first_time_up = false; } -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
- During split-brain recovery with headless mode enabled, an IMMND restart may be expected. If AMFND does not wait for the IMMND restart but reinitializes CLM, the CLM callback is triggered and clm_to_amf_node() is called; AMFND then gets stuck initializing IMM OM, which delays the IMMND restart and the resending of node_up, and AMFD then orders a node reboot. - Only call clm_to_amf_node() if the AMF node name is empty. --- src/amf/amfnd/clm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc index 06eb229c7..73c8ff83c 100644 --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -250,7 +250,7 @@ static void clm_track_cb( memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode), sizeof(SaClmClusterNodeT_4)); /*get the amf node from clm node name */ - clm_to_amf_node(); + if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node(); avnd_send_node_up_msg(); avnd_cb->first_time_up = false; } -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
- During split-brain recovery with headless mode enabled, an IMMND restart may be expected. If AMFND does not wait for the IMMND restart but reinitializes CLM, the CLM callback is triggered and clm_to_amf_node() is called; AMFND then gets stuck initializing IMM OM, which delays the IMMND restart and the resending of node_up, and AMFD then orders a node reboot. - In clm_to_amf_node(), use the previous AMF node name if IMMND is down. --- src/amf/amfnd/avnd_cb.h | 1 + src/amf/amfnd/clc.cc | 8 ++++++++ src/amf/amfnd/clm.cc | 7 ++++--- src/amf/amfnd/main.cc | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h index 8b0cc2304..0fa0590ff 100644 --- a/src/amf/amfnd/avnd_cb.h +++ b/src/amf/amfnd/avnd_cb.h @@ -125,6 +125,7 @@ typedef struct avnd_cb_tag { SaTimeT scs_absence_max_duration; /* the timer for supervision of the absence of SC */ AVND_TMR sc_absence_tmr; + bool immnd_down; } AVND_CB; #define AVND_CB_NULL ((AVND_CB *)0) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index f78e1a707..f96f3b3a9 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -3333,6 +3333,14 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp, // outcome of command is reported in comp_clc_resp_callback() } + if (comp->su->is_ncs && + comp->name.find("safComp=IMMND,") != std::string::npos) { +if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP) + cb->immnd_down = true; +else if (cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE) + cb->immnd_down = false; + } + TRACE_2("success"); goto done; diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc index 06eb229c7..7fef38daa 100644 --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -106,7 +106,7 @@ done: } static void clm_to_amf_node(void) { - SaAisErrorT error; + SaAisErrorT error = SA_AIS_ERR_UNAVAILABLE; SaImmSearchHandleT searchHandle; SaNameT amfdn, clmdn; SaImmSearchParametersT_2 searchParam; @@ -122,9 +122,10 @@ static void clm_to_amf_node(void) { searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT; searchParam.searchOneAttr.attrValue = 
&className; - error = saImmOmInitialize_cond(&immOmHandle, nullptr, &immVersion); + if (avnd_cb->immnd_down == false) +error = saImmOmInitialize_cond(&immOmHandle, nullptr, &immVersion); if (SA_AIS_OK != error) { -LOG_WA("saImmOmInitialize failed. Use previous value of nodeName."); +LOG_WA("Use previous value of nodeName %s", avnd_cb->amf_nodeName.c_str()); osafassert(avnd_cb->amf_nodeName.empty() == false); goto done1; } diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc index d7857fabe..cef0543e6 100644 --- a/src/amf/amfnd/main.cc +++ b/src/amf/amfnd/main.cc @@ -334,6 +334,7 @@ AVND_CB *avnd_cb_create() { cb->is_avd_down = true; cb->amfd_sync_required = false; + cb->immnd_down = false; // retrieve hydra configuration from IMM hydra_config_get(cb); -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
Hi Thuan One comment inline with [GL]. Thanks Gary From: Thuan Tran Sent: 04 March 2020 18:28 To: Thang Duc Nguyen ; Minh Hon Chau ; Gary Lee Cc: opensaf-devel@lists.sourceforge.net ; Thuan Tran Subject: [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162] - Split-brain recovery in headless enable, IMMND may expected restart. If AMFND not wait IMMND restart but reinit CLM, CLM callback trigger, clm_to_amf_node() is called then AMFND stuck in init IMM OM causes delay restart IMMND, delay resend node_up then AMFD will order reboot node. - Do not trigger saClmDispatch() if immnd down. --- src/amf/amfnd/avnd_cb.h | 1 + src/amf/amfnd/clc.cc | 10 ++++++++++ src/amf/amfnd/main.cc | 4 +++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h index 8b0cc2304..0fa0590ff 100644 --- a/src/amf/amfnd/avnd_cb.h +++ b/src/amf/amfnd/avnd_cb.h @@ -125,6 +125,7 @@ typedef struct avnd_cb_tag { SaTimeT scs_absence_max_duration; /* the timer for supervision of the absence of SC */ AVND_TMR sc_absence_tmr; + bool immnd_down; } AVND_CB; #define AVND_CB_NULL ((AVND_CB *)0) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index f78e1a707..227bf6a5a 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -3106,6 +3106,9 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp, unsigned int i; SaStringT env; size_t env_set_nmemb; + size_t comma = comp->saAmfCompType.find_last_of(","); + size_t end = comp->saAmfCompType.length(); + std::string compBaseType = comp->saAmfCompType.substr(comma + 1, end); TRACE_ENTER2("'%s':CLC CLI command type:'%s'", comp->name.c_str(), clc_cmd_type[cmd_type]); @@ -3333,6 +3336,13 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp, // outcome of command is reported in comp_clc_resp_callback() } + if (compBaseType.compare("safCompType=OpenSafCompTypeIMMND") == 0) { +if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP) + cb->immnd_down = true; +else if 
(cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE) + cb->immnd_down = false; + } + TRACE_2("success"); goto done; diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc index d7857fabe..447e2aa82 100644 --- a/src/amf/amfnd/main.cc +++ b/src/amf/amfnd/main.cc @@ -334,6 +334,7 @@ AVND_CB *avnd_cb_create() { cb->is_avd_down = true; cb->amfd_sync_required = false; + cb->immnd_down = false; // retrieve hydra configuration from IMM hydra_config_get(cb); @@ -609,7 +610,8 @@ void avnd_main_process(void) { exit(0); } -if (avnd_cb->clmHandle && (fds[FD_CLM].revents & POLLIN)) { +if (!avnd_cb->immnd_down && avnd_cb->clmHandle && +(fds[FD_CLM].revents & POLLIN)) { [GL] I think, in general, it's probably bad practise to skip an event when it is ready to be processed. This could end up in a tight loop, spiking CPU usage. // LOG_NO("DEBUG-> CLM event fd: %d sel_obj: %llu, clm handle: %llu", // fds[FD_CLM].fd, avnd_cb->clm_sel_obj, avnd_cb->clmHandle); result = saClmDispatch(avnd_cb->clmHandle, SA_DISPATCH_ALL); -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
- During split-brain recovery with headless mode enabled, an IMMND restart may be expected. If AMFND does not wait for the IMMND restart but reinitializes CLM, the CLM callback is triggered and clm_to_amf_node() is called; AMFND then gets stuck initializing IMM OM, which delays the IMMND restart and the resending of node_up, and AMFD then orders a node reboot. - Do not trigger saClmDispatch() if IMMND is down. --- src/amf/amfnd/avnd_cb.h | 1 + src/amf/amfnd/clc.cc | 10 ++++++++++ src/amf/amfnd/main.cc | 4 +++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h index 8b0cc2304..0fa0590ff 100644 --- a/src/amf/amfnd/avnd_cb.h +++ b/src/amf/amfnd/avnd_cb.h @@ -125,6 +125,7 @@ typedef struct avnd_cb_tag { SaTimeT scs_absence_max_duration; /* the timer for supervision of the absence of SC */ AVND_TMR sc_absence_tmr; + bool immnd_down; } AVND_CB; #define AVND_CB_NULL ((AVND_CB *)0) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index f78e1a707..227bf6a5a 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -3106,6 +3106,9 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp, unsigned int i; SaStringT env; size_t env_set_nmemb; + size_t comma = comp->saAmfCompType.find_last_of(","); + size_t end = comp->saAmfCompType.length(); + std::string compBaseType = comp->saAmfCompType.substr(comma + 1, end); TRACE_ENTER2("'%s':CLC CLI command type:'%s'", comp->name.c_str(), clc_cmd_type[cmd_type]); @@ -3333,6 +3336,13 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp, // outcome of command is reported in comp_clc_resp_callback() } + if (compBaseType.compare("safCompType=OpenSafCompTypeIMMND") == 0) { +if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP) + cb->immnd_down = true; +else if (cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE) + cb->immnd_down = false; + } + TRACE_2("success"); goto done; diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc index d7857fabe..447e2aa82 100644 --- a/src/amf/amfnd/main.cc +++ b/src/amf/amfnd/main.cc @@ -334,6 +334,7 @@ AVND_CB 
*avnd_cb_create() { cb->is_avd_down = true; cb->amfd_sync_required = false; + cb->immnd_down = false; // retrieve hydra configuration from IMM hydra_config_get(cb); @@ -609,7 +610,8 @@ void avnd_main_process(void) { exit(0); } -if (avnd_cb->clmHandle && (fds[FD_CLM].revents & POLLIN)) { +if (!avnd_cb->immnd_down && avnd_cb->clmHandle && +(fds[FD_CLM].revents & POLLIN)) { // LOG_NO("DEBUG-> CLM event fd: %d sel_obj: %llu, clm handle: %llu", // fds[FD_CLM].fd, avnd_cb->clm_sel_obj, avnd_cb->clmHandle); result = saClmDispatch(avnd_cb->clmHandle, SA_DISPATCH_ALL); -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel