Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-09 Thread Minh Hon Chau

Hi Thuan,

ack from me.

Thanks

Minh

On 9/3/20 5:08 pm, thuan.tran wrote:

- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- Only call clm_to_amf_node() if the AMF node name is empty.
---
  src/amf/amfnd/clm.cc | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 06eb229c7..73c8ff83c 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -250,7 +250,7 @@ static void clm_track_cb(
memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode),
   sizeof(SaClmClusterNodeT_4));
/*get the amf node from clm node name */
-  clm_to_amf_node();
+  if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node();
avnd_send_node_up_msg();
avnd_cb->first_time_up = false;
  }



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-09 Thread Thang Duc Nguyen
Hi Thuan,

Ack from me (not tested).

B.R/Thang

-----Original Message-----
From: Thuan Tran  
Sent: Monday, March 9, 2020 1:08 PM
To: Thang Duc Nguyen ; Minh Hon Chau 
; Gary Lee 
Cc: opensaf-devel@lists.sourceforge.net; Thuan Tran 
Subject: [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery 
[#3162]

- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- Only call clm_to_amf_node() if the AMF node name is empty.
---
 src/amf/amfnd/clm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 06eb229c7..73c8ff83c 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -250,7 +250,7 @@ static void clm_track_cb(
   memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode),
  sizeof(SaClmClusterNodeT_4));
   /*get the amf node from clm node name */
-  clm_to_amf_node();
+  if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node();
   avnd_send_node_up_msg();
   avnd_cb->first_time_up = false;
 }
--
2.17.1



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-08 Thread thuan.tran
- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- Only call clm_to_amf_node() if the AMF node name is empty.
---
 src/amf/amfnd/clm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 06eb229c7..73c8ff83c 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -250,7 +250,7 @@ static void clm_track_cb(
   memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode),
  sizeof(SaClmClusterNodeT_4));
   /*get the amf node from clm node name */
-  clm_to_amf_node();
+  if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node();
   avnd_send_node_up_msg();
   avnd_cb->first_time_up = false;
 }
-- 
2.17.1



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-05 Thread thuan.tran
- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- In clm_to_amf_node(), use the old AMF node name if IMMND is down.
---
 src/amf/amfnd/avnd_cb.h | 1 +
 src/amf/amfnd/clc.cc    | 8 ++++++++
 src/amf/amfnd/clm.cc    | 7 ++++---
 src/amf/amfnd/main.cc   | 1 +
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h
index 8b0cc2304..0fa0590ff 100644
--- a/src/amf/amfnd/avnd_cb.h
+++ b/src/amf/amfnd/avnd_cb.h
@@ -125,6 +125,7 @@ typedef struct avnd_cb_tag {
   SaTimeT scs_absence_max_duration;
   /* the timer for supervision of the absence of SC */
   AVND_TMR sc_absence_tmr;
+  bool immnd_down;
 } AVND_CB;
 
 #define AVND_CB_NULL ((AVND_CB *)0)
diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index f78e1a707..f96f3b3a9 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -3333,6 +3333,14 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp,
 // outcome of command is reported in comp_clc_resp_callback()
   }
 
+  if (comp->su->is_ncs &&
+  comp->name.find("safComp=IMMND,") != std::string::npos) {
+if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP)
+  cb->immnd_down = true;
+else if (cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE)
+  cb->immnd_down = false;
+  }
+
   TRACE_2("success");
   goto done;
 
diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 06eb229c7..7fef38daa 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -106,7 +106,7 @@ done:
 }
 
 static void clm_to_amf_node(void) {
-  SaAisErrorT error;
+  SaAisErrorT error = SA_AIS_ERR_UNAVAILABLE;
   SaImmSearchHandleT searchHandle;
   SaNameT amfdn, clmdn;
   SaImmSearchParametersT_2 searchParam;
@@ -122,9 +122,10 @@ static void clm_to_amf_node(void) {
   searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT;
   searchParam.searchOneAttr.attrValue = &className;
 
-  error = saImmOmInitialize_cond(&immOmHandle, nullptr, &immVersion);
+  if (avnd_cb->immnd_down == false)
+error = saImmOmInitialize_cond(&immOmHandle, nullptr, &immVersion);
   if (SA_AIS_OK != error) {
-LOG_WA("saImmOmInitialize failed. Use previous value of nodeName.");
+LOG_WA("Use previous value of nodeName %s", avnd_cb->amf_nodeName.c_str());
 osafassert(avnd_cb->amf_nodeName.empty() == false);
 goto done1;
   }
diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc
index d7857fabe..cef0543e6 100644
--- a/src/amf/amfnd/main.cc
+++ b/src/amf/amfnd/main.cc
@@ -334,6 +334,7 @@ AVND_CB *avnd_cb_create() {
 
   cb->is_avd_down = true;
   cb->amfd_sync_required = false;
+  cb->immnd_down = false;
 
   // retrieve hydra configuration from IMM
   hydra_config_get(cb);
-- 
2.17.1



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-05 Thread Gary Lee
Hi Thuan

One comment inline with [GL].

Thanks
Gary


From: Thuan Tran 
Sent: 04 March 2020 18:28
To: Thang Duc Nguyen ; Minh Hon Chau 
; Gary Lee 
Cc: opensaf-devel@lists.sourceforge.net ; 
Thuan Tran 
Subject: [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery 
[#3162]

- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- Do not call saClmDispatch() while IMMND is down.
---
 src/amf/amfnd/avnd_cb.h |  1 +
 src/amf/amfnd/clc.cc    | 10 ++++++++++
 src/amf/amfnd/main.cc   |  4 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h
index 8b0cc2304..0fa0590ff 100644
--- a/src/amf/amfnd/avnd_cb.h
+++ b/src/amf/amfnd/avnd_cb.h
@@ -125,6 +125,7 @@ typedef struct avnd_cb_tag {
   SaTimeT scs_absence_max_duration;
   /* the timer for supervision of the absence of SC */
   AVND_TMR sc_absence_tmr;
+  bool immnd_down;
 } AVND_CB;

 #define AVND_CB_NULL ((AVND_CB *)0)
diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index f78e1a707..227bf6a5a 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -3106,6 +3106,9 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp,
   unsigned int i;
   SaStringT env;
   size_t env_set_nmemb;
+  size_t comma = comp->saAmfCompType.find_last_of(",");
+  size_t end = comp->saAmfCompType.length();
+  std::string compBaseType = comp->saAmfCompType.substr(comma + 1, end);

   TRACE_ENTER2("'%s':CLC CLI command type:'%s'", comp->name.c_str(),
clc_cmd_type[cmd_type]);
@@ -3333,6 +3336,13 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp,
 // outcome of command is reported in comp_clc_resp_callback()
   }

+  if (compBaseType.compare("safCompType=OpenSafCompTypeIMMND") == 0) {
+if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP)
+  cb->immnd_down = true;
+else if (cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE)
+  cb->immnd_down = false;
+  }
+
   TRACE_2("success");
   goto done;

diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc
index d7857fabe..447e2aa82 100644
--- a/src/amf/amfnd/main.cc
+++ b/src/amf/amfnd/main.cc
@@ -334,6 +334,7 @@ AVND_CB *avnd_cb_create() {

   cb->is_avd_down = true;
   cb->amfd_sync_required = false;
+  cb->immnd_down = false;

   // retrieve hydra configuration from IMM
   hydra_config_get(cb);
@@ -609,7 +610,8 @@ void avnd_main_process(void) {
   exit(0);
 }

-if (avnd_cb->clmHandle && (fds[FD_CLM].revents & POLLIN)) {
+if (!avnd_cb->immnd_down && avnd_cb->clmHandle &&
+(fds[FD_CLM].revents & POLLIN)) {

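[GL] I think, in general, it's probably bad practice to skip an event when it
is ready to be processed. Because the pending CLM event is never consumed, the
fd stays readable and poll() returns immediately every time, so this could end
up in a tight loop, spiking CPU usage.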

   // LOG_NO("DEBUG-> CLM event fd: %d sel_obj: %llu, clm handle: %llu",
   // fds[FD_CLM].fd, avnd_cb->clm_sel_obj, avnd_cb->clmHandle);
   result = saClmDispatch(avnd_cb->clmHandle, SA_DISPATCH_ALL);
--
2.17.1


___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


[devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-03 Thread thuan.tran
- During split-brain recovery with headless mode enabled, an IMMND restart may
be expected. If AMFND does not wait for IMMND to restart but re-initializes CLM,
the CLM callback is triggered and clm_to_amf_node() is called. AMFND then gets
stuck initializing IMM OM, which delays the IMMND restart and the resending of
node_up, so AMFD orders a node reboot.
- Do not call saClmDispatch() while IMMND is down.
---
 src/amf/amfnd/avnd_cb.h |  1 +
 src/amf/amfnd/clc.cc    | 10 ++++++++++
 src/amf/amfnd/main.cc   |  4 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h
index 8b0cc2304..0fa0590ff 100644
--- a/src/amf/amfnd/avnd_cb.h
+++ b/src/amf/amfnd/avnd_cb.h
@@ -125,6 +125,7 @@ typedef struct avnd_cb_tag {
   SaTimeT scs_absence_max_duration;
   /* the timer for supervision of the absence of SC */
   AVND_TMR sc_absence_tmr;
+  bool immnd_down;
 } AVND_CB;
 
 #define AVND_CB_NULL ((AVND_CB *)0)
diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index f78e1a707..227bf6a5a 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -3106,6 +3106,9 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp,
   unsigned int i;
   SaStringT env;
   size_t env_set_nmemb;
+  size_t comma = comp->saAmfCompType.find_last_of(",");
+  size_t end = comp->saAmfCompType.length();
+  std::string compBaseType = comp->saAmfCompType.substr(comma + 1, end);
 
   TRACE_ENTER2("'%s':CLC CLI command type:'%s'", comp->name.c_str(),
clc_cmd_type[cmd_type]);
@@ -3333,6 +3336,13 @@ uint32_t avnd_comp_clc_cmd_execute(AVND_CB *cb, AVND_COMP *comp,
 // outcome of command is reported in comp_clc_resp_callback()
   }
 
+  if (compBaseType.compare("safCompType=OpenSafCompTypeIMMND") == 0) {
+if (cmd_type == AVND_COMP_CLC_CMD_TYPE_CLEANUP)
+  cb->immnd_down = true;
+else if (cmd_type == AVND_COMP_CLC_CMD_TYPE_INSTANTIATE)
+  cb->immnd_down = false;
+  }
+
   TRACE_2("success");
   goto done;
 
diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc
index d7857fabe..447e2aa82 100644
--- a/src/amf/amfnd/main.cc
+++ b/src/amf/amfnd/main.cc
@@ -334,6 +334,7 @@ AVND_CB *avnd_cb_create() {
 
   cb->is_avd_down = true;
   cb->amfd_sync_required = false;
+  cb->immnd_down = false;
 
   // retrieve hydra configuration from IMM
   hydra_config_get(cb);
@@ -609,7 +610,8 @@ void avnd_main_process(void) {
   exit(0);
 }
 
-if (avnd_cb->clmHandle && (fds[FD_CLM].revents & POLLIN)) {
+if (!avnd_cb->immnd_down && avnd_cb->clmHandle &&
+(fds[FD_CLM].revents & POLLIN)) {
   // LOG_NO("DEBUG-> CLM event fd: %d sel_obj: %llu, clm handle: %llu",
   // fds[FD_CLM].fd, avnd_cb->clm_sel_obj, avnd_cb->clmHandle);
   result = saClmDispatch(avnd_cb->clmHandle, SA_DISPATCH_ALL);
-- 
2.17.1



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel