Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-12 Thread Minh Hon Chau

Hi Gary,

This V2 has fixed the error reported in V1, ack from me.

Thanks

Minh

On 12/9/19 5:20 pm, Gary Lee wrote:

If delayed failover is enabled, and a downgrade to a version without #3060 
occurs,
then the standby running a newer version with #3060 may complain about an out
of sync error during warm sync.
---
  src/amf/amfd/ckpt_dec.cc | 23 +++
  1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 6288b4f..75213f8 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
  if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt)
LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u",
   updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt);
-if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt)
-  LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
- updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
-
+if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
+   updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
+  } else {
+// Versions before 10 did not support failover_updt
+// After a downgrade scenario, where the active is < v10
+// and this node is >= v10, then there will be failover_updt mismatch
+// If so, just set the value to what's on the older active
+cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt;
+
+// check again
+if (0 == memcmp(updt_cnt, &cb->async_updt_cnt,
+sizeof(AVSV_ASYNC_UPDT_CNT))) {
+  cb->stby_sync_state = AVD_STBY_IN_SYNC;
+  return status;
+}
+  }
+}
  LOG_ER("Out of sync detected in warm sync response, exiting");
  osafassert(0);
  



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-12 Thread Gary Lee
If delayed failover is enabled, and a downgrade to a version without #3060 
occurs,
then the standby running a newer version with #3060 may complain about an out
of sync error during warm sync.
---
 src/amf/amfd/ckpt_dec.cc | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 6288b4f..75213f8 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
 if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt)
   LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u",
  updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt);
-if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt)
-  LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
- updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
-
+if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
+   updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
+  } else {
+// Versions before 10 did not support failover_updt
+// After a downgrade scenario, where the active is < v10
+// and this node is >= v10, then there will be failover_updt mismatch
+// If so, just set the value to what's on the older active
+cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt;
+
+// check again
+if (0 == memcmp(updt_cnt, &cb->async_updt_cnt,
+sizeof(AVSV_ASYNC_UPDT_CNT))) {
+  cb->stby_sync_state = AVD_STBY_IN_SYNC;
+  return status;
+}
+  }
+}
 LOG_ER("Out of sync detected in warm sync response, exiting");
 osafassert(0);
 
-- 
2.7.4



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-12 Thread Gary Lee
If delayed failover is enabled, and a downgrade to a version without #3060 
occurs,
then the standby running a newer version with #3060 may complain about an out
of sync error during warm sync.
---
 src/amf/amfd/ckpt_dec.cc | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 6288b4f..5d4b3f5 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
 if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt)
   LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u",
  updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt);
-if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt)
-  LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
- updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
-
+if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
+   updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
+  } else {
+// Versions before 10 did not support failover_updt
+// After a downgrade scenario, where the active is < v10
+// and this node is >= v10, then there will be failover_updt mismatch
+// If so, just set the value to what's on the older active
+cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt;
+
+// check again
+if (0 == memcmp(updt_cnt, &cb->async_updt_cnt,
+sizeof(AVSV_ASYNC_UPDT_CNT))) {
+  cb->stby_sync_state = AVD_STBY_IN_SYNC;
+  return status;
+}
+  }
+}
 LOG_ER("Out of sync detected in warm sync response, exiting");
 osafassert(0);
 
-- 
2.7.4



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-10 Thread Minh Hon Chau

Hi Gary,

The patch works fine in the reported scenario, no coredump in amfd.

But after the downgrade succeeds (meaning sc1 is active and running the old 
software, and sc2 is standby running the latest software + #3078), I 
performed another switchover to make sc2 active again, and got the error below.


Thanks

Minh

2019-09-11 14:31:58.633 SC-2 osafamfd[280]: WA 
avsv_validate_reo_type_in_csync: unknown type 52
2019-09-11 14:31:58.674 SC-2 osafimmnd[234]: NO Implementer (applier) 
connected: 43 (@OpenSafImmReplicatorB) <0, 2010f>
2019-09-11 14:31:59.496 SC-2 osafimmnd[234]: NO Implementer disconnected 
35 <0, 2010f> (safAmfService)
2019-09-11 14:31:59.500 SC-2 osafimmnd[234]: NO Implementer (applier) 
connected: 44 (@safAmfService2010f) <0, 2010f>
2019-09-11 14:31:59.524 SC-2 osafamfd[280]: NO Switching StandBy --> 
Active State
2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER Switch Standby --> Active 
FAILED, Standby OUT OF SYNC
2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER avd_role_change role 
change failure
2019-09-11 14:31:59.544 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:7, dest:13)
2019-09-11 14:31:59.547 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:7, dest:13)

2019-09-11 14:31:59.551 SC-2 osafamfnd[290]: NO AVD NEW_ACTIVE, adest:1
2019-09-11 14:31:59.563 SC-2 osafimmnd[234]: NO Implementer disconnected 
44 <0, 2010f> (@safAmfService2010f)
2019-09-11 14:31:59.566 SC-2 osafimmnd[234]: NO Implementer connected: 
45 (safAmfService) <0, 2010f>
2019-09-11 14:31:59.580 SC-2 osafamfd[280]: WA 
avsv_validate_reo_type_in_csync: unknown type 52
2019-09-11 14:32:09.626 SC-2 osafamfd[280]: message repeated 4 times: [ 
WA avsv_validate_reo_type_in_csync: unknown type 52]
2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 25 
(change:4, dest:564114788998701)
2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:1, dest:13)
2019-09-11 14:32:59.776 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:6, dest:13)

2019-09-11 14:32:59.777 SC-2 osaffmd[213]: NO IMMND down on: 2010f
2019-09-11 14:32:59.777 SC-2 osafimmnd[234]: WA DISCARD DUPLICATE FEVS 
message:2334
2019-09-11 14:32:59.778 SC-2 osafimmnd[234]: WA Error code 2 returned 
for message type 82 - ignoring
2019-09-11 14:32:59.778 SC-2 osafimmd[223]: WA IMMD lost contact with 
peer IMMD (NCSMDS_RED_DOWN)
2019-09-11 14:32:59.780 SC-2 osaffmd[213]: NO Node Down event for node 
id 2010f:

2019-09-11 14:32:59.780 SC-2 osafrded[204]: NO Peer down on node 0x2010f
2019-09-11 14:32:59.782 SC-2 osaffmd[213]: NO AMFND down on: 2010f
2019-09-11 14:32:59.783 SC-2 osaffmd[213]: NO FM down on: 2010f
2019-09-11 14:32:59.784 SC-2 osafamfd[280]: NO Node 'SC-1' is down. 
Start failover delay timer

2019-09-11 14:32:59.784 SC-2 osaffmd[213]: NO IMMD down on: 2010f
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO AVD down on: 2010f
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Core services went down on 
node_id: 2010f

2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Current role: STANDBY
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: Rebooting OpenSAF NodeId = 
131343 EE Name = , Reason: Received Node Down for peer controller, 
OwnNodeId = 131599, SupervisionTime = 60
2019-09-11 14:32:59.789 SC-2 osafclmd[270]: NO Node 131343 went down. 
Not sending track callback for agents on that node
2019-09-11 14:32:59.792 SC-2 osafclmd[270]: message repeated 4 times: [ 
NO Node 131343 went down. Not sending track callback for agents on that 
node]
2019-09-11 14:32:59.792 SC-2 osafclmd[270]: NO saflog write 
"safNode=SC-1,safCluster=myClmCluster LEFT, init view=9, cluster 
view=10" FAILED: SA_AIS_ERR_TRY_AGAIN (6)

2019-09-11 14:32:59.792 SC-2 osafamfd[280]: NO Start timer for '2010f'
2019-09-11 14:32:59.808 SC-2 opensaf_reboot: Rebooting remote node in 
the absence of PLM is outside the scope of OpenSAF
2019-09-11 14:32:59.809 SC-2 osaffmd[213]: NO Controller Failover: 
Setting role to ACTIVE

2019-09-11 14:32:59.809 SC-2 osafrded[204]: NO RDE role set to ACTIVE
2019-09-11 14:32:59.810 SC-2 osafrded[204]: NO Running 
'/usr/local/lib/opensaf/opensaf_sc_active' with 0 argument(s)

2019-09-11 14:32:59.812 SC-2 osafamfd[280]: NO FAILOVER StandBy --> Active
2019-09-11 14:32:59.812 SC-2 osafamfd[280]: ER FAILOVER StandBy --> 
Active FAILED, Standby OUT OF SYNC
2019-09-11 14:32:59.812 SC-2 osafamfd[280]: Rebooting OpenSAF NodeId = 0 
EE Name = No EE Mapped, Reason: FAILOVER failed, OwnNodeId = 131599, 
SupervisionTime = 60


2019-09-11 14:31:58.181 SC-1 osafamfd[273]: NO ROLE SWITCH Active --> 
Quiesced
2019-09-11 14:31:58.675 SC-1 osafimmnd[233]: NO Implementer (applier) 
connected: 43 (@OpenSafImmReplicatorB) <269, 2010f>

2019-09-11 14:31:58.676 SC-1 osafntfimcnd[471]: NO Started
2019-09-11 14:31:59.496 SC-1 osafimmnd[233]: NO Implementer disconnected 
35 <97, 2010f> (safAmfService)
2019-09-11 14:31:59.501 SC-1 osafimmnd[233]: NO Implementer (applier) 
connected: 44 (@safAmfService2010f) <97, 2010f>
2019-09-11 14:31:59.525 SC-1 

[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-08 Thread Gary Lee
If delayed failover is enabled, and a downgrade to a version without #3060 
occurs,
then the standby running a newer version with #3060 may complain about an out
of sync error during warm sync.
---
 src/amf/amfd/ckpt_dec.cc | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 6288b4f..3c253d2 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2721,10 +2721,21 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
 if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt)
   LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u",
  updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt);
-if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt)
-  LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
- updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
-
+if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
+   updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
+  } else {
+// Versions before 10 did not support failover_updt
+// After a downgrade scenario, where the active is < v10
+// and this node is >= v10, then there will be failover_updt mismatch
+// If so, just set the value to what's on the older active
+cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt;
+// failover_updt must be the LAST comparison made, otherwise
+// these if statements will need some refactoring
+return status;
+  }
+}
 LOG_ER("Out of sync detected in warm sync response, exiting");
 osafassert(0);
 
-- 
2.7.4



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel