Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
Hi Gary, This V2 has fixed the error reported in V1, ack from me. Thanks Minh On 12/9/19 5:20 pm, Gary Lee wrote: If delayed failover is enabled, and a downgrade to a version without #3060 occurs, then the standby running a newer version with #3060 may complain about an out of sync error during warm sync. --- src/amf/amfd/ckpt_dec.cc | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 6288b4f..75213f8 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt) LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u", updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt); -if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) - LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", - updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); - +if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", + updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); + } else { +// Versions before 10 did not support failover_updt +// After a downgrade scenario, where the active is < v10 +// and this node is >= v10, then there will be failover_updt mismatch +// If so, just set the value to what's on the older active +cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt; + +// check again +if (0 == memcmp(updt_cnt, &cb->async_updt_cnt, +sizeof(AVSV_ASYNC_UPDT_CNT))) { + cb->stby_sync_state = AVD_STBY_IN_SYNC; + return status; +} + } +} LOG_ER("Out of sync detected in warm sync response, exiting"); osafassert(0); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
If delayed failover is enabled, and a downgrade to a version without #3060 occurs, then the standby running a newer version with #3060 may complain about an out of sync error during warm sync. --- src/amf/amfd/ckpt_dec.cc | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 6288b4f..75213f8 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt) LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u", updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt); -if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) - LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", - updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); - +if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", + updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); + } else { +// Versions before 10 did not support failover_updt +// After a downgrade scenario, where the active is < v10 +// and this node is >= v10, then there will be failover_updt mismatch +// If so, just set the value to what's on the older active +cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt; + +// check again +if (0 == memcmp(updt_cnt, &cb->async_updt_cnt, +sizeof(AVSV_ASYNC_UPDT_CNT))) { + cb->stby_sync_state = AVD_STBY_IN_SYNC; + return status; +} + } +} LOG_ER("Out of sync detected in warm sync response, exiting"); osafassert(0); -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
If delayed failover is enabled, and a downgrade to a version without #3060 occurs, then the standby running a newer version with #3060 may complain about an out of sync error during warm sync. --- src/amf/amfd/ckpt_dec.cc | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 6288b4f..5d4b3f5 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt) LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u", updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt); -if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) - LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", - updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); - +if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", + updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); + } else { +// Versions before 10 did not support failover_updt +// After a downgrade scenario, where the active is < v10 +// and this node is >= v10, then there will be failover_updt mismatch +// If so, just set the value to what's on the older active +cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt; + +// check again +if (0 == memcmp(updt_cnt, &cb->async_updt_cnt, +sizeof(AVSV_ASYNC_UPDT_CNT))) { + cb->stby_sync_state = AVD_STBY_IN_SYNC; + return status; +} + } +} LOG_ER("Out of sync detected in warm sync response, exiting"); osafassert(0); -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
Hi Gary, The patch works fine in the reported scenario, no coredump in amfd. But after the downgrade succeeds (meaning sc1 is active and running the old software, and sc2 is standby running the latest software + #3078), I performed another switchover to make sc2 active again and got the errors shown in the logs below. Thanks Minh 2019-09-11 14:31:58.633 SC-2 osafamfd[280]: WA avsv_validate_reo_type_in_csync: unknown type 52 2019-09-11 14:31:58.674 SC-2 osafimmnd[234]: NO Implementer (applier) connected: 43 (@OpenSafImmReplicatorB) <0, 2010f> 2019-09-11 14:31:59.496 SC-2 osafimmnd[234]: NO Implementer disconnected 35 <0, 2010f> (safAmfService) 2019-09-11 14:31:59.500 SC-2 osafimmnd[234]: NO Implementer (applier) connected: 44 (@safAmfService2010f) <0, 2010f> 2019-09-11 14:31:59.524 SC-2 osafamfd[280]: NO Switching StandBy --> Active State 2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER Switch Standby --> Active FAILED, Standby OUT OF SYNC 2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER avd_role_change role change failure 2019-09-11 14:31:59.544 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:7, dest:13) 2019-09-11 14:31:59.547 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:7, dest:13) 2019-09-11 14:31:59.551 SC-2 osafamfnd[290]: NO AVD NEW_ACTIVE, adest:1 2019-09-11 14:31:59.563 SC-2 osafimmnd[234]: NO Implementer disconnected 44 <0, 2010f> (@safAmfService2010f) 2019-09-11 14:31:59.566 SC-2 osafimmnd[234]: NO Implementer connected: 45 (safAmfService) <0, 2010f> 2019-09-11 14:31:59.580 SC-2 osafamfd[280]: WA avsv_validate_reo_type_in_csync: unknown type 52 2019-09-11 14:32:09.626 SC-2 osafamfd[280]: message repeated 4 times: [ WA avsv_validate_reo_type_in_csync: unknown type 52] 2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 25 (change:4, dest:564114788998701) 2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:1, dest:13) 2019-09-11 14:32:59.776 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:6, dest:13) 2019-09-11 14:32:59.777 
SC-2 osaffmd[213]: NO IMMND down on: 2010f 2019-09-11 14:32:59.777 SC-2 osafimmnd[234]: WA DISCARD DUPLICATE FEVS message:2334 2019-09-11 14:32:59.778 SC-2 osafimmnd[234]: WA Error code 2 returned for message type 82 - ignoring 2019-09-11 14:32:59.778 SC-2 osafimmd[223]: WA IMMD lost contact with peer IMMD (NCSMDS_RED_DOWN) 2019-09-11 14:32:59.780 SC-2 osaffmd[213]: NO Node Down event for node id 2010f: 2019-09-11 14:32:59.780 SC-2 osafrded[204]: NO Peer down on node 0x2010f 2019-09-11 14:32:59.782 SC-2 osaffmd[213]: NO AMFND down on: 2010f 2019-09-11 14:32:59.783 SC-2 osaffmd[213]: NO FM down on: 2010f 2019-09-11 14:32:59.784 SC-2 osafamfd[280]: NO Node 'SC-1' is down. Start failover delay timer 2019-09-11 14:32:59.784 SC-2 osaffmd[213]: NO IMMD down on: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO AVD down on: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Core services went down on node_id: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Current role: STANDBY 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: Rebooting OpenSAF NodeId = 131343 EE Name = , Reason: Received Node Down for peer controller, OwnNodeId = 131599, SupervisionTime = 60 2019-09-11 14:32:59.789 SC-2 osafclmd[270]: NO Node 131343 went down. Not sending track callback for agents on that node 2019-09-11 14:32:59.792 SC-2 osafclmd[270]: message repeated 4 times: [ NO Node 131343 went down. 
Not sending track callback for agents on that node] 2019-09-11 14:32:59.792 SC-2 osafclmd[270]: NO saflog write "safNode=SC-1,safCluster=myClmCluster LEFT, init view=9, cluster view=10" FAILED: SA_AIS_ERR_TRY_AGAIN (6) 2019-09-11 14:32:59.792 SC-2 osafamfd[280]: NO Start timer for '2010f' 2019-09-11 14:32:59.808 SC-2 opensaf_reboot: Rebooting remote node in the absence of PLM is outside the scope of OpenSAF 2019-09-11 14:32:59.809 SC-2 osaffmd[213]: NO Controller Failover: Setting role to ACTIVE 2019-09-11 14:32:59.809 SC-2 osafrded[204]: NO RDE role set to ACTIVE 2019-09-11 14:32:59.810 SC-2 osafrded[204]: NO Running '/usr/local/lib/opensaf/opensaf_sc_active' with 0 argument(s) 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: NO FAILOVER StandBy --> Active 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: ER FAILOVER StandBy --> Active FAILED, Standby OUT OF SYNC 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: Rebooting OpenSAF NodeId = 0 EE Name = No EE Mapped, Reason: FAILOVER failed, OwnNodeId = 131599, SupervisionTime = 60 2019-09-11 14:31:58.181 SC-1 osafamfd[273]: NO ROLE SWITCH Active --> Quiesced 2019-09-11 14:31:58.675 SC-1 osafimmnd[233]: NO Implementer (applier) connected: 43 (@OpenSafImmReplicatorB) <269, 2010f> 2019-09-11 14:31:58.676 SC-1 osafntfimcnd[471]: NO Started 2019-09-11 14:31:59.496 SC-1 osafimmnd[233]: NO Implementer disconnected 35 <97, 2010f> (safAmfService) 2019-09-11 14:31:59.501 SC-1 osafimmnd[233]: NO Implementer (applier) connected: 44 (@safAmfService2010f) <97, 2010f> 2019-09-11 14:31:59.525 SC-1
[devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
If delayed failover is enabled, and a downgrade to a version without #3060 occurs, then the standby running a newer version with #3060 may complain about an out of sync error during warm sync. --- src/amf/amfd/ckpt_dec.cc | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 6288b4f..3c253d2 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2721,10 +2721,21 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt) LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u", updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt); -if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) - LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", - updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); - +if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", + updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); + } else { +// Versions before 10 did not support failover_updt +// After a downgrade scenario, where the active is < v10 +// and this node is >= v10, then there will be failover_updt mismatch +// If so, just set the value to what's on the older active +cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt; +// failover_updt must be the LAST comparison made, otherwise +// these if statements will need some refactoring +return status; + } +} LOG_ER("Out of sync detected in warm sync response, exiting"); osafassert(0); -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel