Re: [devel] [PATCH 1/1] osaf: support compile with gcc/g++ 10 [#3307]
Hi Thang, I think it would be nice you can separate two commits, one for test, one for gcc/g++ 10 code changes. Thanks, Minh From: Thang Duc Nguyen Sent: Wednesday, March 16, 2022 11:44 AM To: Hieu Hong Hoang ; Thien Minh Huynh ; Minh Hon Chau Cc: opensaf-devel@lists.sourceforge.net ; Thang Duc Nguyen Subject: [PATCH 1/1] osaf: support compile with gcc/g++ 10 [#3307] - Fix error to support gcc/g++ 10. - Fix memleak in api test. --- src/ckpt/agent/cpa_cb.h | 2 +- src/ckpt/apitest/test_cpa.c | 2 + src/ckpt/apitest/test_cpa_util.c | 13 ++- src/ckpt/apitest/test_cpsv_conf.h | 2 +- src/ckpt/ckptd/cpd_amf.c | 1 - src/ckpt/ckptd/cpd_init.h | 2 +- src/evt/agent/eda.h | 2 +- src/evt/apitest/tet_eda.c | 32 src/evt/apitest/tet_eda.h | 80 --- src/evt/apitest/tet_edsv_func.c | 1 + src/evt/evtd/eds.h| 2 +- src/evt/evtd/eds_amf.h| 6 +- src/evt/evtd/eds_cb.h | 2 +- .../test_saImmOmThreadInterference.c | 4 +- src/imm/immd/immd.h | 2 +- src/lck/apitest/tet_gld.c | 1 - src/lck/apitest/tet_glnd.c| 2 - src/lck/lckd/gld_dl_api.h | 2 +- src/lck/lcknd/glnd_cb.h | 4 +- src/log/apitest/logtest.c | 6 ++ src/log/apitest/logtest.h | 6 +- src/log/logd/lgs_dest.cc | 4 +- src/mds/apitest/mdstipc.h | 30 +++ src/mds/apitest/mdstipc_api.c | 17 src/mds/mds_core.h| 30 +++ src/mds/mds_dt_common.c | 3 + src/mds/mds_dt_tcp.c | 3 +- src/mds/mds_dt_tcp.h | 2 +- src/mds/mds_dt_tipc.c | 2 - src/mds/mds_main.c| 47 +++ src/msg/msgnd/mqnd_db.h | 2 +- tools/devel/fenced/node_state_hdlr_pl.cc | 1 + 32 files changed, 224 insertions(+), 91 deletions(-) diff --git a/src/ckpt/agent/cpa_cb.h b/src/ckpt/agent/cpa_cb.h index ac48c6c4f..d6335830f 100644 --- a/src/ckpt/agent/cpa_cb.h +++ b/src/ckpt/agent/cpa_cb.h @@ -119,7 +119,7 @@ typedef struct cpa_cb { } CPA_CB; -uint32_t gl_cpa_hdl; +extern uint32_t gl_cpa_hdl; typedef struct cpa_prcess_evt_sync { NCS_QELEM qelem; diff --git a/src/ckpt/apitest/test_cpa.c b/src/ckpt/apitest/test_cpa.c index 6c37e91d5..0093b91ea 100644 --- a/src/ckpt/apitest/test_cpa.c +++ 
b/src/ckpt/apitest/test_cpa.c @@ -364,6 +364,7 @@ void fill_testcase_data() *(ckpt_name + length) = '.'; saAisNameLend(ckpt_name, _replicas_ckpt_with_valid_extended_name_length); + free(ckpt_name); ckpt_name = malloc(INVALID_EXTENDED_NAME_LENGTH); memset(ckpt_name, 0, INVALID_EXTENDED_NAME_LENGTH); @@ -374,6 +375,7 @@ void fill_testcase_data() *(ckpt_name + length) = '.'; saAisNameLend(ckpt_name, _replicas_ckpt_with_invalid_extended_name_length); + free(ckpt_name); /* Variables for sec create */ tcd.sec_id1 = (SaUint8T *)"11"; diff --git a/src/ckpt/apitest/test_cpa_util.c b/src/ckpt/apitest/test_cpa_util.c index 474e76f0d..7da36e0c1 100644 --- a/src/ckpt/apitest/test_cpa_util.c +++ b/src/ckpt/apitest/test_cpa_util.c @@ -24,6 +24,7 @@ extern const char *saf_error_string[]; int gl_try_again_cnt; int gl_tmout_cnt; int gl_sync_pointnum; +NCSCONTEXT gl_task_hdl = NULL; int tmoutFlag; int cpsv_test_result(SaAisErrorT rc, SaAisErrorT exp_out, char *test_case, @@ -651,23 +652,24 @@ void selection_thread_blocking(NCSCONTEXT arg) m_TEST_CPSV_PRINTF("\n Dispatching FAILED %d \n", rc); else m_TEST_CPSV_PRINTF("\n Thread selected \n"); + m_NCS_TASK_RELEASE(gl_task_hdl); } void cpsv_createthread(SaCkptHandleT *cl_hdl) { SaAisErrorT rc; - NCSCONTEXT thread_handle; rc = m_NCS_TASK_CREATE((NCS_OS_CB)selection_thread_blocking, (NCSCONTEXT)cl_hdl, "cpsv_block_test", 0, - SCHED_OTHER, 8000, _handle); + SCHED_OTHER, 8000, _task_hdl); if (rc != NCSCC_RC_SUCCESS) { m_TEST_CPSV_PRINTF(" Failed to create thread\n"); return; } - rc = m_NCS_TASK_START(thread_handle); + rc = m_NCS_TASK_START(gl_task_hdl); if (rc != NCSCC_RC_SUCCESS) { + m_NCS_TASK_RELEASE(gl_task_hdl); m_TEST_CPSV_PRINTF(" Failed to start thread\n");
Re: [devel] [PATCH 1/1] amf: correct behavior SU restart [#3233]
Hi Thang Ack from me Thanks Minh Get Outlook for iOS<https://aka.ms/o0ukef> From: Thang Duc Nguyen Sent: Tuesday, November 10, 2020 7:58:04 PM To: Minh Hon Chau ; Thuan Tran Cc: opensaf-devel@lists.sourceforge.net ; Thang Duc Nguyen Subject: [PATCH 1/1] amf: correct behavior SU restart [#3233] During standby SU restarts, active SU is failover. The standby SU need re-assignment standby then take over active assignment. This is to correct the issue in the ticket #3207. --- src/amf/amfnd/comp.cc | 3 +-- src/amf/amfnd/susm.cc | 55 +-- 2 files changed, 17 insertions(+), 41 deletions(-) diff --git a/src/amf/amfnd/comp.cc b/src/amf/amfnd/comp.cc index f1e33c372..d805346bb 100644 --- a/src/amf/amfnd/comp.cc +++ b/src/amf/amfnd/comp.cc @@ -1083,8 +1083,7 @@ uint32_t avnd_comp_csi_assign(AVND_CB *cb, AVND_COMP *comp, if (curr_csi->curr_assign_state == AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED && curr_csi->prv_assign_state == - AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED && - !m_AVND_SU_IS_RESTART(comp->su)) { + AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED) { // Mark suspending_assignment for all unassigned csi(s) which are // going to be assigned to *curr_csi->comp* for (t_csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET( diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc index d8ef66ea2..80b35ea8f 100644 --- a/src/amf/amfnd/susm.cc +++ b/src/amf/amfnd/susm.cc @@ -306,18 +306,15 @@ uint32_t avnd_su_siq_prc(AVND_CB *cb, AVND_SU *su) { return rc; } + /* unlink the buffered msg from the queue */ + ncs_db_link_list_delink(>siq, >su_dll_node); + /* initiate si asignment / removal */ rc = avnd_su_si_msg_prc(cb, su, >info); - // Siq will used to su-si respond later - // in case modify SU-SI during SURestart - if ((siq->info.msg_act != AVSV_SUSI_ACT_MOD) || - !m_AVND_SU_IS_RESTART(su)) { -/* unlink the buffered msg from the queue */ -ncs_db_link_list_delink(>siq, >su_dll_node); -/* delete the buffered msg */ -avnd_su_siq_rec_del(cb, su, siq); - } + /* delete the buffered msg */ + 
avnd_su_siq_rec_del(cb, su, siq); + TRACE_LEAVE2("%u", rc); return rc; } @@ -1134,7 +1131,6 @@ static bool container_contained_shutdown(const AVND_SU *su) { uint32_t avnd_su_si_oper_done(AVND_CB *cb, AVND_SU *su, AVND_SU_SI_REC *si) { AVND_SU_SI_REC *curr_si = 0; AVND_COMP_CSI_REC *curr_csi = 0, *t_csi = 0; - AVND_SU_SIQ_REC *siq = 0; uint32_t rc = NCSCC_RC_SUCCESS; bool opr_done; @@ -1212,18 +1208,6 @@ uint32_t avnd_su_si_oper_done(AVND_CB *cb, AVND_SU *su, AVND_SU_SI_REC *si) { if (NCSCC_RC_SUCCESS != rc) goto done; } - // Modify event during SURestart should be respond - siq = reinterpret_cast(m_NCS_DBLIST_FIND_LAST(>siq)); - if (siq && (siq->info.msg_act == AVSV_SUSI_ACT_MOD) && - m_AVND_SU_IS_RESTART(su)) { - ncs_db_link_list_delink(>siq, >su_dll_node); - /* delete the buffered msg */ - avnd_su_siq_rec_del(avnd_cb, su, siq); - rc = avnd_di_susi_resp_send(cb, su, - m_AVND_SU_IS_ALL_SI(su) ? nullptr : si); - if (NCSCC_RC_SUCCESS != rc) goto done; - } - if (si && (cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_INITIATED)) { (void)avnd_evt_last_step_term_evh(cb, nullptr); } else if (si && @@ -1713,23 +1697,16 @@ static uint32_t pi_su_instantiating_to_instantiated(AVND_SU *su) { /* reset the su failed flag & set the oper state to enabled */ m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_ENABLED); TRACE("Setting the Oper state to Enabled"); - -AVND_SU_SIQ_REC *siq = 0; -siq = reinterpret_cast(m_NCS_DBLIST_FIND_LAST(>siq)); -if (siq && (siq->info.msg_act == AVSV_SUSI_ACT_MOD)) { - rc = avnd_su_siq_prc(avnd_cb, su); -} else { - /* - * reassign all the sis... - * it's possible that the si was never assigned. send su-oper - * enable msg instead. - */ - if (su->si_list.n_nodes) -rc = avnd_su_si_reassign(avnd_cb, su); - else { -rc = avnd_di_oper_send(avnd_cb, su, 0); -reset_suRestart_flag(su); - } +/* + * reassign all the sis... + * it's possible that the si was never assigned. send su-oper + * enable msg instead. 
+ */ +if (su->si_list.n_nodes) + rc = avnd_su_si_reassign(avnd_cb, su); +else { + rc = avnd_di_oper_send(avnd_cb, su, 0); + reset_suRestart_flag(su); } su->admin_op_Id = static_cast(0); } else { -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no space error [#3181]
Yes right, I was thinking of a 64bit value. Get Outlook for iOS<https://aka.ms/o0ukef> From: Thuan Tran Sent: Monday, May 4, 2020 8:12:43 PM To: Minh Hon Chau ; Thang Duc Nguyen Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no space error [#3181] Hi Minh, Regarding the check for max uint32, I think it's not necessary, because atoi() returns the converted integral number as an int value. It cannot be bigger than the max of uint32. Best Regards, Thuan From: Minh Hon Chau Sent: Monday, May 4, 2020 12:37 PM To: Thuan Tran ; Thang Duc Nguyen Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no space error [#3181] Hi Thuan Ack with comment. I think we need to check the max value of uint32_t for ntf_var_data_limit when we source from the env var. Thanks Minh On 27/4/20 9:05 pm, thuan.tran wrote: > - Support NTFA_VARIABLE_DATA_LIMIT configuration for NTF Agent. > Default value is SHRT_MAX(32767). > - In a system where object creation may have many info attributes/values, > this env variable should be configured to a suitable value so that ntfimcn > is able to send notifications. 
> --- > src/ntf/agent/ntfa_util.c | 13 - > src/ntf/ntfd/ntfd.conf | 4 > src/ntf/ntfimcnd/ntfimcn_imm.c | 18 -- > 3 files changed, 28 insertions(+), 7 deletions(-) > > diff --git a/src/ntf/agent/ntfa_util.c b/src/ntf/agent/ntfa_util.c > index 5bc859259..379348ab5 100644 > --- a/src/ntf/agent/ntfa_util.c > +++ b/src/ntf/agent/ntfa_util.c > @@ -60,8 +60,19 @@ static unsigned int ntfa_create(void) >/* No longer needed */ >m_NCS_SEL_OBJ_DESTROY(_cb.ntfs_sync_sel); > > - /* TODO: fix env variable */ > + char *ptr = NULL; > + int optval = 0; >ntfa_cb.ntf_var_data_limit = NTFA_VARIABLE_DATA_LIMIT; > + if ((ptr = getenv("NTFA_VARIABLE_DATA_LIMIT")) != NULL) { > + optval = atoi(ptr); > + if (optval > 0) { > + ntfa_cb.ntf_var_data_limit = optval; > + LOG_NO("NTFA_VARIABLE_DATA_LIMIT=%d", optval); > + } else { > + LOG_WA("Invalid NTFA_VARIABLE_DATA_LIMIT, using default > %d", > +NTFA_VARIABLE_DATA_LIMIT); > + } > + } >return rc; > > error: > diff --git a/src/ntf/ntfd/ntfd.conf b/src/ntf/ntfd/ntfd.conf > index 91bfcd2e2..f2f67496f 100644 > --- a/src/ntf/ntfd/ntfd.conf > +++ b/src/ntf/ntfd/ntfd.conf > @@ -24,6 +24,10 @@ export NTFSV_ENV_HEALTHCHECK_KEY="Default" > # directory and the directory component of the path name (if any) is > ignored. > #export NTFSCN_TRACE_PATHNAME=osafntfcn > > +# Uncomment the next line to configure max allowed variable data size for the > +# osafntfcn (configuration notifier). Default value is 32767 bytes > +#export NTFA_VARIABLE_DATA_LIMIT=32767 > + > # Only log priority LOG_WARNING and higher to the system log file. > # All logging will be recorded in a new node local log file > $PKGLOGDIR/osaf.log. > # Uncomment the next line to enable this service to log to OpenSAF node > local log file. 
> diff --git a/src/ntf/ntfimcnd/ntfimcn_imm.c b/src/ntf/ntfimcnd/ntfimcn_imm.c > index c58e8a268..3f2c1a873 100644 > --- a/src/ntf/ntfimcnd/ntfimcn_imm.c > +++ b/src/ntf/ntfimcnd/ntfimcn_imm.c > @@ -680,8 +680,10 @@ static void saImmOiCcbApplyCallback(SaImmOiHandleT > immOiHandle, >ccbUtilOperationData, rdn_attr_name, ccbLast); >if (internal_rc != 0) { >LOG_ER( > - "%s send_object_create_notification fail", > - __FUNCTION__); > + "%s send_object_create_notification %s > fail", > + __FUNCTION__, > + osaf_extended_name_borrow( > + >objectName)); >goto done; >} >break; > @@ -706,8 +708,10 @@ static void saImmOiCcbApplyCallback(SaImmOiHandleT > immOiHandle, >ccbUtilOperationData, invoke_name_ptr, ccbLast); >if (internal_rc != 0) { >LOG_ER( > - "%s send_object_delete_notification fail", > - __FUNCTION__); > + "%s send_object_delete_no
Re: [devel] [PATCH 1/1] osaf: enhance vm frozen detection in tcp.plugin [#3164]
Hi Thuan, I'm adding Thanh since he's looking at the patch as well. I see you pushed the patch, here some late comments. Thanks Minh On 9/3/20 4:49 pm, thuan.tran wrote: - Active SC will reboot if arb time somehow has big gap b/w heartbeats in watch takeover request. Active SC may still OK but be rebooted unexpectedly. - Enhance VM was frozen detection base on arb time and local time counter. [M]: The patch has a general solution for both vm and container, and running a counter thread stead of reading time.time(), we need to explain it with a bit more details. --- src/osaf/consensus/plugins/tcp/tcp.plugin | 43 ++- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/osaf/consensus/plugins/tcp/tcp.plugin b/src/osaf/consensus/plugins/tcp/tcp.plugin index 0be20fcee..aaa1c1c3f 100755 --- a/src/osaf/consensus/plugins/tcp/tcp.plugin +++ b/src/osaf/consensus/plugins/tcp/tcp.plugin @@ -23,8 +23,24 @@ import sys import time import xmlrpc.client import syslog +import threading +counter_run = False +counter_time = 0.0 + +def time_counting(hb_interval): +''' +When node is frozen, if it is VM, clock time not jump +but if it is container, clock time still jump. +This function to help know node is frozen or arbitrator server issue +''' +global counter_run, counter_time +counter_time = 0.0 +while (counter_run): +time.sleep(hb_interval) +counter_time += hb_interval + class ArbitratorPlugin(object): """ This class represents a TCP Plugin """ @@ -478,6 +494,8 @@ class ArbitratorPlugin(object): return ret last_arb_timestamp = 0 +global counter_run, counter_time +counter = None while True: if key == self.takeover_request: if self.is_active() is False: @@ -486,15 +504,24 @@ class ArbitratorPlugin(object): while True: try: time_at_arb = self.proxy.heartbeat(self.hostname) -if last_arb_timestamp == 0: -last_arb_timestamp = time_at_arb -break -elif (time_at_arb - last_arb_timestamp) > self.timeout: -# VM was frozen? 
-syslog.syslog('VM was frozen!') -ret['code'] = 126 -return ret +if counter is not None: +counter_run = False +counter.join() +if (last_arb_timestamp != 0) and \ + (time_at_arb - last_arb_timestamp > self.timeout): +if counter_time < self.timeout: +syslog.syslog('VM was frozen!') +ret['code'] = 126 +return ret +syslog.syslog('Arb server issue?') +raise socket.error('Arb server issue?') else: +counter = threading.Thread( +target=time_counting, +args=(self.heartbeat_interval,)) +counter_run = True +counter.setDaemon(True) +counter.start() [M] What it means to we are going to start the thread, and wait for it join() back multiple times in this while loop. last_arb_timestamp = time_at_arb break except socket.error: ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]
Hi Thuan, ack from me. Thanks Minh On 9/3/20 5:08 pm, thuan.tran wrote: - Split-brain recovery in headless enable, IMMND may expected restart. If AMFND not wait IMMND restart but reinit CLM, CLM callback trigger, clm_to_amf_node() is called then AMFND stuck in init IMM OM causes delay restart IMMND, delay resend node_up then AMFD will order reboot node. - Only call clm_to_amf_node() if amf node name is empty. --- src/amf/amfnd/clm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc index 06eb229c7..73c8ff83c 100644 --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -250,7 +250,7 @@ static void clm_track_cb( memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode), sizeof(SaClmClusterNodeT_4)); /*get the amf node from clm node name */ - clm_to_amf_node(); + if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node(); avnd_send_node_up_msg(); avnd_cb->first_time_up = false; } ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfnd: correct handling "terminate success" evt in terminating state [#3157]
Hi Thang, ack (not tested), would be good if you can elaborate the commit message to explain how/why the patch can fix coredump. Thanks Minh On 20/2/20 5:27 pm, Thang Duc Nguyen wrote: Amfnd need to exist in node in shutdown state and all components terminated. --- src/amf/amfnd/clc.cc | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index de57838c9..f78e1a707 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -80,6 +80,8 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *, AVND_COMP *, SaAmfPresenceStateT, static uint32_t avnd_instfail_su_failover(AVND_CB *, AVND_SU *, AVND_COMP *); +static void amfnd_clean_before_exit(AVND_CB *); + /*** ** C O M P O N E N T C L C F S M M A T R I X D E F I N I T I O N ** ***/ @@ -297,6 +299,23 @@ static void log_failed_exec(NCS_OS_PROC_EXEC_STATUS_INFO *exec_stat, comp->clc_info.cmds[exec_cmd - 1].cmd); } +/ + Name : amfnd_clean_before_exit + + Description : Clean database before exit + + Arguments : cb - ptr to the AvND control block + + Return Values : None + +**/ +void amfnd_clean_before_exit(AVND_CB *cb) { + LOG_NO("Shutdown completed, exiting"); + cb->nodeid_mdsdest_db.deleteAll(); + cb->hctypedb.deleteAll(); + daemon_exit(); +} + / Name : avnd_evt_clc_resp @@ -810,10 +829,7 @@ uint32_t avnd_comp_clc_fsm_run(AVND_CB *cb, AVND_COMP *comp, avnd_comp_pres_state_set(cb, comp, SA_AMF_PRESENCE_UNINSTANTIATED); if (all_comps_terminated()) { LOG_NO("Terminated all AMF components"); - LOG_NO("Shutdown completed, exiting"); - cb->nodeid_mdsdest_db.deleteAll(); - cb->hctypedb.deleteAll(); - daemon_exit(); + amfnd_clean_before_exit(cb); } else { TRACE("Do nothing"); goto done; @@ -2401,6 +2417,12 @@ uint32_t avnd_comp_clc_terming_termsucc_hdler(AVND_CB *cb, AVND_COMP *comp) { avnd_comp_curr_info_del(cb, comp); } + if ((cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED) && + all_comps_terminated()) { +LOG_NO("Terminated all AMF components"); 
+amfnd_clean_before_exit(cb); + } + TRACE_LEAVE(); return rc; } @@ -2520,10 +2542,7 @@ uint32_t avnd_comp_clc_terming_cleansucc_hdler(AVND_CB *cb, AVND_COMP *comp) { } if (all_comps_terminated()) { LOG_NO("Terminated all AMF components"); - LOG_NO("Shutdown completed, exiting"); - cb->nodeid_mdsdest_db.deleteAll(); - cb->hctypedb.deleteAll(); - daemon_exit(); + amfnd_clean_before_exit(cb); } } /* @@ -2584,10 +2603,7 @@ uint32_t avnd_comp_clc_terming_cleanfail_hdler(AVND_CB *cb, AVND_COMP *comp) { if ((cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED) && all_comps_terminated()) { LOG_WA("Terminated all AMF components (with failures)"); -LOG_NO("Shutdown completed, exiting"); -cb->nodeid_mdsdest_db.deleteAll(); -cb->hctypedb.deleteAll(); -daemon_exit(); +amfnd_clean_before_exit(cb); } TRACE_LEAVE(); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: fix memleak in agent enable flow control [#3151]
Hi Thuan, Ack from me. Thanks Minh On 12/2/20 9:29 pm, thuan.tran wrote: Agent enable flow control keep add new portid without remove. Remove portid when svc count become zero then handle portid reset properly, peer A may see portid reset (peer B) then peer B should accept fseq(1) message from peer A. --- src/mds/mds_tipc_fctrl_intf.cc | 6 ++ src/mds/mds_tipc_fctrl_portid.cc | 17 - 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index f3883ba36..f3504b901 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -428,6 +428,12 @@ uint32_t mds_tipc_fctrl_portid_down(struct tipc_portid id, uint32_t type) { portid->svc_cnt_--; m_MDS_LOG_DBG("FCTRL: Remove svc[node:%x, ref:%u svc_id:%u], svc_cnt:%u", id.node, id.ref, svc_id, portid->svc_cnt_); +if (portid->svc_cnt_ == 0) { + delete portid; + portid_map.erase(TipcPortId::GetUniqueId(id)); + m_MDS_LOG_NOTIFY("FCTRL: Remove portid[node:%x, ref:%u]", + id.node, id.ref); +} } portid_map_mutex.unlock(); diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 3562c4a00..57843b6de 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -373,7 +373,7 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t mfrag, if (rcvwnd_.rcv_ + Seq16(1) < Seq16(fseq)) { if (rcvwnd_.rcv_ == 0 && rcvwnd_.acked_ == 0) { // peer does not realize that this portid reset -m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " +m_MDS_LOG_NOTIFY("FCTRL: [me] <-- [node:%x, ref:%u], " "RcvData[mseq:%u, mfrag:%u, fseq:%u], " "rcvwnd[acked:%u, rcv:%u, nacked:%" PRIu64 "], " "Warning[portid reset]", @@ -381,7 +381,9 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t mfrag, mseq, mfrag, fseq, rcvwnd_.acked_.v(), rcvwnd_.rcv_.v(), rcvwnd_.nacked_space_); +SendChunkAck(fseq, svc_id, 1); rcvwnd_.rcv_ = fseq; +rcvwnd_.acked_ = rcvwnd_.rcv_; } else { rc = NCSCC_RC_FAILURE; // msg loss @@ 
-395,6 +397,19 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t mfrag, // send nack SendNack((rcvwnd_.rcv_ + Seq16(1)).v(), svc_id); } +} else if (fseq == 1) { + // sender realize me as portid reset + m_MDS_LOG_NOTIFY("FCTRL: [me] <-- [node:%x, ref:%u], " + "RcvData[mseq:%u, mfrag:%u, fseq:%u], " + "rcvwnd[acked:%u, rcv:%u, nacked:%" PRIu64 "], " + "Warning[portid reset on sender]", + id_.node, id_.ref, + mseq, mfrag, fseq, + rcvwnd_.acked_.v(), rcvwnd_.rcv_.v(), rcvwnd_.nacked_space_); + + SendChunkAck(fseq, svc_id, 1); + rcvwnd_.rcv_ = fseq; + rcvwnd_.acked_ = rcvwnd_.rcv_; } else if (Seq16(fseq) <= rcvwnd_.rcv_) { rc = NCSCC_RC_FAILURE; // unexpected retransmission ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]
Hi aThanh, ack from me. Thanks Minh On 6/2/20 3:42 pm, Thanh Nguyen wrote: In the trace record the time value is generated after acquiring the mutex. The time accuracy is improved when generated before seizing the mutext. --- src/base/logtrace.cc| 2 +- src/base/logtrace_client.cc | 15 --- src/base/logtrace_client.h | 9 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc index 8908c1ff3..9822879ab 100644 --- a/src/base/logtrace.cc +++ b/src/base/logtrace.cc @@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned priority, if (!entry) { entry = gl_local_thread_trace->CreateLogEntry( static_cast(priority), - preamble, ap); + base::ReadRealtimeClock(), preamble, ap); } gl_thread_buffer.WriteToBuffer(entry); } diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc index e22112a43..104e08ce1 100644 --- a/src/base/logtrace_client.cc +++ b/src/base/logtrace_client.cc @@ -96,32 +96,33 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog, const char* LogTraceClient::Log(base::LogMessage::Severity severity, const char *fmt, va_list ap) { if (log_socket_ != nullptr && log_mutex_ != nullptr) { -return LogInternal(severity, fmt, ap); +return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap); } return nullptr; } const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity, -const char *fmt, va_list ap) { +timespec time_spec, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); - CreateLogEntryInternal(severity, fmt, ap); + CreateLogEntryInternal(severity, time_spec, fmt, ap); log_socket_->Send(buffer_.data(), buffer_.size()); return buffer_.data(); } const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, -const char *fmt, va_list ap) { +timespec time_spec, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); - return CreateLogEntryInternal(severity, fmt, ap); + return CreateLogEntryInternal(severity, time_spec, fmt, 
ap); } const char* LogTraceClient::CreateLogEntryInternal( -base::LogMessage::Severity severity, const char *fmt, va_list ap) { +base::LogMessage::Severity severity, timespec time_spec, +const char *fmt, va_list ap) { uint32_t id = sequence_id_; sequence_id_ = id < kMaxSequenceId ? id + 1 : 1; buffer_.clear(); base::LogMessage::Write( - base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(), + base::LogMessage::Facility::kLocal1, severity, time_spec, fqdn_, app_name_, proc_id_, msg_id_, {{base::LogMessage::SdName{"meta"}, {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"}, diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h index 5b165e528..1ccb44d06 100644 --- a/src/base/logtrace_client.h +++ b/src/base/logtrace_client.h @@ -44,7 +44,7 @@ class LogTraceClient { const char* Log(base::LogMessage::Severity severity, const char *fmt, va_list ap); const char* CreateLogEntry(base::LogMessage::Severity severity, - const char *fmt, va_list ap); + timespec time_spec, const char *fmt, va_list ap); void AddExternalBuffer(int64_t tid, LogTraceBuffer* buffer); void RemoveExternalBuffer(int64_t tid); void RequestFlushExternalBuffer(); @@ -56,10 +56,11 @@ class LogTraceClient { private: bool Init(const char *msg_id, WriteMode mode); - const char* LogInternal(base::LogMessage::Severity severity, const char *fmt, - va_list ap); + + const char* LogInternal(base::LogMessage::Severity severity, + timespec time_spec, const char *fmt, va_list ap); const char* CreateLogEntryInternal(base::LogMessage::Severity severity, - const char *fmt, va_list ap); + timespec time_spec, const char *fmt, va_list ap); static constexpr const uint32_t kMaxSequenceId = uint32_t{0x7fff}; base::LogMessage::HostName fqdn_{""}; base::LogMessage::AppName app_name_{""}; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]
Hi aThanh, The CreateLogEntry/... are added recently in LogTraceClient.h/cc, and the "client" you mean should have been calling the log/trace in logtrace.h, which are the OpenSAF services and agents. The real client should include the SAF headers in src/ais/include to use SAF services. Do you see any use cases where a "client" should include LogTraceClient.h to call CreateLogEntry without logtrace.h? Thanks, Minh On 6/2/20 2:05 pm, Thanh Nguyen wrote: Hello Minh, 1) For LogInternal(..), there is only one version. The new replaced the old. This is a private method, thus it can be safely replaced. 2) For CreateLogEntry(...) which is a public method. For compatibility reasons, I keep the old method and create the new method. I do not know if there is any client code of opensaf out there calling CreateLogEntry. If it is known that there is no client code, I will remove the old method. 3) For CreateLogEntryInternal(..), I also keep two versions corresponding to two versions of calling methods CreateLogEntry(..). I will remove the old version of (2) and (3) if it is confirmed that there is no client code calling CreateLogEntry(..). Best Regards, Thanh -----Original Message----- From: Minh Hon Chau [mailto:minh.c...@dektech.com.au] Sent: Thursday, 6 February 2020 12:48 PM To: Thanh Nguyen; peter.mcint...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144] Hi aThanh, The patch adds a new pair of CreateLogEntry/CreateLogEntryInternal with one extra parameter. If the old ones (with 3 parameters) are not being used anywhere else, we can delete them. Thanks Minh On 24/1/20 11:34 am, Thanh Nguyen wrote: In the trace record the time value is generated after acquiring the mutex. The time accuracy is improved when generated before seizing the mutex. 
--- src/base/logtrace.cc| 2 +- src/base/logtrace_client.cc | 18 +- src/base/logtrace_client.h | 13 ++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc index 8908c1ff3..9822879ab 100644 --- a/src/base/logtrace.cc +++ b/src/base/logtrace.cc @@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned priority, if (!entry) { entry = gl_local_thread_trace->CreateLogEntry( static_cast(priority), - preamble, ap); + base::ReadRealtimeClock(), preamble, ap); } gl_thread_buffer.WriteToBuffer(entry); } diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc index e22112a43..484bd17e5 100644 --- a/src/base/logtrace_client.cc +++ b/src/base/logtrace_client.cc @@ -96,19 +96,26 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog, const char* LogTraceClient::Log(base::LogMessage::Severity severity, const char *fmt, va_list ap) { if (log_socket_ != nullptr && log_mutex_ != nullptr) { -return LogInternal(severity, fmt, ap); +return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap); } return nullptr; } const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity, -const char *fmt, va_list ap) { +timespec time_spec, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); - CreateLogEntryInternal(severity, fmt, ap); + CreateLogEntryInternal(severity, time_spec, fmt, ap); log_socket_->Send(buffer_.data(), buffer_.size()); return buffer_.data(); } +const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, +timespec time_spec, const char *fmt, va_list ap) { + base::Lock lock(*log_mutex_); + return CreateLogEntryInternal(severity, time_spec, fmt, ap); +} + +// This is original const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); @@ -116,12 +123,13 @@ const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, } const char* 
LogTraceClient::CreateLogEntryInternal( -base::LogMessage::Severity severity, const char *fmt, va_list ap) { +base::LogMessage::Severity severity, timespec time_spec, +const char *fmt, va_list ap) { uint32_t id = sequence_id_; sequence_id_ = id < kMaxSequenceId ? id + 1 : 1; buffer_.clear(); base::LogMessage::Write( - base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(), + base::LogMessage::Facility::kLocal1, severity, time_spec, fqdn_, app_name_, proc_id_, msg_id_, {{base::LogMessage::SdName{"meta"}, {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"}, diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h index 5b165e528..29aa79b95 100644 --- a/src/base/logtrace_client.h +++ b/src/base/logtrace_client.h @@ -45,6 +45,8
Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]
Hi aThanh, The patch adds a new pair of CreateLogEntry/CreateLogEntryInternal with one extra parameter. If the old one (within 3 parameters) is not being used anywhere else, we can delete them. Thanks Minh On 24/1/20 11:34 am, Thanh Nguyen wrote: In the trace record the time value is generated after acquiring the mutex. The time accuracy is improved when generated before seizing the mutext. --- src/base/logtrace.cc| 2 +- src/base/logtrace_client.cc | 18 +- src/base/logtrace_client.h | 13 ++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc index 8908c1ff3..9822879ab 100644 --- a/src/base/logtrace.cc +++ b/src/base/logtrace.cc @@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned priority, if (!entry) { entry = gl_local_thread_trace->CreateLogEntry( static_cast(priority), - preamble, ap); + base::ReadRealtimeClock(), preamble, ap); } gl_thread_buffer.WriteToBuffer(entry); } diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc index e22112a43..484bd17e5 100644 --- a/src/base/logtrace_client.cc +++ b/src/base/logtrace_client.cc @@ -96,19 +96,26 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog, const char* LogTraceClient::Log(base::LogMessage::Severity severity, const char *fmt, va_list ap) { if (log_socket_ != nullptr && log_mutex_ != nullptr) { -return LogInternal(severity, fmt, ap); +return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap); } return nullptr; } const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity, -const char *fmt, va_list ap) { +timespec time_spec, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); - CreateLogEntryInternal(severity, fmt, ap); + CreateLogEntryInternal(severity, time_spec, fmt, ap); log_socket_->Send(buffer_.data(), buffer_.size()); return buffer_.data(); } +const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, +timespec time_spec, const char *fmt, va_list 
ap) { + base::Lock lock(*log_mutex_); + return CreateLogEntryInternal(severity, time_spec, fmt, ap); +} + +// This is original const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, const char *fmt, va_list ap) { base::Lock lock(*log_mutex_); @@ -116,12 +123,13 @@ const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity, } const char* LogTraceClient::CreateLogEntryInternal( -base::LogMessage::Severity severity, const char *fmt, va_list ap) { +base::LogMessage::Severity severity, timespec time_spec, +const char *fmt, va_list ap) { uint32_t id = sequence_id_; sequence_id_ = id < kMaxSequenceId ? id + 1 : 1; buffer_.clear(); base::LogMessage::Write( - base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(), + base::LogMessage::Facility::kLocal1, severity, time_spec, fqdn_, app_name_, proc_id_, msg_id_, {{base::LogMessage::SdName{"meta"}, {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"}, diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h index 5b165e528..29aa79b95 100644 --- a/src/base/logtrace_client.h +++ b/src/base/logtrace_client.h @@ -45,6 +45,8 @@ class LogTraceClient { va_list ap); const char* CreateLogEntry(base::LogMessage::Severity severity, const char *fmt, va_list ap); + const char* CreateLogEntry(base::LogMessage::Severity severity, + timespec time_spec, const char *fmt, va_list ap); void AddExternalBuffer(int64_t tid, LogTraceBuffer* buffer); void RemoveExternalBuffer(int64_t tid); void RequestFlushExternalBuffer(); @@ -56,10 +58,15 @@ class LogTraceClient { private: bool Init(const char *msg_id, WriteMode mode); - const char* LogInternal(base::LogMessage::Severity severity, const char *fmt, - va_list ap); + + const char* LogInternal(base::LogMessage::Severity severity, + timespec time_spec, const char *fmt, va_list ap); const char* CreateLogEntryInternal(base::LogMessage::Severity severity, - const char *fmt, va_list ap); + timespec time_spec, const char 
*fmt, va_list ap); + inline const char* CreateLogEntryInternal( + base::LogMessage::Severity severity, const char *fmt, va_list ap) { +return CreateLogEntryInternal(severity, base::ReadRealtimeClock(), fmt, ap); + } static constexpr const uint32_t kMaxSequenceId = uint32_t{0x7fff}; base::LogMessage::HostName fqdn_{""}; base::LogMessage::AppName app_name_{""}; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] log: fix segmentation fault in log agent [#3137]
Hi Vu, Ack(review). Thanks, Minh Quoting Vu Minh Nguyen : log agent did not protect the resource `unacked_invocations_ list` from accessing by multiple threads, so caused segmentation fault. This patch introduces a mutex in order to synchronize the access to that common resource. --- src/log/agent/lga_client.cc | 2 +- src/log/agent/lga_client.h | 16 +++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/log/agent/lga_client.cc b/src/log/agent/lga_client.cc index cdc54904a..2eb37a0f7 100644 --- a/src/log/agent/lga_client.cc +++ b/src/log/agent/lga_client.cc @@ -86,7 +86,7 @@ LogClient::~LogClient() { } stream_list_.clear(); - unacked_invocations_.clear(); + CleanUnackedList(); // Free the client handle allocated to this log client if (handle_ != 0) { diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h index f5fa6faa4..e6e2c911e 100644 --- a/src/log/agent/lga_client.h +++ b/src/log/agent/lga_client.h @@ -174,13 +174,18 @@ class LogClient { // get acknowledgement from it. void KeepTrack(SaInvocationT inv, uint32_t ack_flags) { if (ack_flags != SA_LOG_RECORD_WRITE_ACK) return; +base::Lock scope_lock{mutex_unacked_list_}; unacked_invocations_.push_back(inv); } // Got an acknowledgment, so remove from the track list. 
- void RemoveTrack(SaInvocationT inv) { unacked_invocations_.remove(inv); } + void RemoveTrack(SaInvocationT inv) { +base::Lock scope_lock{mutex_unacked_list_}; +unacked_invocations_.remove(inv); + } void NotifyClientAboutLostInvocations() { +base::Lock scope_lock{mutex_unacked_list_}; for (const auto& i : unacked_invocations_) { TRACE("The write async with this invocation %lld has been lost", i); // the below memory will be freed by lga_msg_destroy(cbk_msg) @@ -232,6 +237,11 @@ class LogClient { // Invoke the registered callback void InvokeCallback(const lgsv_msg_t* msg); + void CleanUnackedList() { +base::Lock scope_lock{mutex_unacked_list_}; +unacked_invocations_.clear(); + } + // Delete all messages from the mailbox static bool ClearMailBox(NCSCONTEXT, NCSCONTEXT); @@ -290,6 +300,10 @@ class LogClient { // If cluster goes to headless, log agent will inform to log client with // SA_AIS_ERR_TRY_AGAIN code for these invocations. std::list unacked_invocations_{}; + + // To protect the `unacked_invocations_` list. + base::Mutex mutex_unacked_list_{}; + // LOG handle (derived from hdl-mngr) SaLogHandleT handle_; -- 2.17.1 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] log: fix segmentation fault in log agent [#3137]
Hi Vu, Don't you need to protect the list in ~LogClient()? And in NotifyClientAboutLostInvocations(), does it need to protect before 'read' in the 'for' loop? Otherwise it's ack from me. Thanks Minh On 6/1/20 2:15 pm, Vu Minh Nguyen wrote: log agent did not protect the resource `unacked_invocations_ list` from accessing by multiple threads, so caused segmentation fault. This patch introduces a mutex in order to synchronize the access to that common resource. --- src/log/agent/lga_client.h | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h index f5fa6faa4..c999d148e 100644 --- a/src/log/agent/lga_client.h +++ b/src/log/agent/lga_client.h @@ -174,11 +174,15 @@ class LogClient { // get acknowledgement from it. void KeepTrack(SaInvocationT inv, uint32_t ack_flags) { if (ack_flags != SA_LOG_RECORD_WRITE_ACK) return; +base::Lock scope_lock{mutex_unacked_list_}; unacked_invocations_.push_back(inv); } // Got an acknowledgment, so remove from the track list. - void RemoveTrack(SaInvocationT inv) { unacked_invocations_.remove(inv); } + void RemoveTrack(SaInvocationT inv) { +base::Lock scope_lock{mutex_unacked_list_}; +unacked_invocations_.remove(inv); + } void NotifyClientAboutLostInvocations() { for (const auto& i : unacked_invocations_) { @@ -196,6 +200,8 @@ class LogClient { SendMsgToMbx(msg, MDS_SEND_PRIORITY_HIGH); } + +base::Lock scope_lock{mutex_unacked_list_}; unacked_invocations_.clear(); } @@ -290,6 +296,10 @@ class LogClient { // If cluster goes to headless, log agent will inform to log client with // SA_AIS_ERR_TRY_AGAIN code for these invocations. std::list unacked_invocations_{}; + + // To protect the `unacked_invocations_` list. + base::Mutex mutex_unacked_list_{}; + // LOG handle (derived from hdl-mngr) SaLogHandleT handle_; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]
Hi Thuan, - We could give the patch title a bit more meanings than "fix ckpt 20 11..", for example, something as "Using timer to continue sending queued message". - And a few comments inline Thanks Minh On 5/12/19 3:05 pm, thuan.tran wrote: - In overflow, receive chunk ack may stuck in retrying to send pending messages then later chunk ack comming cannot proceed. - Instead of retrying to send pending messages, reuse timer send chunk ack to trigger send pending messages if any. By this, even no more Nack or ChunkAck event comming, pending messages will be sent by timer. --- src/mds/mds_dt_tipc.c| 12 ++--- src/mds/mds_tipc_fctrl_intf.cc | 10 src/mds/mds_tipc_fctrl_portid.cc | 88 ++-- src/mds/mds_tipc_fctrl_portid.h | 1 + 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 9b3290833..6b30846a1 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -3183,13 +3183,13 @@ ssize_t mds_retry_sendto(int sockfd, const void *buf, size_t len, int flags, { int retry = 5; ssize_t send_len = 0; - while (retry >= 0) { + while (retry-- >= 0) { send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen); if (send_len == len) { return send_len; - } else if (retry-- > 0) { - if (errno != ENOMEM && - errno != ENOBUFS && + } else if (retry >= 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK && + errno != ENOMEM && errno != ENOBUFS && errno != EINTR) break; osaf_nanosleep(); [Minh] We may need to do error-log the strerror and errno in case of failure in mds_retry_sendto(). Also, uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) { ... 
m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno)); } this logging "sendto()" should be now "TipcPortId::Send()" @@ -3242,7 +3242,7 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t buff_len, if (mds_tipc_fctrl_trysend(id, buffer, buff_len, is_queued) == NCSCC_RC_SUCCESS) { send_len = mds_retry_sendto( - tipc_cb.BSRsock, buffer, buff_len, 0, + tipc_cb.BSRsock, buffer, buff_len, MSG_DONTWAIT, (struct sockaddr *)_addr, sizeof(server_addr)); [Minh] There must be a reason that you want to use non-blocking with MSG_DONTWAIT? if (send_len == buff_len) { m_MDS_LOG_INFO("MDTM: Successfully sent message"); @@ -3289,7 +3289,7 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t size, /*This can be scope-down to dest_svc_id server_inst TBD*/ server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE); ssize_t send_len = - mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0, + mds_retry_sendto(tipc_cb.BSRsock, buffer, size, MSG_DONTWAIT, (struct sockaddr *)_addr, sizeof(server_addr)); if (send_len == size) { diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 7d0571e7c..b20205686 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -102,6 +102,8 @@ void tmr_exp_cbk(void* uarg) { void process_timer_event(const Event& evt) { bool txprob_restart = false; + m_MDS_LOG_DBG("FCTRL: process timer event start [evt:%d]", +static_cast(evt.type_)); for (auto i : portid_map) { TipcPortId* portid = i.second; @@ -113,16 +115,20 @@ void process_timer_event(const Event& evt) { if (evt.type_ == Event::Type::kEvtTmrChunkAck) { portid->ReceiveTmrChunkAck(); + portid->SendUnsentMsg(); } [Minh] The idea now is using ChunkAck timer to continue sending unsent message. This fix comes from a situation that we failed in the middle of sending unsent message due to "Cannot allocate memory...". 
In the scenario without such error "Cannot allocate ...", the function SendUnsentMsg() here will be sending extra messages from the "receiving channel" as ChunkAck timer apart from the "sending channel" as ReceiveChunkAck(). That would cause more undeliverable messages (the ones are now sent from ChunkAck timer) if the overloading starts to happen and sender keeps pushing more messages to send (more message pushes into queue). } if (txprob_restart) { txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk); m_MDS_LOG_DBG("FCTRL: Restart txprob"); } + m_MDS_LOG_DBG("FCTRL: process timer event end"); } uint32_t process_flow_event(const Event& evt) { uint32_t rc = NCSCC_RC_SUCCESS; + m_MDS_LOG_DBG("FCTRL: process flow event start [evt:%d]", +static_cast(evt.type_));
Re: [devel] [PATCH 0/1] Review Request for mds: not waste 1.5s in waiting dead Adest to send RSP [#3102] V2 (updated)
Hi Thuan One minor comment, we could separate this commit into one for code change, one for test case. @Vu, you have any comments? Thanks Minh On 27/11/19 1:21 pm, thuan.tran wrote: Summary: mds: not waste 1.5s in waiting dead Adest to send RSP [#3102] Review request for Ticket(s): 3102 Peer Reviewer(s): Minh, Vu, Thang, Gary Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3102 Base revision: b61bee5c8accd79e573ef726d40b945afc7c7b3e Personal repository: git://git.code.sf.net/u/thuantr/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesn OpenSAF servicesn Core libraries y Samples n Tests y Other n NOTE: Patch(es) contain lines longer than 80 characers Comments (indicate scope for each "y" above): - N/A revision f4f5ab3efe19bdd11c5cb43e4f4d48af79656737 Author: thuan.tran Date: Tue, 26 Nov 2019 15:58:34 +0700 mds: not waste 1.5s in waiting dead Adest to send RSP [#3102] - When sending response message to Adest which is not exist (crash/terminate), current MDS try to wait for 1.5 seconds before conclude no route to send RSP. - Here are scenarios may waste 1.5s waiting: SVCs DOWN (dead adest or vdest role change) -> get SNDRSP -> send RSP (wait 1.5s) get SNDRSP -> SVCs DOWN (dead adest or vdest role change) -> send RSP (wait 1.5s) This long wait time cause trouble for higher layer services, e.g: ntf, imm, etc... where there are many agents send initialize request (use message SNDRSP type) - Solution: create adest list, a timer start when last SVC of adest DOWN. When sending RSP to this adest, the wait time will reduce to only 10ms. Notice that following origin behavior is kept: No any SVC UP before -> get SNDRSP -> send RSP (wait 1.5s) - New TC tet_send_response_tp_13() is created to verify this scenario. 
Complete diffstat: -- src/mds/apitest/mdstipc.h | 1 + src/mds/apitest/mdstipc_api.c | 107 ++ src/mds/apitest/mdstipc_conf.c | 1 - src/mds/mds_c_api.c| 199 + src/mds/mds_c_sndrcv.c | 38 +--- src/mds/mds_core.h | 30 ++- src/mds/mds_dt2c.h | 2 +- src/mds/mds_dt_common.c| 24 - src/mds/mds_main.c | 4 + 9 files changed, 350 insertions(+), 56 deletions(-) Testing Commands: - N/A Testing, Expected Results: -- N/A Conditions of Submission: - ACK by reviewers Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content into a single commit. 
___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as threaded commits, or place in a public tree for a pull. ___ You have resent this content multiple times without a clear indication of what has changed between each re-send. ___ You have failed to adequately and
Re: [devel] [PATCH 1/1] mds: close sockets at the end of mdtm_tipc_destroy() [#3125]
hi Thuan, ack (review only). Thanks Minh On 3/12/19 7:28 pm, thuan.tran wrote: Aslo create wrapper of sendto() to retry if errno is ENOMEM/ENOBUFFS/EINTR. And return for other errors, do not assert() cause coredump. --- src/mds/mds_dt_tipc.c| 47 +++ src/mds/mds_dt_tipc.h| 3 ++ src/mds/mds_tipc_fctrl_portid.cc | 65 +++- 3 files changed, 74 insertions(+), 41 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index fdf0da7fb..b0f38ee49 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -51,6 +51,7 @@ #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" +#include "base/osaf_time.h" #ifndef SOCK_CLOEXEC enum { SOCK_CLOEXEC = 0x8 }; @@ -523,9 +524,7 @@ uint32_t mdtm_tipc_destroy(void) MDTM_REASSEMBLY_QUEUE *reassem_queue = NULL; MDTM_REASSEMBLY_KEY reassembly_key; - /* close sockets first */ - close(tipc_cb.BSRsock); - close(tipc_cb.Dsock); + mds_tipc_fctrl_shutdown(); /* Destroy receiving task */ if (mdtm_destroy_rcv_task() != NCSCC_RC_SUCCESS) { @@ -537,7 +536,6 @@ uint32_t mdtm_tipc_destroy(void) NULL); m_NCS_IPC_RELEASE(_cb.tmr_mbx, (NCS_IPC_CB)mdtm_mailbox_mbx_cleanup); - mds_tipc_fctrl_shutdown(); /* Clear reference hdl list */ while (mdtm_ref_hdl_list_hdr != NULL) { /* Store temporary the pointer of entry to be deleted */ @@ -587,6 +585,9 @@ uint32_t mdtm_tipc_destroy(void) handle = 0; mdtm_global_frag_num = 0; + close(tipc_cb.BSRsock); + close(tipc_cb.Dsock); + return NCSCC_RC_SUCCESS; } @@ -3135,6 +3136,37 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, return NCSCC_RC_SUCCESS; } +/* + + Function NAME: mds_retry_sendto + + DESCRIPTION: wrapper of sendto() for retry purpose + + ARGUMENTS: same as sendto() + + RETURNS: same as sendto() + +*/ +ssize_t mds_retry_sendto(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + int retry = 5; + ssize_t send_len = 0; + while (retry >= 0) { + send_len = 
sendto(sockfd, buf, len, flags, dest_addr, addrlen); + if (send_len == len) { + return send_len; + } else if (retry-- > 0) { + if (errno != ENOMEM && + errno != ENOBUFS && + errno != EINTR) + break; + osaf_nanosleep(); + } + } + return send_len; +} + /* Function NAME: mdtm_sendto @@ -3176,7 +3208,8 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t buff_len, } #endif if (mds_tipc_fctrl_trysend(buffer, buff_len, id) == NCSCC_RC_SUCCESS) { - send_len = sendto(tipc_cb.BSRsock, buffer, buff_len, 0, + send_len = mds_retry_sendto( + tipc_cb.BSRsock, buffer, buff_len, 0, (struct sockaddr *)_addr, sizeof(server_addr)); if (send_len == buff_len) { m_MDS_LOG_INFO("MDTM: Successfully sent message"); @@ -3222,8 +3255,8 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t size, server_addr.addr.nameseq.lower = HTONL(MDS_MDTM_LOWER_INSTANCE); /*This can be scope-down to dest_svc_id server_inst TBD*/ server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE); - int send_len = - sendto(tipc_cb.BSRsock, buffer, size, 0, + ssize_t send_len = + mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0, (struct sockaddr *)_addr, sizeof(server_addr)); if (send_len == size) { diff --git a/src/mds/mds_dt_tipc.h b/src/mds/mds_dt_tipc.h index e73a11b09..65175839e 100644 --- a/src/mds/mds_dt_tipc.h +++ b/src/mds/mds_dt_tipc.h @@ -107,4 +107,7 @@ extern uint32_t mds_mdtm_node_subscribe_tipc(MDS_SVC_HDL svc_hdl, MDS_SUBTN_REF_VAL *subtn_ref_val); extern uint32_t mds_mdtm_node_unsubscribe_tipc(MDS_SUBTN_REF_VAL subtn_ref_val); +ssize_t mds_retry_sendto(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + #endif // MDS_MDS_DT_TIPC_H_ diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index dab2b8c69..6b033b0e5 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -17,11 +17,14 @@ #include
Re: [devel] [PATCH 0/2] Review Request for mds: Avoid message reallocation [#3089] V3
Hi Vu, Thuan Any comments on the patches. Thanks Minh On 28/11/19 10:54 pm, Minh Chau wrote: Summary: mds: Avoid message reallocation [#3089] Review request for Ticket(s): 3089 Peer Reviewer(s): Thuan, Vu, Gary Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3089 Base revision: 8e07c19aed63c249f4e7fa8470270d2de1a56046 Personal repository: git://git.code.sf.net/u/minh-chau/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesn OpenSAF servicesn Core libraries y Samples n Tests n Other n NOTE: Patch(es) contain lines longer than 80 characers Comments (indicate scope for each "y" above): - *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** revision d3bdf53e99523785cdc932d62b25267ea900c643 Author: Minh Chau Date: Thu, 28 Nov 2019 21:08:50 +1100 mds: Avoid message reallocation [#3089] The patch avoids message reallocation if the message is in retransmission queue revision 7be0f5404ebb8ec5b8752813899d6aefd1ef6c33 Author: Minh Chau Date: Thu, 28 Nov 2019 21:08:38 +1100 mds: Improve readibility [#3089] Correct indent and reduce code lines (<80 chars) for mds_mdtm_send_tipc() and mdtm_frag_and_send() Complete diffstat: -- src/mds/mds_dt_tipc.c| 534 +-- src/mds/mds_tipc_fctrl_intf.cc | 6 +- src/mds/mds_tipc_fctrl_intf.h| 4 +- src/mds/mds_tipc_fctrl_msg.cc| 2 +- src/mds/mds_tipc_fctrl_portid.cc | 9 +- 5 files changed, 294 insertions(+), 261 deletions(-) Testing Commands: - *** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES *** Testing, Expected Results: -- *** PASTE COMMAND OUTPUTS / TEST RESULTS *** Conditions of Submission: - *** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC *** Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 n n powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] 
Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content into a single commit. ___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as threaded commits, or place in a public tree for a pull. ___ You have resent this content multiple times without a clear indication of what has changed between each re-send. ___ You have failed to adequately and individually address all of the comments and change requests that were proposed in the initial review. ___ You have a misconfigured ~/.gitconfig file (i.e. user.name, user.email etc) ___ Your computer have a badly configured date and time; confusing the the threaded patch review. 
___ Your changes affect IPC mechanism, and you don't present any results for in-service upgradability test. ___ Your changes affect user manual and documentation, your patch series do not contain the patch that updates the Doxygen manual. ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net
Re: [devel] [PATCH 1/1] amfd: not accept lock-in if su is repairing [#3121]
Hi Thang, I assume you have tried and there is no way to reuse the current *state* of su to prevent the lock-in op in this scenario, and this patch tested ok with upgrade/downgrade. The down side of adding checkpoint is that we will stick with it even then we find better solution later on, since removing the checkpoint would cause a nbc. No comments from me. Thanks Minh On 2/12/19 2:39 pm, thang.d.nguyen wrote: AMFD should not accept lock-in admin op on SU if the SU is repairing. --- src/amf/amfd/chkop.cc | 9 + src/amf/amfd/ckpt.h | 3 ++- src/amf/amfd/ckpt_dec.cc | 42 +-- src/amf/amfd/ckpt_enc.cc | 30 +++- src/amf/amfd/ckpt_msg.h | 1 + src/amf/amfd/ckpt_updt.cc | 1 + src/amf/amfd/sgproc.cc| 1 + src/amf/amfd/su.cc| 19 ++ src/amf/amfd/su.h | 3 +++ 9 files changed, 105 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/chkop.cc b/src/amf/amfd/chkop.cc index 56b0142a6..15408b657 100644 --- a/src/amf/amfd/chkop.cc +++ b/src/amf/amfd/chkop.cc @@ -923,6 +923,14 @@ uint32_t avsv_send_ckpt_data(AVD_CL_CB *cb, uint32_t action, /* No need to send the message as standy would get the applier callback */ return NCSCC_RC_SUCCESS; +case AVSV_CKPT_SU_INST_PROCESSED: + if (avd_cb->avd_peer_ver < AVD_MBCSV_SUB_PART_VERSION_11) { +/* No need to send the message to old std as this async is newly added. 
+ */ +return NCSCC_RC_SUCCESS; + } + cb->async_updt_cnt.su_updt++; + break; /* else fall through */ case AVSV_CKPT_SU_SI_CURR_ACTIVE: case AVSV_CKPT_SU_SI_CURR_STBY: @@ -1366,6 +1374,7 @@ static uint32_t avsv_validate_reo_type_in_csync(AVD_CL_CB *cb, case AVSV_CKPT_SU_SI_CURR_STBY: case AVSV_CKPT_SU_ADMIN_STATE: case AVSV_CKPT_SU_TERM_STATE: +case AVSV_CKPT_SU_INST_PROCESSED: case AVSV_CKPT_SU_SWITCH: case AVSV_CKPT_SU_OPER_STATE: case AVSV_CKPT_SU_PRES_STATE: diff --git a/src/amf/amfd/ckpt.h b/src/amf/amfd/ckpt.h index 2e1538719..f092f5b8c 100644 --- a/src/amf/amfd/ckpt.h +++ b/src/amf/amfd/ckpt.h @@ -35,9 +35,10 @@ #define AMF_AMFD_CKPT_H_ // current version -#define AVD_MBCSV_SUB_PART_VERSION 10 +#define AVD_MBCSV_SUB_PART_VERSION 11 // supported versions +#define AVD_MBCSV_SUB_PART_VERSION_11 11 #define AVD_MBCSV_SUB_PART_VERSION_10 10 #define AVD_MBCSV_SUB_PART_VERSION_9 9 #define AVD_MBCSV_SUB_PART_VERSION_8 8 diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 75213f821..7030f43b1 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -63,6 +63,7 @@ static uint32_t dec_su_si_curr_active(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_si_curr_stby(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_admin_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_term_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); +static uint32_t dec_su_inst_msg_processed(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_switch(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_oper_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); static uint32_t dec_su_pres_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec); @@ -162,8 +163,8 @@ const AVSV_DECODE_CKPT_DATA_FUNC_PTR avd_dec_data_func_list[] = { dec_comp_pres_state, dec_comp_restart_count, nullptr, /* AVSV_SYNC_COMMIT */ dec_su_restart_count, dec_si_dep_state, dec_ng_admin_state, dec_avd_to_avd_job_queue_status, -dec_node_failover_state - 
+dec_node_failover_state, +dec_su_inst_msg_processed }; /* @@ -445,6 +446,9 @@ static void decode_su(NCS_UBAID *ub, AVD_SU *su, uint16_t peer_version) { if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_2) osaf_decode_bool(ub, >su_is_external); + + if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_11) +osaf_decode_bool(ub, >is_inst_msg_processed); } /\ @@ -1538,6 +1542,40 @@ static uint32_t dec_su_term_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { return NCSCC_RC_SUCCESS; } +/\ + * + * Purpose: Decode SU inst msg of service + * + * Input: cb - CB pointer. + *dec - Decode arguments passed by MBCSV. + * + * Returns: NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. + * + * NOTES: + * + * +\**/ +static uint32_t dec_su_inst_msg_processed( + AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { + SaNameT name; + + TRACE_ENTER(); + + osaf_decode_sanamet(>i_uba, ); + AVD_SU *su = su_db->find(Amf::to_string()); + osafassert(su != nullptr); + osaf_decode_uint32(>i_uba, +reinterpret_cast(>is_inst_msg_processed)); + +
Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]
Hi Thuan, ack with comments. Thanks Minh On 28/11/19 6:55 pm, thuan.tran wrote: When overflow happens, mds with flow control enabled may keep all messages in queue if it fails to send a message when receiving Nack or ChunkAck since no more trigger come after that. MDS flow control should retry to send message in this scenario. --- src/mds/mds_tipc_fctrl_portid.cc | 47 ++-- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 316e1ba75..d5314d5bc 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -17,6 +17,7 @@ #include "mds/mds_tipc_fctrl_portid.h" #include "base/ncssysf_def.h" +#include "base/osaf_time.h" #include "mds/mds_dt.h" #include "mds/mds_log.h" @@ -149,23 +150,24 @@ void TipcPortId::FlushData() { uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) { struct sockaddr_tipc server_addr; - ssize_t send_len = 0; - uint32_t rc = NCSCC_RC_SUCCESS; - memset(_addr, 0, sizeof(server_addr)); server_addr.family = AF_TIPC; server_addr.addrtype = TIPC_ADDR_ID; server_addr.addr.id = id_; - send_len = sendto(bsrsock_, data, length, 0, -(struct sockaddr *)_addr, sizeof(server_addr)); - - if (send_len == length) { -rc = NCSCC_RC_SUCCESS; - } else { -m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno)); -rc = NCSCC_RC_FAILURE; + int retry = 5; + while (retry >= 0) { +ssize_t send_len = sendto(bsrsock_, data, length, 0, + (struct sockaddr *)_addr, sizeof(server_addr)); + +if (send_len == length) { + return NCSCC_RC_SUCCESS; +} else if (retry-- > 0) { + assert(errno == ENOMEM || errno == ENOBUFS); + osaf_nanosleep(); +} } [Minh] It might be a good thing to make a wrapper of sendto(), since the sendto() is currently called in fctrl_portid.cc and mds_dt_tipc.c. So we only call the wrapper of sendto(), which handles the error code of sendto(). 
I think the only EINTR code to be checked, there are a few places in opensaf that is handling error code of sendto() which we can take as reference. - return rc; + m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno)); + return NCSCC_RC_FAILURE; } uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, @@ -440,13 +442,16 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // try to send a few pending msg DataMessage* msg = nullptr; uint16_t send_msg_cnt = 0; -while (send_msg_cnt++ < chunk_size_) { +int retry = 0; +while (send_msg_cnt < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { +retry = 0; +send_msg_cnt++; msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " @@ -454,6 +459,12 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { id_.node, id_.ref, msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); + } else if (send_msg_cnt == 0) { +// If not retry, all messages are kept in queue +// and no more trigger to send messages +retry++; +assert(retry < 100); +continue; [Minh] We can accept to use the assert for now, and 100 should be defined as constant. But I do think we need a fallback mechanism, if the socket fd is not able to send data, we can terminate the portid, and trigger a MDS_DOWN event, ... and this could be looked in another ticket. Also, the patch title does not seem to be right in the context of this ticket, where we have problem of "Cannot allocate memeory", we might not be able to send any more message (not that for all) and hit the assert. We can say "Add retry for tipc sendto()" or you have a better description for it. 
} else { break; } @@ -508,9 +519,15 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, DataMessage* msg = sndqueue_.Find(Seq16(fseq)); if (msg != nullptr) { // Resend the msg found -if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { - msg->is_sent_ = true; +int retry = 0; +while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) { + // If not retry, all messages are kept in queue + // and no more trigger to send messages + retry++; + assert(retry < 100); + continue; } +msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "RsndData[mseq:%u, mfrag:%u, fseq:%u], "
Re: [devel] [PATCH 2/2] mds: Avoid message reallocation [#3089]
Hi Thuan, We should free() the memory at the same function level where the memory is allocated. The @buffer is passed to mdtm_sendto() could be from a stack memory (as it is used to be before this patch). Thanks Minh On 27/11/19 5:40 pm, Tran Thuan wrote: Hi Minh, Why not free() inside mdtm_sendto() and mdtm_mcast_sendto()? It will help reduce much code change. Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Tuesday, November 26, 2019 7:02 PM To: thuan.t...@dektech.com.au; vu.m.ngu...@dektech.com.au; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 2/2] mds: Avoid message reallocation [#3089] The patch avoids message reallocation if the message is in retransmission queue --- src/mds/mds_dt_tipc.c| 42 +++- src/mds/mds_tipc_fctrl_intf.cc | 6 -- src/mds/mds_tipc_fctrl_intf.h| 4 ++-- src/mds/mds_tipc_fctrl_msg.cc| 2 +- src/mds/mds_tipc_fctrl_portid.cc | 9 +++-- 5 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 16cf11b..866c370 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -120,7 +120,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req); /* Tipc actual send, can be made as Macro even*/ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t buff_len, - struct tipc_portid tipc_id); + struct tipc_portid tipc_id, uint8_t *is_queued); static uint32_t mdtm_mcast_sendto(void *buffer, size_t size, const MDTM_SEND_REQ *req); @@ -2643,7 +2643,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) if (req->snd_type == MDS_SENDTYPE_ACK || req->snd_type == MDS_SENDTYPE_RACK) { uint8_t len = mds_and_mdtm_hdr_len; - uint8_t buffer_ack[len]; + uint8_t *buffer_ack = calloc(1, len); + uint8_t is_queued = 0; /* Add mds_hdr */ if (mdtm_add_mds_hdr(buffer_ack, req) @@ -2657,18 +2658,24 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) _seq_num) == NCSCC_RC_FAILURE){ m_MDS_LOG_ERR("FCTRL: Failed to send message" " len :%d", len); + free(buffer_ack); return 
NCSCC_RC_FAILURE; } /* Add frag_hdr */ if (mdtm_add_frag_hdr(buffer_ack, len, frag_seq_num, 0, fctrl_seq_num) != NCSCC_RC_SUCCESS) { + free(buffer_ack); return NCSCC_RC_FAILURE; } m_MDS_LOG_DBG("MDTM:Sending message with Service" " Seqno=%d, TO Dest_Tipc_id=<0x%08x:%u> ", req->svc_seq_num, tipc_id.node, tipc_id.ref); - return mdtm_sendto(buffer_ack, len, tipc_id); + status = mdtm_sendto(buffer_ack, len, tipc_id, + &is_queued); + if (is_queued == 0) + free(buffer_ack); + return status; } if (req->msg.encoding == MDS_ENC_TYPE_FLAT) { @@ -2730,6 +2737,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) } else { uint8_t *p8; uint8_t *body = NULL; + uint8_t is_queued = 0; body = calloc(1, len + mds_and_mdtm_hdr_len); @@ -2824,7 +2832,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) return NCSCC_RC_FAILURE; } } else { - if (mdtm_sendto(body, len, tipc_id) + if (mdtm_sendto(body, len, tipc_id, &is_queued) != NCSCC_RC_SUCCESS) { m_MDS_LOG_ERR("MDTM: Unable to" " send the msg thru" @@ -2835,7 +2843,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) } } m_MMGR_FREE_BUFR_LIST(usrbuf); - free(body); + if (is_queued == 0) + free(body); return NCSCC_RC_SUCCESS; } } break; @@ -2864,6 +2873,7 @@ uint32_t
Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]
But after all retries are still failed, we might need to terminate the portid, which leads to a MDS DOWN event, but let's look at it later. On 27/11/19 3:23 pm, Minh Hon Chau wrote: Hi Thuan, I'm thinking to retry 3 times with 100 ms in between, but you can decide it. Also, we need to ensure not to make the mds main receiving thread being blocked with the retry (on the flow of processing data). The retry in this patch is ok since it retries on the mds flow control thread, so it does not delay the mds main receiving thread. Thanks Minh On 27/11/19 2:40 pm, Tran Thuan wrote: Hi Minh, I think it's good if retry some times for normal Send(). Do you have any idea how many retries? Interval b/w tries? Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, November 27, 2019 10:30 AM To: thuan.tran ; thang . d . nguyen ; 'Nguyen Minh Vu' ; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123] Hi Thuan, The TipcPortId:Send is also called at a few other places, do you think it is good if we make a wrapper of TipcPortId::Send with a few retries on failures, says TipcPortId::TryToSend(), and call TryToSend() instead of Send()? Thanks Minh On 27/11/19 1:26 pm, thuan.tran wrote: When overflow happens, mds with flow control enabled may keep all messages in queue if it fails to send a message when receiving Nack or ChunkAck since no more trigger come after that. MDS flow control should retry to send message in this scenario. 
--- src/mds/mds_tipc_fctrl_portid.cc | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 724eb7b7b..e6e179669 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -17,6 +17,7 @@ #include "mds/mds_tipc_fctrl_portid.h" #include "base/ncssysf_def.h" +#include "base/osaf_time.h" #include "mds/mds_dt.h" #include "mds/mds_log.h" @@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // try to send a few pending msg DataMessage* msg = nullptr; uint16_t send_msg_cnt = 0; - while (send_msg_cnt++ < chunk_size_) { + while (send_msg_cnt < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { + send_msg_cnt++; msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " @@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { - break; + // If not retry, all messages are kept in queue + // and no more trigger to send messages + osaf_nanosleep(); + continue; } } } @@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, DataMessage* msg = sndqueue_.Find(Seq16(fseq)); if (msg != nullptr) { // Resend the msg found - if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { - msg->is_sent_ = true; + while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) { + // If not retry, all messages are kept in queue + // and no more trigger to send messages + osaf_nanosleep(); } + msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "RsndData[mseq:%u, mfrag:%u, fseq:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", ___ 
Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]
Hi Thuan, I'm thinking to retry 3 times with 100 ms in between, but you can decide it. Also, we need to ensure not to make the mds main receiving thread being blocked with the retry (on the flow of processing data). The retry in this patch is ok since it retries on the mds flow control thread, so it does not delay the mds main receiving thread. Thanks Minh On 27/11/19 2:40 pm, Tran Thuan wrote: Hi Minh, I think it's good if retry some times for normal Send(). Do you have any idea how many retries? Interval b/w tries? Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, November 27, 2019 10:30 AM To: thuan.tran ; thang . d . nguyen ; 'Nguyen Minh Vu' ; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123] Hi Thuan, The TipcPortId:Send is also called at a few other places, do you think it is good if we make a wrapper of TipcPortId::Send with a few retries on failures, says TipcPortId::TryToSend(), and call TryToSend() instead of Send()? Thanks Minh On 27/11/19 1:26 pm, thuan.tran wrote: When overflow happens, mds with flow control enabled may keep all messages in queue if it fails to send a message when receiving Nack or ChunkAck since no more trigger come after that. MDS flow control should retry to send message in this scenario. 
--- src/mds/mds_tipc_fctrl_portid.cc | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 724eb7b7b..e6e179669 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -17,6 +17,7 @@ #include "mds/mds_tipc_fctrl_portid.h" #include "base/ncssysf_def.h" +#include "base/osaf_time.h" #include "mds/mds_dt.h" #include "mds/mds_log.h" @@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // try to send a few pending msg DataMessage* msg = nullptr; uint16_t send_msg_cnt = 0; -while (send_msg_cnt++ < chunk_size_) { +while (send_msg_cnt < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { +send_msg_cnt++; msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " @@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { -break; +// If not retry, all messages are kept in queue +// and no more trigger to send messages +osaf_nanosleep(); +continue; } } } @@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, DataMessage* msg = sndqueue_.Find(Seq16(fseq)); if (msg != nullptr) { // Resend the msg found -if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { - msg->is_sent_ = true; +while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) { + // If not retry, all messages are kept in queue + // and no more trigger to send messages + osaf_nanosleep(); } +msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "RsndData[mseq:%u, mfrag:%u, fseq:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", ___ Opensaf-devel mailing 
list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]
Hi Thuan, The TipcPortId:Send is also called at a few other places, do you think it is good if we make a wrapper of TipcPortId::Send with a few retries on failures, says TipcPortId::TryToSend(), and call TryToSend() instead of Send()? Thanks Minh On 27/11/19 1:26 pm, thuan.tran wrote: When overflow happens, mds with flow control enabled may keep all messages in queue if it fails to send a message when receiving Nack or ChunkAck since no more trigger come after that. MDS flow control should retry to send message in this scenario. --- src/mds/mds_tipc_fctrl_portid.cc | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 724eb7b7b..e6e179669 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -17,6 +17,7 @@ #include "mds/mds_tipc_fctrl_portid.h" #include "base/ncssysf_def.h" +#include "base/osaf_time.h" #include "mds/mds_dt.h" #include "mds/mds_log.h" @@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // try to send a few pending msg DataMessage* msg = nullptr; uint16_t send_msg_cnt = 0; -while (send_msg_cnt++ < chunk_size_) { +while (send_msg_cnt < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { +send_msg_cnt++; msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " @@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { -break; +// If not retry, all messages are kept in queue +// and no more trigger to send messages +osaf_nanosleep(); +continue; } } } @@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, DataMessage* msg = 
sndqueue_.Find(Seq16(fseq)); if (msg != nullptr) { // Resend the msg found -if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { - msg->is_sent_ = true; +while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) { + // If not retry, all messages are kept in queue + // and no more trigger to send messages + osaf_nanosleep(); } +msg->is_sent_ = true; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "RsndData[mseq:%u, mfrag:%u, fseq:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Avoid message re-allocation [#3089]
Hi Vu, Thuan, The patch misses the error cases and the kDisabled state. I rework for the V2. Thanks Minh On 25/11/19 6:44 pm, Nguyen Minh Vu wrote: Hi Minh, Ack with comments inline. Regards, Vu On 11/25/19 1:12 PM, Minh Chau wrote: The patch avoids message reallocation if enable MDS_TIPC_FCTRL_ENABLED --- src/mds/mds_dt_tipc.c | 27 --- src/mds/mds_tipc_fctrl_msg.cc | 2 +- src/mds/mds_tipc_fctrl_portid.cc | 9 +++-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index fdf0da7..aa8d5c2 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -2644,7 +2644,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) if (req->snd_type == MDS_SENDTYPE_ACK || req->snd_type == MDS_SENDTYPE_RACK) { uint8_t len = sum_mds_hdr_plus_mdtm_hdr_plus_len; - uint8_t buffer_ack[len]; + uint8_t* buffer_ack = calloc(1, len); [Vu] Below this allocation, there are several error handlings, but not free memory before returning. Is that expected? /* Add mds_hdr */ if (NCSCC_RC_SUCCESS != @@ -2667,7 +2667,11 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) m_MDS_LOG_DBG( "MDTM:Sending message with Service Seqno=%d, TO Dest_Tipc_id=<0x%08x:%u> ", req->svc_seq_num, tipc_id.node, tipc_id.ref); - return mdtm_sendto(buffer_ack, len, tipc_id); + status = mdtm_sendto(buffer_ack, len, tipc_id); + if (gl_mds_pro_ver != MDS_PROT_FCTRL) { + free(buffer_ack); + } [Vu] Above allocation does not stick with `MDS_PROT_FCTRL` check, so if the above condition check gets failure, the allocated memory is leaked? 
+ return status; } if (MDS_ENC_TYPE_FLAT == req->msg.encoding) { @@ -2815,6 +2819,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) free(body); return NCSCC_RC_FAILURE; } + m_MMGR_FREE_BUFR_LIST(usrbuf); + free(body); } else { if (NCSCC_RC_SUCCESS != mdtm_sendto(body, len, tipc_id)) { @@ -2824,9 +2830,12 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) free(body); return NCSCC_RC_FAILURE; } + if (gl_mds_pro_ver != MDS_PROT_FCTRL) { + m_MMGR_FREE_BUFR_LIST(usrbuf); + free(body); + } } - m_MMGR_FREE_BUFR_LIST(usrbuf); - free(body); + return NCSCC_RC_SUCCESS; } } break; @@ -2909,7 +2918,9 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) mds_free_direct_buff( req->msg.data.buff_info.buff); } - free(body); + if (gl_mds_pro_ver != MDS_PROT_FCTRL) { + free(body); + } return NCSCC_RC_SUCCESS; } break; @@ -3059,21 +3070,23 @@ uint32_t mdtm_frag_and_send(MDTM_SEND_REQ *req, uint32_t seq_num, get_svc_names(req->src_svc_id), req->src_svc_id, get_svc_names(req->dest_svc_id), req->dest_svc_id); ret = mdtm_mcast_sendto(body, len_buf, req); + free(body); } else { m_MDS_LOG_DBG( "MDTM:Sending message with Service Seqno=%d, Fragment Seqnum=%d, frag_num=%d, TO Dest_Tipc_id=<0x%08x:%u>", req->svc_seq_num, seq_num, frag_val, id.node, id.ref); ret = mdtm_sendto(body, len_buf, id); + if (gl_mds_pro_ver != MDS_PROT_FCTRL) { + free(body); + } } if (ret != NCSCC_RC_SUCCESS) { // Failed to send a fragmented msg, stop sending m_MMGR_FREE_BUFR_LIST(usrbuf); - free(body); break; } m_MMGR_REMOVE_FROM_START(, len_buf - hdr_plus); - free(body); len = len - (len_buf - hdr_plus); if (len == 0) break; diff --git a/src/mds/mds_tipc_fctrl_msg.cc b/src/mds/mds_tipc_fctrl_msg.cc index 454c02c..0f9fd09 100644 --- a/src/mds/mds_tipc_fctrl_msg.cc +++ b/src/mds/mds_tipc_fctrl_msg.cc @@ -138,7 +138,7 @@ void DataMessage::Decode(uint8_t *msg) { DataMessage::~DataMessage() { if (msg_data_ != nullptr) { - delete[] msg_data_; + free(msg_data_); msg_data_ = nullptr; } } diff --git 
a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 724eb7b..08e8dce 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++
Re: [devel] [PATCH 1/1] amfd: not accept lock-in admin op if presence msg not processed [#3121]
Hi Thang, Instead of adding is_presence_msg_processed, which requires a checkpoint to standby, can we make it as a function (or might be *if* statement) that utilizes the pres_state and term_state to (dis)allow the lock-in op? Thanks Minh On 25/11/19 5:50 pm, thang.d.nguyen wrote: AMFD should not accept lock-in admin op on SU if the presence msg has already sent to that SU. --- src/amf/amfd/sgproc.cc | 1 + src/amf/amfd/su.cc | 13 + src/amf/amfd/su.h | 2 ++ 3 files changed, 16 insertions(+) diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc index ddd825d44..8aeb9ec3c 100644 --- a/src/amf/amfd/sgproc.cc +++ b/src/amf/amfd/sgproc.cc @@ -2126,6 +2126,7 @@ uint32_t avd_sg_app_su_inst_func(AVD_CL_CB *cb, AVD_SG *sg) { } } else { if (avd_snd_presence_msg(cb, i_su, false) == NCSCC_RC_SUCCESS) { + i_su->is_presence_msg_processed = true; num_try_insvc_su++; } } diff --git a/src/amf/amfd/su.cc b/src/amf/amfd/su.cc index 8c8ef9d4f..494022893 100644 --- a/src/amf/amfd/su.cc +++ b/src/amf/amfd/su.cc @@ -51,6 +51,7 @@ void AVD_SU::initialize() { term_state = false; su_switch = AVSV_SI_TOGGLE_STABLE; su_is_external = false; + is_presence_msg_processed = false; su_act_state = 0; sg_of_su = nullptr; su_on_node = nullptr; @@ -810,6 +811,12 @@ void AVD_SU::set_pres_state(SaAmfPresenceStateT pres_state) { */ return; + if ((pres_state == SA_AMF_PRESENCE_INSTANTIATED) || + (pres_state == SA_AMF_PRESENCE_INSTANTIATION_FAILED) || + (pres_state == SA_AMF_PRESENCE_TERMINATION_FAILED)) { +this->is_presence_msg_processed = false; + } + osafassert(pres_state <= SA_AMF_PRESENCE_TERMINATION_FAILED); TRACE_ENTER2("'%s' %s => %s", name.c_str(), avd_pres_state_name[saAmfSUPresenceState], @@ -1085,6 +1092,12 @@ void AVD_SU::lock_instantiation(SaImmOiHandleT immoi_handle, goto done; } + if (is_presence_msg_processed == true) { +report_admin_op_error(immoi_handle, invocation, SA_AIS_ERR_TRY_AGAIN, + nullptr, "'%s' instantiate not done", name.c_str()); +goto done; + } + if (list_of_susi != 
nullptr) { report_admin_op_error(immoi_handle, invocation, SA_AIS_ERR_TRY_AGAIN, nullptr, "SIs still assigned to this SU '%s'", diff --git a/src/amf/amfd/su.h b/src/amf/amfd/su.h index 7afc5abee..722c68b9c 100644 --- a/src/amf/amfd/su.h +++ b/src/amf/amfd/su.h @@ -87,6 +87,8 @@ class AVD_SU { bool su_is_external; /* indicates if this SU is external */ + bool is_presence_msg_processed; /* indicate inst msg sent to nd */ + int su_act_state; // not used, kept for EDU, remove later bool wait_for_contained_to_quiesce; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: fix memleak in code and test [#1860]
Hi Thuan ack (review only) Thanks Minh On 19/11/19 5:49 pm, thuan.tran wrote: --- src/mds/apitest/mdstipc.h | 2 +- src/mds/apitest/mdstipc_api.c | 134 +++-- src/mds/apitest/mdstipc_conf.c | 9 ++- src/mds/mds_c_sndrcv.c | 1 + src/mds/mds_tipc_fctrl_intf.cc | 4 +- 5 files changed, 88 insertions(+), 62 deletions(-) diff --git a/src/mds/apitest/mdstipc.h b/src/mds/apitest/mdstipc.h index 5fd7b9c6e..b56940ea6 100644 --- a/src/mds/apitest/mdstipc.h +++ b/src/mds/apitest/mdstipc.h @@ -203,7 +203,7 @@ uint32_t destroy_pwe_on_vdest(MDS_HDL); /** USER DEFINED WRAPPERS FOR MDS SERVICE APIs **/ -uint32_t tet_create_task(NCS_OS_CB, NCSCONTEXT); +uint32_t tet_create_task(NCS_OS_CB, NCSCONTEXT*); uint32_t tet_release_task(void *task_handle); int is_adest_sel_obj_found(int); int is_sel_obj_found(int); diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c index 651365e95..847f9a7f1 100644 --- a/src/mds/apitest/mdstipc_api.c +++ b/src/mds/apitest/mdstipc_api.c @@ -398,7 +398,7 @@ void tet_svc_install_tp_10() printf( "\nTest case 10:Installing the External MIN service EXTMIN in a seperate thread and Uninstalling it here\n"); // Install thread - rc = tet_create_task((NCS_OS_CB)tet_vdest_install_thread, t_handle); + rc = tet_create_task((NCS_OS_CB)tet_vdest_install_thread, &t_handle); if (rc != NCSCC_RC_SUCCESS) { printf("\nFail to Install thread\n"); FAIL = 1; @@ -999,7 +999,7 @@ void tet_svc_unstall_tp_5() // Uninstalling the above service in a seperate thread // Uninstall thread rc = tet_create_task((NCS_OS_CB)tet_vdest_uninstall_thread, -gl_tet_vdest[0].svc[0].task.t_handle); +&gl_tet_vdest[0].svc[0].task.t_handle); if (rc != NCSCC_RC_SUCCESS) { printf("\nFail to create the uninstall thread\n"); FAIL = 1; @@ -2141,12 +2141,18 @@ void cleanup_ADEST_srv() { int id; printf("\nUninstalling all the services on this ADESt\n"); - for (id = gl_tet_adest.svc_count - 1; id >= 0; id--) + for (id = gl_tet_adest.svc_count - 1; id >= 0; id--) { + if 
(mds_service_retrieve(gl_tet_adest.mds_pwe1_hdl, +gl_tet_adest.svc[id].svc_id, +SA_DISPATCH_ALL) != NCSCC_RC_SUCCESS) { + printf("Adest Svc Retrieve Fail\n"); + } if (mds_service_uninstall(gl_tet_adest.mds_pwe1_hdl, gl_tet_adest.svc[id].svc_id) != NCSCC_RC_SUCCESS) { printf("\nFail mds_service_uninstall\n"); } + } } void tet_svc_subscr_ADEST_1() @@ -2441,7 +2447,7 @@ void tet_svc_subscr_ADEST_8() } printf("\nAction: Cancel in a seperate thread\n"); if (tet_create_task((NCS_OS_CB)tet_adest_cancel_thread, - gl_tet_adest.svc[0].task.t_handle) == + &gl_tet_adest.svc[0].task.t_handle) == NCSCC_RC_SUCCESS) { printf("\nTask has been Created\n"); fflush(stdout); @@ -2547,7 +2553,7 @@ void tet_svc_subscr_ADEST_10() printf("\nAction: Retrieve in a seperate thread\n"); /*Retrieve thread*/ if (tet_create_task((NCS_OS_CB)tet_adest_retrieve_thread, - gl_tet_adest.svc[0].task.t_handle) == + &gl_tet_adest.svc[0].task.t_handle) == NCSCC_RC_SUCCESS) { printf("\nTask has been Created\n"); fflush(stdout); @@ -2751,7 +2757,10 @@ uint32_t tet_cleanup_setup() printf("Fail mds_service_retrieve\n"); FAIL = 1; } - + if (gl_rcvdmsginfo.msg) { + free(gl_rcvdmsginfo.msg); + gl_rcvdmsginfo.msg = NULL; + } if (mds_service_uninstall( gl_tet_vdest[i].mds_pwe1_hdl, gl_tet_vdest[i].svc[id].svc_id) != @@ -2785,6 +2794,10 @@ uint32_t tet_cleanup_setup() printf("Adest Svc Retrieve Fail\n"); FAIL = 1; } + if (gl_rcvdmsginfo.msg) { + free(gl_rcvdmsginfo.msg); + gl_rcvdmsginfo.msg = NULL; + } if (mds_service_uninstall(gl_tet_adest.mds_pwe1_hdl, i) != NCSCC_RC_SUCCESS) { printf("Adest Svc Uninstall
Re: [devel] [PATCH 1/1] ntf: Fix coding issues identified by codechecker [#3114]
Hi Thuan ack from me. Thanks Minh On 4/11/19 6:42 pm, thuan.tran wrote: --- src/ntf/agent/ntfa_api.c | 29 +++-- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/ntf/agent/ntfa_api.c b/src/ntf/agent/ntfa_api.c index 417c9d688..e89479bf6 100644 --- a/src/ntf/agent/ntfa_api.c +++ b/src/ntf/agent/ntfa_api.c @@ -1379,30 +1379,31 @@ SaAisErrorT recoverClient(ntfa_client_hdl_rec_t *client_hdl) if ((rc = reinitializeClient(client_hdl)) == SA_AIS_OK) { /* Restore reader */ ntfa_reader_hdl_rec_t *reader_hdl = client_hdl->reader_list; - while (reader_hdl != NULL && rc == SA_AIS_OK) { + while (reader_hdl != NULL) { rc = recoverReader(client_hdl, reader_hdl); + if (rc != SA_AIS_OK) { + TRACE("Failed to restore reader (readerId:%d)", + reader_hdl->reader_id); + goto done; + } reader_hdl = reader_hdl->next; } - if (rc != SA_AIS_OK) { - TRACE("Failed to restore reader (readerId:%d)", - reader_hdl->reader_id); - goto done; - } /* Restore subscriber */ ntfa_subscriber_list_t *subscriber_hdl = subscriberNoList; - while (subscriber_hdl != NULL && rc == SA_AIS_OK) { + while (subscriber_hdl != NULL) { if (client_hdl->local_hdl == - subscriber_hdl->subscriberListNtfHandle) + subscriber_hdl->subscriberListNtfHandle) { rc = recoverSubscriber(client_hdl, subscriber_hdl); + if (rc != SA_AIS_OK) { + TRACE( + "Failed to restore subscriber (subscriptionId:%d)", + subscriber_hdl->subscriberListSubscriptionId); + goto done; + } + } subscriber_hdl = subscriber_hdl->next; } - if (rc != SA_AIS_OK) { - TRACE( - "Failed to restore subscriber (subscriptionId:%d)", - subscriber_hdl->subscriberListSubscriptionId); - goto done; - } client_hdl->valid = true; } else { TRACE("Failed to restore client (id:%d)", ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Fix coding issues identified by codechecker [#3112]
Hi Thuan ack from me. thanks Minh On 4/11/19 5:56 pm, thuan.tran wrote: --- src/mds/mds_c_db.c | 1 + src/mds/mds_c_sndrcv.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mds/mds_c_db.c b/src/mds/mds_c_db.c index 58f0e3aee..e1991517e 100644 --- a/src/mds/mds_c_db.c +++ b/src/mds/mds_c_db.c @@ -433,6 +433,7 @@ uint32_t mds_vdest_tbl_get_role(MDS_VDEST_ID vdest_id, V_DEST_RL *role) vdest_info = (MDS_VDEST_INFO *)ncs_patricia_tree_get( &gl_mds_mcm_cb->vdest_list, (uint8_t *)&vdest_id); if (vdest_info == NULL) { + *role = V_DEST_RL_INVALID; m_MDS_LOG_DBG("MDS:DB: VDEST not present"); m_MDS_LEAVE(); return NCSCC_RC_FAILURE; diff --git a/src/mds/mds_c_sndrcv.c b/src/mds/mds_c_sndrcv.c index 7850ac714..0dc76eef4 100644 --- a/src/mds/mds_c_sndrcv.c +++ b/src/mds/mds_c_sndrcv.c @@ -2319,7 +2319,7 @@ static uint32_t mcm_query_for_node_dest(MDS_DEST adest, uint8_t *to) *to = DESTINATION_SAME_PROCESS; else *to = DESTINATION_ON_NODE; - } else if (dest_node_id != src_node_id) { + } else { *to = DESTINATION_OFF_NODE; } return NCSCC_RC_SUCCESS; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]
Hi Thuan, I add one comment inline for explanation. Thanks Minh On 14/11/19 8:33 pm, Tran Thuan wrote: Hi Minh, I thought you will update check state of port id to know FCTRL or LEGACY. Since if (msg_len_ - fseq_ - 2 == MDTM_FRAG_HDR_LEN) may be not LEGACY protocol. [Minh] Yes, this case we can not tell whether it is FCTRL or LEGACY, thus the pro_ver_ remains UNDEFINED. In the mds_tipc_fctrl_rcv_data(), this UNDEFINED pro_ver_ fragment is forwarded to portid under the "if (header.IsFlowMessage() || header.IsUndefinedMessage())". The portid will skip this fragment if the state is kDisabled. In short, the fragment is forwarded to portid to check internally to follow the data flow, instead of checking the portid state inside message decoding which we need to refer the portid in mds_tipc_fctrl_msg.cc . Agree if (msg_len_ - fseq_ - 2 != MDTM_FRAG_HDR_LEN) 100% is FCTRL protocol. *Best Regards,* *ThuanTr*** *From:*Minh Hon Chau *Sent:* Thursday, November 14, 2019 4:28 PM *To:* Tran Thuan ; hans.nordeb...@ericsson.com; gary@dektech.com.au; vu.m.ngu...@dektech.com.au *Cc:* opensaf-devel@lists.sourceforge.net *Subject:* Re: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111] Hi Thuan, Are you happy with my reply? Thanks Minh On 14/11/19 9:35 am, Minh Hon Chau wrote: Hi Thuan, Please see my reply inline. Thanks Minh On 13/11/19 9:54 pm, Tran Thuan wrote: Hi Minh, See my comment inline. 
Best Regards, ThuanTr -Original Message- From: Minh Chau <mailto:minh.c...@dektech.com.au> Sent: Friday, November 8, 2019 5:33 PM To:hans.nordeb...@ericsson.com <mailto:hans.nordeb...@ericsson.com>;gary@dektech.com.au <mailto:gary@dektech.com.au>;vu.m.ngu...@dektech.com.au <mailto:vu.m.ngu...@dektech.com.au>;thuan.t...@dektech.com.au <mailto:thuan.t...@dektech.com.au> Cc:opensaf-devel@lists.sourceforge.net <mailto:opensaf-devel@lists.sourceforge.net>; Minh Chau <mailto:minh.c...@dektech.com.au> Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111] The legacy mds encodes the protocol version in either non fragment message or the first fragment only. Hence, the subsequent fragment after the first one is not able for mds to determine the protocol version. The patch maintains the encoding of lengthcheck as same as the legacy mds version. Also, the subsequent fragments needs to consult the stateful portid to determine the protocol version, so that the fragment will be skipped if it is sent from legacy mds, or inspected the sequence if it is sent from new mds. 
--- src/mds/mds_dt.h | 6 ++ src/mds/mds_dt_tipc.c | 11 ++- src/mds/mds_tipc_fctrl_intf.cc | 154 ++- src/mds/mds_tipc_fctrl_msg.cc | 86 +++--- src/mds/mds_tipc_fctrl_msg.h | 5 ++ src/mds/mds_tipc_fctrl_portid.cc | 23 ++ src/mds/mds_tipc_fctrl_portid.h | 1 + 7 files changed, 193 insertions(+), 93 deletions(-) diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index 64da600..007ff98 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* Unknown or undefined MDS protocol/version */ +#define MDS_PROT_UNDEFINED 0x00 + +/* MDS protocol/version for non flow control (legacy) */ +#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION) + /* MDS protocol/version for flow control */ #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) #define MDS_PROT_FCTRL_ID 0xFDAC13F5 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index e085de7..fdf0da7 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; -static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; +static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY; static int gl_mds_fctrl_acksize = -1; static int gl_mds_fctrl_ackto = -1; @@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKSIZE"); } } else { - gl_mds_pro_ver = MDS_
Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]
Hi Thuan, Are you happy with my reply? Thanks Minh On 14/11/19 9:35 am, Minh Hon Chau wrote: Hi Thuan, Please see my reply inline. Thanks Minh On 13/11/19 9:54 pm, Tran Thuan wrote: Hi Minh, See my comment inline. Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Friday, November 8, 2019 5:33 PM To:hans.nordeb...@ericsson.com;gary@dektech.com.au;vu.m.ngu...@dektech.com.au;thuan.t...@dektech.com.au Cc:opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111] The legacy mds encodes the protocol version in either non fragment message or the first fragment only. Hence, the subsequent fragment after the first one is not able for mds to determine the protocol version. The patch maintains the encoding of lengthcheck as same as the legacy mds version. Also, the subsequent fragments needs to consult the stateful portid to determine the protocol version, so that the fragment will be skipped if it is sent from legacy mds, or inspected the sequence if it is sent from new mds. 
--- src/mds/mds_dt.h | 6 ++ src/mds/mds_dt_tipc.c| 11 ++- src/mds/mds_tipc_fctrl_intf.cc | 154 ++- src/mds/mds_tipc_fctrl_msg.cc| 86 +++--- src/mds/mds_tipc_fctrl_msg.h | 5 ++ src/mds/mds_tipc_fctrl_portid.cc | 23 ++ src/mds/mds_tipc_fctrl_portid.h | 1 + 7 files changed, 193 insertions(+), 93 deletions(-) diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index 64da600..007ff98 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* Unknown or undefined MDS protocol/version */ +#define MDS_PROT_UNDEFINED 0x00 + +/* MDS protocol/version for non flow control (legacy) */ +#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION) + /* MDS protocol/version for flow control */ #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) #define MDS_PROT_FCTRL_ID 0xFDAC13F5 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index e085de7..fdf0da7 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; -static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; +static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY; static int gl_mds_fctrl_acksize = -1; static int gl_mds_fctrl_ackto = -1; @@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKSIZE"); } } else { - gl_mds_pro_ver = MDS_PROT | MDS_VERSION; + gl_mds_pro_ver = MDS_PROT_LEGACY; syslog(LOG_ERR, "MDTM:TIPC Invalid value of" "MDS_TIPC_FCTRL_ENABLED"); } @@ -3125,7 +3125,12 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, * hereafter, these 2 bytes will be used as sequence number in flow control * (per tipc portid) * */ - ncs_encode_16bit(, fctrl_seq_num); + if (gl_mds_pro_ver == MDS_PROT_FCTRL) { + ncs_encode_16bit(, fctrl_seq_num); + } else { + 
ncs_encode_16bit(, len - MDTM_FRAG_HDR_LEN - 2); + } + #endif return NCSCC_RC_SUCCESS; } diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index c9073b2..3d92290 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -132,8 +132,16 @@ uint32_t process_flow_event(const Event& evt) { portid = new TipcPortId(evt.id_, data_sock_fd, chunk_ack_size, sock_buf_size); portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; - rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, -evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); + if (evt.legacy_data_ == true) { +// we create portid and set state kDisabled even though we know +// this portid has no flow control. It is because the 2nd, 3rd fragment +// could not reflect the protocol version, so need to keep this portid +// remained stateful +portid->ChangeState(TipcPortId::State::kDisabled); + } else { +rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, + evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); + } } else if (evt.type_ == Event::Type::kEvtRcvIntro) { portid = new TipcPortId(evt.id_, data_sock_fd, chunk_ack_size, sock_buf_size); @@ -146,8 +154,12 @@ uint32_t process_flow_event(const Event& evt) { } } else { if (evt.type_ == Event::Type::kEvtRcvDa
Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]
Hi Thuan, ack from me. THanks Minh On 13/11/19 10:00 pm, thuan.tran wrote: When overload happens, sender will wait for chunkAck to continue sending more messages, it should send number of message equal chunkAck size of receiver. If not, receiver don't receive enough messages to send chunkAck and wait until timer timeout to send chunkAck to sender. This loop will make sender take very long time to sending all messages. --- src/mds/mds_tipc_fctrl_portid.cc | 14 ++ 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 3704baddb..bd1825446 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { ++sndwnd_.send_; +sndwnd_.nacked_space_ += length; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", @@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // the nacked_space_ of sender uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1), Seq16(fseq)); +assert(sndwnd_.nacked_space_ >= acked_bytes); sndwnd_.nacked_space_ -= acked_bytes; // try to send a few pending msg DataMessage* msg = nullptr; -uint64_t resend_bytes = 0; -while (resend_bytes < acked_bytes) { +uint16_t send_msg_cnt = 0; +while (send_msg_cnt++ < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { -if (resend_bytes < acked_bytes) { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { -sndwnd_.nacked_space_ += msg->header_.msg_len_; msg->is_sent_ = true; -resend_bytes += msg->header_.msg_len_; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" 
PRIu64 "]", id_.node, id_.ref, msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); + } else { +break; } -} else { - break; -} } } // no more unsent message, back to kEnabled ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]
Hi Thuan, Please see my reply inline. Thanks Minh On 13/11/19 9:54 pm, Tran Thuan wrote: Hi Minh, See my comment inline. Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Friday, November 8, 2019 5:33 PM To: hans.nordeb...@ericsson.com; gary@dektech.com.au; vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111] The legacy mds encodes the protocol version in either non fragment message or the first fragment only. Hence, the subsequent fragment after the first one is not able for mds to determine the protocol version. The patch maintains the encoding of lengthcheck as same as the legacy mds version. Also, the subsequent fragments needs to consult the stateful portid to determine the protocol version, so that the fragment will be skipped if it is sent from legacy mds, or inspected the sequence if it is sent from new mds. --- src/mds/mds_dt.h | 6 ++ src/mds/mds_dt_tipc.c| 11 ++- src/mds/mds_tipc_fctrl_intf.cc | 154 ++- src/mds/mds_tipc_fctrl_msg.cc| 86 +++--- src/mds/mds_tipc_fctrl_msg.h | 5 ++ src/mds/mds_tipc_fctrl_portid.cc | 23 ++ src/mds/mds_tipc_fctrl_portid.h | 1 + 7 files changed, 193 insertions(+), 93 deletions(-) diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index 64da600..007ff98 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* Unknown or undefined MDS protocol/version */ +#define MDS_PROT_UNDEFINED 0x00 + +/* MDS protocol/version for non flow control (legacy) */ +#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION) + /* MDS protocol/version for flow control */ #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) #define MDS_PROT_FCTRL_ID 0xFDAC13F5 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index e085de7..fdf0da7 100644 --- a/src/mds/mds_dt_tipc.c +++ 
b/src/mds/mds_dt_tipc.c @@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; -static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; +static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY; static int gl_mds_fctrl_acksize = -1; static int gl_mds_fctrl_ackto = -1; @@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKSIZE"); } } else { - gl_mds_pro_ver = MDS_PROT | MDS_VERSION; + gl_mds_pro_ver = MDS_PROT_LEGACY; syslog(LOG_ERR, "MDTM:TIPC Invalid value of" "MDS_TIPC_FCTRL_ENABLED"); } @@ -3125,7 +3125,12 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, * hereafter, these 2 bytes will be used as sequence number in flow control * (per tipc portid) * */ - ncs_encode_16bit(, fctrl_seq_num); + if (gl_mds_pro_ver == MDS_PROT_FCTRL) { + ncs_encode_16bit(, fctrl_seq_num); + } else { + ncs_encode_16bit(, len - MDTM_FRAG_HDR_LEN - 2); + } + #endif return NCSCC_RC_SUCCESS; } diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index c9073b2..3d92290 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -132,8 +132,16 @@ uint32_t process_flow_event(const Event& evt) { portid = new TipcPortId(evt.id_, data_sock_fd, chunk_ack_size, sock_buf_size); portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; - rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, -evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); + if (evt.legacy_data_ == true) { +// we create portid and set state kDisabled even though we know +// this portid has no flow control. 
It is because the 2nd, 3rd fragment +// could not reflect the protocol version, so need to keep this portid +// remained stateful +portid->ChangeState(TipcPortId::State::kDisabled); + } else { +rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, + evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); + } } else if (evt.type_ == Event::Type::kEvtRcvIntro) { portid = new TipcPortId(evt.id_, data_sock_fd, chunk_ack_size, sock_buf_size); @@ -146,8 +154,12 @@ uint32_t process_flow_event(const Event& evt) { } } else { if (evt.type_ == Event::Type::kEvtRcvData) { - rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, - evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); + if
Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]
Hi Thuan, Please see comment inline. Thanks Minh On 13/11/19 2:24 pm, Tran Thuan wrote: Hi Minh, Please check replies inline. Thanks. Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, November 13, 2019 10:05 AM To: Tran Thuan ; 'Nguyen Minh Vu' ; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119] Hi Thuan, Please see comment inline. Thanks Minh On 13/11/19 1:11 pm, Tran Thuan wrote: Hi Minh, Thanks for comments, please check my replies inline. Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, November 13, 2019 7:47 AM To: thuan.tran ; 'Nguyen Minh Vu' ; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119] Hi Thuan, Some comments inline. Thanks Minh On 12/11/19 5:04 pm, thuan.tran wrote: When overload happens, the sender waits for a chunkAck before it continues sending more messages; it should then send a number of messages equal to the chunkAck size of the receiver. If not, the receiver does not receive enough messages to send a chunkAck and waits until its timer times out before sending the chunkAck to the sender. This loop makes the sender take a very long time to send all messages. 
--- src/mds/mds_tipc_fctrl_portid.cc | 30 +++--- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 3704baddb..1fff4c855 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { ++sndwnd_.send_; +sndwnd_.nacked_space_ += length; [Minh] We haven't sent the msg out to wait for ack, thus nacked_space_ should not be increased m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", @@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // the nacked_space_ of sender uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1), Seq16(fseq)); +assert(sndwnd_.nacked_space_ >= acked_bytes); sndwnd_.nacked_space_ -= acked_bytes; // try to send a few pending msg DataMessage* msg = nullptr; -uint64_t resend_bytes = 0; -while (resend_bytes < acked_bytes) { +uint16_t send_msg_cnt = 0; +while (send_msg_cnt++ < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { -if (resend_bytes < acked_bytes) { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { -sndwnd_.nacked_space_ += msg->header_.msg_len_; [Minh] We now send it out and wait for acked, thus the nacked_space_ is increased here, so any reason moving the nacked_space_ from Queue() to here? [Thuan] Because the message could be in sndwnd (resend) either in sndqueue (send) Cannot increase nacked_space with resend message. I have tried another way to increase/decrease nacked_space dynamic but it become complex with markUnsent() since sender may receiver Nack for same msg > 2 times. [Minh] OK. 
msg->is_sent_ = true; -resend_bytes += msg->header_.msg_len_; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", id_.node, id_.ref, msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); + } else { +break; } -} else { - break; -} } } // no more unsent message, back to kEnabled [Minh] Agree, the new strategy to resend with chunk_size_ is better than with acked_bytes, it will increase transmission rate and not to depend on the timer [Thuan] Thanks @@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, fseq); return; } - if (state_ == State::kRcvBuffOverflow) { -sndqueue_.MarkUnsentFrom(Seq16(fseq)); -if (Seq16(fseq) - sndwnd_.acked_ > 1) { - m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " - "RcvNack[fseq:%u], " - "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], " - "queue[size:%" PRIu
Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]
Hi Thuan, Please see comment inline. Thanks Minh On 13/11/19 1:11 pm, Tran Thuan wrote: Hi Minh, Thanks for comments, please check my replies inline. Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, November 13, 2019 7:47 AM To: thuan.tran ; 'Nguyen Minh Vu' ; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119] Hi Thuan, Some comments inline. Thanks Minh On 12/11/19 5:04 pm, thuan.tran wrote: When overload happens, sender will wait for chunkAck to continue sending more messages, it should send number of message equal chunkAck size of receiver. If not, receiver don't receive enough messages to send chunkAck and wait until timer timeout to send chunkAck to sender. This loop will make sender take very long time to sending all messages. --- src/mds/mds_tipc_fctrl_portid.cc | 30 +++--- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 3704baddb..1fff4c855 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { ++sndwnd_.send_; +sndwnd_.nacked_space_ += length; [Minh] We haven't sent the msg out to wait for ack, thus nacked_space_ should not be increased m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", @@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // the nacked_space_ of sender uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1), Seq16(fseq)); +assert(sndwnd_.nacked_space_ >= acked_bytes); sndwnd_.nacked_space_ -= acked_bytes; // try to send a few pending msg DataMessage* msg = nullptr; -uint64_t resend_bytes = 0; -while 
(resend_bytes < acked_bytes) { +uint16_t send_msg_cnt = 0; +while (send_msg_cnt++ < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { -if (resend_bytes < acked_bytes) { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { -sndwnd_.nacked_space_ += msg->header_.msg_len_; [Minh] We now send it out and wait for acked, thus the nacked_space_ is increased here, so any reason moving the nacked_space_ from Queue() to here? [Thuan] Because the message could be in sndwnd (resend) either in sndqueue (send) Cannot increase nacked_space with resend message. I have tried another way to increase/decrease nacked_space dynamic but it become complex with markUnsent() since sender may receiver Nack for same msg > 2 times. [Minh] OK. msg->is_sent_ = true; -resend_bytes += msg->header_.msg_len_; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", id_.node, id_.ref, msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); + } else { +break; } -} else { - break; -} } } // no more unsent message, back to kEnabled [Minh] Agree, the new strategy to resend with chunk_size_ is better than with acked_bytes, it will increase transmission rate and not to depend on the timer [Thuan] Thanks @@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, fseq); return; } - if (state_ == State::kRcvBuffOverflow) { -sndqueue_.MarkUnsentFrom(Seq16(fseq)); -if (Seq16(fseq) - sndwnd_.acked_ > 1) { - m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " - "RcvNack[fseq:%u], " - "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], " - "queue[size:%" PRIu64 "], " - "Warning[Ignore Nack]", - id_.node, id_.ref, fseq, - sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_, - sndqueue_.Size()); - return; -} - } if (state_ != State::kRcvBuffOverflow) { state_ = 
State::kRcvBuffOverflow; m_MDS_LOG_NOTIFY("FCTRL: [node:%x, ref:%u] --> Overflow ", id_.node, id_.ref); -sndqueue_.MarkUnsentFrom(Seq16(fseq)); } + s
Re: [devel] [PATCH 0/3] Review Request for mds: Fix backward compatibility of mds fragmentation message [#3111]
Hi, Any comments on the patches? Otherwise I wish to push them in the next day or two. Thanks Minh On 8/11/19 9:33 pm, Minh Chau wrote: Summary: mds: Distinguish protocol version of fragment [#3111] Review request for Ticket(s): 3111 Peer Reviewer(s): Gary, Vu, Thuan Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3111 Base revision: ddb9d7065376df7757716013779755864d53ebe5 Personal repository: git://git.code.sf.net/u/minh-chau/review Impacted area Impact y/n Docs n Build system n RPM/packaging n Configuration files n Startup scripts n SAF services n OpenSAF services n Core libraries y Samples n Tests n Other n Comments (indicate scope for each "y" above): - *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** revision 2cb2d135827d920155323a70a9587264e5c62ae2 Author: Minh Chau Date: Fri, 8 Nov 2019 21:17:22 +1100 mds: Add backward compatibility mdstest for fragment [#3111] revision 153b657d2873019160f31a3091fa660e4e469a9e Author: Minh Chau Date: Fri, 8 Nov 2019 21:08:18 +1100 mds: Refactor logging [#3111] Since adding TipcPortId::ChangeState(), the patch refactors logging to shorten the code. revision 1ce0c74ca96fa028d02abe72932171e98c034342 Author: Minh Chau Date: Fri, 8 Nov 2019 20:51:54 +1100 mds: Distinguish protocol version of fragment [#3111] The legacy mds encodes the protocol version only in a non-fragmented message or in the first fragment. Hence, mds is not able to determine the protocol version of any fragment after the first one. The patch keeps the encoding of lengthcheck the same as in the legacy mds version. Also, the subsequent fragments need to consult the stateful portid to determine the protocol version, so that a fragment is skipped if it is sent from legacy mds, or has its sequence inspected if it is sent from new mds. 
Complete diffstat: -- src/mds/apitest/mdstipc_api.c| 83 +++-- src/mds/mds_dt.h | 6 ++ src/mds/mds_dt_tipc.c| 11 ++- src/mds/mds_tipc_fctrl_intf.cc | 154 ++- src/mds/mds_tipc_fctrl_msg.cc| 86 +++--- src/mds/mds_tipc_fctrl_msg.h | 5 ++ src/mds/mds_tipc_fctrl_portid.cc | 94 +++- src/mds/mds_tipc_fctrl_portid.h | 1 + 8 files changed, 292 insertions(+), 148 deletions(-) Testing Commands: - mdstest Testing, Expected Results: -- all tests pass Conditions of Submission: - *** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC *** Arch Built Started Linux distro --- mips n n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content in a single commit. 
___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as threaded commits, or place in a public tree for a pull. ___ You have resent this content multiple times without a clear indication of what has changed between each re-send. ___ You have
Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]
Hi Thuan, Some comments inline. Thanks Minh On 12/11/19 5:04 pm, thuan.tran wrote: When overload happens, sender will wait for chunkAck to continue sending more messages, it should send number of message equal chunkAck size of receiver. If not, receiver don't receive enough messages to send chunkAck and wait until timer timeout to send chunkAck to sender. This loop will make sender take very long time to sending all messages. --- src/mds/mds_tipc_fctrl_portid.cc | 30 +++--- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 3704baddb..1fff4c855 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); } else { ++sndwnd_.send_; +sndwnd_.nacked_space_ += length; [Minh] We haven't sent the msg out to wait for ack, thus nacked_space_ should not be increased m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", @@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t chksize) { // the nacked_space_ of sender uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1), Seq16(fseq)); +assert(sndwnd_.nacked_space_ >= acked_bytes); sndwnd_.nacked_space_ -= acked_bytes; // try to send a few pending msg DataMessage* msg = nullptr; -uint64_t resend_bytes = 0; -while (resend_bytes < acked_bytes) { +uint16_t send_msg_cnt = 0; +while (send_msg_cnt++ < chunk_size_) { // find the lowest sequence unsent yet msg = sndqueue_.FirstUnsent(); if (msg == nullptr) { break; } else { -if (resend_bytes < acked_bytes) { if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) { -sndwnd_.nacked_space_ += msg->header_.msg_len_; [Minh] We now send it out and wait for acked, thus the nacked_space_ is 
increased here, so any reason moving the nacked_space_ from Queue() to here? msg->is_sent_ = true; -resend_bytes += msg->header_.msg_len_; m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], " "SndQData[fseq:%u, len:%u], " "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", id_.node, id_.ref, msg->header_.fseq_, msg->header_.msg_len_, sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_); + } else { +break; } -} else { - break; -} } } // no more unsent message, back to kEnabled [Minh] Agree, the new strategy to resend with chunk_size_ is better than with acked_bytes, it will increase transmission rate and not to depend on the timer @@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag, fseq); return; } - if (state_ == State::kRcvBuffOverflow) { -sndqueue_.MarkUnsentFrom(Seq16(fseq)); -if (Seq16(fseq) - sndwnd_.acked_ > 1) { - m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " - "RcvNack[fseq:%u], " - "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], " - "queue[size:%" PRIu64 "], " - "Warning[Ignore Nack]", - id_.node, id_.ref, fseq, - sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_, - sndqueue_.Size()); - return; -} - } if (state_ != State::kRcvBuffOverflow) { state_ = State::kRcvBuffOverflow; m_MDS_LOG_NOTIFY("FCTRL: [node:%x, ref:%u] --> Overflow ", id_.node, id_.ref); -sndqueue_.MarkUnsentFrom(Seq16(fseq)); } + sndqueue_.MarkUnsentFrom(Seq16(fseq)); [Minh] I have a doubt with this change in ReceiveNack(), so every Nack will trigger a retransmission on the Nacked sequence even though we are already in kRcvBufferOverFlow state. This will increase the "unexpected retransmission" error rate. On reception of 2nd-Nack, 3rd-Nack, we already moved into kRcvBufferOverFlow state, we don't need to resend the 2nd-Nack, 3rd-Nack as we already did at the 1st-Nack. 
Only mark it as Unsent, the actual retransmission of 2nd-Nack, 3rd-Nack, is done in the loop ReceiveChunkAck() as you have improved in this patch, that will keep msg in order at receivers. So any reason for this change? DataMessage* msg = sndqueue_.Find(Seq16(fseq)); if (msg != nullptr) { // Resend the msg found ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Unset flow control env var [#3109]
Hi Vu, Please find comments below. Thanks Minh On 31/10/19 6:15 pm, Nguyen Minh Vu wrote: Hi Minh, Ack with minor comments. Regards, Vu On 10/31/19 11:55 AM, Minh Chau wrote: Patch unsets MDS_TIPC_FCTRL_ENABLED, MDS_TIPC_FCTRL_ACKTIMEOUT, and MDS_TIPC_FCTRL_ACKSIZE to prevent child process inheritance. --- src/mds/mds_dt_tipc.c | 39 +-- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index e7a7b48..096e4ca 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -167,6 +167,8 @@ uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; +int gl_mds_fctrl_acksize = -1; +int gl_mds_fctrl_ackto = -1; [Vu] Should we declare these ones as static variables if they are only used in this file ? [M]: Yes, make them static static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) { struct sockaddr_tipc addr; @@ -347,32 +349,49 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) if ((ptr = getenv("MDS_TIPC_FCTRL_ENABLED")) != NULL) { if (atoi(ptr) == 1) { gl_mds_pro_ver = MDS_PROT_FCTRL; - int ackto = -1; - int acksize = -1; if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) { - ackto = atoi(ptr); - if (ackto == 0) { + gl_mds_fctrl_ackto = atoi(ptr); + if (gl_mds_fctrl_ackto == 0) { syslog(LOG_ERR, "MDTM:TIPC Invalid " "MDS_TIPC_FCTRL_ACKTIMEOUT, using default value"); - ackto = -1; + gl_mds_fctrl_ackto = -1; } } if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) { - acksize = atoi(ptr); - if (acksize == 0) { + gl_mds_fctrl_acksize = atoi(ptr); + if (gl_mds_fctrl_acksize == 0) { syslog(LOG_ERR, "MDTM:TIPC Invalid " "MDS_TIPC_FCTRL_ACKSIZE, using default value"); - acksize = -1; + gl_mds_fctrl_acksize = -1; } } - mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval, - ackto, acksize, tipc_mcast_enabled); + /* unset the env var to prevent child process inheritance */ + if 
(unsetenv("MDS_TIPC_FCTRL_ENABLED") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ENABLED"); + } + if (gl_mds_fctrl_ackto != -1 && + unsetenv("MDS_TIPC_FCTRL_ACKTIMEOUT") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKTIMEOUT"); + } + if (gl_mds_fctrl_acksize != -1 && + unsetenv("MDS_TIPC_FCTRL_ACKSIZE") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKSIZE"); + } } else { + gl_mds_pro_ver = MDS_PROT | MDS_VERSION; [Vu] This line may be not necessary as the default value of gl_mds_pro_ver is `MDS_PROT | MDS_VERSION`. [M] It may be invalid value by setenv() in the scenario you suggested: Init/Finalize/Init with setenv(invalid value). syslog(LOG_ERR, "MDTM:TIPC Invalid value of" "MDS_TIPC_FCTRL_ENABLED"); } } + if (gl_mds_pro_ver == MDS_PROT_FCTRL) { + mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval, + gl_mds_fctrl_ackto, gl_mds_fctrl_acksize, tipc_mcast_enabled); + } + /* Create a task to receive the events and data */ if (mdtm_create_rcv_task(tipc_cb.hdle_mdtm) != NCSCC_RC_SUCCESS) { syslog(LOG_ERR, ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mds: Unset flow control env var [#3109]
Hi Vu, I think users will lose the value set in env var if users repeat Init/Finalize/Init. I have sent out V2. Thanks Minh On 31/10/19 2:33 pm, Nguyen Minh Vu wrote: Hi Minh, Ack with one question. What happens if user does following sequence: 1) Init service handle --> Have this env variable set, then unset later on. 2) Finalize the handle 3) Init service handle --> I am not sure if previous unset has any affects to tipc flow control from this point e.g. has tipc flow control been disabled from previous unset? Regards, Vu On 10/31/19 5:32 AM, Minh Chau wrote: Patch unsets MDS_TIPC_FCTRL_ENABLED, MDS_TIPC_FCTRL_ACKTIMEOUT, and MDS_TIPC_FCTRL_ACKSIZE to prevent child process inheritance. --- src/mds/mds_dt_tipc.c | 13 + 1 file changed, 13 insertions(+) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index e7a7b48..12b275d 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -367,6 +367,19 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) } mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval, ackto, acksize, tipc_mcast_enabled); + /* unset the env var to prevent child process inheritance */ + if (unsetenv("MDS_TIPC_FCTRL_ENABLED") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ENABLED"); + } + if (ackto != -1 && unsetenv("MDS_TIPC_FCTRL_ACKTIMEOUT") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKTIMEOUT"); + } + if (acksize != -1 && unsetenv("MDS_TIPC_FCTRL_ACKSIZE") != 0) { + syslog(LOG_ERR, + "MDTM:TIPC Failed to unset MDS_TIPC_FCTRL_ACKSIZE"); + } } else { syslog(LOG_ERR, "MDTM:TIPC Invalid value of" "MDS_TIPC_FCTRL_ENABLED"); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] dtm: rotate logtraces on demand [#3086]
Hi Vu, ack from me. Thanks Minh On 21/10/19 5:51 pm, Nguyen Minh Vu wrote: Hi, Any comments on this patch? I will push it by this week if no comments. Regards, Vu On 10/4/19 5:30 PM, Vu Minh Nguyen wrote: Introducing a new option '--rotate' to rotate given logtrace stream(s). This patch also cleans the code of LogServer::ExecuteCommand(). --- src/base/log_writer.h | 2 +- src/dtm/tools/osaflog.cc | 25 ++- src/dtm/transport/log_server.cc | 125 +--- src/dtm/transport/log_server.h | 11 ++- 4 files changed, 115 insertions(+), 48 deletions(-) diff --git a/src/base/log_writer.h b/src/base/log_writer.h index 0a03e9253..ab2bf32ae 100644 --- a/src/base/log_writer.h +++ b/src/base/log_writer.h @@ -45,13 +45,13 @@ class LogWriter { void Write(size_t size); void Write(const char* bytes, size_t size); void Flush(); + void RotateLog(); void SetLogFile(const std::string& log_file) { log_file_ = log_file; } private: constexpr static const size_t kBufferSize = 128 * size_t{1024}; void Open(); void Close(); - void RotateLog(); std::string log_file(size_t backup) const; diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc index 4e0956aa2..f6fa16801 100644 --- a/src/dtm/tools/osaflog.cc +++ b/src/dtm/tools/osaflog.cc @@ -54,6 +54,7 @@ base::UnixServerSocket* CreateSocket(); uint64_t Random64Bits(uint64_t seed); bool PrettyPrint(const std::string& log_stream); bool Delete(const std::string& log_stream); +bool Rotate(const std::string& log_stream); std::list OpenLogFiles(const std::string& log_stream); std::string PathName(const std::string& log_stream, int suffix); uint64_t GetInode(int fd); @@ -70,6 +71,7 @@ int main(int argc, char** argv) { {"flush", no_argument, 0, 'f'}, {"print", no_argument, nullptr, 'p'}, {"delete", no_argument, nullptr, 'd'}, + {"rotate", no_argument, nullptr, 'r'}, {"extract-trace", required_argument, 0, 'e'}, {"max-idle", required_argument, 0, 'i'}, {0, 0, 0, 0}}; @@ -83,12 +85,14 @@ int main(int argc, char** argv) { bool flush_result = true; bool 
print_result = true; bool delete_result = true; + bool rotate_result = true; bool max_file_size_result = true; bool number_of_backups_result = true; bool max_idle_result = true; bool flush_set = false; bool pretty_print_set = false; bool delete_set = false; + bool rotate_set = false; bool max_file_size_set = false; bool max_backups_set = false; bool max_idle_set = false; @@ -101,7 +105,7 @@ int main(int argc, char** argv) { exit(EXIT_FAILURE); } - while ((option = getopt_long(argc, argv, "m:b:p:f:e:", + while ((option = getopt_long(argc, argv, "m:b:p:f:e:i:r", long_options, _index)) != -1) { switch (option) { case 'p': @@ -114,6 +118,9 @@ int main(int argc, char** argv) { case 'f': flush_set = true; break; + case 'r': + rotate_set = true; + break; case 'm': max_file_size = base::StrToUint64(optarg, _file_size_set); @@ -164,12 +171,12 @@ int main(int argc, char** argv) { if (thread_trace) exit(ExtractTrace(input_core, output_trace)); - if (argc > optind && !pretty_print_set && !delete_set) { + if (argc > optind && !pretty_print_set && !delete_set && !rotate_set) { pretty_print_set = true; flush_set = true; } - if ((argc <= optind && (pretty_print_set || delete_set)) || + if ((argc <= optind && (pretty_print_set || delete_set || rotate_set)) || (pretty_print_set && delete_set)) { PrintUsage(argv[0]); exit(EXIT_FAILURE); @@ -188,6 +195,11 @@ int main(int argc, char** argv) { delete_result = Delete(argv[optind++]); } } + if (rotate_set == true) { + while (rotate_result && optind < argc) { + rotate_result = Rotate(argv[optind++]); + } + } if (max_backups_set == true) { number_of_backups_result = NoOfBackupFiles(max_backups); } @@ -197,7 +209,7 @@ int main(int argc, char** argv) { if (max_idle_set == true) { max_idle_result = SetMaxIdleTime(max_idle); } - if (flush_result && print_result && max_file_size_result && + if (flush_result && print_result && max_file_size_result && rotate_result && delete_result && number_of_backups_result && max_idle_result) 
exit(EXIT_SUCCESS); exit(EXIT_FAILURE); @@ -224,6 +236,7 @@ void PrintUsage(const char* program_name) { "--delete Delete the specified LOGSTREAM(s) by\n" " removing allocated resources in the log\n" " server. Does not
Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]
Hi Thuan, Please see comment inline Thanks Minh On 23/10/19 3:32 pm, Tran Thuan wrote: Hi Minh, Thanks for comments. See my response inline. Btw, I am preparing to send out new patch, I think I found an issue in current patch. Best Regards, ThuanTr -Original Message- From: Minh Hon Chau Sent: Wednesday, October 23, 2019 5:52 AM To: Tran Thuan ; 'Nguyen Minh Vu' ; hans.nordeb...@ericsson.com; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102] Hi Thuan, I wonder the patch would work in the same reproduced steps if the both adests have subscribed each other more than 2 services. The svc_cnt will be greater than 1 until it is the last service down event. I think that's why mds has the database @subtn_results, in which each item is an adest associated with a service id separately. [Thuan] We can understand that adest still alive, then go with origin flow (wait 1.5s). But can a process send SNDRSP then mds unregister? I think it cannot, because it’s in SNDRSP (blocking) [M] Not unregister, it can be unsubscribe. Or do you mean a process can not send two SNDRSP at the same time on 2 different subscribed services? The scenario of this ticket happens for the process terminated/crash. [M] Yes my doubt is in the context of this ticket - terminated/crash - you would get 2 service down event I think [M] I don't think adding a new database at the global scope for this specific case is a good idea, if we can reuse the existing database. Can you try to use MDS_SUBSCRIPTION_INFO, add a flag or something similar to indicate which case mds should wait 1.5 sec. It would isolate the bug fix in the scope of this problem. The problem originally resides at the services code e.g ntf, imm... where the threads structure between mds receiving thread and main thread cause a race condition. 
Thus the service sends a message with a death adest which is removed from mds database, that confuses mds and hit 1.5 secs wait time. If I read the code correctly, the 1.5 wait time is for another case, it gives another chance to wait 1.5 when the subscription result is empty in @subtn_results because the service up has not arrived yet. [Thuan] Yes, it will give a chance if adest not yet UP any. My patch still keep that chance as origin code. But I think I need reduce timeout for adest down timer, I am verifying this change. mds subscribe > mds sends message A x mds wait 1.5 sec mds <--- service up mds sends message A > So the 1.5 sec time is for early phase of waiting service up. } else if (sub_info->tmr_flag != true) { if ((MDS_SENDTYPE_RSP == req->i_sendtype) || (MDS_SENDTYPE_RRSP == req->i_sendtype)) { time_wait = true; m_MDS_LOG_INFO( "MDS_SND_RCV:Disc queue: Subscr exists no timer running: Waiting for some time\n"); -> I think at this line, it means: the SUBSCRIPTION_TMR has timeout, and mds is sending RSP/RRSP which means mds should have received the *request* message (?), so mds wants to wait for another 1.5 second for service_up to create the subscription result in database. The problem in this ticket hit 1.5 because the service down has already come and mds removed the subscription result item, now the ntf, imm... sends msg with a death adest, and mds now it thinks it is waiting for a service up to come as at the early phase, so it waits. Both two scenarios can be distinguished themselves to avoid to wait 1.5 secs for the latter case I think. Thanks Minh On 22/10/19 9:50 pm, Tran Thuan wrote: > Hi Vu, > > Thanks for additional comments. > I reply your concerns inline below. 
> > Best Regards, > ThuanTr > > -Original Message- > From: Nguyen Minh Vu <mailto:vu.m.ngu...@dektech.com.au>> > Sent: Tuesday, October 22, 2019 5:28 PM > To: thuan.tran <mailto:thuan.t...@dektech.com.au>>; 'Minh Chau' mailto:minh.c...@dektech.com.au>>; hans.nordeb...@ericsson.com <mailto:hans.nordeb...@ericsson.com>; gary@dektech.com.au <mailto:gary@dektech.com.au> > Cc: opensaf-devel@lists.sourceforge.net <mailto:opensaf-devel@lists.sourceforge.net> > Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102] > > Hi Thuan, > > I have additional comments below. > > Regards, Vu > > On 10/22/19 7:14 AM, thuan.tran wrote: >> - When sending response message which Adest not exist (already down) >> current MDS try to wait for 1.5 seconds before conclude no route to >> send response message. >> >> - There are 2 scenarios may have: >> UP -> DOWN -> receive SNDRSP -> response timeout after
Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]
Hi Thuan, I wonder the patch would work in the same reproduced steps if the both adests have subscribed each other more than 2 services. The svc_cnt will be greater than 1 until it is the last service down event. I think that's why mds has the database @subtn_results, in which each item is an adest associated with a service id separately. The problem originally resides at the services code e.g ntf, imm... where the threads structure between mds receiving thread and main thread cause a race condition. Thus the service sends a message with a death adest which is removed from mds database, that confuses mds and hit 1.5 secs wait time. If I read the code correctly, the 1.5 wait time is for another case, it gives another chance to wait 1.5 when the subscription result is empty in @subtn_results because the service up has not arrived yet. mds subscribe > mds sends message A x mds wait 1.5 sec mds <--- service up mds sends message A > So the 1.5 sec time is for early phase of waiting service up. } else if (sub_info->tmr_flag != true) { if ((MDS_SENDTYPE_RSP == req->i_sendtype) || (MDS_SENDTYPE_RRSP == req->i_sendtype)) { time_wait = true; m_MDS_LOG_INFO( "MDS_SND_RCV:Disc queue: Subscr exists no timer running: Waiting for some time\n"); -> I think at this line, it means: the SUBSCRIPTION_TMR has timeout, and mds is sending RSP/RRSP which means mds should have received the *request* message (?), so mds wants to wait for another 1.5 second for service_up to create the subscription result in database. The problem in this ticket hit 1.5 because the service down has already come and mds removed the subscription result item, now the ntf, imm... sends msg with a death adest, and mds now it thinks it is waiting for a service up to come as at the early phase, so it waits. Both two scenarios can be distinguished themselves to avoid to wait 1.5 secs for the latter case I think. Thanks Minh On 22/10/19 9:50 pm, Tran Thuan wrote: Hi Vu, Thanks for additional comments. 
I reply your concerns inline below. Best Regards, ThuanTr -Original Message- From: Nguyen Minh Vu Sent: Tuesday, October 22, 2019 5:28 PM To: thuan.tran ; 'Minh Chau' ; hans.nordeb...@ericsson.com; gary@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102] Hi Thuan, I have additional comments below. Regards, Vu On 10/22/19 7:14 AM, thuan.tran wrote: - When sending response message which Adest not exist (already down) current MDS try to wait for 1.5 seconds before conclude no route to send response message. - There are 2 scenarios may have: UP -> DOWN -> receive SNDRSP -> response timeout after 1.5s UP -> receive SNDRSP -> DOWN -> response timeout after 1.5s - With this change, MDS will not waste for 1.5s which can cause trouble for higher layer services, e.g: ntf, imm, etc... --- src/mds/mds_c_api.c | 70 - src/mds/mds_c_sndrcv.c | 52 -- src/mds/mds_core.h | 25 +-- src/mds/mds_dt2c.h | 2 +- src/mds/mds_dt_common.c | 22 - 5 files changed, 162 insertions(+), 9 deletions(-) diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c index 132555b8e..5dd30c536 100644 --- a/src/mds/mds_c_api.c +++ b/src/mds/mds_c_api.c @@ -1900,6 +1900,32 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID svc_id, V_DEST_RL role, /*** Validation for SCOPE **/ + if (adest != m_MDS_GET_ADEST) { + MDS_ADEST_INFO *adest_info = + (MDS_ADEST_INFO *)ncs_patricia_tree_get( + _mds_mcm_cb->adest_list, + (uint8_t *)); + if (!adest_info) { + /* Add adest to adest list */ + adest_info = m_MMGR_ALLOC_ADEST_INFO; + memset(adest_info, 0, sizeof(MDS_ADEST_INFO)); + adest_info->adest = adest; + adest_info->node.key_info = + (uint8_t *)_info->adest; + adest_info->svc_cnt = 1; + adest_info->tmr_start = false; + ncs_patricia_tree_add( + _mds_mcm_cb->adest_list, + (NCS_PATRICIA_NODE *)adest_info); + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 + " svc_cnt=%u", adest, adest_info->svc_cnt); + } else { + 
adest_info->svc_cnt++; + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 + " svc_cnt=%u", adest, adest_info->svc_cnt); + } + } + status =
Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]
Hi Thuan, 1- Can you point out where is the mds code that waits for 1.5 seconds, is it hard coded within 1.5 secs? 2- Is existing db (mds_c_db.c) in mds not enough so we need to introduce adest_list? I think mds must have a memory of adest, perhaps in another implicit form, so mds can validate an adest, a svc_id associated with adest. thanks Minh On 22/10/19 11:14 am, thuan.tran wrote: - When sending response message which Adest not exist (already down) current MDS try to wait for 1.5 seconds before conclude no route to send response message. - There are 2 scenarios may have: UP -> DOWN -> receive SNDRSP -> response timeout after 1.5s UP -> receive SNDRSP -> DOWN -> response timeout after 1.5s - With this change, MDS will not waste for 1.5s which can cause trouble for higher layer services, e.g: ntf, imm, etc... --- src/mds/mds_c_api.c | 70 - src/mds/mds_c_sndrcv.c | 52 -- src/mds/mds_core.h | 25 +-- src/mds/mds_dt2c.h | 2 +- src/mds/mds_dt_common.c | 22 - 5 files changed, 162 insertions(+), 9 deletions(-) diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c index 132555b8e..5dd30c536 100644 --- a/src/mds/mds_c_api.c +++ b/src/mds/mds_c_api.c @@ -1900,6 +1900,32 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID svc_id, V_DEST_RL role, /*** Validation for SCOPE **/ + if (adest != m_MDS_GET_ADEST) { + MDS_ADEST_INFO *adest_info = + (MDS_ADEST_INFO *)ncs_patricia_tree_get( + _mds_mcm_cb->adest_list, + (uint8_t *)); + if (!adest_info) { + /* Add adest to adest list */ + adest_info = m_MMGR_ALLOC_ADEST_INFO; + memset(adest_info, 0, sizeof(MDS_ADEST_INFO)); + adest_info->adest = adest; + adest_info->node.key_info = + (uint8_t *)_info->adest; + adest_info->svc_cnt = 1; + adest_info->tmr_start = false; + ncs_patricia_tree_add( + _mds_mcm_cb->adest_list, + (NCS_PATRICIA_NODE *)adest_info); + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 + " svc_cnt=%u", adest, adest_info->svc_cnt); + } else { + adest_info->svc_cnt++; + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 + " 
svc_cnt=%u", adest, adest_info->svc_cnt); + } + } + status = mds_get_subtn_res_tbl_by_adest(local_svc_hdl, svc_id, vdest_id, adest, _subtn_result_info); @@ -3571,6 +3597,24 @@ uint32_t mds_mcm_svc_down(PW_ENV_ID pwe_id, MDS_SVC_ID svc_id, V_DEST_RL role, /* Discard : Getting down before getting up */ } else { /* Entry exist in subscription result table */ + /* If adest exist and no sndrsp, start a timer */ + MDS_ADEST_INFO *adest_info = + (MDS_ADEST_INFO *)ncs_patricia_tree_get( + _mds_mcm_cb->adest_list, + (uint8_t *)); + if (adest_info) { + adest_info->svc_cnt--; + if (adest_info->svc_cnt == 0 && + adest_info->sndrsp_cnt == 0) { + m_MDS_LOG_INFO("MCM:API: Adest=%" PRIu64 + " down timer start", adest); + if (adest_info->tmr_start == false) { + adest_info->tmr_start = true; + start_mds_down_tmr(adest, svc_id); + } + } + } + if (vdest_id == m_VDEST_ID_FOR_ADEST_ENTRY) { status = mds_subtn_res_tbl_del( local_svc_hdl, svc_id, vdest_id, adest, @@ -4956,6 +5000,17 @@ uint32_t mds_mcm_init(void) return NCSCC_RC_FAILURE; } + /* ADEST TREE */ + memset(_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS)); + pat_tree_params.key_size = sizeof(MDS_DEST); + if (NCSCC_RC_SUCCESS != + ncs_patricia_tree_init(_mds_mcm_cb->adest_list, + _tree_params)) { + m_MDS_LOG_ERR( + "MCM:API: patricia_tree_init: adest :failure, L mds_mcm_init"); + return NCSCC_RC_FAILURE; + } + /* SERVICE TREE */ memset(_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS)); pat_tree_params.key_size = sizeof(MDS_SVC_HDL); @@ -4966,7 +5021,12 @@ uint32_t mds_mcm_init(void) if
Re: [devel] [PATCH 1/1] mds: Disable mds flow control for mds broadcast/multicast message [#3101]
Hi Thuan, The patch is acked and I pushed it. The commit message may cause a misunderstanding, but in this context it does not mention anything regarding configuration, so I hope it's ok. Another comment inline. Thanks Minh On 21/10/19 1:56 pm, Tran Thuan wrote: Hi Minh, I suggest commit message as following mds: skip flow control for bcast/mcast message if tipc multicast enabled. because "disable mds flow control" cause misunderstood overwrite configure MDS_TIPC_FCTRL_ENABLED And another comment with [Thuan] inline. Thanks. Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Thursday, October 17, 2019 10:00 AM To: hans.nordeb...@ericsson.com; thuan.t...@dektech.com.au; gary@dektech.com.au; vu.m.ngu...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/1] mds: Disable mds flow control for mds broadcast/multicast message [#3101] The mds flow control has been disabled for broadcast/multicast unfragmented messages if tipc multicast is enabled. This patch revisits and continues with fragmented messages.
--- src/mds/mds_tipc_fctrl_intf.cc | 47 src/mds/mds_tipc_fctrl_msg.h | 11 +++--- src/mds/mds_tipc_fctrl_portid.cc | 47 ++-- src/mds/mds_tipc_fctrl_portid.h | 3 ++- 4 files changed, 69 insertions(+), 39 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index b803bfe..fe3dbd5 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -133,7 +133,7 @@ uint32_t process_flow_event(const Event& evt) { kChunkAckSize, sock_buf_size); portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, -evt.fseq_, evt.svc_id_); +evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); } else if (evt.type_ == Event::Type::kEvtRcvIntro) { portid = new TipcPortId(evt.id_, data_sock_fd, kChunkAckSize, sock_buf_size); @@ -147,7 +147,7 @@ uint32_t process_flow_event(const Event& evt) { } else { if (evt.type_ == Event::Type::kEvtRcvData) { rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, - evt.fseq_, evt.svc_id_); + evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled); } if (evt.type_ == Event::Type::kEvtRcvChunkAck) { portid->ReceiveChunkAck(evt.fseq_, evt.chunk_size_); @@ -430,6 +430,7 @@ uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer, uint16_t len, HeaderMessage header; header.Decode(buffer); + Event* pevt = nullptr; // if mds support flow control if ((header.pro_ver_ & MDS_PROT_VER_MASK) == MDS_PROT_FCTRL) { if (header.pro_id_ == MDS_PROT_FCTRL_ID) { @@ -438,9 +439,10 @@ uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer, uint16_t len, ChunkAck ack; ack.Decode(buffer); // send to the event thread -if (m_NCS_IPC_SEND(_events, -new Event(Event::Type::kEvtSendChunkAck, id, ack.svc_id_, -header.mseq_, header.mfrag_, ack.acked_fseq_, ack.chunk_size_), +pevt = new Event(Event::Type::kEvtSendChunkAck, id, ack.svc_id_, +header.mseq_, header.mfrag_, ack.acked_fseq_); +pevt->chunk_size_ = ack.chunk_size_; +if (m_NCS_IPC_SEND(_events, pevt, NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) 
{ m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", strerror(errno)); @@ -453,9 +455,9 @@ uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer, uint16_t len, DataMessage data; data.Decode(buffer); // send to the event thread - if (m_NCS_IPC_SEND(_events, - new Event(Event::Type::kEvtDropData, id, data.svc_id_, - header.mseq_, header.mfrag_, header.fseq_), + pevt = new Event(Event::Type::kEvtDropData, id, data.svc_id_, + header.mseq_, header.mfrag_, header.fseq_); + if (m_NCS_IPC_SEND(_events, pevt, NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", strerror(errno)); @@ -474,6 +476,7 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, HeaderMessage header; header.Decode(buffer); + Event* pevt = nullptr; // if mds support flow control if ((header.pro_ver_ & MDS_PROT_VER_MASK) == MDS_PROT_FCTRL) { if (header.pro_id_ == MDS_PROT_FCTRL_ID) { @@ -482,9 +485,10 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, ChunkAck ack; ack.Decode(buffer); // send to the event thread -if (m_NCS_IPC_SEND(_events, -new Event(Event::Type::kEvtRcvChunkAck, id, ack.svc_id_, -header.mseq_, header.mfrag_, ack.acked_fseq_, ack.chunk_size_), +pevt = new Event(Event::Type::kEvtRcvChunkAck,
Re: [devel] [PATCH 1/1] mds: Add Intro message [#3090]
Hi, The counters reset will be removed in ReceiveIntro(). Thanks Minh On 15/10/19 12:50 pm, Minh Chau wrote: mds relies on data message sent from the peer to determine whether the MDS_TIPC_FCTRL_ENABLED is set. The data message may not be sent right after TIPC_PUBLISHED event, which can cause the tx probation timer timeout. This patch add Intro message, which is sent right after the TIPC_PUBLISHED to help mds determine the flow control supported at the peer earlier. --- src/mds/mds_main.c | 2 +- src/mds/mds_tipc_fctrl_intf.cc | 27 ++ src/mds/mds_tipc_fctrl_msg.cc| 11 + src/mds/mds_tipc_fctrl_msg.h | 18 +++ src/mds/mds_tipc_fctrl_portid.cc | 49 ++-- src/mds/mds_tipc_fctrl_portid.h | 2 ++ 6 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c index 8c9b1f1..c7d2f7b 100644 --- a/src/mds/mds_main.c +++ b/src/mds/mds_main.c @@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req) if (tipc_mcast_enabled != false) tipc_mcast_enabled = true; -m_MDS_LOG_DBG( + m_MDS_LOG_NOTIFY( "MDS: TIPC_MCAST_ENABLED: %d Set argument \n", tipc_mcast_enabled); } diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 6271890..b803bfe 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -39,6 +39,7 @@ using mds::DataMessage; using mds::ChunkAck; using mds::HeaderMessage; using mds::Nack; +using mds::Intro; namespace { // flow control enabled/disabled @@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) { uint32_t rc = NCSCC_RC_SUCCESS; TipcPortId *portid = portid_lookup(evt.id_); if (portid == nullptr) { +// the null portid normally should not happen; however because the +// tipc_cb.Dsock and tipc_cb.BSRsock are separated; the data message +// sent from BSRsock may come before reception of TIPC_PUBLISHED if (evt.type_ == Event::Type::kEvtRcvData) { portid = new TipcPortId(evt.id_, data_sock_fd, kChunkAckSize, sock_buf_size); 
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, evt.fseq_, evt.svc_id_); +} else if (evt.type_ == Event::Type::kEvtRcvIntro) { + portid = new TipcPortId(evt.id_, data_sock_fd, + kChunkAckSize, sock_buf_size); + portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; + portid->ReceiveIntro(); } else { m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " "RcvEvt[evt:%d], Error[PortId not found]", @@ -151,6 +160,9 @@ uint32_t process_flow_event(const Event& evt) { portid->ReceiveNack(evt.mseq_, evt.mfrag_, evt.fseq_); } +if (evt.type_ == Event::Type::kEvtRcvIntro) { + portid->ReceiveIntro(); +} } return rc; } @@ -489,6 +501,16 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", strerror(errno)); } + } else if (header.msg_type_ == Intro::kIntroMsgType) { +// no need to decode intro message +// the decoding intro message type is done in header decoding +// send to the event thread +if (m_NCS_IPC_SEND(_events, +new Event(Event::Type::kEvtRcvIntro, id, 0, 0, 0, 0), +NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { + m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", + strerror(errno)); +} } else { m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " "[msg_type:%u], Error[not supported message type]", @@ -516,6 +538,11 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, portid_map_mutex.unlock(); return rc; } + } else { +m_MDS_LOG_DBG("FCTRL: [me] <-- [node:%x, ref:%u], " +"Receive non-flow-control data message, " +"header.pro_ver:%u", +id.node, id.ref, header.pro_ver_); } return NCSCC_RC_SUCCESS; } diff --git a/src/mds/mds_tipc_fctrl_msg.cc b/src/mds/mds_tipc_fctrl_msg.cc index 932120f..180dcb6 100644 --- a/src/mds/mds_tipc_fctrl_msg.cc +++ b/src/mds/mds_tipc_fctrl_msg.cc @@ -178,4 +178,15 @@ void Nack::Decode(uint8_t *msg) { nacked_fseq_ = ncs_decode_16bit(); } + +void Intro::Encode(uint8_t *msg) { + uint8_t 
*ptr; + // encode protocol identifier + ptr = [Intro::FieldIndex::kProtocolIdentifier]; + ncs_encode_32bit(, MDS_PROT_FCTRL_ID); + // encode message type + ptr = [Intro::FieldIndex::kFlowControlMessageType]; + ncs_encode_8bit(, kIntroMsgType); +} + } // end
Re: [devel] [PATCH 1/1] mds: Add Reset message [#3090]
Hi Thuan, I can rename it as "Intro" message, then the rcvwnd counter shall be removed. This new message can not replace the tx prob timer. This new message is to speed up the determinatin of flow control at the peer side rather than mds data message. It is needed for the flow control sender 'talk' with the non-flow-control receiver who will not send any ack back. THanks, Minh On 14/10/19 7:06 pm, Tran Thuan wrote: Hi bro.Minh, Thanks for explanation. I think the "reset" message should be rename to "introduce" message. Another question: with this fix, will tx probation timer become redundant or still useful in somehow? Best Regards, ThuanTr -----Original Message- From: Minh Hon Chau Sent: Monday, October 14, 2019 1:01 PM To: Tran Thuan ; hans.nordeb...@ericsson.com; gary@dektech.com.au; vu.m.ngu...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: Add Reset message [#3090] Hi Thuan, If the chunkack is configured to send after a few data messages, then the sender is not getting any chunkack for the first message from receiver until chunkack timeout (which is also configurable to be a bit larger value). Then, the probation timer would be timeout at sender. The rcvwnd.acked_ will be fixed. Thanks Minh On 14/10/19 4:39 pm, Tran Thuan wrote: Hi bro.Minh, - In my understanding, tx probation timer only start when sender send first message. Then sender relies on chunkAck to know receiver support MDS FCTRL or timeout as not support. But from what you describe, sender got tx probation timer timeout before sending first message? Or after sending first message but sender cannot get any chunkAck somehow? I am confused this point. Could you help explain? 
- About the code, mistake set '0' twice for .acked_ in TipcPortId::ReceiveReset() Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Friday, October 11, 2019 10:52 AM To: hans.nordeb...@ericsson.com; gary@dektech.com.au; vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/1] mds: Add Reset message [#3090] mds relies on data message sent from the peer to determine whether the MDS_TIPC_FCTRL_ENABLED is set. The data message may not be sent right after TIPC_PUBLISHED event, which can cause the tx probation timer timeout. This patch add Reset message, which is sent right after the TIPC_PUBLISHED to help mds determine the flow control supported at the peer earlier. --- src/mds/mds_main.c | 2 +- src/mds/mds_tipc_fctrl_intf.cc | 27 ++ src/mds/mds_tipc_fctrl_msg.cc| 11 + src/mds/mds_tipc_fctrl_msg.h | 18 +++ src/mds/mds_tipc_fctrl_portid.cc | 49 ++-- src/mds/mds_tipc_fctrl_portid.h | 2 ++ 6 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c index 8c9b1f1..c7d2f7b 100644 --- a/src/mds/mds_main.c +++ b/src/mds/mds_main.c @@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req) if (tipc_mcast_enabled != false) tipc_mcast_enabled = true; -m_MDS_LOG_DBG( + m_MDS_LOG_NOTIFY( "MDS: TIPC_MCAST_ENABLED: %d Set argument \n", tipc_mcast_enabled); } diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 6271890..e8c9121 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -39,6 +39,7 @@ using mds::DataMessage; using mds::ChunkAck; using mds::HeaderMessage; using mds::Nack; +using mds::Reset; namespace { // flow control enabled/disabled @@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) { uint32_t rc = NCSCC_RC_SUCCESS; TipcPortId *portid = portid_lookup(evt.id_); if (portid == nullptr) { +// the null portid normally should not happen; however because the +// tipc_cb.Dsock 
and tipc_cb.BSRsock are separated; the data message +// sent from BSRsock may come before reception of TIPC_PUBLISHED if (evt.type_ == Event::Type::kEvtRcvData) { portid = new TipcPortId(evt.id_, data_sock_fd, kChunkAckSize, sock_buf_size); portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, evt.fseq_, evt.svc_id_); +} else if (evt.type_ == Event::Type::kEvtRcvReset) { + portid = new TipcPortId(evt.id_, data_sock_fd, + kChunkAckSize, sock_buf_size); + portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; + portid->ReceiveReset(); } else { m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " "RcvEvt[evt:%d], Error
Re: [devel] [PATCH 1/1] mds: Add Reset message [#3090]
Hi Thuan, If the chunkack is configured to be sent after a few data messages, then the sender does not get any chunkack for the first message from the receiver until the chunkack timeout (which is also configurable to be a bit larger value). Then, the probation timer would time out at the sender. The rcvwnd.acked_ will be fixed. Thanks Minh On 14/10/19 4:39 pm, Tran Thuan wrote: Hi bro.Minh, - In my understanding, tx probation timer only start when sender send first message. Then sender relies on chunkAck to know receiver support MDS FCTRL or timeout as not support. But from what you describe, sender got tx probation timer timeout before sending first message? Or after sending first message but sender cannot get any chunkAck somehow? I am confused about this point. Could you help explain? - About the code, mistake set '0' twice for .acked_ in TipcPortId::ReceiveReset() Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Friday, October 11, 2019 10:52 AM To: hans.nordeb...@ericsson.com; gary@dektech.com.au; vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/1] mds: Add Reset message [#3090] mds relies on data message sent from the peer to determine whether the MDS_TIPC_FCTRL_ENABLED is set. The data message may not be sent right after TIPC_PUBLISHED event, which can cause the tx probation timer to time out. This patch adds a Reset message, which is sent right after the TIPC_PUBLISHED to help mds determine the flow control supported at the peer earlier.
--- src/mds/mds_main.c | 2 +- src/mds/mds_tipc_fctrl_intf.cc | 27 ++ src/mds/mds_tipc_fctrl_msg.cc| 11 + src/mds/mds_tipc_fctrl_msg.h | 18 +++ src/mds/mds_tipc_fctrl_portid.cc | 49 ++-- src/mds/mds_tipc_fctrl_portid.h | 2 ++ 6 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c index 8c9b1f1..c7d2f7b 100644 --- a/src/mds/mds_main.c +++ b/src/mds/mds_main.c @@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req) if (tipc_mcast_enabled != false) tipc_mcast_enabled = true; -m_MDS_LOG_DBG( + m_MDS_LOG_NOTIFY( "MDS: TIPC_MCAST_ENABLED: %d Set argument \n", tipc_mcast_enabled); } diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 6271890..e8c9121 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -39,6 +39,7 @@ using mds::DataMessage; using mds::ChunkAck; using mds::HeaderMessage; using mds::Nack; +using mds::Reset; namespace { // flow control enabled/disabled @@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) { uint32_t rc = NCSCC_RC_SUCCESS; TipcPortId *portid = portid_lookup(evt.id_); if (portid == nullptr) { +// the null portid normally should not happen; however because the +// tipc_cb.Dsock and tipc_cb.BSRsock are separated; the data message +// sent from BSRsock may come before reception of TIPC_PUBLISHED if (evt.type_ == Event::Type::kEvtRcvData) { portid = new TipcPortId(evt.id_, data_sock_fd, kChunkAckSize, sock_buf_size); portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; rc = portid->ReceiveData(evt.mseq_, evt.mfrag_, evt.fseq_, evt.svc_id_); +} else if (evt.type_ == Event::Type::kEvtRcvReset) { + portid = new TipcPortId(evt.id_, data_sock_fd, + kChunkAckSize, sock_buf_size); + portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid; + portid->ReceiveReset(); } else { m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], " "RcvEvt[evt:%d], Error[PortId not found]", @@ -151,6 +160,9 @@ uint32_t process_flow_event(const 
Event& evt) { portid->ReceiveNack(evt.mseq_, evt.mfrag_, evt.fseq_); } +if (evt.type_ == Event::Type::kEvtRcvReset) { + portid->ReceiveReset(); +} } return rc; } @@ -489,6 +501,16 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", strerror(errno)); } + } else if (header.msg_type_ == Reset::kResetMsgType) { +// no need to decode reset message +// the decoding reset message type is done in header decoding +// send to the event thread +if (m_NCS_IPC_SEND(_events, +new Event(Event::Type::kEvtRcvReset, id, 0, 0, 0, 0), +NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { + m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]", + strerror(errno)); +} } else {
Re: [devel] [PATCH 1/1] ntfd: Do not send response to client if client down [#3084]
Hi all, From what I gather from the ticket, it is a race condition between the mds thread and the main thread in ntfd. We normally get the NCSDOWN callback from mds and send an event to the main thread to remove the client. But here the mds callback comes in the middle of processing Initialize(). We have something similar done in ntfd with SearchAndSetClientsDownFlag(), GetClientDownFlag(), SetClientDownFlag(); can we try to reuse them? Thanks, Minh On 9/10/19 5:10 pm, Thien Minh Huynh wrote: Hi Vu, Thanks for taking the time to review the patch. Best Regards, ThienHuynh -Original Message- From: Nguyen Minh Vu Sent: Wednesday, October 9, 2019 11:15 AM To: thien.m.huynh ; thuan.t...@dektech.com.au; minh.c...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] ntfd: Do not send response to client if client down [#3084] Hi Thien, I have some comments below. I see that this enhancement does not bring much value to NTF, as it deals with a very rare case - a process is terminated before saNtfInitialize() returns. In reality, if the NTF server is getting overloaded by such a process, there must be an error in that process. @Minh: what is your opinion? Is this ticket valid? Anyway, here are my comments: 1) Only the C source files ntfs_mds.c & ntfs_evt.c access the newly added list `ntfa_down_list_head`; why put the newly added methods in the C++ file and add C wrapper functions for them? It would be cleaner to move these functions into new files, e.g. ntfs_client_down.{h,c}. 2) C++ method names should start with a capital letter (refer to the Google C++ coding rules) 3) The names of the methods that add a down client to the list and remove it from the list should form a matching pair of opposites, e.g. Open vs Close, Add vs Remove, not mark vs remove 4) The list is accessed from two different threads, the mds thread and the main thread, so a mutex must be used to prevent race conditions. 
5) Should have a check to ensure *not* adding the down client into the list if that client has successfully initialized. Regards, Vu On 10/9/19 9:36 AM, thien.m.huynh wrote: Ntfd will not send response to a client when client already down. This will avoid timeout when ntfd send via mds. --- src/ntf/ntfd/NtfAdmin.cc | 93 src/ntf/ntfd/NtfAdmin.h | 3 ++ src/ntf/ntfd/ntfs_cb.h | 6 src/ntf/ntfd/ntfs_com.h | 3 ++ src/ntf/ntfd/ntfs_evt.c | 1 + src/ntf/ntfd/ntfs_mds.c | 9 - 6 files changed, 114 insertions(+), 1 deletion(-) diff --git a/src/ntf/ntfd/NtfAdmin.cc b/src/ntf/ntfd/NtfAdmin.cc index 8bbee69..641171b 100644 --- a/src/ntf/ntfd/NtfAdmin.cc +++ b/src/ntf/ntfd/NtfAdmin.cc @@ -560,6 +560,85 @@ void NtfAdmin::SearchAndSetClientsDownFlag(MDS_DEST mds_dest) { } /** + * @brief Add mds_dest tag into ntfa down list + * @param mds_dest + */ +void NtfAdmin::markAgentDown(MDS_DEST mds_dest) { + TRACE_ENTER(); + NTFA_DOWN_LIST *ntfa_down_rec = NULL; + if ((ntfa_down_rec = reinterpret_cast( + malloc(sizeof(NTFA_DOWN_LIST == NULL) { +LOG_ER("memory allocation for the NTFA_DOWN_LIST failed"); +return; + } + memset(ntfa_down_rec, 0, sizeof(NTFA_DOWN_LIST)); + ntfa_down_rec->mds_dest = mds_dest; + ntfa_down_rec->next = NULL; + + if (ntfs_cb->ntfa_down_list_head == NULL) { +ntfs_cb->ntfa_down_list_head = ntfa_down_rec; + } else { +NTFA_DOWN_LIST *p = ntfs_cb->ntfa_down_list_head; +while (p->next != NULL) { + p = p->next; +} +p->next = ntfa_down_rec; + } + TRACE_1("Added MDS dest: %" PRIx64, ntfa_down_rec->mds_dest); + TRACE_LEAVE(); +} + +/** + * @brief Find and remove agent from ntfa down list + * @param mds_dest + */ +void NtfAdmin::removeAgentFromDownList(MDS_DEST mds_dest) { + NTFA_DOWN_LIST *ntfa_down_rec = ntfs_cb->ntfa_down_list_head; + NTFA_DOWN_LIST *prev = NULL; + TRACE_ENTER(); + while (ntfa_down_rec != NULL) { +if (mds_dest == ntfa_down_rec->mds_dest) { + if (ntfa_down_rec == ntfs_cb->ntfa_down_list_head) { +if (ntfa_down_rec->next == NULL) { + ntfs_cb->ntfa_down_list_head 
= NULL; +} else { + ntfs_cb->ntfa_down_list_head = ntfa_down_rec->next; +} + } else if (prev) { +prev->next = ntfa_down_rec->next; + } + TRACE("Deleted MDS dest: %" PRIx64, ntfa_down_rec->mds_dest); + free(ntfa_down_rec); + ntfa_down_rec = NULL; + break; +} +prev = ntfa_down_rec; +ntfa_down_rec = ntfa_down_rec->next; + } + TRACE_LEAVE(); +} + +/** + * @brief Check if agent exists in down list + * @param mds_dest + * @return true/false + */ +bool NtfAdmin::isInNtfaDownList(MDS_DEST mds_dest) { + bool found = false; + NTFA_DOWN_LIST *ntfa_down_rec = ntfs_cb->ntfa_down_list_head; + TRACE_ENTER(); + while (ntfa_down_rec != NULL) { +if (mds_dest == ntfa_down_rec->mds_dest) { + found = true; + break; +} +ntfa_down_rec = ntfa_down_rec->next; + } + TRACE_LEAVE(); +
Re: [devel] [PATCH 1/1] mds: Enhance decoding for mds flow control message [#3097]
Hi Thuan, Please see comments inline. Thanks Minh On 7/10/19 3:18 pm, Tran Thuan wrote: Hi Minh, Some minor comments from me, check [Thuan] inline. Thanks. Best Regards, ThuanTr -Original Message- From: Minh Chau Sent: Monday, October 7, 2019 7:12 AM To: hans.nordeb...@ericsson.com; vu.m.ngu...@dektech.com.au; gary@dektech.com.au; thuan.t...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau Subject: [PATCH 1/1] mds: Enhance decoding for mds flow control message [#3097] mds currently uses MDS_PROT_FCTRL_ID 4 bytes value (0x00AC13F5) from octet11 to octet14 to identify the flow control message e.g., chunkack message. In case of fragmentation from big message, the second fragment onwards will start from the octet11, which may have arbitrary value and cause mds to incorrectly decode as a flow control message if the fragment starts with value of 0x00AC13F5. This patch fixes this rare case by decoding flow control message only if the oct2-5 (mds global sequence number) and oct6-7 (mds fragment number) are non-zero. Change MDS_PROT_FCTRL_ID:0xFDAC13F5 [Thuan]: typo "non-zero" -> "zero"? [Minh]: Yes, typo, it's "zero" [Thuan] Can you give info in commit message about why change MDS_PROT_FCTRL_ID to FDAC13F5? 
[Minh]: It is only a random number used as an identifier, but 0x00AC would occupy oct11&12, which hold the mds header length, and may give a higher probability of an accidental match --- src/mds/mds_dt.h | 2 +- src/mds/mds_tipc_fctrl_msg.cc | 20 +--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index d9e8633..64da600 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -245,7 +245,7 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); /* MDS protocol/version for flow control */ #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) -#define MDS_PROT_FCTRL_ID 0x00AC13F5 +#define MDS_PROT_FCTRL_ID 0xFDAC13F5 /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) diff --git a/src/mds/mds_tipc_fctrl_msg.cc b/src/mds/mds_tipc_fctrl_msg.cc index 064d977..8375673 100644 --- a/src/mds/mds_tipc_fctrl_msg.cc +++ b/src/mds/mds_tipc_fctrl_msg.cc @@ -64,13 +64,19 @@ void HeaderMessage::Decode(uint8_t *msg) { // decode flow control sequence number ptr = [HeaderMessage::FieldIndex::kFlowControlSequenceNumber]; fseq_ = ncs_decode_16bit(); -// decode protocol identifier -ptr = [ChunkAck::FieldIndex::kProtocolIdentifier]; -pro_id_ = ncs_decode_32bit(); -if (pro_id_ == MDS_PROT_FCTRL_ID) { - // decode message type - ptr = [ChunkAck::FieldIndex::kFlowControlMessageType]; - msg_type_ = ncs_decode_8bit(); +// decode protocol identifier if the mfrag_ and mseq_ are 0 +// otherwise it is always DataMessage within non-zero mseq_ and mfrag_ +if (mfrag_ == 0 && mseq_ == 0) { + ptr = [ChunkAck::FieldIndex::kProtocolIdentifier]; + pro_id_ = ncs_decode_32bit(); + if (pro_id_ == MDS_PROT_FCTRL_ID) { +// decode message type +ptr = [ChunkAck::FieldIndex::kFlowControlMessageType]; +msg_type_ = ncs_decode_8bit(); + } +} else { + pro_id_ = 0; + msg_type_ = 0; [Thuan] Don't need ELSE as values 0 already? 
[Minh]: I think we should explicitly reset them, since the header variable might be reused for decoding } } else { if (mfrag_ != 0) { -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 0/2] Review Request for mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095] V2
Hi, I would like to push the patches today if no more comment for them. Thanks Minh On 4/10/19 3:20 pm, Minh Chau wrote: Summary: mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095] V2 Review request for Ticket(s): 3095 Peer Reviewer(s): Hans, Vu, Gary, Thuan Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3095 Base revision: 05064a1cfd0aeaf824dce7602d535654b3457e30 Personal repository: git://git.code.sf.net/u/minh-chau/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesn OpenSAF servicesn Core libraries y Samples n Tests n Other n Comments (indicate scope for each "y" above): - *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** revision cbbeab8f2299620aa3eb9b0e29710a2b159b5a45 Author: Minh Chau Date: Fri, 4 Oct 2019 12:59:27 +1000 mds: Improve error log for MDS_TIPC_FCTRL_ENABLED [#3095] This commit as part of #3095 updates the error string with pattern "FCTRL:*Error[*]", in order to help grep-ing the error in mds debug log. revision cc666586717fa82df70471748d8766e8fe901460 Author: Minh Chau Date: Fri, 4 Oct 2019 12:59:16 +1000 mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095] In the scenario of recovery from split-brain, where both active director services may suffer mds message loss due to lost-contact tipc link. If MDS_TIPC_FCTRL_ENABLED is set, the out-of-order message will be dropped, and there is no mechanism to trigger the retransmission from receiver side at this moment (the retransmission is only triggered from sender as result of TIPC_ERR_OVERLOAD). In reception of disordered message, the receiver can send not-acknowledgement to notify the sender for retransmission. Therefore, the sender can trigger retransmisison in the same way as receiving TIPC_ERR_OVERLOAD. 
This patch adds Nack message for retransmission of disordered message detected from receiver side, and adds a missing call to portid_map_mutex.unlock() in process_all_events(). Complete diffstat: -- src/mds/mds_c_api.c | 2 +- src/mds/mds_dt_common.c | 2 +- src/mds/mds_tipc_fctrl_intf.cc | 72 +--- src/mds/mds_tipc_fctrl_msg.cc| 35 ++- src/mds/mds_tipc_fctrl_msg.h | 22 src/mds/mds_tipc_fctrl_portid.cc | 42 --- src/mds/mds_tipc_fctrl_portid.h | 3 +- 7 files changed, 143 insertions(+), 35 deletions(-) Testing Commands: - *** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES *** Testing, Expected Results: -- *** PASTE COMMAND OUTPUTS / TEST RESULTS *** Conditions of Submission: - *** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC *** Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 n n powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. 
These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content into a single commit. ___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as
Re: [devel] [PATCH 1/1] dtm: close unused log streams [#2642]
Hi Vu, No problem, I hope users will get the meaning of "clean-up job" :). Thanks Minh On 1/10/19 8:46 pm, Nguyen Minh Vu wrote: Hi Minh, I put that note in the usage of max-idle option. See below: + "--max-idle=NUM Set the maximum number of idle time to NUM\n" + " minutes. If a stream has not been used for\n" + " NUM minutes, the stream will be closed.\n" + " The default value is zero (disable the\n" + " clean-up job)\n", Regards, Vu On 10/1/19 5:38 PM, Minh Hon Chau wrote: Hi Vu, Ok, then the value '0' needs to be written somewhere (README?) for this special purpose I guess, to avoid a confusion later on. Thanks Minh On 1/10/19 8:27 pm, Nguyen Minh Vu wrote: Hi Minh, Thanks for your comment. When passing zero to max-idle, the server will disable 'close unused log streams' functionality. It may be useful when user has previously set max-idle to a specific value, and want to disable it later. If the range starts from 1, there is no chance to disable it. Regards, Vu On 10/1/19 5:17 PM, Minh Hon Chau wrote: Hi Vu, ack for minor comment. The range of --max-idle, I think, should be starting from 1, as the log_server ignores the tv_sec=0. From user's perspective, if allowing max-idle=0, the meaning seems that the stream must be constantly writing traces, or the stream will be deleted. Thanks Minh On 24/9/19 12:57 pm, Vu Minh Nguyen wrote: Providing a new option '--max-idle' to configure the maximum idle time of logtrace streams. If a stream has not been used for such time, logtrace server will close the stream from its database. This patch also corrects wrong indentation in osaflog.cc file. 
--- src/dtm/Makefile | 2 +- src/dtm/common/osaflog_protocol.h | 2 + src/dtm/tools/Makefile | 18 src/dtm/tools/osaflog.cc | 132 ++ src/dtm/transport/log_server.cc | 57 - src/dtm/transport/log_server.h | 7 +- src/dtm/transport/transportd.conf | 6 ++ 7 files changed, 168 insertions(+), 56 deletions(-) create mode 100644 src/dtm/tools/Makefile diff --git a/src/dtm/Makefile b/src/dtm/Makefile index 533b0f273..fb0221075 100644 --- a/src/dtm/Makefile +++ b/src/dtm/Makefile @@ -15,7 +15,7 @@ # all: - $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd + $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd bin/osaflog check: $(MAKE) -C ../.. bin/transport_test diff --git a/src/dtm/common/osaflog_protocol.h b/src/dtm/common/osaflog_protocol.h index 61e9f6f39..d35e5f345 100644 --- a/src/dtm/common/osaflog_protocol.h +++ b/src/dtm/common/osaflog_protocol.h @@ -27,6 +27,8 @@ namespace Osaflog { static constexpr const char* kServerSocketPath = PKGLOCALSTATEDIR "/osaf_log.sock"; +static constexpr const uint64_t kOneDayInMinute = 24*60; + struct __attribute__((__packed__)) ClientAddressConstantPrefix { sa_family_t family = AF_UNIX; char abstract = '\0'; diff --git a/src/dtm/tools/Makefile b/src/dtm/tools/Makefile new file mode 100644 index 0..8c48b70a5 --- /dev/null +++ b/src/dtm/tools/Makefile @@ -0,0 +1,18 @@ +# -*- OpenSAF -*- +# +# (C) Copyright 2019 The OpenSAF Foundation +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed +# under the GNU Lesser General Public License Version 2.1, February 1999. +# The complete license can be accessed from the following location: +# http://opensource.org/licenses/lgpl-license.php +# See the Copying file included with the OpenSAF distribution for full +# licensing terms. +# +# Author(s): Ericsson AB +# + +all: + $(MAKE) -C ../../.. 
bin/osaflog diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc index 64be253e9..abbf0b164 100644 --- a/src/dtm/tools/osaflog.cc +++ b/src/dtm/tools/osaflog.cc @@ -47,6 +47,7 @@ namespace { void PrintUsage(const char* program_name); bool SendCommand(const std::string& command); bool MaxTraceFileSize(uint64_t max_file_size); +bool SetMaxIdleTime(uint64_t max_idle); bool NoOfBackupFiles(uint64_t number_of_backups); bool Flush(); base::UnixServerSocket* CreateSocket(); @@ -70,10 +71,12 @@ int main(int argc, char** argv) { {"print", no_argument, nullptr, 'p'}, {"delete", no_argument, nullptr, 'd'}, {"extract-trace", required_argument, 0, 'e'}, + {"max-idle", required_argument, 0, 'i'}, {0, 0, 0, 0}}; uint64_t max_file_size = 0;
Re: [devel] [PATCH 1/1] mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095]
Hi Vu, See comments below. Thanks Minh On 1/10/19 8:34 pm, Nguyen Minh Vu wrote: Hi Minh, Ack with minor comments. Thanks. Regards, Vu On 10/1/19 12:49 PM, Minh Chau wrote: In the scenario of recovery from split-brain, where both active director services may suffer mds message loss due to lost-contact tipc link. If MDS_TIPC_FCTRL_ENABLED is set, the out-of-order message will be dropped, and there is no mechanism to trigger the retransmission from receiver side at this moment (the retransmission is only triggered from sender as result of TIPC_ERR_OVERLOAD). In reception of disordered message, the receiver can send not-acknowledgement to notify the sender for retransmission. Therefore, the sender can trigger retransmisison in the same way as receiving TIPC_ERR_OVERLOAD. This patch adds Nack message for retransmission of disordered message detected from receiver side. --- src/mds/mds_c_api.c | 2 +- src/mds/mds_dt_common.c | 2 +- src/mds/mds_tipc_fctrl_intf.cc | 19 ++- src/mds/mds_tipc_fctrl_msg.cc | 33 + src/mds/mds_tipc_fctrl_msg.h | 22 ++ src/mds/mds_tipc_fctrl_portid.cc | 18 +- src/mds/mds_tipc_fctrl_portid.h | 1 + 7 files changed, 93 insertions(+), 4 deletions(-) diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c index c41c8dd..132555b 100644 --- a/src/mds/mds_c_api.c +++ b/src/mds/mds_c_api.c @@ -4196,7 +4196,7 @@ void mds_mcm_msg_loss(MDS_SVC_HDL local_svc_hdl, MDS_DEST rem_adest, /* Check whether the msg loss is enabled or not */ if (true != local_svc_info->i_msg_loss_indication) { - m_MDS_LOG_INFO(" MSG loss not enbaled mds_mcm_msg_loss\n"); + m_MDS_LOG_NOTIFY("MSG loss is not enabled mds_mcm_msg_loss\n"); return; } diff --git a/src/mds/mds_dt_common.c b/src/mds/mds_dt_common.c index 66652af..de13883 100644 --- a/src/mds/mds_dt_common.c +++ b/src/mds/mds_dt_common.c @@ -972,7 +972,7 @@ uint32_t mds_tmr_mailbox_processing(void) .vdest_id); break; case MDS_REASSEMBLY_TMR: - m_MDS_LOG_DBG( + m_MDS_LOG_ERR( "MDTM: Tmr Mailbox Processing:Reassemble Tmr 
Hdl=0x%08x", mbx_evt_info->info.tmr_info_hdl); mdtm_process_reassem_timer_event( diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 2366672..65f1849 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -38,6 +38,7 @@ using mds::Timer; using mds::DataMessage; using mds::ChunkAck; using mds::HeaderMessage; +using mds::Nack; namespace { // flow control enabled/disabled @@ -142,7 +143,8 @@ uint32_t process_flow_event(const Event& evt) { if (evt.type_ == Event::Type::kEvtSendChunkAck) { portid->SendChunkAck(evt.fseq_, evt.svc_id_, evt.chunk_size_); } - if (evt.type_ == Event::Type::kEvtDropData) { + if (evt.type_ == Event::Type::kEvtDropData || + evt.type_ == Event::Type::kEvtRcvNack) { portid->ReceiveNack(evt.mseq_, evt.mfrag_, evt.fseq_); } @@ -464,6 +466,21 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, // skip this data msg return NCSCC_RC_FAILURE; } + if (header.msg_type_ == Nack::kNackMsgType) { + // receive nack message + Nack nack; + nack.Decode(buffer); + // send to the event thread + if (m_NCS_IPC_SEND(_events, + new Event(Event::Type::kEvtRcvNack, id, nack.svc_id_, + header.mseq_, header.mfrag_, nack.nacked_fseq_), + NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { + m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events\n"); + } + // return NCSCC_RC_FAILURE, so the tipc receiving thread (legacy) will + // skip this data msg + return NCSCC_RC_FAILURE; + } } else { // receive data message DataMessage data; diff --git a/src/mds/mds_tipc_fctrl_msg.cc b/src/mds/mds_tipc_fctrl_msg.cc index 064d977..f85568c 100644 --- a/src/mds/mds_tipc_fctrl_msg.cc +++ b/src/mds/mds_tipc_fctrl_msg.cc @@ -139,4 +139,37 @@ void ChunkAck::Decode(uint8_t *msg) { chunk_size_ = ncs_decode_16bit(); } + +Nack::Nack(uint16_t svc_id, uint16_t fseq): + svc_id_(svc_id), nacked_fseq_(fseq) { + msg_type_ = kNackMsgType; +} + +void Nack::Encode(uint8_t *msg) { + uint8_t *ptr; + // encode protocol identifier + ptr = 
[Nack::FieldIndex::kProtocolIdentifier]; + ncs_encode_32bit(, MDS_PROT_FCTRL_ID); + // encode message type + ptr = [Nack::FieldIndex::kFlowControlMessageType]; + ncs_encode_8bit(, kNackMsgType); + // encode service id + ptr = [Nack::FieldIndex::kServiceId]; + ncs_encode_16bit(, svc_id_); + // encode flow control sequence number + ptr = [Nack::FieldIndex::kFlowControlSequenceNumber]; +
Re: [devel] [PATCH 1/1] dtm: close unused log streams [#2642]
Hi Vu, Ok, then the value '0' needs to be written somewhere (README?) for this special purpose I guess, to avoid a confusion later on. Thanks Minh On 1/10/19 8:27 pm, Nguyen Minh Vu wrote: Hi Minh, Thanks for your comment. When passing zero to max-idle, the server will disable 'close unused log streams' functionality. It may be useful when user has previously set max-idle to a specific value, and want to disable it later. If the range starts from 1, there is no chance to disable it. Regards, Vu On 10/1/19 5:17 PM, Minh Hon Chau wrote: Hi Vu, ack for minor comment. The range of --max-idle, I think, should be starting from 1, as the log_server ignores the tv_sec=0. From user's perspective, if allowing max-idle=0, the meaning seems that the stream must be constantly writing traces, or the stream will be deleted. Thanks Minh On 24/9/19 12:57 pm, Vu Minh Nguyen wrote: Providing a new option '--max-idle' to configure the maximum idle time of logtrace streams. If a stream has not been used for such time, logtrace server will close the stream from its database. This patch also corrects wrong indentation in osaflog.cc file. --- src/dtm/Makefile | 2 +- src/dtm/common/osaflog_protocol.h | 2 + src/dtm/tools/Makefile | 18 src/dtm/tools/osaflog.cc | 132 ++ src/dtm/transport/log_server.cc | 57 - src/dtm/transport/log_server.h | 7 +- src/dtm/transport/transportd.conf | 6 ++ 7 files changed, 168 insertions(+), 56 deletions(-) create mode 100644 src/dtm/tools/Makefile diff --git a/src/dtm/Makefile b/src/dtm/Makefile index 533b0f273..fb0221075 100644 --- a/src/dtm/Makefile +++ b/src/dtm/Makefile @@ -15,7 +15,7 @@ # all: - $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd + $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd bin/osaflog check: $(MAKE) -C ../.. 
bin/transport_test diff --git a/src/dtm/common/osaflog_protocol.h b/src/dtm/common/osaflog_protocol.h index 61e9f6f39..d35e5f345 100644 --- a/src/dtm/common/osaflog_protocol.h +++ b/src/dtm/common/osaflog_protocol.h @@ -27,6 +27,8 @@ namespace Osaflog { static constexpr const char* kServerSocketPath = PKGLOCALSTATEDIR "/osaf_log.sock"; +static constexpr const uint64_t kOneDayInMinute = 24*60; + struct __attribute__((__packed__)) ClientAddressConstantPrefix { sa_family_t family = AF_UNIX; char abstract = '\0'; diff --git a/src/dtm/tools/Makefile b/src/dtm/tools/Makefile new file mode 100644 index 0..8c48b70a5 --- /dev/null +++ b/src/dtm/tools/Makefile @@ -0,0 +1,18 @@ +# -*- OpenSAF -*- +# +# (C) Copyright 2019 The OpenSAF Foundation +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed +# under the GNU Lesser General Public License Version 2.1, February 1999. +# The complete license can be accessed from the following location: +# http://opensource.org/licenses/lgpl-license.php +# See the Copying file included with the OpenSAF distribution for full +# licensing terms. +# +# Author(s): Ericsson AB +# + +all: + $(MAKE) -C ../../.. 
bin/osaflog diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc index 64be253e9..abbf0b164 100644 --- a/src/dtm/tools/osaflog.cc +++ b/src/dtm/tools/osaflog.cc @@ -47,6 +47,7 @@ namespace { void PrintUsage(const char* program_name); bool SendCommand(const std::string& command); bool MaxTraceFileSize(uint64_t max_file_size); +bool SetMaxIdleTime(uint64_t max_idle); bool NoOfBackupFiles(uint64_t number_of_backups); bool Flush(); base::UnixServerSocket* CreateSocket(); @@ -70,10 +71,12 @@ int main(int argc, char** argv) { {"print", no_argument, nullptr, 'p'}, {"delete", no_argument, nullptr, 'd'}, {"extract-trace", required_argument, 0, 'e'}, + {"max-idle", required_argument, 0, 'i'}, {0, 0, 0, 0}}; uint64_t max_file_size = 0; uint64_t max_backups = 0; + uint64_t max_idle = 0; int option = 0; int long_index = 0; @@ -82,71 +85,81 @@ int main(int argc, char** argv) { bool delete_result = true; bool max_file_size_result = true; bool number_of_backups_result = true; + bool max_idle_result = true; bool flush_set = false; bool pretty_print_set = false; bool delete_set = false; bool max_file_size_set = false; bool max_backups_set = false; + bool max_idle_set = false; bool thread_trace = false; std::string input_core = ""; std::string output_trace = ""; if (argc == 1) { - PrintUsage(argv[0]); - exit(EXIT_FAILURE); +
Re: [devel] [PATCH 1/1] mds: optimize mdstest suite 27 [#3087]
Hi Thuan, ack from me. Thanks Minh On 25/9/19 2:05 pm, thuan.tran wrote: - Just allocate a small buffer instead of huge buffer --- src/mds/apitest/mdstipc_api.c | 119 +++--- 1 file changed, 53 insertions(+), 66 deletions(-) diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c index 805728464..f667d7385 100644 --- a/src/mds/apitest/mdstipc_api.c +++ b/src/mds/apitest/mdstipc_api.c @@ -13105,10 +13105,14 @@ void tet_create_default_PWE_VDEST_tp() test_validate(FAIL, 0); } -void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) +void tet_sender(uint32_t msg_count, uint32_t msg_size) { int live = 100; // sender live max 100s TET_MDS_MSG *mesg; + if (msg_size > TET_MSG_SIZE_MIN) { + printf("\nSender: msg_size > TET_MSG_SIZE_MIN\n"); + exit(1); + } mesg = (TET_MDS_MSG *)malloc(sizeof(TET_MDS_MSG)); memset(mesg, 0, sizeof(TET_MDS_MSG)); @@ -13134,7 +13138,7 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) exit(1); } - while(!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) { + while (!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) { printf("\nSender is waiting for receiver UP\n"); sleep(1); } @@ -13147,11 +13151,11 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) // otherwise, receiver won't detect loss message sleep(1); - uint32_t offset = 0; - uint32_t msg_len = buff_len / msg_count; - for (int i = 1; i <= msg_count; i++) { - memcpy(mesg->send_data, _buff[offset], msg_len); - mesg->send_len = msg_len; + for (uint32_t i = 1; i <= msg_count; i++) { + /* to verify received correct order */ + memset(mesg->send_data, 'X', msg_size); + sprintf(mesg->send_data, "%u", i); + mesg->send_len = msg_size; if (mds_just_send(gl_tet_adest.mds_pwe1_hdl, NCSMDS_SVC_ID_INTERNAL_MIN, NCSMDS_SVC_ID_EXTERNAL_MIN, @@ -13163,23 +13167,25 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) } else { printf("\nSender SENT message %d successfully\n", i); } - offset += msg_len; } free(mesg); - 
while(live-- > 0) { + while (live-- > 0) { // Keep sender alive for retransmission sleep(1); } } -bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) +bool tet_receiver(uint32_t msg_count, uint32_t msg_size) { - int ret = 1; + if (msg_size > TET_MSG_SIZE_MIN) { + printf("\nReceiver: msg_size > TET_MSG_SIZE_MIN\n"); + return 1; + } printf("\nStarted Receiver (pid:%d) svc_id=%d\n", (int)getpid(), NCSMDS_SVC_ID_EXTERNAL_MIN); if (adest_get_handle() != NCSCC_RC_SUCCESS) { printf("\nReceiver FAIL to get adest handle\n"); - return ret; + return 1; } sleep(1); //Let sender subscribe before receiver install @@ -13197,14 +13203,13 @@ bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) exit(1); } - char *received_buff = malloc(buff_len); - memset(received_buff, 0, buff_len); - uint32_t offset = 0; struct pollfd sel; - int counter = 0; + uint32_t counter = 0; + char *expected_buff = malloc(msg_size); + memset(expected_buff, 'X', msg_size); sel.fd = m_GET_FD_FROM_SEL_OBJ(gl_tet_adest.svc[0].sel_obj); sel.events = POLLIN; - while(counter < msg_count) { + while (counter < msg_count) { int ret = osaf_poll(, 1, 1); if (ret > 0) { gl_rcvdmsginfo.msg = NULL; @@ -13214,11 +13219,18 @@ bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) printf("\nReceiver FAIL to retrieve message\n"); break; } - TET_MDS_MSG *msg = (TET_MDS_MSG*)gl_rcvdmsginfo.msg; + TET_MDS_MSG *msg = (TET_MDS_MSG *)gl_rcvdmsginfo.msg; if (msg != NULL) { - memcpy(_buff[offset],msg->recvd_data, msg->recvd_len); - offset += msg->recvd_len; counter++; + sprintf(expected_buff, "%u", counter); + if (memcmp(msg->recvd_data, + expected_buff, msg_size) != 0) { + printf("\nReceived incorrect message\n"); +
Re: [devel] [PATCH 1/1] mds: optimize mdstest suite 27 [#3087]
Hi Thuan, Some comments: - a few warnings for >80 chars line - Need to free(msg) that is returned from each MDS callback - Another minor comment below Thanks Minh On 24/9/19 1:10 pm, thuan.tran wrote: - Just allocate a small buffer instead of huge buffer --- src/mds/apitest/mdstipc_api.c | 116 +++--- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c index 805728464..33e7d6c12 100644 --- a/src/mds/apitest/mdstipc_api.c +++ b/src/mds/apitest/mdstipc_api.c @@ -13105,10 +13105,14 @@ void tet_create_default_PWE_VDEST_tp() test_validate(FAIL, 0); } -void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) +void tet_sender(uint32_t msg_count, uint32_t msg_size) { int live = 100; // sender live max 100s TET_MDS_MSG *mesg; + if (msg_size > TET_MSG_SIZE_MIN) { + printf("\nSender: msg_size cannot bigger than TET_MSG_SIZE_MIN\n"); + exit(1); + } mesg = (TET_MDS_MSG *)malloc(sizeof(TET_MDS_MSG)); memset(mesg, 0, sizeof(TET_MDS_MSG)); @@ -13134,7 +13138,7 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) exit(1); } - while(!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) { + while (!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) { printf("\nSender is waiting for receiver UP\n"); sleep(1); } @@ -13147,11 +13151,11 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count) // otherwise, receiver won't detect loss message sleep(1); - uint32_t offset = 0; - uint32_t msg_len = buff_len / msg_count; - for (int i = 1; i <= msg_count; i++) { - memcpy(mesg->send_data, _buff[offset], msg_len); - mesg->send_len = msg_len; + for (uint32_t i = 1; i <= msg_count; i++) { + /* to verify received correct order */ + memset(mesg->send_data, 'X', msg_size); + sprintf(mesg->send_data, "%u", i); + mesg->send_len = msg_size; if (mds_just_send(gl_tet_adest.mds_pwe1_hdl, NCSMDS_SVC_ID_INTERNAL_MIN, NCSMDS_SVC_ID_EXTERNAL_MIN, @@ -13163,23 +13167,25 @@ void tet_sender(char *send_buff, 
uint32_t buff_len, int msg_count) } else { printf("\nSender SENT message %d successfully\n", i); } - offset += msg_len; } free(mesg); - while(live-- > 0) { + while (live-- > 0) { // Keep sender alive for retransmission sleep(1); } } -bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) +bool tet_receiver(uint32_t msg_count, uint32_t msg_size) { - int ret = 1; + if (msg_size > TET_MSG_SIZE_MIN) { + printf("\nReceiver: msg_size cannot bigger than TET_MSG_SIZE_MIN\n"); + return 1; + } printf("\nStarted Receiver (pid:%d) svc_id=%d\n", (int)getpid(), NCSMDS_SVC_ID_EXTERNAL_MIN); if (adest_get_handle() != NCSCC_RC_SUCCESS) { printf("\nReceiver FAIL to get adest handle\n"); - return ret; + return 1; } sleep(1); //Let sender subscribe before receiver install @@ -13197,14 +13203,12 @@ bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) exit(1); } - char *received_buff = malloc(buff_len); - memset(received_buff, 0, buff_len); - uint32_t offset = 0; + char *expected_buff = malloc(msg_size); struct pollfd sel; - int counter = 0; + uint32_t counter = 0; sel.fd = m_GET_FD_FROM_SEL_OBJ(gl_tet_adest.svc[0].sel_obj); sel.events = POLLIN; - while(counter < msg_count) { + while (counter < msg_count) { int ret = osaf_poll(, 1, 1); if (ret > 0) { gl_rcvdmsginfo.msg = NULL; @@ -13214,11 +13218,23 @@ bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count) printf("\nReceiver FAIL to retrieve message\n"); break; } - TET_MDS_MSG *msg = (TET_MDS_MSG*)gl_rcvdmsginfo.msg; + TET_MDS_MSG *msg = (TET_MDS_MSG *)gl_rcvdmsginfo.msg; if (msg != NULL) { - memcpy(_buff[offset],msg->recvd_data, msg->recvd_len); - offset += msg->recvd_len; counter++; + memset(expected_buff, 'X', msg_size); [M] I think you can move the above memset(expected_buff,...) before the while (counter,...) loop, since it constantly
Re: [devel] [PATCH 0/9] Review Request for mds: Add solution for TIPC buffer overflow [#1960]
Hi all, Below is patch #10, which addresses most of the comments; it applies on top of the current patch #9. This patch #10 does not use shared_ptr and base::Mutex as suggested in the comments from Gary and Vu; the reason is that doing so would cause a problem similar to the one reported in #2860 (the user calls exit() without properly doing mds shutdown), unless those variables are allocated on the heap. I would like to push the #1960 patches today if we don't have any more comments. There are some other incremental improvements/fixes that will be addressed in other tickets. Thanks Minh --- src/mds/README | 2 +- src/mds/mds_dt_tipc.c | 28 - src/mds/mds_tipc_fctrl_intf.cc | 67 ++-- src/mds/mds_tipc_fctrl_intf.h | 2 +- src/mds/mds_tipc_fctrl_msg.cc | 44 +- src/mds/mds_tipc_fctrl_msg.h | 22 +++-- src/mds/mds_tipc_fctrl_portid.cc | 46 --- 7 files changed, 137 insertions(+), 74 deletions(-) diff --git a/src/mds/README b/src/mds/README index 1b94632..0819bdc 100644 --- a/src/mds/README +++ b/src/mds/README @@ -182,7 +182,7 @@ TIPC portid state machine and its transition kDisabled, // no flow control support at this state kStartup, // a newly published portid starts at this state -kTxProb, // txprob timer is running to confirm if the flow control is supported +kTxProb, // tx probation timer is running to confirm if the flow control is supported kEnabled // flow control support is confirmed, data flow is controlled kRcvBuffOverflow // anticipating (or experienced) the receiver's buffer overflow diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 1b6c3f8..e7a7b48 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -247,6 +247,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) if (!get_tipc_port_id(tipc_cb.BSRsock, &port_id)) { close(tipc_cb.Dsock); close(tipc_cb.BSRsock); + *mds_tipc_ref = 0; return NCSCC_RC_FAILURE; } *mds_tipc_ref = port_id.ref; @@ -330,7 +331,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) } /* Get tipc socket receive buffer size */ 
- int optval; + int optval = 0; socklen_t optlen = sizeof(optval); if (getsockopt(tipc_cb.BSRsock, SOL_SOCKET, SO_RCVBUF, , ) != 0) { @@ -350,12 +351,25 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) int acksize = -1; if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) { ackto = atoi(ptr); + if (ackto == 0) { + syslog(LOG_ERR, "MDTM:TIPC Invalid " + "MDS_TIPC_FCTRL_ACKTIMEOUT, using default value"); + ackto = -1; + } } if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) { acksize = atoi(ptr); + if (acksize == 0) { + syslog(LOG_ERR, "MDTM:TIPC Invalid " + "MDS_TIPC_FCTRL_ACKSIZE, using default value"); + acksize = -1; + } } - mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, (uint64_t)optval, + mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval, ackto, acksize, tipc_mcast_enabled); + } else { + syslog(LOG_ERR, "MDTM:TIPC Invalid value of" + "MDS_TIPC_FCTRL_ENABLED"); } } @@ -366,6 +380,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) close(tipc_cb.Dsock); close(tipc_cb.BSRsock); m_NCS_IPC_RELEASE(_cb.tmr_mbx, NULL); + mds_tipc_fctrl_shutdown(); return NCSCC_RC_FAILURE; } @@ -2528,7 +2543,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) */ uint32_t status = 0; uint32_t sum_mds_hdr_plus_mdtm_hdr_plus_len; - uint16_t fctrl_seq_num = 0; + uint16_t fctrl_seq_num = 0; int version = req->msg_arch_word & 0x7; if (version > 1) { sum_mds_hdr_plus_mdtm_hdr_plus_len = @@ -2618,7 +2633,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) return NCSCC_RC_FAILURE; } /* if sndqueue is capable, then obtain the current sending seq */ - if (mds_tipc_fctrl_sndqueue_capable(tipc_id, len, _seq_num) + if (mds_tipc_fctrl_sndqueue_capable(tipc_id, _seq_num) == NCSCC_RC_FAILURE){ m_MDS_LOG_ERR("FCTRL: Failed to send message len :%d", len); return NCSCC_RC_FAILURE; @@ -2717,10 +2732,10 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req) } /* if sndqueue is capable, then obtain the current sending seq */ if 
(mds_tipc_fctrl_sndqueue_capable(tipc_id, - len +
Re: [devel] [PATCH 1/1] amf: handle errors identified by codechecker [#3077]
Hi Gary, ack from me (code review only) Thanks Minh On 3/9/19 12:12 pm, Gary Lee wrote: add assertions where pointers should not be null fix a couple of typos --- src/amf/amfd/comp.cc | 1 + src/amf/amfd/csi.cc| 3 ++- src/amf/amfd/cstype.cc | 2 ++ src/amf/amfd/hlt.cc| 1 + src/amf/amfd/nodeswbundle.cc | 2 +- src/amf/amfd/ntf.cc| 1 + src/amf/amfd/sg_npm_fsm.cc | 34 +++--- src/amf/amfd/sg_nway_fsm.cc| 2 +- src/amf/amfd/sgproc.cc | 1 + src/amf/amfd/su.cc | 1 + src/amf/amfd/sutype.cc | 3 ++- src/amf/amfd/svctype.cc| 1 + src/amf/amfd/svctypecstypes.cc | 1 + src/amf/amfnd/cbq.cc | 2 ++ src/amf/amfnd/clc.cc | 1 + src/amf/amfnd/comp.cc | 4 src/amf/amfnd/compdb.cc| 2 +- src/amf/amfnd/susm.cc | 11 +++ 18 files changed, 53 insertions(+), 20 deletions(-) diff --git a/src/amf/amfd/comp.cc b/src/amf/amfd/comp.cc index 0ff365e..5c6a283 100644 --- a/src/amf/amfd/comp.cc +++ b/src/amf/amfd/comp.cc @@ -2117,6 +2117,7 @@ static void comp_ccb_apply_modify_hdlr(struct CcbUtilOperationData *opdata) { attribute->attrValuesNumber); if (!strcmp(attribute->attrName, "saAmfCompType")) { + osafassert(value != nullptr); SaNameT *dn = (SaNameT *)value; const std::string oldType(comp->saAmfCompType); if (oldType.compare(Amf::to_string(dn)) == 0) { diff --git a/src/amf/amfd/csi.cc b/src/amf/amfd/csi.cc index f7e3730..1856610 100644 --- a/src/amf/amfd/csi.cc +++ b/src/amf/amfd/csi.cc @@ -913,7 +913,8 @@ static void ccb_apply_delete_hdlr(CcbUtilOperationData_t *opdata) { goto done; } - TRACE("'%s'", csi ? csi->name.c_str() : nullptr); + osafassert(csi != nullptr); + TRACE("'%s'", csi->name.c_str()); /* Check whether si has been assigned to any SU. 
*/ if ((nullptr != csi->si->list_of_sisu) && (csi->compcsi_cnt != 0)) { diff --git a/src/amf/amfd/cstype.cc b/src/amf/amfd/cstype.cc index cadc6df..683d3cd 100644 --- a/src/amf/amfd/cstype.cc +++ b/src/amf/amfd/cstype.cc @@ -62,6 +62,7 @@ static AVD_CS_TYPE *cstype_create(const std::string , * @param cst */ static void cstype_delete(AVD_CS_TYPE *cst) { + osafassert(cst != nullptr); cstype_db->erase(cst->name); cst->saAmfCSAttrName.clear(); delete cst; @@ -205,6 +206,7 @@ static SaAisErrorT cstype_ccb_completed_hdlr(CcbUtilOperationData_t *opdata) { opdata->userData = nullptr; break; } + osafassert(cst != nullptr); if (cst->list_of_csi != nullptr) { /* check whether there exists a delete operation for * each of the CSI in the cs_type list in the current CCB diff --git a/src/amf/amfd/hlt.cc b/src/amf/amfd/hlt.cc index 27863db..4c2737e 100644 --- a/src/amf/amfd/hlt.cc +++ b/src/amf/amfd/hlt.cc @@ -75,6 +75,7 @@ static SaAisErrorT ccb_completed_delete_hdlr(CcbUtilOperationData_t *opdata) { opdata->userData = nullptr; goto done; } + osafassert(comp != nullptr); for (curr_susi = comp->su->list_of_susi; curr_susi != nullptr; curr_susi = curr_susi->su_next) for (compcsi = curr_susi->list_of_csicomp; compcsi; diff --git a/src/amf/amfd/nodeswbundle.cc b/src/amf/amfd/nodeswbundle.cc index 4ab79f7..cf280cb 100644 --- a/src/amf/amfd/nodeswbundle.cc +++ b/src/amf/amfd/nodeswbundle.cc @@ -125,7 +125,7 @@ static int is_swbdl_delete_ok(const std::string _dn, if (node == nullptr && avd_cb->is_active() == false) { return 1; } - + osafassert(node != nullptr); if (!is_swbdl_delete_ok_for_node(bundle_dn, node_dn, node->list_of_ncs_su, opdata)) return 0; diff --git a/src/amf/amfd/ntf.cc b/src/amf/amfd/ntf.cc index eb2654a..52ee745 100644 --- a/src/amf/amfd/ntf.cc +++ b/src/amf/amfd/ntf.cc @@ -505,6 +505,7 @@ SaAisErrorT avd_try_send_notification(NtfSend* job) { >notification.alarmNotification.notificationHandle; } + osafassert(notificationHandle != nullptr); // Try to send the 
notification if not sent. if (job->already_sent == false) { rc = saNtfNotificationSend(*notificationHandle); diff --git a/src/amf/amfd/sg_npm_fsm.cc b/src/amf/amfd/sg_npm_fsm.cc index 0ef094d..0e91eb5 100644 --- a/src/amf/amfd/sg_npm_fsm.cc +++ b/src/amf/amfd/sg_npm_fsm.cc @@ -2773,23 +2773,26 @@ static uint32_t avd_sg_npm_susi_sucss_si_oper(AVD_CL_CB *cb, AVD_SU *su, * modify standby all to the Quiesced SU. Remove the SI from * admin pointer and add the quiesced SU to the SU oper list. */ - if (su->sg_of_su->admin_si->list_of_sisu == i_susi) { -o_susi = i_susi->si_next; - } else { -o_susi = su->sg_of_su->admin_si->list_of_sisu; - } + i_susi = avd_su_susi_find(cb, su, su->sg_of_su->admin_si->name); + if
Re: [devel] [PATCH 6/9] mds: Implement kRcvBuffOverflow state [#1960]
Hi Vu, Agree with your comments. Any comments for patches 8/9 and 9/9? thanks Minh On 16/9/19 5:22 pm, Nguyen Minh Vu wrote: Hi Minh, I has few comments below. Regards, Vu On 8/14/19 1:38 PM, Minh Chau wrote: This patch implements the kRcvBuffOverflow state machine as described in README file. --- src/mds/mds_tipc_fctrl_intf.cc | 6 +- src/mds/mds_tipc_fctrl_msg.h | 1 + src/mds/mds_tipc_fctrl_portid.cc | 137 ++- src/mds/mds_tipc_fctrl_portid.h | 5 +- 4 files changed, 131 insertions(+), 18 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index c2d0922..397114e 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -285,14 +285,16 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t *buffer, uint16_t len, rc = NCSCC_RC_FAILURE; } else { if (portid->state_ != TipcPortId::State::kDisabled) { - portid->Queue(buffer, len); + bool sendable = portid->ReceiveCapable(len); + portid->Queue(buffer, len, sendable); // start txprob timer for the first msg sent out // do not start for other states - if (portid->state_ == TipcPortId::State::kStartup) { + if (sendable && portid->state_ == TipcPortId::State::kStartup) { txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk); m_MDS_LOG_DBG("FCTRL: Start txprob"); portid->state_ = TipcPortId::State::kTxProb; } + if (sendable == false) rc = NCSCC_RC_FAILURE; } } diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h index 69f8048..e6b9662 100644 --- a/src/mds/mds_tipc_fctrl_msg.h +++ b/src/mds/mds_tipc_fctrl_msg.h @@ -110,6 +110,7 @@ class DataMessage: public BaseMessage { uint8_t* msg_data_{nullptr}; uint8_t snd_type_{0}; + bool is_sent_{true}; DataMessage() {} virtual ~DataMessage(); void Decode(uint8_t *msg) override; diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 84ecee9..e762290 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -82,6 +82,23 @@ uint64_t MessageQueue::Erase(uint16_t 
fseq_from, uint16_t fseq_to) { return msg_len; } +DataMessage* MessageQueue::FirstUnsent() { + for (auto it = queue_.begin(); it != queue_.end(); ++it) { [Vu] Use the shorter version `for (const auto& it : queue_) + DataMessage *m = *it; + if (m->is_sent_ == false) { + return m; + } + } + return nullptr; +} + +void MessageQueue::MarkUnsentFrom(uint16_t fseq) { + for (auto it = queue_.begin(); it != queue_.end(); ++it) { [Vu] as above comment + DataMessage *m = *it; + if (m->header_.fseq_ >= fseq) m->is_sent_ = false; + } +} + void MessageQueue::Clear() { while (queue_.empty() == false) { DataMessage* msg = queue_.front(); @@ -99,7 +116,8 @@ TipcPortId::TipcPortId(struct tipc_portid id, int sock, uint16_t chksize, TipcPortId::~TipcPortId() { // Fake a TmrChunkAck event to ack all received messages ReceiveTmrChunkAck(); - // clear all msg in sndqueue_ + // flush all unsent msg in sndqueue_ + FlushData(); sndqueue_.Clear(); [Vu] If sndqueue_.Clear() must be called every time calling `FlushData`, should move `Clear()` into FlushData() ? 
} @@ -109,6 +127,24 @@ uint64_t TipcPortId::GetUniqueId(struct tipc_portid id) { return uid; } +void TipcPortId::FlushData() { + DataMessage* msg = nullptr; + do { + // find the lowest sequence unsent yet + msg = sndqueue_.FirstUnsent(); + if (msg != nullptr) { + Send(msg->msg_data_, msg->header_.msg_len_); + msg->is_sent_ = true; + m_MDS_LOG_DBG("FCTRL: [me] --> [node:%x, ref:%u], " + "FlushData[mseq:%u, mfrag:%u, fseq:%u], " + "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", + id_.node, id_.ref, + msg->header_.mseq_, msg->header_.mfrag_, msg->header_.fseq_, + sndwnd_.acked_, sndwnd_.send_, sndwnd_.nacked_space_); + } + } while (msg != nullptr); +} + uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) { struct sockaddr_tipc server_addr; ssize_t send_len = 0; @@ -130,29 +166,49 @@ uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) { return rc; } -uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length) { +uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length, + bool is_sent) { uint32_t rc = NCSCC_RC_SUCCESS; DataMessage *msg = new DataMessage; msg->header_.Decode(const_cast(data)); msg->Decode(const_cast(data)); msg->msg_data_ = new uint8_t[length]; + msg->is_sent_ = is_sent; memcpy(msg->msg_data_, data, length); sndqueue_.Queue(msg); - ++sndwnd_.send_; - sndwnd_.nacked_space_ += length; - m_MDS_LOG_DBG("FCTRL: [me] --> [node:%x, ref:%u], " - "SndData[mseq:%u, mfrag:%u, fseq:%u, len:%u], " - "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]", -
Re: [devel] [PATCH 1/9] mds: Add README for solution of TIPC buffer overflow at MDS [#1960]
Hi Vu, Thanks for taking the time to review the patches; the question is interesting. At the moment, with normal traffic load, the resource towards the new standby (old active) is not released and will be reused if the standby switches back to active. The reason is that mds won't start the "tx probation" again to confirm flow control support, as mds already knows it has enabled flow control on this port id. The messages towards the new active are sent on another port id, thus they are running on a different flow control counter. The test of multiple switchovers looks ok so far. However, a problem may occur with overloaded traffic during a failover/switchover (I haven't tested this case). The pending messages that were to be sent to the old active under the overload state won't be sent to the new active. I guess the mds user would get TIMEOUT and try again to send the message to the new active, which at least corresponds to the legacy behavior. However, this could be looked at as an improvement: as we have the pending messages and we know the new active, we can send the pending messages to the new active. But another question is whether the existing users expect to receive these pending messages according to their current logic. Regards, Minh On 16/9/19 5:34 pm, Nguyen Minh Vu wrote: Hi Minh, I have just finished my review of your MDS patches, and I have a question: With 2N services, suppose the active is having a TIPC overload issue; it will do some memory allocations, and probably start a timer there too. Then, what happens if that active service is changed to the standby role? Shall the allocated memory/timer be freed up, and is there any impact on the subsequent messages sent to the new active? 
Regards, Vu On 8/14/19 1:38 PM, Minh Chau wrote: --- src/mds/README | 221 + 1 file changed, 221 insertions(+) create mode 100644 src/mds/README diff --git a/src/mds/README b/src/mds/README new file mode 100644 index 000..1b94632 --- /dev/null +++ b/src/mds/README @@ -0,0 +1,221 @@ +/* -*- OpenSAF -*- + * + * (C) Copyright 2019 The OpenSAF Foundation + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed + * under the GNU Lesser General Public License Version 2.1, February 1999. + * The complete license can be accessed from the following location: + * http://opensource.org/licenses/lgpl-license.php + * See the Copying file included with the OpenSAF distribution for full + * licensing terms. + * + * Author(s): Ericsson AB + * + */ +Background +== +If OpenSAF configures TIPC as transport, the MDS library today will use +TIPC SOCK_RDM socket for message distribution in the cluster. The SOCK_RDM +datagram socket possibly encounters buffer overflow at receiver ends which +has been documented in tipc.io[1]. A temporary solution for this buffer +overflow issue is that the socket buffer size can be increased to a larger +number. However, if the cluster continues either scaling out or adding more +components, the system will be under dimensioned, thus the TIPC buffer +overflow can occur again. + +MDS's solution for TIPC buffer overflow +=== +If MDS disables TIPC_DEST_DROPPABLE, TIPC will return the ancillary message +when the original message is failed to deliver. By this event, if the message +has been saved in queue, MDS at sender sides can search and retransmit this +message to the receivers. +Once the messages in the sender's queue has been delivered successfully, MDS +needs to remove them. 
MDS introduces its internal ACK message as an +acknowledgment from receivers so that the senders can remove the messages +out of the queue. +Also, in such a situation of buffer overflow at receivers, the retransmission may +not succeed or may even make things worse at receiver ends (the more retransmissions, +the more overflows occur). MDS imitates the sliding window in TCP[2] to +control the flow of data messages towards the receivers. + +Legacy MDS data message, new (data + ACK) MDS message, and upgradability + +Below is the MDS legacy message format that has been used till OpenSAF 5.19.07 + +oct 0 message length +oct 1 +-- +oct 2 sequence number: incremented for every message sent out to all destined +... tipc portid. +oct 5 +-- +oct 6 fragment number: a message with the same sequence number can be fragmented, +oct 7 identified by this fragment number. +-- +oct 8 length check: cross check with message length(oct0,1), NOT USED. +oct 9 +-- +oct 10 protocol version: (MDS_PROT:0xA0 |
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Vu, I see it, will add. Thanks Minh On 16/9/19 4:21 pm, Nguyen Minh Vu wrote: Hi Minh, See my responses to your comments below, started with [Vu2]. Regards, Vu On 9/16/19 1:06 PM, Minh Hon Chau wrote: Hi Vu, Several comments with [M] too :). Thanks Minh On 16/9/19 2:24 pm, Nguyen Minh Vu wrote: Hi Minh, I have several comments below, started with [Vu]. Regards, Vu On 8/14/19 1:01 PM, Minh Chau wrote: This is a collaborative patch of two participants: - Tran Thuan - Minh Chau Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. --- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c | 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h | 47 + src/mds/mds_tipc_fctrl_msg.cc | 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + 
src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - src/mds/mds_tipc_recvq_stats_impl.cc + src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const 
unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) { struct sockaddr_tipc addr; socklen_t sz = sizeof(addr); memset(, 0, sizeof(addr)); - *port_id = 0; + port_id->node = 0; + port_id->ref = 0; if (0 > getsockname(sock, (struct sockaddr *), )) { syslog(LOG_ERR, "MDTM:TIPC Failed t
Re: [devel] [PATCH 5/9] mds: Add state machine for tipc portid instance [#1960]
Hi Vu, Some comments with [M] Thanks Minh On 16/9/19 2:56 pm, Nguyen Minh Vu wrote: Hi Minh, I has few comments below. Regards, Vu On 8/14/19 1:38 PM, Minh Chau wrote: This patch adds state machine to support tx probation timer. --- src/mds/mds_tipc_fctrl_intf.cc | 47 +++-- src/mds/mds_tipc_fctrl_msg.h | 1 + src/mds/mds_tipc_fctrl_portid.cc | 109 +++ src/mds/mds_tipc_fctrl_portid.h | 22 4 files changed, 176 insertions(+), 3 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index bd0a8f6..c2d0922 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -34,6 +34,7 @@ using mds::Event; using mds::TipcPortId; +using mds::Timer; using mds::DataMessage; using mds::ChunkAck; using mds::HeaderMessage; @@ -65,6 +66,11 @@ uint64_t sock_buf_size = 0; std::map portid_map; std::mutex portid_map_mutex; +// probation timer event to enable flow control at receivers +const int64_t kBaseTimerInt = 200; // in centisecond +const uint8_t kTxProbMaxRetries = 10; +Timer txprob_timer(Event::Type::kEvtTmrTxProb); + // chunk ack parameters // todo: The chunk ack timeout and chunk ack size should be configurable int kChunkAckTimeout = 1000; // in miliseconds @@ -76,13 +82,37 @@ TipcPortId* portid_lookup(struct tipc_portid id) { return portid_map[uid]; } +void tmr_exp_cbk(void* uarg) { + Timer* timer = reinterpret_cast(uarg); + if (timer != nullptr) { + timer->is_active_ = false; + // send to fctrl thread + if (m_NCS_IPC_SEND(_events, new Event(timer->type_), + NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { + m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events\n"); + } + } +} + void process_timer_event(const Event evt) { + bool txprob_restart = false; for (auto i : portid_map) { TipcPortId* portid = i.second; + + if (evt.type_ == Event::Type::kEvtTmrTxProb) { + if (portid->ReceiveTmrTxProb(kTxProbMaxRetries) == true) { + txprob_restart = true; + } + } + if (evt.type_ == Event::Type::kEvtTmrChunkAck) { 
portid->ReceiveTmrChunkAck(); } } + if (txprob_restart) { + txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk); + m_MDS_LOG_DBG("FCTRL: Restart txprob"); + } } uint32_t process_flow_event(const Event evt) { @@ -231,8 +261,10 @@ uint32_t mds_tipc_fctrl_sndqueue_capable(struct tipc_portid id, uint16_t len, id.node, id.ref, __LINE__); rc = NCSCC_RC_FAILURE; } else { - // assign the sequence number of the outgoing message - *next_seq = portid->GetCurrentSeq(); + if (portid->state_ != TipcPortId::State::kDisabled) { + // assign the sequence number of the outgoing message + *next_seq = portid->GetCurrentSeq(); + } } portid_map_mutex.unlock(); @@ -252,7 +284,16 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t *buffer, uint16_t len, id.node, id.ref, __LINE__); rc = NCSCC_RC_FAILURE; } else { - portid->Queue(buffer, len); + if (portid->state_ != TipcPortId::State::kDisabled) { + portid->Queue(buffer, len); + // start txprob timer for the first msg sent out + // do not start for other states + if (portid->state_ == TipcPortId::State::kStartup) { + txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk); + m_MDS_LOG_DBG("FCTRL: Start txprob"); + portid->state_ = TipcPortId::State::kTxProb; + } + } } portid_map_mutex.unlock(); diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h index 8e6a874..69f8048 100644 --- a/src/mds/mds_tipc_fctrl_msg.h +++ b/src/mds/mds_tipc_fctrl_msg.h @@ -45,6 +45,7 @@ class Event { kEvtDropData, // event reported from tipc that a message is not // delivered kEvtTmrAll, + kEvtTmrTxProb, // event that tx probation timer expired for once kEvtTmrChunkAck, // event to send the chunk ack }; NCS_IPC_MSG next_{0}; diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 64115d5..84ecee9 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -23,6 +23,35 @@ namespace mds { +Timer::Timer(Event::Type type) { + tmr_id_ = nullptr; + type_ = type; + is_active_ = false; +} + +Timer::~Timer() { 
[Vu] Is it required to stop the timer here if it still in active? [M]: Yes, will add the Stop() here +} + +void Timer::Start(int64_t period, void (*tmr_exp_func)(void*)) { + // timer will not start if it's already started + // period is in centiseconds + if (is_active_ == false) { + if (tmr_id_ == nullptr) { + tmr_id_ = ncs_tmr_alloc(nullptr, 0); + } + tmr_id_ = ncs_tmr_start(tmr_id_, period, tmr_exp_func, this, + nullptr, 0); + is_active_ = true; + } +} + +void Timer::Stop() { [Vu] This method is not called from anywhere. Is there
Re: [devel] [PATCH 4/9] mds: Add timeout for ack message [#1960]
Hi Vu, Some comments with [M] Thanks Minh On 16/9/19 2:37 pm, Nguyen Minh Vu wrote: Hi Minh, I have minor comments below. Regards, Vu On 8/14/19 1:38 PM, Minh Chau wrote: If the ack size is configured greater than 1, there should be a timeout at receiver ends to send the ack message back to senders. The ack message timeout utilizes the poll timeout in flow control thread to make mds lightweight (in contrast to additional timer threads). --- src/mds/mds_tipc_fctrl_intf.cc | 33 ++--- src/mds/mds_tipc_fctrl_msg.h | 6 ++ src/mds/mds_tipc_fctrl_portid.cc | 15 +++ src/mds/mds_tipc_fctrl_portid.h | 1 + 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 91b9107..bd0a8f6 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -66,7 +66,8 @@ std::map portid_map; std::mutex portid_map_mutex; // chunk ack parameters -// todo: The chunk ack size should be configurable +// todo: The chunk ack timeout and chunk ack size should be configurable +int kChunkAckTimeout = 1000; // in miliseconds uint16_t kChunkAckSize = 3; TipcPortId* portid_lookup(struct tipc_portid id) { @@ -75,6 +76,15 @@ TipcPortId* portid_lookup(struct tipc_portid id) { return portid_map[uid]; } +void process_timer_event(const Event evt) { + for (auto i : portid_map) { + TipcPortId* portid = i.second; + if (evt.type_ == Event::Type::kEvtTmrChunkAck) { + portid->ReceiveTmrChunkAck(); + } + } +} + uint32_t process_flow_event(const Event evt) { uint32_t rc = NCSCC_RC_SUCCESS; TipcPortId *portid = portid_lookup(evt.id_); @@ -110,7 +120,7 @@ uint32_t process_flow_event(const Event evt) { uint32_t process_all_events(void) { enum { FD_FCTRL = 0, NUM_FDS }; - int poll_tmo = MDTM_TIPC_POLL_TIMEOUT; + int poll_tmo = kChunkAckTimeout; while (true) { int pollres; struct pollfd pfd[NUM_FDS] = {{0}}; @@ -135,11 +145,24 @@ uint32_t process_all_events(void) { if (evt == nullptr) continue; portid_map_mutex.lock(); - 
process_flow_event(*evt); + + if (evt->IsTimerEvent()) { + process_timer_event(*evt); + } + if (evt->IsFlowEvent()) { + process_flow_event(*evt); + } + [Vu] Should log something here if the event is none of above? [M] Probably not, the event is created internally so we know there won't be any rather than the above delete evt; portid_map_mutex.unlock(); } } + // timeout, scan all portid and send ack msgs + if (pollres == 0) { + portid_map_mutex.lock(); + process_timer_event(Event(Event::Type::kEvtTmrChunkAck)); + portid_map_mutex.unlock(); + } } /* while */ return NCSCC_RC_SUCCESS; } @@ -368,6 +391,10 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len, portid_map_mutex.lock(); uint32_t rc = process_flow_event(Event(Event::Type::kEvtRcvData, id, data.svc_id_, header.mseq_, header.mfrag_, header.fseq_)); + if (rc == NCSCC_RC_CONTINUE) { + process_timer_event(Event(Event::Type::kEvtTmrChunkAck)); [Vu] Missed to unlock the mutex here [M] It's not missed, it's called before return + rc = NCSCC_RC_SUCCESS; + } portid_map_mutex.unlock(); return rc; } diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h index 677f256..8e6a874 100644 --- a/src/mds/mds_tipc_fctrl_msg.h +++ b/src/mds/mds_tipc_fctrl_msg.h @@ -44,6 +44,8 @@ class Event { // selective data msgs (not supported) kEvtDropData, // event reported from tipc that a message is not // delivered + kEvtTmrAll, + kEvtTmrChunkAck, // event to send the chunk ack }; NCS_IPC_MSG next_{0}; Type type_; @@ -68,6 +70,10 @@ class Event { fseq_(f_seg_num), chunk_size_(chunk_size) { type_ = type; } + bool IsTimerEvent() { return (type_ > Type::kEvtTmrAll); } + bool IsFlowEvent() { + return (Type::kEvtDataFlowAll < type_ && type_ < Type::kEvtTmrAll); + } [Vu] Consider making these ones to be constant methods if they do not change any of their attribute values. 
[M] Yes, will add const }; class BaseMessage { diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc index 24d13ee..64115d5 100644 --- a/src/mds/mds_tipc_fctrl_portid.cc +++ b/src/mds/mds_tipc_fctrl_portid.cc @@ -67,6 +67,8 @@ TipcPortId::TipcPortId(struct tipc_portid id, int sock, uint16_t chksize, } TipcPortId::~TipcPortId() { + // Fake a TmrChunkAck event to ack all received messages + ReceiveTmrChunkAck(); // clear all msg in sndqueue_ sndqueue_.Clear(); } @@ -156,6 +158,7 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t mfrag, // send ack for @chunk_size_ msgs starting
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Vu, Several comments with [M] too :). Thanks Minh On 16/9/19 2:24 pm, Nguyen Minh Vu wrote: Hi Minh, I have several comments below, started with [Vu]. Regards, Vu On 8/14/19 1:01 PM, Minh Chau wrote: This is a collaborative patch of two participants: - Tran Thuan - Minh Chau Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. --- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c | 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h | 47 + src/mds/mds_tipc_fctrl_msg.cc | 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - 
src/mds/mds_tipc_recvq_stats_impl.cc + src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct 
tipc_portid* port_id) { struct sockaddr_tipc addr; socklen_t sz = sizeof(addr); memset(&addr, 0, sizeof(addr)); - *port_id = 0; + port_id->node = 0; + port_id->ref = 0; if (0 > getsockname(sock, (struct sockaddr *)&addr, &sz)) { syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: %s", strerror(errno)); return false; } - *port_id = addr.addr.id.ref; + *port_id = addr.addr.id; return true; } @@ -240,12 +243,13 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) } /* Code for
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Hans, Gary, Vu Do you have any comments on remaining patches? Thanks Minh On 11/9/19 11:01 am, Minh Hon Chau wrote: Hi Gary, Thanks for the review, please find comments with [M]. /Minh On 10/9/19 6:02 pm, Gary Lee wrote: Hi Minh & Thuan Some minor comments marked with [GL]. On 14/8/19 4:38 pm, Minh Chau wrote: This is a collaborative patch of two participants:Thuan, Minh. Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. --- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c | 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h | 47 + src/mds/mds_tipc_fctrl_msg.cc | 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h 
lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - src/mds/mds_tipc_recvq_stats_impl.cc + src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool 
get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) { struct sockaddr_tipc addr; socklen_t sz = sizeof(addr); memset(&addr, 0, sizeof(addr)); - *port_id = 0; + port_id->node = 0; + port_id->ref = 0; if (0 > getsockname(sock, (struct sockaddr *)&addr, &sz)) { syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: %s", strerror(errno)); return false; } - *port
Re: [devel] [PATCH 7/9] mds: Add configurable parameters [#1960]
Hi Vu, I have comments marked with [M]. Thanks Minh On 13/9/19 6:40 pm, Nguyen Minh Vu wrote: Hi Minh, I have minor comments below. Regards, Vu On 8/14/19 1:38 PM, Minh Chau wrote: This patch makes the solution of TIPC buffer overflow configurable, as well as the ack timeout/ack size. For example: The service config file can export the following environment variables export MDS_TIPC_FCTRL_ENABLED=1 export MDS_TIPC_FCTRL_ACKTIMEOUT=1000 export MDS_TIPC_FCTRL_ACKSIZE=1 If MDS_TIPC_FCTRL_ACKTIMEOUT, MDS_TIPC_FCTRL_ACKSIZE are not specified, the default values are used. --- src/mds/mds_dt_tipc.c | 19 --- src/mds/mds_tipc_fctrl_intf.cc | 25 +++-- src/mds/mds_tipc_fctrl_intf.h | 3 ++- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index fef1c50..1b6c3f8 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -342,9 +342,22 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) } /* Create flow control tasks if enabled*/ - gl_mds_pro_ver = MDS_PROT_FCTRL; - mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, - (uint64_t)optval, tipc_mcast_enabled); + char* ptr; + if ((ptr = getenv("MDS_TIPC_FCTRL_ENABLED")) != NULL) { + if (atoi(ptr) == 1) { + gl_mds_pro_ver = MDS_PROT_FCTRL; + int ackto = -1; + int acksize = -1; + if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) { + ackto = atoi(ptr); + } + if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) { + acksize = atoi(ptr); + } [Vu] Do we have a valid range for these environment variables? What if they are mistakenly set to empty values? e.g: export MDS_TIPC_FCTRL_ACKTIMEOUT="" [M] We have base::GetEnv and I will try to use it here; if that is not possible because this source file is C code, I will add more handling for out-of-range values, or a warning if a value is set e.g. too big. 
+ mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, (uint64_t)optval, + ackto, acksize, tipc_mcast_enabled); + } + } /* Create a task to receive the events and data */ if (mdtm_create_rcv_task(tipc_cb.hdle_mdtm) != NCSCC_RC_SUCCESS) { diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc index 397114e..8949937 100644 --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -40,6 +40,9 @@ using mds::ChunkAck; using mds::HeaderMessage; namespace { +// flow control enabled/disabled +bool is_fctrl_enabled = false; + // multicast/broadcast enabled // todo: to be removed if flow control support it bool is_mcast_enabled = true; @@ -225,7 +228,8 @@ uint32_t create_ncs_task(void *task_hdl) { } // end local namespace uint32_t mds_tipc_fctrl_initialize(int dgramsock, struct tipc_portid id, - uint64_t rcv_buf_size, bool mcast_enabled) { + uint64_t rcv_buf_size, int32_t ackto, int32_t acksize, + bool mcast_enabled) { if (create_ncs_task(&p_task_hdl) != NCSCC_RC_SUCCESS) { m_MDS_LOG_ERR("FCTRL: Start of the Created Task-failed:\n"); @@ -234,8 +238,10 @@ uint32_t mds_tipc_fctrl_initialize(int dgramsock, struct tipc_portid id, data_sock_fd = dgramsock; snd_rcv_portid = id; sock_buf_size = rcv_buf_size; + is_fctrl_enabled = true; is_mcast_enabled = mcast_enabled; - + if (ackto != -1) kChunkAckTimeout = ackto; + if (acksize != -1) kChunkAckSize = acksize; m_MDS_LOG_NOTIFY("FCTRL: Initialize [node:%x, ref:%u]", id.node, id.ref); @@ -243,6 +249,7 @@ uint32_t mds_tipc_fctrl_initialize(int dgramsock, struct tipc_portid id, } uint32_t mds_tipc_fctrl_shutdown(void) { + if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS; if (ncs_task_release(p_task_hdl) != NCSCC_RC_SUCCESS) { m_MDS_LOG_ERR("FCTRL: Stop of the Created Task-failed:\n"); } @@ -251,6 +258,8 @@ uint32_t mds_tipc_fctrl_shutdown(void) { uint32_t mds_tipc_fctrl_sndqueue_capable(struct tipc_portid id, uint16_t len, uint16_t* next_seq) { + if (is_fctrl_enabled == false) return 
NCSCC_RC_SUCCESS; + uint32_t rc = NCSCC_RC_SUCCESS; portid_map_mutex.lock(); [Vu] We have a common class base::Lock that unlocks automatically when it goes out of scope. Should we make portid_map_mutex a Lock object? [M]: Yes I should use base::Lock, will change it. @@ -274,6 +283,8 @@ uint32_t mds_tipc_fctrl_sndqueue_capable(struct tipc_portid id, uint16_t len, uint32_t mds_tipc_fctrl_trysend(const uint8_t *buffer, uint16_t len, struct tipc_portid id) { + if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS; + uint32_t rc = NCSCC_RC_SUCCESS; portid_map_mutex.lock(); @@ -304,6 +315,8 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t *buffer, uint16_t len, } uint32_t mds_tipc_fctrl_portid_up(struct tipc_portid id,
Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
Hi Gary, This V2 has fixed the error reported in V1, ack from me. Thanks Minh On 12/9/19 5:20 pm, Gary Lee wrote: If delayed failover is enabled, and a downgrade to a version without #3060 occurs, then the standby running a newer version with #3060 may complain about an out of sync error during warm sync. --- src/amf/amfd/ckpt_dec.cc | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 6288b4f..75213f8 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt) LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u", updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt); -if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) - LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", - updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); - +if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u", + updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt); + } else { +// Versions before 10 did not support failover_updt +// After a downgrade scenario, where the active is < v10 +// and this node is >= v10, then there will be failover_updt mismatch +// If so, just set the value to what's on the older active +cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt; + +// check again +if (0 == memcmp(updt_cnt, &cb->async_updt_cnt, +sizeof(AVSV_ASYNC_UPDT_CNT))) { + cb->stby_sync_state = AVD_STBY_IN_SYNC; + return status; +} + } +} LOG_ER("Out of sync detected in warm sync response, exiting"); osafassert(0); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]
Hi Gary, The patch works fine in the reported scenario, no coredump in amfd. But after downgrade succeeds (meaning the sc1 is active and running old software, the sc2 is standby running latest software + #3078), I continue another switchover to make sc2 back to active, I got error Thanks Minh 2019-09-11 14:31:58.633 SC-2 osafamfd[280]: WA avsv_validate_reo_type_in_csync: unknown type 52 2019-09-11 14:31:58.674 SC-2 osafimmnd[234]: NO Implementer (applier) connected: 43 (@OpenSafImmReplicatorB) <0, 2010f> 2019-09-11 14:31:59.496 SC-2 osafimmnd[234]: NO Implementer disconnected 35 <0, 2010f> (safAmfService) 2019-09-11 14:31:59.500 SC-2 osafimmnd[234]: NO Implementer (applier) connected: 44 (@safAmfService2010f) <0, 2010f> 2019-09-11 14:31:59.524 SC-2 osafamfd[280]: NO Switching StandBy --> Active State 2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER Switch Standby --> Active FAILED, Standby OUT OF SYNC 2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER avd_role_change role change failure 2019-09-11 14:31:59.544 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:7, dest:13) 2019-09-11 14:31:59.547 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:7, dest:13) 2019-09-11 14:31:59.551 SC-2 osafamfnd[290]: NO AVD NEW_ACTIVE, adest:1 2019-09-11 14:31:59.563 SC-2 osafimmnd[234]: NO Implementer disconnected 44 <0, 2010f> (@safAmfService2010f) 2019-09-11 14:31:59.566 SC-2 osafimmnd[234]: NO Implementer connected: 45 (safAmfService) <0, 2010f> 2019-09-11 14:31:59.580 SC-2 osafamfd[280]: WA avsv_validate_reo_type_in_csync: unknown type 52 2019-09-11 14:32:09.626 SC-2 osafamfd[280]: message repeated 4 times: [ WA avsv_validate_reo_type_in_csync: unknown type 52] 2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 25 (change:4, dest:564114788998701) 2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:1, dest:13) 2019-09-11 14:32:59.776 SC-2 osafimmd[223]: NO MDS event from svc_id 24 (change:6, dest:13) 2019-09-11 14:32:59.777 
SC-2 osaffmd[213]: NO IMMND down on: 2010f 2019-09-11 14:32:59.777 SC-2 osafimmnd[234]: WA DISCARD DUPLICATE FEVS message:2334 2019-09-11 14:32:59.778 SC-2 osafimmnd[234]: WA Error code 2 returned for message type 82 - ignoring 2019-09-11 14:32:59.778 SC-2 osafimmd[223]: WA IMMD lost contact with peer IMMD (NCSMDS_RED_DOWN) 2019-09-11 14:32:59.780 SC-2 osaffmd[213]: NO Node Down event for node id 2010f: 2019-09-11 14:32:59.780 SC-2 osafrded[204]: NO Peer down on node 0x2010f 2019-09-11 14:32:59.782 SC-2 osaffmd[213]: NO AMFND down on: 2010f 2019-09-11 14:32:59.783 SC-2 osaffmd[213]: NO FM down on: 2010f 2019-09-11 14:32:59.784 SC-2 osafamfd[280]: NO Node 'SC-1' is down. Start failover delay timer 2019-09-11 14:32:59.784 SC-2 osaffmd[213]: NO IMMD down on: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO AVD down on: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Core services went down on node_id: 2010f 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Current role: STANDBY 2019-09-11 14:32:59.788 SC-2 osaffmd[213]: Rebooting OpenSAF NodeId = 131343 EE Name = , Reason: Received Node Down for peer controller, OwnNodeId = 131599, SupervisionTime = 60 2019-09-11 14:32:59.789 SC-2 osafclmd[270]: NO Node 131343 went down. Not sending track callback for agents on that node 2019-09-11 14:32:59.792 SC-2 osafclmd[270]: message repeated 4 times: [ NO Node 131343 went down. 
Not sending track callback for agents on that node] 2019-09-11 14:32:59.792 SC-2 osafclmd[270]: NO saflog write "safNode=SC-1,safCluster=myClmCluster LEFT, init view=9, cluster view=10" FAILED: SA_AIS_ERR_TRY_AGAIN (6) 2019-09-11 14:32:59.792 SC-2 osafamfd[280]: NO Start timer for '2010f' 2019-09-11 14:32:59.808 SC-2 opensaf_reboot: Rebooting remote node in the absence of PLM is outside the scope of OpenSAF 2019-09-11 14:32:59.809 SC-2 osaffmd[213]: NO Controller Failover: Setting role to ACTIVE 2019-09-11 14:32:59.809 SC-2 osafrded[204]: NO RDE role set to ACTIVE 2019-09-11 14:32:59.810 SC-2 osafrded[204]: NO Running '/usr/local/lib/opensaf/opensaf_sc_active' with 0 argument(s) 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: NO FAILOVER StandBy --> Active 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: ER FAILOVER StandBy --> Active FAILED, Standby OUT OF SYNC 2019-09-11 14:32:59.812 SC-2 osafamfd[280]: Rebooting OpenSAF NodeId = 0 EE Name = No EE Mapped, Reason: FAILOVER failed, OwnNodeId = 131599, SupervisionTime = 60 2019-09-11 14:31:58.181 SC-1 osafamfd[273]: NO ROLE SWITCH Active --> Quiesced 2019-09-11 14:31:58.675 SC-1 osafimmnd[233]: NO Implementer (applier) connected: 43 (@OpenSafImmReplicatorB) <269, 2010f> 2019-09-11 14:31:58.676 SC-1 osafntfimcnd[471]: NO Started 2019-09-11 14:31:59.496 SC-1 osafimmnd[233]: NO Implementer disconnected 35 <97, 2010f> (safAmfService) 2019-09-11 14:31:59.501 SC-1 osafimmnd[233]: NO Implementer (applier) connected: 44 (@safAmfService2010f) <97, 2010f> 2019-09-11 14:31:59.525 SC-1
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Gary, Thanks for the review, please find comments with [M]. /Minh On 10/9/19 6:02 pm, Gary Lee wrote: Hi Minh & Thuan Some minor comments marked with [GL]. On 14/8/19 4:38 pm, Minh Chau wrote: This is a collaborative patch of two participants:Thuan, Minh. Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. --- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c | 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h | 47 + src/mds/mds_tipc_fctrl_msg.cc | 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - src/mds/mds_tipc_recvq_stats_impl.cc + 
src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) { struct 
sockaddr_tipc addr; socklen_t sz = sizeof(addr); memset(&addr, 0, sizeof(addr)); - *port_id = 0; + port_id->node = 0; + port_id->ref = 0; if (0 > getsockname(sock, (struct sockaddr *)&addr, &sz)) { syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: %s", strerror(errno)); return false; } - *port_id = addr.addr.id.ref; + *port_id = addr.addr.id; return true; } @@ -240,12 +243,13 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref) } /* Code for getting the self tipc random
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Hans, I will update the code for that point. Thanks Minh On 23/8/19 11:14 pm, Hans Nordebäck wrote: Hi Minh, see one comment below. /Thanks Hans On 2019-08-23 03:48, Minh Hon Chau wrote: Hi Hans, Thanks for your time to review the patch, please see my replies below your comments. Regards, Minh On 22/8/19 11:07 pm, Hans Nordebäck wrote: Hi Minh, it is a large patch so i have to review parts of it, below are my comments, marked with [HansN], for files: src/mds/Makefile.am src/mds/mds_dt.h src/mds/mds_dt_tipc.c I'll continue with the rest of the files a bit later. /Thanks Hans On 2019-08-14 08:38, Minh Chau wrote: This is a collaborative patch of two participants:Thuan, Minh. Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. 
--- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c | 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h | 47 + src/mds/mds_tipc_fctrl_msg.cc | 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - src/mds/mds_tipc_recvq_stats_impl.cc + src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t 
mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct tipc_portid* port_i
Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]
Hi Hans, Thanks for your time to review the patch, please see my replies below your comments. Regards, Minh On 22/8/19 11:07 pm, Hans Nordebäck wrote: Hi Minh, it is a large patch so i have to review parts of it, below are my comments, marked with [HansN], for files: src/mds/Makefile.am src/mds/mds_dt.h src/mds/mds_dt_tipc.c I'll continue with the rest of the files a bit later. /Thanks Hans On 2019-08-14 08:38, Minh Chau wrote: This is a collaborative patch of two participants:Thuan, Minh. Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. --- src/mds/Makefile.am | 10 +- src/mds/mds_dt.h | 8 +- src/mds/mds_dt_tipc.c| 188 +--- src/mds/mds_tipc_fctrl_intf.cc | 376 +++ src/mds/mds_tipc_fctrl_intf.h| 47 + src/mds/mds_tipc_fctrl_msg.cc| 142 +++ src/mds/mds_tipc_fctrl_msg.h | 129 ++ src/mds/mds_tipc_fctrl_portid.cc | 261 +++ src/mds/mds_tipc_fctrl_portid.h | 87 + 9 files changed, 1184 insertions(+), 64 deletions(-) create mode 100644 src/mds/mds_tipc_fctrl_intf.cc create mode 100644 src/mds/mds_tipc_fctrl_intf.h create mode 100644 src/mds/mds_tipc_fctrl_msg.cc create mode 100644 src/mds/mds_tipc_fctrl_msg.h create mode 100644 src/mds/mds_tipc_fctrl_portid.cc create mode 100644 src/mds/mds_tipc_fctrl_portid.h diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am index 2d7b652..d849e8f 100644 --- a/src/mds/Makefile.am +++ b/src/mds/Makefile.am @@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \ if ENABLE_TIPC_TRANSPORT noinst_HEADERS += src/mds/mds_dt_tipc.h \ src/mds/mds_tipc_recvq_stats.h \ - src/mds/mds_tipc_recvq_stats_impl.h + src/mds/mds_tipc_recvq_stats_impl.h \ + 
src/mds/mds_tipc_fctrl_intf.h \ + src/mds/mds_tipc_fctrl_portid.h \ + src/mds/mds_tipc_fctrl_msg.h lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \ src/mds/mds_tipc_recvq_stats.cc \ - src/mds/mds_tipc_recvq_stats_impl.cc + src/mds/mds_tipc_recvq_stats_impl.cc \ + src/mds/mds_tipc_fctrl_intf.cc \ + src/mds/mds_tipc_fctrl_portid.cc \ + src/mds/mds_tipc_fctrl_msg.cc endif if ENABLE_TESTS diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index b645bb4..d9e8633 100644 --- a/src/mds/mds_dt.h +++ b/src/mds/mds_dt.h @@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref); uint32_t mds_tmr_mailbox_processing(void); uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl); uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num, - uint16_t frag_byte); + uint16_t frag_byte, uint16_t fctrl_seq_num); uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg); uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id, uint32_t *buff_dump); @@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT msg); #define MDS_PROT 0xA0 #define MDS_VERSION 0x08 -#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION) +#define MDS_PROT_VER_MASK 0xFC #define MDTM_PRI_MASK 0x3 +/* MDS protocol/version for flow control */ +#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION) +#define MDS_PROT_FCTRL_ID 0x00AC13F5 + /* Added for the subscription changes */ #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff) #define MDS_TIPC_COMMON_ID 0x01001000 diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c index 86b52bb..fef1c50 100644 --- a/src/mds/mds_dt_tipc.c +++ b/src/mds/mds_dt_tipc.c @@ -47,6 +47,7 @@ #include "mds_dt_tipc.h" #include "mds_dt_tcp_disc.h" #include "mds_core.h" +#include "mds_tipc_fctrl_intf.h" #include "mds_tipc_recvq_stats.h" #include "base/osaf_utility.h" #include "base/osaf_poll.h" @@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list; uint32_t mdtm_global_frag_num; const 
unsigned int MAX_RECV_THRESHOLD = 30; +uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION; -static bool get_tipc_port_id(int sock, uint32_t* port_id) { +static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) { struct sockaddr_tipc addr; socklen_t sz = sizeof(addr); memset(, 0, sizeof(addr)); - *port_id = 0; + port_id->node = 0; + port_id->ref = 0; if (0 > getsockname(sock, (struct sockaddr *), )) { syslog(LOG_ERR, "MDTM:TIPC
Re: [devel] [PATCH 1/1] amfd: set failover_state on standby [#3072]
Hi, Ack, review only. Quoting Gary Lee : Otherwise, after two controller failovers, unexpected reboot of previously rebooted payloads may occur. --- src/amf/amfd/node_state_machine.cc | 6 ++ 1 file changed, 6 insertions(+) diff --git a/src/amf/amfd/node_state_machine.cc b/src/amf/amfd/node_state_machine.cc index efe2085..d38f79e 100644 --- a/src/amf/amfd/node_state_machine.cc +++ b/src/amf/amfd/node_state_machine.cc @@ -63,6 +63,12 @@ void NodeStateMachine::SetState(uint32_t state) { LOG_NO("New state '%u'", state); } + // this is needed for cold sync, in case this node (currently standby) + // becomes active later + AVD_AVND *node = avd_node_find_nodeid(node_id_); + osafassert(node != nullptr); + node->failover_state = state; + switch (state) { case NodeState::kStart: state_ = std::make_shared(this); -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/9] mds: Add README for solution of TIPC buffer overflow at MDS [#1960]
Hi Hans, I will update txprob -> "tx probation" The kEnabled, it means for a state of a tipc portid only. There's another @is_fctrl_enabled, that's for the feature whether mds has flow control enabled/disabled. Thanks Minh On 14/8/19 5:48 pm, Hans Nordebäck wrote: Hi Minh, ack, some minor comments below/Thanks Hans On 2019-08-14 08:38, Minh Chau wrote: --- src/mds/README | 221 + 1 file changed, 221 insertions(+) create mode 100644 src/mds/README diff --git a/src/mds/README b/src/mds/README new file mode 100644 index 000..1b94632 --- /dev/null +++ b/src/mds/README @@ -0,0 +1,221 @@ +/* -*- OpenSAF -*- + * + * (C) Copyright 2019 The OpenSAF Foundation + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed + * under the GNU Lesser General Public License Version 2.1, February 1999. + * The complete license can be accessed from the following location: + * http://opensource.org/licenses/lgpl-license.php + * See the Copying file included with the OpenSAF distribution for full + * licensing terms. + * + * Author(s): Ericsson AB + * + */ +Background +== +If OpenSAF configures TIPC as transport, the MDS library today will use +TIPC SOCK_RDM socket for message distribution in the cluster. The SOCK_RDM +datagram socket possibly encounters buffer overflow at receiver ends which +has been documented in tipc.io[1]. A temporary solution for this buffer +overflow issue is that the socket buffer size can be increased to a larger +number. However, if the cluster continues either scaling out or adding more +components, the system will be under dimensioned, thus the TIPC buffer +overflow can occur again. + +MDS's solution for TIPC buffer overflow +=== +If MDS disables TIPC_DEST_DROPPABLE, TIPC will return the ancillary message +when the original message is failed to deliver. 
By this event, if the message +has been saved in queue, MDS at sender sides can search and retransmit this +message to the receivers. +Once the messages in the sender's queue has been delivered successfully, MDS +needs to remove them. MDS introduces its internal ACK message as an +acknowledgment from receivers so that the senders can remove the messages +out of the queue. +Also, as such situation of buffer overflow at receivers, the retransmission may +not succeed or even become worse at receiver ends (the more retransmission, +the more overflow to occur). MDS imitates the sliding window in TCP[2] to +control the flow of data message towards the receivers. + +Legacy MDS data message, new (data + ACK) MDS message, and upgradability + +Below is the MDS legacy message format that has been used till OpenSAF 5.19.07 + +oct 0 message length +oct 1 +-- +oct 2 sequence number: incremented for every message sent out to all destined +... tipc portid. +oct 5 +-- +oct 6 fragment number: a message with same sequence number can be fragmented, +oct 7 identified by this fragment number. +-- +oct 8 length check: cross check with message length(oct0,1), NOT USED. +oct 9 +-- +oct 10 protocol version: (MDS_PROT:0xA0 | MDS_VERSION:0x08) = 0xA8, NOT USED +-- +oct 11 mds length: length of mds header and mds data, starting from oct13 +oct 12 +-- +oct 13 mds header and data +... +-- + +The current sequence number/fragment number are being used in MDS for all +messages sent to all discovered tipc portid(s), meaning that every message is sent +to any tipc portid, the sequence/fragment number is increased. The flow control +needs its own sequence number sliding between two tipc porid(s) so that receivers +can detect message drop due to buffer overload. Therefore, the oct8 and oct9 are +now reused as flow control sequence number. The oct10, protocol version, has new +value of 0xB8. The format of new data message as below: + +oct 0 same +... 
+oct 7 +-- +oct 8 flow control sequence number +oct 9 +-- +oct 10 protocol version: (MDS_PROT_TIPC_FCTRL:0xB0 | MDS_VERSION:0x08) = 0xB8 +-- +oct 11 same +... +-- + +The ACK message is introduced to acknowledge one data message or a chunk of +accumulative data message. The ACK message format: + +oct 0 message length +oct 1 +-- +oct 2 8 bytes, NOT USED + +oct 9
Re: [devel] [PATCH 0/9] Review Request for mds: Add solution for TIPC buffer overflow [#1960]
Hi all, Please ignore this patch series, the patch 2/9 and 9/9 committed under Thuan's name/email, have been dropped (for some reason :) ) when sending for review. I am sending again. Thanks Minh On 14/8/19 4:01 pm, Minh Chau wrote: Summary: mds: Add solution of TIPC buffer overflow at MDS [#1960] Review request for Ticket(s): 1960 Peer Reviewer(s): Anders, HansN, Lennart, Gary, Vu, Thuan Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-1960 Base revision: 2d85d5d9264c6a7d1c6601b900fb810facbee3ac Personal repository: git://git.code.sf.net/u/minh-chau/review Impacted area Impact y/n Docs n Build system n RPM/packaging n Configuration files n Startup scripts n SAF services y OpenSAF services n Core libraries y Samples n Tests n Other n NOTE: Patch(es) contain lines longer than 80 characters Comments (indicate scope for each "y" above): - Sending on behalf of Thuan & Minh. Some pending tasks to accomplish . Handle broadcast/multicast mds message with flow control. . Reduce the memory re-allocation overhead when flow control is enabled. (At this moment, memory is allocated at mds_dt_tipc.c and cloned to buffer for retransmission queue again). . The sequence number arithmetic (sna) should be implemented in /base code. . Adding mdstest to cover sna wrapped-round . MDS_CHECKSUM_ENABLE_FLAG revision c49fdeb17fae20b4e0e9af134cc9b60de846c271 Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Add TIPC buffer overflow for mdstest [#1960] revision 6948a2456642600d541b55c9787bb17cfde48a7e Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Apply serial number arithmetic for sequence counter [#1960] This patch applies the serial number arithmetic for the flow control sequence number, referenced to RFC1982. This is only a temporary patch, a proper one could be made in /base with template for other types, e.g. uint32. Then mds reuses it from /base. 
revision 87662f659682f813f6746eef0e60d1e52ab03ff1 Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Add configurable parameters [#1960] This patch makes the solution of TIPC buffer overflow configurable, as well as the ack timeout/ack size. For example: The service config file can export the following environment variables export MDS_TIPC_FCTRL_ENABLED=1 export MDS_TIPC_FCTRL_ACKTIMEOUT=1000 export MDS_TIPC_FCTRL_ACKSIZE=1 If MDS_TIPC_FCTRL_ACKTIMEOUT, MDS_TIPC_FCTRL_ACKSIZE are not specified, the default values are used. revision cd4f8af3f53b16aa05d11f30e25da209e7e51e98 Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Implement kRcvBuffOverflow state [#1960] This patch implements the kRcvBuffOverflow state machine as described in README file. revision d5c9e8fc8605f453155f4a260ebda0f78ee017b4 Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Add state machine for tipc portid instance [#1960] This patch adds state machine to support tx probation timer. revision f3f159d0aa3f43c4b28cbd6f0c7c9f041f4b6fd8 Author: Minh Chau Date: Wed, 14 Aug 2019 15:40:05 +1000 mds: Add timeout for ack message [#1960] If the ack size is configured greater than 1, there should be a timeout at receiver ends to send the ack message back to senders. The ack message timeout utilizes the poll timeout in flow control thread to make mds lightweight (in contrast to additional timer threads). 
revision 6b69713c85dfc46b4d570a61eb2e2c4b71c354f9 Author: Minh Chau Date: Wed, 14 Aug 2019 15:39:39 +1000 mds: Add implementation for TIPC buffer overflow solution [#1960] This is a collaborative patch of two participants: - Tran Thuan - Minh Chau Main changes: - Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files introduce new functions which are called in mds_dt_tipc.c if the flow control is enabled - Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files implements the tipc portid instance, which supports the sliding window, mds msg queue - Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define the event and messages which are used for this solution. revision f71e0ba303ea75b8f845d9f72ab903af93817c87 Author: Minh Chau Date: Wed, 14 Aug 2019 15:08:30 +1000 mds: Resolve c/c++ linking issue [#1960] This patch solves the linking issue if mds_dt.h or mds_core.h is included in c++ sources. revision 983ad4f94c9b9d458ba5a3f12351cd5b143c78d5 Author: Minh Chau Date: Wed, 14 Aug 2019 15:08:30 +1000 mds: Add README for solution of TIPC buffer overflow at MDS [#1960] Added Files: src/mds/mds_tipc_fctrl_intf.cc src/mds/mds_tipc_fctrl_intf.h src/mds/mds_tipc_fctrl_msg.cc src/mds/mds_tipc_fctrl_msg.h
Re: [devel] [PATCH 1/1] amf: fix no active assignment even one in-service SU can be assigned [#3020]
Hi Thuan, ack with minor comments. Thanks Minh On 18/3/19 7:04 pm, thuan.tran wrote: AMFD should try assign SI active for other in-service SUs if fail to assign for current in-service SU --- src/amf/amfd/sg_2n_fsm.cc | 75 +-- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc index 91ffc63..ba0f72e 100644 --- a/src/amf/amfd/sg_2n_fsm.cc +++ b/src/amf/amfd/sg_2n_fsm.cc @@ -630,6 +630,43 @@ done: } /* + * Function: avd_sg_2n_assign_si + * + * Purpose: This function choose and assign SIs in the SG that dont have + * active assignment. + * + * Input: cb - the AVD control block + *sg - The pointer to the service group. + *su - The pointer to the service unit to be assigned ACTIVE. + * + * Returns: True if assign succeed, otherwise return false + * + **/ +static bool avd_sg_2n_assign_si(AVD_CL_CB *cb, AVD_SG *sg, AVD_SU *su) { [M]: This function only creates active assignment, the name could be avd_sg_2n_assign_act_si (or you can come up another name) to suggest what it is actually doing inside. And add TRACE_ENTER()/LEAVE(). + bool l_flag = false; + AVD_SU_SI_REL *tmp_susi; + /* choose and assign SIs in the SG that dont have active assignment */ + for (const auto _si : sg->list_of_si) { +if ((i_si->saAmfSIAdminState == SA_AMF_ADMIN_UNLOCKED) && +(i_si->list_of_csi != nullptr) && +(i_si->si_dep_state != AVD_SI_SPONSOR_UNASSIGNED) && +(i_si->si_dep_state != AVD_SI_UNASSIGNING_DUE_TO_DEP) && +(i_si->si_dep_state != AVD_SI_READY_TO_UNASSIGN) && +(i_si->list_of_sisu == AVD_SU_SI_REL_NULL) && +(su->saAmfSUNumCurrActiveSIs < sg->saAmfSGMaxActiveSIsperSU)) { + /* found a SI that needs active assignment. 
*/ + if (avd_new_assgn_susi(cb, su, i_si, SA_AMF_HA_ACTIVE, false, + _susi) == NCSCC_RC_SUCCESS) { +l_flag = true; + } else { +LOG_ER("%s:%u: %s", __FILE__, __LINE__, i_si->name.c_str()); + } +} + } + return l_flag; +} + +/* * Function: avd_sg_2n_su_chose_asgn * * Purpose: This function will identify the current active SU. @@ -675,7 +712,10 @@ static AVD_SU *avd_sg_2n_su_chose_asgn(AVD_CL_CB *cb, AVD_SG *sg) { for (const auto : sg->list_of_su) { if (iter->saAmfSuReadinessState == SA_AMF_READINESS_IN_SERVICE) { a_su = iter; -break; +l_flag = avd_sg_2n_assign_si(cb, sg, a_su); +if (l_flag == true) { + break; +} } } @@ -683,36 +723,13 @@ static AVD_SU *avd_sg_2n_su_chose_asgn(AVD_CL_CB *cb, AVD_SG *sg) { TRACE("No in service SUs available in the SG"); goto done; } - } else { /* if (a_susi == AVD_SU_SI_REL_NULL) */ - + } else { /* if (a_susi != AVD_SU_SI_REL_NULL) */ a_su = a_susi->su; - } - - if (a_su->saAmfSuReadinessState != SA_AMF_READINESS_IN_SERVICE) { -TRACE("The current active SU is OOS so return"); -goto done; - } - - /* check if any more active SIs can be assigned to this SU */ - l_flag = false; - - /* choose and assign SIs in the SG that dont have active assignment */ - for (const auto _si : sg->list_of_si) { -if ((i_si->saAmfSIAdminState == SA_AMF_ADMIN_UNLOCKED) && -(i_si->list_of_csi != nullptr) && -(i_si->si_dep_state != AVD_SI_SPONSOR_UNASSIGNED) && -(i_si->si_dep_state != AVD_SI_UNASSIGNING_DUE_TO_DEP) && -(i_si->si_dep_state != AVD_SI_READY_TO_UNASSIGN) && -(i_si->list_of_sisu == AVD_SU_SI_REL_NULL) && -(a_su->saAmfSUNumCurrActiveSIs < sg->saAmfSGMaxActiveSIsperSU)) { - /* found a SI that needs active assignment. 
*/ - if (avd_new_assgn_susi(cb, a_su, i_si, SA_AMF_HA_ACTIVE, false, - _susi) == NCSCC_RC_SUCCESS) { -l_flag = true; - } else { -LOG_ER("%s:%u: %s", __FILE__, __LINE__, i_si->name.c_str()); - } +if (a_su->saAmfSuReadinessState != SA_AMF_READINESS_IN_SERVICE) { + TRACE("The current active SU is OOS so return"); + goto done; } +l_flag = avd_sg_2n_assign_si(cb, sg, a_su); } /* if any assignments have been done return the SU */ ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfd: include failover info in coldsync [#3060]
Hi, ack (code review only) Thanks Minh On 19/7/19 4:47 pm, Gary Lee wrote: Failover information is not currently included in coldsync. This means if a delayed failover is in progress *before* a standby controller is available, *and* a controller failover occurs, then information about the delayed failover is lost. --- src/amf/amfd/chkop.cc | 4 ++ src/amf/amfd/ckpt.h| 4 +- src/amf/amfd/ckpt_dec.cc | 77 -- src/amf/amfd/ckpt_edu.cc | 2 + src/amf/amfd/ckpt_enc.cc | 5 ++- src/amf/amfd/node.h| 3 ++ src/amf/amfd/node_state_machine.cc | 2 + src/amf/amfd/util.cc | 1 + 8 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/amf/amfd/chkop.cc b/src/amf/amfd/chkop.cc index e9a68f4..56b0142 100644 --- a/src/amf/amfd/chkop.cc +++ b/src/amf/amfd/chkop.cc @@ -1051,6 +1051,10 @@ uint32_t avsv_send_ckpt_data(AVD_CL_CB *cb, uint32_t action, avd_cb->avd_peer_ver); return NCSCC_RC_SUCCESS; } + if (avd_cb->avd_peer_ver >= AVD_MBCSV_SUB_PART_VERSION_10) { +cb->async_updt_cnt.failover_updt++; + } + break; default: return NCSCC_RC_SUCCESS; diff --git a/src/amf/amfd/ckpt.h b/src/amf/amfd/ckpt.h index 875776a..2e15387 100644 --- a/src/amf/amfd/ckpt.h +++ b/src/amf/amfd/ckpt.h @@ -35,9 +35,10 @@ #define AMF_AMFD_CKPT_H_ // current version -#define AVD_MBCSV_SUB_PART_VERSION 9 +#define AVD_MBCSV_SUB_PART_VERSION 10 // supported versions +#define AVD_MBCSV_SUB_PART_VERSION_10 10 #define AVD_MBCSV_SUB_PART_VERSION_9 9 #define AVD_MBCSV_SUB_PART_VERSION_8 8 #define AVD_MBCSV_SUB_PART_VERSION_7 7 @@ -109,6 +110,7 @@ typedef struct avsv_async_updt_cnt { uint32_t compcstype_updt; uint32_t si_trans_updt; uint32_t ng_updt; + uint32_t failover_updt; } AVSV_ASYNC_UPDT_CNT; /* diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index a46f6d3..6288b4f 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -178,6 +178,31 @@ const AVSV_DECODE_COLD_SYNC_RSP_DATA_FUNC_PTR dec_cs_data_func_list[] = { dec_cs_comp_config, dec_cs_comp_cs_type_config, dec_cs_siass, 
dec_cs_si_trans,dec_cs_async_updt_cnt}; +void set_node_failover_state(AVD_CL_CB *cb, const SaClmNodeIdT node_id, +const uint32_t state) { + TRACE_ENTER(); + + if (state == NodeState::NodeStates::kUndefined) { +// not in failover list +return; + } + + auto failed_node = cb->failover_list.find(node_id); + if (failed_node != cb->failover_list.end()) { +failed_node->second->SetState(state); + } else { +LOG_NO("Node '%u' not found in failover_list. Create new entry", +node_id); +auto new_node = std::make_shared(cb, node_id); +// node must be added to failover_list before SetState() is called. +// If the state is 'end', then it will be deleted by SetState(). +// Otherwise, we will leave a node in 'End' state mistakenly in +// failover_list. +cb->failover_list[node_id] = new_node; +new_node->SetState(state); + } +} + void decode_cb(NCS_UBAID *ub, AVD_CL_CB *cb, const uint16_t peer_version) { osaf_decode_uint32(ub, reinterpret_cast(>init_state)); osaf_decode_satimet(ub, >cluster_init_time); @@ -254,6 +279,9 @@ void decode_node_config(NCS_UBAID *ub, AVD_AVND *avnd, osaf_decode_uint32(ub, >rcv_msg_id); osaf_decode_uint32(ub, >snd_msg_id); osaf_extended_name_free(_name); + if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { +osaf_decode_uint32(ub, >failover_state); + } TRACE_LEAVE(); } @@ -585,7 +613,7 @@ void decode_siass(NCS_UBAID *ub, AVSV_SU_SI_REL_CKPT_MSG *su_si_ckpt, su_si_ckpt->csi_add_rem = static_cast(csi_add_rem); osaf_decode_sanamet(ub, _si_ckpt->comp_name); osaf_decode_sanamet(ub, _si_ckpt->csi_name); - }; + } } /\ @@ -2199,6 +2227,7 @@ static uint32_t dec_cs_node_config(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec, for (count = 0; count < num_of_obj; count++) { decode_node_config(>i_uba, , dec->i_peer_version); status = avd_ckpt_node(cb, , dec->i_action); +set_node_failover_state(cb, avnd.node_info.nodeId, avnd.failover_state); osafassert(status == NCSCC_RC_SUCCESS); } @@ -2552,14 +2581,23 @@ static uint32_t dec_cs_async_updt_cnt(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec, 
/* * Decode and send async update counts for all the data structures. */ - if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_7) { + if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) { TRACE( -"Peer AMFD version is >= AVD_MBCSV_SUB_PART_VERSION_7," +"Peer AMFD version is >= AVD_MBCSV_SUB_PART_VERSION_10," "peer ver:%d", avd_cb->avd_peer_ver); status = m_NCS_EDU_VER_EXEC(>edu_hdl,
Re: [devel] [PATCH 1/1] amfd: ignore amfnd down event if node state is absent [#3015]
Hi Thang, Ack from me. Thanks Minh On 11/6/19 4:11 pm, Minh Hon Chau wrote: Hi Thang, I can see it's a race in main thread that how amfd processes the mds down and clm callback. Node is going down <143>1 2019-06-11T15:16:42.157517+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38507"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt <143>1 2019-06-11T15:16:42.157526+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38508"] 275:amf/amfd/mds.cc:459 TR avnd 2030f00bd down <143>1 2019-06-11T15:16:42.157535+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38509"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt <143>1 2019-06-11T15:16:48.332481+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38623"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1' <143>1 2019-06-11T15:16:48.33249+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38624"] 272:amf/amfd/clm.cc:242 TR numberOfMembers:'4', numberOfItems:'1' <143>1 2019-06-11T15:16:48.3325+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38625"] 272:amf/amfd/clm.cc:248 TR i = 0, node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:3 <143>1 2019-06-11T15:16:48.33251+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38626"] 272:amf/amfd/clm.cc:332 TR Node Left: rootCauseEntity safNode=PL-3,safCluster=myClmCluster for node 131855 <143>1 2019-06-11T15:16:48.332519+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38627"] 272:amf/amfd/clm.cc:188 >> clm_node_exit_complete: 2030f <143>1 2019-06-11T15:16:48.332534+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38628"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster' <143>1 2019-06-11T15:16:48.332545+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38629"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent Node is coming up again <143>1 2019-06-11T15:16:48.34867+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39826"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1' <143>1 2019-06-11T15:16:48.348674+10:00 SC-1 
osafamfd 272 osafamfd [meta sequenceId="39827"] 272:amf/amfd/clm.cc:242 TR numberOfMembers:'5', numberOfItems:'1' <143>1 2019-06-11T15:16:48.348678+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39828"] 272:amf/amfd/clm.cc:248 TR i = 0, node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:2 <143>1 2019-06-11T15:16:48.348685+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39829"] 272:amf/amfd/node.cc:53 TR added node 131855 <143>1 2019-06-11T15:16:48.348689+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39830"] 272:amf/amfd/clm.cc:417 TR Node Joined 'safNode=PL-3,safCluster=myClmCluster' '36' Now amfd processes the mds down in main thread, its a race here then the @node_info.member set to FALSE <143>1 2019-06-11T15:16:48.351948+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39971"] 272:amf/amfd/ndfsm.cc:779 >> avd_mds_avnd_down_evh: 2030f, 0x558e549a1650 <143>1 2019-06-11T15:16:48.351954+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39972"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster' <143>1 2019-06-11T15:16:48.351959+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39973"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent Now the mds up comes, node_up come, but the node is not a clm member <143>1 2019-06-11T15:16:48.701771+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40552"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt <143>1 2019-06-11T15:16:48.701791+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40553"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt <143>1 2019-06-11T15:16:48.706254+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40560"] 272:amf/amfd/ndfsm.cc:743 >> avd_mds_avnd_up_evh <143>1 2019-06-11T15:16:48.706271+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40561"] 275:amf/amfd/ndmsg.cc:389 << avd_n2d_msg_rcv <143>1 2019-06-11T15:16:48.706288+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40562"] 272:amf/amfd/ndfsm.cc:757 TR amfnd on 2030f is up <143>1 
2019-06-11T15:16:48.706298+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40563"] 272:amf/amfd/ndfsm.cc:0 << avd_mds_avnd_up_evh <143>1 2019-06-11T15:16:48.707145+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40596"] 272:amf/amfd/ndfsm.cc:275 >> avd_node_up_evh: from 2030f, safAmfNode=PL-3,safAmfCluster=myAmfCluster <143>1 2019-06-11T15:16:48.707153+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40597"] 272:amf/am
Re: [devel] [PATCH 1/1] amf: remove SUSI assignment of dependent SI during failover [#3049]
Hi Thang, ack (review + test). In below syslog, I got the assignment of sponsor + dependent on the locked SC removed, and the other SC creates new active assignments. Minor comment: In sg_2n_fsm:node_fail_su_oper(), starting from line 3153, the codes are now most likely the same for both standby and active :) 3152: } else { 3154: /* the SU is not the same as the SU in the list */ 3153: if (avd_su_state_determine(su) == SA_AMF_HA_STANDBY) { *// same as the below active* } /* if(avd_su_state_determine(su) == SA_AMF_HA_STANDBY) */ else if (avd_su_state_determine(su) == SA_AMF_HA_ACTIVE) { } Thanks Minh --syslog-- 2019-06-20 19:01:44.975 SC-1 osafamfnd[331]: NO Assigning 'safSi=ma_si,safApp=ma_app' QUIESCED to 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app' 2019-06-20 19:01:44.976 SC-1 amf_demo[533]: CSI Set - HAState Quiesced for all assigned CSIs 2019-06-20 19:01:44.977 SC-1 osafamfnd[331]: NO Assigning 'safSi=ma_si_new,safApp=ma_app_new' QUIESCED to 'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new' 2019-06-20 19:01:44.977 SC-1 amf_demo_ori[599]: CSI Set - HAState Quiesced for all assigned CSIs 2019-06-20 19:01:44.977 SC-1 osafamfnd[331]: NO Assigned 'safSi=ma_si_new,safApp=ma_app_new' QUIESCED to 'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new' 2019-06-20 19:01:51.978 SC-1 osafamfnd[331]: NO Assigned 'safSi=ma_si,safApp=ma_app' QUIESCED to 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app' 2019-06-20 19:01:52.895 SC-1 osafdtmd[169]: NO Lost contact with 'SC-2' 2019-06-20 19:01:52.903 SC-1 osafclmd[306]: NO Node 131599 went down. 
Not sending track callback for agents on that node 2019-06-20 19:01:52.903 SC-1 osafamfd[316]: NO Node 'SC-2' left the cluster 2019-06-20 19:01:52.937 SC-1 osaffmd[201]: NO Current role: ACTIVE 2019-06-20 19:01:52.938 SC-1 osaffmd[201]: Rebooting OpenSAF NodeId = 131599 EE Name = , Reason: Received Node Down for peer controller, OwnNodeId = 131343, SupervisionTime = 60 2019-06-20 19:01:52.957 SC-1 opensaf_reboot: Rebooting remote node in the absence of PLM is outside the scope of OpenSAF 2019-06-20 19:01:52.958 SC-1 osafamfnd[331]: NO Removing 'safSi=ma_si,safApp=ma_app' from 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app' 2019-06-20 19:01:52.958 SC-1 amf_demo[533]: CSI Remove for all CSIs 2019-06-20 19:01:52.959 SC-1 amf_demo[533]: state: 3, mode: 1, code: 1 2019-06-20 19:01:52.959 SC-1 osafamfnd[331]: NO Removing 'safSi=ma_si_new,safApp=ma_app_new' from 'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new' 2019-06-20 19:01:52.959 SC-1 amf_demo_ori[599]: CSI Remove for all CSIs 2019-06-20 19:01:52.959 SC-1 osafamfnd[331]: NO Removed 'safSi=ma_si,safApp=ma_app' from 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app' 2019-06-20 19:01:52.960 SC-1 osafamfnd[331]: NO Removed 'safSi=ma_si_new,safApp=ma_app_new' from 'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new' 2019-06-20 19:01:52.962 SC-1 osafamfd[316]: NO Assigning due to dep 'safSi=ma_si,safApp=ma_app' 2019-06-20 19:01:52.964 SC-1 osafamfd[316]: NO Tolerance timer started, sponsor si:'safSi=ma_si,safApp=ma_app', dependent si:safSi=ma_si_new,safApp=ma_app_new 2019-06-20 19:01:54.352 SC-1 osafdtmd[169]: NO Established contact with 'SC-2' 2019-06-20 19:01:56.368 SC-2 osafamfnd[257]: NO Assigning 'safSi=ma_si,safApp=ma_app' ACTIVE to 'safSu=ma_su_2,safSg=ma_sg,safApp=ma_app' 2019-06-20 19:01:56.369 SC-2 amf_demo[444]: CSI Set - add 'safCsi=ma_csi,safSi=ma_si,safApp=ma_app' HAState Active 2019-06-20 19:02:04.372 SC-2 osafamfnd[257]: NO Assigned 'safSi=ma_si,safApp=ma_app' ACTIVE to 'safSu=ma_su_2,safSg=ma_sg,safApp=ma_app' 
2019-06-20 19:02:04.390 SC-2 osafamfnd[257]: NO Assigning 'safSi=ma_si_new,safApp=ma_app_new' ACTIVE to 'safSu=ma_su_4_new,safSg=ma_sg_new,safApp=ma_app_new' 2019-06-20 19:02:04.391 SC-2 amf_demo_ori[440]: CSI Set - add 'safCsi=ma_csi_new,safSi=ma_si_new,safApp=ma_app_new' HAState Active 2019-06-20 19:02:04.391 SC-2 osafamfnd[257]: NO Assigned 'safSi=ma_si_new,safApp=ma_app_new' ACTIVE to 'safSu=ma_su_4_new,safSg=ma_sg_new,safApp=ma_app_new' On 12/6/19 12:01 pm, thang.d.nguyen wrote: When lock node invokes on active assignment. The dependent SI follow with sponsor SI move to QUIESCED. There is a case that the active assignment for sponsor is happening on remain SC node. And that remaining node was down. The remove SISU only happen for sponsor SI. The fix is to remove SUSI of dependent SI. --- src/amf/amfd/sg_2n_fsm.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc index 91ffc63..776696c 100644 --- a/src/amf/amfd/sg_2n_fsm.cc +++ b/src/amf/amfd/sg_2n_fsm.cc @@ -3175,6 +3175,9 @@ void SG_2N::node_fail_su_oper(AVD_SU *su) { } su->sg_of_su->set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } else { +avd_sg_su_si_del_snd(cb, su_oper_list.front()); +su->sg_of_su->set_fsm_state(AVD_SG_FSM_SG_REALIGN); }
Re: [devel] [PATCH 1/1] amf: fix SU get stuck in INSTANTIATING presence state [#3047]
Hi Thuan, ack with minor comment. Thanks Minh On 3/6/19 5:10 pm, thuan.tran wrote: COMP restart recovery during SU restart recovery can leave the SU stuck in INSTANTIATING without further action, because the COMP instantiated event in RESTARTING does not trigger avnd_su_pres_fsm_run(). --- src/amf/amfnd/clc.cc | 4 src/amf/amfnd/susm.cc | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index 675ca49..9b1b3a7 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -926,6 +926,7 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP *comp, AVND_SU_PRES_FSM_EV ev = AVND_SU_PRES_FSM_EV_MAX; AVND_COMP_CSI_REC *csi = 0; bool is_en; + bool pi_comp_recover = false; uint32_t rc = NCSCC_RC_SUCCESS; TRACE_ENTER2("Comp '%s', Prv_state '%s', Final_state '%s'", comp->name.c_str(), presence_state[prv_st], @@ -953,6 +954,8 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP *comp, TRACE_1( "Component restart is through admin opration, admin oper flag reset"); comp->admin_oper = false; + } else if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(comp)) { +pi_comp_recover = true; [M]: It looks doubtful, the check itself only wants to know if the @comp is pi, it does not relate to the first *if* (@admin_oper and @final_st)? 
} if ((SA_AMF_PRESENCE_INSTANTIATED == prv_st) && @@ -1487,6 +1490,7 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP *comp, (SA_AMF_PRESENCE_ORPHANED != prv_st) && ((prv_st == SA_AMF_PRESENCE_INSTANTIATING) || (prv_st == SA_AMF_PRESENCE_TERMINATING) || + (prv_st == SA_AMF_PRESENCE_RESTARTING && pi_comp_recover) || (comp->su->admin_op_Id == SA_AMF_ADMIN_RESTART))) ev = AVND_SU_PRES_FSM_EV_COMP_INSTANTIATED; else if (SA_AMF_PRESENCE_INSTANTIATION_FAILED == final_st) diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc index c023c8d..62e2db9 100644 --- a/src/amf/amfnd/susm.cc +++ b/src/amf/amfnd/susm.cc @@ -2282,7 +2282,9 @@ uint32_t avnd_su_pres_insting_compinst_hdler(AVND_CB *cb, AVND_SU *su, curr_comp; curr_comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET( m_NCS_DBLIST_FIND_NEXT(_comp->su_dll_node))) { /* instantiate the pi comp */ -if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(curr_comp)) { +if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(curr_comp) && + (!m_AVND_COMP_IS_FAILED(curr_comp) || +curr_comp->pres != SA_AMF_PRESENCE_RESTARTING)) { TRACE("Running the component clc FSM"); rc = avnd_comp_clc_fsm_run(cb, curr_comp, AVND_COMP_CLC_PRES_FSM_EV_INST); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfd: ignore amfnd down event if node state is absent [#3015]
Hi Thang, I can see it's a race in main thread that how amfd processes the mds down and clm callback. Node is going down <143>1 2019-06-11T15:16:42.157517+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38507"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt <143>1 2019-06-11T15:16:42.157526+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38508"] 275:amf/amfd/mds.cc:459 TR avnd 2030f00bd down <143>1 2019-06-11T15:16:42.157535+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38509"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt <143>1 2019-06-11T15:16:48.332481+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38623"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1' <143>1 2019-06-11T15:16:48.33249+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38624"] 272:amf/amfd/clm.cc:242 TR numberOfMembers:'4', numberOfItems:'1' <143>1 2019-06-11T15:16:48.3325+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38625"] 272:amf/amfd/clm.cc:248 TR i = 0, node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:3 <143>1 2019-06-11T15:16:48.33251+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38626"] 272:amf/amfd/clm.cc:332 TR Node Left: rootCauseEntity safNode=PL-3,safCluster=myClmCluster for node 131855 <143>1 2019-06-11T15:16:48.332519+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38627"] 272:amf/amfd/clm.cc:188 >> clm_node_exit_complete: 2030f <143>1 2019-06-11T15:16:48.332534+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38628"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster' <143>1 2019-06-11T15:16:48.332545+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="38629"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent Node is coming up again <143>1 2019-06-11T15:16:48.34867+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39826"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1' <143>1 2019-06-11T15:16:48.348674+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39827"] 272:amf/amfd/clm.cc:242 TR 
numberOfMembers:'5', numberOfItems:'1' <143>1 2019-06-11T15:16:48.348678+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39828"] 272:amf/amfd/clm.cc:248 TR i = 0, node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:2 <143>1 2019-06-11T15:16:48.348685+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39829"] 272:amf/amfd/node.cc:53 TR added node 131855 <143>1 2019-06-11T15:16:48.348689+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39830"] 272:amf/amfd/clm.cc:417 TR Node Joined 'safNode=PL-3,safCluster=myClmCluster' '36' Now amfd processes the mds down in main thread, its a race here then the @node_info.member set to FALSE <143>1 2019-06-11T15:16:48.351948+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39971"] 272:amf/amfd/ndfsm.cc:779 >> avd_mds_avnd_down_evh: 2030f, 0x558e549a1650 <143>1 2019-06-11T15:16:48.351954+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39972"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster' <143>1 2019-06-11T15:16:48.351959+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="39973"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent Now the mds up comes, node_up come, but the node is not a clm member <143>1 2019-06-11T15:16:48.701771+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40552"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt <143>1 2019-06-11T15:16:48.701791+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40553"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt <143>1 2019-06-11T15:16:48.706254+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40560"] 272:amf/amfd/ndfsm.cc:743 >> avd_mds_avnd_up_evh <143>1 2019-06-11T15:16:48.706271+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40561"] 275:amf/amfd/ndmsg.cc:389 << avd_n2d_msg_rcv <143>1 2019-06-11T15:16:48.706288+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40562"] 272:amf/amfd/ndfsm.cc:757 TR amfnd on 2030f is up <143>1 2019-06-11T15:16:48.706298+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40563"] 
272:amf/amfd/ndfsm.cc:0 << avd_mds_avnd_up_evh <143>1 2019-06-11T15:16:48.707145+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40596"] 272:amf/amfd/ndfsm.cc:275 >> avd_node_up_evh: from 2030f, safAmfNode=PL-3,safAmfCluster=myAmfCluster <143>1 2019-06-11T15:16:48.707153+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40597"] 272:amf/amfd/ndfsm.cc:292 TR leds_set 0 <143>1 2019-06-11T15:16:48.70716+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40598"] 272:amf/amfd/ndfsm.cc:308 TR node_id '2030f' not in failover_list. <141>1 2019-06-11T15:16:48.707185+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40599"] 272:amf/amfd/ndfsm.cc:232 NO Received node_up from 2030f: msg_id 1 <140>1 2019-06-11T15:16:48.7072+10:00 SC-1 osafamfd 272 osafamfd [meta sequenceId="40600"] 272:amf/amfd/ndfsm.cc:387 WA Not a Cluster Member dropping the msg <143>1 2019-06-11T15:16:48.707206+10:00 SC-1 osafamfd 272 osafamfd [meta
Re: [devel] [PATCH 1/1] amfd: do not queue sync messages from 'lost' nodes [#3050]
Hi Gary, Those variables e.g node_sync_window_closed have been used before headless sync complete. If there is a failover during the headless sync, the new active will start the headless sync again, so those variables have not been needed to checkpoint. But here the scenario happens in split brain, in which the new active is in separated network instead of coming from headless, so I guess we do need checkpoint it, but the checkpoint should be done after the headless sync ? And the change in timer.h seems not much relates to this ticket? Thanks Minh On 5/6/19 2:03 pm, Gary Lee wrote: The 'lost' nodes will be rebooted, thus there is no need to queue sync messages from these nodes. In addition, node_sync_window_closed is not reliable as it's not check pointed. We should remove all uses of it in another ticket? Instead, check if the timer is running. --- src/amf/amfd/cb.h | 2 ++ src/amf/amfd/ndproc.cc | 30 ++ src/amf/amfd/timer.h | 12 ++-- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/amf/amfd/cb.h b/src/amf/amfd/cb.h index 89cf15d..8902d78 100644 --- a/src/amf/amfd/cb.h +++ b/src/amf/amfd/cb.h @@ -237,6 +237,8 @@ typedef struct cl_cb_tag { */ bool active_services_exist; bool all_nodes_synced; + // @todo this should be checkpointed to standby? otherwise + // after a controller failover, it will still be false? 
bool node_sync_window_closed; /* diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index 5f5cbcd..20008d9 100644 --- a/src/amf/amfd/ndproc.cc +++ b/src/amf/amfd/ndproc.cc @@ -345,19 +345,26 @@ void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, AVD_EVT *evt) { evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id, evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.msg_id); - if (cb->node_sync_window_closed == false) { + const SaClmNodeIdT node_id = +evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id; + + if (cb->failover_list.find(node_id) != cb->failover_list.end()) { +// ignore msg +LOG_WA("sisu_state_info messages received from lost node (%x)", + node_id); + } else if (cb->node_sync_tmr.is_active == true) { AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE(); state_info_evt->evt = new AVD_EVT{}; state_info_evt->evt->rcv_evt = evt->rcv_evt; state_info_evt->evt->info.avnd_msg = n2d_msg; cb->evt_queue.push(state_info_evt); +return; } else { LOG_WA( -"Ignore this sisu_state_info message since node sync window has closed"); -avsv_dnd_msg_free(n2d_msg); + "Ignore this sisu_state_info message since node sync window has closed"); } - TRACE_LEAVE(); + avsv_dnd_msg_free(n2d_msg); } /* @@ -387,19 +394,26 @@ void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, AVD_EVT *evt) { evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id, evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.msg_id); - if (cb->node_sync_window_closed == false) { + const SaClmNodeIdT node_id = +evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id; + + if (cb->failover_list.find(node_id) != cb->failover_list.end()) { +// ignore msg +LOG_WA("compcsi_state_info messages received from lost node (%x)", + node_id); + } else if (cb->node_sync_tmr.is_active == true) { AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE(); state_info_evt->evt = new AVD_EVT{}; state_info_evt->evt->rcv_evt = evt->rcv_evt; state_info_evt->evt->info.avnd_msg = n2d_msg; 
cb->evt_queue.push(state_info_evt); +return; } else { LOG_WA( -"Ignore this compcsi_state_info message since node sync window has closed"); -avsv_dnd_msg_free(n2d_msg); + "Ignore this compcsi_state_info message since node sync window has closed"); } - TRACE_LEAVE(); + avsv_dnd_msg_free(n2d_msg); } /** diff --git a/src/amf/amfd/timer.h b/src/amf/amfd/timer.h index 5316879..6db04c7 100644 --- a/src/amf/amfd/timer.h +++ b/src/amf/amfd/timer.h @@ -52,12 +52,12 @@ typedef enum avd_tmr_type { /* AVD Timer definition */ typedef struct avd_tmr_tag { - tmr_t tmr_id; - AVD_TMR_TYPE type; - SaClmNodeIdT node_id; - std::string spons_si_name; - std::string dep_si_name; - bool is_active; + tmr_t tmr_id{}; + AVD_TMR_TYPE type{AVD_TMR_MAX}; + SaClmNodeIdT node_id{}; + std::string spons_si_name{}; + std::string dep_si_name{}; + bool is_active{}; } AVD_TMR; /* macro to start the cluster init timer. The cb structure ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 0/1] Review Request for mds: support multicast fragmented messages [#3033] V3
Hi, ack from me (code review) Thanks Minh On 25/4/19 9:33 pm, Vu Minh Nguyen wrote: Hi Hans, Probably you were looking at code that included this Thuan's patch. In legacy code, only mdtm_sendto() is called inside the function mdtm_frag_and_send(). Regards, Vu -Original Message- From: Hans Nordebäck Sent: Thursday, April 25, 2019 6:10 PM To: Vu Minh Nguyen ; Thuan Tran ; Minh Hon Chau Cc: opensaf-devel@lists.sourceforge.net Subject: RE: [PATCH 0/1] Review Request for mds: support multicast fragmented messages [#3033] V3 Hi Vu, It seems mdtm_mcast_sendto is used in mdtm_frag_and_send, at MDS_SENDTYPE_BCAST/BR Hans -Original Message- From: Vu Minh Nguyen Sent: den 25 april 2019 12:20 To: Hans Nordebäck ; Thuan Tran ; Minh Hon Chau Cc: opensaf-devel@lists.sourceforge.net Subject: RE: [PATCH 0/1] Review Request for mds: support multicast fragmented messages [#3033] V3 Hi Hans, See my responses inline. Regards, Vu -Original Message- From: Hans Nordebäck Sent: Thursday, April 25, 2019 4:28 PM To: Thuan Tran ; Vu Minh Nguyen ; Minh Hon Chau Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 0/1] Review Request for mds: support multicast fragmented messages [#3033] V3 Hi Vu and Thuan, a few question, is the text in the ticket description correct? E.g it says unicast is used if a multicast message is fragmented, (I think multicast still is used to send the fragments), this is what you mean with 2 different channels? (only one socket is used, BSRsock), [Vu] Yes. Unicast is used to send fragmented messages. 
Here is the current logic in case of sending a large package: Iterate over destinations { // mcm_pvt_process_svc_bcast_common() @ mds_c_sndrcv.c 1) Fragment the package // mdtm_frag_and_send() @ mds_dt_tipc.c 2) Unicast to a specific adest // mdtm_sendto() @ mds_dt_tipc.c 4) Continue with next adest } The problem stated is sending one large multicast message and then several smaller multicast messages, have you checked the fragment re-assembly part of the common code? [Vu] Yes. At the receive side, if msg is fragmented, mds will not forward to upper layer until all fragmented msgs are collected. If the message is not fragmented, mds will transfer the msg to upper right away. I checked with TIPC guys here, and he said that TIPC does not guarantee the order if we send msgs in different channels (unicast vs mcast). /BR Hans On 2019-04-24 13:06, thuan.tran wrote: Summary: mds: support multicast fragmented messages [#3033] Review request for Ticket(s): 3033 Peer Reviewer(s): Hans, Minh, Vu Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3033 Base revision: 7916ac316e86478c621c8359cf2aca4886288a38 Personal repository: git://git.code.sf.net/u/thuantr/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesy OpenSAF servicesn Core libraries n Samples n Tests n Other n NOTE: Patch(es) contain lines longer than 80 characers Comments (indicate scope for each "y" above): - N/A revision 568f09774f936506f5e05e03813fa572af0fe0d3 Author: thuan.tran Date: Wed, 24 Apr 2019 17:54:25 +0700 mds: support multicast fragmented messages [#3033] - Sender may send broadcast big messages (> 65K) then small messages (< 65K). Current MDS just loop via all destinations to unicast all fragmented messages to one by one destinations. But sending multicast non-fragment messages to all destinations. 
Therefor, receivers may get messages with incorrect order, non-fragment messages may come before fragmented messages. For example, it may lead to OUT OF ORDER for IMMNDs during IMMD sync. - Solution: support send multicast each fragmented messages to avoid disorder of arrived broadcast messages. Complete diffstat: -- src/mds/mds_c_sndrcv.c | 3 +- src/mds/mds_dt_tipc.c | 104 +++- - 2 files changed, 40 insertions(+), 67 deletions(-) Testing Commands: - N/A Testing, Expected Results: -- N/A Conditions of Submission: - N/A Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally in
Re: [devel] [PATCH 0/1] Review Request for amfd: increase mds priority of amfnd down event [#3015]
Hi Thang, + Hans If the issue is reproducible, can you upload the full log/trace to ticket please? Thanks Minh On 27/2/19 10:17 am, thang.d.nguyen wrote: Summary: amfd: increase mds priority of amfnd down event [#3015] Review request for Ticket(s): 3015 Peer Reviewer(s): Gary, Minh Pull request to: Minh Affected branch(es): develop Development branch: ticket-3015 Base revision: 1f9cf4636b07d28a906f62b44144c337c5280f1a Personal repository: git://git.code.sf.net/u/thangng/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesy OpenSAF servicesn Core libraries n Samples n Tests n Other n Comments (indicate scope for each "y" above): - revision e81b6874f37e9761594f7ee3328486062fcbddb3 Author: thang.d.nguyen Date: Wed, 27 Feb 2019 05:50:11 +0700 amfd: increase mds priority of amfnd down event [#3015] To avoid the issue a node can not join the cluster when the PBE hung. Complete diffstat: -- src/amf/amfd/mds.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Testing Commands: - N/A. Testing, Expected Results: -- N/A. Conditions of Submission: - Acked from reviwer. Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. 
internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content into a single commit. ___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as threaded commits, or place in a public tree for a pull. ___ You have resent this content multiple times without a clear indication of what has changed between each re-send. ___ You have failed to adequately and individually address all of the comments and change requests that were proposed in the initial review. ___ You have a misconfigured ~/.gitconfig file (i.e. user.name, user.email etc) ___ Your computer have a badly configured date and time; confusing the the threaded patch review. ___ Your changes affect IPC mechanism, and you don't present any results for in-service upgradability test. ___ Your changes affect user manual and documentation, your patch series do not contain the patch that updates the Doxygen manual. ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amf: fix Comp stuck in RESTARTING presence state [#3011]
Hi Thuan, ack (review + test). Thanks Minh : During SU (many COMPs) restart recovery, if any COMP finishes instantiating and then crashes while other COMPs are still instantiating, AMF recovers it by restarting, but AMF only cleans it up without re-instantiating it because AMF sees that the COMP is not eligible for instantiation. The failed COMP is then stuck in RESTARTING without further action from AMF. AMF should allow COMP re-instantiation if the SU state is INSTANTIATING and the failed COMP state is RESTARTING. --- src/amf/amfnd/clc.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc index 463c5de..7a62a56 100644 --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -1805,7 +1805,8 @@ static bool is_failed_comp_eligible_for_instantiation(AVND_COMP *comp) { /*During surestart recovery, after cleanup of all components, amfnd starts instantiation of components. A component may fault at this stage. Such a component is eligible for instantiation.*/ - if ((comp->pres == SA_AMF_PRESENCE_INSTANTIATING) && + if (((comp->pres == SA_AMF_PRESENCE_RESTARTING) || + (comp->pres == SA_AMF_PRESENCE_INSTANTIATING)) && (comp->su->pres == SA_AMF_PRESENCE_INSTANTIATING)) return true; -- 2.7.4 ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] clm: Incorrect encode/decode time_super [#3007]
Hi aThanh, ack for code review. Thanks Minh On 20/2/19 4:19 pm, Thanh Nguyen wrote: Change the encoding of time_super to use 64 bits instead of 32 bits. --- src/clm/clmd/clms_mds.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clm/clmd/clms_mds.cc b/src/clm/clmd/clms_mds.cc index 833d18c..5a77885 100644 --- a/src/clm/clmd/clms_mds.cc +++ b/src/clm/clmd/clms_mds.cc @@ -542,7 +542,7 @@ static uint32_t clms_enc_track_cbk_msg(NCS_UBAID *uba, CLMSV_MSG *msg) { TRACE("p8 nullptr!!!"); return 0; } - ncs_encode_32bit(&p8, track->time_super); + ncs_encode_64bit(&p8, track->time_super); ncs_enc_claim_space(uba, 8); total_bytes += 8; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] Opensaf-devel Digest, Vol 69, Issue 11
Hi Shiva, It looks like your application is quite similar to the AMF sample app under samples/amf/sa_aware; you can try the sample app and then replace it with your confD. Thanks, Minh On 12/2/19 11:09 pm, opensaf-devel-requ...@lists.sourceforge.net wrote: Send Opensaf-devel mailing list submissions to opensaf-devel@lists.sourceforge.net To subscribe or unsubscribe via the World Wide Web, visit https://lists.sourceforge.net/lists/listinfo/opensaf-devel or, via email, send a message with subject or body 'help' to opensaf-devel-requ...@lists.sourceforge.net You can reach the person managing the list at opensaf-devel-ow...@lists.sourceforge.net When replying, please edit your Subject line so it is more specific than "Re: Contents of Opensaf-devel digest..." Today's Topics: 1. ConfD Integration with opensaf (shiva) -- Message: 1 Date: Tue, 12 Feb 2019 14:40:26 +0530 From: shiva To: opensaf-devel@lists.sourceforge.net Subject: [devel] ConfD Integration with opensaf Message-ID: Content-Type: text/plain; charset=utf-8; format=flowed Hello all, I want to integrate confD with opensaf. Is there any document or example code that explains about the integration process? I want to configure opensaf to handle 2N nodes (master/slave) in confD. My requirement is that when the master node goes down the slave node should automatically take over and become the master. Thanks in advance. Regards. -- -- Subject: Digest Footer ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel -- End of Opensaf-devel Digest, Vol 69, Issue 11 * ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] clmd: not send sync respond to client if node down [#3004]
Hi Thang, The patch looks ok, but I'm thinking of not introducing mds_node_down_list. In SAI-AIS-CLM-B.04.01: "The term unconfigured node is used in this document to designate an execution environment that is not configured to host a CLM node." May we add a check if a node is unconfigured because it's not in ee_lookup, to distinguish with if a node is down? Thanks Minh On 1/2/19 2:34 pm, Tran Thuan wrote: Hi Thang, ACK from me for code review, not tested. Best Regards, ThuanTr -Original Message- From: thang.d.nguyen Sent: Wednesday, January 30, 2019 1:20 AM To: gary@dektech.com.au; minh.c...@dektech.com.au; thuan.t...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; thang.d.nguyen Subject: [PATCH 1/1] clmd: not send sync respond to client if node down [#3004] Clmd will not send sync respond to client if the node that client resided on down. This will avoid timeout when clmd send via mds. --- src/clm/clmd/clms_cb.h | 3 +++ src/clm/clmd/clms_evt.cc | 22 +- src/clm/clmd/clms_mds.cc | 2 +- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/clm/clmd/clms_cb.h b/src/clm/clmd/clms_cb.h index 4d7fdc7..637d53a 100644 --- a/src/clm/clmd/clms_cb.h +++ b/src/clm/clmd/clms_cb.h @@ -22,6 +22,7 @@ #include "osaf/config.h" #endif #include +#include #include #include #include @@ -238,6 +239,8 @@ typedef struct clms_cb_t { *node_down_list_head; /*NODE_DOWN record - Fix when active node goes down */ NODE_DOWN_LIST *node_down_list_tail; + // Record node id when receive MDS node down std::set + mds_node_down_list; bool is_impl_set; bool nid_started; /**< true if started by NID */ NCS_PATRICIA_TREE iplist; /* To temporarily store ipaddress information diff --git a/src/clm/clmd/clms_evt.cc b/src/clm/clmd/clms_evt.cc index c2b83c2..5265002 100644 --- a/src/clm/clmd/clms_evt.cc +++ b/src/clm/clmd/clms_evt.cc @@ -943,6 +943,8 @@ static uint32_t proc_mds_node_evt(CLMSV_CLMS_EVT *evt) { goto done; } + clms_cb->mds_node_down_list.insert(node_id); + if ((clms_cb->ha_state 
== SA_AMF_HA_ACTIVE) || (clms_cb->ha_state == SA_AMF_HA_QUIESCED)) { clms_track_send_node_down(node); @@ -1531,19 +1533,24 @@ static uint32_t proc_initialize_msg(CLMS_CB *cb, CLMSV_CLMS_EVT *evt) { TRACE_ENTER2("dest %" PRIx64, evt->fr_dest); - /*Handle the wrap around */ - if (clms_cb->last_client_id == INT_MAX) clms_cb->last_client_id = 0; - - clms_cb->last_client_id++; - node = clms_node_get_by_id(node_id); TRACE("Node id = %d", node_id); if (node == nullptr) { LOG_IN("Initialize request of client on an unconfigured node: node_id = %d", node_id); ais_rc = SA_AIS_ERR_UNAVAILABLE; +std::set::iterator it = + clms_cb->mds_node_down_list.find(node_id); +if (it != clms_cb->mds_node_down_list.end()) { + return (uint32_t)ais_rc; +} } + /*Handle the wrap around */ + if (clms_cb->last_client_id == INT_MAX) clms_cb->last_client_id = 0; + + clms_cb->last_client_id++; + if ((client = clms_client_new(evt->fr_dest, clms_cb->last_client_id)) == nullptr) { TRACE("Creating a new client failed"); @@ -1564,6 +1571,11 @@ static uint32_t proc_initialize_msg(CLMS_CB *cb, CLMSV_CLMS_EVT *evt) { return rc; } + std::set::iterator it = + clms_cb->mds_node_down_list.find(node_id); + if (it != clms_cb->mds_node_down_list.end()) { +clms_cb->mds_node_down_list.erase(it); + } + if (node) { if (node->member == false) { rc = clms_send_is_member_info(clms_cb, node->node_id, node->member, diff --git a/src/clm/clmd/clms_mds.cc b/src/clm/clmd/clms_mds.cc index 58552cc..833d18c 100644 --- a/src/clm/clmd/clms_mds.cc +++ b/src/clm/clmd/clms_mds.cc @@ -1097,7 +1097,7 @@ static uint32_t clms_mds_node_event(struct ncsmds_callback_info *mds_info) { clmsv_evt->info.node_mds_info.node_id = mds_info->info.node_evt.node_id; clmsv_evt->info.node_mds_info.nodeup = SA_TRUE; -rc = m_NCS_IPC_SEND(_cb->mbx, clmsv_evt, NCS_IPC_PRIORITY_HIGH); +rc = m_NCS_IPC_SEND(_cb->mbx, clmsv_evt, + NCS_IPC_PRIORITY_VERY_HIGH); if (rc != NCSCC_RC_SUCCESS) { TRACE("IPC send failed %d", rc); free(clmsv_evt); -- 2.7.4 ___ 
Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 0/4] Review Request for osaf: allow split brain prevention parameter changes at runtime [#3006]
Hi Gary, ack with comment - There's a daemon_sigterm_install(), I think we could make a daemon_sighup_install() in the /base/daemon.c - I see there's a @todo that stop the split brain prevention when it's running, when it's done we might have to document this runtime change. Thanks Minh On 4/2/19 9:41 pm, Gary Lee wrote: Summary: osaf: add ability to reload config from fmd.conf [#3006] Review request for Ticket(s): 3006 Peer Reviewer(s): Hans, Minh Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-3006 Base revision: e13f0ee64a0255dd54bc70b1f8d601fbb6162428 Personal repository: git://git.code.sf.net/u/userid-2226215/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesy OpenSAF servicesy Core libraries y Samples n Tests n Other n Comments (indicate scope for each "y" above): - revision b5206b54fbc5462eaf6f0599d2c449f22087635d Author: Gary Lee Date: Mon, 4 Feb 2019 21:33:11 +1100 rded: reload split brain prevention parameters on SIGHUP [#3006] If enabled at runtime and this node is active, promote this node in consensus service. If disabled at runtime, watch threads will terminate gracefully when the plugin exits after losing connectivty to the consensus service. revision 0a043e5b320e8c05beccf7b7ac3c9150abdf6cc5 Author: Gary Lee Date: Mon, 4 Feb 2019 20:57:32 +1100 amfd: reload split brain prevention parameters on SIGHUP [#3006] revision fd617aeb6c8f23d8b404a85f6aaa8c6b28ae26a1 Author: Gary Lee Date: Mon, 4 Feb 2019 20:56:10 +1100 fmd: reload split brain prevention parameters on SIGHUP [#3006] revision a3c6f632f2377afc47c0ae04861f9a4a0e06f498 Author: Gary Lee Date: Mon, 4 Feb 2019 20:52:42 +1100 osaf: add ability to reload config from fmd.conf [#3006] Add ReloadConfiguration() function - when called it will read fmd.conf and look for 'export FMS_X=', and overwrite current environment variable settings in the caller. 
This allows split brain prevention parameters to be changed at runtime without a node restart. Complete diffstat: -- src/amf/amfd/cb.h | 1 + src/amf/amfd/main.cc| 32 src/amf/amfd/osaf-amfd.in | 1 + src/fm/fmd/fm_main.cc | 2 ++ src/osaf/consensus/consensus.cc | 67 +++-- src/osaf/consensus/consensus.h | 20 src/osaf/consensus/key_value.cc | 41 + src/rde/rded/osaf-rded.in | 1 + src/rde/rded/rde_main.cc| 59 +--- src/rde/rded/rde_rda.h | 3 ++ src/rde/rded/role.cc| 27 + src/rde/rded/role.h | 1 + 12 files changed, 203 insertions(+), 52 deletions(-) Testing Commands: - 1) Start cluster with FMS_SPLIT_BRAIN_PREVENTION=0 On both active / standby SCs: modify fmd.conf and set FMS_SPLIT_BRAIN_PREVENTION=1 pkill -SIGHUP osafamfd pkill -SIGHUP osaffmd pkill -SIGHUP osafrded Ensure split brain prevention works as expected 2) Leave cluster from Step 1 running On both active / standby SCs: modify fmd.conf and set FMS_SPLIT_BRAIN_PREVENTION=0 pkill -SIGHUP osafamfd pkill -SIGHUP osaffmd pkill -SIGHUP osafrded Ensure split brain prevention is no longer in effect Testing, Expected Results: -- As above Conditions of Submission: - Ack from any reviewer Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. 
internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in
Re: [devel] [PATCH 4/5] amfd: allow node to remain active if peer SC can be seen [#2996]
ack, review only. Thanks/Minh On 21/1/19 2:52 pm, Gary Lee wrote: If relaxed node promotion is enabled, allow a SC to remain active if the peer SC can be seen, even if access to the consensus service is lost. --- src/amf/amfd/ndfsm.cc | 2 +- src/amf/amfd/ndproc.cc | 13 +++-- src/amf/amfd/proc.h| 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc index 4146ddc..8c8f3c5 100644 --- a/src/amf/amfd/ndfsm.cc +++ b/src/amf/amfd/ndfsm.cc @@ -817,7 +817,7 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) { if (cb->node_failover_delay == 0) { avd_node_failover(node); } - check_quorum(); + check_quorum(cb); node->node_info.member = SA_FALSE; // Update standby out of sync if standby sc goes down if (avd_cb->node_id_avd_other == node->node_info.nodeId) { diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index c4eebb1..ec347fc 100644 --- a/src/amf/amfd/ndproc.cc +++ b/src/amf/amfd/ndproc.cc @@ -1245,15 +1245,24 @@ void avd_node_failover(AVD_AVND *node, const bool mw_only) { TRACE_LEAVE(); } -void check_quorum() { +void check_quorum(AVD_CL_CB *cb) { TRACE_ENTER(); Consensus consensus_service; if (consensus_service.IsRemoteFencingEnabled() == false && consensus_service.IsWritable() == false) { +// if relaxed mode is enabled, ignore failure if peer SC is up +if (consensus_service.IsRelaxedNodePromotionEnabled() == true) { + AVD_AVND* peer = avd_node_find_nodeid(cb->node_id_avd_other); + if (peer != nullptr && peer->node_state == AVD_AVND_STATE_PRESENT) { +LOG_NO("Relaxed node promotion is enabled, peer SC is connected"); +return; + } +} + // remote fencing is disabled and we have lost write access // reboot this node to prevent split brain opensaf_reboot(0, nullptr, "Quorum lost. 
Rebooting this node to prevent split-brain"); } -} \ No newline at end of file +} diff --git a/src/amf/amfd/proc.h b/src/amf/amfd/proc.h index a378218..f1dc7ba 100644 --- a/src/amf/amfd/proc.h +++ b/src/amf/amfd/proc.h @@ -96,7 +96,7 @@ void avd_process_hb_event(AVD_CL_CB *cb_now, struct AVD_EVT *evt); extern void avd_node_mark_absent(AVD_AVND *node); extern void avd_tmr_snd_hb_evh(AVD_CL_CB *cb, AVD_EVT *evt); extern void avd_node_failover(AVD_AVND *node, const bool mw_only = false); -extern void check_quorum(); +extern void check_quorum(AVD_CL_CB *cb); extern AVD_SU *get_other_su_from_oper_list(AVD_SU *su); extern void su_complete_admin_op(AVD_SU *su, SaAisErrorT result); extern void comp_complete_admin_op(AVD_COMP *comp, SaAisErrorT result); ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]
ack, review only. Thanks/Minh On 21/1/19 2:52 pm, Gary Lee wrote: Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow active SC to be preferred during a network split. The default behavior is to prefer the larger partition to maintain existing behaviour. Add configuration support for FMS_RELAXED_NODE_PROMOTION. --- src/osaf/consensus/consensus.cc | 39 --- src/osaf/consensus/consensus.h | 9 +++-- src/osaf/consensus/key_value.cc | 8 ++-- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc index 112af7d..5304c4f 100644 --- a/src/osaf/consensus/consensus.cc +++ b/src/osaf/consensus/consensus.cc @@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, cluster_size); if (rc != SA_AIS_OK) { LOG_WA("Takeover request failed (%d)", rc); + rc = SA_AIS_ERR_EXIST; return rc; } take_over_request_created = true; @@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, if (rc == SA_AIS_OK) { LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str()); } else { -LOG_ER("Failed to promote this node (%u)", rc); +LOG_WA("Failed to promote this node (%u)", rc); } return rc; @@ -197,6 +198,10 @@ bool Consensus::IsWritable() const { bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; } +bool Consensus::IsRelaxedNodePromotionEnabled() const { + return relaxed_node_promotion_; +} + std::string Consensus::CurrentActive() const { TRACE_ENTER(); if (use_consensus_ == false) { @@ -228,6 +233,10 @@ Consensus::Consensus() { uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0); std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", ""); uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0); + uint32_t prioritise_partition_size = +base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1); + uint32_t relaxed_node_promotion = 
+base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0); // if not specified in fmd.conf, // takeover requests are valid for 20 seconds @@ -246,6 +255,14 @@ Consensus::Consensus() { use_remote_fencing_ = true; } + if (prioritise_partition_size == 1) { +prioritise_partition_size_ = true; + } + + if (use_consensus_ == true && relaxed_node_promotion == 1) { +relaxed_node_promotion_ = true; + } + // needed for base::Conf::NodeName() later base::Conf::InitNodeName(); } @@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const std::string& current_owner, return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size); } + if (rc != SA_AIS_OK) { + return rc; + } + // wait up to max_takeover_retry seconds for request to be answered retries = 0; while (retries < max_takeover_retry) { @@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest( LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64, proposed_cluster_size, cluster_size); + const std::string state_str = +tokens[static_cast(TakeoverElements::STATE)]; + TakeoverState result; - if (proposed_cluster_size > cluster_size) { -result = TakeoverState::ACCEPTED; + if (state_str != +TakeoverStateStr[static_cast(TakeoverState::NEW)]) { +return TakeoverState::UNDEFINED; + } + + if (prioritise_partition_size_ == true) { +if (proposed_cluster_size > cluster_size) { + result = TakeoverState::ACCEPTED; +} else { + result = TakeoverState::REJECTED; +} } else { result = TakeoverState::REJECTED; } diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h index 6421c7c..2fbd3bd 100644 --- a/src/osaf/consensus/consensus.h +++ b/src/osaf/consensus/consensus.h @@ -57,6 +57,9 @@ class Consensus { // Is remote fencing enabled? bool IsRemoteFencingEnabled() const; + // Is relaxed node promotion enabled? 
+ bool IsRelaxedNodePromotionEnabled() const; + Consensus(); virtual ~Consensus(); @@ -66,7 +69,7 @@ class Consensus { UNDEFINED = 0, NEW = 1, ACCEPTED = 2, -REJECTED = 3, +REJECTED = 3 }; enum class TakeoverElements : std::uint8_t { @@ -85,13 +88,15 @@ class Consensus { private: bool use_consensus_ = false; bool use_remote_fencing_ = false; + bool prioritise_partition_size_ = false; + bool relaxed_node_promotion_ = false; uint32_t takeover_valid_time; uint32_t max_takeover_retry; const std::string kTestKeyname = "opensaf_write_test"; const std::chrono::milliseconds kSleepInterval = std::chrono::milliseconds(1000); // in ms static constexpr uint32_t kLockTimeout = 0; // lock is persistent
Re: [devel] [PATCH 0/5] Review Request for rded: add relaxed node promotion feature [#2996]
Hi Gary, I'm trying to understand the patch 3/5 and 4/5, there seems to be logic of *relaxed mode* left in 3/5 and 4/5. Thanks Minh On 21/1/19 2:52 pm, Gary Lee wrote: Summary: rded: add relaxed node promotion feature [#2996] Review request for Ticket(s): 2996 Peer Reviewer(s): Hans, Minh Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-2996 Base revision: 35035599567d1add6975a89f1286f20738d67bf1 Personal repository: git://git.code.sf.net/u/userid-2226215/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesn OpenSAF servicesy Core libraries y Samples n Tests n Other n Comments (indicate scope for each "y" above): - revision 9a681198810be2e2ad3f512ff966fe1d9eceb1ab Author: Gary Lee Date: Mon, 21 Jan 2019 14:35:49 +1100 rded: add relaxed node promotion feature [#2996] Allow promotion of node to active at cluster startup, even if the consensus service is unavailable, if the peer SC can be seen. During normal cluster operation, if the consensus service becomes unavailable but the peer SC can still be seen, allow the existing active SC to remain active. A new NCSMDS_SVC_ID_RDE_DISCOVERY service ID is exported by rded. This is installed as soon as rded is started, unlike NCSMDS_SVC_ID_RDE which is only installed when it becomes a candidate for election. revision d2fad05f5ab3b502403493763f5f2bb31608444f Author: Gary Lee Date: Mon, 21 Jan 2019 14:35:49 +1100 amfd: allow node to remain active is peer SC can be seen [#2996] If relaxed node promotion is enabled, allow a SC to remain active if the peer SC can be seen, even if access to the consensus service is lost. revision 4e1bbbd4997a6ea8307695e81a64dd9c53da15aa Author: Gary Lee Date: Mon, 21 Jan 2019 14:35:42 +1100 osaf: allow active SC to be preferred during network split [#2996] Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow active SC to be preferred during a network split. 
The default behavior is to prefer the larger partition to maintain existing behaviour. Add configuration support for FMS_RELAXED_NODE_PROMOTION. revision 7b50ffd37aafb82e71c726781824f8d6883c5aa5 Author: Gary Lee Date: Mon, 21 Jan 2019 14:27:38 +1100 fmd: add configuration parameters [#2996] Add parameters FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE and FMS_RELAXED_NODE_PROMOTION. revision 1bb52d591e6014e013c8335f7f1a1f516ecc8566 Author: Gary Lee Date: Mon, 21 Jan 2019 14:01:08 +1100 osaf: update etcd3 to poll instead of watch [#2996] The 'watch' command does not return if the etcd server goes down. We need to poll the etcd server to properly check we still have connectivity to the etcd server. Complete diffstat: -- src/amf/amfd/ndfsm.cc | 2 +- src/amf/amfd/ndproc.cc | 13 - src/amf/amfd/proc.h | 2 +- src/fm/fmd/fmd.conf | 17 ++ src/mds/mds_papi.h | 1 + src/osaf/consensus/consensus.cc | 39 - src/osaf/consensus/consensus.h | 9 ++- src/osaf/consensus/key_value.cc | 8 ++- src/osaf/consensus/plugins/etcd3.plugin | 50 + src/rde/rded/rde_cb.h | 12 +++- src/rde/rded/rde_main.cc| 71 +--- src/rde/rded/rde_mds.cc | 94 ++-- src/rde/rded/role.cc| 97 + src/rde/rded/role.h | 4 +- 14 files changed, 375 insertions(+), 44 deletions(-) Testing Commands: - *** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES *** Testing, Expected Results: -- *** PASTE COMMAND OUTPUTS / TEST RESULTS *** Conditions of Submission: - Ack from any reviewer, or in 1 week Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. 
___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible
Re: [devel] Review request ntf: update document for limit of logger buffer [#2994]
Hi Canh, A very minor comment, that we may have to mention it's for alarm notif only. Thanks, Minh On 11/1/19 11:30 pm, Lennart Lund wrote: Hi Canh, I should have Acked the document in my previous answer. Just fix any minor things and push. Thanks Lennart *From:*Canh Van Truong *Sent:* den 11 januari 2019 12:01 *To:* Lennart Lund ; Minh Hon Chau *Cc:* opensaf-devel@lists.sourceforge.net *Subject:* RE: Review request ntf: update document for limit of logger buffer [#2994] Hi Lennart Yes, I add new column for “Default value”. Regards Canh *From:*Lennart Lund *Sent:* Friday, January 11, 2019 5:33 PM *To:* Canh Van Truong ; Minh Hon Chau *Cc:* opensaf-devel@lists.sourceforge.net *Subject:* RE: Review request ntf: update document for limit of logger buffer [#2994] Hi Canh, I have one minor comment in the attached document. Also, it may be better if the table has three columns “Environment Variable, Default value and Comment” Thanks Lennart *From:*Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>> *Sent:* den 11 januari 2019 10:18 *To:* Lennart Lund <mailto:lennart.l...@ericsson.com>>; Minh Hon Chau mailto:minh.c...@dektech.com.au>> *Cc:* opensaf-devel@lists.sourceforge.net <mailto:opensaf-devel@lists.sourceforge.net> *Subject:* RE: Review request ntf: update document for limit of logger buffer [#2994] Thanks Lennart, I have updated with your comments. Please give the comments if there is something need to be updated. 
Regards Canh *From:*Lennart Lund <mailto:lennart.l...@ericsson.com>> *Sent:* Thursday, January 10, 2019 9:07 PM *To:* Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>>; Minh Hon Chau mailto:minh.c...@dektech.com.au>> *Cc:* opensaf-devel@lists.sourceforge.net <mailto:opensaf-devel@lists.sourceforge.net> *Subject:* RE: Review request ntf: update document for limit of logger buffer [#2994] Hi Canh, The following text describing the NTFSV_LOGGER_BUFFER_CAPACITY environment variable should be improved/simplified (hard to understand) Original text from document: “The limit of logger buffer size in NTFD. The logger buffer is used to store the notification if writing notification to log file fail. The limit should be set with relevant value to avoid congestion in NTFD. Because if this value is set too big while writing notification is fail for long time, NTFD has to write a big number of notifications whenever handling sending notification request and that will delay to handle other requests come to NTFD. The value of variable is from 10 to 5000.” Suggestion: Note: my native language is not English so I suggest that this text is checked by someone who knows English better than I do. However, the following information is what I think is needed. “Notification log buffer size. Valid values are 10 to 5000 stored notifications. Default is 10. Some notifications are logged using the OpenSAF log service. NTF has a buffer to store notifications to be logged later in case the log service returns TRY AGAIN (may happen if the log service is temporary unavailable) when NTF writes the log record. When the log service is available again (returns OK) all notifications in the buffer will be written before the NTF service can service any new notification requests. If the buffer is big this may take some time and may cause the NTF client to timeout. 
If the buffer is full and the log service answers TRY AGAIN NTF will return TRY AGAIN when Notification send is called.” Thanks Lennart *From:*Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>> *Sent:* den 10 januari 2019 12:52 *To:* Minh Hon Chau <mailto:minh.c...@dektech.com.au>>; Lennart Lund mailto:lennart.l...@ericsson.com>> *Cc:* opensaf-devel@lists.sourceforge.net <mailto:opensaf-devel@lists.sourceforge.net> *Subject:* Review request ntf: update document for limit of logger buffer [#2994] Update the document because of the ticket #2961 ntf: Limit the logger buffer [#2961] Document with recorded changes attached. Activate "Show Changes" to see them [Edit/Track Changes/Show Changes] Thanks Canh ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] mdstest: fix tet_svc_subscr_VDEST_11() fail with TIPC transport [#2978]
Hi Thuan, I think it's OK to use sleep() to keep the test simple; ack from me. Thanks, Minh On 29/11/18 7:47 pm, thuan.tran wrote: TIPC published events are not received in the same order as the MDS services were installed. Service 600 was published with role active before role standby, even though role standby was installed before role active. The simplest and safest solution is to add a 1s sleep before changing the vdest role to active. --- src/mds/apitest/mdstipc_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c index 22a4386..34dcd3b 100644 --- a/src/mds/apitest/mdstipc_api.c +++ b/src/mds/apitest/mdstipc_api.c @@ -1847,6 +1847,7 @@ void tet_svc_subscr_VDEST_11() printf("\nFail to subscribing for the service 500\n"); FAIL = 1; } + sleep(1); /* verifying the rem svc ver from 600 and 700*/ printf("\nChanging the role of vdest to active"); if (vdest_change_role(1001, V_DEST_RL_ACTIVE) != NCSCC_RC_SUCCESS) { ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/2] ntf: Limit the logger buffer [#2961]
Hi Canh, ack with comments, please see the comments marked with [M]. Thanks, Minh On 9/1/19 8:22 pm, Canh Van Truong wrote: When writing the notification fails with TRY_AGAIN in the callback, the notification is pushed back onto the list. If this happens for a long time, the list grows very big. This causes NTFD to take a long time to write all the notifications in the list, and requests from NTFA that arrive during this time may time out. The patch does: - Limit the logger buffer - Provide an env variable so that the user can set the value of the limit - Return TRY_AGAIN error in case the limit of the buffer is reached and write all the notifications in the buffer to the log file. The current notification isn't written to the log file. --- src/ntf/README| 10 src/ntf/ntfd/NtfAdmin.cc | 44 +- src/ntf/ntfd/NtfLogger.cc | 149 +++--- src/ntf/ntfd/NtfLogger.h | 11 +++- src/ntf/ntfd/ntfd.conf| 10 src/ntf/ntfd/ntfs.h | 2 + 6 files changed, 147 insertions(+), 79 deletions(-) diff --git a/src/ntf/README b/src/ntf/README index 6dd5173e1..5bf670647 100644 --- a/src/ntf/README +++ b/src/ntf/README @@ -233,6 +233,16 @@ NTFSV_ENV_CACHE_SIZE The size of the notification cache in the NTF server processes running on the Controller nodes. The default value is 1 notification. +NTFSV_LOGGER_BUFFER_CAPACITY + +The logger buffer is used to store the notification if writing notification +to log file fail. This variable is set for limit of logger buffer size in +NTFD. If the logger buffer is full and NTFD receives new notification, +the TRY_AGAIN error is returned to user. The limit should be set with relevant +value to avoid congestion in NTFD. Because if this value is set too big and +writing notification is fail for long time, NTF has to write a big number of +notifications whenever handling sending notification request and that will delay +to handle other requests come to NTFD. The value of variable is from 10 to 5000. for debug see DEBUG. 
diff --git a/src/ntf/ntfd/NtfAdmin.cc b/src/ntf/ntfd/NtfAdmin.cc index 2cb99457c..6c2d69b43 100644 --- a/src/ntf/ntfd/NtfAdmin.cc +++ b/src/ntf/ntfd/NtfAdmin.cc @@ -193,19 +193,32 @@ void NtfAdmin::processNotification(unsigned int clientId, notificationId, notificationType, (unsigned int)notificationMap.size()); - // log the notification. Callback from SAF log will confirm later. - logger.log(notification, activeController()); - - /* send notification to standby */ - sendNotificationUpdate(clientId, notification->getNotInfo()); + if ((logger.isLoggerBufferFull() == true) && + (logger.isAlarmNotification(notification) == true)) { +NtfClient *client = getClient(clientId); +MDS_DEST dest = client->getMdsDest(); +LOG_WA("The logger buffer is full. Check if there is issue in writing"); +if (activeController()) + notfication_result_lib(SA_AIS_ERR_TRY_AGAIN, notificationId, + mdsCtxt, dest); + } else { +/* send notification to standby */ +sendNotificationUpdate(clientId, notification->getNotInfo()); - ClientMap::iterator pos; - for (pos = clientMap.begin(); pos != clientMap.end(); pos++) { -NtfClient *client = pos->second; -client->notificationReceived(clientId, notification, mdsCtxt); +ClientMap::iterator pos; +for (pos = clientMap.begin(); pos != clientMap.end(); pos++) { + NtfClient *client = pos->second; + client->notificationReceived(clientId, notification, mdsCtxt); +} } - /* remove notification if sent to all subscribers and logged */ + // Log the notification. Callback from SAF log will confirm later. 
+ if (activeController()) +logger.log(notification); + // Add the notification to Reader list + logger.addNotificationToReaderList(notification); + + // Remove the notification if it is sent to all subscribers and logged if (notification->isSubscriptionListEmpty() && notification->loggedOk()) { NotificationMap::iterator posNot; posNot = notificationMap.find(notificationId); [M]: If ntfd decides to return TRY_AGAIN, then the notification should not be added for readers to read, and for subscription checking, etc I think it looks like this if (the buffer is not empty) { // try to flush all pending log } if (the buffer is still full) { // return try again } else { // add this to buffer, checkpoint, add to reader lists, as normal } @@ -341,9 +354,9 @@ void NtfAdmin::notificationReceivedColdSync( TRACE_LEAVE(); } /** - * A cached notification is received in Cold Sync. - * This cached notification will be marked as logged, and stored - * only in NtfLogger class to serve the reader. + * A cached notifications are received in Cold Sync. + * This cached notifications are stored in NtfLogger + * class to serve the reader. * * @param clientId Node-wide unique id for the
Re: [devel] [PATCH 1/1] tests: test.sh should checkout release tag of googletest for stable [#2983]
Hi Thuan, ack from me. Thanks, Minh On 12/12/18 7:20 pm, thuan.tran wrote: --- test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test.sh b/test.sh index daf6293..ce90a62 100755 --- a/test.sh +++ b/test.sh @@ -37,6 +37,7 @@ if [[ ! -f "$OSAF_TEST_WORKDIR/googletest/googlemock/lib/libgmock.la" || fi cd "$OSAF_TEST_WORKDIR/googletest" +git checkout `git tag | grep "release" | tail -n 1` autoreconf -vi ./configure --with-pthreads make -j "$no_of_processors" ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 1/1] amfd: add node to failover_list before calling SetState [#2963]
Hi Gary, ack (code review only). Thanks Minh On 16/11/18 5:38 pm, Gary Lee wrote: node must be added to failover_list before SetState() is called. If the state is 'end', then it will be deleted by SetState(). Otherwise, we will leave a node in 'End' state mistakenly in failover_list. --- src/amf/amfd/ckpt_dec.cc | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc index 022fa8f4b..a46f6d306 100644 --- a/src/amf/amfd/ckpt_dec.cc +++ b/src/amf/amfd/ckpt_dec.cc @@ -2990,8 +2990,12 @@ static uint32_t dec_node_failover_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) { node->node_name.c_str()); auto new_node = std::make_shared<NodeStateMachine>(cb, node->node_info.nodeId); -new_node->SetState(state); +// node must be added to failover_list before SetState() is called. +// If the state is 'end', then it will be deleted by SetState(). +// Otherwise, we will leave a node in 'End' state mistakenly in +// failover_list. cb->failover_list[node->node_info.nodeId] = new_node; +new_node->SetState(state); } return NCSCC_RC_SUCCESS; ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 0/5] Review Request for ntf: add new test cases in ntf apitest [#2958]
Hi Mohan, I have pushed the patches. Thanks Minh On 14/11/18 6:19 pm, mo...@hasolutions.in wrote: Hi minh, I republished those patches. please check it and push into the community. Thanks Mohan High Availability Solutions Pvt. Ltd. www.hasolutions.in - Original Message - Subject: [PATCH 0/5] Review Request for ntf: add new test cases in ntf apitest [#2958] From: "Mohan Kanakam" Date: 11/14/18 12:42 pm To: minh.c...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net, "Mohan Kanakam" Summary: ntf: add new test case of API saNtfInitialize() of apitest v2 [#2958] Review request for Ticket(s): 2958 Peer Reviewer(s):minh Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-2958 Base revision: f8a6848a1cdbff0b518c3db951e4689e260226c7 Personal repository: git://git.code.sf.net/u/mohan-hasoln/review Impacted area Impact y/n Docs n Build system n RPM/packaging n Configuration files n Startup scripts n SAF services n OpenSAF services n Core libraries n Samples n Tests y Other n Comments (indicate scope for each "y" above): - *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** revision f26d2ed56b5a091163cf9d2af75fe0d818b546e5 Author: Mohan Kanakam Date: Wed, 14 Nov 2018 11:46:16 +0530 ntf: add new test case of API saNtfNotificationReadFinalize() of apitest [#2958] revision 4ebfc2e5a06c36cc8324533f67486532edba139e Author: Mohan Kanakam Date: Wed, 14 Nov 2018 11:44:44 +0530 ntf: add new test case of API saNtfFinalize() of apitest [#2958] revision f23c17b01252e4d858f5e47e0b6f1d66abc9a74e Author: Mohan Kanakam Date: Wed, 14 Nov 2018 11:42:31 +0530 ntf: add new test case of API saNtfDispatch() of apitest v2 [#2958] revision b60b353c7c7abb12d15ef4547a578d10649da229 Author: Mohan Kanakam Date: Wed, 14 Nov 2018 11:40:02 +0530 ntf: add new test case of API saNtfSelectionObjectGet() of apitest [#2958] revision 3a01c0e3b2771b3b8b39747f9497178708b1c1f3 Author: Mohan Kanakam Date: Wed, 14 Nov 2018 11:37:27 +0530 ntf: add new 
test case of API saNtfInitialize() of apitest v2 [#2958] Complete diffstat: -- src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++ src/ntf/apitest/tet_saNtfFinalize.cc | 7 +++ src/ntf/apitest/tet_saNtfInitialize.cc | 8 src/ntf/apitest/tet_saNtfNotificationReadFinalize.cc | 7 +++ src/ntf/apitest/tet_saNtfSelectionObjectGet.cc | 11 +++ 5 files changed, 43 insertions(+) Testing Commands: - ./ntftest Testing, Expected Results: -- 13 PASSED saNtfInitialize with NULL pointer to handle AND NULLptr to callbacks and nullptr to version 5 PASSED saNtfSelectionObjectGet Finalized handle SA_AIS_ERR_BAD_HANDLE 4 PASSED saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE 6 PASSED saNtfFinalize SA_AIS_ERR_BAD_HANDLE - unintilized handle 2 PASSED saNtfNotificationReadFinalize SA_AIS_ERR_BAD_HANDLE Conditions of Submission: - Ack from maintainers Arch Built Started Linux distro --- mips n n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. 
___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into
Re: [devel] [PATCH 0/5] Review Request for ntf: add new test cases in apitest [#2958]
Hi Mohan, ack for series with minor comments in the sub-patches. Thanks Minh On 9/11/18 11:55 pm, Mohan Kanakam wrote: Summary: ntf: add new test case of API saNtfInitialize() of apitest [#2958] Review request for Ticket(s): 2958 Peer Reviewer(s):minh Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected branch(es): develop Development branch: ticket-2958 Base revision: f8a6848a1cdbff0b518c3db951e4689e260226c7 Personal repository: git://git.code.sf.net/u/mohan-hasoln/review Impacted area Impact y/n Docsn Build systemn RPM/packaging n Configuration files n Startup scripts n SAF servicesn OpenSAF servicesn Core libraries n Samples n Tests y Other n Comments (indicate scope for each "y" above): - *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** revision ebef37fa591d4e63ee7de9ea4098000a3256b208 Author: Mohan Kanakam Date: Fri, 9 Nov 2018 17:19:08 +0530 ntf: add new test case of API saNtfNotificationReadFinalize() of apitest [#2958] revision c716e7524263131754a26b61cd305988d97a206c Author: Mohan Kanakam Date: Fri, 9 Nov 2018 17:16:38 +0530 ntf: add new test case of API saNtfFinalize() of apitest [#2958] revision cf271ad8784141560fdc6e616c5920b2fe975928 Author: Mohan Kanakam Date: Fri, 9 Nov 2018 17:14:20 +0530 ntf: add new test case of API saNtfDispatch() of apitest [#2958] revision 84f64a6aaa9b45ef55261764a52386030bfd0830 Author: Mohan Kanakam Date: Fri, 9 Nov 2018 17:12:01 +0530 ntf: add new test case of API saNtfSelectionObjectGet() of apitest [#2958] revision 176eb07f6e212334517af33879d085932324d4ef Author: Mohan Kanakam Date: Fri, 9 Nov 2018 17:08:57 +0530 ntf: add new test case of API saNtfInitialize() of apitest [#2958] Complete diffstat: -- src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++ src/ntf/apitest/tet_saNtfFinalize.cc | 7 +++ src/ntf/apitest/tet_saNtfInitialize.cc | 8 src/ntf/apitest/tet_saNtfNotificationReadFinalize.cc | 7 +++ src/ntf/apitest/tet_saNtfSelectionObjectGet.cc | 11 +++ 5 files changed, 43 insertions(+) Testing Commands: - 
./ntftest Testing, Expected Results: -- 13 PASSED saNtfInitialize with NULL pointer to handle AND NULL callbacks and unintilized version 5 PASSED saNtfSelectionObjectGet Finalized handle SA_AIS_ERR_BAD_HANDLE 4 PASSED saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE 6 PASSED saNtfFinalize SA_AIS_ERR_BAD_HANDLE - unintilized handle 2 PASSED saNtfNotificationReadFinalize SA_AIS_ERR_BAD_HANDLE Conditions of Submission: - Ack from maintainers Arch Built StartedLinux distro --- mipsn n mips64 n n x86 n n x86_64 y y powerpc n n powerpc64 n n Reviewer Checklist: --- [Submitters: make sure that your review doesn't trigger any checkmarks!] Your checkin has not passed review because (see checked entries): ___ Your RR template is generally incomplete; it has too many blank entries that need proper data filled in. ___ You have failed to nominate the proper persons for review and push. ___ Your patches do not have proper short+long header ___ You have grammar/spelling in your header that is unacceptable. ___ You have exceeded a sensible line length in your headers/comments/text. ___ You have failed to put in a proper Trac Ticket # into your commits. ___ You have incorrectly put/left internal data in your comments/files (i.e. internal bug tracking tool IDs, product names etc) ___ You have not given any evidence of testing beyond basic build tests. Demonstrate some level of runtime or other sanity testing. ___ You have ^M present in some of your files. These have to be removed. ___ You have needlessly changed whitespace or added whitespace crimes like trailing spaces, or spaces before tabs. ___ You have mixed real technical changes with whitespace and other cosmetic code cleanup changes. These have to be separate commits. ___ You need to refactor your submission into logical chunks; there is too much content into a single commit. 
___ You have extraneous garbage in your review (merge commits etc) ___ You have giant attachments which should never have been sent; Instead you should place your content in a public tree to be pulled. ___ You have too many commits attached to an e-mail; resend as threaded commits, or place in a public tree for a pull. ___ You have resent this content multiple times without a clear indication of what has changed
Re: [devel] [PATCH 1/5] ntf: add new test case of API saNtfInitialize() of apitest [#2958]
Hi Mohan, A minor comment, we could use nullptr instead. Thanks Minh On 9/11/18 11:55 pm, Mohan Kanakam wrote: --- src/ntf/apitest/tet_saNtfInitialize.cc | 8 1 file changed, 8 insertions(+) diff --git a/src/ntf/apitest/tet_saNtfInitialize.cc b/src/ntf/apitest/tet_saNtfInitialize.cc index 8538193..c1442dc 100644 --- a/src/ntf/apitest/tet_saNtfInitialize.cc +++ b/src/ntf/apitest/tet_saNtfInitialize.cc @@ -117,6 +117,11 @@ void saNtfInitialize_12(void) { test_validate(rc, SA_AIS_ERR_VERSION); } +void saNtfInitialize_13(void) { + rc = NtfTest::saNtfInitialize(NULL, NULL, NULL); + test_validate(rc, SA_AIS_ERR_INVALID_PARAM); +} + __attribute__((constructor)) static void saNtfInitialize_constructor(void) { test_suite_add(1, "Life cycle, initialize, API 1"); test_case_add(1, saNtfInitialize_01, "saNtfInitialize SA_AIS_OK"); @@ -142,4 +147,7 @@ __attribute__((constructor)) static void saNtfInitialize_constructor(void) { "saNtfInitialize with major version set to lower"); test_case_add(1, saNtfInitialize_12, "saNtfInitialize with version A.0.0"); + test_case_add(1, saNtfInitialize_13, + "saNtfInitialize with NULL pointer to handle AND NULL callbacks" + " and unintilized version"); } ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] [PATCH 3/5] ntf: add new test case of API saNtfDispatch() of apitest [#2958]
Hi Mohan, A minor comment: I think you meant to register saNtfDispatch_04 in the last test_case_add() call (the patch registers saNtfDispatch_03 a second time). Thanks Minh On 9/11/18 11:55 pm, Mohan Kanakam wrote: --- src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++ 1 file changed, 10 insertions(+) diff --git a/src/ntf/apitest/tet_saNtfDispatch.cc b/src/ntf/apitest/tet_saNtfDispatch.cc index 5fea4ef..81a722c 100644 --- a/src/ntf/apitest/tet_saNtfDispatch.cc +++ b/src/ntf/apitest/tet_saNtfDispatch.cc @@ -40,6 +40,14 @@ void saNtfDispatch_03(void) { test_validate(rc, SA_AIS_ERR_INVALID_PARAM); } +void saNtfDispatch_04(void) { + safassert(NtfTest::saNtfInitialize(&ntfHandle, &ntfCallbacks, &ntfVersion), + SA_AIS_OK); + safassert(NtfTest::saNtfFinalize(ntfHandle), SA_AIS_OK); + rc = NtfTest::saNtfDispatch(ntfHandle, SA_DISPATCH_ALL); + test_validate(rc, SA_AIS_ERR_BAD_HANDLE); +} + __attribute__((constructor)) static void saNtfDispatch_constructor(void) { test_suite_add(4, "Life cycle, dispatch, API 4"); test_case_add(4, saNtfDispatch_01, @@ -48,4 +56,6 @@ __attribute__((constructor)) static void saNtfDispatch_constructor(void) { "saNtfDispatch - invalid handle SA_AIS_ERR_BAD_HANDLE"); test_case_add(4, saNtfDispatch_03, "saNtfDispatch - zero flag SA_AIS_ERR_INVALID_PARAM"); + test_case_add(4, saNtfDispatch_03, + "saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE"); } ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] Review Request for amf: update PR [#2918]
Hi Gary, ack from me. Thanks/Minh On 14/11/18 1:28 pm, Gary Lee wrote: Hi A small update to the AMF PR for #2918. * Renumbered 2.2.18 Excessive assignments to 2.2.19 * Added 2.2.18 Network partitioning * Added timers to Section 3.3 https://sourceforge.net/p/opensaf/tickets/_discuss/thread/cae26fce/0d37/attachment/OpenSAF_AMF_PR_new.odt.gz Thanks Gary ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] Review Request for amf: Update PR [#2929]
Hi Nagu, It's not the recovery in specification, I mean a new attribute. Thanks, Minh On 12/11/18 9:53 pm, Nagendra Kumar wrote: Hi Minh, Thanks for your response. In future, I think we can make it as configurable recovery method, so up to applications to choose from. You mean recommended recovery option? But how will it work? Thanks -Nagendra High Availability Solutions www.hasolutions.in cont...@hasolutions.in Hyderabad, India: +91-9866424860 | Delaware, USA: +1 508-422-7725 -Original Message- From: Minh Hon Chau [mailto:minh.c...@dektech.com.au] Sent: 12 November 2018 16:16 To: Nagendra Kumar; 'Hans Nordeback'; 'Gary Lee' Cc: opensaf-devel@lists.sourceforge.net Subject: Re: Review Request for amf: Update PR [#2929] Hi Nagu, Agree with you that we can do it for 2N. However the mutual active workload has to be exclusively one at a time, so there may be some sort of corruption to applications. But it also depends on how internal application logics are implemented. So reboot the node is a choice of safety for now. In future, I think we can make it as configurable recovery method, so up to applications to choose from. Thanks, Minh On 12/11/18 7:49 pm, Nagendra Kumar wrote: Hi Minh, Ack from me. Btw, why did you opt to remove assignments and restart admin operation for Nway Act and No Red. The same could have done in 2N by removing the assignments and restart and then provide fresh assignments. Thanks -Nagendra High Availability Solutions www.hasolutions.in cont...@hasolutions.in Hyderabad, India: +91-9866424860 | Delaware, USA: +1 508-422-7725 -Original Message- From: Minh Hon Chau [mailto:minh.c...@dektech.com.au] Sent: 12 November 2018 08:04 To: Hans Nordeback; Nagendra Kumar; Gary Lee Cc: opensaf-devel@lists.sourceforge.net Subject: Review Request for amf: Update PR [#2929] Hi all, Document update for #2929 in item 2.2.18 to be reviewed. 
https://sourceforge.net/p/opensaf/tickets/2929/attachment/OpenSAF_AMF_PR_2929.odt Thanks, Minh ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel
Re: [devel] Review Request for amf: Update PR [#2929]
Hi Nagu, I agree with you that we can do it for 2N. However, the active workload has to be mutually exclusive, one at a time, so there may be some sort of corruption in the applications. But it also depends on how the applications' internal logic is implemented. So rebooting the node is the safe choice for now. In the future, I think we can make it a configurable recovery method, so it is up to applications to choose. Thanks, Minh On 12/11/18 7:49 pm, Nagendra Kumar wrote: Hi Minh, Ack from me. Btw, why did you opt to remove assignments and restart admin operation for Nway Act and No Red? The same could have been done in 2N by removing the assignments, restarting, and then providing fresh assignments. Thanks -Nagendra High Availability Solutions www.hasolutions.in cont...@hasolutions.in Hyderabad, India: +91-9866424860 | Delaware, USA: +1 508-422-7725 -Original Message- From: Minh Hon Chau [mailto:minh.c...@dektech.com.au] Sent: 12 November 2018 08:04 To: Hans Nordeback; Nagendra Kumar; Gary Lee Cc: opensaf-devel@lists.sourceforge.net Subject: Review Request for amf: Update PR [#2929] Hi all, Document update for #2929 in item 2.2.18 to be reviewed. https://sourceforge.net/p/opensaf/tickets/2929/attachment/OpenSAF_AMF_PR_2929.odt Thanks, Minh ___ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel