from:"Minh Hon Chau"

Re: [devel] [PATCH 1/1] osaf: support compile with gcc/g++ 10 [#3307]

2022-03-15 Thread Minh Hon Chau

Hi Thang,

I think it would be nice if you could separate this into two commits: one for 
the test changes and one for the gcc/g++ 10 code changes.

Thanks,
Minh

From: Thang Duc Nguyen 
Sent: Wednesday, March 16, 2022 11:44 AM
To: Hieu Hong Hoang ; Thien Minh Huynh 
; Minh Hon Chau 
Cc: opensaf-devel@lists.sourceforge.net ; 
Thang Duc Nguyen 
Subject: [PATCH 1/1] osaf: support compile with gcc/g++ 10 [#3307]

- Fix error to support gcc/g++ 10.
- Fix memleak in api test.
---
 src/ckpt/agent/cpa_cb.h   |  2 +-
 src/ckpt/apitest/test_cpa.c   |  2 +
 src/ckpt/apitest/test_cpa_util.c  | 13 ++-
 src/ckpt/apitest/test_cpsv_conf.h |  2 +-
 src/ckpt/ckptd/cpd_amf.c  |  1 -
 src/ckpt/ckptd/cpd_init.h |  2 +-
 src/evt/agent/eda.h   |  2 +-
 src/evt/apitest/tet_eda.c | 32 
 src/evt/apitest/tet_eda.h | 80 ---
 src/evt/apitest/tet_edsv_func.c   |  1 +
 src/evt/evtd/eds.h|  2 +-
 src/evt/evtd/eds_amf.h|  6 +-
 src/evt/evtd/eds_cb.h |  2 +-
 .../test_saImmOmThreadInterference.c  |  4 +-
 src/imm/immd/immd.h   |  2 +-
 src/lck/apitest/tet_gld.c |  1 -
 src/lck/apitest/tet_glnd.c|  2 -
 src/lck/lckd/gld_dl_api.h |  2 +-
 src/lck/lcknd/glnd_cb.h   |  4 +-
 src/log/apitest/logtest.c |  6 ++
 src/log/apitest/logtest.h |  6 +-
 src/log/logd/lgs_dest.cc  |  4 +-
 src/mds/apitest/mdstipc.h | 30 +++
 src/mds/apitest/mdstipc_api.c | 17 
 src/mds/mds_core.h| 30 +++
 src/mds/mds_dt_common.c   |  3 +
 src/mds/mds_dt_tcp.c  |  3 +-
 src/mds/mds_dt_tcp.h  |  2 +-
 src/mds/mds_dt_tipc.c |  2 -
 src/mds/mds_main.c| 47 +++
 src/msg/msgnd/mqnd_db.h   |  2 +-
 tools/devel/fenced/node_state_hdlr_pl.cc  |  1 +
 32 files changed, 224 insertions(+), 91 deletions(-)

diff --git a/src/ckpt/agent/cpa_cb.h b/src/ckpt/agent/cpa_cb.h
index ac48c6c4f..d6335830f 100644
--- a/src/ckpt/agent/cpa_cb.h
+++ b/src/ckpt/agent/cpa_cb.h
@@ -119,7 +119,7 @@ typedef struct cpa_cb {

 } CPA_CB;

-uint32_t gl_cpa_hdl;
+extern uint32_t gl_cpa_hdl;

 typedef struct cpa_prcess_evt_sync {
   NCS_QELEM qelem;
diff --git a/src/ckpt/apitest/test_cpa.c b/src/ckpt/apitest/test_cpa.c
index 6c37e91d5..0093b91ea 100644
--- a/src/ckpt/apitest/test_cpa.c
+++ b/src/ckpt/apitest/test_cpa.c
@@ -364,6 +364,7 @@ void fill_testcase_data()
 *(ckpt_name + length) = '.';
 saAisNameLend(ckpt_name,
   _replicas_ckpt_with_valid_extended_name_length);
+   free(ckpt_name);

 ckpt_name = malloc(INVALID_EXTENDED_NAME_LENGTH);
 memset(ckpt_name, 0, INVALID_EXTENDED_NAME_LENGTH);
@@ -374,6 +375,7 @@ void fill_testcase_data()
 *(ckpt_name + length) = '.';
 saAisNameLend(ckpt_name,
   
_replicas_ckpt_with_invalid_extended_name_length);
+   free(ckpt_name);

 /* Variables for sec create */
 tcd.sec_id1 = (SaUint8T *)"11";
diff --git a/src/ckpt/apitest/test_cpa_util.c b/src/ckpt/apitest/test_cpa_util.c
index 474e76f0d..7da36e0c1 100644
--- a/src/ckpt/apitest/test_cpa_util.c
+++ b/src/ckpt/apitest/test_cpa_util.c
@@ -24,6 +24,7 @@ extern const char *saf_error_string[];
 int gl_try_again_cnt;
 int gl_tmout_cnt;
 int gl_sync_pointnum;
+NCSCONTEXT gl_task_hdl = NULL;
 int tmoutFlag;

 int cpsv_test_result(SaAisErrorT rc, SaAisErrorT exp_out, char *test_case,
@@ -651,23 +652,24 @@ void selection_thread_blocking(NCSCONTEXT arg)
 m_TEST_CPSV_PRINTF("\n Dispatching FAILED %d \n", rc);
 else
 m_TEST_CPSV_PRINTF("\n Thread selected \n");
+   m_NCS_TASK_RELEASE(gl_task_hdl);
 }

 void cpsv_createthread(SaCkptHandleT *cl_hdl)
 {
 SaAisErrorT rc;
-   NCSCONTEXT thread_handle;

 rc = m_NCS_TASK_CREATE((NCS_OS_CB)selection_thread_blocking,
(NCSCONTEXT)cl_hdl, "cpsv_block_test", 0,
-  SCHED_OTHER, 8000, _handle);
+  SCHED_OTHER, 8000, _task_hdl);
 if (rc != NCSCC_RC_SUCCESS) {
 m_TEST_CPSV_PRINTF(" Failed to create thread\n");
 return;
 }

-   rc = m_NCS_TASK_START(thread_handle);
+   rc = m_NCS_TASK_START(gl_task_hdl);
 if (rc != NCSCC_RC_SUCCESS) {
+   m_NCS_TASK_RELEASE(gl_task_hdl);
 m_TEST_CPSV_PRINTF(" Failed to start thread\n");

Re: [devel] [PATCH 1/1] amf: correct behavior SU restart [#3233]

2020-11-10 Thread Minh Hon Chau

Hi Thang
Ack from me
Thanks
Minh

Get Outlook for iOS<https://aka.ms/o0ukef>

From: Thang Duc Nguyen 
Sent: Tuesday, November 10, 2020 7:58:04 PM
To: Minh Hon Chau ; Thuan Tran 

Cc: opensaf-devel@lists.sourceforge.net ; 
Thang Duc Nguyen 
Subject: [PATCH 1/1] amf: correct behavior SU restart [#3233]

While the standby SU restarts, the active SU fails over. The standby SU
needs to be re-assigned standby and then take over the active assignment.
This is to correct the issue in the ticket #3207.
---
 src/amf/amfnd/comp.cc |  3 +--
 src/amf/amfnd/susm.cc | 55 +--
 2 files changed, 17 insertions(+), 41 deletions(-)

diff --git a/src/amf/amfnd/comp.cc b/src/amf/amfnd/comp.cc
index f1e33c372..d805346bb 100644
--- a/src/amf/amfnd/comp.cc
+++ b/src/amf/amfnd/comp.cc
@@ -1083,8 +1083,7 @@ uint32_t avnd_comp_csi_assign(AVND_CB *cb, AVND_COMP 
*comp,
   if (curr_csi->curr_assign_state ==
   AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED &&
   curr_csi->prv_assign_state ==
-  AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED &&
-  !m_AVND_SU_IS_RESTART(comp->su)) {
+  AVND_COMP_CSI_ASSIGN_STATE_UNASSIGNED) {
 // Mark suspending_assignment for all unassigned csi(s) which are
 // going to be assigned to *curr_csi->comp*
 for (t_csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(
diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
index d8ef66ea2..80b35ea8f 100644
--- a/src/amf/amfnd/susm.cc
+++ b/src/amf/amfnd/susm.cc
@@ -306,18 +306,15 @@ uint32_t avnd_su_siq_prc(AVND_CB *cb, AVND_SU *su) {
 return rc;
   }

+  /* unlink the buffered msg from the queue */
+  ncs_db_link_list_delink(>siq, >su_dll_node);
+
   /* initiate si asignment / removal */
   rc = avnd_su_si_msg_prc(cb, su, >info);

-  // Siq will used to su-si respond later
-  // in case modify SU-SI during SURestart
-  if ((siq->info.msg_act != AVSV_SUSI_ACT_MOD) ||
-  !m_AVND_SU_IS_RESTART(su)) {
-/* unlink the buffered msg from the queue */
-ncs_db_link_list_delink(>siq, >su_dll_node);
-/* delete the buffered msg */
-avnd_su_siq_rec_del(cb, su, siq);
-  }
+  /* delete the buffered msg */
+  avnd_su_siq_rec_del(cb, su, siq);
+
   TRACE_LEAVE2("%u", rc);
   return rc;
 }
@@ -1134,7 +1131,6 @@ static bool container_contained_shutdown(const AVND_SU 
*su) {
 uint32_t avnd_su_si_oper_done(AVND_CB *cb, AVND_SU *su, AVND_SU_SI_REC *si) {
   AVND_SU_SI_REC *curr_si = 0;
   AVND_COMP_CSI_REC *curr_csi = 0, *t_csi = 0;
-  AVND_SU_SIQ_REC *siq = 0;
   uint32_t rc = NCSCC_RC_SUCCESS;
   bool opr_done;

@@ -1212,18 +1208,6 @@ uint32_t avnd_su_si_oper_done(AVND_CB *cb, AVND_SU *su, 
AVND_SU_SI_REC *si) {
 if (NCSCC_RC_SUCCESS != rc) goto done;
   }

-  // Modify event during SURestart should be respond
-  siq = reinterpret_cast(m_NCS_DBLIST_FIND_LAST(>siq));
-  if (siq && (siq->info.msg_act == AVSV_SUSI_ACT_MOD) &&
-  m_AVND_SU_IS_RESTART(su)) {
-  ncs_db_link_list_delink(>siq, >su_dll_node);
-  /* delete the buffered msg */
-  avnd_su_siq_rec_del(avnd_cb, su, siq);
-  rc = avnd_di_susi_resp_send(cb, su,
-  m_AVND_SU_IS_ALL_SI(su) ? nullptr : si);
-  if (NCSCC_RC_SUCCESS != rc) goto done;
-  }
-
   if (si && (cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_INITIATED)) {
 (void)avnd_evt_last_step_term_evh(cb, nullptr);
   } else if (si &&
@@ -1713,23 +1697,16 @@ static uint32_t 
pi_su_instantiating_to_instantiated(AVND_SU *su) {
 /* reset the su failed flag & set the oper state to enabled */
 m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_ENABLED);
 TRACE("Setting the Oper state to Enabled");
-
-AVND_SU_SIQ_REC *siq = 0;
-siq = reinterpret_cast(m_NCS_DBLIST_FIND_LAST(>siq));
-if (siq && (siq->info.msg_act == AVSV_SUSI_ACT_MOD)) {
-  rc = avnd_su_siq_prc(avnd_cb, su);
-} else {
-  /*
-   * reassign all the sis...
-   * it's possible that the si was never assigned. send su-oper
-   * enable msg instead.
-   */
-  if (su->si_list.n_nodes)
-rc = avnd_su_si_reassign(avnd_cb, su);
-  else {
-rc = avnd_di_oper_send(avnd_cb, su, 0);
-reset_suRestart_flag(su);
-  }
+/*
+ * reassign all the sis...
+ * it's possible that the si was never assigned. send su-oper
+ * enable msg instead.
+ */
+if (su->si_list.n_nodes)
+  rc = avnd_su_si_reassign(avnd_cb, su);
+else {
+  rc = avnd_di_oper_send(avnd_cb, su, 0);
+  reset_suRestart_flag(su);
 }
 su->admin_op_Id = static_cast(0);
   } else {
--
2.17.1


___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no space error [#3181]

2020-05-04 Thread Minh Hon Chau

Yes right, i was thinking of a 64bit value.

Get Outlook for iOS<https://aka.ms/o0ukef>

From: Thuan Tran 
Sent: Monday, May 4, 2020 8:12:43 PM
To: Minh Hon Chau ; Thang Duc Nguyen 

Cc: opensaf-devel@lists.sourceforge.net 
Subject: Re: [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no 
space error [#3181]

Hi Minh,

Regarding the check for the max value of uint32, I think it's not necessary,
because atoi() returns the converted integral number as an int value.
It cannot be bigger than the max of uint32.

Best Regards,
Thuan

From: Minh Hon Chau 
Sent: Monday, May 4, 2020 12:37 PM
To: Thuan Tran ; Thang Duc Nguyen 

Cc: opensaf-devel@lists.sourceforge.net 
Subject: Re: [PATCH 1/1] ntf: fix ntfimcn fail to send notification with no 
space error [#3181]

Hi Thuan

Ack with comment. I think we need to check the max value of uint32_t for
ntf_var_data_limit when we source it from the env var.

Thanks

Minh

On 27/4/20 9:05 pm, thuan.tran wrote:
> - Support NTFA_VARIABLE_DATA_LIMIT configuration for NTF Agent.
> Default value is SHRT_MAX(32767).
> - In system that object creation may have many info attributes/values,
> it should configure this env variable to suitable value for ntfimcn
> able send notification.
> ---
>   src/ntf/agent/ntfa_util.c  | 13 -
>   src/ntf/ntfd/ntfd.conf |  4 
>   src/ntf/ntfimcnd/ntfimcn_imm.c | 18 --
>   3 files changed, 28 insertions(+), 7 deletions(-)
>
> diff --git a/src/ntf/agent/ntfa_util.c b/src/ntf/agent/ntfa_util.c
> index 5bc859259..379348ab5 100644
> --- a/src/ntf/agent/ntfa_util.c
> +++ b/src/ntf/agent/ntfa_util.c
> @@ -60,8 +60,19 @@ static unsigned int ntfa_create(void)
>/* No longer needed */
>m_NCS_SEL_OBJ_DESTROY(_cb.ntfs_sync_sel);
>
> - /* TODO: fix env variable */
> + char *ptr = NULL;
> + int optval = 0;
>ntfa_cb.ntf_var_data_limit = NTFA_VARIABLE_DATA_LIMIT;
> + if ((ptr = getenv("NTFA_VARIABLE_DATA_LIMIT")) != NULL) {
> + optval = atoi(ptr);
> + if (optval > 0) {
> + ntfa_cb.ntf_var_data_limit = optval;
> + LOG_NO("NTFA_VARIABLE_DATA_LIMIT=%d", optval);
> + } else {
> + LOG_WA("Invalid NTFA_VARIABLE_DATA_LIMIT, using default 
> %d",
> +NTFA_VARIABLE_DATA_LIMIT);
> + }
> + }
>return rc;
>
>   error:
> diff --git a/src/ntf/ntfd/ntfd.conf b/src/ntf/ntfd/ntfd.conf
> index 91bfcd2e2..f2f67496f 100644
> --- a/src/ntf/ntfd/ntfd.conf
> +++ b/src/ntf/ntfd/ntfd.conf
> @@ -24,6 +24,10 @@ export NTFSV_ENV_HEALTHCHECK_KEY="Default"
>   # directory and the directory component of the path name (if any) is 
> ignored.
>   #export NTFSCN_TRACE_PATHNAME=osafntfcn
>
> +# Uncomment the next line to configure max allowed variable data size for the
> +# osafntfcn (configuration notifier). Default value is 32767 bytes
> +#export NTFA_VARIABLE_DATA_LIMIT=32767
> +
>   # Only log priority LOG_WARNING and higher to the system log file.
>   # All logging will be recorded in a new node local log file 
> $PKGLOGDIR/osaf.log.
>   # Uncomment the next line to enable this service to log to OpenSAF node 
> local log file.
> diff --git a/src/ntf/ntfimcnd/ntfimcn_imm.c b/src/ntf/ntfimcnd/ntfimcn_imm.c
> index c58e8a268..3f2c1a873 100644
> --- a/src/ntf/ntfimcnd/ntfimcn_imm.c
> +++ b/src/ntf/ntfimcnd/ntfimcn_imm.c
> @@ -680,8 +680,10 @@ static void saImmOiCcbApplyCallback(SaImmOiHandleT 
> immOiHandle,
>ccbUtilOperationData, rdn_attr_name, ccbLast);
>if (internal_rc != 0) {
>LOG_ER(
> - "%s send_object_create_notification fail",
> - __FUNCTION__);
> + "%s send_object_create_notification %s 
> fail",
> + __FUNCTION__,
> + osaf_extended_name_borrow(
> + >objectName));
>goto done;
>}
>break;
> @@ -706,8 +708,10 @@ static void saImmOiCcbApplyCallback(SaImmOiHandleT 
> immOiHandle,
>ccbUtilOperationData, invoke_name_ptr, ccbLast);
>if (internal_rc != 0) {
>LOG_ER(
> - "%s send_object_delete_notification fail",
> - __FUNCTION__);
> + "%s send_object_delete_no

Re: [devel] [PATCH 1/1] osaf: enhance vm frozen detection in tcp.plugin [#3164]

2020-03-19 Thread Minh Hon Chau


Hi Thuan,

I'm adding Thanh since he's looking at the patch as well.

I see you pushed the patch; here are some late comments.

Thanks

Minh

On 9/3/20 4:49 pm, thuan.tran wrote:

- Active SC will reboot if arb time somehow has big gap b/w heartbeats
in watch takeover request. Active SC may still OK but be rebooted unexpectedly.
- Enhance VM-frozen detection based on arb time and a local time counter.
[M]: The patch has a general solution for both VM and container, running a 
counter thread instead of reading time.time(); we need to explain it in a 
bit more detail.

---
  src/osaf/consensus/plugins/tcp/tcp.plugin | 43 ++-
  1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/osaf/consensus/plugins/tcp/tcp.plugin 
b/src/osaf/consensus/plugins/tcp/tcp.plugin
index 0be20fcee..aaa1c1c3f 100755
--- a/src/osaf/consensus/plugins/tcp/tcp.plugin
+++ b/src/osaf/consensus/plugins/tcp/tcp.plugin
@@ -23,8 +23,24 @@ import sys
  import time
  import xmlrpc.client
  import syslog
+import threading
  
  
+counter_run = False

+counter_time = 0.0
+
+def time_counting(hb_interval):
+'''
+When node is frozen, if it is VM, clock time not jump
+but if it is container, clock time still jump.
+This function to help know node is frozen or arbitrator server issue
+'''
+global counter_run, counter_time
+counter_time = 0.0
+while (counter_run):
+time.sleep(hb_interval)
+counter_time += hb_interval
+
  class ArbitratorPlugin(object):
  """ This class represents a TCP Plugin """
  
@@ -478,6 +494,8 @@ class ArbitratorPlugin(object):

  return ret
  
  last_arb_timestamp = 0

+global counter_run, counter_time
+counter = None
  while True:
  if key == self.takeover_request:
  if self.is_active() is False:
@@ -486,15 +504,24 @@ class ArbitratorPlugin(object):
  while True:
  try:
  time_at_arb = self.proxy.heartbeat(self.hostname)
-if last_arb_timestamp == 0:
-last_arb_timestamp = time_at_arb
-break
-elif (time_at_arb - last_arb_timestamp) > self.timeout:
-# VM was frozen?
-syslog.syslog('VM was frozen!')
-ret['code'] = 126
-return ret
+if counter is not None:
+counter_run = False
+counter.join()
+if (last_arb_timestamp != 0) and \
+   (time_at_arb - last_arb_timestamp > self.timeout):
+if counter_time < self.timeout:
+syslog.syslog('VM was frozen!')
+ret['code'] = 126
+return ret
+syslog.syslog('Arb server issue?')
+raise socket.error('Arb server issue?')
  else:
+counter = threading.Thread(
+target=time_counting,
+args=(self.heartbeat_interval,))
+counter_run = True
+counter.setDaemon(True)
+counter.start()
[M] This means we are going to start the thread and then wait for it to 
join(), multiple times in this while loop.

  last_arb_timestamp = time_at_arb
  break
  except socket.error:



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfnd: fix unexpected reboot after split-brain recovery [#3162]

2020-03-09 Thread Minh Hon Chau


Hi Thuan,

ack from me.

Thanks

Minh

On 9/3/20 5:08 pm, thuan.tran wrote:

- Split-brain recovery in headless enable, IMMND may expected restart.
If AMFND not wait IMMND restart but reinit CLM, CLM callback trigger,
clm_to_amf_node() is called then AMFND stuck in init IMM OM causes delay
restart IMMND, delay resend node_up then AMFD will order reboot node.
- Only call clm_to_amf_node() if amf node name is empty.
---
  src/amf/amfnd/clm.cc | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 06eb229c7..73c8ff83c 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -250,7 +250,7 @@ static void clm_track_cb(
memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode),
   sizeof(SaClmClusterNodeT_4));
/*get the amf node from clm node name */
-  clm_to_amf_node();
+  if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node();
avnd_send_node_up_msg();
avnd_cb->first_time_up = false;
  }



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfnd: correct handling "terminate success" evt in terminating state [#3157]

2020-02-20 Thread Minh Hon Chau


Hi Thang,

ack (not tested). It would be good if you could elaborate the commit message 
to explain how/why the patch fixes the coredump.


Thanks

Minh

On 20/2/20 5:27 pm, Thang Duc Nguyen wrote:

Amfnd needs to exit when the node is in shutdown state and all
components have terminated.
---
  src/amf/amfnd/clc.cc | 40 
  1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index de57838c9..f78e1a707 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -80,6 +80,8 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *, AVND_COMP *, 
SaAmfPresenceStateT,
  
  static uint32_t avnd_instfail_su_failover(AVND_CB *, AVND_SU *, AVND_COMP *);
  
+static void amfnd_clean_before_exit(AVND_CB *);

+
  /***
   ** C O M P O N E N T   C L C   F S M   M A T R I X   D E F I N I T I O N **
   ***/
@@ -297,6 +299,23 @@ static void log_failed_exec(NCS_OS_PROC_EXEC_STATUS_INFO 
*exec_stat,
 comp->clc_info.cmds[exec_cmd - 1].cmd);
  }
  
+/

+  Name  : amfnd_clean_before_exit
+
+  Description   : Clean database before exit
+
+  Arguments : cb  - ptr to the AvND control block
+
+  Return Values : None
+
+**/
+void amfnd_clean_before_exit(AVND_CB *cb) {
+  LOG_NO("Shutdown completed, exiting");
+  cb->nodeid_mdsdest_db.deleteAll();
+  cb->hctypedb.deleteAll();
+  daemon_exit();
+}
+
  /
Name  : avnd_evt_clc_resp
  
@@ -810,10 +829,7 @@ uint32_t avnd_comp_clc_fsm_run(AVND_CB *cb, AVND_COMP *comp,

  avnd_comp_pres_state_set(cb, comp, SA_AMF_PRESENCE_UNINSTANTIATED);
  if (all_comps_terminated()) {
LOG_NO("Terminated all AMF components");
-  LOG_NO("Shutdown completed, exiting");
-  cb->nodeid_mdsdest_db.deleteAll();
-  cb->hctypedb.deleteAll();
-  daemon_exit();
+  amfnd_clean_before_exit(cb);
  } else {
TRACE("Do nothing");
goto done;
@@ -2401,6 +2417,12 @@ uint32_t avnd_comp_clc_terming_termsucc_hdler(AVND_CB 
*cb, AVND_COMP *comp) {
  avnd_comp_curr_info_del(cb, comp);
}
  
+  if ((cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED) &&

+  all_comps_terminated()) {
+LOG_NO("Terminated all AMF components");
+amfnd_clean_before_exit(cb);
+  }
+
TRACE_LEAVE();
return rc;
  }
@@ -2520,10 +2542,7 @@ uint32_t avnd_comp_clc_terming_cleansucc_hdler(AVND_CB 
*cb, AVND_COMP *comp) {
  }
  if (all_comps_terminated()) {
LOG_NO("Terminated all AMF components");
-  LOG_NO("Shutdown completed, exiting");
-  cb->nodeid_mdsdest_db.deleteAll();
-  cb->hctypedb.deleteAll();
-  daemon_exit();
+  amfnd_clean_before_exit(cb);
  }
}
/*
@@ -2584,10 +2603,7 @@ uint32_t avnd_comp_clc_terming_cleanfail_hdler(AVND_CB 
*cb, AVND_COMP *comp) {
if ((cb->term_state == AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED) &&
all_comps_terminated()) {
  LOG_WA("Terminated all AMF components (with failures)");
-LOG_NO("Shutdown completed, exiting");
-cb->nodeid_mdsdest_db.deleteAll();
-cb->hctypedb.deleteAll();
-daemon_exit();
+amfnd_clean_before_exit(cb);
}
  
TRACE_LEAVE();



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: fix memleak in agent enable flow control [#3151]

2020-02-12 Thread Minh Hon Chau


Hi Thuan,

Ack from me.

Thanks

Minh

On 12/2/20 9:29 pm, thuan.tran wrote:

Agent enable flow control keep add new portid without remove.
Remove portid when svc count become zero then handle portid reset
properly, peer A may see portid reset (peer B) then peer B should
accept fseq(1) message from peer A.
---
  src/mds/mds_tipc_fctrl_intf.cc   |  6 ++
  src/mds/mds_tipc_fctrl_portid.cc | 17 -
  2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index f3883ba36..f3504b901 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -428,6 +428,12 @@ uint32_t mds_tipc_fctrl_portid_down(struct tipc_portid id, 
uint32_t type) {
  portid->svc_cnt_--;
  m_MDS_LOG_DBG("FCTRL: Remove svc[node:%x, ref:%u svc_id:%u], svc_cnt:%u",
  id.node, id.ref, svc_id, portid->svc_cnt_);
+if (portid->svc_cnt_ == 0) {
+  delete portid;
+  portid_map.erase(TipcPortId::GetUniqueId(id));
+  m_MDS_LOG_NOTIFY("FCTRL: Remove portid[node:%x, ref:%u]",
+  id.node, id.ref);
+}
}
portid_map_mutex.unlock();
  
diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc

index 3562c4a00..57843b6de 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -373,7 +373,7 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t 
mfrag,
  if (rcvwnd_.rcv_ + Seq16(1) < Seq16(fseq)) {
if (rcvwnd_.rcv_ == 0 && rcvwnd_.acked_ == 0) {
  // peer does not realize that this portid reset
-m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
+m_MDS_LOG_NOTIFY("FCTRL: [me] <-- [node:%x, ref:%u], "
  "RcvData[mseq:%u, mfrag:%u, fseq:%u], "
  "rcvwnd[acked:%u, rcv:%u, nacked:%" PRIu64 "], "
  "Warning[portid reset]",
@@ -381,7 +381,9 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t 
mfrag,
  mseq, mfrag, fseq,
  rcvwnd_.acked_.v(), rcvwnd_.rcv_.v(), rcvwnd_.nacked_space_);
  
+SendChunkAck(fseq, svc_id, 1);

  rcvwnd_.rcv_ = fseq;
+rcvwnd_.acked_ = rcvwnd_.rcv_;
} else {
  rc = NCSCC_RC_FAILURE;
  // msg loss
@@ -395,6 +397,19 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, uint16_t 
mfrag,
  // send nack
  SendNack((rcvwnd_.rcv_ + Seq16(1)).v(), svc_id);
}
+} else if (fseq == 1) {
+  // sender realize me as portid reset
+  m_MDS_LOG_NOTIFY("FCTRL: [me] <-- [node:%x, ref:%u], "
+  "RcvData[mseq:%u, mfrag:%u, fseq:%u], "
+  "rcvwnd[acked:%u, rcv:%u, nacked:%" PRIu64 "], "
+  "Warning[portid reset on sender]",
+  id_.node, id_.ref,
+  mseq, mfrag, fseq,
+  rcvwnd_.acked_.v(), rcvwnd_.rcv_.v(), rcvwnd_.nacked_space_);
+
+  SendChunkAck(fseq, svc_id, 1);
+  rcvwnd_.rcv_ = fseq;
+  rcvwnd_.acked_ = rcvwnd_.rcv_;
  } else if (Seq16(fseq) <= rcvwnd_.rcv_) {
rc = NCSCC_RC_FAILURE;
// unexpected retransmission



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]

2020-02-09 Thread Minh Hon Chau


Hi aThanh,

ack from me.

Thanks

Minh

On 6/2/20 3:42 pm, Thanh Nguyen wrote:

In the trace record the time value is generated
after acquiring the mutex. The time accuracy is improved
when generated before seizing the mutex.
---
  src/base/logtrace.cc|  2 +-
  src/base/logtrace_client.cc | 15 ---
  src/base/logtrace_client.h  |  9 +
  3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc
index 8908c1ff3..9822879ab 100644
--- a/src/base/logtrace.cc
+++ b/src/base/logtrace.cc
@@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned 
priority,
  if (!entry) {
entry = gl_local_thread_trace->CreateLogEntry(
static_cast(priority),
-  preamble, ap);
+  base::ReadRealtimeClock(), preamble, ap);
  }
  gl_thread_buffer.WriteToBuffer(entry);
}
diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc
index e22112a43..104e08ce1 100644
--- a/src/base/logtrace_client.cc
+++ b/src/base/logtrace_client.cc
@@ -96,32 +96,33 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog,
  const char* LogTraceClient::Log(base::LogMessage::Severity severity,
  const char *fmt, va_list ap) {
if (log_socket_ != nullptr && log_mutex_ != nullptr) {
-return LogInternal(severity, fmt, ap);
+return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap);
}
return nullptr;
  }
  
  const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity,

-const char *fmt, va_list ap) {
+timespec time_spec, const char *fmt, va_list ap) {
base::Lock lock(*log_mutex_);
-  CreateLogEntryInternal(severity, fmt, ap);
+  CreateLogEntryInternal(severity, time_spec, fmt, ap);
log_socket_->Send(buffer_.data(), buffer_.size());
return buffer_.data();
  }
  
  const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity,

-const char *fmt, va_list ap) {
+timespec time_spec, const char *fmt, va_list ap) {
base::Lock lock(*log_mutex_);
-  return CreateLogEntryInternal(severity, fmt, ap);
+  return CreateLogEntryInternal(severity, time_spec, fmt, ap);
  }
  
  const char* LogTraceClient::CreateLogEntryInternal(

-base::LogMessage::Severity severity, const char *fmt, va_list ap) {
+base::LogMessage::Severity severity, timespec time_spec,
+const char *fmt, va_list ap) {
uint32_t id = sequence_id_;
sequence_id_ = id < kMaxSequenceId ? id + 1 : 1;
buffer_.clear();
base::LogMessage::Write(
-  base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(),
+  base::LogMessage::Facility::kLocal1, severity, time_spec,
fqdn_, app_name_, proc_id_, msg_id_,
{{base::LogMessage::SdName{"meta"},
  {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"},
diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h
index 5b165e528..1ccb44d06 100644
--- a/src/base/logtrace_client.h
+++ b/src/base/logtrace_client.h
@@ -44,7 +44,7 @@ class LogTraceClient {
const char* Log(base::LogMessage::Severity severity, const char *fmt,
va_list ap);
const char* CreateLogEntry(base::LogMessage::Severity severity,
-  const char *fmt, va_list ap);
+  timespec time_spec, const char *fmt, va_list ap);
void AddExternalBuffer(int64_t tid, LogTraceBuffer* buffer);
void RemoveExternalBuffer(int64_t tid);
void RequestFlushExternalBuffer();
@@ -56,10 +56,11 @@ class LogTraceClient {
  
   private:

bool Init(const char *msg_id, WriteMode mode);
-  const char* LogInternal(base::LogMessage::Severity severity, const char *fmt,
-  va_list ap);
+
+  const char* LogInternal(base::LogMessage::Severity severity,
+  timespec time_spec, const char *fmt, va_list ap);
const char* CreateLogEntryInternal(base::LogMessage::Severity severity,
-  const char *fmt, va_list ap);
+  timespec time_spec, const char *fmt, va_list ap);
static constexpr const uint32_t kMaxSequenceId = uint32_t{0x7fff};
base::LogMessage::HostName fqdn_{""};
base::LogMessage::AppName app_name_{""};



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]

2020-02-05 Thread Minh Hon Chau


Hi aThanh,

The CreateLogEntry/... are added recently in LogTraceClient.h/cc, and 
the "client" you mean should have been calling the log/trace in 
logtrace.h, which are the OpenSAF services and agents. The real client 
should include the SAF headers in src/ais/include to use SAF services.


Do you see any use cases that "client" should include LogTraceClient.h 
to call CreateLogEntry without logtrace.h?


Thanks, Minh

On 6/2/20 2:05 pm, Thanh Nguyen wrote:

Hello Minh,

1) For LogInternal(..), there is only one version. The new replaced the old. 
This is private method, thus it can be safely replaced.
2) For CreateLogEntry(...) which is a public method. For compatibility reasons, 
I keep the old method and create the new method. I do not know if there is any 
client code of opensaf out there calling CreateLogEntry. If it is known that 
there is no client code, I will remove the old method.

3) For CreateLogEntryInternal(..), I also keep two versions corresponding to 
two versions of calling methods CreateLogEntry(..).

I will remove the old version of (2) and (3) if it is confirmed that there is 
no client code calling CreateLogEntry(..).
Best Regards,
Thanh

-Original Message-
From: Minh Hon Chau [mailto:minh.c...@dektech.com.au]
Sent: Thursday, 6 February 2020 12:48 PM
To: Thanh Nguyen; peter.mcint...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]

Hi aThanh,

The patch adds a new pair of CreateLogEntry/CreateLogEntryInternal with
one extra parameter. If the old one (with 3 parameters) is not being
used anywhere else, we can delete them.

Thanks

Minh

On 24/1/20 11:34 am, Thanh Nguyen wrote:

In the trace record the time value is generated
after acquiring the mutex. The time accuracy is improved
when generated before seizing the mutex.
---
   src/base/logtrace.cc|  2 +-
   src/base/logtrace_client.cc | 18 +-
   src/base/logtrace_client.h  | 13 ++---
   3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc
index 8908c1ff3..9822879ab 100644
--- a/src/base/logtrace.cc
+++ b/src/base/logtrace.cc
@@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned 
priority,
   if (!entry) {
 entry = gl_local_thread_trace->CreateLogEntry(
 static_cast(priority),
-  preamble, ap);
+  base::ReadRealtimeClock(), preamble, ap);
   }
   gl_thread_buffer.WriteToBuffer(entry);
 }
diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc
index e22112a43..484bd17e5 100644
--- a/src/base/logtrace_client.cc
+++ b/src/base/logtrace_client.cc
@@ -96,19 +96,26 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog,
   const char* LogTraceClient::Log(base::LogMessage::Severity severity,
   const char *fmt, va_list ap) {
 if (log_socket_ != nullptr && log_mutex_ != nullptr) {
-return LogInternal(severity, fmt, ap);
+return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap);
 }
 return nullptr;
   }
   
   const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity,

-const char *fmt, va_list ap) {
+timespec time_spec, const char *fmt, va_list ap) {
 base::Lock lock(*log_mutex_);
-  CreateLogEntryInternal(severity, fmt, ap);
+  CreateLogEntryInternal(severity, time_spec, fmt, ap);
 log_socket_->Send(buffer_.data(), buffer_.size());
 return buffer_.data();
   }
   
+const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity,

+timespec time_spec, const char *fmt, va_list ap) {
+  base::Lock lock(*log_mutex_);
+  return CreateLogEntryInternal(severity, time_spec, fmt, ap);
+}
+
+// This is original
   const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity 
severity,
   const char *fmt, va_list ap) {
 base::Lock lock(*log_mutex_);
@@ -116,12 +123,13 @@ const char* 
LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity,
   }
   
   const char* LogTraceClient::CreateLogEntryInternal(

-base::LogMessage::Severity severity, const char *fmt, va_list ap) {
+base::LogMessage::Severity severity, timespec time_spec,
+const char *fmt, va_list ap) {
 uint32_t id = sequence_id_;
 sequence_id_ = id < kMaxSequenceId ? id + 1 : 1;
 buffer_.clear();
 base::LogMessage::Write(
-  base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(),
+  base::LogMessage::Facility::kLocal1, severity, time_spec,
 fqdn_, app_name_, proc_id_, msg_id_,
 {{base::LogMessage::SdName{"meta"},
   {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"},
diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h
index 5b165e528..29aa79b95 100644
--- a/src/base/logtrace_client.h
+++ b/src/base/logtrace_client.h
@@ -45,6 +45,8

Re: [devel] [PATCH 1/1] dtm: improve time accuracy in a trace record [#3144]

2020-02-05 Thread Minh Hon Chau


Hi aThanh,

The patch adds a new pair of CreateLogEntry/CreateLogEntryInternal with 
one extra parameter. If the old ones (with 3 parameters) are not being 
used anywhere else, we can delete them.


Thanks

Minh

On 24/1/20 11:34 am, Thanh Nguyen wrote:

In the trace record, the time value is generated
after acquiring the mutex. The time accuracy is improved
when it is generated before seizing the mutex.
---
  src/base/logtrace.cc|  2 +-
  src/base/logtrace_client.cc | 18 +-
  src/base/logtrace_client.h  | 13 ++---
  3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/base/logtrace.cc b/src/base/logtrace.cc
index 8908c1ff3..9822879ab 100644
--- a/src/base/logtrace.cc
+++ b/src/base/logtrace.cc
@@ -97,7 +97,7 @@ void trace_output(const char *file, unsigned line, unsigned 
priority,
  if (!entry) {
entry = gl_local_thread_trace->CreateLogEntry(
static_cast(priority),
-  preamble, ap);
+  base::ReadRealtimeClock(), preamble, ap);
  }
  gl_thread_buffer.WriteToBuffer(entry);
}
diff --git a/src/base/logtrace_client.cc b/src/base/logtrace_client.cc
index e22112a43..484bd17e5 100644
--- a/src/base/logtrace_client.cc
+++ b/src/base/logtrace_client.cc
@@ -96,19 +96,26 @@ const char* LogTraceClient::Log(LogTraceClient* tracelog,
  const char* LogTraceClient::Log(base::LogMessage::Severity severity,
  const char *fmt, va_list ap) {
if (log_socket_ != nullptr && log_mutex_ != nullptr) {
-return LogInternal(severity, fmt, ap);
+return LogInternal(severity, base::ReadRealtimeClock(), fmt, ap);
}
return nullptr;
  }
  
  const char* LogTraceClient::LogInternal(base::LogMessage::Severity severity,

-const char *fmt, va_list ap) {
+timespec time_spec, const char *fmt, va_list ap) {
base::Lock lock(*log_mutex_);
-  CreateLogEntryInternal(severity, fmt, ap);
+  CreateLogEntryInternal(severity, time_spec, fmt, ap);
log_socket_->Send(buffer_.data(), buffer_.size());
return buffer_.data();
  }
  
+const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity,

+timespec time_spec, const char *fmt, va_list ap) {
+  base::Lock lock(*log_mutex_);
+  return CreateLogEntryInternal(severity, time_spec, fmt, ap);
+}
+
+// This is original
  const char* LogTraceClient::CreateLogEntry(base::LogMessage::Severity 
severity,
  const char *fmt, va_list ap) {
base::Lock lock(*log_mutex_);
@@ -116,12 +123,13 @@ const char* 
LogTraceClient::CreateLogEntry(base::LogMessage::Severity severity,
  }
  
  const char* LogTraceClient::CreateLogEntryInternal(

-base::LogMessage::Severity severity, const char *fmt, va_list ap) {
+base::LogMessage::Severity severity, timespec time_spec,
+const char *fmt, va_list ap) {
uint32_t id = sequence_id_;
sequence_id_ = id < kMaxSequenceId ? id + 1 : 1;
buffer_.clear();
base::LogMessage::Write(
-  base::LogMessage::Facility::kLocal1, severity, base::ReadRealtimeClock(),
+  base::LogMessage::Facility::kLocal1, severity, time_spec,
fqdn_, app_name_, proc_id_, msg_id_,
{{base::LogMessage::SdName{"meta"},
  {base::LogMessage::Parameter{base::LogMessage::SdName{"sequenceId"},
diff --git a/src/base/logtrace_client.h b/src/base/logtrace_client.h
index 5b165e528..29aa79b95 100644
--- a/src/base/logtrace_client.h
+++ b/src/base/logtrace_client.h
@@ -45,6 +45,8 @@ class LogTraceClient {
va_list ap);
const char* CreateLogEntry(base::LogMessage::Severity severity,
const char *fmt, va_list ap);
+  const char* CreateLogEntry(base::LogMessage::Severity severity,
+  timespec time_spec, const char *fmt, va_list ap);
void AddExternalBuffer(int64_t tid, LogTraceBuffer* buffer);
void RemoveExternalBuffer(int64_t tid);
void RequestFlushExternalBuffer();
@@ -56,10 +58,15 @@ class LogTraceClient {
  
   private:

bool Init(const char *msg_id, WriteMode mode);
-  const char* LogInternal(base::LogMessage::Severity severity, const char *fmt,
-  va_list ap);
+
+  const char* LogInternal(base::LogMessage::Severity severity,
+  timespec time_spec, const char *fmt, va_list ap);
const char* CreateLogEntryInternal(base::LogMessage::Severity severity,
-  const char *fmt, va_list ap);
+  timespec time_spec, const char *fmt, va_list ap);
+  inline const char* CreateLogEntryInternal(
+  base::LogMessage::Severity severity, const char *fmt, va_list ap) {
+return CreateLogEntryInternal(severity, base::ReadRealtimeClock(), fmt, 
ap);
+  }
static constexpr const uint32_t kMaxSequenceId = uint32_t{0x7fff};
base::LogMessage::HostName fqdn_{""};
base::LogMessage::AppName app_name_{""};



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] log: fix segmentation fault in log agent [#3137]

2020-01-05 Thread Minh Hon CHAU


Hi Vu,

Ack(review).

Thanks,
Minh
Quoting Vu Minh Nguyen :


The log agent did not protect the `unacked_invocations_` list from
concurrent access by multiple threads, which caused a segmentation fault.

This patch introduces a mutex in order to synchronize the access to that
common resource.
---
 src/log/agent/lga_client.cc |  2 +-
 src/log/agent/lga_client.h  | 16 +++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/log/agent/lga_client.cc b/src/log/agent/lga_client.cc
index cdc54904a..2eb37a0f7 100644
--- a/src/log/agent/lga_client.cc
+++ b/src/log/agent/lga_client.cc
@@ -86,7 +86,7 @@ LogClient::~LogClient() {
   }

   stream_list_.clear();
-  unacked_invocations_.clear();
+  CleanUnackedList();

   // Free the client handle allocated to this log client
   if (handle_ != 0) {
diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h
index f5fa6faa4..e6e2c911e 100644
--- a/src/log/agent/lga_client.h
+++ b/src/log/agent/lga_client.h
@@ -174,13 +174,18 @@ class LogClient {
   // get acknowledgement from it.
   void KeepTrack(SaInvocationT inv, uint32_t ack_flags) {
 if (ack_flags != SA_LOG_RECORD_WRITE_ACK) return;
+base::Lock scope_lock{mutex_unacked_list_};
 unacked_invocations_.push_back(inv);
   }

   // Got an acknowledgment, so remove from the track list.
-  void RemoveTrack(SaInvocationT inv) { unacked_invocations_.remove(inv); }
+  void RemoveTrack(SaInvocationT inv) {
+base::Lock scope_lock{mutex_unacked_list_};
+unacked_invocations_.remove(inv);
+  }

   void NotifyClientAboutLostInvocations() {
+base::Lock scope_lock{mutex_unacked_list_};
 for (const auto& i : unacked_invocations_) {
   TRACE("The write async with this invocation %lld has been lost", i);
   // the below memory will be freed by lga_msg_destroy(cbk_msg)
@@ -232,6 +237,11 @@ class LogClient {
   // Invoke the registered callback
   void InvokeCallback(const lgsv_msg_t* msg);

+  void CleanUnackedList() {
+base::Lock scope_lock{mutex_unacked_list_};
+unacked_invocations_.clear();
+  }
+
   // Delete all messages from the mailbox
   static bool ClearMailBox(NCSCONTEXT, NCSCONTEXT);

@@ -290,6 +300,10 @@ class LogClient {
   // If cluster goes to headless, log agent will inform to log client with
   // SA_AIS_ERR_TRY_AGAIN code for these invocations.
   std::list unacked_invocations_{};
+
+  // To protect the `unacked_invocations_` list.
+  base::Mutex mutex_unacked_list_{};
+
   // LOG handle (derived from hdl-mngr)
   SaLogHandleT handle_;

--
2.17.1





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] log: fix segmentation fault in log agent [#3137]

2020-01-05 Thread Minh Hon Chau


Hi Vu,

Don't you need to protect the list in ~LogClient()? And in 
NotifyClientAboutLostInvocations(), doesn't the lock need to be taken 
before the list is read in the 'for' loop? Otherwise it's an ack from me.


Thanks

Minh

On 6/1/20 2:15 pm, Vu Minh Nguyen wrote:

The log agent did not protect the `unacked_invocations_` list from
concurrent access by multiple threads, which caused a segmentation fault.

This patch introduces a mutex in order to synchronize the access to that
common resource.
---
  src/log/agent/lga_client.h | 12 +++-
  1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h
index f5fa6faa4..c999d148e 100644
--- a/src/log/agent/lga_client.h
+++ b/src/log/agent/lga_client.h
@@ -174,11 +174,15 @@ class LogClient {
// get acknowledgement from it.
void KeepTrack(SaInvocationT inv, uint32_t ack_flags) {
  if (ack_flags != SA_LOG_RECORD_WRITE_ACK) return;
+base::Lock scope_lock{mutex_unacked_list_};
  unacked_invocations_.push_back(inv);
}
  
// Got an acknowledgment, so remove from the track list.

-  void RemoveTrack(SaInvocationT inv) { unacked_invocations_.remove(inv); }
+  void RemoveTrack(SaInvocationT inv) {
+base::Lock scope_lock{mutex_unacked_list_};
+unacked_invocations_.remove(inv);
+  }
  
void NotifyClientAboutLostInvocations() {

  for (const auto& i : unacked_invocations_) {
@@ -196,6 +200,8 @@ class LogClient {
  
SendMsgToMbx(msg, MDS_SEND_PRIORITY_HIGH);

  }
+
+base::Lock scope_lock{mutex_unacked_list_};
  unacked_invocations_.clear();
}
  
@@ -290,6 +296,10 @@ class LogClient {

// If cluster goes to headless, log agent will inform to log client with
// SA_AIS_ERR_TRY_AGAIN code for these invocations.
std::list unacked_invocations_{};
+
+  // To protect the `unacked_invocations_` list.
+  base::Mutex mutex_unacked_list_{};
+
// LOG handle (derived from hdl-mngr)
SaLogHandleT handle_;
  



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]

2019-12-09 Thread Minh Hon Chau


Hi Thuan,

- We could give the patch title a bit more meaning than "fix ckpt 20 
11..", for example, something like "Using timer to continue sending queued 
message".


- And a few comments inline

Thanks

Minh

On 5/12/19 3:05 pm, thuan.tran wrote:

- On overflow, handling a received chunk ack may get stuck retrying to send
pending messages, so chunk acks arriving later cannot be processed.
- Instead of retrying to send pending messages, reuse the chunk ack send
timer to trigger sending of pending messages, if any. This way, even if no
more Nack or ChunkAck events arrive, pending messages will be sent by the timer.
---
  src/mds/mds_dt_tipc.c| 12 ++---
  src/mds/mds_tipc_fctrl_intf.cc   | 10 
  src/mds/mds_tipc_fctrl_portid.cc | 88 ++--
  src/mds/mds_tipc_fctrl_portid.h  |  1 +
  4 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 9b3290833..6b30846a1 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -3183,13 +3183,13 @@ ssize_t mds_retry_sendto(int sockfd, const void *buf, 
size_t len, int flags,
  {
int retry = 5;
ssize_t send_len = 0;
-   while (retry >= 0) {
+   while (retry-- >= 0) {
send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen);
if (send_len == len) {
return send_len;
-   } else if (retry-- > 0) {
-   if (errno != ENOMEM &&
-   errno != ENOBUFS &&
+   } else if (retry >= 0) {
+   if (errno != EAGAIN && errno != EWOULDBLOCK &&
+   errno != ENOMEM && errno != ENOBUFS &&
errno != EINTR)
break;
osaf_nanosleep();


[Minh] We may need to log an error with strerror and errno in case of 
failure in mds_retry_sendto(). Also,


uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) {

...

  m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno));
}

the "sendto()" in this log message should now be "TipcPortId::Send()"


@@ -3242,7 +3242,7 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t 
buff_len,
if (mds_tipc_fctrl_trysend(id, buffer, buff_len, is_queued)
== NCSCC_RC_SUCCESS) {
send_len = mds_retry_sendto(
-   tipc_cb.BSRsock, buffer, buff_len, 0,
+   tipc_cb.BSRsock, buffer, buff_len, MSG_DONTWAIT,
(struct sockaddr *)_addr, 
sizeof(server_addr));
[Minh] Is there a particular reason that you want to use a non-blocking 
send with MSG_DONTWAIT?

if (send_len == buff_len) {
m_MDS_LOG_INFO("MDTM: Successfully sent message");
@@ -3289,7 +3289,7 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t 
size,
/*This can be scope-down to dest_svc_id  server_inst TBD*/
server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE);
ssize_t send_len =
-   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0,
+   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, MSG_DONTWAIT,
   (struct sockaddr *)_addr, sizeof(server_addr));
  
  	if (send_len == size) {

diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index 7d0571e7c..b20205686 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -102,6 +102,8 @@ void tmr_exp_cbk(void* uarg) {
  
  void process_timer_event(const Event& evt) {

bool txprob_restart = false;
+  m_MDS_LOG_DBG("FCTRL: process timer event start [evt:%d]",
+static_cast(evt.type_));
for (auto i : portid_map) {
  TipcPortId* portid = i.second;
  
@@ -113,16 +115,20 @@ void process_timer_event(const Event& evt) {
  
  if (evt.type_ == Event::Type::kEvtTmrChunkAck) {

portid->ReceiveTmrChunkAck();
+  portid->SendUnsentMsg();
  }
[Minh] The idea now is to use the ChunkAck timer to continue sending unsent 
messages. This fix comes from a situation where we failed in the middle of 
sending unsent messages due to "Cannot allocate memory...". In the 
scenario without such a "Cannot allocate ..." error, the function 
SendUnsentMsg() here will be sending extra messages from the "receiving 
channel" (the ChunkAck timer) in addition to the "sending channel" 
(ReceiveChunkAck()). That would cause more undeliverable messages (the 
ones that are now sent from the ChunkAck timer) if overloading starts to 
happen and the sender keeps pushing more messages to send (more messages 
pushed into the queue).

}
if (txprob_restart) {
  txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
  m_MDS_LOG_DBG("FCTRL: Restart txprob");
}
+  m_MDS_LOG_DBG("FCTRL: process timer event end");
  }
  
  uint32_t process_flow_event(const Event& evt) {

uint32_t rc = NCSCC_RC_SUCCESS;
+  m_MDS_LOG_DBG("FCTRL: process flow event start [evt:%d]",
+static_cast(evt.type_));

Re: [devel] [PATCH 0/1] Review Request for mds: not waste 1.5s in waiting dead Adest to send RSP [#3102] V2 (updated)

2019-12-04 Thread Minh Hon Chau


Hi Thuan

One minor comment: we could separate this commit into two, one for the 
code change and one for the test case.


@Vu, you have any comments?

Thanks

Minh

On 27/11/19 1:21 pm, thuan.tran wrote:

Summary: mds: not waste 1.5s in waiting dead Adest to send RSP [#3102]
Review request for Ticket(s): 3102
Peer Reviewer(s): Minh, Vu, Thang, Gary
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-3102
Base revision: b61bee5c8accd79e573ef726d40b945afc7c7b3e
Personal repository: git://git.code.sf.net/u/thuantr/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesn
  OpenSAF servicesn
  Core libraries  y
  Samples n
  Tests   y
  Other   n

NOTE: Patch(es) contain lines longer than 80 characters

Comments (indicate scope for each "y" above):
-
N/A

revision f4f5ab3efe19bdd11c5cb43e4f4d48af79656737
Author: thuan.tran 
Date:   Tue, 26 Nov 2019 15:58:34 +0700

mds: not waste 1.5s in waiting dead Adest to send RSP [#3102]

- When sending a response message to an Adest which no longer exists (crashed/terminated),
current MDS waits for 1.5 seconds before concluding that there is no route to send RSP.

- Here are scenarios that may waste 1.5s waiting:
SVCs DOWN (dead adest or vdest role change) -> get SNDRSP -> send RSP (wait 
1.5s)
get SNDRSP -> SVCs DOWN (dead adest or vdest role change) -> send RSP (wait 
1.5s)
This long wait time causes trouble for higher layer services, e.g. ntf, imm, 
etc.,
where many agents send initialize requests (using the SNDRSP message type).

- Solution: create an adest list; a timer starts when the last SVC of an adest goes DOWN.
When sending RSP to this adest, the wait time is reduced to only 10ms.
Note that the following original behavior is kept:
No any SVC UP before -> get SNDRSP -> send RSP (wait 1.5s)

- New TC tet_send_response_tp_13() is created to verify this scenario.



Complete diffstat:
--
  src/mds/apitest/mdstipc.h  |   1 +
  src/mds/apitest/mdstipc_api.c  | 107 ++
  src/mds/apitest/mdstipc_conf.c |   1 -
  src/mds/mds_c_api.c| 199 +
  src/mds/mds_c_sndrcv.c |  38 +---
  src/mds/mds_core.h |  30 ++-
  src/mds/mds_dt2c.h |   2 +-
  src/mds/mds_dt_common.c|  24 -
  src/mds/mds_main.c |   4 +
  9 files changed, 350 insertions(+), 56 deletions(-)


Testing Commands:
-
N/A

Testing, Expected Results:
--
N/A

Conditions of Submission:
-
ACK by reviewers

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as threaded
 commits, or place in a public tree for a pull.

___ You have resent this content multiple times without a clear indication
 of what has changed between each re-send.

___ You have failed to adequately and

Re: [devel] [PATCH 1/1] mds: close sockets at the end of mdtm_tipc_destroy() [#3125]

2019-12-04 Thread Minh Hon Chau


hi Thuan,

ack (review only).

Thanks

Minh

On 3/12/19 7:28 pm, thuan.tran wrote:

Also create a wrapper of sendto() to retry if errno is ENOMEM/ENOBUFS/EINTR.
For other errors, return instead of calling assert(), which causes a coredump.
---
  src/mds/mds_dt_tipc.c| 47 +++
  src/mds/mds_dt_tipc.h|  3 ++
  src/mds/mds_tipc_fctrl_portid.cc | 65 +++-
  3 files changed, 74 insertions(+), 41 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index fdf0da7fb..b0f38ee49 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -51,6 +51,7 @@
  #include "mds_tipc_recvq_stats.h"
  #include "base/osaf_utility.h"
  #include "base/osaf_poll.h"
+#include "base/osaf_time.h"
  
  #ifndef SOCK_CLOEXEC

  enum { SOCK_CLOEXEC = 0x8 };
@@ -523,9 +524,7 @@ uint32_t mdtm_tipc_destroy(void)
MDTM_REASSEMBLY_QUEUE *reassem_queue = NULL;
MDTM_REASSEMBLY_KEY reassembly_key;
  
-	/* close sockets first */

-   close(tipc_cb.BSRsock);
-   close(tipc_cb.Dsock);
+   mds_tipc_fctrl_shutdown();
  
  	/* Destroy receiving task */

if (mdtm_destroy_rcv_task() != NCSCC_RC_SUCCESS) {
@@ -537,7 +536,6 @@ uint32_t mdtm_tipc_destroy(void)
 NULL);
m_NCS_IPC_RELEASE(_cb.tmr_mbx,
  (NCS_IPC_CB)mdtm_mailbox_mbx_cleanup);
-   mds_tipc_fctrl_shutdown();
/* Clear reference hdl list */
while (mdtm_ref_hdl_list_hdr != NULL) {
/* Store temporary the pointer of entry to be deleted */
@@ -587,6 +585,9 @@ uint32_t mdtm_tipc_destroy(void)
handle = 0;
mdtm_global_frag_num = 0;
  
+	close(tipc_cb.BSRsock);

+   close(tipc_cb.Dsock);
+
return NCSCC_RC_SUCCESS;
  }
  
@@ -3135,6 +3136,37 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num,

return NCSCC_RC_SUCCESS;
  }
  
+/*

+
+  Function NAME: mds_retry_sendto
+
+  DESCRIPTION: wrapper of sendto() for retry purpose
+
+  ARGUMENTS: same as sendto()
+
+  RETURNS: same as sendto()
+
+*/
+ssize_t mds_retry_sendto(int sockfd, const void *buf, size_t len, int flags,
+   const struct sockaddr *dest_addr, socklen_t addrlen)
+{
+   int retry = 5;
+   ssize_t send_len = 0;
+   while (retry >= 0) {
+   send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen);
+   if (send_len == len) {
+   return send_len;
+   } else if (retry-- > 0) {
+   if (errno != ENOMEM &&
+   errno != ENOBUFS &&
+   errno != EINTR)
+   break;
+   osaf_nanosleep();
+   }
+   }
+   return send_len;
+}
+
  /*
  
Function NAME: mdtm_sendto

@@ -3176,7 +3208,8 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t 
buff_len,
}
  #endif
if (mds_tipc_fctrl_trysend(buffer, buff_len, id) == NCSCC_RC_SUCCESS) {
-   send_len = sendto(tipc_cb.BSRsock, buffer, buff_len, 0,
+   send_len = mds_retry_sendto(
+   tipc_cb.BSRsock, buffer, buff_len, 0,
(struct sockaddr *)_addr, 
sizeof(server_addr));
if (send_len == buff_len) {
m_MDS_LOG_INFO("MDTM: Successfully sent message");
@@ -3222,8 +3255,8 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t 
size,
server_addr.addr.nameseq.lower = HTONL(MDS_MDTM_LOWER_INSTANCE);
/*This can be scope-down to dest_svc_id  server_inst TBD*/
server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE);
-   int send_len =
-   sendto(tipc_cb.BSRsock, buffer, size, 0,
+   ssize_t send_len =
+   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0,
   (struct sockaddr *)_addr, sizeof(server_addr));
  
  	if (send_len == size) {

diff --git a/src/mds/mds_dt_tipc.h b/src/mds/mds_dt_tipc.h
index e73a11b09..65175839e 100644
--- a/src/mds/mds_dt_tipc.h
+++ b/src/mds/mds_dt_tipc.h
@@ -107,4 +107,7 @@ extern uint32_t mds_mdtm_node_subscribe_tipc(MDS_SVC_HDL 
svc_hdl,
   MDS_SUBTN_REF_VAL 
*subtn_ref_val);
  extern uint32_t mds_mdtm_node_unsubscribe_tipc(MDS_SUBTN_REF_VAL 
subtn_ref_val);
  
+ssize_t mds_retry_sendto(int sockfd, const void *buf, size_t len, int flags,

+   const struct sockaddr *dest_addr, socklen_t addrlen);
+
  #endif  // MDS_MDS_DT_TIPC_H_
diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index dab2b8c69..6b033b0e5 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -17,11 +17,14 @@
  
  #include

Re: [devel] [PATCH 0/2] Review Request for mds: Avoid message reallocation [#3089] V3

2019-12-02 Thread Minh Hon Chau


Hi Vu, Thuan

Any comments on the patches?

Thanks

Minh

On 28/11/19 10:54 pm, Minh Chau wrote:

Summary: mds: Avoid message reallocation [#3089]
Review request for Ticket(s): 3089
Peer Reviewer(s): Thuan, Vu, Gary
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-3089
Base revision: 8e07c19aed63c249f4e7fa8470270d2de1a56046
Personal repository: git://git.code.sf.net/u/minh-chau/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesn
  OpenSAF servicesn
  Core libraries  y
  Samples n
  Tests   n
  Other   n

NOTE: Patch(es) contain lines longer than 80 characters

Comments (indicate scope for each "y" above):
-
*** EXPLAIN/COMMENT THE PATCH SERIES HERE ***

revision d3bdf53e99523785cdc932d62b25267ea900c643
Author: Minh Chau 
Date:   Thu, 28 Nov 2019 21:08:50 +1100

mds: Avoid message reallocation [#3089]

The patch avoids message reallocation if the message is in
retransmission queue



revision 7be0f5404ebb8ec5b8752813899d6aefd1ef6c33
Author: Minh Chau 
Date:   Thu, 28 Nov 2019 21:08:38 +1100

mds: Improve readability [#3089]

Correct indent and reduce code lines (<80 chars) for
mds_mdtm_send_tipc() and mdtm_frag_and_send()



Complete diffstat:
--
  src/mds/mds_dt_tipc.c| 534 +--
  src/mds/mds_tipc_fctrl_intf.cc   |   6 +-
  src/mds/mds_tipc_fctrl_intf.h|   4 +-
  src/mds/mds_tipc_fctrl_msg.cc|   2 +-
  src/mds/mds_tipc_fctrl_portid.cc |   9 +-
  5 files changed, 294 insertions(+), 261 deletions(-)


Testing Commands:
-
*** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES ***


Testing, Expected Results:
--
*** PASTE COMMAND OUTPUTS / TEST RESULTS ***


Conditions of Submission:
-
*** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC ***


Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  n  n
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as threaded
 commits, or place in a public tree for a pull.

___ You have resent this content multiple times without a clear indication
 of what has changed between each re-send.

___ You have failed to adequately and individually address all of the
 comments and change requests that were proposed in the initial review.

___ You have a misconfigured ~/.gitconfig file (i.e. user.name, user.email etc)

___ Your computer has a badly configured date and time; confusing the
 threaded patch review.

___ Your changes affect IPC mechanism, and you don't present any results
 for in-service upgradability test.

___ Your changes affect user manual and documentation, your patch series
 do not contain the patch that updates the Doxygen manual.





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net

Re: [devel] [PATCH 1/1] amfd: not accept lock-in if su is reparing [#3121]

2019-12-02 Thread Minh Hon Chau


Hi Thang,

I assume you have tried and there is no way to reuse the current *state* 
of the SU to prevent the lock-in op in this scenario, and that this patch 
tested ok with upgrade/downgrade. The downside of adding a checkpoint is that 
we will be stuck with it even if we find a better solution later on, since 
removing the checkpoint would cause an NBC (non-backward-compatible change).


No comments from me.

Thanks

Minh

On 2/12/19 2:39 pm, thang.d.nguyen wrote:

AMFD should not accept the lock-in admin op on an SU
if the SU is being repaired.
---
  src/amf/amfd/chkop.cc |  9 +
  src/amf/amfd/ckpt.h   |  3 ++-
  src/amf/amfd/ckpt_dec.cc  | 42 +--
  src/amf/amfd/ckpt_enc.cc  | 30 +++-
  src/amf/amfd/ckpt_msg.h   |  1 +
  src/amf/amfd/ckpt_updt.cc |  1 +
  src/amf/amfd/sgproc.cc|  1 +
  src/amf/amfd/su.cc| 19 ++
  src/amf/amfd/su.h |  3 +++
  9 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/chkop.cc b/src/amf/amfd/chkop.cc
index 56b0142a6..15408b657 100644
--- a/src/amf/amfd/chkop.cc
+++ b/src/amf/amfd/chkop.cc
@@ -923,6 +923,14 @@ uint32_t avsv_send_ckpt_data(AVD_CL_CB *cb, uint32_t 
action,
  /* No need to send the message as standy would get the applier 
callback
   */
  return NCSCC_RC_SUCCESS;
+case AVSV_CKPT_SU_INST_PROCESSED:
+  if (avd_cb->avd_peer_ver < AVD_MBCSV_SUB_PART_VERSION_11) {
+/* No need to send the message to old std as this async is newly added.
+ */
+return NCSCC_RC_SUCCESS;
+  }
+  cb->async_updt_cnt.su_updt++;
+  break;
  /* else fall through */
  case AVSV_CKPT_SU_SI_CURR_ACTIVE:
  case AVSV_CKPT_SU_SI_CURR_STBY:
@@ -1366,6 +1374,7 @@ static uint32_t avsv_validate_reo_type_in_csync(AVD_CL_CB 
*cb,
  case AVSV_CKPT_SU_SI_CURR_STBY:
  case AVSV_CKPT_SU_ADMIN_STATE:
  case AVSV_CKPT_SU_TERM_STATE:
+case AVSV_CKPT_SU_INST_PROCESSED:
  case AVSV_CKPT_SU_SWITCH:
  case AVSV_CKPT_SU_OPER_STATE:
  case AVSV_CKPT_SU_PRES_STATE:
diff --git a/src/amf/amfd/ckpt.h b/src/amf/amfd/ckpt.h
index 2e1538719..f092f5b8c 100644
--- a/src/amf/amfd/ckpt.h
+++ b/src/amf/amfd/ckpt.h
@@ -35,9 +35,10 @@
  #define AMF_AMFD_CKPT_H_
  
  // current version

-#define AVD_MBCSV_SUB_PART_VERSION 10
+#define AVD_MBCSV_SUB_PART_VERSION 11
  
  // supported versions

+#define AVD_MBCSV_SUB_PART_VERSION_11 11
  #define AVD_MBCSV_SUB_PART_VERSION_10 10
  #define AVD_MBCSV_SUB_PART_VERSION_9 9
  #define AVD_MBCSV_SUB_PART_VERSION_8 8
diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 75213f821..7030f43b1 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -63,6 +63,7 @@ static uint32_t dec_su_si_curr_active(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec);
  static uint32_t dec_su_si_curr_stby(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
  static uint32_t dec_su_admin_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
  static uint32_t dec_su_term_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
+static uint32_t dec_su_inst_msg_processed(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC 
*dec);
  static uint32_t dec_su_switch(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
  static uint32_t dec_su_oper_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
  static uint32_t dec_su_pres_state(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec);
@@ -162,8 +163,8 @@ const AVSV_DECODE_CKPT_DATA_FUNC_PTR 
avd_dec_data_func_list[] = {
  dec_comp_pres_state, dec_comp_restart_count, nullptr, /* AVSV_SYNC_COMMIT 
*/
  dec_su_restart_count, dec_si_dep_state, dec_ng_admin_state,
  dec_avd_to_avd_job_queue_status,
-dec_node_failover_state
-
+dec_node_failover_state,
+dec_su_inst_msg_processed
  };
  
  /*

@@ -445,6 +446,9 @@ static void decode_su(NCS_UBAID *ub, AVD_SU *su, uint16_t 
peer_version) {
  
if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_2)

  osaf_decode_bool(ub, >su_is_external);
+
+  if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_11)
+osaf_decode_bool(ub, >is_inst_msg_processed);
  }
  
  /\

@@ -1538,6 +1542,40 @@ static uint32_t dec_su_term_state(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
return NCSCC_RC_SUCCESS;
  }
  
+/\

+ *
+ * Purpose:  Decode SU inst msg of service
+ *
+ * Input: cb - CB pointer.
+ *dec - Decode arguments passed by MBCSV.
+ *
+ * Returns: NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
+ *
+ * NOTES:
+ *
+ *
+\**/
+static uint32_t dec_su_inst_msg_processed(
+ AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec) {
+  SaNameT name;
+
+  TRACE_ENTER();
+
+  osaf_decode_sanamet(>i_uba, );
+  AVD_SU *su = su_db->find(Amf::to_string());
+  osafassert(su != nullptr);
+  osaf_decode_uint32(>i_uba,
+reinterpret_cast(>is_inst_msg_processed));
+
+

Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]

2019-11-28 Thread Minh Hon Chau


Hi Thuan,

ack with comments.

Thanks

Minh

On 28/11/19 6:55 pm, thuan.tran wrote:

When overflow happens, MDS with flow control enabled may keep
all messages in the queue if it fails to send a message when receiving
a Nack or ChunkAck, since no more triggers come after that.
MDS flow control should retry sending the message in this scenario.
---
  src/mds/mds_tipc_fctrl_portid.cc | 47 ++--
  1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 316e1ba75..d5314d5bc 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -17,6 +17,7 @@
  
  #include "mds/mds_tipc_fctrl_portid.h"

  #include "base/ncssysf_def.h"
+#include "base/osaf_time.h"
  
  #include "mds/mds_dt.h"

  #include "mds/mds_log.h"
@@ -149,23 +150,24 @@ void TipcPortId::FlushData() {
  
  uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) {

struct sockaddr_tipc server_addr;
-  ssize_t send_len = 0;
-  uint32_t rc = NCSCC_RC_SUCCESS;
-
memset(_addr, 0, sizeof(server_addr));
server_addr.family = AF_TIPC;
server_addr.addrtype = TIPC_ADDR_ID;
server_addr.addr.id = id_;
-  send_len = sendto(bsrsock_, data, length, 0,
-(struct sockaddr *)_addr, sizeof(server_addr));
-
-  if (send_len == length) {
-rc = NCSCC_RC_SUCCESS;
-  } else {
-m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno));
-rc = NCSCC_RC_FAILURE;
+  int retry = 5;
+  while (retry >= 0) {
+ssize_t send_len = sendto(bsrsock_, data, length, 0,
+  (struct sockaddr *)_addr, sizeof(server_addr));
+
+if (send_len == length) {
+  return NCSCC_RC_SUCCESS;
+} else if (retry-- > 0) {
+  assert(errno == ENOMEM || errno == ENOBUFS);
+  osaf_nanosleep();
+}
}
[Minh] It might be a good idea to make a wrapper for sendto(), since 
sendto() is currently called in both fctrl_portid.cc and mds_dt_tipc.c. Then we 
only call the wrapper, which handles the error codes of 
sendto(). I think EINTR is the only error code that needs to be checked; there 
are a few places in opensaf that handle the error codes of sendto(), which we can 
take as reference.

-  return rc;
+  m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno));
+  return NCSCC_RC_FAILURE;
  }
  
  uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length,

@@ -440,13 +442,16 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  // try to send a few pending msg
  DataMessage* msg = nullptr;
  uint16_t send_msg_cnt = 0;
-while (send_msg_cnt++ < chunk_size_) {
+int retry = 0;
+while (send_msg_cnt < chunk_size_) {
// find the lowest sequence unsent yet
msg = sndqueue_.FirstUnsent();
if (msg == nullptr) {
  break;
} else {
if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
+retry = 0;
+send_msg_cnt++;
  msg->is_sent_ = true;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "SndQData[fseq:%u, len:%u], "
@@ -454,6 +459,12 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  id_.node, id_.ref,
  msg->header_.fseq_, msg->header_.msg_len_,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
+  } else if (send_msg_cnt == 0) {
+// If not retry, all messages are kept in queue
+// and no more trigger to send messages
+retry++;
+assert(retry < 100);
+continue;


[Minh] We can accept using the assert for now, and 100 should be 
defined as a constant. But I do think we need a fallback mechanism: if the 
socket fd is not able to send data, we can terminate the portid and 
trigger an MDS_DOWN event, ... and this could be looked at in another ticket.


Also, the patch title does not seem to be right in the context of this 
ticket, where we have the problem of "Cannot allocate memory": we might not 
be able to send any more message (not that for all) and hit the assert. 
We can say "Add retry for tipc sendto()", unless you have a better 
description for it.



} else {
  break;
}
@@ -508,9 +519,15 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag,
DataMessage* msg = sndqueue_.Find(Seq16(fseq));
if (msg != nullptr) {
  // Resend the msg found
-if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) {
-  msg->is_sent_ = true;
+int retry = 0;
+while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) {
+  // If not retry, all messages are kept in queue
+  // and no more trigger to send messages
+  retry++;
+  assert(retry < 100);
+  continue;
  }
+msg->is_sent_ = true;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "RsndData[mseq:%u, mfrag:%u, fseq:%u], "

Re: [devel] [PATCH 2/2] mds: Avoid message reallocation [#3089]

2019-11-27 Thread Minh Hon Chau


Hi Thuan,

We should free() the memory at the same function level where the memory 
is allocated. The @buffer passed to mdtm_sendto() could be from 
stack memory (as it used to be before this patch).


Thanks

Minh

On 27/11/19 5:40 pm, Tran Thuan wrote:

Hi Minh,

Why not free() inside mdtm_sendto() and mdtm_mcast_sendto()?
It will help reduce much code change.

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Tuesday, November 26, 2019 7:02 PM
To: thuan.t...@dektech.com.au; vu.m.ngu...@dektech.com.au; 
gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau 
Subject: [PATCH 2/2] mds: Avoid message reallocation [#3089]

The patch avoids message reallocation if the message is in
retransmission queue
---
  src/mds/mds_dt_tipc.c| 42 +++-
  src/mds/mds_tipc_fctrl_intf.cc   |  6 --
  src/mds/mds_tipc_fctrl_intf.h|  4 ++--
  src/mds/mds_tipc_fctrl_msg.cc|  2 +-
  src/mds/mds_tipc_fctrl_portid.cc |  9 +++--
  5 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 16cf11b..866c370 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -120,7 +120,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req);
  
  /* Tipc actual send, can be made as Macro even*/

  static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t buff_len,
-   struct tipc_portid tipc_id);
+   struct tipc_portid tipc_id, uint8_t *is_queued);
  static uint32_t mdtm_mcast_sendto(void *buffer, size_t size,
  const MDTM_SEND_REQ *req);
  
@@ -2643,7 +2643,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)

if (req->snd_type == MDS_SENDTYPE_ACK ||
req->snd_type == MDS_SENDTYPE_RACK) {
uint8_t len = mds_and_mdtm_hdr_len;
-   uint8_t buffer_ack[len];
+   uint8_t *buffer_ack = calloc(1, len);
+   uint8_t is_queued = 0;
  
  			/* Add mds_hdr */

if (mdtm_add_mds_hdr(buffer_ack, req)
@@ -2657,18 +2658,24 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
_seq_num) == NCSCC_RC_FAILURE){
m_MDS_LOG_ERR("FCTRL: Failed to send message"
" len :%d", len);
+   free(buffer_ack);
return NCSCC_RC_FAILURE;
}
/* Add frag_hdr */
if (mdtm_add_frag_hdr(buffer_ack, len, frag_seq_num,
0, fctrl_seq_num) != NCSCC_RC_SUCCESS) {
+   free(buffer_ack);
return NCSCC_RC_FAILURE;
}
  
  			m_MDS_LOG_DBG("MDTM:Sending message with Service"

" Seqno=%d, TO Dest_Tipc_id=<0x%08x:%u> ",
req->svc_seq_num, tipc_id.node, tipc_id.ref);
-   return mdtm_sendto(buffer_ack, len, tipc_id);
+   status = mdtm_sendto(buffer_ack, len, tipc_id,
+   _queued);
+   if (is_queued == 0)
+   free(buffer_ack);
+   return status;
}
  
  		if (req->msg.encoding == MDS_ENC_TYPE_FLAT) {

@@ -2730,6 +2737,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
} else {
uint8_t *p8;
uint8_t *body = NULL;
+   uint8_t is_queued = 0;
  
  body = calloc(1, len +

mds_and_mdtm_hdr_len);
@@ -2824,7 +2832,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
return NCSCC_RC_FAILURE;
}
} else {
-   if (mdtm_sendto(body, len, tipc_id)
+   if (mdtm_sendto(body, len, tipc_id, 
_queued)
!= NCSCC_RC_SUCCESS) {
m_MDS_LOG_ERR("MDTM: Unable to"
" send the msg thru"
@@ -2835,7 +2843,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
}
}
m_MMGR_FREE_BUFR_LIST(usrbuf);
-   free(body);
+   if (is_queued == 0)
+   free(body);
return NCSCC_RC_SUCCESS;
}
} break;
@@ -2864,6 +2873,7 @@ uint32_t

Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]

2019-11-26 Thread Minh Hon Chau

But if all retries still fail, we might need to terminate the 
portid, which leads to an MDS DOWN event, but let's look at it later.


On 27/11/19 3:23 pm, Minh Hon Chau wrote:

Hi Thuan,

I'm thinking of retrying 3 times with 100 ms in between, but you can 
decide. Also, we need to ensure that the retry does not block the mds main 
receiving thread (on the flow of processing data). 
The retry in this patch is ok since it retries on the mds flow control 
thread, so it does not delay the mds main receiving thread.


Thanks

Minh

On 27/11/19 2:40 pm, Tran Thuan wrote:

Hi Minh,

I think it's good if retry some times for normal Send().
Do you have any idea how many retries? Interval b/w tries?

Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, November 27, 2019 10:30 AM
To: thuan.tran ; thang . d . nguyen 
; 'Nguyen Minh Vu' 
; gary@dektech.com.au

Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: Fix mds flow control keep all messages 
in queue [#3123]


Hi Thuan,

TipcPortId::Send is also called at a few other places; do you think
it would be good to make a wrapper of TipcPortId::Send with a few retries
on failure, say TipcPortId::TryToSend(), and call TryToSend() instead
of Send()?

Thanks

Minh

On 27/11/19 1:26 pm, thuan.tran wrote:

When overflow happens, mds with flow control enabled may keep
all messages in queue if it fails to send a message when receiving
Nack or ChunkAck since no more trigger come after that.
MDS flow control should retry to send message in this scenario.
---
   src/mds/mds_tipc_fctrl_portid.cc | 16 
   1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc 
b/src/mds/mds_tipc_fctrl_portid.cc

index 724eb7b7b..e6e179669 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -17,6 +17,7 @@
      #include "mds/mds_tipc_fctrl_portid.h"
   #include "base/ncssysf_def.h"
+#include "base/osaf_time.h"
      #include "mds/mds_dt.h"
   #include "mds/mds_log.h"
@@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t 
fseq, uint16_t chksize) {

   // try to send a few pending msg
   DataMessage* msg = nullptr;
   uint16_t send_msg_cnt = 0;
-    while (send_msg_cnt++ < chunk_size_) {
+    while (send_msg_cnt < chunk_size_) {
 // find the lowest sequence unsent yet
 msg = sndqueue_.FirstUnsent();
 if (msg == nullptr) {
   break;
 } else {
 if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {

+    send_msg_cnt++;
   msg->is_sent_ = true;
   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "SndQData[fseq:%u, len:%u], "
@@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, 
uint16_t chksize) {

   msg->header_.fseq_, msg->header_.msg_len_,
   sndwnd_.acked_.v(), sndwnd_.send_.v(), 
sndwnd_.nacked_space_);

 } else {
-    break;
+    // If not retry, all messages are kept in queue
+    // and no more trigger to send messages
+    osaf_nanosleep();
+    continue;
 }
 }
   }
@@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, 
uint16_t mfrag,

 DataMessage* msg = sndqueue_.Find(Seq16(fseq));
 if (msg != nullptr) {
   // Resend the msg found
-    if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {

-  msg->is_sent_ = true;
+    while (Send(msg->msg_data_, msg->header_.msg_len_) != 
NCSCC_RC_SUCCESS) {

+  // If not retry, all messages are kept in queue
+  // and no more trigger to send messages
+  osaf_nanosleep();
   }
+    msg->is_sent_ = true;
   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "RsndData[mseq:%u, mfrag:%u, fseq:%u], "
   "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]

2019-11-26 Thread Minh Hon Chau


Hi Thuan,

I'm thinking of retrying 3 times with 100 ms in between, but you can decide. 
Also, we need to ensure that the retry does not block the mds main receiving 
thread (on the flow of processing data). The retry 
in this patch is ok since it retries on the mds flow control thread, so 
it does not delay the mds main receiving thread.


Thanks

Minh

On 27/11/19 2:40 pm, Tran Thuan wrote:

Hi Minh,

I think it's good if retry some times for normal Send().
Do you have any idea how many retries? Interval b/w tries?

Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, November 27, 2019 10:30 AM
To: thuan.tran ; thang . d . nguyen 
; 'Nguyen Minh Vu' ; 
gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: Fix mds flow control keep all messages in queue 
[#3123]

Hi Thuan,

TipcPortId::Send is also called at a few other places; do you think
it would be good to make a wrapper of TipcPortId::Send with a few retries
on failure, say TipcPortId::TryToSend(), and call TryToSend() instead
of Send()?

Thanks

Minh

On 27/11/19 1:26 pm, thuan.tran wrote:

When overflow happens, mds with flow control enabled may keep
all messages in queue if it fails to send a message when receiving
Nack or ChunkAck since no more trigger come after that.
MDS flow control should retry to send message in this scenario.
---
   src/mds/mds_tipc_fctrl_portid.cc | 16 
   1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 724eb7b7b..e6e179669 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -17,6 +17,7 @@
   
   #include "mds/mds_tipc_fctrl_portid.h"

   #include "base/ncssysf_def.h"
+#include "base/osaf_time.h"
   
   #include "mds/mds_dt.h"

   #include "mds/mds_log.h"
@@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
   // try to send a few pending msg
   DataMessage* msg = nullptr;
   uint16_t send_msg_cnt = 0;
-while (send_msg_cnt++ < chunk_size_) {
+while (send_msg_cnt < chunk_size_) {
 // find the lowest sequence unsent yet
 msg = sndqueue_.FirstUnsent();
 if (msg == nullptr) {
   break;
 } else {
 if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
+send_msg_cnt++;
   msg->is_sent_ = true;
   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "SndQData[fseq:%u, len:%u], "
@@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
   msg->header_.fseq_, msg->header_.msg_len_,
   sndwnd_.acked_.v(), sndwnd_.send_.v(), 
sndwnd_.nacked_space_);
 } else {
-break;
+// If not retry, all messages are kept in queue
+// and no more trigger to send messages
+osaf_nanosleep();
+continue;
 }
 }
   }
@@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag,
 DataMessage* msg = sndqueue_.Find(Seq16(fseq));
 if (msg != nullptr) {
   // Resend the msg found
-if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) {
-  msg->is_sent_ = true;
+while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) {
+  // If not retry, all messages are kept in queue
+  // and no more trigger to send messages
+  osaf_nanosleep();
   }
+msg->is_sent_ = true;
   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "RsndData[mseq:%u, mfrag:%u, fseq:%u], "
   "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Fix mds flow control keep all messages in queue [#3123]

2019-11-26 Thread Minh Hon Chau


Hi Thuan,

TipcPortId::Send is also called at a few other places; do you think 
it would be good to make a wrapper of TipcPortId::Send with a few retries 
on failure, say TipcPortId::TryToSend(), and call TryToSend() instead 
of Send()?


Thanks

Minh

On 27/11/19 1:26 pm, thuan.tran wrote:

When overflow happens, mds with flow control enabled may keep
all messages in queue if it fails to send a message when receiving
Nack or ChunkAck since no more trigger come after that.
MDS flow control should retry to send message in this scenario.
---
  src/mds/mds_tipc_fctrl_portid.cc | 16 
  1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 724eb7b7b..e6e179669 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -17,6 +17,7 @@
  
  #include "mds/mds_tipc_fctrl_portid.h"

  #include "base/ncssysf_def.h"
+#include "base/osaf_time.h"
  
  #include "mds/mds_dt.h"

  #include "mds/mds_log.h"
@@ -440,13 +441,14 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  // try to send a few pending msg
  DataMessage* msg = nullptr;
  uint16_t send_msg_cnt = 0;
-while (send_msg_cnt++ < chunk_size_) {
+while (send_msg_cnt < chunk_size_) {
// find the lowest sequence unsent yet
msg = sndqueue_.FirstUnsent();
if (msg == nullptr) {
  break;
} else {
if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
+send_msg_cnt++;
  msg->is_sent_ = true;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "SndQData[fseq:%u, len:%u], "
@@ -455,7 +457,10 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  msg->header_.fseq_, msg->header_.msg_len_,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
} else {
-break;
+// If not retry, all messages are kept in queue
+// and no more trigger to send messages
+osaf_nanosleep();
+continue;
}
}
  }
@@ -508,9 +513,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t mfrag,
DataMessage* msg = sndqueue_.Find(Seq16(fseq));
if (msg != nullptr) {
  // Resend the msg found
-if (Send(msg->msg_data_, msg->header_.msg_len_) == NCSCC_RC_SUCCESS) {
-  msg->is_sent_ = true;
+while (Send(msg->msg_data_, msg->header_.msg_len_) != NCSCC_RC_SUCCESS) {
+  // If not retry, all messages are kept in queue
+  // and no more trigger to send messages
+  osaf_nanosleep();
  }
+msg->is_sent_ = true;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "RsndData[mseq:%u, mfrag:%u, fseq:%u], "
  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Avoid message re-allocation [#3089]

2019-11-25 Thread Minh Hon Chau


Hi Vu, Thuan,

The patch misses the error cases and the kDisabled state. I will rework it 
for V2.


Thanks

Minh

On 25/11/19 6:44 pm, Nguyen Minh Vu wrote:

Hi Minh,

Ack with comments inline.

Regards, Vu

On 11/25/19 1:12 PM, Minh Chau wrote:

The patch avoids message reallocation if enable
MDS_TIPC_FCTRL_ENABLED
---
  src/mds/mds_dt_tipc.c    | 27 ---
  src/mds/mds_tipc_fctrl_msg.cc    |  2 +-
  src/mds/mds_tipc_fctrl_portid.cc |  9 +++--
  3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index fdf0da7..aa8d5c2 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -2644,7 +2644,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  if (req->snd_type == MDS_SENDTYPE_ACK ||
  req->snd_type == MDS_SENDTYPE_RACK) {
  uint8_t len = sum_mds_hdr_plus_mdtm_hdr_plus_len;
-    uint8_t buffer_ack[len];
+    uint8_t* buffer_ack = calloc(1, len);
[Vu] Below this allocation, there are several error-handling paths that 
return without freeing the memory.

Is that expected?


    /* Add mds_hdr */
  if (NCSCC_RC_SUCCESS !=
@@ -2667,7 +2667,11 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  m_MDS_LOG_DBG(
  "MDTM:Sending message with Service Seqno=%d, TO 
Dest_Tipc_id=<0x%08x:%u> ",

  req->svc_seq_num, tipc_id.node, tipc_id.ref);
-    return mdtm_sendto(buffer_ack, len, tipc_id);
+    status = mdtm_sendto(buffer_ack, len, tipc_id);
+    if (gl_mds_pro_ver != MDS_PROT_FCTRL) {
+    free(buffer_ack);
+    }
[Vu] The allocation above is not conditional on the `MDS_PROT_FCTRL` check, 
so if the above condition check fails, is the allocated memory leaked?

+    return status;
  }
    if (MDS_ENC_TYPE_FLAT == req->msg.encoding) {
@@ -2815,6 +2819,8 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  free(body);
  return NCSCC_RC_FAILURE;
  }
+    m_MMGR_FREE_BUFR_LIST(usrbuf);
+    free(body);
  } else {
  if (NCSCC_RC_SUCCESS !=
  mdtm_sendto(body, len, tipc_id)) {
@@ -2824,9 +2830,12 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  free(body);
  return NCSCC_RC_FAILURE;
  }
+    if (gl_mds_pro_ver != MDS_PROT_FCTRL) {
+    m_MMGR_FREE_BUFR_LIST(usrbuf);
+    free(body);
+    }
  }
-    m_MMGR_FREE_BUFR_LIST(usrbuf);
-    free(body);
+
  return NCSCC_RC_SUCCESS;
  }
  } break;
@@ -2909,7 +2918,9 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  mds_free_direct_buff(
  req->msg.data.buff_info.buff);
  }
-    free(body);
+    if (gl_mds_pro_ver != MDS_PROT_FCTRL) {
+    free(body);
+    }
  return NCSCC_RC_SUCCESS;
  } break;
  @@ -3059,21 +3070,23 @@ uint32_t mdtm_frag_and_send(MDTM_SEND_REQ 
*req, uint32_t seq_num,

  get_svc_names(req->src_svc_id), req->src_svc_id,
  get_svc_names(req->dest_svc_id), 
req->dest_svc_id);

  ret = mdtm_mcast_sendto(body, len_buf, req);
+    free(body);
  } else {
  m_MDS_LOG_DBG(
  "MDTM:Sending message with Service Seqno=%d, 
Fragment Seqnum=%d, frag_num=%d, TO Dest_Tipc_id=<0x%08x:%u>",

  req->svc_seq_num, seq_num, frag_val,
  id.node, id.ref);
  ret = mdtm_sendto(body, len_buf, id);
+    if (gl_mds_pro_ver != MDS_PROT_FCTRL) {
+    free(body);
+    }
  }
  if (ret != NCSCC_RC_SUCCESS) {
  // Failed to send a fragmented msg, stop sending
  m_MMGR_FREE_BUFR_LIST(usrbuf);
-    free(body);
  break;
  }
  m_MMGR_REMOVE_FROM_START(, len_buf - hdr_plus);
-    free(body);
  len = len - (len_buf - hdr_plus);
  if (len == 0)
  break;
diff --git a/src/mds/mds_tipc_fctrl_msg.cc 
b/src/mds/mds_tipc_fctrl_msg.cc

index 454c02c..0f9fd09 100644
--- a/src/mds/mds_tipc_fctrl_msg.cc
+++ b/src/mds/mds_tipc_fctrl_msg.cc
@@ -138,7 +138,7 @@ void DataMessage::Decode(uint8_t *msg) {
    DataMessage::~DataMessage() {
    if (msg_data_ != nullptr) {
-    delete[] msg_data_;
+    free(msg_data_);
  msg_data_ = nullptr;
    }
  }
diff --git a/src/mds/mds_tipc_fctrl_portid.cc 
b/src/mds/mds_tipc_fctrl_portid.cc

index 724eb7b..08e8dce 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++

Re: [devel] [PATCH 1/1] amfd: not accept lock-in admin op if presence msg not processed [#3121]

2019-11-25 Thread Minh Hon Chau


Hi Thang,

Instead of adding is_presence_msg_processed, which requires a checkpoint 
to standby, can we make it a function (or maybe an *if* statement) 
that uses the pres_state and term_state to (dis)allow the lock-in op?


Thanks

Minh

On 25/11/19 5:50 pm, thang.d.nguyen wrote:

AMFD should not accept lock-in admin op on SU if the presence msg
has already sent to that SU.
---
  src/amf/amfd/sgproc.cc |  1 +
  src/amf/amfd/su.cc | 13 +
  src/amf/amfd/su.h  |  2 ++
  3 files changed, 16 insertions(+)

diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc
index ddd825d44..8aeb9ec3c 100644
--- a/src/amf/amfd/sgproc.cc
+++ b/src/amf/amfd/sgproc.cc
@@ -2126,6 +2126,7 @@ uint32_t avd_sg_app_su_inst_func(AVD_CL_CB *cb, AVD_SG 
*sg) {
  }
} else {
  if (avd_snd_presence_msg(cb, i_su, false) == NCSCC_RC_SUCCESS) {
+  i_su->is_presence_msg_processed = true;
num_try_insvc_su++;
  }
}
diff --git a/src/amf/amfd/su.cc b/src/amf/amfd/su.cc
index 8c8ef9d4f..494022893 100644
--- a/src/amf/amfd/su.cc
+++ b/src/amf/amfd/su.cc
@@ -51,6 +51,7 @@ void AVD_SU::initialize() {
term_state = false;
su_switch = AVSV_SI_TOGGLE_STABLE;
su_is_external = false;
+  is_presence_msg_processed = false;
su_act_state = 0;
sg_of_su = nullptr;
su_on_node = nullptr;
@@ -810,6 +811,12 @@ void AVD_SU::set_pres_state(SaAmfPresenceStateT 
pres_state) {
   */
  return;
  
+  if ((pres_state == SA_AMF_PRESENCE_INSTANTIATED) ||

+  (pres_state == SA_AMF_PRESENCE_INSTANTIATION_FAILED) ||
+  (pres_state == SA_AMF_PRESENCE_TERMINATION_FAILED)) {
+this->is_presence_msg_processed = false;
+  }
+
osafassert(pres_state <= SA_AMF_PRESENCE_TERMINATION_FAILED);
TRACE_ENTER2("'%s' %s => %s", name.c_str(),
 avd_pres_state_name[saAmfSUPresenceState],
@@ -1085,6 +1092,12 @@ void AVD_SU::lock_instantiation(SaImmOiHandleT 
immoi_handle,
  goto done;
}
  
+  if (is_presence_msg_processed == true) {

+report_admin_op_error(immoi_handle, invocation, SA_AIS_ERR_TRY_AGAIN,
+  nullptr, "'%s' instantiate not done", name.c_str());
+goto done;
+  }
+
if (list_of_susi != nullptr) {
  report_admin_op_error(immoi_handle, invocation, SA_AIS_ERR_TRY_AGAIN,
nullptr, "SIs still assigned to this SU '%s'",
diff --git a/src/amf/amfd/su.h b/src/amf/amfd/su.h
index 7afc5abee..722c68b9c 100644
--- a/src/amf/amfd/su.h
+++ b/src/amf/amfd/su.h
@@ -87,6 +87,8 @@ class AVD_SU {
  
bool su_is_external; /* indicates if this SU is external */
  
+  bool is_presence_msg_processed; /* indicate inst msg sent to nd */

+
int su_act_state;  // not used, kept for EDU, remove later
  
bool wait_for_contained_to_quiesce;



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: fix memleak in code and test [#1860]

2019-11-20 Thread Minh Hon Chau


Hi Thuan

ack (review only)

Thanks

Minh

On 19/11/19 5:49 pm, thuan.tran wrote:

---
  src/mds/apitest/mdstipc.h  |   2 +-
  src/mds/apitest/mdstipc_api.c  | 134 +++--
  src/mds/apitest/mdstipc_conf.c |   9 ++-
  src/mds/mds_c_sndrcv.c |   1 +
  src/mds/mds_tipc_fctrl_intf.cc |   4 +-
  5 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/src/mds/apitest/mdstipc.h b/src/mds/apitest/mdstipc.h
index 5fd7b9c6e..b56940ea6 100644
--- a/src/mds/apitest/mdstipc.h
+++ b/src/mds/apitest/mdstipc.h
@@ -203,7 +203,7 @@ uint32_t destroy_pwe_on_vdest(MDS_HDL);
  
  /** USER DEFINED WRAPPERS FOR MDS SERVICE APIs **/
  
-uint32_t tet_create_task(NCS_OS_CB, NCSCONTEXT);

+uint32_t tet_create_task(NCS_OS_CB, NCSCONTEXT*);
  uint32_t tet_release_task(void *task_handle);
  int is_adest_sel_obj_found(int);
  int is_sel_obj_found(int);
diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c
index 651365e95..847f9a7f1 100644
--- a/src/mds/apitest/mdstipc_api.c
+++ b/src/mds/apitest/mdstipc_api.c
@@ -398,7 +398,7 @@ void tet_svc_install_tp_10()
printf(
"\nTest case 10:Installing the External MIN service EXTMIN in a seperate 
thread and Uninstalling it here\n");
// Install thread
-   rc = tet_create_task((NCS_OS_CB)tet_vdest_install_thread, t_handle);
+   rc = tet_create_task((NCS_OS_CB)tet_vdest_install_thread, _handle);
if (rc != NCSCC_RC_SUCCESS) {
printf("\nFail to Install thread\n");
FAIL = 1;
@@ -999,7 +999,7 @@ void tet_svc_unstall_tp_5()
// Uninstalling the above service in a seperate thread
// Uninstall thread
rc = tet_create_task((NCS_OS_CB)tet_vdest_uninstall_thread,
-gl_tet_vdest[0].svc[0].task.t_handle);
+_tet_vdest[0].svc[0].task.t_handle);
if (rc != NCSCC_RC_SUCCESS) {
printf("\nFail to create the uninstall thread\n");
FAIL = 1;
@@ -2141,12 +2141,18 @@ void cleanup_ADEST_srv()
  {
int id;
printf("\nUninstalling all the services on this ADESt\n");
-   for (id = gl_tet_adest.svc_count - 1; id >= 0; id--)
+   for (id = gl_tet_adest.svc_count - 1; id >= 0; id--) {
+   if (mds_service_retrieve(gl_tet_adest.mds_pwe1_hdl,
+gl_tet_adest.svc[id].svc_id,
+SA_DISPATCH_ALL) != NCSCC_RC_SUCCESS) {
+   printf("Adest Svc  Retrieve Fail\n");
+   }
if (mds_service_uninstall(gl_tet_adest.mds_pwe1_hdl,
  gl_tet_adest.svc[id].svc_id) !=
NCSCC_RC_SUCCESS) {
printf("\nFail mds_service_uninstall\n");
}
+   }
  }
  
  void tet_svc_subscr_ADEST_1()

@@ -2441,7 +2447,7 @@ void tet_svc_subscr_ADEST_8()
}
printf("\nAction: Cancel in a seperate thread\n");
if (tet_create_task((NCS_OS_CB)tet_adest_cancel_thread,
-   gl_tet_adest.svc[0].task.t_handle) ==
+   _tet_adest.svc[0].task.t_handle) ==
NCSCC_RC_SUCCESS) {
printf("\nTask has been Created\n");
fflush(stdout);
@@ -2547,7 +2553,7 @@ void tet_svc_subscr_ADEST_10()
printf("\nAction: Retrieve in a seperate thread\n");
/*Retrieve thread*/
if (tet_create_task((NCS_OS_CB)tet_adest_retrieve_thread,
-   gl_tet_adest.svc[0].task.t_handle) ==
+   _tet_adest.svc[0].task.t_handle) ==
NCSCC_RC_SUCCESS) {
printf("\nTask has been Created\n");
fflush(stdout);
@@ -2751,7 +2757,10 @@ uint32_t tet_cleanup_setup()
printf("Fail mds_service_retrieve\n");
FAIL = 1;
}
-
+   if (gl_rcvdmsginfo.msg) {
+   free(gl_rcvdmsginfo.msg);
+   gl_rcvdmsginfo.msg = NULL;
+   }
if (mds_service_uninstall(
gl_tet_vdest[i].mds_pwe1_hdl,
gl_tet_vdest[i].svc[id].svc_id) !=
@@ -2785,6 +2794,10 @@ uint32_t tet_cleanup_setup()
printf("Adest Svc  Retrieve Fail\n");
FAIL = 1;
}
+   if (gl_rcvdmsginfo.msg) {
+   free(gl_rcvdmsginfo.msg);
+   gl_rcvdmsginfo.msg = NULL;
+   }
if (mds_service_uninstall(gl_tet_adest.mds_pwe1_hdl, i) !=
NCSCC_RC_SUCCESS) {
printf("Adest Svc  Uninstall

Re: [devel] [PATCH 1/1] ntf: Fix coding issues identified by codechecker [#3114]

2019-11-18 Thread Minh Hon Chau


Hi Thuan

ack from me.

Thanks

Minh

On 4/11/19 6:42 pm, thuan.tran wrote:

---
  src/ntf/agent/ntfa_api.c | 29 +++--
  1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/ntf/agent/ntfa_api.c b/src/ntf/agent/ntfa_api.c
index 417c9d688..e89479bf6 100644
--- a/src/ntf/agent/ntfa_api.c
+++ b/src/ntf/agent/ntfa_api.c
@@ -1379,30 +1379,31 @@ SaAisErrorT recoverClient(ntfa_client_hdl_rec_t 
*client_hdl)
if ((rc = reinitializeClient(client_hdl)) == SA_AIS_OK) {
/* Restore reader */
ntfa_reader_hdl_rec_t *reader_hdl = client_hdl->reader_list;
-   while (reader_hdl != NULL && rc == SA_AIS_OK) {
+   while (reader_hdl != NULL) {
rc = recoverReader(client_hdl, reader_hdl);
+   if (rc != SA_AIS_OK) {
+   TRACE("Failed to restore reader (readerId:%d)",
+ reader_hdl->reader_id);
+   goto done;
+   }
reader_hdl = reader_hdl->next;
}
-   if (rc != SA_AIS_OK) {
-   TRACE("Failed to restore reader (readerId:%d)",
- reader_hdl->reader_id);
-   goto done;
-   }
/* Restore subscriber */
ntfa_subscriber_list_t *subscriber_hdl = subscriberNoList;
-   while (subscriber_hdl != NULL && rc == SA_AIS_OK) {
+   while (subscriber_hdl != NULL) {
if (client_hdl->local_hdl ==
-   subscriber_hdl->subscriberListNtfHandle)
+   subscriber_hdl->subscriberListNtfHandle) {
rc = recoverSubscriber(client_hdl,
   subscriber_hdl);
+   if (rc != SA_AIS_OK) {
+   TRACE(
+   "Failed to restore subscriber 
(subscriptionId:%d)",
+   
subscriber_hdl->subscriberListSubscriptionId);
+   goto done;
+   }
+   }
subscriber_hdl = subscriber_hdl->next;
}
-   if (rc != SA_AIS_OK) {
-   TRACE(
-   "Failed to restore subscriber (subscriptionId:%d)",
-   subscriber_hdl->subscriberListSubscriptionId);
-   goto done;
-   }
client_hdl->valid = true;
} else {
TRACE("Failed to restore client (id:%d)",



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Fix coding issues identified by codechecker [#3112]

2019-11-18 Thread Minh Hon Chau


Hi Thuan

ack from me.

thanks

Minh

On 4/11/19 5:56 pm, thuan.tran wrote:

---
  src/mds/mds_c_db.c | 1 +
  src/mds/mds_c_sndrcv.c | 2 +-
  2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mds/mds_c_db.c b/src/mds/mds_c_db.c
index 58f0e3aee..e1991517e 100644
--- a/src/mds/mds_c_db.c
+++ b/src/mds/mds_c_db.c
@@ -433,6 +433,7 @@ uint32_t mds_vdest_tbl_get_role(MDS_VDEST_ID vdest_id, 
V_DEST_RL *role)
vdest_info = (MDS_VDEST_INFO *)ncs_patricia_tree_get(
_mds_mcm_cb->vdest_list, (uint8_t *)_id);
if (vdest_info == NULL) {
+   *role = V_DEST_RL_INVALID;
m_MDS_LOG_DBG("MDS:DB: VDEST not present");
m_MDS_LEAVE();
return NCSCC_RC_FAILURE;
diff --git a/src/mds/mds_c_sndrcv.c b/src/mds/mds_c_sndrcv.c
index 7850ac714..0dc76eef4 100644
--- a/src/mds/mds_c_sndrcv.c
+++ b/src/mds/mds_c_sndrcv.c
@@ -2319,7 +2319,7 @@ static uint32_t mcm_query_for_node_dest(MDS_DEST adest, 
uint8_t *to)
*to = DESTINATION_SAME_PROCESS;
else
*to = DESTINATION_ON_NODE;
-   } else if (dest_node_id != src_node_id) {
+   } else {
*to = DESTINATION_OFF_NODE;
}
return NCSCC_RC_SUCCESS;



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]

2019-11-14 Thread Minh Hon Chau


Hi Thuan,

I add one comment inline for explanation.

Thanks

Minh

On 14/11/19 8:33 pm, Tran Thuan wrote:


Hi Minh,

I thought you would update the code to check the state of the portid to know whether it is FCTRL or LEGACY,

since a fragment with (msg_len_ - fseq_ - 2 == MDTM_FRAG_HDR_LEN) may not be the LEGACY
protocol.


[Minh] Yes, in this case we cannot tell whether it is FCTRL or LEGACY,
thus the pro_ver_ remains UNDEFINED. In mds_tipc_fctrl_rcv_data(),
this fragment with UNDEFINED pro_ver_ is forwarded to the portid under the "if
(header.IsFlowMessage() || header.IsUndefinedMessage())" branch. The portid
will skip this fragment if its state is kDisabled. In short, the
fragment is forwarded to the portid, which checks it internally following the data
flow, instead of checking the portid state inside the message decoding, which
would require referring to the portid in mds_tipc_fctrl_msg.cc.


I agree that if (msg_len_ - fseq_ - 2 != MDTM_FRAG_HDR_LEN), it is 100% the FCTRL
protocol.


*Best Regards,*

*ThuanTr*

*From:*Minh Hon Chau 
*Sent:* Thursday, November 14, 2019 4:28 PM
*To:* Tran Thuan ; 
hans.nordeb...@ericsson.com; gary@dektech.com.au; 
vu.m.ngu...@dektech.com.au

*Cc:* opensaf-devel@lists.sourceforge.net
*Subject:* Re: [PATCH 1/3] mds: Distinguish protocol version of 
fragment [#3111]


Hi Thuan,

Are you happy with my reply?

Thanks

Minh

On 14/11/19 9:35 am, Minh Hon Chau wrote:

Hi Thuan,

Please see my reply inline.

Thanks

Minh

On 13/11/19 9:54 pm, Tran Thuan wrote:

Hi Minh,

See my comment inline.

Best Regards,

ThuanTr

-Original Message-

From: Minh Chau  <mailto:minh.c...@dektech.com.au>  


Sent: Friday, November 8, 2019 5:33 PM

To:hans.nordeb...@ericsson.com  
<mailto:hans.nordeb...@ericsson.com>;gary@dektech.com.au  
<mailto:gary@dektech.com.au>;vu.m.ngu...@dektech.com.au  
<mailto:vu.m.ngu...@dektech.com.au>;thuan.t...@dektech.com.au  
<mailto:thuan.t...@dektech.com.au>

Cc:opensaf-devel@lists.sourceforge.net  
<mailto:opensaf-devel@lists.sourceforge.net>; Minh Chau  
<mailto:minh.c...@dektech.com.au>

Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment 
[#3111]

The legacy mds encodes the protocol version in either a non-fragmented

message or the first fragment only. Hence, mds is not able to determine

the protocol version from the subsequent fragments after the first

one.

The patch keeps the encoding of lengthcheck the same as in the legacy

mds version. Also, the subsequent fragments need to consult the

stateful portid to determine the protocol version, so that a

fragment will be skipped if it is sent from legacy mds, or have its

sequence inspected if it is sent from new mds.

---

  src/mds/mds_dt.h |   6 ++

  src/mds/mds_dt_tipc.c    |  11 ++-

  src/mds/mds_tipc_fctrl_intf.cc   | 154 
++-

  src/mds/mds_tipc_fctrl_msg.cc    |  86 +++---

  src/mds/mds_tipc_fctrl_msg.h |   5 ++

  src/mds/mds_tipc_fctrl_portid.cc |  23 ++

  src/mds/mds_tipc_fctrl_portid.h  |   1 +

  7 files changed, 193 insertions(+), 93 deletions(-)

diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h

index 64da600..007ff98 100644

--- a/src/mds/mds_dt.h

+++ b/src/mds/mds_dt.h

@@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, 
NCSCONTEXT msg);

  #define MDS_PROT_VER_MASK 0xFC

  #define MDTM_PRI_MASK 0x3

  


+/* Unknown or undefined MDS protocol/version */

+#define MDS_PROT_UNDEFINED 0x00

+

+/* MDS protocol/version for non flow control (legacy) */

+#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION)

+

  /* MDS protocol/version for flow control */

  #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)

  #define MDS_PROT_FCTRL_ID 0xFDAC13F5

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c

index e085de7..fdf0da7 100644

--- a/src/mds/mds_dt_tipc.c

+++ b/src/mds/mds_dt_tipc.c

@@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;

  uint32_t mdtm_global_frag_num;

  


  const unsigned int MAX_RECV_THRESHOLD = 30;

-static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;

+static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY;

  static int gl_mds_fctrl_acksize = -1;

  static int gl_mds_fctrl_ackto = -1;

  


@@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

   "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKSIZE");

    }

    } else {

-  gl_mds_pro_ver = MDS_

Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]

2019-11-14 Thread Minh Hon Chau


Hi Thuan,

Are you happy with my reply?

Thanks

Minh

On 14/11/19 9:35 am, Minh Hon Chau wrote:


Hi Thuan,

Please see my reply inline.

Thanks

Minh

On 13/11/19 9:54 pm, Tran Thuan wrote:

Hi Minh,

See my comment inline.

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau  
Sent: Friday, November 8, 2019 5:33 PM

To:hans.nordeb...@ericsson.com;gary@dektech.com.au;vu.m.ngu...@dektech.com.au;thuan.t...@dektech.com.au
Cc:opensaf-devel@lists.sourceforge.net; Minh Chau
Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]

The legacy mds encodes the protocol version in either a non-fragmented
message or the first fragment only. Hence, mds is not able to determine
the protocol version from the subsequent fragments after the first
one.

The patch keeps the encoding of lengthcheck the same as in the legacy
mds version. Also, the subsequent fragments need to consult the
stateful portid to determine the protocol version, so that a
fragment will be skipped if it is sent from legacy mds, or have its
sequence inspected if it is sent from new mds.
---
  src/mds/mds_dt.h |   6 ++
  src/mds/mds_dt_tipc.c|  11 ++-
  src/mds/mds_tipc_fctrl_intf.cc   | 154 ++-
  src/mds/mds_tipc_fctrl_msg.cc|  86 +++---
  src/mds/mds_tipc_fctrl_msg.h |   5 ++
  src/mds/mds_tipc_fctrl_portid.cc |  23 ++
  src/mds/mds_tipc_fctrl_portid.h  |   1 +
  7 files changed, 193 insertions(+), 93 deletions(-)

diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index 64da600..007ff98 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT 
msg);
  #define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  
+/* Unknown or undefined MDS protocol/version */

+#define MDS_PROT_UNDEFINED 0x00
+
+/* MDS protocol/version for non flow control (legacy) */
+#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION)
+
  /* MDS protocol/version for flow control */
  #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
  #define MDS_PROT_FCTRL_ID 0xFDAC13F5
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index e085de7..fdf0da7 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
  
  const unsigned int MAX_RECV_THRESHOLD = 30;

-static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
+static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY;
  static int gl_mds_fctrl_acksize = -1;
  static int gl_mds_fctrl_ackto = -1;
  
@@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref)

"MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKSIZE");
}
} else {
-   gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
+   gl_mds_pro_ver = MDS_PROT_LEGACY;
syslog(LOG_ERR, "MDTM:TIPC Invalid value of"
"MDS_TIPC_FCTRL_ENABLED");
}
@@ -3125,7 +3125,12 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t 
len, uint32_t seq_num,
 * hereafter, these 2 bytes will be used as sequence number in flow 
control
 * (per tipc portid)
 * */
-   ncs_encode_16bit(, fctrl_seq_num);
+   if (gl_mds_pro_ver == MDS_PROT_FCTRL) {
+   ncs_encode_16bit(, fctrl_seq_num);
+   } else {
+   ncs_encode_16bit(, len - MDTM_FRAG_HDR_LEN - 2);
+   }
+
  #endif
return NCSCC_RC_SUCCESS;
  }
diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index c9073b2..3d92290 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -132,8 +132,16 @@ uint32_t process_flow_event(const Event& evt) {
portid = new TipcPortId(evt.id_, data_sock_fd,
chunk_ack_size, sock_buf_size);
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
-  rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
-evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
+  if (evt.legacy_data_ == true) {
+// we create portid and set state kDisabled even though we know
+// this portid has no flow control. It is because the 2nd, 3rd fragment
+// could not reflect the protocol version, so need to keep this portid
+// remained stateful
+portid->ChangeState(TipcPortId::State::kDisabled);
+  } else {
+rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
+  evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
+  }
  } else if (evt.type_ == Event::Type::kEvtRcvIntro) {
portid = new TipcPortId(evt.id_, data_sock_fd,
chunk_ack_size, sock_buf_size);
@@ -146,8 +154,12 @@ uint32_t process_flow_event(const Event& evt) {
  }
} else {
  if (evt.type_ == Event::Type::kEvtRcvDa

Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]

2019-11-13 Thread Minh Hon Chau


Hi Thuan,

ack from me.

Thanks

Minh

On 13/11/19 10:00 pm, thuan.tran wrote:

When overload happens, the sender waits for a chunkAck before it continues
sending more messages; it should then send a number of messages equal to the
chunkAck size of the receiver. If not, the receiver doesn't receive enough
messages to send a chunkAck and waits until the timer times out to send the
chunkAck to the sender. This loop makes the sender take a very long time to
send all messages.
---
  src/mds/mds_tipc_fctrl_portid.cc | 14 ++
  1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 3704baddb..bd1825446 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t 
length,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
} else {
  ++sndwnd_.send_;
+sndwnd_.nacked_space_ += length;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], "
  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
@@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  // the nacked_space_ of sender
  uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1),
  Seq16(fseq));
+assert(sndwnd_.nacked_space_ >= acked_bytes);
  sndwnd_.nacked_space_ -= acked_bytes;
  
  // try to send a few pending msg

  DataMessage* msg = nullptr;
-uint64_t resend_bytes = 0;
-while (resend_bytes < acked_bytes) {
+uint16_t send_msg_cnt = 0;
+while (send_msg_cnt++ < chunk_size_) {
// find the lowest sequence unsent yet
msg = sndqueue_.FirstUnsent();
if (msg == nullptr) {
  break;
} else {
-if (resend_bytes < acked_bytes) {
if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
-sndwnd_.nacked_space_ += msg->header_.msg_len_;
  msg->is_sent_ = true;
-resend_bytes += msg->header_.msg_len_;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "SndQData[fseq:%u, len:%u], "
  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
  id_.node, id_.ref,
  msg->header_.fseq_, msg->header_.msg_len_,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
+  } else {
+break;
}
-} else {
-  break;
-}
}
  }
  // no more unsent message, back to kEnabled



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]

2019-11-13 Thread Minh Hon Chau


Hi Thuan,

Please see my reply inline.

Thanks

Minh

On 13/11/19 9:54 pm, Tran Thuan wrote:

Hi Minh,

See my comment inline.

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Friday, November 8, 2019 5:33 PM
To: hans.nordeb...@ericsson.com; gary@dektech.com.au; 
vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau 
Subject: [PATCH 1/3] mds: Distinguish protocol version of fragment [#3111]

The legacy mds encodes the protocol version in either a non-fragmented
message or the first fragment only. Hence, mds is not able to determine
the protocol version from the subsequent fragments after the first
one.

The patch keeps the encoding of lengthcheck the same as in the legacy
mds version. Also, the subsequent fragments need to consult the
stateful portid to determine the protocol version, so that a
fragment will be skipped if it is sent from legacy mds, or have its
sequence inspected if it is sent from new mds.
---
  src/mds/mds_dt.h |   6 ++
  src/mds/mds_dt_tipc.c|  11 ++-
  src/mds/mds_tipc_fctrl_intf.cc   | 154 ++-
  src/mds/mds_tipc_fctrl_msg.cc|  86 +++---
  src/mds/mds_tipc_fctrl_msg.h |   5 ++
  src/mds/mds_tipc_fctrl_portid.cc |  23 ++
  src/mds/mds_tipc_fctrl_portid.h  |   1 +
  7 files changed, 193 insertions(+), 93 deletions(-)

diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index 64da600..007ff98 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -243,6 +243,12 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT 
msg);
  #define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  
+/* Unknown or undefined MDS protocol/version */

+#define MDS_PROT_UNDEFINED 0x00
+
+/* MDS protocol/version for non flow control (legacy) */
+#define MDS_PROT_LEGACY (MDS_PROT | MDS_VERSION)
+
  /* MDS protocol/version for flow control */
  #define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
  #define MDS_PROT_FCTRL_ID 0xFDAC13F5
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index e085de7..fdf0da7 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -166,7 +166,7 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
  
  const unsigned int MAX_RECV_THRESHOLD = 30;

-static uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
+static uint8_t gl_mds_pro_ver = MDS_PROT_LEGACY;
  static int gl_mds_fctrl_acksize = -1;
  static int gl_mds_fctrl_ackto = -1;
  
@@ -381,7 +381,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t *mds_tipc_ref)

"MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKSIZE");
}
} else {
-   gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
+   gl_mds_pro_ver = MDS_PROT_LEGACY;
syslog(LOG_ERR, "MDTM:TIPC Invalid value of"
"MDS_TIPC_FCTRL_ENABLED");
}
@@ -3125,7 +3125,12 @@ uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t 
len, uint32_t seq_num,
 * hereafter, these 2 bytes will be used as sequence number in flow 
control
 * (per tipc portid)
 * */
-   ncs_encode_16bit(, fctrl_seq_num);
+   if (gl_mds_pro_ver == MDS_PROT_FCTRL) {
+   ncs_encode_16bit(, fctrl_seq_num);
+   } else {
+   ncs_encode_16bit(, len - MDTM_FRAG_HDR_LEN - 2);
+   }
+
  #endif
return NCSCC_RC_SUCCESS;
  }
diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index c9073b2..3d92290 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -132,8 +132,16 @@ uint32_t process_flow_event(const Event& evt) {
portid = new TipcPortId(evt.id_, data_sock_fd,
chunk_ack_size, sock_buf_size);
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
-  rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
-evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
+  if (evt.legacy_data_ == true) {
+// we create portid and set state kDisabled even though we know
+// this portid has no flow control. It is because the 2nd, 3rd fragment
+// could not reflect the protocol version, so need to keep this portid
+// remained stateful
+portid->ChangeState(TipcPortId::State::kDisabled);
+  } else {
+rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
+  evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
+  }
  } else if (evt.type_ == Event::Type::kEvtRcvIntro) {
portid = new TipcPortId(evt.id_, data_sock_fd,
chunk_ack_size, sock_buf_size);
@@ -146,8 +154,12 @@ uint32_t process_flow_event(const Event& evt) {
  }
} else {
  if (evt.type_ == Event::Type::kEvtRcvData) {
-  rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
-  evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
+  if

Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]

2019-11-12 Thread Minh Hon Chau


Hi Thuan,

Please see comment inline

Thanks

Minh

On 13/11/19 2:24 pm, Tran Thuan wrote:

Hi Minh,

Please check replies inline. Thanks.

Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, November 13, 2019 10:05 AM
To: Tran Thuan ; 'Nguyen Minh Vu' 
; gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all 
messages [#3119]

Hi Thuan,

Please see comment inline.

Thanks

Minh

On 13/11/19 1:11 pm, Tran Thuan wrote:

Hi Minh,

Thanks for comments, please check my replies inline.

Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, November 13, 2019 7:47 AM
To: thuan.tran ; 'Nguyen Minh Vu' 
; gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all 
messages [#3119]

Hi Thuan,

Some comments inline.

Thanks

Minh

On 12/11/19 5:04 pm, thuan.tran wrote:

When overload happens, the sender waits for a chunkAck before it continues
sending more messages; it should then send a number of messages equal to the
chunkAck size of the receiver. If not, the receiver doesn't receive enough
messages to send a chunkAck and waits until the timer times out to send the
chunkAck to the sender. This loop makes the sender take a very long time to
send all messages.
---
src/mds/mds_tipc_fctrl_portid.cc | 30 +++---
1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 3704baddb..1fff4c855 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t 
length,
sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
  } else {
++sndwnd_.send_;
+sndwnd_.nacked_space_ += length;

[Minh] We haven't sent the msg out to wait for ack, thus nacked_space_
should not be increased

m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
"QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], "
"sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
@@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
// the nacked_space_ of sender
uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1),
Seq16(fseq));
+assert(sndwnd_.nacked_space_ >= acked_bytes);
sndwnd_.nacked_space_ -= acked_bytes;

// try to send a few pending msg

DataMessage* msg = nullptr;
-uint64_t resend_bytes = 0;
-while (resend_bytes < acked_bytes) {
+uint16_t send_msg_cnt = 0;
+while (send_msg_cnt++ < chunk_size_) {
  // find the lowest sequence unsent yet
  msg = sndqueue_.FirstUnsent();
  if (msg == nullptr) {
break;
  } else {
-if (resend_bytes < acked_bytes) {
  if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
-sndwnd_.nacked_space_ += msg->header_.msg_len_;

[Minh] We now send it out and wait for acked, thus the nacked_space_ is
increased here, so any reason moving the nacked_space_ from Queue() to here?
[Thuan] Because the message could be either in sndwnd (resend) or in sndqueue
(send).
We cannot increase nacked_space for a resent message.
I have tried another way to increase/decrease nacked_space dynamically,
but it becomes complex with markUnsent() since the sender may receive a Nack for the same
msg > 2 times.

[Minh] OK.

msg->is_sent_ = true;
-resend_bytes += msg->header_.msg_len_;
m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
"SndQData[fseq:%u, len:%u], "
"sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
id_.node, id_.ref,
msg->header_.fseq_, msg->header_.msg_len_,
sndwnd_.acked_.v(), sndwnd_.send_.v(), 
sndwnd_.nacked_space_);
+  } else {
+break;
  }
-} else {
-  break;
-}
  }
}
// no more unsent message, back to kEnabled

[Minh] Agreed, the new strategy of resending with chunk_size_ is better than
with acked_bytes; it will increase the transmission rate and will not depend
on the timer
[Thuan] Thanks

@@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t 
mfrag,
fseq);
return;
  }
-  if (state_ == State::kRcvBuffOverflow) {
-sndqueue_.MarkUnsentFrom(Seq16(fseq));
-if (Seq16(fseq) - sndwnd_.acked_ > 1) {
-  m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
-  "RcvNack[fseq:%u], "
-  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], "
-  "queue[size:%" PRIu

Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]

2019-11-12 Thread Minh Hon Chau


Hi Thuan,

Please see comment inline.

Thanks

Minh

On 13/11/19 1:11 pm, Tran Thuan wrote:

Hi Minh,

Thanks for comments, please check my replies inline.

Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, November 13, 2019 7:47 AM
To: thuan.tran ; 'Nguyen Minh Vu' 
; gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: fix sender take very long time to send all 
messages [#3119]

Hi Thuan,

Some comments inline.

Thanks

Minh

On 12/11/19 5:04 pm, thuan.tran wrote:

When overload happens, the sender waits for a chunkAck before it continues
sending more messages; it should then send a number of messages equal to the
chunkAck size of the receiver. If not, the receiver doesn't receive enough
messages to send a chunkAck and waits until the timer times out to send the
chunkAck to the sender. This loop makes the sender take a very long time to
send all messages.
---
   src/mds/mds_tipc_fctrl_portid.cc | 30 +++---
   1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 3704baddb..1fff4c855 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t 
length,
   sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
 } else {
   ++sndwnd_.send_;
+sndwnd_.nacked_space_ += length;

[Minh] We haven't sent the msg out to wait for ack, thus nacked_space_
should not be increased

   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], "
   "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
@@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
   // the nacked_space_ of sender
   uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1),
   Seq16(fseq));
+assert(sndwnd_.nacked_space_ >= acked_bytes);
   sndwnd_.nacked_space_ -= acked_bytes;
   
   // try to send a few pending msg

   DataMessage* msg = nullptr;
-uint64_t resend_bytes = 0;
-while (resend_bytes < acked_bytes) {
+uint16_t send_msg_cnt = 0;
+while (send_msg_cnt++ < chunk_size_) {
 // find the lowest sequence unsent yet
 msg = sndqueue_.FirstUnsent();
 if (msg == nullptr) {
   break;
 } else {
-if (resend_bytes < acked_bytes) {
 if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
-sndwnd_.nacked_space_ += msg->header_.msg_len_;

[Minh] We now send it out and wait for acked, thus the nacked_space_ is
increased here, so any reason moving the nacked_space_ from Queue() to here?
[Thuan] Because the message could be either in sndwnd (resend) or in sndqueue
(send).
We cannot increase nacked_space for a resent message.
I have tried another way to increase/decrease nacked_space dynamically,
but it becomes complex with markUnsent() since the sender may receive a Nack for the same
msg > 2 times.

[Minh] OK.

   msg->is_sent_ = true;
-resend_bytes += msg->header_.msg_len_;
   m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
   "SndQData[fseq:%u, len:%u], "
   "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
   id_.node, id_.ref,
   msg->header_.fseq_, msg->header_.msg_len_,
   sndwnd_.acked_.v(), sndwnd_.send_.v(), 
sndwnd_.nacked_space_);
+  } else {
+break;
 }
-} else {
-  break;
-}
 }
   }
   // no more unsent message, back to kEnabled

[Minh] Agreed, the new strategy of resending with chunk_size_ is better than
with acked_bytes; it will increase the transmission rate and will not depend
on the timer
[Thuan] Thanks

@@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t 
mfrag,
   fseq);
   return;
 }
-  if (state_ == State::kRcvBuffOverflow) {
-sndqueue_.MarkUnsentFrom(Seq16(fseq));
-if (Seq16(fseq) - sndwnd_.acked_ > 1) {
-  m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
-  "RcvNack[fseq:%u], "
-  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], "
-  "queue[size:%" PRIu64 "], "
-  "Warning[Ignore Nack]",
-  id_.node, id_.ref, fseq,
-  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_,
-  sndqueue_.Size());
-  return;
-}
-  }
 if (state_ != State::kRcvBuffOverflow) {
   state_ = State::kRcvBuffOverflow;
   m_MDS_LOG_NOTIFY("FCTRL: [node:%x, ref:%u] --> Overflow ",
   id_.node, id_.ref);
-sndqueue_.MarkUnsentFrom(Seq16(fseq));
 }
+  s

Re: [devel] [PATCH 0/3] Review Request for mds: Fix backward compatibility of mds fragmentation message [#3111]

2019-11-12 Thread Minh Hon Chau


Hi,

Any comments on the patches? Otherwise I wish to push them in the next 
day or two.


Thanks

Minh

On 8/11/19 9:33 pm, Minh Chau wrote:

Summary: mds: Distinguish protocol version of fragment [#3111]
Review request for Ticket(s): 3111
Peer Reviewer(s): Gary, Vu, Thuan
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-3111
Base revision: ddb9d7065376df7757716013779755864d53ebe5
Personal repository: git://git.code.sf.net/u/minh-chau/review


Impacted area           Impact y/n

  Docs                      n
  Build system              n
  RPM/packaging             n
  Configuration files       n
  Startup scripts           n
  SAF services              n
  OpenSAF services          n
  Core libraries            y
  Samples                   n
  Tests                     n
  Other                     n


Comments (indicate scope for each "y" above):
-
*** EXPLAIN/COMMENT THE PATCH SERIES HERE ***

revision 2cb2d135827d920155323a70a9587264e5c62ae2
Author: Minh Chau 
Date:   Fri, 8 Nov 2019 21:17:22 +1100

mds: Add backward compatibility mdstest for fragment [#3111]



revision 153b657d2873019160f31a3091fa660e4e469a9e
Author: Minh Chau 
Date:   Fri, 8 Nov 2019 21:08:18 +1100

mds: Refactor logging [#3111]

Since adding TipcPortId:ChangeState(), the patch refactors
logging to shorten the code.



revision 1ce0c74ca96fa028d02abe72932171e98c034342
Author: Minh Chau 
Date:   Fri, 8 Nov 2019 20:51:54 +1100

mds: Distinguish protocol version of fragment [#3111]

The legacy mds encodes the protocol version in either a non-fragmented
message or the first fragment only. Hence, mds is not able to determine
the protocol version from the subsequent fragments after the first
one.

The patch keeps the encoding of lengthcheck the same as in the legacy
mds version. Also, the subsequent fragments need to consult the
stateful portid to determine the protocol version, so that a
fragment will be skipped if it is sent from legacy mds, or have its
sequence inspected if it is sent from new mds.



Complete diffstat:
--
  src/mds/apitest/mdstipc_api.c|  83 +++--
  src/mds/mds_dt.h |   6 ++
  src/mds/mds_dt_tipc.c|  11 ++-
  src/mds/mds_tipc_fctrl_intf.cc   | 154 ++-
  src/mds/mds_tipc_fctrl_msg.cc|  86 +++---
  src/mds/mds_tipc_fctrl_msg.h |   5 ++
  src/mds/mds_tipc_fctrl_portid.cc |  94 +++-
  src/mds/mds_tipc_fctrl_portid.h  |   1 +
  8 files changed, 292 insertions(+), 148 deletions(-)


Testing Commands:
-
mdstest


Testing, Expected Results:
--
all tests pass

Conditions of Submission:
-
*** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC ***


Arch        Built     Started    Linux distro
---------------------------------------------
mips          n          n
mips64        n          n
x86           n          n
x86_64        y          y
powerpc       n          n
powerpc64     n          n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as threaded
 commits, or place in a public tree for a pull.

___ You have resent this content multiple times without a clear indication
 of what has changed between each re-send.

___ You have

Re: [devel] [PATCH 1/1] mds: fix sender take very long time to send all messages [#3119]

2019-11-12 Thread Minh Hon Chau


Hi Thuan,

Some comments inline.

Thanks

Minh

On 12/11/19 5:04 pm, thuan.tran wrote:

When overload happens, the sender waits for a chunkAck before it continues
sending more messages; it should then send a number of messages equal to the
chunkAck size of the receiver. If not, the receiver doesn't receive enough
messages to send a chunkAck and waits until the timer times out to send the
chunkAck to the sender. This loop makes the sender take a very long time to
send all messages.
---
  src/mds/mds_tipc_fctrl_portid.cc | 30 +++---
  1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_portid.cc b/src/mds/mds_tipc_fctrl_portid.cc
index 3704baddb..1fff4c855 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -190,6 +190,7 @@ uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t 
length,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
} else {
  ++sndwnd_.send_;
+sndwnd_.nacked_space_ += length;
[Minh] We haven't sent the msg out to wait for ack, thus nacked_space_ 
should not be increased

  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "QueData[mseq:%u, mfrag:%u, fseq:%u, len:%u], "
  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
@@ -444,32 +445,29 @@ void TipcPortId::ReceiveChunkAck(uint16_t fseq, uint16_t 
chksize) {
  // the nacked_space_ of sender
  uint64_t acked_bytes = sndqueue_.Erase(Seq16(fseq) - (chksize-1),
  Seq16(fseq));
+assert(sndwnd_.nacked_space_ >= acked_bytes);
  sndwnd_.nacked_space_ -= acked_bytes;
  
  // try to send a few pending msg

  DataMessage* msg = nullptr;
-uint64_t resend_bytes = 0;
-while (resend_bytes < acked_bytes) {
+uint16_t send_msg_cnt = 0;
+while (send_msg_cnt++ < chunk_size_) {
// find the lowest sequence unsent yet
msg = sndqueue_.FirstUnsent();
if (msg == nullptr) {
  break;
} else {
-if (resend_bytes < acked_bytes) {
if (Send(msg->msg_data_, msg->header_.msg_len_) == 
NCSCC_RC_SUCCESS) {
-sndwnd_.nacked_space_ += msg->header_.msg_len_;
[Minh] We now send it out and wait for acked, thus the nacked_space_ is 
increased here, so any reason moving the nacked_space_ from Queue() to here?

  msg->is_sent_ = true;
-resend_bytes += msg->header_.msg_len_;
  m_MDS_LOG_NOTIFY("FCTRL: [me] --> [node:%x, ref:%u], "
  "SndQData[fseq:%u, len:%u], "
  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
  id_.node, id_.ref,
  msg->header_.fseq_, msg->header_.msg_len_,
  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_);
+  } else {
+break;
}
-} else {
-  break;
-}
}
  }
  // no more unsent message, back to kEnabled
[Minh] Agree, the new strategy of resending with chunk_size_ is better
than with acked_bytes; it will increase the transmission rate and will
not depend on the timer.

@@ -502,26 +500,12 @@ void TipcPortId::ReceiveNack(uint32_t mseq, uint16_t 
mfrag,
  fseq);
  return;
}
-  if (state_ == State::kRcvBuffOverflow) {
-sndqueue_.MarkUnsentFrom(Seq16(fseq));
-if (Seq16(fseq) - sndwnd_.acked_ > 1) {
-  m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
-  "RcvNack[fseq:%u], "
-  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "], "
-  "queue[size:%" PRIu64 "], "
-  "Warning[Ignore Nack]",
-  id_.node, id_.ref, fseq,
-  sndwnd_.acked_.v(), sndwnd_.send_.v(), sndwnd_.nacked_space_,
-  sndqueue_.Size());
-  return;
-}
-  }
if (state_ != State::kRcvBuffOverflow) {
  state_ = State::kRcvBuffOverflow;
  m_MDS_LOG_NOTIFY("FCTRL: [node:%x, ref:%u] --> Overflow ",
  id_.node, id_.ref);
-sndqueue_.MarkUnsentFrom(Seq16(fseq));
}
+  sndqueue_.MarkUnsentFrom(Seq16(fseq));
[Minh] I have a doubt about this change in ReceiveNack(): every Nack
will now trigger a retransmission of the Nacked sequence even though we
are already in the kRcvBuffOverflow state. This will increase the
"unexpected retransmission" error rate. On reception of the 2nd-Nack,
3rd-Nack, ..., we have already moved into the kRcvBuffOverflow state, so
we don't need to resend for the 2nd-Nack, 3rd-Nack, ... as we already did
at the 1st-Nack. We only need to mark it as Unsent; the actual
retransmission for the 2nd-Nack, 3rd-Nack, ... is done in the loop in
ReceiveChunkAck(), as you have improved in this patch, which will keep
msgs in order at the receivers. So is there any reason for this change?

DataMessage* msg = sndqueue_.Find(Seq16(fseq));
if (msg != nullptr) {
  // Resend the msg found



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Unset flow control env var [#3109]

2019-10-31 Thread Minh Hon Chau


Hi Vu,

Please find comments below.

Thanks

Minh

On 31/10/19 6:15 pm, Nguyen Minh Vu wrote:

Hi Minh,

Ack with minor comments.

Regards, Vu

On 10/31/19 11:55 AM, Minh Chau wrote:

Patch unsets MDS_TIPC_FCTRL_ENABLED, MDS_TIPC_FCTRL_ACKTIMEOUT,
and MDS_TIPC_FCTRL_ACKSIZE to prevent child process inheritance.
---
  src/mds/mds_dt_tipc.c | 39 +--
  1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index e7a7b48..096e4ca 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -167,6 +167,8 @@ uint32_t mdtm_global_frag_num;
    const unsigned int MAX_RECV_THRESHOLD = 30;
  uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
+int gl_mds_fctrl_acksize = -1;
+int gl_mds_fctrl_ackto = -1;
[Vu] Should we declare these ones as static variables if they are only 
used in this file ?

[M]: Yes, make them static
    static bool get_tipc_port_id(int sock, struct tipc_portid* 
port_id) {

  struct sockaddr_tipc addr;
@@ -347,32 +349,49 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, 
uint32_t *mds_tipc_ref)

  if ((ptr = getenv("MDS_TIPC_FCTRL_ENABLED")) != NULL) {
  if (atoi(ptr) == 1) {
  gl_mds_pro_ver = MDS_PROT_FCTRL;
-    int ackto = -1;
-    int acksize = -1;
  if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) {
-    ackto = atoi(ptr);
-    if (ackto == 0) {
+    gl_mds_fctrl_ackto = atoi(ptr);
+    if (gl_mds_fctrl_ackto == 0) {
  syslog(LOG_ERR, "MDTM:TIPC Invalid "
  "MDS_TIPC_FCTRL_ACKTIMEOUT, using 
default value");

-    ackto = -1;
+    gl_mds_fctrl_ackto = -1;
  }
  }
  if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) {
-    acksize = atoi(ptr);
-    if (acksize == 0) {
+    gl_mds_fctrl_acksize = atoi(ptr);
+    if (gl_mds_fctrl_acksize == 0) {
  syslog(LOG_ERR, "MDTM:TIPC Invalid "
  "MDS_TIPC_FCTRL_ACKSIZE, using default 
value");

-    acksize = -1;
+    gl_mds_fctrl_acksize = -1;
  }
  }
-    mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval,
-    ackto, acksize, tipc_mcast_enabled);
+    /* unset the env var to prevent child process 
inheritance */

+    if (unsetenv("MDS_TIPC_FCTRL_ENABLED") != 0) {
+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ENABLED");

+    }
+    if (gl_mds_fctrl_ackto != -1 &&
+    unsetenv("MDS_TIPC_FCTRL_ACKTIMEOUT") != 0) {
+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKTIMEOUT");

+    }
+    if (gl_mds_fctrl_acksize != -1 &&
+    unsetenv("MDS_TIPC_FCTRL_ACKSIZE") != 0) {
+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKSIZE");

+    }
  } else {
+    gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
[Vu] This line may not be necessary, as the default value of
gl_mds_pro_ver is `MDS_PROT | MDS_VERSION`.
[M] The env var may be set to an invalid value by setenv() in the
scenario you suggested: Init/Finalize/Init with setenv(invalid value).

  syslog(LOG_ERR, "MDTM:TIPC Invalid value of"
  "MDS_TIPC_FCTRL_ENABLED");
  }
  }
  +    if (gl_mds_pro_ver == MDS_PROT_FCTRL) {
+    mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval,
+    gl_mds_fctrl_ackto, gl_mds_fctrl_acksize, 
tipc_mcast_enabled);

+    }
+
  /* Create a task to receive the events and data */
  if (mdtm_create_rcv_task(tipc_cb.hdle_mdtm) != NCSCC_RC_SUCCESS) {
  syslog(LOG_ERR,






___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mds: Unset flow control env var [#3109]

2019-10-30 Thread Minh Hon Chau


Hi Vu,

I think users will lose the value set in env var if users repeat 
Init/Finalize/Init. I have sent out V2.


Thanks

Minh

On 31/10/19 2:33 pm, Nguyen Minh Vu wrote:

Hi Minh,

Ack with one question.

What happens if user does following sequence:
1) Init service handle
--> Have this env variable set, then unset later on.
2) Finalize the handle

3) Init service handle
--> I am not sure if the previous unset has any effect on tipc flow
control from this point

e.g. has tipc flow control been disabled from previous unset?

Regards, Vu

On 10/31/19 5:32 AM, Minh Chau wrote:

Patch unsets MDS_TIPC_FCTRL_ENABLED, MDS_TIPC_FCTRL_ACKTIMEOUT,
and MDS_TIPC_FCTRL_ACKSIZE to prevent child process inheritance.
---
  src/mds/mds_dt_tipc.c | 13 +
  1 file changed, 13 insertions(+)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index e7a7b48..12b275d 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -367,6 +367,19 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

  }
  mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, 
optval,

  ackto, acksize, tipc_mcast_enabled);
+    /* unset the env var to prevent child process 
inheritance */

+    if (unsetenv("MDS_TIPC_FCTRL_ENABLED") != 0) {
+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ENABLED");

+    }
+    if (ackto != -1 && unsetenv("MDS_TIPC_FCTRL_ACKTIMEOUT") 
!= 0) {

+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKTIMEOUT");

+    }
+    if (acksize != -1 && unsetenv("MDS_TIPC_FCTRL_ACKSIZE") 
!= 0) {

+    syslog(LOG_ERR,
+    "MDTM:TIPC Failed to unset 
MDS_TIPC_FCTRL_ACKSIZE");

+    }
  } else {
  syslog(LOG_ERR, "MDTM:TIPC Invalid value of"
  "MDS_TIPC_FCTRL_ENABLED");






___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] dtm: rotate logtraces on demand [#3086]

2019-10-24 Thread Minh Hon Chau


Hi Vu,

ack from me.

Thanks

Minh

On 21/10/19 5:51 pm, Nguyen Minh Vu wrote:

Hi,

Any comments on this patch? I will push it by this week if no comments.

Regards, Vu

On 10/4/19 5:30 PM, Vu Minh Nguyen wrote:

Introducing a new option '--rotate' to rotate given logtrace stream(s).

This patch also cleans the code of LogServer::ExecuteCommand().
---
  src/base/log_writer.h   |   2 +-
  src/dtm/tools/osaflog.cc    |  25 ++-
  src/dtm/transport/log_server.cc | 125 +---
  src/dtm/transport/log_server.h  |  11 ++-
  4 files changed, 115 insertions(+), 48 deletions(-)

diff --git a/src/base/log_writer.h b/src/base/log_writer.h
index 0a03e9253..ab2bf32ae 100644
--- a/src/base/log_writer.h
+++ b/src/base/log_writer.h
@@ -45,13 +45,13 @@ class LogWriter {
    void Write(size_t size);
    void Write(const char* bytes, size_t size);
    void Flush();
+  void RotateLog();
    void SetLogFile(const std::string& log_file) { log_file_ = 
log_file; }

     private:
    constexpr static const size_t kBufferSize = 128 * size_t{1024};
    void Open();
    void Close();
-  void RotateLog();
      std::string log_file(size_t backup) const;
  diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc
index 4e0956aa2..f6fa16801 100644
--- a/src/dtm/tools/osaflog.cc
+++ b/src/dtm/tools/osaflog.cc
@@ -54,6 +54,7 @@ base::UnixServerSocket* CreateSocket();
  uint64_t Random64Bits(uint64_t seed);
  bool PrettyPrint(const std::string& log_stream);
  bool Delete(const std::string& log_stream);
+bool Rotate(const std::string& log_stream);
  std::list OpenLogFiles(const std::string& log_stream);
  std::string PathName(const std::string& log_stream, int suffix);
  uint64_t GetInode(int fd);
@@ -70,6 +71,7 @@ int main(int argc, char** argv) {
    {"flush", no_argument, 0, 'f'},
    {"print", no_argument, nullptr, 
'p'},
    {"delete", no_argument, nullptr, 
'd'},
+  {"rotate", no_argument, nullptr, 
'r'},
    {"extract-trace", 
required_argument, 0, 'e'},
    {"max-idle", required_argument, 
0, 'i'},

    {0, 0, 0, 0}};
@@ -83,12 +85,14 @@ int main(int argc, char** argv) {
    bool flush_result =  true;
    bool print_result =  true;
    bool delete_result =  true;
+  bool rotate_result = true;
    bool max_file_size_result = true;
    bool number_of_backups_result = true;
    bool max_idle_result = true;
    bool flush_set = false;
    bool pretty_print_set = false;
    bool delete_set = false;
+  bool rotate_set = false;
    bool max_file_size_set = false;
    bool max_backups_set = false;
    bool max_idle_set = false;
@@ -101,7 +105,7 @@ int main(int argc, char** argv) {
  exit(EXIT_FAILURE);
    }
  -  while ((option = getopt_long(argc, argv, "m:b:p:f:e:",
+  while ((option = getopt_long(argc, argv, "m:b:p:f:e:i:r",
 long_options, _index)) != -1) {
  switch (option) {
    case 'p':
@@ -114,6 +118,9 @@ int main(int argc, char** argv) {
    case 'f':
  flush_set = true;
  break;
+  case 'r':
+    rotate_set = true;
+    break;
    case 'm':
  max_file_size = base::StrToUint64(optarg,
_file_size_set);
@@ -164,12 +171,12 @@ int main(int argc, char** argv) {
      if (thread_trace) exit(ExtractTrace(input_core, output_trace));
  -  if (argc > optind && !pretty_print_set && !delete_set) {
+  if (argc > optind && !pretty_print_set && !delete_set && 
!rotate_set) {

  pretty_print_set = true;
  flush_set = true;
    }
  -  if ((argc <= optind && (pretty_print_set || delete_set)) ||
+  if ((argc <= optind && (pretty_print_set || delete_set || 
rotate_set)) ||

    (pretty_print_set && delete_set)) {
   PrintUsage(argv[0]);
   exit(EXIT_FAILURE);
@@ -188,6 +195,11 @@ int main(int argc, char** argv) {
    delete_result = Delete(argv[optind++]);
  }
    }
+  if (rotate_set == true) {
+    while (rotate_result && optind < argc) {
+  rotate_result = Rotate(argv[optind++]);
+    }
+  }
    if (max_backups_set == true) {
   number_of_backups_result = NoOfBackupFiles(max_backups);
    }
@@ -197,7 +209,7 @@ int main(int argc, char** argv) {
    if (max_idle_set == true) {
  max_idle_result = SetMaxIdleTime(max_idle);
    }
-  if (flush_result && print_result && max_file_size_result &&
+  if (flush_result && print_result && max_file_size_result && 
rotate_result &&

    delete_result && number_of_backups_result && max_idle_result)
   exit(EXIT_SUCCESS);
    exit(EXIT_FAILURE);
@@ -224,6 +236,7 @@ void PrintUsage(const char* program_name) {
    "--delete  Delete the specified LOGSTREAM(s) 
by\n"
    "  removing allocated resources in 
the log\n"
    "  server. Does not

Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]

2019-10-22 Thread Minh Hon Chau

Hi Thuan,

Please see comment inline

Thanks

Minh

On 23/10/19 3:32 pm, Tran Thuan wrote:

Hi Minh,

Thanks for comments. See my response inline.

Btw, I am preparing to send out new patch, I think I found an issue in 
current patch.

Best Regards,

ThuanTr

-Original Message-
From: Minh Hon Chau 
Sent: Wednesday, October 23, 2019 5:52 AM
To: Tran Thuan ; 'Nguyen Minh Vu' 
; hans.nordeb...@ericsson.com; 
gary@dektech.com.au

Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already 
down to send response message [#3102]

Hi Thuan,

I wonder the patch would work in the same reproduced steps if the both

adests have subscribed each other more than 2 services. The svc_cnt will

be greater than 1 until it is the last service down event. I think

that's why mds has the database @subtn_results, in which each item is an

adest associated with a service id separately.

[Thuan] We can understand that adest still alive, then go with origin 
flow (wait 1.5s).

But can a process send SNDRSP then mds unregister? I think it cannot, 
because it’s in SNDRSP (blocking)

[M] Not unregister, it can be unsubscribe. Or do you mean a process can 
not send two SNDRSP at the same time on 2 different subscribed services?

The scenario of this ticket happens for the process terminated/crash.

[M] Yes my doubt is in the context of this ticket - terminated/crash - 
you would get 2 service down event I think

[M] I don't think adding a new database at the global scope for this 
specific case is a good idea, if we can reuse the existing database. Can 
you try to use MDS_SUBSCRIPTION_INFO, add a flag or something similar to 
indicate which case mds should wait 1.5 sec. It would isolate the bug 
fix in the scope of this problem.

The problem originally resides at the services code e.g ntf, imm...

where the threads structure between mds receiving thread and main thread

cause a race condition. Thus the service sends a message with a death

adest which is removed from mds database, that confuses mds and hit 1.5

secs wait time.

If I read the code correctly, the 1.5 wait time is for another case, it

gives another chance to wait 1.5 when the subscription result is empty

in @subtn_results because the service up has not arrived yet.

[Thuan] Yes, it will give a chance if adest not yet UP any.

My patch still keep that chance as origin code.

But I think I need reduce timeout for adest down timer, I am verifying 
this change.

mds  subscribe >

mds  sends message A x

mds wait 1.5 sec

mds <--- service up 

mds  sends message A >

So the 1.5 sec time is for early phase of waiting service up.

    } else if (sub_info->tmr_flag != true) {

        if ((MDS_SENDTYPE_RSP == req->i_sendtype) ||

            (MDS_SENDTYPE_RRSP == req->i_sendtype)) {

            time_wait = true;

            m_MDS_LOG_INFO(

                "MDS_SND_RCV:Disc queue: Subscr exists no timer

running: Waiting for some time\n");

-> I think at this line, it means: the SUBSCRIPTION_TMR has timeout, and

mds is sending RSP/RRSP which means mds should have received the

*request* message (?), so mds wants to wait for another 1.5 second for

service_up to create the subscription result in database.

The problem in this ticket hit 1.5 because the service down has already

come and mds removed the subscription result item, now the ntf, imm...

sends msg with a death adest, and mds now it thinks it is waiting for a

service up to come as at the early phase, so it waits. Both two

scenarios can be distinguished themselves to avoid to wait 1.5 secs for

the latter case I think.

Thanks

Minh

On 22/10/19 9:50 pm, Tran Thuan wrote:

> Hi Vu,

>

> Thanks for additional comments.

> I reply your concerns inline below.

>

> Best Regards,

> ThuanTr

>

> -Original Message-

> From: Nguyen Minh Vu <mailto:vu.m.ngu...@dektech.com.au>>

> Sent: Tuesday, October 22, 2019 5:28 PM

> To: thuan.tran <mailto:thuan.t...@dektech.com.au>>; 'Minh Chau' 
mailto:minh.c...@dektech.com.au>>; 
hans.nordeb...@ericsson.com <mailto:hans.nordeb...@ericsson.com>; 
gary@dektech.com.au <mailto:gary@dektech.com.au>

> Cc: opensaf-devel@lists.sourceforge.net 
<mailto:opensaf-devel@lists.sourceforge.net>

> Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest 
already down to send response message [#3102]

>

> Hi Thuan,

>

> I have additional comments below.

>

> Regards, Vu

>

> On 10/22/19 7:14 AM, thuan.tran wrote:

>> - When sending response message which Adest not exist (already down)

>> current MDS try to wait for 1.5 seconds before conclude no route to

>> send response message.

>>

>> - There are 2 scenarios may have:

>> UP -> DOWN -> receive SNDRSP -> response timeout after

Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]

2019-10-22 Thread Minh Hon Chau


Hi Thuan,

I wonder whether the patch would work with the same reproduction steps if
both adests have subscribed to each other on more than 2 services. The
svc_cnt will be greater than 1 until the last service down event arrives.
I think that's why mds has the database @subtn_results, in which each item
is an adest associated with a single service id.


The problem originally resides in the services' code (e.g. ntf, imm, ...),
where the thread structure between the mds receiving thread and the main
thread causes a race condition. Thus the service sends a message to a dead
adest which has been removed from the mds database; that confuses mds and
hits the 1.5 sec wait time.


If I read the code correctly, the 1.5 wait time is for another case, it 
gives another chance to wait 1.5 when the subscription result is empty 
in @subtn_results because the service up has not arrived yet.


mds ---- subscribe ---->

mds ---- sends message A ----x

mds wait 1.5 sec

mds <---- service up ----

mds ---- sends message A ---->

So the 1.5 sec time is for early phase of waiting service up.

    } else if (sub_info->tmr_flag != true) {
        if ((MDS_SENDTYPE_RSP == req->i_sendtype) ||
            (MDS_SENDTYPE_RRSP == req->i_sendtype)) {
            time_wait = true;
            m_MDS_LOG_INFO(
                "MDS_SND_RCV:Disc queue: Subscr exists no timer 
running: Waiting for some time\n");


-> I think at this line, it means: the SUBSCRIPTION_TMR has timeout, and 
mds is sending RSP/RRSP which means mds should have received the 
*request* message (?), so mds wants to wait for another 1.5 second for 
service_up to create the subscription result in database.


The problem in this ticket hits the 1.5 sec wait because the service down
has already arrived and mds has removed the subscription result item; now
ntf, imm, ... send a msg to a dead adest, and mds thinks it is waiting for
a service up to arrive, as in the early phase, so it waits. I think the
two scenarios can be distinguished from each other, to avoid waiting
1.5 secs in the latter case.


Thanks

Minh

On 22/10/19 9:50 pm, Tran Thuan wrote:

Hi Vu,

Thanks for additional comments.
I reply your concerns inline below.

Best Regards,
ThuanTr

-Original Message-
From: Nguyen Minh Vu 
Sent: Tuesday, October 22, 2019 5:28 PM
To: thuan.tran ; 'Minh Chau' 
; hans.nordeb...@ericsson.com; gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to 
send response message [#3102]

Hi Thuan,

I have additional comments below.

Regards, Vu

On 10/22/19 7:14 AM, thuan.tran wrote:

- When sending a response message to an Adest that does not exist (already
down), MDS currently waits for 1.5 seconds before concluding that there is
no route to send the response message.

- There are 2 possible scenarios:
UP -> DOWN -> receive SNDRSP -> response timeout after 1.5s
UP -> receive SNDRSP -> DOWN -> response timeout after 1.5s

- With this change, MDS will not waste 1.5s, which can cause trouble
for higher layer services, e.g. ntf, imm, etc.
---
   src/mds/mds_c_api.c | 70 -
   src/mds/mds_c_sndrcv.c  | 52 --
   src/mds/mds_core.h  | 25 +--
   src/mds/mds_dt2c.h  |  2 +-
   src/mds/mds_dt_common.c | 22 -
   5 files changed, 162 insertions(+), 9 deletions(-)

diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c
index 132555b8e..5dd30c536 100644
--- a/src/mds/mds_c_api.c
+++ b/src/mds/mds_c_api.c
@@ -1900,6 +1900,32 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID 
svc_id, V_DEST_RL role,
   
   	/*** Validation for SCOPE **/
   
+	if (adest != m_MDS_GET_ADEST) {

+   MDS_ADEST_INFO *adest_info =
+   (MDS_ADEST_INFO *)ncs_patricia_tree_get(
+   _mds_mcm_cb->adest_list,
+   (uint8_t *));
+   if (!adest_info) {
+   /* Add adest to adest list */
+   adest_info = m_MMGR_ALLOC_ADEST_INFO;
+   memset(adest_info, 0, sizeof(MDS_ADEST_INFO));
+   adest_info->adest = adest;
+   adest_info->node.key_info =
+   (uint8_t *)_info->adest;
+   adest_info->svc_cnt = 1;
+   adest_info->tmr_start = false;
+   ncs_patricia_tree_add(
+   _mds_mcm_cb->adest_list,
+   (NCS_PATRICIA_NODE *)adest_info);
+   m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64
+   " svc_cnt=%u", adest, adest_info->svc_cnt);
+   } else {
+   adest_info->svc_cnt++;
+   m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64
+   " svc_cnt=%u", adest, adest_info->svc_cnt);
+   }
+   }
+
status =

Re: [devel] [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102]

2019-10-21 Thread Minh Hon Chau


Hi Thuan,

1- Can you point out where is the mds code that waits for 1.5 seconds, 
is it hard coded within 1.5 secs?


2- Is existing db (mds_c_db.c) in mds not enough so we need to introduce 
adest_list? I think mds must have a memory of adest, perhaps in another 
implicit form, so mds can validate an adest, a svc_id associated with adest.


thanks

Minh

On 22/10/19 11:14 am, thuan.tran wrote:

- When sending a response message to an Adest that does not exist (already
down), MDS currently waits for 1.5 seconds before concluding that there is
no route to send the response message.

- There are 2 possible scenarios:
UP -> DOWN -> receive SNDRSP -> response timeout after 1.5s
UP -> receive SNDRSP -> DOWN -> response timeout after 1.5s

- With this change, MDS will not waste 1.5s, which can cause trouble
for higher layer services, e.g. ntf, imm, etc.
---
  src/mds/mds_c_api.c | 70 -
  src/mds/mds_c_sndrcv.c  | 52 --
  src/mds/mds_core.h  | 25 +--
  src/mds/mds_dt2c.h  |  2 +-
  src/mds/mds_dt_common.c | 22 -
  5 files changed, 162 insertions(+), 9 deletions(-)

diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c
index 132555b8e..5dd30c536 100644
--- a/src/mds/mds_c_api.c
+++ b/src/mds/mds_c_api.c
@@ -1900,6 +1900,32 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID 
svc_id, V_DEST_RL role,
  
  	/*** Validation for SCOPE **/
  
+	if (adest != m_MDS_GET_ADEST) {

+   MDS_ADEST_INFO *adest_info =
+   (MDS_ADEST_INFO *)ncs_patricia_tree_get(
+   _mds_mcm_cb->adest_list,
+   (uint8_t *));
+   if (!adest_info) {
+   /* Add adest to adest list */
+   adest_info = m_MMGR_ALLOC_ADEST_INFO;
+   memset(adest_info, 0, sizeof(MDS_ADEST_INFO));
+   adest_info->adest = adest;
+   adest_info->node.key_info =
+   (uint8_t *)_info->adest;
+   adest_info->svc_cnt = 1;
+   adest_info->tmr_start = false;
+   ncs_patricia_tree_add(
+   _mds_mcm_cb->adest_list,
+   (NCS_PATRICIA_NODE *)adest_info);
+   m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64
+   " svc_cnt=%u", adest, adest_info->svc_cnt);
+   } else {
+   adest_info->svc_cnt++;
+   m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64
+   " svc_cnt=%u", adest, adest_info->svc_cnt);
+   }
+   }
+
status = mds_get_subtn_res_tbl_by_adest(local_svc_hdl, svc_id, vdest_id,
adest, _subtn_result_info);
  
@@ -3571,6 +3597,24 @@ uint32_t mds_mcm_svc_down(PW_ENV_ID pwe_id, MDS_SVC_ID svc_id, V_DEST_RL role,

/* Discard : Getting down before getting up */
} else { /* Entry exist in subscription result table */
  
+		/* If adest exist and no sndrsp, start a timer */

+   MDS_ADEST_INFO *adest_info =
+   (MDS_ADEST_INFO *)ncs_patricia_tree_get(
+   _mds_mcm_cb->adest_list,
+   (uint8_t *));
+   if (adest_info) {
+   adest_info->svc_cnt--;
+   if (adest_info->svc_cnt == 0 &&
+   adest_info->sndrsp_cnt == 0) {
+   m_MDS_LOG_INFO("MCM:API: Adest=%" PRIu64
+   " down timer start", adest);
+   if (adest_info->tmr_start == false) {
+   adest_info->tmr_start = true;
+   start_mds_down_tmr(adest, svc_id);
+   }
+   }
+   }
+
if (vdest_id == m_VDEST_ID_FOR_ADEST_ENTRY) {
status = mds_subtn_res_tbl_del(
local_svc_hdl, svc_id, vdest_id, adest,
@@ -4956,6 +5000,17 @@ uint32_t mds_mcm_init(void)
return NCSCC_RC_FAILURE;
}
  
+	/* ADEST TREE */

+   memset(_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS));
+   pat_tree_params.key_size = sizeof(MDS_DEST);
+   if (NCSCC_RC_SUCCESS !=
+   ncs_patricia_tree_init(_mds_mcm_cb->adest_list,
+  _tree_params)) {
+   m_MDS_LOG_ERR(
+   "MCM:API: patricia_tree_init: adest :failure, L 
mds_mcm_init");
+   return NCSCC_RC_FAILURE;
+   }
+
/* SERVICE TREE */
memset(_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS));
pat_tree_params.key_size = sizeof(MDS_SVC_HDL);
@@ -4966,7 +5021,12 @@ uint32_t mds_mcm_init(void)
if

Re: [devel] [PATCH 1/1] mds: Disable mds flow control for mds broadcast/multicast message [#3101]

2019-10-20 Thread Minh Hon Chau


Hi Thuan,

The patch is acked and I pushed it.

The commit message may cause you a misunderstanding, but in this context
it does not mention anything regarding configuration, so I hope it's ok.


Another comment inline.

Thanks

Minh

On 21/10/19 1:56 pm, Tran Thuan wrote:

Hi Minh,

I suggest the following commit message:
mds: skip flow control for bcast/mcast message if tipc multicast
enabled.
because "disable mds flow control" could be misunderstood as overriding the
MDS_TIPC_FCTRL_ENABLED configuration.

And another comment with [Thuan] inline. Thanks.

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Thursday, October 17, 2019 10:00 AM
To: hans.nordeb...@ericsson.com; thuan.t...@dektech.com.au;
gary@dektech.com.au; vu.m.ngu...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau

Subject: [PATCH 1/1] mds: Disable mds flow control for mds
broadcast/multicast message [#3101]

The mds flow control has been disabled for broadcast/multicast unfragmented
messages if tipc multicast is enabled. This patch revisits this and extends
it to fragmented messages.
---
  src/mds/mds_tipc_fctrl_intf.cc   | 47

  src/mds/mds_tipc_fctrl_msg.h | 11 +++---
  src/mds/mds_tipc_fctrl_portid.cc | 47
++--
  src/mds/mds_tipc_fctrl_portid.h  |  3 ++-
  4 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index b803bfe..fe3dbd5 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -133,7 +133,7 @@ uint32_t process_flow_event(const Event& evt) {
kChunkAckSize, sock_buf_size);
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
-evt.fseq_, evt.svc_id_);
+evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
  } else if (evt.type_ == Event::Type::kEvtRcvIntro) {
portid = new TipcPortId(evt.id_, data_sock_fd,
kChunkAckSize, sock_buf_size); @@ -147,7 +147,7 @@ uint32_t
process_flow_event(const Event& evt) {
} else {
  if (evt.type_ == Event::Type::kEvtRcvData) {
rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
-  evt.fseq_, evt.svc_id_);
+  evt.fseq_, evt.svc_id_, evt.snd_type_, is_mcast_enabled);
  }
  if (evt.type_ == Event::Type::kEvtRcvChunkAck) {
portid->ReceiveChunkAck(evt.fseq_, evt.chunk_size_); @@ -430,6 +430,7
@@ uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer, uint16_t len,
  
HeaderMessage header;

header.Decode(buffer);
+  Event* pevt = nullptr;
// if mds support flow control
if ((header.pro_ver_ & MDS_PROT_VER_MASK) == MDS_PROT_FCTRL) {
  if (header.pro_id_ == MDS_PROT_FCTRL_ID) { @@ -438,9 +439,10 @@
uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer, uint16_t len,
  ChunkAck ack;
  ack.Decode(buffer);
  // send to the event thread
-if (m_NCS_IPC_SEND(_events,
-new Event(Event::Type::kEvtSendChunkAck, id, ack.svc_id_,
-header.mseq_, header.mfrag_, ack.acked_fseq_,
ack.chunk_size_),
+pevt = new Event(Event::Type::kEvtSendChunkAck, id, ack.svc_id_,
+header.mseq_, header.mfrag_, ack.acked_fseq_);
+pevt->chunk_size_ = ack.chunk_size_;
+if (m_NCS_IPC_SEND(_events, pevt,
  NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events,
Error[%s]",
strerror(errno));
@@ -453,9 +455,9 @@ uint32_t mds_tipc_fctrl_drop_data(uint8_t *buffer,
uint16_t len,
DataMessage data;
data.Decode(buffer);
// send to the event thread
-  if (m_NCS_IPC_SEND(_events,
-  new Event(Event::Type::kEvtDropData, id, data.svc_id_,
-  header.mseq_, header.mfrag_, header.fseq_),
+  pevt = new Event(Event::Type::kEvtDropData, id, data.svc_id_,
+  header.mseq_, header.mfrag_, header.fseq_);
+  if (m_NCS_IPC_SEND(_events, pevt,
NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
  m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]",
  strerror(errno));
@@ -474,6 +476,7 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer,
uint16_t len,
  
HeaderMessage header;

header.Decode(buffer);
+  Event* pevt = nullptr;
// if mds support flow control
if ((header.pro_ver_ & MDS_PROT_VER_MASK) == MDS_PROT_FCTRL) {
  if (header.pro_id_ == MDS_PROT_FCTRL_ID) { @@ -482,9 +485,10 @@
uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t len,
  ChunkAck ack;
  ack.Decode(buffer);
  // send to the event thread
-if (m_NCS_IPC_SEND(_events,
-new Event(Event::Type::kEvtRcvChunkAck, id, ack.svc_id_,
-header.mseq_, header.mfrag_, ack.acked_fseq_,
ack.chunk_size_),
+pevt = new Event(Event::Type::kEvtRcvChunkAck,

Re: [devel] [PATCH 1/1] mds: Add Intro message [#3090]

2019-10-14 Thread Minh Hon Chau


Hi,

The counters reset will be removed in ReceiveIntro().

Thanks

Minh


On 15/10/19 12:50 pm, Minh Chau wrote:

mds relies on a data message sent from the peer to determine
whether MDS_TIPC_FCTRL_ENABLED is set. The data message
may not be sent right after the TIPC_PUBLISHED event, which can
cause the tx probation timer to time out.

This patch adds an Intro message, which is sent right after
TIPC_PUBLISHED to help mds determine earlier whether flow control
is supported at the peer.
---
  src/mds/mds_main.c   |  2 +-
  src/mds/mds_tipc_fctrl_intf.cc   | 27 ++
  src/mds/mds_tipc_fctrl_msg.cc| 11 +
  src/mds/mds_tipc_fctrl_msg.h | 18 +++
  src/mds/mds_tipc_fctrl_portid.cc | 49 ++--
  src/mds/mds_tipc_fctrl_portid.h  |  2 ++
  6 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c
index 8c9b1f1..c7d2f7b 100644
--- a/src/mds/mds_main.c
+++ b/src/mds/mds_main.c
@@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req)
if (tipc_mcast_enabled != false)
tipc_mcast_enabled = true;
  
-m_MDS_LOG_DBG(

+   m_MDS_LOG_NOTIFY(
"MDS: TIPC_MCAST_ENABLED: %d  Set argument 
\n",
tipc_mcast_enabled);
}
diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index 6271890..b803bfe 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -39,6 +39,7 @@ using mds::DataMessage;
  using mds::ChunkAck;
  using mds::HeaderMessage;
  using mds::Nack;
+using mds::Intro;
  
  namespace {

  // flow control enabled/disabled
@@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) {
uint32_t rc = NCSCC_RC_SUCCESS;
TipcPortId *portid = portid_lookup(evt.id_);
if (portid == nullptr) {
+// the null portid normally should not happen; however because the
+// tipc_cb.Dsock and tipc_cb.BSRsock are separated; the data message
+// sent from BSRsock may come before reception of TIPC_PUBLISHED
  if (evt.type_ == Event::Type::kEvtRcvData) {
portid = new TipcPortId(evt.id_, data_sock_fd,
kChunkAckSize, sock_buf_size);
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
  evt.fseq_, evt.svc_id_);
+} else if (evt.type_ == Event::Type::kEvtRcvIntro) {
+  portid = new TipcPortId(evt.id_, data_sock_fd,
+  kChunkAckSize, sock_buf_size);
+  portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
+  portid->ReceiveIntro();
  } else {
m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
"RcvEvt[evt:%d], Error[PortId not found]",
@@ -151,6 +160,9 @@ uint32_t process_flow_event(const Event& evt) {
portid->ReceiveNack(evt.mseq_, evt.mfrag_,
evt.fseq_);
  }
+if (evt.type_ == Event::Type::kEvtRcvIntro) {
+  portid->ReceiveIntro();
+}
}
return rc;
  }
@@ -489,6 +501,16 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t 
len,
m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]",
strerror(errno));
  }
+  } else if (header.msg_type_ == Intro::kIntroMsgType) {
+// no need to decode intro message
+// the decoding intro message type is done in header decoding
+// send to the event thread
+if (m_NCS_IPC_SEND(_events,
+new Event(Event::Type::kEvtRcvIntro, id, 0, 0, 0, 0),
+NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
+  m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events, Error[%s]",
+  strerror(errno));
+}
} else {
  m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
  "[msg_type:%u], Error[not supported message type]",
@@ -516,6 +538,11 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t 
len,
portid_map_mutex.unlock();
return rc;
  }
+  } else {
+m_MDS_LOG_DBG("FCTRL: [me] <-- [node:%x, ref:%u], "
+"Receive non-flow-control data message, "
+"header.pro_ver:%u",
+id.node, id.ref, header.pro_ver_);
}
return NCSCC_RC_SUCCESS;
  }
diff --git a/src/mds/mds_tipc_fctrl_msg.cc b/src/mds/mds_tipc_fctrl_msg.cc
index 932120f..180dcb6 100644
--- a/src/mds/mds_tipc_fctrl_msg.cc
+++ b/src/mds/mds_tipc_fctrl_msg.cc
@@ -178,4 +178,15 @@ void Nack::Decode(uint8_t *msg) {
nacked_fseq_ = ncs_decode_16bit();
  }
  
+

+void Intro::Encode(uint8_t *msg) {
+  uint8_t *ptr;
+  // encode protocol identifier
+  ptr = [Intro::FieldIndex::kProtocolIdentifier];
+  ncs_encode_32bit(, MDS_PROT_FCTRL_ID);
+  // encode message type
+  ptr = [Intro::FieldIndex::kFlowControlMessageType];
+  ncs_encode_8bit(, kIntroMsgType);
+}
+
  }  // end

Re: [devel] [PATCH 1/1] mds: Add Reset message [#3090]

2019-10-14 Thread Minh Hon Chau


Hi Thuan,

I can rename it as "Intro" message, then the rcvwnd counter shall be 
removed.


This new message cannot replace the tx prob timer. This new message is 
to speed up the determination of flow control at the peer side rather 
than relying on an mds data message. It is needed for the flow control 
sender to 'talk' with the non-flow-control receiver who will not send any ack back.


Thanks,

Minh

On 14/10/19 7:06 pm, Tran Thuan wrote:

Hi bro.Minh,

Thanks for explanation.
I think the "reset" message should be rename to "introduce" message.
Another question: with this fix, will tx probation timer become redundant or 
still useful in somehow?

Best Regards,
ThuanTr

-----Original Message-
From: Minh Hon Chau 
Sent: Monday, October 14, 2019 1:01 PM
To: Tran Thuan ; hans.nordeb...@ericsson.com; 
gary@dektech.com.au; vu.m.ngu...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: Add Reset message [#3090]

Hi Thuan,

If the chunkack is configured to be sent after a few data messages, then the 
sender does not get any chunkack for the first message from the receiver until 
the chunkack timeout (which is also configurable to a somewhat larger value). 
The probation timer would then time out at the sender.

The rcvwnd.acked_ will be fixed.

Thanks

Minh

On 14/10/19 4:39 pm, Tran Thuan wrote:

Hi bro.Minh,

- In my understanding, tx probation timer only start when sender send
first message.
Then sender relies on chunkAck to know receiver support MDS FCTRL or
timeout as not support.
But from what you describe, sender got tx probation timer timeout
before sending first message?
Or after sending first message but sender cannot get any chunkAck somehow?
I am confused this point. Could you help explain?

- About the code, mistake set '0' twice for .acked_ in
TipcPortId::ReceiveReset()

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Friday, October 11, 2019 10:52 AM
To: hans.nordeb...@ericsson.com; gary@dektech.com.au;
vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau

Subject: [PATCH 1/1] mds: Add Reset message [#3090]

mds relies on data message sent from the peer to determine whether the
MDS_TIPC_FCTRL_ENABLED is set. The data message may not be sent right
after TIPC_PUBLISHED event, which can cause the tx probation timer timeout.

This patch adds a Reset message, which is sent right after the
TIPC_PUBLISHED event to help mds determine earlier whether flow control
is supported at the peer.
---
   src/mds/mds_main.c   |  2 +-
   src/mds/mds_tipc_fctrl_intf.cc   | 27 ++
   src/mds/mds_tipc_fctrl_msg.cc| 11 +
   src/mds/mds_tipc_fctrl_msg.h | 18 +++
   src/mds/mds_tipc_fctrl_portid.cc | 49
++--
   src/mds/mds_tipc_fctrl_portid.h  |  2 ++
   6 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c index
8c9b1f1..c7d2f7b
100644
--- a/src/mds/mds_main.c
+++ b/src/mds/mds_main.c
@@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req)
if (tipc_mcast_enabled != false)
tipc_mcast_enabled = true;
   
-m_MDS_LOG_DBG(

+   m_MDS_LOG_NOTIFY(
"MDS: TIPC_MCAST_ENABLED: %d  Set argument 
\n",
tipc_mcast_enabled);
}
diff --git a/src/mds/mds_tipc_fctrl_intf.cc
b/src/mds/mds_tipc_fctrl_intf.cc index 6271890..e8c9121 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -39,6 +39,7 @@ using mds::DataMessage;  using mds::ChunkAck;  using
mds::HeaderMessage;  using mds::Nack;
+using mds::Reset;
   
   namespace {

   // flow control enabled/disabled
@@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) {
 uint32_t rc = NCSCC_RC_SUCCESS;
 TipcPortId *portid = portid_lookup(evt.id_);
 if (portid == nullptr) {
+// the null portid normally should not happen; however because the
+// tipc_cb.Dsock and tipc_cb.BSRsock are separated; the data message
+// sent from BSRsock may come before reception of TIPC_PUBLISHED
   if (evt.type_ == Event::Type::kEvtRcvData) {
 portid = new TipcPortId(evt.id_, data_sock_fd,
 kChunkAckSize, sock_buf_size);
 portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
 rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
   evt.fseq_, evt.svc_id_);
+} else if (evt.type_ == Event::Type::kEvtRcvReset) {
+  portid = new TipcPortId(evt.id_, data_sock_fd,
+  kChunkAckSize, sock_buf_size);
+  portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
+  portid->ReceiveReset();
   } else {
 m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
 "RcvEvt[evt:%d], Error

Re: [devel] [PATCH 1/1] mds: Add Reset message [#3090]

2019-10-14 Thread Minh Hon Chau


Hi Thuan,

If the chunkack is configured to be sent after a few data messages, then 
the sender does not get any chunkack for the first message from the 
receiver until the chunkack timeout (which is also configurable to a 
somewhat larger value). The probation timer would then time out at the sender.


The rcvwnd.acked_ will be fixed.

Thanks

Minh

On 14/10/19 4:39 pm, Tran Thuan wrote:

Hi bro.Minh,

- In my understanding, tx probation timer only start when sender send first
message.
Then sender relies on chunkAck to know receiver support MDS FCTRL or timeout
as not support.
But from what you describe, sender got tx probation timer timeout before
sending first message?
Or after sending first message but sender cannot get any chunkAck somehow?
I am confused this point. Could you help explain?

- About the code, mistake set '0' twice for .acked_ in
TipcPortId::ReceiveReset()

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Friday, October 11, 2019 10:52 AM
To: hans.nordeb...@ericsson.com; gary@dektech.com.au;
vu.m.ngu...@dektech.com.au; thuan.t...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau

Subject: [PATCH 1/1] mds: Add Reset message [#3090]

mds relies on data message sent from the peer to determine whether the
MDS_TIPC_FCTRL_ENABLED is set. The data message may not be sent right after
TIPC_PUBLISHED event, which can cause the tx probation timer timeout.

This patch adds a Reset message, which is sent right after the TIPC_PUBLISHED
event to help mds determine earlier whether flow control is supported at the peer.
---
  src/mds/mds_main.c   |  2 +-
  src/mds/mds_tipc_fctrl_intf.cc   | 27 ++
  src/mds/mds_tipc_fctrl_msg.cc| 11 +
  src/mds/mds_tipc_fctrl_msg.h | 18 +++
  src/mds/mds_tipc_fctrl_portid.cc | 49
++--
  src/mds/mds_tipc_fctrl_portid.h  |  2 ++
  6 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/src/mds/mds_main.c b/src/mds/mds_main.c index 8c9b1f1..c7d2f7b
100644
--- a/src/mds/mds_main.c
+++ b/src/mds/mds_main.c
@@ -408,7 +408,7 @@ uint32_t mds_lib_req(NCS_LIB_REQ_INFO *req)
if (tipc_mcast_enabled != false)
tipc_mcast_enabled = true;
  
-m_MDS_LOG_DBG(

+   m_MDS_LOG_NOTIFY(
"MDS: TIPC_MCAST_ENABLED: %d  Set
argument \n",
tipc_mcast_enabled);
}
diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index 6271890..e8c9121 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -39,6 +39,7 @@ using mds::DataMessage;  using mds::ChunkAck;  using
mds::HeaderMessage;  using mds::Nack;
+using mds::Reset;
  
  namespace {

  // flow control enabled/disabled
@@ -124,12 +125,20 @@ uint32_t process_flow_event(const Event& evt) {
uint32_t rc = NCSCC_RC_SUCCESS;
TipcPortId *portid = portid_lookup(evt.id_);
if (portid == nullptr) {
+// the null portid normally should not happen; however because the
+// tipc_cb.Dsock and tipc_cb.BSRsock are separated; the data message
+// sent from BSRsock may come before reception of TIPC_PUBLISHED
  if (evt.type_ == Event::Type::kEvtRcvData) {
portid = new TipcPortId(evt.id_, data_sock_fd,
kChunkAckSize, sock_buf_size);
portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
rc = portid->ReceiveData(evt.mseq_, evt.mfrag_,
  evt.fseq_, evt.svc_id_);
+} else if (evt.type_ == Event::Type::kEvtRcvReset) {
+  portid = new TipcPortId(evt.id_, data_sock_fd,
+  kChunkAckSize, sock_buf_size);
+  portid_map[TipcPortId::GetUniqueId(evt.id_)] = portid;
+  portid->ReceiveReset();
  } else {
m_MDS_LOG_ERR("FCTRL: [me] <-- [node:%x, ref:%u], "
"RcvEvt[evt:%d], Error[PortId not found]", @@ -151,6 +160,9 @@
uint32_t process_flow_event(const Event& evt) {
portid->ReceiveNack(evt.mseq_, evt.mfrag_,
evt.fseq_);
  }
+if (evt.type_ == Event::Type::kEvtRcvReset) {
+  portid->ReceiveReset();
+}
}
return rc;
  }
@@ -489,6 +501,16 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer,
uint16_t len,
m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events,
Error[%s]",
strerror(errno));
  }
+  } else if (header.msg_type_ == Reset::kResetMsgType) {
+// no need to decode reset message
+// the decoding reset message type is done in header decoding
+// send to the event thread
+if (m_NCS_IPC_SEND(_events,
+new Event(Event::Type::kEvtRcvReset, id, 0, 0, 0, 0),
+NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
+  m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events,
Error[%s]",
+  strerror(errno));
+}
} else {

Re: [devel] [PATCH 1/1] ntfd: Do not send response to client if client down [#3084]

2019-10-09 Thread Minh Hon Chau


Hi all,

What I guess from the ticket is that it is a race condition between the mds 
thread and the main thread in ntfd. We normally get the NCSDOWN callback from 
mds, and send an event to the main thread to remove the client. But the mds 
callback here comes in the middle of processing Initialize().


We have something similar done in ntfd with 
SearchAndSetClientsDownFlag(), GetClientDownFlag(), SetClientDownFlag(), 
can we try to reuse them?


Thanks,

Minh

On 9/10/19 5:10 pm, Thien Minh Huynh wrote:

Hi Vu,

Thanks for your time to review the patch.

Best Regards,
ThienHuynh

-Original Message-
From: Nguyen Minh Vu 
Sent: Wednesday, October 9, 2019 11:15 AM
To: thien.m.huynh ; thuan.t...@dektech.com.au; 
minh.c...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] ntfd: Do not send response to client if client down 
[#3084]

Hi Thien,

I have some comments below.

I see this enhancement does not bring much value to NTF as it deals with a very 
rare case - process is terminated before saNtfInitialize() returns. In reality, 
if NTF server is getting overloaded by such process, there must be an error in 
that process.

@Minh: how about your opinion? is this ticket valid?

Anyway, here are my comments:
1) Only C source files, ntfs_mds.c & ntfs_evt.c, access the new added list 
`ntfa_down_list_head`, why put new added methods in the C++ file and add C wrapper 
functions for them?
It should be more clean if you move these functions into a new files
e.g: ntfs_client_down.{h,c}.

2) C++ method name should start with a capital letter (refer to C++ google 
coding rule)

3) Naming methods that represent adding a down client to list, and removing 
from the list should pair/opposite with each other e.g. Open vs Close, Add vs 
Remove, not mark vs remove

4) The list is accessed from two different threads, the mds thread and the main 
thread, and therefore must be protected by a mutex to prevent race conditions.

5) Should have a check to ensure *not* adding the down client into the list if 
that client has successfully initialized.

Regards, Vu

On 10/9/19 9:36 AM, thien.m.huynh wrote:

Ntfd will not send response to a client when client already down.
This will avoid timeout when ntfd send via mds.
---
   src/ntf/ntfd/NtfAdmin.cc | 93 

   src/ntf/ntfd/NtfAdmin.h  |  3 ++
   src/ntf/ntfd/ntfs_cb.h   |  6 
   src/ntf/ntfd/ntfs_com.h  |  3 ++
   src/ntf/ntfd/ntfs_evt.c  |  1 +
   src/ntf/ntfd/ntfs_mds.c  |  9 -
   6 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/src/ntf/ntfd/NtfAdmin.cc b/src/ntf/ntfd/NtfAdmin.cc index
8bbee69..641171b 100644
--- a/src/ntf/ntfd/NtfAdmin.cc
+++ b/src/ntf/ntfd/NtfAdmin.cc
@@ -560,6 +560,85 @@ void NtfAdmin::SearchAndSetClientsDownFlag(MDS_DEST 
mds_dest) {
   }
   
   /**

+ * @brief Add mds_dest tag into ntfa down list
+ * @param mds_dest
+ */
+void NtfAdmin::markAgentDown(MDS_DEST mds_dest) {
+  TRACE_ENTER();
+  NTFA_DOWN_LIST *ntfa_down_rec = NULL;
+  if ((ntfa_down_rec = reinterpret_cast(
+   malloc(sizeof(NTFA_DOWN_LIST == NULL) {
+LOG_ER("memory allocation for the NTFA_DOWN_LIST failed");
+return;
+  }
+  memset(ntfa_down_rec, 0, sizeof(NTFA_DOWN_LIST));
+  ntfa_down_rec->mds_dest = mds_dest;
+  ntfa_down_rec->next = NULL;
+
+  if (ntfs_cb->ntfa_down_list_head == NULL) {
+ntfs_cb->ntfa_down_list_head = ntfa_down_rec;
+  } else {
+NTFA_DOWN_LIST *p = ntfs_cb->ntfa_down_list_head;
+while (p->next != NULL) {
+  p = p->next;
+}
+p->next = ntfa_down_rec;
+  }
+  TRACE_1("Added MDS dest: %" PRIx64, ntfa_down_rec->mds_dest);
+  TRACE_LEAVE();
+}
+
+/**
+ * @brief Find and remove agent from ntfa down list
+ * @param mds_dest
+ */
+void NtfAdmin::removeAgentFromDownList(MDS_DEST mds_dest) {
+  NTFA_DOWN_LIST *ntfa_down_rec = ntfs_cb->ntfa_down_list_head;
+  NTFA_DOWN_LIST *prev = NULL;
+  TRACE_ENTER();
+  while (ntfa_down_rec != NULL) {
+if (mds_dest == ntfa_down_rec->mds_dest) {
+  if (ntfa_down_rec == ntfs_cb->ntfa_down_list_head) {
+if (ntfa_down_rec->next == NULL) {
+  ntfs_cb->ntfa_down_list_head = NULL;
+} else {
+  ntfs_cb->ntfa_down_list_head = ntfa_down_rec->next;
+}
+  } else if (prev) {
+prev->next = ntfa_down_rec->next;
+  }
+  TRACE("Deleted MDS dest: %" PRIx64, ntfa_down_rec->mds_dest);
+  free(ntfa_down_rec);
+  ntfa_down_rec = NULL;
+  break;
+}
+prev = ntfa_down_rec;
+ntfa_down_rec = ntfa_down_rec->next;
+  }
+  TRACE_LEAVE();
+}
+
+/**
+ * @brief  Check if agent exists in down list
+ * @param  mds_dest
+ * @return true/false
+ */
+bool NtfAdmin::isInNtfaDownList(MDS_DEST mds_dest) {
+  bool found = false;
+  NTFA_DOWN_LIST *ntfa_down_rec = ntfs_cb->ntfa_down_list_head;
+  TRACE_ENTER();
+  while (ntfa_down_rec != NULL) {
+if (mds_dest == ntfa_down_rec->mds_dest) {
+  found = true;
+  break;
+}
+ntfa_down_rec = ntfa_down_rec->next;
+  }
+  TRACE_LEAVE();
+

Re: [devel] [PATCH 1/1] mds: Enhance decoding for mds flow control message [#3097]

2019-10-06 Thread Minh Hon Chau


Hi Thuan,

Please see comments inline.

Thanks

Minh

On 7/10/19 3:18 pm, Tran Thuan wrote:

Hi Minh,

Some minor comments from me, check [Thuan] inline.
Thanks.

Best Regards,
ThuanTr

-Original Message-
From: Minh Chau 
Sent: Monday, October 7, 2019 7:12 AM
To: hans.nordeb...@ericsson.com; vu.m.ngu...@dektech.com.au;
gary@dektech.com.au; thuan.t...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Minh Chau

Subject: [PATCH 1/1] mds: Enhance decoding for mds flow control message
[#3097]

mds currently uses MDS_PROT_FCTRL_ID 4 bytes value (0x00AC13F5) from octet11
to octet14 to identify the flow control message e.g., chunkack message. In
case of fragmentation from big message, the second fragment onwards will
start from the octet11, which may have arbitrary value and cause mds to
incorrectly decode as a flow control message if the fragment starts with
value of 0x00AC13F5.

This patch fixes this rare case by decoding flow control message only if the
oct2-5 (mds global sequence number) and oct6-7 (mds fragment number) are
non-zero. Change MDS_PROT_FCTRL_ID:0xFDAC13F5
[Thuan]: typo "non-zero" -> "zero"?

[Minh]: Yes, typo, it's "zero"

[Thuan] Can you give info in commit message about why change
MDS_PROT_FCTRL_ID to FDAC13F5?
[Minh]: It is only a random number used as an identifier, but 0x00AC would 
occupy oct11&12, which is the mds header length, and so has a higher 
probability of being identical

---
  src/mds/mds_dt.h  |  2 +-
  src/mds/mds_tipc_fctrl_msg.cc | 20 +---
  2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h index d9e8633..64da600
100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -245,7 +245,7 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT
msg);
  
  /* MDS protocol/version for flow control */  #define MDS_PROT_FCTRL (0xB0 |

MDS_VERSION) -#define MDS_PROT_FCTRL_ID 0x00AC13F5
+#define MDS_PROT_FCTRL_ID 0xFDAC13F5
  
  /* Added for the subscription changes */  #define MDS_NCS_CHASSIS_ID

(m_NCS_GET_NODE_ID & 0x00ff) diff --git a/src/mds/mds_tipc_fctrl_msg.cc
b/src/mds/mds_tipc_fctrl_msg.cc index 064d977..8375673 100644
--- a/src/mds/mds_tipc_fctrl_msg.cc
+++ b/src/mds/mds_tipc_fctrl_msg.cc
@@ -64,13 +64,19 @@ void HeaderMessage::Decode(uint8_t *msg) {
  // decode flow control sequence number
  ptr = [HeaderMessage::FieldIndex::kFlowControlSequenceNumber];
  fseq_ = ncs_decode_16bit();
-// decode protocol identifier
-ptr = [ChunkAck::FieldIndex::kProtocolIdentifier];
-pro_id_ = ncs_decode_32bit();
-if (pro_id_ == MDS_PROT_FCTRL_ID) {
-  // decode message type
-  ptr = [ChunkAck::FieldIndex::kFlowControlMessageType];
-  msg_type_ = ncs_decode_8bit();
+// decode protocol identifier if the mfrag_ and mseq_ are 0
+// otherwise it is always DataMessage within non-zero mseq_ and mfrag_
+if (mfrag_ == 0 && mseq_ == 0) {
+  ptr = [ChunkAck::FieldIndex::kProtocolIdentifier];
+  pro_id_ = ncs_decode_32bit();
+  if (pro_id_ == MDS_PROT_FCTRL_ID) {
+// decode message type
+ptr = [ChunkAck::FieldIndex::kFlowControlMessageType];
+msg_type_ = ncs_decode_8bit();
+  }
+} else {
+  pro_id_ = 0;
+  msg_type_ = 0;
[Thuan] Don't need ELSE as values 0 already?
[Minh]: I think we should explicitly set again, the variable header 
might be reused to decode

  }
} else {
  if (mfrag_ != 0) {
--
2.7.4






___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 0/2] Review Request for mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095] V2

2019-10-06 Thread Minh Hon Chau


Hi,

I would like to push the patches today if no more comment for them.

Thanks

Minh

On 4/10/19 3:20 pm, Minh Chau wrote:

Summary: mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095] V2
Review request for Ticket(s): 3095
Peer Reviewer(s): Hans, Vu, Gary, Thuan
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-3095
Base revision: 05064a1cfd0aeaf824dce7602d535654b3457e30
Personal repository: git://git.code.sf.net/u/minh-chau/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesn
  OpenSAF servicesn
  Core libraries  y
  Samples n
  Tests   n
  Other   n


Comments (indicate scope for each "y" above):
-
*** EXPLAIN/COMMENT THE PATCH SERIES HERE ***

revision cbbeab8f2299620aa3eb9b0e29710a2b159b5a45
Author: Minh Chau 
Date:   Fri, 4 Oct 2019 12:59:27 +1000

mds: Improve error log for MDS_TIPC_FCTRL_ENABLED [#3095]

This commit as part of #3095 updates the error string with
pattern "FCTRL:*Error[*]", in order to help grep-ing the
error in mds debug log.



revision cc666586717fa82df70471748d8766e8fe901460
Author: Minh Chau 
Date:   Fri, 4 Oct 2019 12:59:16 +1000

mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095]

In the scenario of recovery from split-brain, where both
active director services may suffer mds message loss due
to lost-contact tipc link. If MDS_TIPC_FCTRL_ENABLED is
set, the out-of-order message will be dropped, and there
is no mechanism to trigger the retransmission from receiver
side at this moment (the retransmission is only triggered
from sender as result of TIPC_ERR_OVERLOAD).

On reception of a disordered message, the receiver can send a
not-acknowledgement to notify the sender for retransmission.
Therefore, the sender can trigger retransmission in the same
way as receiving TIPC_ERR_OVERLOAD.

This patch adds Nack message for retransmission of disordered
message detected from receiver side, and adds a missing call
to portid_map_mutex.unlock() in process_all_events().



Complete diffstat:
--
  src/mds/mds_c_api.c  |  2 +-
  src/mds/mds_dt_common.c  |  2 +-
  src/mds/mds_tipc_fctrl_intf.cc   | 72 +---
  src/mds/mds_tipc_fctrl_msg.cc| 35 ++-
  src/mds/mds_tipc_fctrl_msg.h | 22 
  src/mds/mds_tipc_fctrl_portid.cc | 42 ---
  src/mds/mds_tipc_fctrl_portid.h  |  3 +-
  7 files changed, 143 insertions(+), 35 deletions(-)


Testing Commands:
-
*** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES ***


Testing, Expected Results:
--
*** PASTE COMMAND OUTPUTS / TEST RESULTS ***


Conditions of Submission:
-
*** HOW MANY DAYS BEFORE PUSHING, CONSENSUS ETC ***


Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  n  n
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as

Re: [devel] [PATCH 1/1] dtm: close unused log streams [#2642]

2019-10-01 Thread Minh Hon Chau


Hi Vu,

No problem, I hope users will get the meaning of "clean-up job" :).

Thanks

Minh

On 1/10/19 8:46 pm, Nguyen Minh Vu wrote:

Hi Minh,

I put that note in the usage of max-idle option. See below:

+  "--max-idle=NUM    Set the maximum number of idle time 
to NUM\n"
+  "  minutes. If a stream has not been 
used for\n"
+  "  NUM minutes, the stream will be 
closed.\n"
+  "  The default value is zero (disable 
the\n"

+  "      clean-up job)\n",

Regards, Vu

On 10/1/19 5:38 PM, Minh Hon Chau wrote:

Hi Vu,

Ok, then the value '0' needs to be written somewhere (README?) for 
this special purpose I guess, to avoid a confusion later on.


Thanks

Minh

On 1/10/19 8:27 pm, Nguyen Minh Vu wrote:

Hi Minh,

Thanks for your comment.

When passing zero to max-idle, the server will disable 'close unused 
log streams' functionality.
It may be useful when user has previously set max-idle to a specific 
value, and want to disable it later.


If the range starts from 1, there is no chance to disable it.

Regards, Vu

On 10/1/19 5:17 PM, Minh Hon Chau wrote:

Hi Vu,

ack for minor comment.

The range of --max-idle, I think, should be starting from 1, as the 
log_server ignores the tv_sec=0. From user's perspective, if 
allowing max-idle=0, the meaning seems that the stream must be 
constantly writing traces, or the stream will be deleted.


Thanks

Minh

On 24/9/19 12:57 pm, Vu Minh Nguyen wrote:
Providing a new option '--max-idle' to configure the maximum idle 
time
of logtrace streams. If a stream has not been used for such time, 
logtrace

server will close the stream from its database.

This patch also corrects wrong indentation in osaflog.cc file.
---
  src/dtm/Makefile  |   2 +-
  src/dtm/common/osaflog_protocol.h |   2 +
  src/dtm/tools/Makefile    |  18 
  src/dtm/tools/osaflog.cc  | 132 
++

  src/dtm/transport/log_server.cc   |  57 -
  src/dtm/transport/log_server.h    |   7 +-
  src/dtm/transport/transportd.conf |   6 ++
  7 files changed, 168 insertions(+), 56 deletions(-)
  create mode 100644 src/dtm/tools/Makefile

diff --git a/src/dtm/Makefile b/src/dtm/Makefile
index 533b0f273..fb0221075 100644
--- a/src/dtm/Makefile
+++ b/src/dtm/Makefile
@@ -15,7 +15,7 @@
  #
    all:
-    $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd
+    $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd bin/osaflog
    check:
  $(MAKE) -C ../.. bin/transport_test
diff --git a/src/dtm/common/osaflog_protocol.h 
b/src/dtm/common/osaflog_protocol.h

index 61e9f6f39..d35e5f345 100644
--- a/src/dtm/common/osaflog_protocol.h
+++ b/src/dtm/common/osaflog_protocol.h
@@ -27,6 +27,8 @@ namespace Osaflog {
  static constexpr const char* kServerSocketPath =
  PKGLOCALSTATEDIR "/osaf_log.sock";
  +static constexpr const uint64_t kOneDayInMinute = 24*60;
+
  struct __attribute__((__packed__)) ClientAddressConstantPrefix {
    sa_family_t family = AF_UNIX;
    char abstract = '\0';
diff --git a/src/dtm/tools/Makefile b/src/dtm/tools/Makefile
new file mode 100644
index 0..8c48b70a5
--- /dev/null
+++ b/src/dtm/tools/Makefile
@@ -0,0 +1,18 @@
+#  -*- OpenSAF  -*-
+#
+# (C) Copyright 2019 The OpenSAF Foundation
+#
+# This program is distributed in the hope that it will be useful, 
but
+# WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. This file and program are 
licensed
+# under the GNU Lesser General Public License Version 2.1, 
February 1999.

+# The complete license can be accessed from the following location:
+# http://opensource.org/licenses/lgpl-license.php
+# See the Copying file included with the OpenSAF distribution for 
full

+# licensing terms.
+#
+# Author(s): Ericsson AB
+#
+
+all:
+    $(MAKE) -C ../../.. bin/osaflog
diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc
index 64be253e9..abbf0b164 100644
--- a/src/dtm/tools/osaflog.cc
+++ b/src/dtm/tools/osaflog.cc
@@ -47,6 +47,7 @@ namespace {
  void PrintUsage(const char* program_name);
  bool SendCommand(const std::string& command);
  bool MaxTraceFileSize(uint64_t max_file_size);
+bool SetMaxIdleTime(uint64_t max_idle);
  bool NoOfBackupFiles(uint64_t number_of_backups);
  bool Flush();
  base::UnixServerSocket* CreateSocket();
@@ -70,10 +71,12 @@ int main(int argc, char** argv) {
    {"print", no_argument, 
nullptr, 'p'},
    {"delete", no_argument, 
nullptr, 'd'},
    {"extract-trace", 
required_argument, 0, 'e'},
+  {"max-idle", required_argument, 
0, 'i'},

    {0, 0, 0, 0}};
      uint64_t max_file_size = 0;

Re: [devel] [PATCH 1/1] mds: Add Nack message for MDS_TIPC_FCTRL_ENABLED [#3095]

2019-10-01 Thread Minh Hon Chau


Hi Vu,

See comments below.

Thanks

Minh

On 1/10/19 8:34 pm, Nguyen Minh Vu wrote:

Hi Minh,

Ack with minor comments. Thanks.

Regards, Vu

On 10/1/19 12:49 PM, Minh Chau wrote:

In the scenario of recovery from split-brain, where both
active director services may suffer mds message loss due
to lost-contact tipc link. If MDS_TIPC_FCTRL_ENABLED is
set, the out-of-order message will be dropped, and there
is no mechanism to trigger the retransmission from receiver
side at this moment (the retransmission is only triggered
from sender as result of TIPC_ERR_OVERLOAD).

On reception of a disordered message, the receiver can send a
not-acknowledgement to notify the sender for retransmission.
Therefore, the sender can trigger retransmission in the same
way as receiving TIPC_ERR_OVERLOAD.

This patch adds Nack message for retransmission of disordered
message detected from receiver side.
---
  src/mds/mds_c_api.c  |  2 +-
  src/mds/mds_dt_common.c  |  2 +-
  src/mds/mds_tipc_fctrl_intf.cc   | 19 ++-
  src/mds/mds_tipc_fctrl_msg.cc    | 33 
+

  src/mds/mds_tipc_fctrl_msg.h | 22 ++
  src/mds/mds_tipc_fctrl_portid.cc | 18 +-
  src/mds/mds_tipc_fctrl_portid.h  |  1 +
  7 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c
index c41c8dd..132555b 100644
--- a/src/mds/mds_c_api.c
+++ b/src/mds/mds_c_api.c
@@ -4196,7 +4196,7 @@ void mds_mcm_msg_loss(MDS_SVC_HDL 
local_svc_hdl, MDS_DEST rem_adest,

    /* Check whether the msg loss is enabled or not */
  if (true != local_svc_info->i_msg_loss_indication) {
-    m_MDS_LOG_INFO(" MSG loss not enbaled mds_mcm_msg_loss\n");
+    m_MDS_LOG_NOTIFY("MSG loss is not enabled mds_mcm_msg_loss\n");
  return;
  }
  diff --git a/src/mds/mds_dt_common.c b/src/mds/mds_dt_common.c
index 66652af..de13883 100644
--- a/src/mds/mds_dt_common.c
+++ b/src/mds/mds_dt_common.c
@@ -972,7 +972,7 @@ uint32_t mds_tmr_mailbox_processing(void)
  .vdest_id);
  break;
  case MDS_REASSEMBLY_TMR:
-    m_MDS_LOG_DBG(
+    m_MDS_LOG_ERR(
  "MDTM: Tmr Mailbox Processing:Reassemble Tmr 
Hdl=0x%08x",

  mbx_evt_info->info.tmr_info_hdl);
  mdtm_process_reassem_timer_event(
diff --git a/src/mds/mds_tipc_fctrl_intf.cc 
b/src/mds/mds_tipc_fctrl_intf.cc

index 2366672..65f1849 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -38,6 +38,7 @@ using mds::Timer;
  using mds::DataMessage;
  using mds::ChunkAck;
  using mds::HeaderMessage;
+using mds::Nack;
    namespace {
  // flow control enabled/disabled
@@ -142,7 +143,8 @@ uint32_t process_flow_event(const Event& evt) {
  if (evt.type_ == Event::Type::kEvtSendChunkAck) {
    portid->SendChunkAck(evt.fseq_, evt.svc_id_, evt.chunk_size_);
  }
-    if (evt.type_ == Event::Type::kEvtDropData) {
+    if (evt.type_ == Event::Type::kEvtDropData ||
+    evt.type_ == Event::Type::kEvtRcvNack) {
    portid->ReceiveNack(evt.mseq_, evt.mfrag_,
    evt.fseq_);
  }
@@ -464,6 +466,21 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t 
*buffer, uint16_t len,

  // skip this data msg
  return NCSCC_RC_FAILURE;
    }
+  if (header.msg_type_ == Nack::kNackMsgType) {
+    // receive nack message
+    Nack nack;
+    nack.Decode(buffer);
+    // send to the event thread
+    if (m_NCS_IPC_SEND(_events,
+    new Event(Event::Type::kEvtRcvNack, id, nack.svc_id_,
+    header.mseq_, header.mfrag_, nack.nacked_fseq_),
+    NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
+  m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events\n");
+    }
+    // return NCSCC_RC_FAILURE, so the tipc receiving thread 
(legacy) will

+    // skip this data msg
+    return NCSCC_RC_FAILURE;
+  }
  } else {
    // receive data message
    DataMessage data;
diff --git a/src/mds/mds_tipc_fctrl_msg.cc 
b/src/mds/mds_tipc_fctrl_msg.cc

index 064d977..f85568c 100644
--- a/src/mds/mds_tipc_fctrl_msg.cc
+++ b/src/mds/mds_tipc_fctrl_msg.cc
@@ -139,4 +139,37 @@ void ChunkAck::Decode(uint8_t *msg) {
    chunk_size_ = ncs_decode_16bit();
  }
  +
+Nack::Nack(uint16_t svc_id, uint16_t fseq):
+    svc_id_(svc_id), nacked_fseq_(fseq) {
+  msg_type_ = kNackMsgType;
+}
+
+void Nack::Encode(uint8_t *msg) {
+  uint8_t *ptr;
+  // encode protocol identifier
+  ptr = [Nack::FieldIndex::kProtocolIdentifier];
+  ncs_encode_32bit(, MDS_PROT_FCTRL_ID);
+  // encode message type
+  ptr = [Nack::FieldIndex::kFlowControlMessageType];
+  ncs_encode_8bit(, kNackMsgType);
+  // encode service id
+  ptr = [Nack::FieldIndex::kServiceId];
+  ncs_encode_16bit(, svc_id_);
+  // encode flow control sequence number
+  ptr = [Nack::FieldIndex::kFlowControlSequenceNumber];
+

Re: [devel] [PATCH 1/1] dtm: close unused log streams [#2642]

2019-10-01 Thread Minh Hon Chau


Hi Vu,

Ok, then the value '0' needs to be written somewhere (README?) for this 
special purpose I guess, to avoid a confusion later on.


Thanks

Minh

On 1/10/19 8:27 pm, Nguyen Minh Vu wrote:

Hi Minh,

Thanks for your comment.

When passing zero to max-idle, the server will disable 'close unused 
log streams' functionality.
It may be useful when a user has previously set max-idle to a specific 
value and wants to disable it later.


If the range starts from 1, there is no chance to disable it.

Regards, Vu

On 10/1/19 5:17 PM, Minh Hon Chau wrote:

Hi Vu,

ack for minor comment.

The range of --max-idle, I think, should be starting from 1, as the 
log_server ignores the tv_sec=0. From user's perspective, if allowing 
max-idle=0, the meaning seems that the stream must be constantly 
writing traces, or the stream will be deleted.


Thanks

Minh

On 24/9/19 12:57 pm, Vu Minh Nguyen wrote:

Providing a new option '--max-idle' to configure the maximum idle time
of logtrace streams. If a stream has not been used for such time, 
logtrace

server will close the stream from its database.

This patch also corrects wrong indentation in osaflog.cc file.
---
  src/dtm/Makefile  |   2 +-
  src/dtm/common/osaflog_protocol.h |   2 +
  src/dtm/tools/Makefile    |  18 
  src/dtm/tools/osaflog.cc  | 132 
++

  src/dtm/transport/log_server.cc   |  57 -
  src/dtm/transport/log_server.h    |   7 +-
  src/dtm/transport/transportd.conf |   6 ++
  7 files changed, 168 insertions(+), 56 deletions(-)
  create mode 100644 src/dtm/tools/Makefile

diff --git a/src/dtm/Makefile b/src/dtm/Makefile
index 533b0f273..fb0221075 100644
--- a/src/dtm/Makefile
+++ b/src/dtm/Makefile
@@ -15,7 +15,7 @@
  #
    all:
-    $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd
+    $(MAKE) -C ../.. bin/osafdtmd bin/osaftransportd bin/osaflog
    check:
  $(MAKE) -C ../.. bin/transport_test
diff --git a/src/dtm/common/osaflog_protocol.h 
b/src/dtm/common/osaflog_protocol.h

index 61e9f6f39..d35e5f345 100644
--- a/src/dtm/common/osaflog_protocol.h
+++ b/src/dtm/common/osaflog_protocol.h
@@ -27,6 +27,8 @@ namespace Osaflog {
  static constexpr const char* kServerSocketPath =
  PKGLOCALSTATEDIR "/osaf_log.sock";
  +static constexpr const uint64_t kOneDayInMinute = 24*60;
+
  struct __attribute__((__packed__)) ClientAddressConstantPrefix {
    sa_family_t family = AF_UNIX;
    char abstract = '\0';
diff --git a/src/dtm/tools/Makefile b/src/dtm/tools/Makefile
new file mode 100644
index 0..8c48b70a5
--- /dev/null
+++ b/src/dtm/tools/Makefile
@@ -0,0 +1,18 @@
+#  -*- OpenSAF  -*-
+#
+# (C) Copyright 2019 The OpenSAF Foundation
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. This file and program are 
licensed
+# under the GNU Lesser General Public License Version 2.1, February 
1999.

+# The complete license can be accessed from the following location:
+# http://opensource.org/licenses/lgpl-license.php
+# See the Copying file included with the OpenSAF distribution for full
+# licensing terms.
+#
+# Author(s): Ericsson AB
+#
+
+all:
+    $(MAKE) -C ../../.. bin/osaflog
diff --git a/src/dtm/tools/osaflog.cc b/src/dtm/tools/osaflog.cc
index 64be253e9..abbf0b164 100644
--- a/src/dtm/tools/osaflog.cc
+++ b/src/dtm/tools/osaflog.cc
@@ -47,6 +47,7 @@ namespace {
  void PrintUsage(const char* program_name);
  bool SendCommand(const std::string& command);
  bool MaxTraceFileSize(uint64_t max_file_size);
+bool SetMaxIdleTime(uint64_t max_idle);
  bool NoOfBackupFiles(uint64_t number_of_backups);
  bool Flush();
  base::UnixServerSocket* CreateSocket();
@@ -70,10 +71,12 @@ int main(int argc, char** argv) {
    {"print", no_argument, nullptr, 
'p'},
    {"delete", no_argument, nullptr, 
'd'},
    {"extract-trace", 
required_argument, 0, 'e'},
+  {"max-idle", required_argument, 
0, 'i'},

    {0, 0, 0, 0}};
      uint64_t max_file_size = 0;
    uint64_t max_backups = 0;
+  uint64_t max_idle = 0;
    int option = 0;
      int long_index = 0;
@@ -82,71 +85,81 @@ int main(int argc, char** argv) {
    bool delete_result =  true;
    bool max_file_size_result = true;
    bool number_of_backups_result = true;
+  bool max_idle_result = true;
    bool flush_set = false;
    bool pretty_print_set = false;
    bool delete_set = false;
    bool max_file_size_set = false;
    bool max_backups_set = false;
+  bool max_idle_set = false;
    bool thread_trace = false;
    std::string input_core = "";
    std::string output_trace = "";
      if (argc == 1) {
- PrintUsage(argv[0]);
- exit(EXIT_FAILURE);
+

Re: [devel] [PATCH 1/1] mds: optimize mdstest suite 27 [#3087]

2019-09-24 Thread Minh Hon Chau


Hi Thuan,

ack from me.

Thanks

Minh

On 25/9/19 2:05 pm, thuan.tran wrote:

- Just allocate a small buffer instead of huge buffer
---
  src/mds/apitest/mdstipc_api.c | 119 +++---
  1 file changed, 53 insertions(+), 66 deletions(-)

diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c
index 805728464..f667d7385 100644
--- a/src/mds/apitest/mdstipc_api.c
+++ b/src/mds/apitest/mdstipc_api.c
@@ -13105,10 +13105,14 @@ void tet_create_default_PWE_VDEST_tp()
test_validate(FAIL, 0);
  }
  
-void tet_sender(char *send_buff, uint32_t buff_len, int msg_count)

+void tet_sender(uint32_t msg_count, uint32_t msg_size)
  {
int live = 100; // sender live max 100s
TET_MDS_MSG *mesg;
+   if (msg_size > TET_MSG_SIZE_MIN) {
+   printf("\nSender: msg_size > TET_MSG_SIZE_MIN\n");
+   exit(1);
+   }
mesg = (TET_MDS_MSG *)malloc(sizeof(TET_MDS_MSG));
memset(mesg, 0, sizeof(TET_MDS_MSG));
  
@@ -13134,7 +13138,7 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count)

exit(1);
}
  
-	while(!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) {

+   while (!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) {
printf("\nSender is waiting for receiver UP\n");
sleep(1);
}
@@ -13147,11 +13151,11 @@ void tet_sender(char *send_buff, uint32_t buff_len, 
int msg_count)
// otherwise, receiver won't detect loss message
sleep(1);
  
-	uint32_t offset = 0;

-   uint32_t msg_len = buff_len / msg_count;
-   for (int i = 1; i <= msg_count; i++) {
-   memcpy(mesg->send_data, _buff[offset], msg_len);
-   mesg->send_len = msg_len;
+   for (uint32_t i = 1; i <= msg_count; i++) {
+   /* to verify received correct order */
+   memset(mesg->send_data, 'X', msg_size);
+   sprintf(mesg->send_data, "%u", i);
+   mesg->send_len = msg_size;
if (mds_just_send(gl_tet_adest.mds_pwe1_hdl,
  NCSMDS_SVC_ID_INTERNAL_MIN,
  NCSMDS_SVC_ID_EXTERNAL_MIN,
@@ -13163,23 +13167,25 @@ void tet_sender(char *send_buff, uint32_t buff_len, 
int msg_count)
} else {
printf("\nSender SENT message %d successfully\n", i);
}
-   offset += msg_len;
}
free(mesg);
-   while(live-- > 0) {
+   while (live-- > 0) {
// Keep sender alive for retransmission
sleep(1);
}
  }
  
-bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count)

+bool tet_receiver(uint32_t msg_count, uint32_t msg_size)
  {
-   int ret = 1;
+   if (msg_size > TET_MSG_SIZE_MIN) {
+   printf("\nReceiver: msg_size > TET_MSG_SIZE_MIN\n");
+   return 1;
+   }
printf("\nStarted Receiver (pid:%d) svc_id=%d\n",
(int)getpid(), NCSMDS_SVC_ID_EXTERNAL_MIN);
if (adest_get_handle() != NCSCC_RC_SUCCESS) {
printf("\nReceiver FAIL to get adest handle\n");
-   return ret;
+   return 1;
}
  
  	sleep(1); //Let sender subscribe before receiver install

@@ -13197,14 +13203,13 @@ bool tet_receiver(char *expected_buff, uint32_t 
buff_len, int msg_count)
exit(1);
}
  
-	char *received_buff = malloc(buff_len);

-   memset(received_buff, 0, buff_len);
-   uint32_t offset = 0;
struct pollfd sel;
-   int counter = 0;
+   uint32_t counter = 0;
+   char *expected_buff = malloc(msg_size);
+   memset(expected_buff, 'X', msg_size);
sel.fd = m_GET_FD_FROM_SEL_OBJ(gl_tet_adest.svc[0].sel_obj);
sel.events = POLLIN;
-   while(counter < msg_count) {
+   while (counter < msg_count) {
int ret = osaf_poll(, 1, 1);
if (ret > 0) {
gl_rcvdmsginfo.msg = NULL;
@@ -13214,11 +13219,18 @@ bool tet_receiver(char *expected_buff, uint32_t 
buff_len, int msg_count)
printf("\nReceiver FAIL to retrieve message\n");
break;
}
-   TET_MDS_MSG *msg = (TET_MDS_MSG*)gl_rcvdmsginfo.msg;
+   TET_MDS_MSG *msg = (TET_MDS_MSG *)gl_rcvdmsginfo.msg;
if (msg != NULL) {
-   memcpy(_buff[offset],msg->recvd_data, 
msg->recvd_len);
-   offset += msg->recvd_len;
counter++;
+   sprintf(expected_buff, "%u", counter);
+   if (memcmp(msg->recvd_data,
+   expected_buff, msg_size) != 0) {
+   printf("\nReceived incorrect 
message\n");
+

Re: [devel] [PATCH 1/1] mds: optimize mdstest suite 27 [#3087]

2019-09-24 Thread Minh Hon Chau


Hi Thuan,

Some comments:

- a few warnings for >80 chars line

- Need to free(msg) that is returned from each MDS callback

- Another minor comment below

Thanks

Minh

On 24/9/19 1:10 pm, thuan.tran wrote:

- Just allocate a small buffer instead of huge buffer
---
  src/mds/apitest/mdstipc_api.c | 116 +++---
  1 file changed, 52 insertions(+), 64 deletions(-)

diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c
index 805728464..33e7d6c12 100644
--- a/src/mds/apitest/mdstipc_api.c
+++ b/src/mds/apitest/mdstipc_api.c
@@ -13105,10 +13105,14 @@ void tet_create_default_PWE_VDEST_tp()
test_validate(FAIL, 0);
  }
  
-void tet_sender(char *send_buff, uint32_t buff_len, int msg_count)

+void tet_sender(uint32_t msg_count, uint32_t msg_size)
  {
int live = 100; // sender live max 100s
TET_MDS_MSG *mesg;
+   if (msg_size > TET_MSG_SIZE_MIN) {
+   printf("\nSender: msg_size cannot bigger than 
TET_MSG_SIZE_MIN\n");
+   exit(1);
+   }
mesg = (TET_MDS_MSG *)malloc(sizeof(TET_MDS_MSG));
memset(mesg, 0, sizeof(TET_MDS_MSG));
  
@@ -13134,7 +13138,7 @@ void tet_sender(char *send_buff, uint32_t buff_len, int msg_count)

exit(1);
}
  
-	while(!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) {

+   while (!gl_tet_adest.svc[0].svcevt[0].dest && live-- > 0) {
printf("\nSender is waiting for receiver UP\n");
sleep(1);
}
@@ -13147,11 +13151,11 @@ void tet_sender(char *send_buff, uint32_t buff_len, 
int msg_count)
// otherwise, receiver won't detect loss message
sleep(1);
  
-	uint32_t offset = 0;

-   uint32_t msg_len = buff_len / msg_count;
-   for (int i = 1; i <= msg_count; i++) {
-   memcpy(mesg->send_data, _buff[offset], msg_len);
-   mesg->send_len = msg_len;
+   for (uint32_t i = 1; i <= msg_count; i++) {
+   /* to verify received correct order */
+   memset(mesg->send_data, 'X', msg_size);
+   sprintf(mesg->send_data, "%u", i);
+   mesg->send_len = msg_size;
if (mds_just_send(gl_tet_adest.mds_pwe1_hdl,
  NCSMDS_SVC_ID_INTERNAL_MIN,
  NCSMDS_SVC_ID_EXTERNAL_MIN,
@@ -13163,23 +13167,25 @@ void tet_sender(char *send_buff, uint32_t buff_len, 
int msg_count)
} else {
printf("\nSender SENT message %d successfully\n", i);
}
-   offset += msg_len;
}
free(mesg);
-   while(live-- > 0) {
+   while (live-- > 0) {
// Keep sender alive for retransmission
sleep(1);
}
  }
  
-bool tet_receiver(char *expected_buff, uint32_t buff_len, int msg_count)

+bool tet_receiver(uint32_t msg_count, uint32_t msg_size)
  {
-   int ret = 1;
+   if (msg_size > TET_MSG_SIZE_MIN) {
+   printf("\nReceiver: msg_size cannot bigger than 
TET_MSG_SIZE_MIN\n");
+   return 1;
+   }
printf("\nStarted Receiver (pid:%d) svc_id=%d\n",
(int)getpid(), NCSMDS_SVC_ID_EXTERNAL_MIN);
if (adest_get_handle() != NCSCC_RC_SUCCESS) {
printf("\nReceiver FAIL to get adest handle\n");
-   return ret;
+   return 1;
}
  
  	sleep(1); //Let sender subscribe before receiver install

@@ -13197,14 +13203,12 @@ bool tet_receiver(char *expected_buff, uint32_t 
buff_len, int msg_count)
exit(1);
}
  
-	char *received_buff = malloc(buff_len);

-   memset(received_buff, 0, buff_len);
-   uint32_t offset = 0;
+   char *expected_buff = malloc(msg_size);
struct pollfd sel;
-   int counter = 0;
+   uint32_t counter = 0;
sel.fd = m_GET_FD_FROM_SEL_OBJ(gl_tet_adest.svc[0].sel_obj);
sel.events = POLLIN;
-   while(counter < msg_count) {
+   while (counter < msg_count) {
int ret = osaf_poll(, 1, 1);
if (ret > 0) {
gl_rcvdmsginfo.msg = NULL;
@@ -13214,11 +13218,23 @@ bool tet_receiver(char *expected_buff, uint32_t 
buff_len, int msg_count)
printf("\nReceiver FAIL to retrieve message\n");
break;
}
-   TET_MDS_MSG *msg = (TET_MDS_MSG*)gl_rcvdmsginfo.msg;
+   TET_MDS_MSG *msg = (TET_MDS_MSG *)gl_rcvdmsginfo.msg;
if (msg != NULL) {
-   memcpy(_buff[offset],msg->recvd_data, 
msg->recvd_len);
-   offset += msg->recvd_len;
counter++;
+   memset(expected_buff, 'X', msg_size);
[M] I think you can move the above memset(expected_buff,...) before the 
while (counter,...) loop, since it constantly

Re: [devel] [PATCH 0/9] Review Request for mds: Add solution for TIPC buffer overflow [#1960]

2019-09-22 Thread Minh Hon Chau


Hi all,

Below is the patch #10 that updates most of comments, it applies on top 
of current patch #9.


This patch #10 does not use shared_ptr and base::Mutex as suggested in the 
comments from Gary and Vu; the reason is that doing so would cause a problem 
similar to the one reported in #2860 (user calls exit() without properly doing 
mds shutdown), unless those variables are allocated on the heap.


I would like to push the #1960 patches today if we don't have any more 
comments. There are some other incremental improvements/fixes that will 
be addressed in other tickets.


Thanks

Minh

---
 src/mds/README   |  2 +-
 src/mds/mds_dt_tipc.c    | 28 -
 src/mds/mds_tipc_fctrl_intf.cc   | 67 
++--

 src/mds/mds_tipc_fctrl_intf.h    |  2 +-
 src/mds/mds_tipc_fctrl_msg.cc    | 44 +-
 src/mds/mds_tipc_fctrl_msg.h | 22 +++--
 src/mds/mds_tipc_fctrl_portid.cc | 46 ---
 7 files changed, 137 insertions(+), 74 deletions(-)

diff --git a/src/mds/README b/src/mds/README
index 1b94632..0819bdc 100644
--- a/src/mds/README
+++ b/src/mds/README
@@ -182,7 +182,7 @@ TIPC portid state machine and its transition
 
 kDisabled, // no flow control support at this state
 kStartup,  // a newly published portid starts at this state
-kTxProb,   // txprob timer is running to confirm if the flow control is 
supported
+kTxProb,   // tx probation timer is running to confirm if the flow 
control is supported

 kEnabled   // flow control support is confirmed, data flow is controlled
 kRcvBuffOverflow // anticipating (or experienced) the receiver's 
buffer overflow


diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 1b6c3f8..e7a7b48 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -247,6 +247,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

 if (!get_tipc_port_id(tipc_cb.BSRsock, _id)) {
     close(tipc_cb.Dsock);
     close(tipc_cb.BSRsock);
+        *mds_tipc_ref = 0;
     return NCSCC_RC_FAILURE;
 }
 *mds_tipc_ref = port_id.ref;
@@ -330,7 +331,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

 }

 /* Get tipc socket receive buffer size */
-    int optval;
+    int optval = 0;
 socklen_t optlen = sizeof(optval);
 if (getsockopt(tipc_cb.BSRsock, SOL_SOCKET, SO_RCVBUF,
     , ) != 0) {
@@ -350,12 +351,25 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

         int acksize = -1;
         if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) {
             ackto = atoi(ptr);
+                if (ackto == 0) {
+                    syslog(LOG_ERR, "MDTM:TIPC Invalid "
+                            "MDS_TIPC_FCTRL_ACKTIMEOUT, using default 
value");

+                    ackto = -1;
+                }
         }
         if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) {
             acksize = atoi(ptr);
+                if (acksize == 0) {
+                    syslog(LOG_ERR, "MDTM:TIPC Invalid "
+                            "MDS_TIPC_FCTRL_ACKSIZE, using default value");
+                    acksize = -1;
+                }
         }
-            mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, 
(uint64_t)optval,

+            mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, optval,
             ackto, acksize, tipc_mcast_enabled);
+        } else {
+            syslog(LOG_ERR, "MDTM:TIPC Invalid value of"
+                "MDS_TIPC_FCTRL_ENABLED");
     }
 }

@@ -366,6 +380,7 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

     close(tipc_cb.Dsock);
     close(tipc_cb.BSRsock);
     m_NCS_IPC_RELEASE(_cb.tmr_mbx, NULL);
+        mds_tipc_fctrl_shutdown();
     return NCSCC_RC_FAILURE;
 }

@@ -2528,7 +2543,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
  */
 uint32_t status = 0;
 uint32_t sum_mds_hdr_plus_mdtm_hdr_plus_len;
-  uint16_t fctrl_seq_num = 0;
+    uint16_t fctrl_seq_num = 0;
 int version = req->msg_arch_word & 0x7;
 if (version > 1) {
     sum_mds_hdr_plus_mdtm_hdr_plus_len =
@@ -2618,7 +2633,7 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
             return NCSCC_RC_FAILURE;
         }
       /* if sndqueue is capable, then obtain the current sending 
seq */

-          if (mds_tipc_fctrl_sndqueue_capable(tipc_id, len, _seq_num)
+          if (mds_tipc_fctrl_sndqueue_capable(tipc_id, _seq_num)
       == NCSCC_RC_FAILURE){
         m_MDS_LOG_ERR("FCTRL: Failed to send message len :%d", len);
         return NCSCC_RC_FAILURE;
@@ -2717,10 +2732,10 @@ uint32_t mds_mdtm_send_tipc(MDTM_SEND_REQ *req)
             }
             /* if sndqueue is capable, then obtain the current 
sending seq */

             if (mds_tipc_fctrl_sndqueue_capable(tipc_id,
-                    len +

Re: [devel] [PATCH 1/1] amf: handle errors identified by codechecker [#3077]

2019-09-16 Thread Minh Hon Chau


Hi Gary,

ack from me (code review only)

Thanks

Minh

On 3/9/19 12:12 pm, Gary Lee wrote:

add assertions where pointers should not be null
fix a couple of typos
---
  src/amf/amfd/comp.cc   |  1 +
  src/amf/amfd/csi.cc|  3 ++-
  src/amf/amfd/cstype.cc |  2 ++
  src/amf/amfd/hlt.cc|  1 +
  src/amf/amfd/nodeswbundle.cc   |  2 +-
  src/amf/amfd/ntf.cc|  1 +
  src/amf/amfd/sg_npm_fsm.cc | 34 +++---
  src/amf/amfd/sg_nway_fsm.cc|  2 +-
  src/amf/amfd/sgproc.cc |  1 +
  src/amf/amfd/su.cc |  1 +
  src/amf/amfd/sutype.cc |  3 ++-
  src/amf/amfd/svctype.cc|  1 +
  src/amf/amfd/svctypecstypes.cc |  1 +
  src/amf/amfnd/cbq.cc   |  2 ++
  src/amf/amfnd/clc.cc   |  1 +
  src/amf/amfnd/comp.cc  |  4 
  src/amf/amfnd/compdb.cc|  2 +-
  src/amf/amfnd/susm.cc  | 11 +++
  18 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/src/amf/amfd/comp.cc b/src/amf/amfd/comp.cc
index 0ff365e..5c6a283 100644
--- a/src/amf/amfd/comp.cc
+++ b/src/amf/amfd/comp.cc
@@ -2117,6 +2117,7 @@ static void comp_ccb_apply_modify_hdlr(struct 
CcbUtilOperationData *opdata) {
attribute->attrValuesNumber);
  
  if (!strcmp(attribute->attrName, "saAmfCompType")) {

+  osafassert(value != nullptr);
SaNameT *dn = (SaNameT *)value;
const std::string oldType(comp->saAmfCompType);
if (oldType.compare(Amf::to_string(dn)) == 0) {
diff --git a/src/amf/amfd/csi.cc b/src/amf/amfd/csi.cc
index f7e3730..1856610 100644
--- a/src/amf/amfd/csi.cc
+++ b/src/amf/amfd/csi.cc
@@ -913,7 +913,8 @@ static void ccb_apply_delete_hdlr(CcbUtilOperationData_t 
*opdata) {
  goto done;
}
  
-  TRACE("'%s'", csi ? csi->name.c_str() : nullptr);

+  osafassert(csi != nullptr);
+  TRACE("'%s'", csi->name.c_str());
  
/* Check whether si has been assigned to any SU. */

if ((nullptr != csi->si->list_of_sisu) && (csi->compcsi_cnt != 0)) {
diff --git a/src/amf/amfd/cstype.cc b/src/amf/amfd/cstype.cc
index cadc6df..683d3cd 100644
--- a/src/amf/amfd/cstype.cc
+++ b/src/amf/amfd/cstype.cc
@@ -62,6 +62,7 @@ static AVD_CS_TYPE *cstype_create(const std::string ,
   * @param cst
   */
  static void cstype_delete(AVD_CS_TYPE *cst) {
+  osafassert(cst != nullptr);
cstype_db->erase(cst->name);
cst->saAmfCSAttrName.clear();
delete cst;
@@ -205,6 +206,7 @@ static SaAisErrorT 
cstype_ccb_completed_hdlr(CcbUtilOperationData_t *opdata) {
  opdata->userData = nullptr;
  break;
}
+  osafassert(cst != nullptr);
if (cst->list_of_csi != nullptr) {
  /* check whether there exists a delete operation for
   * each of the CSI in the cs_type list in the current CCB
diff --git a/src/amf/amfd/hlt.cc b/src/amf/amfd/hlt.cc
index 27863db..4c2737e 100644
--- a/src/amf/amfd/hlt.cc
+++ b/src/amf/amfd/hlt.cc
@@ -75,6 +75,7 @@ static SaAisErrorT 
ccb_completed_delete_hdlr(CcbUtilOperationData_t *opdata) {
  opdata->userData = nullptr;
  goto done;
}
+  osafassert(comp != nullptr);
for (curr_susi = comp->su->list_of_susi; curr_susi != nullptr;
 curr_susi = curr_susi->su_next)
  for (compcsi = curr_susi->list_of_csicomp; compcsi;
diff --git a/src/amf/amfd/nodeswbundle.cc b/src/amf/amfd/nodeswbundle.cc
index 4ab79f7..cf280cb 100644
--- a/src/amf/amfd/nodeswbundle.cc
+++ b/src/amf/amfd/nodeswbundle.cc
@@ -125,7 +125,7 @@ static int is_swbdl_delete_ok(const std::string _dn,
if (node == nullptr && avd_cb->is_active() == false) {
  return 1;
}
-
+  osafassert(node != nullptr);
if (!is_swbdl_delete_ok_for_node(bundle_dn, node_dn, node->list_of_ncs_su,
 opdata))
  return 0;
diff --git a/src/amf/amfd/ntf.cc b/src/amf/amfd/ntf.cc
index eb2654a..52ee745 100644
--- a/src/amf/amfd/ntf.cc
+++ b/src/amf/amfd/ntf.cc
@@ -505,6 +505,7 @@ SaAisErrorT avd_try_send_notification(NtfSend* job) {
  >notification.alarmNotification.notificationHandle;
}
  
+  osafassert(notificationHandle != nullptr);

// Try to send the notification if not sent.
if (job->already_sent == false) {
  rc = saNtfNotificationSend(*notificationHandle);
diff --git a/src/amf/amfd/sg_npm_fsm.cc b/src/amf/amfd/sg_npm_fsm.cc
index 0ef094d..0e91eb5 100644
--- a/src/amf/amfd/sg_npm_fsm.cc
+++ b/src/amf/amfd/sg_npm_fsm.cc
@@ -2773,23 +2773,26 @@ static uint32_t avd_sg_npm_susi_sucss_si_oper(AVD_CL_CB 
*cb, AVD_SU *su,
 * modify standby all to the Quiesced SU. Remove the SI from
 * admin pointer and add the quiesced SU to the SU oper list.
 */
-  if (su->sg_of_su->admin_si->list_of_sisu == i_susi) {
-o_susi = i_susi->si_next;
-  } else {
-o_susi = su->sg_of_su->admin_si->list_of_sisu;
-  }
+  i_susi = avd_su_susi_find(cb, su, su->sg_of_su->admin_si->name);
+  if

Re: [devel] [PATCH 6/9] mds: Implement kRcvBuffOverflow state [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

Agree with your comments. Any comments for patches 8/9 and 9/9?

thanks

Minh

On 16/9/19 5:22 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have a few comments below.

Regards, Vu

On 8/14/19 1:38 PM, Minh Chau wrote:

This patch implements the kRcvBuffOverflow state machine as
described in README file.
---
  src/mds/mds_tipc_fctrl_intf.cc   |   6 +-
  src/mds/mds_tipc_fctrl_msg.h |   1 +
  src/mds/mds_tipc_fctrl_portid.cc | 137 
++-

  src/mds/mds_tipc_fctrl_portid.h  |   5 +-
  4 files changed, 131 insertions(+), 18 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_intf.cc 
b/src/mds/mds_tipc_fctrl_intf.cc

index c2d0922..397114e 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -285,14 +285,16 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t 
*buffer, uint16_t len,

  rc = NCSCC_RC_FAILURE;
    } else {
  if (portid->state_ != TipcPortId::State::kDisabled) {
-  portid->Queue(buffer, len);
+  bool sendable = portid->ReceiveCapable(len);
+  portid->Queue(buffer, len, sendable);
    // start txprob timer for the first msg sent out
    // do not start for other states
-  if (portid->state_ == TipcPortId::State::kStartup) {
+  if (sendable && portid->state_ == TipcPortId::State::kStartup) {
  txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
  m_MDS_LOG_DBG("FCTRL: Start txprob");
  portid->state_ = TipcPortId::State::kTxProb;
    }
+  if (sendable == false) rc = NCSCC_RC_FAILURE;
  }
    }
  diff --git a/src/mds/mds_tipc_fctrl_msg.h 
b/src/mds/mds_tipc_fctrl_msg.h

index 69f8048..e6b9662 100644
--- a/src/mds/mds_tipc_fctrl_msg.h
+++ b/src/mds/mds_tipc_fctrl_msg.h
@@ -110,6 +110,7 @@ class DataMessage: public BaseMessage {
    uint8_t* msg_data_{nullptr};
    uint8_t snd_type_{0};
  +  bool is_sent_{true};
    DataMessage() {}
    virtual ~DataMessage();
    void Decode(uint8_t *msg) override;
diff --git a/src/mds/mds_tipc_fctrl_portid.cc 
b/src/mds/mds_tipc_fctrl_portid.cc

index 84ecee9..e762290 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -82,6 +82,23 @@ uint64_t MessageQueue::Erase(uint16_t fseq_from, 
uint16_t fseq_to) {

    return msg_len;
  }
  +DataMessage* MessageQueue::FirstUnsent() {
+  for (auto it = queue_.begin(); it != queue_.end(); ++it) {

[Vu] Use the shorter version `for (const auto& it : queue_)`

+    DataMessage *m = *it;
+    if (m->is_sent_ == false) {
+  return m;
+    }
+  }
+  return nullptr;
+}
+
+void MessageQueue::MarkUnsentFrom(uint16_t fseq) {
+  for (auto it = queue_.begin(); it != queue_.end(); ++it) {

[Vu] as above comment

+    DataMessage *m = *it;
+    if (m->header_.fseq_ >= fseq) m->is_sent_ = false;
+  }
+}
+
  void MessageQueue::Clear() {
    while (queue_.empty() == false) {
  DataMessage* msg = queue_.front();
@@ -99,7 +116,8 @@ TipcPortId::TipcPortId(struct tipc_portid id, int 
sock, uint16_t chksize,

  TipcPortId::~TipcPortId() {
    // Fake a TmrChunkAck event to ack all received messages
    ReceiveTmrChunkAck();
-  // clear all msg in sndqueue_
+  // flush all unsent msg in sndqueue_
+  FlushData();
    sndqueue_.Clear();
[Vu] If sndqueue_.Clear() must be called every time `FlushData()` is 
called, should `Clear()` be moved into FlushData()?

  }
  @@ -109,6 +127,24 @@ uint64_t TipcPortId::GetUniqueId(struct 
tipc_portid id) {

    return uid;
  }
  +void TipcPortId::FlushData() {
+  DataMessage* msg = nullptr;
+  do {
+    // find the lowest sequence unsent yet
+    msg = sndqueue_.FirstUnsent();
+    if (msg != nullptr) {
+  Send(msg->msg_data_, msg->header_.msg_len_);
+  msg->is_sent_ = true;
+  m_MDS_LOG_DBG("FCTRL: [me] --> [node:%x, ref:%u], "
+  "FlushData[mseq:%u, mfrag:%u, fseq:%u], "
+  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
+  id_.node, id_.ref,
+  msg->header_.mseq_, msg->header_.mfrag_, msg->header_.fseq_,
+  sndwnd_.acked_, sndwnd_.send_, sndwnd_.nacked_space_);
+    }
+  } while (msg != nullptr);
+}
+
  uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) {
    struct sockaddr_tipc server_addr;
    ssize_t send_len = 0;
@@ -130,29 +166,49 @@ uint32_t TipcPortId::Send(uint8_t* data, 
uint16_t length) {

    return rc;
  }
  -uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length) {
+uint32_t TipcPortId::Queue(const uint8_t* data, uint16_t length,
+    bool is_sent) {
    uint32_t rc = NCSCC_RC_SUCCESS;
      DataMessage *msg = new DataMessage;
    msg->header_.Decode(const_cast(data));
    msg->Decode(const_cast(data));
    msg->msg_data_ = new uint8_t[length];
+  msg->is_sent_ = is_sent;
    memcpy(msg->msg_data_, data, length);
    sndqueue_.Queue(msg);
-  ++sndwnd_.send_;
-  sndwnd_.nacked_space_ += length;
-  m_MDS_LOG_DBG("FCTRL: [me] --> [node:%x, ref:%u], "
-  "SndData[mseq:%u, mfrag:%u, fseq:%u, len:%u], "
-  "sndwnd[acked:%u, send:%u, nacked:%" PRIu64 "]",
-

Re: [devel] [PATCH 1/9] mds: Add README for solution of TIPC buffer overflow at MDS [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

Thanks for your time to review the patches, the question is interesting.

At this moment with normal load traffic, the resource towards the new 
standby (old active) is not released and will be reused if standby 
switches back to active. The reason is that mds won't start the "tx 
probation" again to confirm flow control support as mds has known it had 
enabled flow control on this port id. The messages towards the new 
active are sent on another port id thus they are running on a different 
flow control counter. The test of multiple switchover looks ok so far. 
However, the problem probably happens with overloaded traffic while a 
failover/switchover (I haven't tested this case). The pending messages 
under overload state to be sent to the old active won't be sent to the 
new active, I guess the mds user would get TIMEOUT and try again to send 
the message to the new active, which at least corresponds to legacy 
behavior. However, this could be looked at as an improvement as we have 
pending messages, we know the new active, we can send the pending 
messages to new active, but another question is that whether the 
existing users expect to receive these pending messages according to 
their current logics.


Regards,

Minh

On 16/9/19 5:34 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have just finished my review of your MDS patches, and I have a 
question:


With 2N services, suppose the active is having a TIPC overload issue;
it will do some memory allocations, and probably start a timer 
there too.


Then, what happens if that active service is changed to the standby role?
Shall allocated memory/timer be freed up and is there any impact on 
the subsequent messages sent to the new active?


Regards, Vu

On 8/14/19 1:38 PM, Minh Chau wrote:

---
  src/mds/README | 221 
+

  1 file changed, 221 insertions(+)
  create mode 100644 src/mds/README

diff --git a/src/mds/README b/src/mds/README
new file mode 100644
index 000..1b94632
--- /dev/null
+++ b/src/mds/README
@@ -0,0 +1,221 @@
+/*  -*- OpenSAF  -*-
+ *
+ * (C) Copyright 2019 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are 
licensed
+ * under the GNU Lesser General Public License Version 2.1, February 
1999.

+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+Background
+==
+If OpenSAF configures TIPC as transport, the MDS library today will use
+TIPC SOCK_RDM socket for message distribution in the cluster. The 
SOCK_RDM
+datagram socket possibly encounters buffer overflow at receiver ends 
which

+has been documented in tipc.io[1]. A temporary solution for this buffer
+overflow issue is that the socket buffer size can be increased to a 
larger
+number. However, if the cluster continues either scaling out or 
adding more

+components, the system will be under dimensioned, thus the TIPC buffer
+overflow can occur again.
+
+MDS's solution for TIPC buffer overflow
+===
+If MDS disables TIPC_DEST_DROPPABLE, TIPC will return the ancillary 
message
+when the original message is failed to deliver. By this event, if 
the message
+has been saved in queue, MDS at sender sides can search and 
retransmit this

+message to the receivers.
+Once the messages in the sender's queue has been delivered 
successfully, MDS

+needs to remove them. MDS introduces its internal ACK message as an
+acknowledgment from receivers so that the senders can remove the 
messages

+out of the queue.
+Also, as such situation of buffer overflow at receivers, the 
retransmission may
+not succeed or even become worse at receiver ends (the more 
retransmission,
+the more overflow to occur). MDS imitates the sliding window in 
TCP[2] to

+control the flow of data message towards the receivers.
+
+Legacy MDS data message, new (data + ACK) MDS message, and 
upgradability
+ 

+Below is the MDS legacy message format that has been used till 
OpenSAF 5.19.07

+
+oct 0  message length
+oct 1
+--
+oct 2  sequence number: incremented for every message sent out to 
all destined

+...   tipc portid.
+oct 5
+--
+oct 6  fragment number: a message with same sequence number can be 
fragmented,

+oct 7  identified by this fragment number.
+--
+oct 8  length check: cross check with message length(oct0,1), NOT USED.
+oct 9
+--
+oct 10 protocol version: (MDS_PROT:0xA0 |

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

I see it, will add.

Thanks

Minh

On 16/9/19 4:21 pm, Nguyen Minh Vu wrote:

Hi Minh,

See my responses to your comments below, started with [Vu2].

Regards, Vu

On 9/16/19 1:06 PM, Minh Hon Chau wrote:

Hi Vu,

Several comments with [M] too :).

Thanks

Minh

On 16/9/19 2:24 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have several comments below, started with [Vu].

Regards, Vu

On 8/14/19 1:01 PM, Minh Chau wrote:

This is a collaborative patch of two participants:
- Tran Thuan 
- Minh Chau 

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding 
window,

mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
  src/mds/Makefile.am  |  10 +-
  src/mds/mds_dt.h |   8 +-
  src/mds/mds_dt_tipc.c    | 188 +---
  src/mds/mds_tipc_fctrl_intf.cc   | 376 
+++

  src/mds/mds_tipc_fctrl_intf.h    |  47 +
  src/mds/mds_tipc_fctrl_msg.cc    | 142 +++
  src/mds/mds_tipc_fctrl_msg.h | 129 ++
  src/mds/mds_tipc_fctrl_portid.cc | 261 +++
  src/mds/mds_tipc_fctrl_portid.h  |  87 +
  9 files changed, 1184 insertions(+), 64 deletions(-)
  create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
  create mode 100644 src/mds/mds_tipc_fctrl_intf.h
  create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
  create mode 100644 src/mds/mds_tipc_fctrl_msg.h
  create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
  create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
  if ENABLE_TIPC_TRANSPORT
  noinst_HEADERS += src/mds/mds_dt_tipc.h \
  src/mds/mds_tipc_recvq_stats.h \
-    src/mds/mds_tipc_recvq_stats_impl.h
+    src/mds/mds_tipc_recvq_stats_impl.h \
+    src/mds/mds_tipc_fctrl_intf.h \
+    src/mds/mds_tipc_fctrl_portid.h \
+    src/mds/mds_tipc_fctrl_msg.h
  lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
  src/mds/mds_tipc_recvq_stats.cc \
-    src/mds/mds_tipc_recvq_stats_impl.cc
+    src/mds/mds_tipc_recvq_stats_impl.cc \
+    src/mds/mds_tipc_fctrl_intf.cc \
+    src/mds/mds_tipc_fctrl_portid.cc \
+    src/mds/mds_tipc_fctrl_msg.cc
  endif
    if ENABLE_TESTS
diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t 
mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref);

  uint32_t mds_tmr_mailbox_processing(void);
  uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL 
*svc_hdl);
  uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, 
uint32_t seq_num,

-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t 
fctrl_seq_num);

  uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
  uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, 
uint64_t tipc_id,

  uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, 
NCSCONTEXT msg);

    #define MDS_PROT 0xA0
  #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  +/* MDS protocol/version for flow control */
+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
  /* Added for the subscription changes */
  #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
  #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
  #include "mds_dt_tipc.h"
  #include "mds_dt_tcp_disc.h"
  #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
  #include "mds_tipc_recvq_stats.h"
  #include "base/osaf_utility.h"
  #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
    const unsigned int MAX_RECV_THRESHOLD = 30;
+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
  -static bool get_tipc_port_id(int sock, uint32_t* port_id) {
+static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) {
  struct sockaddr_tipc addr;
  socklen_t sz = sizeof(addr);
    memset(, 0, sizeof(addr));
-    *port_id = 0;
+    port_id->node = 0;
+    port_id->ref = 0;
  if (0 > getsockname(sock, (struct sockaddr *), )) {
  syslog(LOG_ERR, "MDTM:TIPC Failed t

Re: [devel] [PATCH 5/9] mds: Add state machine for tipc portid instance [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

Some comments with [M]

Thanks

Minh

On 16/9/19 2:56 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have a few comments below.

Regards, Vu

On 8/14/19 1:38 PM, Minh Chau wrote:

This patch adds state machine to support tx probation timer.
---
  src/mds/mds_tipc_fctrl_intf.cc   |  47 +++--
  src/mds/mds_tipc_fctrl_msg.h |   1 +
  src/mds/mds_tipc_fctrl_portid.cc | 109 
+++

  src/mds/mds_tipc_fctrl_portid.h  |  22 
  4 files changed, 176 insertions(+), 3 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_intf.cc 
b/src/mds/mds_tipc_fctrl_intf.cc

index bd0a8f6..c2d0922 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -34,6 +34,7 @@
    using mds::Event;
  using mds::TipcPortId;
+using mds::Timer;
  using mds::DataMessage;
  using mds::ChunkAck;
  using mds::HeaderMessage;
@@ -65,6 +66,11 @@ uint64_t sock_buf_size = 0;
  std::map portid_map;
  std::mutex portid_map_mutex;
  +// probation timer event to enable flow control at receivers
+const int64_t kBaseTimerInt = 200;  // in centisecond
+const uint8_t kTxProbMaxRetries = 10;
+Timer txprob_timer(Event::Type::kEvtTmrTxProb);
+
  // chunk ack parameters
  // todo: The chunk ack timeout and chunk ack size should be 
configurable

  int kChunkAckTimeout = 1000;  // in miliseconds
@@ -76,13 +82,37 @@ TipcPortId* portid_lookup(struct tipc_portid id) {
    return portid_map[uid];
  }
  +void tmr_exp_cbk(void* uarg) {
+  Timer* timer = reinterpret_cast(uarg);
+  if (timer != nullptr) {
+    timer->is_active_ = false;
+    // send to fctrl thread
+    if (m_NCS_IPC_SEND(_events, new Event(timer->type_),
+    NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
+  m_MDS_LOG_ERR("FCTRL: Failed to send msg to mbx_events\n");
+    }
+  }
+}
+
  void process_timer_event(const Event evt) {
+  bool txprob_restart = false;
    for (auto i : portid_map) {
  TipcPortId* portid = i.second;
+
+    if (evt.type_ == Event::Type::kEvtTmrTxProb) {
+  if (portid->ReceiveTmrTxProb(kTxProbMaxRetries) == true) {
+    txprob_restart = true;
+  }
+    }
+
  if (evt.type_ == Event::Type::kEvtTmrChunkAck) {
    portid->ReceiveTmrChunkAck();
  }
    }
+  if (txprob_restart) {
+    txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
+    m_MDS_LOG_DBG("FCTRL: Restart txprob");
+  }
  }
    uint32_t process_flow_event(const Event evt) {
@@ -231,8 +261,10 @@ uint32_t mds_tipc_fctrl_sndqueue_capable(struct 
tipc_portid id, uint16_t len,

  id.node, id.ref, __LINE__);
  rc = NCSCC_RC_FAILURE;
    } else {
-    // assign the sequence number of the outgoing message
-    *next_seq = portid->GetCurrentSeq();
+    if (portid->state_ != TipcPortId::State::kDisabled) {
+  // assign the sequence number of the outgoing message
+  *next_seq = portid->GetCurrentSeq();
+    }
    }
      portid_map_mutex.unlock();
@@ -252,7 +284,16 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t 
*buffer, uint16_t len,

  id.node, id.ref, __LINE__);
  rc = NCSCC_RC_FAILURE;
    } else {
-    portid->Queue(buffer, len);
+    if (portid->state_ != TipcPortId::State::kDisabled) {
+  portid->Queue(buffer, len);
+  // start txprob timer for the first msg sent out
+  // do not start for other states
+  if (portid->state_ == TipcPortId::State::kStartup) {
+    txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
+    m_MDS_LOG_DBG("FCTRL: Start txprob");
+    portid->state_ = TipcPortId::State::kTxProb;
+  }
+    }
    }
      portid_map_mutex.unlock();
diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h
index 8e6a874..69f8048 100644
--- a/src/mds/mds_tipc_fctrl_msg.h
+++ b/src/mds/mds_tipc_fctrl_msg.h
@@ -45,6 +45,7 @@ class Event {
  kEvtDropData,  // event reported from tipc that a 
message is not

 // delivered
  kEvtTmrAll,
+    kEvtTmrTxProb,    // event that tx probation timer expired for once
  kEvtTmrChunkAck,  // event to send the chunk ack
    };
    NCS_IPC_MSG next_{0};
diff --git a/src/mds/mds_tipc_fctrl_portid.cc 
b/src/mds/mds_tipc_fctrl_portid.cc

index 64115d5..84ecee9 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -23,6 +23,35 @@
    namespace mds {
  +Timer::Timer(Event::Type type) {
+  tmr_id_ = nullptr;
+  type_ = type;
+  is_active_ = false;
+}
+
+Timer::~Timer() {

[Vu] Is it required to stop the timer here if it is still active?

[M]: Yes, will add the Stop() here

+}
+
+void Timer::Start(int64_t period, void (*tmr_exp_func)(void*)) {
+  // timer will not start if it's already started
+  // period is in centiseconds
+  if (is_active_ == false) {
+    if (tmr_id_ == nullptr) {
+  tmr_id_ = ncs_tmr_alloc(nullptr, 0);
+    }
+    tmr_id_ = ncs_tmr_start(tmr_id_, period, tmr_exp_func, this,
+    nullptr, 0);
+    is_active_ = true;
+  }
+}
+
+void Timer::Stop() {
[Vu] This method is not called from anywhere. Is there

Re: [devel] [PATCH 4/9] mds: Add timeout for ack message [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

Some comments with [M]

Thanks

Minh

On 16/9/19 2:37 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have minor comments below.

Regards, Vu

On 8/14/19 1:38 PM, Minh Chau wrote:

If the ack size is configured greater than 1, there should be a timeout
at receiver ends to send the ack message back to senders.
The ack message timeout utilizes the poll timeout in flow control thread
to make mds lightweight (in contrast to additional timer threads).
---
  src/mds/mds_tipc_fctrl_intf.cc   | 33 
++---

  src/mds/mds_tipc_fctrl_msg.h |  6 ++
  src/mds/mds_tipc_fctrl_portid.cc | 15 +++
  src/mds/mds_tipc_fctrl_portid.h  |  1 +
  4 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/src/mds/mds_tipc_fctrl_intf.cc 
b/src/mds/mds_tipc_fctrl_intf.cc

index 91b9107..bd0a8f6 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -66,7 +66,8 @@ std::map portid_map;
  std::mutex portid_map_mutex;
    // chunk ack parameters
-// todo: The chunk ack size should be configurable
+// todo: The chunk ack timeout and chunk ack size should be 
configurable

+int kChunkAckTimeout = 1000;  // in miliseconds
  uint16_t kChunkAckSize = 3;
    TipcPortId* portid_lookup(struct tipc_portid id) {
@@ -75,6 +76,15 @@ TipcPortId* portid_lookup(struct tipc_portid id) {
    return portid_map[uid];
  }
  +void process_timer_event(const Event evt) {
+  for (auto i : portid_map) {
+    TipcPortId* portid = i.second;
+    if (evt.type_ == Event::Type::kEvtTmrChunkAck) {
+  portid->ReceiveTmrChunkAck();
+    }
+  }
+}
+
  uint32_t process_flow_event(const Event evt) {
    uint32_t rc = NCSCC_RC_SUCCESS;
    TipcPortId *portid = portid_lookup(evt.id_);
@@ -110,7 +120,7 @@ uint32_t process_flow_event(const Event evt) {
  uint32_t process_all_events(void) {
    enum { FD_FCTRL = 0, NUM_FDS };
  -  int poll_tmo = MDTM_TIPC_POLL_TIMEOUT;
+  int poll_tmo = kChunkAckTimeout;
    while (true) {
  int pollres;
  struct pollfd pfd[NUM_FDS] = {{0}};
@@ -135,11 +145,24 @@ uint32_t process_all_events(void) {
  if (evt == nullptr) continue;
    portid_map_mutex.lock();
-    process_flow_event(*evt);
+
+    if (evt->IsTimerEvent()) {
+  process_timer_event(*evt);
+    }
+    if (evt->IsFlowEvent()) {
+  process_flow_event(*evt);
+    }
+

[Vu] Should log something here if the event is none of above?
[M] Probably not; the event is created internally, so we know there won't
be any other than the above.

  delete evt;
  portid_map_mutex.unlock();
    }
  }
+    // timeout, scan all portid and send ack msgs
+    if (pollres == 0) {
+  portid_map_mutex.lock();
+  process_timer_event(Event(Event::Type::kEvtTmrChunkAck));
+  portid_map_mutex.unlock();
+    }
    }  /* while */
    return NCSCC_RC_SUCCESS;
  }
@@ -368,6 +391,10 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t 
*buffer, uint16_t len,

    portid_map_mutex.lock();
    uint32_t rc = process_flow_event(Event(Event::Type::kEvtRcvData,
    id, data.svc_id_, header.mseq_, header.mfrag_, 
header.fseq_));

+  if (rc == NCSCC_RC_CONTINUE) {
+ process_timer_event(Event(Event::Type::kEvtTmrChunkAck));

[Vu] The mutex unlock is missing here

[M] It's not missed, it's called before return

+    rc = NCSCC_RC_SUCCESS;
+  }
    portid_map_mutex.unlock();
    return rc;
  }
diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h
index 677f256..8e6a874 100644
--- a/src/mds/mds_tipc_fctrl_msg.h
+++ b/src/mds/mds_tipc_fctrl_msg.h
@@ -44,6 +44,8 @@ class Event {
 // selective data msgs (not supported)
  kEvtDropData,  // event reported from tipc that a 
message is not

 // delivered
+    kEvtTmrAll,
+    kEvtTmrChunkAck,  // event to send the chunk ack
    };
    NCS_IPC_MSG next_{0};
    Type type_;
@@ -68,6 +70,10 @@ class Event {
  fseq_(f_seg_num), chunk_size_(chunk_size) {
  type_ = type;
    }
+  bool IsTimerEvent() { return (type_ > Type::kEvtTmrAll); }
+  bool IsFlowEvent() {
+    return (Type::kEvtDataFlowAll < type_ && type_ < Type::kEvtTmrAll);
+  }
[Vu] Consider making these const methods if they do not
change any of their attribute values.

[M] Yes, will add const

  };
    class BaseMessage {
diff --git a/src/mds/mds_tipc_fctrl_portid.cc 
b/src/mds/mds_tipc_fctrl_portid.cc

index 24d13ee..64115d5 100644
--- a/src/mds/mds_tipc_fctrl_portid.cc
+++ b/src/mds/mds_tipc_fctrl_portid.cc
@@ -67,6 +67,8 @@ TipcPortId::TipcPortId(struct tipc_portid id, int 
sock, uint16_t chksize,

  }
    TipcPortId::~TipcPortId() {
+  // Fake a TmrChunkAck event to ack all received messages
+  ReceiveTmrChunkAck();
    // clear all msg in sndqueue_
    sndqueue_.Clear();
  }
@@ -156,6 +158,7 @@ uint32_t TipcPortId::ReceiveData(uint32_t mseq, 
uint16_t mfrag,

    // send ack for @chunk_size_ msgs starting

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-09-16 Thread Minh Hon Chau


Hi Vu,

Several comments with [M] too :).

Thanks

Minh

On 16/9/19 2:24 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have several comments below, started with [Vu].

Regards, Vu

On 8/14/19 1:01 PM, Minh Chau wrote:

This is a collaborative patch of two participants:
- Tran Thuan 
- Minh Chau 

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
  src/mds/Makefile.am  |  10 +-
  src/mds/mds_dt.h |   8 +-
  src/mds/mds_dt_tipc.c    | 188 +---
  src/mds/mds_tipc_fctrl_intf.cc   | 376 
+++

  src/mds/mds_tipc_fctrl_intf.h    |  47 +
  src/mds/mds_tipc_fctrl_msg.cc    | 142 +++
  src/mds/mds_tipc_fctrl_msg.h | 129 ++
  src/mds/mds_tipc_fctrl_portid.cc | 261 +++
  src/mds/mds_tipc_fctrl_portid.h  |  87 +
  9 files changed, 1184 insertions(+), 64 deletions(-)
  create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
  create mode 100644 src/mds/mds_tipc_fctrl_intf.h
  create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
  create mode 100644 src/mds/mds_tipc_fctrl_msg.h
  create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
  create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
  if ENABLE_TIPC_TRANSPORT
  noinst_HEADERS += src/mds/mds_dt_tipc.h \
  src/mds/mds_tipc_recvq_stats.h \
-    src/mds/mds_tipc_recvq_stats_impl.h
+    src/mds/mds_tipc_recvq_stats_impl.h \
+    src/mds/mds_tipc_fctrl_intf.h \
+    src/mds/mds_tipc_fctrl_portid.h \
+    src/mds/mds_tipc_fctrl_msg.h
  lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
  src/mds/mds_tipc_recvq_stats.cc \
-    src/mds/mds_tipc_recvq_stats_impl.cc
+    src/mds/mds_tipc_recvq_stats_impl.cc \
+    src/mds/mds_tipc_fctrl_intf.cc \
+    src/mds/mds_tipc_fctrl_portid.cc \
+    src/mds/mds_tipc_fctrl_msg.cc
  endif
    if ENABLE_TESTS
diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL 
ref);

  uint32_t mds_tmr_mailbox_processing(void);
  uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL 
*svc_hdl);
  uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t 
seq_num,

-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t fctrl_seq_num);
  uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
  uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, 
uint64_t tipc_id,

  uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, 
NCSCONTEXT msg);

    #define MDS_PROT 0xA0
  #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  +/* MDS protocol/version for flow control */
+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
  /* Added for the subscription changes */
  #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
  #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
  #include "mds_dt_tipc.h"
  #include "mds_dt_tcp_disc.h"
  #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
  #include "mds_tipc_recvq_stats.h"
  #include "base/osaf_utility.h"
  #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
    const unsigned int MAX_RECV_THRESHOLD = 30;
+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
  -static bool get_tipc_port_id(int sock, uint32_t* port_id) {
+static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) {
  struct sockaddr_tipc addr;
  socklen_t sz = sizeof(addr);
    memset(, 0, sizeof(addr));
-    *port_id = 0;
+    port_id->node = 0;
+    port_id->ref = 0;
  if (0 > getsockname(sock, (struct sockaddr *), )) {
  syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: 
%s",

 strerror(errno));
  return false;
  }
  -    *port_id = addr.addr.id.ref;
+    *port_id = addr.addr.id;
  return true;
  }
  @@ -240,12 +243,13 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, 
uint32_t *mds_tipc_ref)

  }
    /* Code for

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-09-15 Thread Minh Hon Chau


Hi Hans, Gary, Vu

Do you have any comments on remaining patches?

Thanks

Minh

On 11/9/19 11:01 am, Minh Hon Chau wrote:

Hi Gary,

Thanks for the review, please find comments with [M].

/Minh

On 10/9/19 6:02 pm, Gary Lee wrote:

Hi Minh & Thuan

Some minor comments marked with [GL].

On 14/8/19 4:38 pm, Minh Chau wrote:

This is a collaborative patch of two participants:Thuan, Minh.

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
  src/mds/Makefile.am  |  10 +-
  src/mds/mds_dt.h |   8 +-
  src/mds/mds_dt_tipc.c    | 188 +---
  src/mds/mds_tipc_fctrl_intf.cc   | 376 
+++

  src/mds/mds_tipc_fctrl_intf.h    |  47 +
  src/mds/mds_tipc_fctrl_msg.cc    | 142 +++
  src/mds/mds_tipc_fctrl_msg.h | 129 ++
  src/mds/mds_tipc_fctrl_portid.cc | 261 +++
  src/mds/mds_tipc_fctrl_portid.h  |  87 +
  9 files changed, 1184 insertions(+), 64 deletions(-)
  create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
  create mode 100644 src/mds/mds_tipc_fctrl_intf.h
  create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
  create mode 100644 src/mds/mds_tipc_fctrl_msg.h
  create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
  create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
  if ENABLE_TIPC_TRANSPORT
  noinst_HEADERS += src/mds/mds_dt_tipc.h \
  src/mds/mds_tipc_recvq_stats.h \
-    src/mds/mds_tipc_recvq_stats_impl.h
+    src/mds/mds_tipc_recvq_stats_impl.h \
+    src/mds/mds_tipc_fctrl_intf.h \
+    src/mds/mds_tipc_fctrl_portid.h \
+    src/mds/mds_tipc_fctrl_msg.h
  lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
  src/mds/mds_tipc_recvq_stats.cc \
-    src/mds/mds_tipc_recvq_stats_impl.cc
+    src/mds/mds_tipc_recvq_stats_impl.cc \
+    src/mds/mds_tipc_fctrl_intf.cc \
+    src/mds/mds_tipc_fctrl_portid.cc \
+    src/mds/mds_tipc_fctrl_msg.cc
  endif
    if ENABLE_TESTS
diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL 
ref);

  uint32_t mds_tmr_mailbox_processing(void);
  uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL 
*svc_hdl);
  uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, 
uint32_t seq_num,

-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t 
fctrl_seq_num);

  uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
  uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, 
uint64_t tipc_id,

  uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, 
NCSCONTEXT msg);

    #define MDS_PROT 0xA0
  #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  +/* MDS protocol/version for flow control */
+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
  /* Added for the subscription changes */
  #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
  #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
  #include "mds_dt_tipc.h"
  #include "mds_dt_tcp_disc.h"
  #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
  #include "mds_tipc_recvq_stats.h"
  #include "base/osaf_utility.h"
  #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
    const unsigned int MAX_RECV_THRESHOLD = 30;
+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
  -static bool get_tipc_port_id(int sock, uint32_t* port_id) {
+static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) {
  struct sockaddr_tipc addr;
  socklen_t sz = sizeof(addr);
    memset(, 0, sizeof(addr));
-    *port_id = 0;
+    port_id->node = 0;
+    port_id->ref = 0;
  if (0 > getsockname(sock, (struct sockaddr *), )) {
  syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: 
%s",

 strerror(errno));
  return false;
  }
  -    *port

Re: [devel] [PATCH 7/9] mds: Add configurable parameters [#1960]

2019-09-13 Thread Minh Hon Chau


Hi Vu,

I have the comments [M].

Thanks

Minh

On 13/9/19 6:40 pm, Nguyen Minh Vu wrote:

Hi Minh,

I have minor comments below.

Regards, Vu

On 8/14/19 1:38 PM, Minh Chau wrote:

This patch makes the solution of TIPC buffer overflow configurable,
as well as the ack timeout/ack size.
For example:
The service config file can export the following environment variables

export MDS_TIPC_FCTRL_ENABLED=1
export MDS_TIPC_FCTRL_ACKTIMEOUT=1000
export MDS_TIPC_FCTRL_ACKSIZE=1

If MDS_TIPC_FCTRL_ACKTIMEOUT, MDS_TIPC_FCTRL_ACKSIZE are not specified,
the default values are used.
---
  src/mds/mds_dt_tipc.c  | 19 ---
  src/mds/mds_tipc_fctrl_intf.cc | 25 +++--
  src/mds/mds_tipc_fctrl_intf.h  |  3 ++-
  3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index fef1c50..1b6c3f8 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -342,9 +342,22 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, uint32_t 
*mds_tipc_ref)

  }
    /* Create flow control tasks if enabled*/
-    gl_mds_pro_ver = MDS_PROT_FCTRL;
-    mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id,
-    (uint64_t)optval, tipc_mcast_enabled);
+    char* ptr;
+    if ((ptr = getenv("MDS_TIPC_FCTRL_ENABLED")) != NULL) {
+    if (atoi(ptr) == 1) {
+    gl_mds_pro_ver = MDS_PROT_FCTRL;
+    int ackto = -1;
+    int acksize = -1;
+    if ((ptr = getenv("MDS_TIPC_FCTRL_ACKTIMEOUT")) != NULL) {
+    ackto = atoi(ptr);
+    }
+    if ((ptr = getenv("MDS_TIPC_FCTRL_ACKSIZE")) != NULL) {
+    acksize = atoi(ptr);
+    }
[Vu] Do we have a valid range for these environment variables? What if
they are mistakenly set to empty values?

e.g:

export MDS_TIPC_FCTRL_ACKTIMEOUT=""
[M] We have base::GetEnv and will try to use it here; if that is not
possible because this source file is C code, I will add more handling for
out-of-range values, or a warning if a value is set e.g. too big.



+ mds_tipc_fctrl_initialize(tipc_cb.BSRsock, port_id, (uint64_t)optval,
+    ackto, acksize, tipc_mcast_enabled);
+    }
+    }
    /* Create a task to receive the events and data */
  if (mdtm_create_rcv_task(tipc_cb.hdle_mdtm) != NCSCC_RC_SUCCESS) {
diff --git a/src/mds/mds_tipc_fctrl_intf.cc 
b/src/mds/mds_tipc_fctrl_intf.cc

index 397114e..8949937 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -40,6 +40,9 @@ using mds::ChunkAck;
  using mds::HeaderMessage;
    namespace {
+// flow control enabled/disabled
+bool is_fctrl_enabled = false;
+
  // multicast/broadcast enabled
  // todo: to be removed if flow control support it
  bool is_mcast_enabled = true;
@@ -225,7 +228,8 @@ uint32_t create_ncs_task(void *task_hdl) {
  }  // end local namespace
    uint32_t mds_tipc_fctrl_initialize(int dgramsock, struct 
tipc_portid id,

-    uint64_t rcv_buf_size, bool mcast_enabled) {
+    uint64_t rcv_buf_size, int32_t ackto, int32_t acksize,
+    bool mcast_enabled) {
    if (create_ncs_task(_task_hdl) !=
    NCSCC_RC_SUCCESS) {
  m_MDS_LOG_ERR("FCTRL: Start of the Created Task-failed:\n");
@@ -234,8 +238,10 @@ uint32_t mds_tipc_fctrl_initialize(int 
dgramsock, struct tipc_portid id,

    data_sock_fd = dgramsock;
    snd_rcv_portid = id;
    sock_buf_size = rcv_buf_size;
+  is_fctrl_enabled = true;
    is_mcast_enabled = mcast_enabled;
-
+  if (ackto != -1) kChunkAckTimeout = ackto;
+  if (acksize != -1) kChunkAckSize = acksize;
    m_MDS_LOG_NOTIFY("FCTRL: Initialize [node:%x, ref:%u]",
    id.node, id.ref);
  @@ -243,6 +249,7 @@ uint32_t mds_tipc_fctrl_initialize(int 
dgramsock, struct tipc_portid id,

  }
    uint32_t mds_tipc_fctrl_shutdown(void) {
+  if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS;
    if (ncs_task_release(p_task_hdl) != NCSCC_RC_SUCCESS) {
  m_MDS_LOG_ERR("FCTRL: Stop of the Created Task-failed:\n");
    }
@@ -251,6 +258,8 @@ uint32_t mds_tipc_fctrl_shutdown(void) {
    uint32_t mds_tipc_fctrl_sndqueue_capable(struct tipc_portid id, 
uint16_t len,

    uint16_t* next_seq) {
+  if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS;
+
    uint32_t rc = NCSCC_RC_SUCCESS;
      portid_map_mutex.lock();
[Vu] We have a common class base::Lock that can help to unlock
automatically when it goes out of scope. Should we make
portid_map_mutex a Lock object?

[M]: Yes I should use base::Lock, will change it.
@@ -274,6 +283,8 @@ uint32_t mds_tipc_fctrl_sndqueue_capable(struct 
tipc_portid id, uint16_t len,

    uint32_t mds_tipc_fctrl_trysend(const uint8_t *buffer, uint16_t len,
  struct tipc_portid id) {
+  if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS;
+
    uint32_t rc = NCSCC_RC_SUCCESS;
      portid_map_mutex.lock();
@@ -304,6 +315,8 @@ uint32_t mds_tipc_fctrl_trysend(const uint8_t 
*buffer, uint16_t len,

  }
    uint32_t mds_tipc_fctrl_portid_up(struct tipc_portid id,

Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-12 Thread Minh Hon Chau


Hi Gary,

This V2 has fixed the error reported in V1, ack from me.

Thanks

Minh

On 12/9/19 5:20 pm, Gary Lee wrote:

If delayed failover is enabled, and a downgrade to a version without #3060 
occurs,
then the standby running a newer version with #3060 may complain about an out
of sync error during warm sync.
---
  src/amf/amfd/ckpt_dec.cc | 23 +++
  1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 6288b4f..75213f8 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2721,10 +2721,25 @@ uint32_t avd_dec_warm_sync_rsp(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
  if (updt_cnt->ng_updt != cb->async_updt_cnt.ng_updt)
LOG_ER("ng_updt counters mismatch: Active: %u Standby: %u",
   updt_cnt->ng_updt, cb->async_updt_cnt.ng_updt);
-if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt)
-  LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
- updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
-
+if (updt_cnt->failover_updt != cb->async_updt_cnt.failover_updt) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+LOG_ER("failover_updt counters mismatch: Active: %u Standby: %u",
+   updt_cnt->failover_updt, cb->async_updt_cnt.failover_updt);
+  } else {
+// Versions before 10 did not support failover_updt
+// After a downgrade scenario, where the active is < v10
+// and this node is >= v10, then there will be failover_updt mismatch
+// If so, just set the value to what's on the older active
+cb->async_updt_cnt.failover_updt = updt_cnt->failover_updt;
+
+// check again
+if (0 == memcmp(updt_cnt, &cb->async_updt_cnt,
+sizeof(AVSV_ASYNC_UPDT_CNT))) {
+  cb->stby_sync_state = AVD_STBY_IN_SYNC;
+  return status;
+}
+  }
+}
  LOG_ER("Out of sync detected in warm sync response, exiting");
  osafassert(0);
  



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfd: fix coredump during downgrade if delayed failover is enabled [#3078]

2019-09-10 Thread Minh Hon Chau


Hi Gary,

The patch works fine in the reported scenario, no coredump in amfd.

But after the downgrade succeeds (meaning sc1 is active and running the old
software, and sc2 is standby running the latest software + #3078), I
performed another switchover to make sc2 active again, and I got the error shown in the log below.


Thanks

Minh

2019-09-11 14:31:58.633 SC-2 osafamfd[280]: WA 
avsv_validate_reo_type_in_csync: unknown type 52
2019-09-11 14:31:58.674 SC-2 osafimmnd[234]: NO Implementer (applier) 
connected: 43 (@OpenSafImmReplicatorB) <0, 2010f>
2019-09-11 14:31:59.496 SC-2 osafimmnd[234]: NO Implementer disconnected 
35 <0, 2010f> (safAmfService)
2019-09-11 14:31:59.500 SC-2 osafimmnd[234]: NO Implementer (applier) 
connected: 44 (@safAmfService2010f) <0, 2010f>
2019-09-11 14:31:59.524 SC-2 osafamfd[280]: NO Switching StandBy --> 
Active State
2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER Switch Standby --> Active 
FAILED, Standby OUT OF SYNC
2019-09-11 14:31:59.526 SC-2 osafamfd[280]: ER avd_role_change role 
change failure
2019-09-11 14:31:59.544 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:7, dest:13)
2019-09-11 14:31:59.547 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:7, dest:13)

2019-09-11 14:31:59.551 SC-2 osafamfnd[290]: NO AVD NEW_ACTIVE, adest:1
2019-09-11 14:31:59.563 SC-2 osafimmnd[234]: NO Implementer disconnected 
44 <0, 2010f> (@safAmfService2010f)
2019-09-11 14:31:59.566 SC-2 osafimmnd[234]: NO Implementer connected: 
45 (safAmfService) <0, 2010f>
2019-09-11 14:31:59.580 SC-2 osafamfd[280]: WA 
avsv_validate_reo_type_in_csync: unknown type 52
2019-09-11 14:32:09.626 SC-2 osafamfd[280]: message repeated 4 times: [ 
WA avsv_validate_reo_type_in_csync: unknown type 52]
2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 25 
(change:4, dest:564114788998701)
2019-09-11 14:32:59.775 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:1, dest:13)
2019-09-11 14:32:59.776 SC-2 osafimmd[223]: NO MDS event from svc_id 24 
(change:6, dest:13)

2019-09-11 14:32:59.777 SC-2 osaffmd[213]: NO IMMND down on: 2010f
2019-09-11 14:32:59.777 SC-2 osafimmnd[234]: WA DISCARD DUPLICATE FEVS 
message:2334
2019-09-11 14:32:59.778 SC-2 osafimmnd[234]: WA Error code 2 returned 
for message type 82 - ignoring
2019-09-11 14:32:59.778 SC-2 osafimmd[223]: WA IMMD lost contact with 
peer IMMD (NCSMDS_RED_DOWN)
2019-09-11 14:32:59.780 SC-2 osaffmd[213]: NO Node Down event for node 
id 2010f:

2019-09-11 14:32:59.780 SC-2 osafrded[204]: NO Peer down on node 0x2010f
2019-09-11 14:32:59.782 SC-2 osaffmd[213]: NO AMFND down on: 2010f
2019-09-11 14:32:59.783 SC-2 osaffmd[213]: NO FM down on: 2010f
2019-09-11 14:32:59.784 SC-2 osafamfd[280]: NO Node 'SC-1' is down. 
Start failover delay timer

2019-09-11 14:32:59.784 SC-2 osaffmd[213]: NO IMMD down on: 2010f
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO AVD down on: 2010f
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Core services went down on 
node_id: 2010f

2019-09-11 14:32:59.788 SC-2 osaffmd[213]: NO Current role: STANDBY
2019-09-11 14:32:59.788 SC-2 osaffmd[213]: Rebooting OpenSAF NodeId = 
131343 EE Name = , Reason: Received Node Down for peer controller, 
OwnNodeId = 131599, SupervisionTime = 60
2019-09-11 14:32:59.789 SC-2 osafclmd[270]: NO Node 131343 went down. 
Not sending track callback for agents on that node
2019-09-11 14:32:59.792 SC-2 osafclmd[270]: message repeated 4 times: [ 
NO Node 131343 went down. Not sending track callback for agents on that 
node]
2019-09-11 14:32:59.792 SC-2 osafclmd[270]: NO saflog write 
"safNode=SC-1,safCluster=myClmCluster LEFT, init view=9, cluster 
view=10" FAILED: SA_AIS_ERR_TRY_AGAIN (6)

2019-09-11 14:32:59.792 SC-2 osafamfd[280]: NO Start timer for '2010f'
2019-09-11 14:32:59.808 SC-2 opensaf_reboot: Rebooting remote node in 
the absence of PLM is outside the scope of OpenSAF
2019-09-11 14:32:59.809 SC-2 osaffmd[213]: NO Controller Failover: 
Setting role to ACTIVE

2019-09-11 14:32:59.809 SC-2 osafrded[204]: NO RDE role set to ACTIVE
2019-09-11 14:32:59.810 SC-2 osafrded[204]: NO Running 
'/usr/local/lib/opensaf/opensaf_sc_active' with 0 argument(s)

2019-09-11 14:32:59.812 SC-2 osafamfd[280]: NO FAILOVER StandBy --> Active
2019-09-11 14:32:59.812 SC-2 osafamfd[280]: ER FAILOVER StandBy --> 
Active FAILED, Standby OUT OF SYNC
2019-09-11 14:32:59.812 SC-2 osafamfd[280]: Rebooting OpenSAF NodeId = 0 
EE Name = No EE Mapped, Reason: FAILOVER failed, OwnNodeId = 131599, 
SupervisionTime = 60


2019-09-11 14:31:58.181 SC-1 osafamfd[273]: NO ROLE SWITCH Active --> 
Quiesced
2019-09-11 14:31:58.675 SC-1 osafimmnd[233]: NO Implementer (applier) 
connected: 43 (@OpenSafImmReplicatorB) <269, 2010f>

2019-09-11 14:31:58.676 SC-1 osafntfimcnd[471]: NO Started
2019-09-11 14:31:59.496 SC-1 osafimmnd[233]: NO Implementer disconnected 
35 <97, 2010f> (safAmfService)
2019-09-11 14:31:59.501 SC-1 osafimmnd[233]: NO Implementer (applier) 
connected: 44 (@safAmfService2010f) <97, 2010f>
2019-09-11 14:31:59.525 SC-1

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-09-10 Thread Minh Hon Chau


Hi Gary,

Thanks for the review, please find comments with [M].

/Minh

On 10/9/19 6:02 pm, Gary Lee wrote:

Hi Minh & Thuan

Some minor comments marked with [GL].

On 14/8/19 4:38 pm, Minh Chau wrote:

This is a collaborative patch of two participants:Thuan, Minh.

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
  src/mds/Makefile.am  |  10 +-
  src/mds/mds_dt.h |   8 +-
  src/mds/mds_dt_tipc.c    | 188 +---
  src/mds/mds_tipc_fctrl_intf.cc   | 376 
+++

  src/mds/mds_tipc_fctrl_intf.h    |  47 +
  src/mds/mds_tipc_fctrl_msg.cc    | 142 +++
  src/mds/mds_tipc_fctrl_msg.h | 129 ++
  src/mds/mds_tipc_fctrl_portid.cc | 261 +++
  src/mds/mds_tipc_fctrl_portid.h  |  87 +
  9 files changed, 1184 insertions(+), 64 deletions(-)
  create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
  create mode 100644 src/mds/mds_tipc_fctrl_intf.h
  create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
  create mode 100644 src/mds/mds_tipc_fctrl_msg.h
  create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
  create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
  if ENABLE_TIPC_TRANSPORT
  noinst_HEADERS += src/mds/mds_dt_tipc.h \
  src/mds/mds_tipc_recvq_stats.h \
-    src/mds/mds_tipc_recvq_stats_impl.h
+    src/mds/mds_tipc_recvq_stats_impl.h \
+    src/mds/mds_tipc_fctrl_intf.h \
+    src/mds/mds_tipc_fctrl_portid.h \
+    src/mds/mds_tipc_fctrl_msg.h
  lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
  src/mds/mds_tipc_recvq_stats.cc \
-    src/mds/mds_tipc_recvq_stats_impl.cc
+    src/mds/mds_tipc_recvq_stats_impl.cc \
+    src/mds/mds_tipc_fctrl_intf.cc \
+    src/mds/mds_tipc_fctrl_portid.cc \
+    src/mds/mds_tipc_fctrl_msg.cc
  endif
    if ENABLE_TESTS
diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL 
ref);

  uint32_t mds_tmr_mailbox_processing(void);
  uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL 
*svc_hdl);
  uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t 
seq_num,

-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t fctrl_seq_num);
  uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
  uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, 
uint64_t tipc_id,

  uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, 
NCSCONTEXT msg);

    #define MDS_PROT 0xA0
  #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
  #define MDTM_PRI_MASK 0x3
  +/* MDS protocol/version for flow control */
+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
  /* Added for the subscription changes */
  #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
  #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
  #include "mds_dt_tipc.h"
  #include "mds_dt_tcp_disc.h"
  #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
  #include "mds_tipc_recvq_stats.h"
  #include "base/osaf_utility.h"
  #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
  uint32_t mdtm_global_frag_num;
    const unsigned int MAX_RECV_THRESHOLD = 30;
+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
  -static bool get_tipc_port_id(int sock, uint32_t* port_id) {
+static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) {
  struct sockaddr_tipc addr;
  socklen_t sz = sizeof(addr);
    memset(&addr, 0, sizeof(addr));
-    *port_id = 0;
+    port_id->node = 0;
+    port_id->ref = 0;
  if (0 > getsockname(sock, (struct sockaddr *)&addr, &sz)) {
  syslog(LOG_ERR, "MDTM:TIPC Failed to get socket name, err: 
%s",

 strerror(errno));
  return false;
  }
  -    *port_id = addr.addr.id.ref;
+    *port_id = addr.addr.id;
  return true;
  }
  @@ -240,12 +243,13 @@ uint32_t mdtm_tipc_init(NODE_ID nodeid, 
uint32_t *mds_tipc_ref)

  }
    /* Code for getting the self tipc random

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-08-25 Thread Minh Hon Chau


Hi Hans,

I will update the code for that point.

Thanks

Minh

On 23/8/19 11:14 pm, Hans Nordebäck wrote:

Hi Minh,

see one comment below. /Thanks Hans

On 2019-08-23 03:48, Minh Hon Chau wrote:

Hi Hans,

Thanks for your time to review the patch, please see my replies below
your comments.

Regards,

Minh

On 22/8/19 11:07 pm, Hans Nordebäck wrote:

Hi Minh,

it is a large patch so i have to review parts of it, below are my
comments, marked with [HansN], for files:

src/mds/Makefile.am
src/mds/mds_dt.h
src/mds/mds_dt_tipc.c

I'll continue with the rest of the files a bit later. /Thanks Hans

On 2019-08-14 08:38, Minh Chau wrote:

This is a collaborative patch of two participants:Thuan, Minh.

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
    src/mds/Makefile.am  |  10 +-
    src/mds/mds_dt.h |   8 +-
    src/mds/mds_dt_tipc.c    | 188 +---
    src/mds/mds_tipc_fctrl_intf.cc   | 376
+++
    src/mds/mds_tipc_fctrl_intf.h    |  47 +
    src/mds/mds_tipc_fctrl_msg.cc    | 142 +++
    src/mds/mds_tipc_fctrl_msg.h | 129 ++
    src/mds/mds_tipc_fctrl_portid.cc | 261 +++
    src/mds/mds_tipc_fctrl_portid.h  |  87 +
    9 files changed, 1184 insertions(+), 64 deletions(-)
    create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
    create mode 100644 src/mds/mds_tipc_fctrl_intf.h
    create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
    create mode 100644 src/mds/mds_tipc_fctrl_msg.h
    create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
    create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
    if ENABLE_TIPC_TRANSPORT
    noinst_HEADERS += src/mds/mds_dt_tipc.h \
    src/mds/mds_tipc_recvq_stats.h \
-    src/mds/mds_tipc_recvq_stats_impl.h
+    src/mds/mds_tipc_recvq_stats_impl.h \
+    src/mds/mds_tipc_fctrl_intf.h \
+    src/mds/mds_tipc_fctrl_portid.h \
+    src/mds/mds_tipc_fctrl_msg.h
    lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
    src/mds/mds_tipc_recvq_stats.cc \
-    src/mds/mds_tipc_recvq_stats_impl.cc
+    src/mds/mds_tipc_recvq_stats_impl.cc \
+    src/mds/mds_tipc_fctrl_intf.cc \
+    src/mds/mds_tipc_fctrl_portid.cc \
+    src/mds/mds_tipc_fctrl_msg.cc
    endif
       if ENABLE_TESTS
diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL
ref);
    uint32_t mds_tmr_mailbox_processing(void);
    uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL
*svc_hdl);
    uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len,
uint32_t seq_num,
-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t
fctrl_seq_num);
    uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
    uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len,
uint64_t tipc_id,
    uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg,
NCSCONTEXT msg);
       #define MDS_PROT 0xA0
    #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
    #define MDTM_PRI_MASK 0x3
    +/* MDS protocol/version for flow control */
+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
    /* Added for the subscription changes */
    #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
    #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
    #include "mds_dt_tipc.h"
    #include "mds_dt_tcp_disc.h"
    #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
    #include "mds_tipc_recvq_stats.h"
    #include "base/osaf_utility.h"
    #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
    uint32_t mdtm_global_frag_num;
       const unsigned int MAX_RECV_THRESHOLD = 30;
+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
    -static bool get_tipc_port_id(int sock, uint32_t* port_id) {
+static bool get_tipc_port_id(int sock, struct tipc_portid* port_i

Re: [devel] [PATCH 3/9] mds: Add implementation for TIPC buffer overflow solution [#1960]

2019-08-22 Thread Minh Hon Chau


Hi Hans,

Thanks for your time to review the patch, please see my replies below 
your comments.


Regards,

Minh

On 22/8/19 11:07 pm, Hans Nordebäck wrote:

Hi Minh,

it is a large patch so i have to review parts of it, below are my
comments, marked with [HansN], for files:

src/mds/Makefile.am
src/mds/mds_dt.h
src/mds/mds_dt_tipc.c

I'll continue with the rest of the files a bit later. /Thanks Hans

On 2019-08-14 08:38, Minh Chau wrote:

This is a collaborative patch of two participants:Thuan, Minh.

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.
---
   src/mds/Makefile.am  |  10 +-
   src/mds/mds_dt.h |   8 +-
   src/mds/mds_dt_tipc.c| 188 +---
   src/mds/mds_tipc_fctrl_intf.cc   | 376 
+++
   src/mds/mds_tipc_fctrl_intf.h|  47 +
   src/mds/mds_tipc_fctrl_msg.cc| 142 +++
   src/mds/mds_tipc_fctrl_msg.h | 129 ++
   src/mds/mds_tipc_fctrl_portid.cc | 261 +++
   src/mds/mds_tipc_fctrl_portid.h  |  87 +
   9 files changed, 1184 insertions(+), 64 deletions(-)
   create mode 100644 src/mds/mds_tipc_fctrl_intf.cc
   create mode 100644 src/mds/mds_tipc_fctrl_intf.h
   create mode 100644 src/mds/mds_tipc_fctrl_msg.cc
   create mode 100644 src/mds/mds_tipc_fctrl_msg.h
   create mode 100644 src/mds/mds_tipc_fctrl_portid.cc
   create mode 100644 src/mds/mds_tipc_fctrl_portid.h

diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 2d7b652..d849e8f 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -48,10 +48,16 @@ lib_libopensaf_core_la_SOURCES += \
   if ENABLE_TIPC_TRANSPORT
   noinst_HEADERS += src/mds/mds_dt_tipc.h \
src/mds/mds_tipc_recvq_stats.h \
-   src/mds/mds_tipc_recvq_stats_impl.h
+   src/mds/mds_tipc_recvq_stats_impl.h \
+   src/mds/mds_tipc_fctrl_intf.h \
+   src/mds/mds_tipc_fctrl_portid.h \
+   src/mds/mds_tipc_fctrl_msg.h
   lib_libopensaf_core_la_SOURCES += src/mds/mds_dt_tipc.c \
src/mds/mds_tipc_recvq_stats.cc \
-   src/mds/mds_tipc_recvq_stats_impl.cc
+   src/mds/mds_tipc_recvq_stats_impl.cc \
+   src/mds/mds_tipc_fctrl_intf.cc \
+   src/mds/mds_tipc_fctrl_portid.cc \
+   src/mds/mds_tipc_fctrl_msg.cc
   endif
   
   if ENABLE_TESTS

diff --git a/src/mds/mds_dt.h b/src/mds/mds_dt.h
index b645bb4..d9e8633 100644
--- a/src/mds/mds_dt.h
+++ b/src/mds/mds_dt.h
@@ -162,7 +162,7 @@ uint32_t mdtm_del_from_ref_tbl(MDS_SUBTN_REF_VAL ref);
   uint32_t mds_tmr_mailbox_processing(void);
   uint32_t mdtm_get_from_ref_tbl(MDS_SUBTN_REF_VAL ref, MDS_SVC_HDL *svc_hdl);
   uint32_t mdtm_add_frag_hdr(uint8_t *buf_ptr, uint16_t len, uint32_t seq_num,
-   uint16_t frag_byte);
+   uint16_t frag_byte, uint16_t fctrl_seq_num);
   uint32_t mdtm_free_reassem_msg_mem(MDS_ENCODED_MSG *msg);
   uint32_t mdtm_process_recv_data(uint8_t *buf, uint16_t len, uint64_t tipc_id,
   uint32_t *buff_dump);
@@ -240,9 +240,13 @@ bool mdtm_mailbox_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT 
msg);
   
   #define MDS_PROT 0xA0

   #define MDS_VERSION 0x08
-#define MDS_PROT_VER_MASK (MDS_PROT | MDS_VERSION)
+#define MDS_PROT_VER_MASK 0xFC
   #define MDTM_PRI_MASK 0x3
   
+/* MDS protocol/version for flow control */

+#define MDS_PROT_FCTRL (0xB0 | MDS_VERSION)
+#define MDS_PROT_FCTRL_ID 0x00AC13F5
+
   /* Added for the subscription changes */
   #define MDS_NCS_CHASSIS_ID (m_NCS_GET_NODE_ID & 0x00ff)
   #define MDS_TIPC_COMMON_ID 0x01001000
diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 86b52bb..fef1c50 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -47,6 +47,7 @@
   #include "mds_dt_tipc.h"
   #include "mds_dt_tcp_disc.h"
   #include "mds_core.h"
+#include "mds_tipc_fctrl_intf.h"
   #include "mds_tipc_recvq_stats.h"
   #include "base/osaf_utility.h"
   #include "base/osaf_poll.h"
@@ -165,20 +166,22 @@ NCS_PATRICIA_TREE mdtm_reassembly_list;
   uint32_t mdtm_global_frag_num;
   
   const unsigned int MAX_RECV_THRESHOLD = 30;

+uint8_t gl_mds_pro_ver = MDS_PROT | MDS_VERSION;
   
-static bool get_tipc_port_id(int sock, uint32_t* port_id) {

+static bool get_tipc_port_id(int sock, struct tipc_portid* port_id) {
struct sockaddr_tipc addr;
socklen_t sz = sizeof(addr);
   
   	memset(&addr, 0, sizeof(addr));

-   *port_id = 0;
+   port_id->node = 0;
+   port_id->ref = 0;
if (0 > getsockname(sock, (struct sockaddr *)&addr, &sz)) {
syslog(LOG_ERR, "MDTM:TIPC

Re: [devel] [PATCH 1/1] amfd: set failover_state on standby [#3072]

2019-08-21 Thread Minh Hon CHAU


Hi, Ack, review only.

Quoting Gary Lee :


Otherwise, after two controller failovers, unexpected
reboot of previously rebooted payloads may occur.
---
 src/amf/amfd/node_state_machine.cc | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/amf/amfd/node_state_machine.cc  
b/src/amf/amfd/node_state_machine.cc

index efe2085..d38f79e 100644
--- a/src/amf/amfd/node_state_machine.cc
+++ b/src/amf/amfd/node_state_machine.cc
@@ -63,6 +63,12 @@ void NodeStateMachine::SetState(uint32_t state) {
 LOG_NO("New state '%u'", state);
   }

+  // this is needed for cold sync, in case this node (currently standby)
+  // becomes active later
+  AVD_AVND *node = avd_node_find_nodeid(node_id_);
+  osafassert(node != nullptr);
+  node->failover_state = state;
+
   switch (state) {
 case NodeState::kStart:
   state_ = std::make_shared(this);
--
2.7.4





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/9] mds: Add README for solution of TIPC buffer overflow at MDS [#1960]

2019-08-15 Thread Minh Hon Chau


Hi Hans,

I will update txprob -> "tx probation"

kEnabled refers only to the state of a tipc portid. There is a
separate flag, @is_fctrl_enabled, which indicates whether the flow
control feature is enabled or disabled in mds.


Thanks

Minh

On 14/8/19 5:48 pm, Hans Nordebäck wrote:

Hi Minh,

ack, some minor comments below/Thanks Hans

On 2019-08-14 08:38, Minh Chau wrote:

---
   src/mds/README | 221 
+
   1 file changed, 221 insertions(+)
   create mode 100644 src/mds/README

diff --git a/src/mds/README b/src/mds/README
new file mode 100644
index 000..1b94632
--- /dev/null
+++ b/src/mds/README
@@ -0,0 +1,221 @@
+/*  -*- OpenSAF  -*-
+ *
+ * (C) Copyright 2019 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+ * under the GNU Lesser General Public License Version 2.1, February 1999.
+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+Background
+==
+If OpenSAF configures TIPC as transport, the MDS library today will use
+TIPC SOCK_RDM socket for message distribution in the cluster. The SOCK_RDM
+datagram socket possibly encounters buffer overflow at receiver ends which
+has been documented in tipc.io[1]. A temporary solution for this buffer
+overflow issue is that the socket buffer size can be increased to a larger
+number. However, if the cluster continues either scaling out or adding more
+components, the system will be under dimensioned, thus the TIPC buffer
+overflow can occur again.
+
+MDS's solution for TIPC buffer overflow
+===
+If MDS disables TIPC_DEST_DROPPABLE, TIPC will return the ancillary message
+when the original message is failed to deliver. By this event, if the message
+has been saved in queue, MDS at sender sides can search and retransmit this
+message to the receivers.
+Once the messages in the sender's queue has been delivered successfully, MDS
+needs to remove them. MDS introduces its internal ACK message as an
+acknowledgment from receivers so that the senders can remove the messages
+out of the queue.
+Also, as such situation of buffer overflow at receivers, the retransmission may
+not succeed or even become worse at receiver ends (the more retransmission,
+the more overflow to occur). MDS imitates the sliding window in TCP[2] to
+control the flow of data message towards the receivers.
+
+Legacy MDS data message, new (data + ACK) MDS message, and upgradability
+
+Below is the MDS legacy message format that has been used till OpenSAF 5.19.07
+
+oct 0  message length
+oct 1
+--
+oct 2  sequence number: incremented for every message sent out to all destined
+...   tipc portid.
+oct 5
+--
+oct 6  fragment number: a message with same sequence number can be fragmented,
+oct 7  identified by this fragment number.
+--
+oct 8  length check: cross check with message length(oct0,1), NOT USED.
+oct 9
+--
+oct 10 protocol version: (MDS_PROT:0xA0 | MDS_VERSION:0x08) = 0xA8, NOT USED
+--
+oct 11 mds length: length of mds header and mds data, starting from oct13
+oct 12
+--
+oct 13 mds header and data
+...
+--
+
+The current sequence number/fragment number are being used in MDS for all
+messages sent to all discovered tipc portid(s), meaning that every message is 
sent
+to any tipc portid, the sequence/fragment number is increased. The flow control
+needs its own sequence number sliding between two tipc porid(s) so that 
receivers
+can detect message drop due to buffer overload. Therefore, the oct8 and oct9 
are
+now reused as flow control sequence number. The oct10, protocol version, has 
new
+value of 0xB8. The format of new data message as below:
+
+oct 0  same
+...
+oct 7
+--
+oct 8  flow control sequence number
+oct 9
+--
+oct 10 protocol version: (MDS_PROT_TIPC_FCTRL:0xB0 | MDS_VERSION:0x08) = 0xB8
+--
+oct 11 same
+...
+--
+
+The ACK message is introduced to acknowledge one data message or a chunk of
+accumulative data message. The ACK message format:
+
+oct 0  message length
+oct 1
+--
+oct 2  8 bytes, NOT USED
+
+oct 9

Re: [devel] [PATCH 0/9] Review Request for mds: Add solution for TIPC buffer overflow [#1960]

2019-08-14 Thread Minh Hon Chau


Hi all,

Please ignore this patch series: patches 2/9 and 9/9, which were committed
under Thuan's name/email, were dropped (for some reason :) ) when sending
for review.


I am sending again.

Thanks

Minh

On 14/8/19 4:01 pm, Minh Chau wrote:

Summary: mds: Add solution of TIPC buffer overflow at MDS [#1960]
Review request for Ticket(s): 1960
Peer Reviewer(s): Anders, HansN, Lennart, Gary, Vu, Thuan
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-1960
Base revision: 2d85d5d9264c6a7d1c6601b900fb810facbee3ac
Personal repository: git://git.code.sf.net/u/minh-chau/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesy
  OpenSAF servicesn
  Core libraries  y
  Samples n
  Tests   n
  Other   n

NOTE: Patch(es) contain lines longer than 80 characters

Comments (indicate scope for each "y" above):
-
Sending on behalf of Thuan & Minh.
Some pending tasks to accomplish

. Handle broadcast/multicast mds message with flow control.
. Reduce the memory re-allocation overhead when flow control is enabled.
(At this moment, memory is allocated in mds_dt_tipc.c and then cloned again
into the buffer for the retransmission queue.)
. The sequence number arithmetic (sna) should be implemented in /base code.
. Add mdstest cases to cover sna wrap-around
. MDS_CHECKSUM_ENABLE_FLAG

revision c49fdeb17fae20b4e0e9af134cc9b60de846c271
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Add TIPC buffer overflow for mdstest [#1960]



revision 6948a2456642600d541b55c9787bb17cfde48a7e
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Apply serial number arithmetic for sequence counter [#1960]

This patch applies the serial number arithmetic for the flow control
sequence number, referenced to RFC1982.

This is only a temporary patch; a proper one could be made in /base
as a template that supports other types, e.g. uint32, so that mds can
reuse it from /base.



revision 87662f659682f813f6746eef0e60d1e52ab03ff1
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Add configurable parameters [#1960]

This patch makes the solution of TIPC buffer overflow configurable,
as well as the ack timeout/ack size.
For example:
The service config file can export the following environment variables

export MDS_TIPC_FCTRL_ENABLED=1
export MDS_TIPC_FCTRL_ACKTIMEOUT=1000
export MDS_TIPC_FCTRL_ACKSIZE=1

If MDS_TIPC_FCTRL_ACKTIMEOUT, MDS_TIPC_FCTRL_ACKSIZE are not specified,
the default values are used.



revision cd4f8af3f53b16aa05d11f30e25da209e7e51e98
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Implement kRcvBuffOverflow state [#1960]

This patch implements the kRcvBuffOverflow state machine as
described in README file.



revision d5c9e8fc8605f453155f4a260ebda0f78ee017b4
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Add state machine for tipc portid instance [#1960]

This patch adds state machine to support tx probation timer.



revision f3f159d0aa3f43c4b28cbd6f0c7c9f041f4b6fd8
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:40:05 +1000

mds: Add timeout for ack message [#1960]

If the ack size is configured greater than 1, there should be a timeout
at receiver ends to send the ack message back to senders.
The ack message timeout utilizes the poll timeout in flow control thread
to make mds lightweight (in contrast to additional timer threads).



revision 6b69713c85dfc46b4d570a61eb2e2c4b71c354f9
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:39:39 +1000

mds: Add implementation for TIPC buffer overflow solution [#1960]

This is a collaborative patch of two participants:
- Tran Thuan 
- Minh Chau 

Main changes:
- Add mds_tipc_fctrl_intf.h, mds_tipc_fctrl_intf.cc: These two files
introduce new functions which are called in mds_dt_tipc.c if the flow
control is enabled
- Add mds_tipc_fctrl_portid.h, mds_tipc_fctrl_portid.cc: These files
implements the tipc portid instance, which supports the sliding window,
mds msg queue
- Add mds_tipc_fctrl_msg.h, mds_tipc_fctrl_msg.cc: These files define
the event and messages which are used for this solution.



revision f71e0ba303ea75b8f845d9f72ab903af93817c87
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:08:30 +1000

mds: Resolve c/c++ linking issue [#1960]

This patch solves the linking issue if mds_dt.h or mds_core.h
is included in c++ sources.



revision 983ad4f94c9b9d458ba5a3f12351cd5b143c78d5
Author: Minh Chau 
Date:   Wed, 14 Aug 2019 15:08:30 +1000

mds: Add README for solution of TIPC buffer overflow at MDS [#1960]



Added Files:

  src/mds/mds_tipc_fctrl_intf.cc
  src/mds/mds_tipc_fctrl_intf.h
  src/mds/mds_tipc_fctrl_msg.cc
  src/mds/mds_tipc_fctrl_msg.h

Re: [devel] [PATCH 1/1] amf: fix no active assignment even one in-service SU can be assigned [#3020]

2019-08-08 Thread Minh Hon Chau


Hi Thuan,

ack with minor comments.

Thanks

Minh

On 18/3/19 7:04 pm, thuan.tran wrote:

AMFD should try assign SI active for other in-service SUs if fail to assign
for current in-service SU
---
  src/amf/amfd/sg_2n_fsm.cc | 75 +--
  1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc
index 91ffc63..ba0f72e 100644
--- a/src/amf/amfd/sg_2n_fsm.cc
+++ b/src/amf/amfd/sg_2n_fsm.cc
@@ -630,6 +630,43 @@ done:
  }
  
  /*

+ * Function: avd_sg_2n_assign_si
+ *
+ * Purpose:  This function choose and assign SIs in the SG that dont have
+ *   active assignment.
+ *
+ * Input: cb - the AVD control block
+ *sg - The pointer to the service group.
+ *su - The pointer to the service unit to be assigned ACTIVE.
+ *
+ * Returns: True if assign succeed, otherwise return false
+ *
+ **/
+static bool avd_sg_2n_assign_si(AVD_CL_CB *cb, AVD_SG *sg, AVD_SU *su) {
[M]: This function only creates active assignments, so the name could be 
avd_sg_2n_assign_act_si (or you can come up with another name) to suggest 
what it is actually doing inside. Please also add TRACE_ENTER()/TRACE_LEAVE().

+  bool l_flag = false;
+  AVD_SU_SI_REL *tmp_susi;
+  /* choose and assign SIs in the SG that dont have active assignment */
+  for (const auto _si : sg->list_of_si) {
+if ((i_si->saAmfSIAdminState == SA_AMF_ADMIN_UNLOCKED) &&
+(i_si->list_of_csi != nullptr) &&
+(i_si->si_dep_state != AVD_SI_SPONSOR_UNASSIGNED) &&
+(i_si->si_dep_state != AVD_SI_UNASSIGNING_DUE_TO_DEP) &&
+(i_si->si_dep_state != AVD_SI_READY_TO_UNASSIGN) &&
+(i_si->list_of_sisu == AVD_SU_SI_REL_NULL) &&
+(su->saAmfSUNumCurrActiveSIs < sg->saAmfSGMaxActiveSIsperSU)) {
+  /* found a SI that needs active assignment. */
+  if (avd_new_assgn_susi(cb, su, i_si, SA_AMF_HA_ACTIVE, false,
+ _susi) == NCSCC_RC_SUCCESS) {
+l_flag = true;
+  } else {
+LOG_ER("%s:%u: %s", __FILE__, __LINE__, i_si->name.c_str());
+  }
+}
+  }
+  return l_flag;
+}
+
+/*
   * Function: avd_sg_2n_su_chose_asgn
   *
   * Purpose:  This function will identify the current active SU.
@@ -675,7 +712,10 @@ static AVD_SU *avd_sg_2n_su_chose_asgn(AVD_CL_CB *cb, 
AVD_SG *sg) {
  for (const auto  : sg->list_of_su) {
if (iter->saAmfSuReadinessState == SA_AMF_READINESS_IN_SERVICE) {
  a_su = iter;
-break;
+l_flag = avd_sg_2n_assign_si(cb, sg, a_su);
+if (l_flag == true) {
+  break;
+}
}
  }
  
@@ -683,36 +723,13 @@ static AVD_SU *avd_sg_2n_su_chose_asgn(AVD_CL_CB *cb, AVD_SG *sg) {

TRACE("No in service SUs available in the SG");
goto done;
  }
-  } else { /* if (a_susi == AVD_SU_SI_REL_NULL) */
-
+  } else { /* if (a_susi != AVD_SU_SI_REL_NULL) */
  a_su = a_susi->su;
-  }
-
-  if (a_su->saAmfSuReadinessState != SA_AMF_READINESS_IN_SERVICE) {
-TRACE("The current active SU is OOS so return");
-goto done;
-  }
-
-  /* check if any more active SIs can be assigned to this SU */
-  l_flag = false;
-
-  /* choose and assign SIs in the SG that dont have active assignment */
-  for (const auto _si : sg->list_of_si) {
-if ((i_si->saAmfSIAdminState == SA_AMF_ADMIN_UNLOCKED) &&
-(i_si->list_of_csi != nullptr) &&
-(i_si->si_dep_state != AVD_SI_SPONSOR_UNASSIGNED) &&
-(i_si->si_dep_state != AVD_SI_UNASSIGNING_DUE_TO_DEP) &&
-(i_si->si_dep_state != AVD_SI_READY_TO_UNASSIGN) &&
-(i_si->list_of_sisu == AVD_SU_SI_REL_NULL) &&
-(a_su->saAmfSUNumCurrActiveSIs < sg->saAmfSGMaxActiveSIsperSU)) {
-  /* found a SI that needs active assignment. */
-  if (avd_new_assgn_susi(cb, a_su, i_si, SA_AMF_HA_ACTIVE, false,
- _susi) == NCSCC_RC_SUCCESS) {
-l_flag = true;
-  } else {
-LOG_ER("%s:%u: %s", __FILE__, __LINE__, i_si->name.c_str());
-  }
+if (a_su->saAmfSuReadinessState != SA_AMF_READINESS_IN_SERVICE) {
+  TRACE("The current active SU is OOS so return");
+  goto done;
  }
+l_flag = avd_sg_2n_assign_si(cb, sg, a_su);
}
  
/* if any assignments have been done return the SU */



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfd: include failover info in coldsync [#3060]

2019-07-21 Thread Minh Hon Chau


Hi,

ack (code review only)

Thanks

Minh

On 19/7/19 4:47 pm, Gary Lee wrote:

Failover information is not currently included in coldsync. This means
if a delayed failover is in progress *before* a standby controller is
available, *and* a controller failover occurs, then information about
the delayed failover is lost.
---
  src/amf/amfd/chkop.cc  |  4 ++
  src/amf/amfd/ckpt.h|  4 +-
  src/amf/amfd/ckpt_dec.cc   | 77 --
  src/amf/amfd/ckpt_edu.cc   |  2 +
  src/amf/amfd/ckpt_enc.cc   |  5 ++-
  src/amf/amfd/node.h|  3 ++
  src/amf/amfd/node_state_machine.cc |  2 +
  src/amf/amfd/util.cc   |  1 +
  8 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/src/amf/amfd/chkop.cc b/src/amf/amfd/chkop.cc
index e9a68f4..56b0142 100644
--- a/src/amf/amfd/chkop.cc
+++ b/src/amf/amfd/chkop.cc
@@ -1051,6 +1051,10 @@ uint32_t avsv_send_ckpt_data(AVD_CL_CB *cb, uint32_t 
action,
  avd_cb->avd_peer_ver);
  return NCSCC_RC_SUCCESS;
}
+  if (avd_cb->avd_peer_ver >= AVD_MBCSV_SUB_PART_VERSION_10) {
+cb->async_updt_cnt.failover_updt++;
+  }
+
break;
  default:
return NCSCC_RC_SUCCESS;
diff --git a/src/amf/amfd/ckpt.h b/src/amf/amfd/ckpt.h
index 875776a..2e15387 100644
--- a/src/amf/amfd/ckpt.h
+++ b/src/amf/amfd/ckpt.h
@@ -35,9 +35,10 @@
  #define AMF_AMFD_CKPT_H_
  
  // current version

-#define AVD_MBCSV_SUB_PART_VERSION 9
+#define AVD_MBCSV_SUB_PART_VERSION 10
  
  // supported versions

+#define AVD_MBCSV_SUB_PART_VERSION_10 10
  #define AVD_MBCSV_SUB_PART_VERSION_9 9
  #define AVD_MBCSV_SUB_PART_VERSION_8 8
  #define AVD_MBCSV_SUB_PART_VERSION_7 7
@@ -109,6 +110,7 @@ typedef struct avsv_async_updt_cnt {
uint32_t compcstype_updt;
uint32_t si_trans_updt;
uint32_t ng_updt;
+  uint32_t failover_updt;
  } AVSV_ASYNC_UPDT_CNT;
  
  /*

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index a46f6d3..6288b4f 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -178,6 +178,31 @@ const AVSV_DECODE_COLD_SYNC_RSP_DATA_FUNC_PTR 
dec_cs_data_func_list[] = {
  dec_cs_comp_config, dec_cs_comp_cs_type_config, dec_cs_siass,
  dec_cs_si_trans,dec_cs_async_updt_cnt};
  
+void set_node_failover_state(AVD_CL_CB *cb, const SaClmNodeIdT node_id,

+const uint32_t state) {
+  TRACE_ENTER();
+
+  if (state == NodeState::NodeStates::kUndefined) {
+// not in failover list
+return;
+  }
+
+  auto failed_node = cb->failover_list.find(node_id);
+  if (failed_node != cb->failover_list.end()) {
+failed_node->second->SetState(state);
+  } else {
+LOG_NO("Node '%u' not found in failover_list. Create new entry",
+node_id);
+auto new_node = std::make_shared(cb, node_id);
+// node must be added to failover_list before SetState() is called.
+// If the state is 'end', then it will be deleted by SetState().
+// Otherwise, we will leave a node in 'End' state mistakenly in
+// failover_list.
+cb->failover_list[node_id] = new_node;
+new_node->SetState(state);
+  }
+}
+
  void decode_cb(NCS_UBAID *ub, AVD_CL_CB *cb, const uint16_t peer_version) {
osaf_decode_uint32(ub, reinterpret_cast(>init_state));
osaf_decode_satimet(ub, >cluster_init_time);
@@ -254,6 +279,9 @@ void decode_node_config(NCS_UBAID *ub, AVD_AVND *avnd,
osaf_decode_uint32(ub, >rcv_msg_id);
osaf_decode_uint32(ub, >snd_msg_id);
osaf_extended_name_free(_name);
+  if (peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
+osaf_decode_uint32(ub, >failover_state);
+  }
TRACE_LEAVE();
  }
  
@@ -585,7 +613,7 @@ void decode_siass(NCS_UBAID *ub, AVSV_SU_SI_REL_CKPT_MSG *su_si_ckpt,

  su_si_ckpt->csi_add_rem = static_cast(csi_add_rem);
  osaf_decode_sanamet(ub, _si_ckpt->comp_name);
  osaf_decode_sanamet(ub, _si_ckpt->csi_name);
-  };
+  }
  }
  
  /\

@@ -2199,6 +2227,7 @@ static uint32_t dec_cs_node_config(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec,
for (count = 0; count < num_of_obj; count++) {
  decode_node_config(>i_uba, , dec->i_peer_version);
  status = avd_ckpt_node(cb, , dec->i_action);
+set_node_failover_state(cb, avnd.node_info.nodeId, avnd.failover_state);
  osafassert(status == NCSCC_RC_SUCCESS);
}
  
@@ -2552,14 +2581,23 @@ static uint32_t dec_cs_async_updt_cnt(AVD_CL_CB *cb, NCS_MBCSV_CB_DEC *dec,

/*
 * Decode and send async update counts for all the data structures.
 */
-  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_7) {
+  if (dec->i_peer_version >= AVD_MBCSV_SUB_PART_VERSION_10) {
  TRACE(
-"Peer AMFD version is >= AVD_MBCSV_SUB_PART_VERSION_7,"
+"Peer AMFD version is >= AVD_MBCSV_SUB_PART_VERSION_10,"
  "peer ver:%d",
  avd_cb->avd_peer_ver);
  status = m_NCS_EDU_VER_EXEC(>edu_hdl,

Re: [devel] [PATCH 1/1] amfd: ignore amfnd down event if node state is absent [#3015]

2019-06-23 Thread Minh Hon Chau


Hi Thang,

Ack from me.

Thanks

Minh

On 11/6/19 4:11 pm, Minh Hon Chau wrote:

Hi Thang,

I can see it's a race in the main thread in how amfd processes the mds 
down event and the clm callback.


Node is going down

<143>1 2019-06-11T15:16:42.157517+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38507"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt
<143>1 2019-06-11T15:16:42.157526+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38508"] 275:amf/amfd/mds.cc:459 TR avnd 
2030f00bd down
<143>1 2019-06-11T15:16:42.157535+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38509"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt


<143>1 2019-06-11T15:16:48.332481+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38623"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' 
'4' '1'
<143>1 2019-06-11T15:16:48.33249+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38624"] 272:amf/amfd/clm.cc:242 TR 
numberOfMembers:'4', numberOfItems:'1'
<143>1 2019-06-11T15:16:48.3325+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38625"] 272:amf/amfd/clm.cc:248 TR i = 0, 
node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:3
<143>1 2019-06-11T15:16:48.33251+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38626"] 272:amf/amfd/clm.cc:332 TR Node Left: 
rootCauseEntity safNode=PL-3,safCluster=myClmCluster for node 131855
<143>1 2019-06-11T15:16:48.332519+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38627"] 272:amf/amfd/clm.cc:188 >> 
clm_node_exit_complete: 2030f
<143>1 2019-06-11T15:16:48.332534+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38628"] 272:amf/amfd/ndproc.cc:1267 >> 
avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster'
<143>1 2019-06-11T15:16:48.332545+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="38629"] 272:amf/amfd/ndfsm.cc:1153 >> 
avd_node_mark_absent


Node is coming up again

<143>1 2019-06-11T15:16:48.34867+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39826"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' 
'4' '1'
<143>1 2019-06-11T15:16:48.348674+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39827"] 272:amf/amfd/clm.cc:242 TR 
numberOfMembers:'5', numberOfItems:'1'
<143>1 2019-06-11T15:16:48.348678+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39828"] 272:amf/amfd/clm.cc:248 TR i = 0, 
node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:2
<143>1 2019-06-11T15:16:48.348685+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39829"] 272:amf/amfd/node.cc:53 TR added node 131855
<143>1 2019-06-11T15:16:48.348689+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39830"] 272:amf/amfd/clm.cc:417 TR Node Joined 
'safNode=PL-3,safCluster=myClmCluster' '36'


Now amfd processes the mds down in the main thread; it's a race here, and 
then @node_info.member is set to FALSE


<143>1 2019-06-11T15:16:48.351948+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39971"] 272:amf/amfd/ndfsm.cc:779 >> 
avd_mds_avnd_down_evh: 2030f, 0x558e549a1650
<143>1 2019-06-11T15:16:48.351954+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39972"] 272:amf/amfd/ndproc.cc:1267 >> 
avd_node_failover: 'safAmfNode=PL-3,safAmfCluster=myAmfCluster'
<143>1 2019-06-11T15:16:48.351959+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="39973"] 272:amf/amfd/ndfsm.cc:1153 >> 
avd_node_mark_absent


Now the mds up comes, then node_up comes, but the node is not a clm member

<143>1 2019-06-11T15:16:48.701771+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40552"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt
<143>1 2019-06-11T15:16:48.701791+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40553"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt


<143>1 2019-06-11T15:16:48.706254+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40560"] 272:amf/amfd/ndfsm.cc:743 >> 
avd_mds_avnd_up_evh
<143>1 2019-06-11T15:16:48.706271+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40561"] 275:amf/amfd/ndmsg.cc:389 << avd_n2d_msg_rcv
<143>1 2019-06-11T15:16:48.706288+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40562"] 272:amf/amfd/ndfsm.cc:757 TR amfnd on 2030f 
is up
<143>1 2019-06-11T15:16:48.706298+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40563"] 272:amf/amfd/ndfsm.cc:0 << avd_mds_avnd_up_evh


<143>1 2019-06-11T15:16:48.707145+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40596"] 272:amf/amfd/ndfsm.cc:275 >> 
avd_node_up_evh: from 2030f, safAmfNode=PL-3,safAmfCluster=myAmfCluster
<143>1 2019-06-11T15:16:48.707153+10:00 SC-1 osafamfd 272 osafamfd 
[meta sequenceId="40597"] 272:amf/am

Re: [devel] [PATCH 1/1] amf: remove SUSI assignemnt of dependent SI during failover [#3049]

2019-06-20 Thread Minh Hon Chau


Hi Thang,

ack (review + test). In the syslog below, the assignments of the sponsor and 
dependent SIs on the locked SC are removed, and the other SC creates new 
active assignments.


Minor comment: In sg_2n_fsm:node_fail_su_oper(), starting from line 
3153, the code is now most likely the same for both standby and active :)


3152:  } else {
3154:    /* the SU is not the same as the SU in the list */
3153:    if (avd_su_state_determine(su) == SA_AMF_HA_STANDBY) {

*// same as the below active*

    } /* if(avd_su_state_determine(su) == SA_AMF_HA_STANDBY) */
    else if (avd_su_state_determine(su) == SA_AMF_HA_ACTIVE) {

    }

Thanks

Minh

--syslog--

2019-06-20 19:01:44.975 SC-1 osafamfnd[331]: NO Assigning 
'safSi=ma_si,safApp=ma_app' QUIESCED to 
'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app'
2019-06-20 19:01:44.976 SC-1 amf_demo[533]: CSI Set - HAState Quiesced 
for all assigned CSIs
2019-06-20 19:01:44.977 SC-1 osafamfnd[331]: NO Assigning 
'safSi=ma_si_new,safApp=ma_app_new' QUIESCED to 
'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new'
2019-06-20 19:01:44.977 SC-1 amf_demo_ori[599]: CSI Set - HAState 
Quiesced for all assigned CSIs
2019-06-20 19:01:44.977 SC-1 osafamfnd[331]: NO Assigned 
'safSi=ma_si_new,safApp=ma_app_new' QUIESCED to 
'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new'
2019-06-20 19:01:51.978 SC-1 osafamfnd[331]: NO Assigned 
'safSi=ma_si,safApp=ma_app' QUIESCED to 
'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app'

2019-06-20 19:01:52.895 SC-1 osafdtmd[169]: NO Lost contact with 'SC-2'

2019-06-20 19:01:52.903 SC-1 osafclmd[306]: NO Node 131599 went down. 
Not sending track callback for agents on that node

2019-06-20 19:01:52.903 SC-1 osafamfd[316]: NO Node 'SC-2' left the cluster

2019-06-20 19:01:52.937 SC-1 osaffmd[201]: NO Current role: ACTIVE
2019-06-20 19:01:52.938 SC-1 osaffmd[201]: Rebooting OpenSAF NodeId = 
131599 EE Name = , Reason: Received Node Down for peer controller, 
OwnNodeId = 131343, SupervisionTime = 60
2019-06-20 19:01:52.957 SC-1 opensaf_reboot: Rebooting remote node in 
the absence of PLM is outside the scope of OpenSAF
2019-06-20 19:01:52.958 SC-1 osafamfnd[331]: NO Removing 
'safSi=ma_si,safApp=ma_app' from 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app'

2019-06-20 19:01:52.958 SC-1 amf_demo[533]: CSI Remove for all CSIs
2019-06-20 19:01:52.959 SC-1 amf_demo[533]: state: 3, mode: 1, code: 1
2019-06-20 19:01:52.959 SC-1 osafamfnd[331]: NO Removing 
'safSi=ma_si_new,safApp=ma_app_new' from 
'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new'

2019-06-20 19:01:52.959 SC-1 amf_demo_ori[599]: CSI Remove for all CSIs
2019-06-20 19:01:52.959 SC-1 osafamfnd[331]: NO Removed 
'safSi=ma_si,safApp=ma_app' from 'safSu=ma_su_1,safSg=ma_sg,safApp=ma_app'
2019-06-20 19:01:52.960 SC-1 osafamfnd[331]: NO Removed 
'safSi=ma_si_new,safApp=ma_app_new' from 
'safSu=ma_su_3_new,safSg=ma_sg_new,safApp=ma_app_new'
2019-06-20 19:01:52.962 SC-1 osafamfd[316]: NO Assigning due to dep 
'safSi=ma_si,safApp=ma_app'
2019-06-20 19:01:52.964 SC-1 osafamfd[316]: NO Tolerance timer started, 
sponsor si:'safSi=ma_si,safApp=ma_app', dependent 
si:safSi=ma_si_new,safApp=ma_app_new
2019-06-20 19:01:54.352 SC-1 osafdtmd[169]: NO Established contact with 
'SC-2'


2019-06-20 19:01:56.368 SC-2 osafamfnd[257]: NO Assigning 
'safSi=ma_si,safApp=ma_app' ACTIVE to 
'safSu=ma_su_2,safSg=ma_sg,safApp=ma_app'
2019-06-20 19:01:56.369 SC-2 amf_demo[444]: CSI Set - add 
'safCsi=ma_csi,safSi=ma_si,safApp=ma_app' HAState Active
2019-06-20 19:02:04.372 SC-2 osafamfnd[257]: NO Assigned 
'safSi=ma_si,safApp=ma_app' ACTIVE to 
'safSu=ma_su_2,safSg=ma_sg,safApp=ma_app'
2019-06-20 19:02:04.390 SC-2 osafamfnd[257]: NO Assigning 
'safSi=ma_si_new,safApp=ma_app_new' ACTIVE to 
'safSu=ma_su_4_new,safSg=ma_sg_new,safApp=ma_app_new'
2019-06-20 19:02:04.391 SC-2 amf_demo_ori[440]: CSI Set - add 
'safCsi=ma_csi_new,safSi=ma_si_new,safApp=ma_app_new' HAState Active
2019-06-20 19:02:04.391 SC-2 osafamfnd[257]: NO Assigned 
'safSi=ma_si_new,safApp=ma_app_new' ACTIVE to 
'safSu=ma_su_4_new,safSg=ma_sg_new,safApp=ma_app_new'


On 12/6/19 12:01 pm, thang.d.nguyen wrote:

When a node lock is invoked on a node with active assignments, the dependent
SI follows the sponsor SI and moves to QUIESCED. There is a case where
the active assignment for the sponsor SI is in progress on the remaining SC
node, and that remaining node goes down. In that case the SUSI removal only
happens for the sponsor SI.
The fix is to also remove the SUSI of the dependent SI.
---
  src/amf/amfd/sg_2n_fsm.cc | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc
index 91ffc63..776696c 100644
--- a/src/amf/amfd/sg_2n_fsm.cc
+++ b/src/amf/amfd/sg_2n_fsm.cc
@@ -3175,6 +3175,9 @@ void SG_2N::node_fail_su_oper(AVD_SU *su) {
  }
  
  su->sg_of_su->set_fsm_state(AVD_SG_FSM_SG_REALIGN);

+  } else {
+avd_sg_su_si_del_snd(cb, su_oper_list.front());
+su->sg_of_su->set_fsm_state(AVD_SG_FSM_SG_REALIGN);
}

Re: [devel] [PATCH 1/1] amf: fix SU get stuck in INSTANTIATING presence state [#3047]

2019-06-13 Thread Minh Hon Chau


Hi Thuan,

ack with minor comment.

Thanks

Minh

On 3/6/19 5:10 pm, thuan.tran wrote:

COMP restart recovery during SU restart recovery can lead to the SU being
stuck in INSTANTIATING without further action, because the COMP instantiated
event in RESTARTING does not trigger avnd_su_pres_fsm_run().
---
  src/amf/amfnd/clc.cc  | 4 
  src/amf/amfnd/susm.cc | 4 +++-
  2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index 675ca49..9b1b3a7 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -926,6 +926,7 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP 
*comp,
AVND_SU_PRES_FSM_EV ev = AVND_SU_PRES_FSM_EV_MAX;
AVND_COMP_CSI_REC *csi = 0;
bool is_en;
+  bool pi_comp_recover = false;
uint32_t rc = NCSCC_RC_SUCCESS;
TRACE_ENTER2("Comp '%s', Prv_state '%s', Final_state '%s'",
 comp->name.c_str(), presence_state[prv_st],
@@ -953,6 +954,8 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP 
*comp,
  TRACE_1(
  "Component restart is through admin opration, admin oper flag reset");
  comp->admin_oper = false;
+  } else if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(comp)) {
+pi_comp_recover = true;


[M]: This looks doubtful: the check itself only wants to know whether @comp 
is pi; isn't it unrelated to the first *if* (@admin_oper and @final_st)?



}
  
if ((SA_AMF_PRESENCE_INSTANTIATED == prv_st) &&

@@ -1487,6 +1490,7 @@ uint32_t avnd_comp_clc_st_chng_prc(AVND_CB *cb, AVND_COMP 
*comp,
   (SA_AMF_PRESENCE_ORPHANED != prv_st) &&
   ((prv_st == SA_AMF_PRESENCE_INSTANTIATING) ||
(prv_st == SA_AMF_PRESENCE_TERMINATING) ||
+  (prv_st == SA_AMF_PRESENCE_RESTARTING && pi_comp_recover) ||
(comp->su->admin_op_Id == SA_AMF_ADMIN_RESTART)))
ev = AVND_SU_PRES_FSM_EV_COMP_INSTANTIATED;
  else if (SA_AMF_PRESENCE_INSTANTIATION_FAILED == final_st)
diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
index c023c8d..62e2db9 100644
--- a/src/amf/amfnd/susm.cc
+++ b/src/amf/amfnd/susm.cc
@@ -2282,7 +2282,9 @@ uint32_t avnd_su_pres_insting_compinst_hdler(AVND_CB *cb, 
AVND_SU *su,
 curr_comp; curr_comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(
m_NCS_DBLIST_FIND_NEXT(_comp->su_dll_node))) {
  /* instantiate the pi comp */
-if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(curr_comp)) {
+if (m_AVND_COMP_TYPE_IS_PREINSTANTIABLE(curr_comp) &&
+   (!m_AVND_COMP_IS_FAILED(curr_comp) ||
+curr_comp->pres != SA_AMF_PRESENCE_RESTARTING)) {
TRACE("Running the component clc FSM");
rc = avnd_comp_clc_fsm_run(cb, curr_comp,
   AVND_COMP_CLC_PRES_FSM_EV_INST);



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfd: ignore amfnd down event if node state is absent [#3015]

2019-06-11 Thread Minh Hon Chau


Hi Thang,

I can see it's a race in the main thread in how amfd processes the mds 
down event and the clm callback.


Node is going down

<143>1 2019-06-11T15:16:42.157517+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38507"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt
<143>1 2019-06-11T15:16:42.157526+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38508"] 275:amf/amfd/mds.cc:459 TR avnd 2030f00bd down
<143>1 2019-06-11T15:16:42.157535+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38509"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt


<143>1 2019-06-11T15:16:48.332481+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38623"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1'
<143>1 2019-06-11T15:16:48.33249+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38624"] 272:amf/amfd/clm.cc:242 TR numberOfMembers:'4', 
numberOfItems:'1'
<143>1 2019-06-11T15:16:48.3325+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38625"] 272:amf/amfd/clm.cc:248 TR i = 0, 
node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:3
<143>1 2019-06-11T15:16:48.33251+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38626"] 272:amf/amfd/clm.cc:332 TR Node Left: 
rootCauseEntity safNode=PL-3,safCluster=myClmCluster for node 131855
<143>1 2019-06-11T15:16:48.332519+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38627"] 272:amf/amfd/clm.cc:188 >> clm_node_exit_complete: 2030f
<143>1 2019-06-11T15:16:48.332534+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38628"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 
'safAmfNode=PL-3,safAmfCluster=myAmfCluster'
<143>1 2019-06-11T15:16:48.332545+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="38629"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent


Node is coming up again

<143>1 2019-06-11T15:16:48.34867+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39826"] 272:amf/amfd/clm.cc:226 >> clm_track_cb: '0' '4' '1'
<143>1 2019-06-11T15:16:48.348674+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39827"] 272:amf/amfd/clm.cc:242 TR numberOfMembers:'5', 
numberOfItems:'1'
<143>1 2019-06-11T15:16:48.348678+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39828"] 272:amf/amfd/clm.cc:248 TR i = 0, 
node:'safNode=PL-3,safCluster=myClmCluster', clusterChange:2
<143>1 2019-06-11T15:16:48.348685+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39829"] 272:amf/amfd/node.cc:53 TR added node 131855
<143>1 2019-06-11T15:16:48.348689+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39830"] 272:amf/amfd/clm.cc:417 TR Node Joined 
'safNode=PL-3,safCluster=myClmCluster' '36'


Now amfd processes the mds down in the main thread; it's a race here, and 
then @node_info.member is set to FALSE


<143>1 2019-06-11T15:16:48.351948+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39971"] 272:amf/amfd/ndfsm.cc:779 >> avd_mds_avnd_down_evh: 
2030f, 0x558e549a1650
<143>1 2019-06-11T15:16:48.351954+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39972"] 272:amf/amfd/ndproc.cc:1267 >> avd_node_failover: 
'safAmfNode=PL-3,safAmfCluster=myAmfCluster'
<143>1 2019-06-11T15:16:48.351959+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="39973"] 272:amf/amfd/ndfsm.cc:1153 >> avd_node_mark_absent


Now the mds up comes, then node_up comes, but the node is not a clm member

<143>1 2019-06-11T15:16:48.701771+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40552"] 275:amf/amfd/mds.cc:398 >> avd_mds_svc_evt
<143>1 2019-06-11T15:16:48.701791+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40553"] 275:amf/amfd/mds.cc:0 << avd_mds_svc_evt


<143>1 2019-06-11T15:16:48.706254+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40560"] 272:amf/amfd/ndfsm.cc:743 >> avd_mds_avnd_up_evh
<143>1 2019-06-11T15:16:48.706271+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40561"] 275:amf/amfd/ndmsg.cc:389 << avd_n2d_msg_rcv
<143>1 2019-06-11T15:16:48.706288+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40562"] 272:amf/amfd/ndfsm.cc:757 TR amfnd on 2030f is up
<143>1 2019-06-11T15:16:48.706298+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40563"] 272:amf/amfd/ndfsm.cc:0 << avd_mds_avnd_up_evh


<143>1 2019-06-11T15:16:48.707145+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40596"] 272:amf/amfd/ndfsm.cc:275 >> avd_node_up_evh: from 
2030f, safAmfNode=PL-3,safAmfCluster=myAmfCluster
<143>1 2019-06-11T15:16:48.707153+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40597"] 272:amf/amfd/ndfsm.cc:292 TR leds_set 0
<143>1 2019-06-11T15:16:48.70716+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40598"] 272:amf/amfd/ndfsm.cc:308 TR node_id '2030f' not in 
failover_list.
<141>1 2019-06-11T15:16:48.707185+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40599"] 272:amf/amfd/ndfsm.cc:232 NO Received node_up from 
2030f: msg_id 1
<140>1 2019-06-11T15:16:48.7072+10:00 SC-1 osafamfd 272 osafamfd [meta 
sequenceId="40600"] 272:amf/amfd/ndfsm.cc:387 WA Not a Cluster Member 
dropping the msg
<143>1 2019-06-11T15:16:48.707206+10:00 SC-1 osafamfd 272 osafamfd [meta

Re: [devel] [PATCH 1/1] amfd: do not queue sync messages from 'lost' nodes [#3050]

2019-06-10 Thread Minh Hon Chau


Hi Gary,

Those variables, e.g. node_sync_window_closed, are used before the 
headless sync completes. If there is a failover during the headless sync, 
the new active will start the headless sync again, so those variables 
have not needed to be checkpointed. But here the scenario happens in a 
split brain, in which the new active is in a separate network instead of 
coming from headless, so I guess we do need to checkpoint it, but the 
checkpoint should be done after the headless sync?


And the change in timer.h does not seem to be related to this ticket?

Thanks

Minh

On 5/6/19 2:03 pm, Gary Lee wrote:

The 'lost' nodes will be rebooted, thus there is no need
to queue sync messages from these nodes.

In addition, node_sync_window_closed is not reliable as it's not
checkpointed. We should remove all uses of it in another ticket?

Instead, check if the timer is running.
---
  src/amf/amfd/cb.h  |  2 ++
  src/amf/amfd/ndproc.cc | 30 ++
  src/amf/amfd/timer.h   | 12 ++--
  3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/amf/amfd/cb.h b/src/amf/amfd/cb.h
index 89cf15d..8902d78 100644
--- a/src/amf/amfd/cb.h
+++ b/src/amf/amfd/cb.h
@@ -237,6 +237,8 @@ typedef struct cl_cb_tag {
 */
bool active_services_exist;
bool all_nodes_synced;
+  // @todo this should be checkpointed to standby? otherwise
+  // after a controller failover, it will still be false?
bool node_sync_window_closed;
  
/*

diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index 5f5cbcd..20008d9 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -345,19 +345,26 @@ void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, AVD_EVT 
*evt) {
evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id,
evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.msg_id);
  
-  if (cb->node_sync_window_closed == false) {

+  const SaClmNodeIdT node_id =
+evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id;
+
+  if (cb->failover_list.find(node_id) != cb->failover_list.end()) {
+// ignore msg
+LOG_WA("sisu_state_info messages received from lost node (%x)",
+   node_id);
+  } else if (cb->node_sync_tmr.is_active == true) {
  AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE();
  state_info_evt->evt = new AVD_EVT{};
  state_info_evt->evt->rcv_evt = evt->rcv_evt;
  state_info_evt->evt->info.avnd_msg = n2d_msg;
  cb->evt_queue.push(state_info_evt);
+return;
} else {
  LOG_WA(
-"Ignore this sisu_state_info message since node sync window has 
closed");
-avsv_dnd_msg_free(n2d_msg);
+  "Ignore this sisu_state_info message since node sync window has closed");
}
  
-  TRACE_LEAVE();

+  avsv_dnd_msg_free(n2d_msg);
  }
  
  /*

@@ -387,19 +394,26 @@ void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, AVD_EVT 
*evt) {
evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id,
evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.msg_id);
  
-  if (cb->node_sync_window_closed == false) {

+  const SaClmNodeIdT node_id =
+evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id;
+
+  if (cb->failover_list.find(node_id) != cb->failover_list.end()) {
+// ignore msg
+LOG_WA("compcsi_state_info messages received from lost node (%x)",
+   node_id);
+  } else if (cb->node_sync_tmr.is_active == true) {
  AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE();
  state_info_evt->evt = new AVD_EVT{};
  state_info_evt->evt->rcv_evt = evt->rcv_evt;
  state_info_evt->evt->info.avnd_msg = n2d_msg;
  cb->evt_queue.push(state_info_evt);
+return;
} else {
  LOG_WA(
-"Ignore this compcsi_state_info message since node sync window has 
closed");
-avsv_dnd_msg_free(n2d_msg);
+  "Ignore this compcsi_state_info message since node sync window has 
closed");
}
  
-  TRACE_LEAVE();

+  avsv_dnd_msg_free(n2d_msg);
  }
  
  /**

diff --git a/src/amf/amfd/timer.h b/src/amf/amfd/timer.h
index 5316879..6db04c7 100644
--- a/src/amf/amfd/timer.h
+++ b/src/amf/amfd/timer.h
@@ -52,12 +52,12 @@ typedef enum avd_tmr_type {
  
  /* AVD Timer definition */

  typedef struct avd_tmr_tag {
-  tmr_t tmr_id;
-  AVD_TMR_TYPE type;
-  SaClmNodeIdT node_id;
-  std::string spons_si_name;
-  std::string dep_si_name;
-  bool is_active;
+  tmr_t tmr_id{};
+  AVD_TMR_TYPE type{AVD_TMR_MAX};
+  SaClmNodeIdT node_id{};
+  std::string spons_si_name{};
+  std::string dep_si_name{};
+  bool is_active{};
  } AVD_TMR;
  
  /* macro to start the cluster init timer. The cb structure



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 0/1] Review Request for mds: support multicast fragmented messages [#3033] V3

2019-04-26 Thread Minh Hon Chau


Hi,

ack from me (code review)

Thanks

Minh

On 25/4/19 9:33 pm, Vu Minh Nguyen wrote:

Hi Hans,

You were probably looking at code that already included Thuan's patch.

In legacy code, only mdtm_sendto() is called inside the function 
mdtm_frag_and_send().

Regards, Vu


-Original Message-
From: Hans Nordebäck 
Sent: Thursday, April 25, 2019 6:10 PM
To: Vu Minh Nguyen ; Thuan Tran
; Minh Hon Chau

Cc: opensaf-devel@lists.sourceforge.net
Subject: RE: [PATCH 0/1] Review Request for mds: support multicast
fragmented messages [#3033] V3


Hi Vu,
It seems mdtm_mcast_sendto is used in mdtm_frag_and_send, at
MDS_SENDTYPE_BCAST/BR Hans
-Original Message-
From: Vu Minh Nguyen 
Sent: den 25 april 2019 12:20
To: Hans Nordebäck ; Thuan Tran
; Minh Hon Chau

Cc: opensaf-devel@lists.sourceforge.net
Subject: RE: [PATCH 0/1] Review Request for mds: support multicast
fragmented messages [#3033] V3

Hi Hans,

See my responses inline.

Regards, Vu


-Original Message-
From: Hans Nordebäck 
Sent: Thursday, April 25, 2019 4:28 PM
To: Thuan Tran ; Vu Minh Nguyen
; Minh Hon Chau



Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 0/1] Review Request for mds: support multicast
fragmented messages [#3033] V3

Hi Vu and Thuan,

a few questions: is the text in the ticket description correct? E.g. it
says unicast is used if a multicast message is fragmented, (I think
multicast still is used

to send the fragments), this is what you mean with 2 different channels?
(only one socket is used, BSRsock),

[Vu] Yes. Unicast is used to send fragmented messages. Here is the current
logic in case of sending a large package:
Iterate over destinations { // mcm_pvt_process_svc_bcast_common() @
mds_c_sndrcv.c
1) Fragment the package // mdtm_frag_and_send() @ mds_dt_tipc.c
2) Unicast to a specific adest  // mdtm_sendto() @
mds_dt_tipc.c
4) Continue with next adest
}


The problem stated is sending one large multicast message and then
several smaller multicast messages, have you checked the

fragment re-assembly part of the common code?

[Vu] Yes. At the receive side, if msg is fragmented, mds will not forward to
upper layer until all fragmented msgs are collected.
If the message is not fragmented, mds will transfer the msg to the upper layer
right away.

I checked with TIPC guys here, and he said that TIPC does not guarantee the
order if we send msgs in different channels (unicast vs mcast).


/BR Hans


On 2019-04-24 13:06, thuan.tran wrote:

Summary: mds: support multicast fragmented messages [#3033] Review
request for Ticket(s): 3033 Peer Reviewer(s): Hans, Minh, Vu Pull
request to: *** LIST THE PERSON WITH PUSH ACCESS HERE *** Affected
branch(es): develop Development branch: ticket-3033 Base revision:
7916ac316e86478c621c8359cf2aca4886288a38
Personal repository: git://git.code.sf.net/u/thuantr/review


Impacted area   Impact y/n

   Docsn
   Build systemn
   RPM/packaging   n
   Configuration files n
   Startup scripts n
   SAF servicesy
   OpenSAF servicesn
   Core libraries  n
   Samples n
   Tests   n
   Other   n

NOTE: Patch(es) contain lines longer than 80 characters

Comments (indicate scope for each "y" above):
-
N/A

revision 568f09774f936506f5e05e03813fa572af0fe0d3
Author: thuan.tran 
Date:   Wed, 24 Apr 2019 17:54:25 +0700

mds: support multicast fragmented messages [#3033]

- Sender may send broadcast big messages (> 65K) then small messages (< 65K).
Current MDS just loops over all destinations to unicast all fragmented messages
to the destinations one by one, but sends non-fragmented messages as multicast
to all destinations. Therefore, receivers may get messages in incorrect
order: non-fragmented messages may arrive before fragmented messages.
For example, it may lead to OUT OF ORDER for IMMNDs during IMMD sync.

- Solution: support send multicast each fragmented messages to avoid
disorder of arrived broadcast messages.



Complete diffstat:
--
   src/mds/mds_c_sndrcv.c |   3 +-
   src/mds/mds_dt_tipc.c  | 104
+++-

-

   2 files changed, 40 insertions(+), 67 deletions(-)


Testing Commands:
-
N/A

Testing, Expected Results:
--
N/A

Conditions of Submission:
-
N/A

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any
checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally in

Re: [devel] [PATCH 0/1] Review Request for amfd: increase mds priority of amfnd down event [#3015]

2019-03-01 Thread Minh Hon Chau


Hi Thang,

+ Hans

If the issue is reproducible, can you upload the full log/trace to 
ticket please?


Thanks

Minh

On 27/2/19 10:17 am, thang.d.nguyen wrote:

Summary: amfd: increase mds priority of amfnd down event [#3015]
Review request for Ticket(s): 3015
Peer Reviewer(s): Gary, Minh
Pull request to: Minh
Affected branch(es): develop
Development branch: ticket-3015
Base revision: 1f9cf4636b07d28a906f62b44144c337c5280f1a
Personal repository: git://git.code.sf.net/u/thangng/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesy
  OpenSAF servicesn
  Core libraries  n
  Samples n
  Tests   n
  Other   n


Comments (indicate scope for each "y" above):
-

revision e81b6874f37e9761594f7ee3328486062fcbddb3
Author: thang.d.nguyen 
Date:   Wed, 27 Feb 2019 05:50:11 +0700

amfd: increase mds priority of amfnd down event [#3015]

To avoid the issue where a node cannot join the cluster
when the PBE hangs.



Complete diffstat:
--
  src/amf/amfd/mds.cc | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Testing Commands:
-
N/A.

Testing, Expected Results:
--
N/A.

Conditions of Submission:
-
Acked from reviewer.

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as threaded
 commits, or place in a public tree for a pull.

___ You have resent this content multiple times without a clear indication
 of what has changed between each re-send.

___ You have failed to adequately and individually address all of the
 comments and change requests that were proposed in the initial review.

___ You have a misconfigured ~/.gitconfig file (i.e. user.name, user.email etc)

___ Your computer has a badly configured date and time, confusing the
 threaded patch review.

___ Your changes affect IPC mechanism, and you don't present any results
 for in-service upgradability test.

___ Your changes affect user manual and documentation, your patch series
 do not contain the patch that updates the Doxygen manual.





___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amf: fix Comp stuck in RESTARTING presence state [#3011]

2019-02-28 Thread Minh Hon Chau


Hi Thuan,

ack (review + test).

Thanks

Minh

:

During SU (many COMPs) restart recovery, if any COMP finishes instantiating and
then crashes while other COMPs are still instantiating, AMF recovers it by
restarting, but AMF only cleans it up without re-instantiation because AMF sees
the COMP as not eligible for instantiation. The error COMP is stuck in
RESTARTING without further action from AMF.

AMF should allow COMP re-instantiation if SU state is INSTANTIATING and
error COMP state is RESTARTING.
---
  src/amf/amfnd/clc.cc | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
index 463c5de..7a62a56 100644
--- a/src/amf/amfnd/clc.cc
+++ b/src/amf/amfnd/clc.cc
@@ -1805,7 +1805,8 @@ static bool 
is_failed_comp_eligible_for_instantiation(AVND_COMP *comp) {
/*During surestart recovery, after cleanup of all components, amfnd 
starts
  instantiation of components. A component may fault at this stage. 
Such a
  component is eligible for instantiation.*/
-  if ((comp->pres == SA_AMF_PRESENCE_INSTANTIATING) &&
+  if (((comp->pres == SA_AMF_PRESENCE_RESTARTING) ||
+   (comp->pres == SA_AMF_PRESENCE_INSTANTIATING)) &&
(comp->su->pres == SA_AMF_PRESENCE_INSTANTIATING))
  return true;
  
--

2.7.4


___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] clm: Incorrect encode/decode time_super [#3007]

2019-02-20 Thread Minh Hon Chau


Hi aThanh,

ack for code review.

Thanks

Minh

On 20/2/19 4:19 pm, Thanh Nguyen wrote:

Change encoding of time_super to use 64 bits instead of 32 bits.
---
  src/clm/clmd/clms_mds.cc | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/clm/clmd/clms_mds.cc b/src/clm/clmd/clms_mds.cc
index 833d18c..5a77885 100644
--- a/src/clm/clmd/clms_mds.cc
+++ b/src/clm/clmd/clms_mds.cc
@@ -542,7 +542,7 @@ static uint32_t clms_enc_track_cbk_msg(NCS_UBAID *uba, 
CLMSV_MSG *msg) {
  TRACE("p8 nullptr!!!");
  return 0;
}
-  ncs_encode_32bit(, track->time_super);
+  ncs_encode_64bit(, track->time_super);
ncs_enc_claim_space(uba, 8);
total_bytes += 8;
  



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] Opensaf-devel Digest, Vol 69, Issue 11

2019-02-12 Thread Minh Hon Chau


Hi Shiva,

Looks like your application is quite similar to an amf sample app, it's 
under samples/amf/sa_aware, you can try the sample app and then replace 
it with your confD.


Thanks,

Minh

On 12/2/19 11:09 pm, opensaf-devel-requ...@lists.sourceforge.net wrote:

Send Opensaf-devel mailing list submissions to
opensaf-devel@lists.sourceforge.net

To subscribe or unsubscribe via the World Wide Web, visit
https://lists.sourceforge.net/lists/listinfo/opensaf-devel
or, via email, send a message with subject or body 'help' to
opensaf-devel-requ...@lists.sourceforge.net

You can reach the person managing the list at
opensaf-devel-ow...@lists.sourceforge.net

When replying, please edit your Subject line so it is more specific
than "Re: Contents of Opensaf-devel digest..."


Today's Topics:

1. ConfD Integration with opensaf (shiva)


--

Message: 1
Date: Tue, 12 Feb 2019 14:40:26 +0530
From: shiva 
To: opensaf-devel@lists.sourceforge.net
Subject: [devel] ConfD Integration with opensaf
Message-ID: 
Content-Type: text/plain; charset=utf-8; format=flowed

Hello all,

I want to integrate confD with opensaf. Is there any document or
example code that explains the integration process?

I want to configure opensaf to handle 2N nodes (master/slave)
in confD.

My requirement is that when the master node goes down the
slave node should automatically take over and become the master.

Thanks in advance.

Regards.




--



--

Subject: Digest Footer

___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


--

End of Opensaf-devel Digest, Vol 69, Issue 11
*




___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] clmd: not send sync respond to client if node down [#3004]

2019-02-12 Thread Minh Hon Chau


Hi Thang,

The patch looks ok, but I'm thinking of not introducing mds_node_down_list.

In SAI-AIS-CLM-B.04.01:

"The term unconfigured node is used in this document to designate an 
execution environment that is not configured to host a CLM node."


Could we add a check for whether a node is unconfigured (i.e. it's not in 
ee_lookup), to distinguish that case from a node being down?


Thanks

Minh

On 1/2/19 2:34 pm, Tran Thuan wrote:

Hi Thang,

ACK from me for code review, not tested.

Best Regards,
ThuanTr

-Original Message-
From: thang.d.nguyen 
Sent: Wednesday, January 30, 2019 1:20 AM
To: gary@dektech.com.au; minh.c...@dektech.com.au;
thuan.t...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; thang.d.nguyen

Subject: [PATCH 1/1] clmd: not send sync respond to client if node down
[#3004]

Clmd will not send a sync response to a client if the node that the client
resides on is down. This avoids a timeout when clmd sends via mds.
---
  src/clm/clmd/clms_cb.h   |  3 +++
  src/clm/clmd/clms_evt.cc | 22 +-
src/clm/clmd/clms_mds.cc |  2 +-
  3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/clm/clmd/clms_cb.h b/src/clm/clmd/clms_cb.h index
4d7fdc7..637d53a 100644
--- a/src/clm/clmd/clms_cb.h
+++ b/src/clm/clmd/clms_cb.h
@@ -22,6 +22,7 @@
  #include "osaf/config.h"
  #endif
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -238,6 +239,8 @@ typedef struct clms_cb_t {
*node_down_list_head; /*NODE_DOWN record - Fix when active node goes
down
   */
NODE_DOWN_LIST *node_down_list_tail;
+  // Record node id when receive MDS node down  std::set
+ mds_node_down_list;
bool is_impl_set;
bool nid_started; /**< true if started by NID */
NCS_PATRICIA_TREE iplist; /* To temporarily store ipaddress information
diff --git a/src/clm/clmd/clms_evt.cc b/src/clm/clmd/clms_evt.cc index
c2b83c2..5265002 100644
--- a/src/clm/clmd/clms_evt.cc
+++ b/src/clm/clmd/clms_evt.cc
@@ -943,6 +943,8 @@ static uint32_t proc_mds_node_evt(CLMSV_CLMS_EVT *evt) {
  goto done;
}
  
+  clms_cb->mds_node_down_list.insert(node_id);

+
if ((clms_cb->ha_state == SA_AMF_HA_ACTIVE) ||
(clms_cb->ha_state == SA_AMF_HA_QUIESCED)) {
  clms_track_send_node_down(node);
@@ -1531,19 +1533,24 @@ static uint32_t proc_initialize_msg(CLMS_CB *cb,
CLMSV_CLMS_EVT *evt) {
  
TRACE_ENTER2("dest %" PRIx64, evt->fr_dest);
  
-  /*Handle the wrap around */

-  if (clms_cb->last_client_id == INT_MAX) clms_cb->last_client_id = 0;
-
-  clms_cb->last_client_id++;
-
node = clms_node_get_by_id(node_id);
TRACE("Node id = %d", node_id);
if (node == nullptr) {
  LOG_IN("Initialize request of client on an unconfigured node: node_id =
%d",
 node_id);
  ais_rc = SA_AIS_ERR_UNAVAILABLE;
+std::set::iterator it =
+  clms_cb->mds_node_down_list.find(node_id);
+if (it != clms_cb->mds_node_down_list.end()) {
+  return (uint32_t)ais_rc;
+}
}
  
+  /*Handle the wrap around */

+  if (clms_cb->last_client_id == INT_MAX) clms_cb->last_client_id = 0;
+
+  clms_cb->last_client_id++;
+
if ((client = clms_client_new(evt->fr_dest, clms_cb->last_client_id)) ==
nullptr) {
  TRACE("Creating a new client failed"); @@ -1564,6 +1571,11 @@ static
uint32_t proc_initialize_msg(CLMS_CB *cb, CLMSV_CLMS_EVT *evt) {
  return rc;
}
  
+  std::set::iterator it =

+ clms_cb->mds_node_down_list.find(node_id);
+  if (it != clms_cb->mds_node_down_list.end()) {
+clms_cb->mds_node_down_list.erase(it);
+  }
+
if (node) {
  if (node->member == false) {
rc = clms_send_is_member_info(clms_cb, node->node_id, node->member,
diff --git a/src/clm/clmd/clms_mds.cc b/src/clm/clmd/clms_mds.cc index
58552cc..833d18c 100644
--- a/src/clm/clmd/clms_mds.cc
+++ b/src/clm/clmd/clms_mds.cc
@@ -1097,7 +1097,7 @@ static uint32_t clms_mds_node_event(struct
ncsmds_callback_info *mds_info) {
  clmsv_evt->info.node_mds_info.node_id =
mds_info->info.node_evt.node_id;
  clmsv_evt->info.node_mds_info.nodeup = SA_TRUE;
  
-rc = m_NCS_IPC_SEND(_cb->mbx, clmsv_evt, NCS_IPC_PRIORITY_HIGH);

+rc = m_NCS_IPC_SEND(_cb->mbx, clmsv_evt,
+ NCS_IPC_PRIORITY_VERY_HIGH);
  if (rc != NCSCC_RC_SUCCESS) {
TRACE("IPC send failed %d", rc);
free(clmsv_evt);
--
2.7.4






___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 0/4] Review Request for osaf: allow split brain prevention parameter changes at runtime [#3006]

2019-02-06 Thread Minh Hon Chau


Hi Gary,

ack with comment

- There's a  daemon_sigterm_install(), I think we could make a 
daemon_sighup_install() in the /base/daemon.c


- I see there's a @todo to stop the split brain prevention when it's 
running; when that's done we might have to document this runtime change.


Thanks

Minh

On 4/2/19 9:41 pm, Gary Lee wrote:

Summary: osaf: add ability to reload config from fmd.conf [#3006]
Review request for Ticket(s): 3006
Peer Reviewer(s): Hans, Minh
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-3006
Base revision: e13f0ee64a0255dd54bc70b1f8d601fbb6162428
Personal repository: git://git.code.sf.net/u/userid-2226215/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesy
  OpenSAF servicesy
  Core libraries  y
  Samples n
  Tests   n
  Other   n


Comments (indicate scope for each "y" above):
-

revision b5206b54fbc5462eaf6f0599d2c449f22087635d
Author: Gary Lee 
Date:   Mon, 4 Feb 2019 21:33:11 +1100

rded: reload split brain prevention parameters on SIGHUP [#3006]

If enabled at runtime and this node is active, promote this node
in consensus service.

If disabled at runtime, watch threads will terminate gracefully when
the plugin exits after losing connectivity to the consensus service.



revision 0a043e5b320e8c05beccf7b7ac3c9150abdf6cc5
Author: Gary Lee 
Date:   Mon, 4 Feb 2019 20:57:32 +1100

amfd: reload split brain prevention parameters on SIGHUP [#3006]



revision fd617aeb6c8f23d8b404a85f6aaa8c6b28ae26a1
Author: Gary Lee 
Date:   Mon, 4 Feb 2019 20:56:10 +1100

fmd: reload split brain prevention parameters on SIGHUP [#3006]



revision a3c6f632f2377afc47c0ae04861f9a4a0e06f498
Author: Gary Lee 
Date:   Mon, 4 Feb 2019 20:52:42 +1100

osaf: add ability to reload config from fmd.conf [#3006]

Add ReloadConfiguration() function - when called it will
read fmd.conf and look for 'export FMS_X=', and overwrite
current environment variable settings in the caller.

This allows split brain prevention parameters to be changed at
runtime without a node restart.



Complete diffstat:
--
  src/amf/amfd/cb.h   |  1 +
  src/amf/amfd/main.cc| 32 
  src/amf/amfd/osaf-amfd.in   |  1 +
  src/fm/fmd/fm_main.cc   |  2 ++
  src/osaf/consensus/consensus.cc | 67 +++--
  src/osaf/consensus/consensus.h  | 20 
  src/osaf/consensus/key_value.cc | 41 +
  src/rde/rded/osaf-rded.in   |  1 +
  src/rde/rded/rde_main.cc| 59 +---
  src/rde/rded/rde_rda.h  |  3 ++
  src/rde/rded/role.cc| 27 +
  src/rde/rded/role.h |  1 +
  12 files changed, 203 insertions(+), 52 deletions(-)


Testing Commands:
-
1)

Start cluster with FMS_SPLIT_BRAIN_PREVENTION=0

On both active / standby SCs:
modify fmd.conf and set FMS_SPLIT_BRAIN_PREVENTION=1
pkill -SIGHUP osafamfd
pkill -SIGHUP osaffmd
pkill -SIGHUP osafrded

Ensure split brain prevention works as expected

2)

Leave cluster from Step 1 running

On both active / standby SCs:
modify fmd.conf and set FMS_SPLIT_BRAIN_PREVENTION=0
pkill -SIGHUP osafamfd
pkill -SIGHUP osaffmd
pkill -SIGHUP osafrded

Ensure split brain prevention is no longer in effect


Testing, Expected Results:
--
As above

Conditions of Submission:
-
Ack from any reviewer

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in

Re: [devel] [PATCH 4/5] amfd: allow node to remain active is peer SC can be seen [#2996]

2019-01-21 Thread Minh Hon Chau


ack, review only. Thanks/Minh

On 21/1/19 2:52 pm, Gary Lee wrote:

If relaxed node promotion is enabled, allow a SC to remain
active if the peer SC can be seen, even if access to the
consensus service is lost.
---
  src/amf/amfd/ndfsm.cc  |  2 +-
  src/amf/amfd/ndproc.cc | 13 +++--
  src/amf/amfd/proc.h|  2 +-
  3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
index 4146ddc..8c8f3c5 100644
--- a/src/amf/amfd/ndfsm.cc
+++ b/src/amf/amfd/ndfsm.cc
@@ -817,7 +817,7 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
if (cb->node_failover_delay == 0) {
  avd_node_failover(node);
}
-  check_quorum();
+  check_quorum(cb);
node->node_info.member = SA_FALSE;
// Update standby out of sync if standby sc goes down
if (avd_cb->node_id_avd_other == node->node_info.nodeId) {
diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index c4eebb1..ec347fc 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -1245,15 +1245,24 @@ void avd_node_failover(AVD_AVND *node, const bool 
mw_only) {
TRACE_LEAVE();
  }
  
-void check_quorum() {

+void check_quorum(AVD_CL_CB *cb) {
TRACE_ENTER();
  
Consensus consensus_service;

if (consensus_service.IsRemoteFencingEnabled() == false &&
consensus_service.IsWritable() == false) {
+// if relaxed mode is enabled, ignore failure if peer SC is up
+if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
+  AVD_AVND* peer = avd_node_find_nodeid(cb->node_id_avd_other);
+  if (peer != nullptr && peer->node_state == AVD_AVND_STATE_PRESENT) {
+LOG_NO("Relaxed node promotion is enabled, peer SC is connected");
+return;
+  }
+}
+
  // remote fencing is disabled and we have lost write access
  // reboot this node to prevent split brain
  opensaf_reboot(0, nullptr,
"Quorum lost. Rebooting this node to prevent split-brain");
}
-}
\ No newline at end of file
+}
diff --git a/src/amf/amfd/proc.h b/src/amf/amfd/proc.h
index a378218..f1dc7ba 100644
--- a/src/amf/amfd/proc.h
+++ b/src/amf/amfd/proc.h
@@ -96,7 +96,7 @@ void avd_process_hb_event(AVD_CL_CB *cb_now, struct AVD_EVT 
*evt);
  extern void avd_node_mark_absent(AVD_AVND *node);
  extern void avd_tmr_snd_hb_evh(AVD_CL_CB *cb, AVD_EVT *evt);
  extern void avd_node_failover(AVD_AVND *node, const bool mw_only = false);
-extern void check_quorum();
+extern void check_quorum(AVD_CL_CB *cb);
  extern AVD_SU *get_other_su_from_oper_list(AVD_SU *su);
  extern void su_complete_admin_op(AVD_SU *su, SaAisErrorT result);
  extern void comp_complete_admin_op(AVD_COMP *comp, SaAisErrorT result);



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]

2019-01-21 Thread Minh Hon Chau


ack, review only. Thanks/Minh

On 21/1/19 2:52 pm, Gary Lee wrote:

Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow
active SC to be preferred during a network split. The default
behavior is to prefer the larger partition to maintain
existing behaviour.

Add configuration support for FMS_RELAXED_NODE_PROMOTION.
---
  src/osaf/consensus/consensus.cc | 39 ---
  src/osaf/consensus/consensus.h  |  9 +++--
  src/osaf/consensus/key_value.cc |  8 ++--
  3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc
index 112af7d..5304c4f 100644
--- a/src/osaf/consensus/consensus.cc
+++ b/src/osaf/consensus/consensus.cc
@@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool 
graceful_takeover,
 cluster_size);
  if (rc != SA_AIS_OK) {
LOG_WA("Takeover request failed (%d)", rc);
+  rc = SA_AIS_ERR_EXIST;
return rc;
  }
  take_over_request_created = true;
@@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool 
graceful_takeover,
if (rc == SA_AIS_OK) {
  LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str());
} else {
-LOG_ER("Failed to promote this node (%u)", rc);
+LOG_WA("Failed to promote this node (%u)", rc);
}
  
return rc;

@@ -197,6 +198,10 @@ bool Consensus::IsWritable() const {
  
  bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; }
  
+bool Consensus::IsRelaxedNodePromotionEnabled() const {

+  return relaxed_node_promotion_;
+}
+
  std::string Consensus::CurrentActive() const {
TRACE_ENTER();
if (use_consensus_ == false) {
@@ -228,6 +233,10 @@ Consensus::Consensus() {
uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0);
std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", 
"");
uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0);
+  uint32_t prioritise_partition_size =
+base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1);
+  uint32_t relaxed_node_promotion =
+base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0);
  
// if not specified in fmd.conf,

// takeover requests are valid for 20 seconds
@@ -246,6 +255,14 @@ Consensus::Consensus() {
  use_remote_fencing_ = true;
}
  
+  if (prioritise_partition_size == 1) {

+prioritise_partition_size_ = true;
+  }
+
+  if (use_consensus_ == true && relaxed_node_promotion == 1) {
+relaxed_node_promotion_ = true;
+  }
+
// needed for base::Conf::NodeName() later
base::Conf::InitNodeName();
  }
@@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const 
std::string& current_owner,
  return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size);
}
  
+  if (rc != SA_AIS_OK) {

+ return rc;
+  }
+
// wait up to max_takeover_retry seconds for request to be answered
retries = 0;
while (retries < max_takeover_retry) {
@@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest(
LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64,
   proposed_cluster_size, cluster_size);
  
+  const std::string state_str =

+tokens[static_cast(TakeoverElements::STATE)];
+
TakeoverState result;
-  if (proposed_cluster_size > cluster_size) {
-result = TakeoverState::ACCEPTED;
+  if (state_str !=
+TakeoverStateStr[static_cast(TakeoverState::NEW)]) {
+return TakeoverState::UNDEFINED;
+  }
+
+  if (prioritise_partition_size_ == true) {
+if (proposed_cluster_size > cluster_size) {
+  result = TakeoverState::ACCEPTED;
+} else {
+  result = TakeoverState::REJECTED;
+}
} else {
  result = TakeoverState::REJECTED;
}
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index 6421c7c..2fbd3bd 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -57,6 +57,9 @@ class Consensus {
// Is remote fencing enabled?
bool IsRemoteFencingEnabled() const;
  
+  // Is relaxed node promotion enabled?

+  bool IsRelaxedNodePromotionEnabled() const;
+
Consensus();
virtual ~Consensus();
  
@@ -66,7 +69,7 @@ class Consensus {

  UNDEFINED = 0,
  NEW = 1,
  ACCEPTED = 2,
-REJECTED = 3,
+REJECTED = 3
};
  
enum class TakeoverElements : std::uint8_t {

@@ -85,13 +88,15 @@ class Consensus {
   private:
bool use_consensus_ = false;
bool use_remote_fencing_ = false;
+  bool prioritise_partition_size_ = false;
+  bool relaxed_node_promotion_ = false;
uint32_t takeover_valid_time;
uint32_t max_takeover_retry;
const std::string kTestKeyname = "opensaf_write_test";
const std::chrono::milliseconds kSleepInterval =
std::chrono::milliseconds(1000);  // in ms
static constexpr uint32_t kLockTimeout = 0;  // lock is persistent

Re: [devel] [PATCH 0/5] Review Request for rded: add relaxed node promotion feature [#2996]

2019-01-21 Thread Minh Hon Chau


Hi Gary,

I'm trying to understand the patch 3/5 and 4/5, there seems to be logic 
of *relaxed mode* left in 3/5 and 4/5.


Thanks

Minh

On 21/1/19 2:52 pm, Gary Lee wrote:

Summary: rded: add relaxed node promotion feature [#2996]
Review request for Ticket(s): 2996
Peer Reviewer(s): Hans, Minh
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-2996
Base revision: 35035599567d1add6975a89f1286f20738d67bf1
Personal repository: git://git.code.sf.net/u/userid-2226215/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesn
  OpenSAF servicesy
  Core libraries  y
  Samples n
  Tests   n
  Other   n


Comments (indicate scope for each "y" above):
-

revision 9a681198810be2e2ad3f512ff966fe1d9eceb1ab
Author: Gary Lee 
Date:   Mon, 21 Jan 2019 14:35:49 +1100

rded: add relaxed node promotion feature [#2996]

Allow promotion of node to active at cluster startup, even if the
consensus service is unavailable, if the peer SC can be seen.

During normal cluster operation, if the consensus service becomes
unavailable but the peer SC can still be seen, allow the existing
active SC to remain active.

A new NCSMDS_SVC_ID_RDE_DISCOVERY service ID is exported by rded.
This is installed as soon as rded is started, unlike
NCSMDS_SVC_ID_RDE which is only installed when it becomes
a candidate for election.



revision d2fad05f5ab3b502403493763f5f2bb31608444f
Author: Gary Lee 
Date:   Mon, 21 Jan 2019 14:35:49 +1100

amfd: allow node to remain active if peer SC can be seen [#2996]

If relaxed node promotion is enabled, allow a SC to remain
active if the peer SC can be seen, even if access to the
consensus service is lost.



revision 4e1bbbd4997a6ea8307695e81a64dd9c53da15aa
Author: Gary Lee 
Date:   Mon, 21 Jan 2019 14:35:42 +1100

osaf: allow active SC to be preferred during network split [#2996]

Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow
active SC to be preferred during a network split. The default
behavior is to prefer the larger partition to maintain
existing behaviour.

Add configuration support for FMS_RELAXED_NODE_PROMOTION.



revision 7b50ffd37aafb82e71c726781824f8d6883c5aa5
Author: Gary Lee 
Date:   Mon, 21 Jan 2019 14:27:38 +1100

fmd: add configuration parameters [#2996]

Add parameters FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE and
FMS_RELAXED_NODE_PROMOTION.



revision 1bb52d591e6014e013c8335f7f1a1f516ecc8566
Author: Gary Lee 
Date:   Mon, 21 Jan 2019 14:01:08 +1100

osaf: update etcd3 to poll instead of watch [#2996]

The 'watch' command does not return if the etcd server goes down.
We need to poll the etcd server to properly check we still have
connectivity to the etcd server.



Complete diffstat:
--
  src/amf/amfd/ndfsm.cc   |  2 +-
  src/amf/amfd/ndproc.cc  | 13 -
  src/amf/amfd/proc.h |  2 +-
  src/fm/fmd/fmd.conf | 17 ++
  src/mds/mds_papi.h  |  1 +
  src/osaf/consensus/consensus.cc | 39 -
  src/osaf/consensus/consensus.h  |  9 ++-
  src/osaf/consensus/key_value.cc |  8 ++-
  src/osaf/consensus/plugins/etcd3.plugin | 50 +
  src/rde/rded/rde_cb.h   | 12 +++-
  src/rde/rded/rde_main.cc| 71 +---
  src/rde/rded/rde_mds.cc | 94 ++--
  src/rde/rded/role.cc| 97 +
  src/rde/rded/role.h |  4 +-
  14 files changed, 375 insertions(+), 44 deletions(-)


Testing Commands:
-
*** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES ***


Testing, Expected Results:
--
*** PASTE COMMAND OUTPUTS / TEST RESULTS ***


Conditions of Submission:
-
Ack from any reviewer, or in 1 week

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible

Re: [devel] Review request ntf: update document for limit of logger buffer [#2994]

2019-01-17 Thread Minh Hon Chau


Hi Canh,

A very minor comment: we may have to mention that it's for alarm notifications 
only. Thanks, Minh


On 11/1/19 11:30 pm, Lennart Lund wrote:


Hi Canh,

I should have Acked the document in my previous answer. Just fix any 
minor things and push.


Thanks

Lennart

*From:*Canh Van Truong 
*Sent:* den 11 januari 2019 12:01
*To:* Lennart Lund ; Minh Hon Chau 


*Cc:* opensaf-devel@lists.sourceforge.net
*Subject:* RE: Review request ntf: update document for limit of logger 
buffer [#2994]


Hi Lennart

Yes, I added a new column for “Default value”.

Regards

Canh

*From:*Lennart Lund 
*Sent:* Friday, January 11, 2019 5:33 PM
*To:* Canh Van Truong ; Minh Hon Chau 


*Cc:* opensaf-devel@lists.sourceforge.net
*Subject:* RE: Review request ntf: update document for limit of logger 
buffer [#2994]


Hi Canh,

I have one minor comment in the attached document. Also, it may be 
better if the table has three columns “Environment Variable, Default 
value and Comment”


Thanks

Lennart

*From:*Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>>

*Sent:* den 11 januari 2019 10:18
*To:* Lennart Lund <mailto:lennart.l...@ericsson.com>>; Minh Hon Chau 
mailto:minh.c...@dektech.com.au>>
*Cc:* opensaf-devel@lists.sourceforge.net 
<mailto:opensaf-devel@lists.sourceforge.net>
*Subject:* RE: Review request ntf: update document for limit of logger 
buffer [#2994]


Thanks Lennart,

I have updated with your comments.

Please comment if there is anything that needs to be updated.

Regards

Canh

*From:*Lennart Lund <mailto:lennart.l...@ericsson.com>>

*Sent:* Thursday, January 10, 2019 9:07 PM
*To:* Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>>; Minh Hon Chau 
mailto:minh.c...@dektech.com.au>>
*Cc:* opensaf-devel@lists.sourceforge.net 
<mailto:opensaf-devel@lists.sourceforge.net>
*Subject:* RE: Review request ntf: update document for limit of logger 
buffer [#2994]


Hi Canh,

The following text describing the NTFSV_LOGGER_BUFFER_CAPACITY 
environment variable should be improved/simplified (hard to understand)


Original text from document:

“The limit of logger buffer size in NTFD. The logger buffer is used to 
store the notification if writing notification to log file fail. The 
limit should be set with relevant value to avoid congestion in NTFD. 
Because if this value is set too big while writing notification is 
fail for long time, NTFD has to write a big number of notifications 
whenever handling sending notification request and that will delay to 
handle other requests come to NTFD. The value of variable is from 10 
to 5000.”


Suggestion:

Note: my native language is not English so I suggest that this text is 
checked by someone who knows English better than I do. However, the 
following information is what I think is needed.


“Notification log buffer size. Valid values are 10 to 5000 stored 
notifications. Default is 10.
Some notifications are logged using the OpenSAF log service. NTF has a 
buffer to store notifications to be logged later in case the log 
service returns TRY AGAIN (may happen if the log service is temporary 
unavailable) when NTF writes the log record. When the log service is 
available again (returns OK) all notifications in the buffer will be 
written before the NTF service can service any new notification 
requests. If the buffer is big this may take some time and may cause 
the NTF client to timeout.
If the buffer is full and the log service answers TRY AGAIN NTF will 
return TRY AGAIN when Notification send is called.”


Thanks

Lennart

*From:*Canh Van Truong <mailto:canh.v.tru...@dektech.com.au>>

*Sent:* den 10 januari 2019 12:52
*To:* Minh Hon Chau <mailto:minh.c...@dektech.com.au>>; Lennart Lund 
mailto:lennart.l...@ericsson.com>>
*Cc:* opensaf-devel@lists.sourceforge.net 
<mailto:opensaf-devel@lists.sourceforge.net>
*Subject:* Review request ntf: update document for limit of logger 
buffer [#2994]


Update the document because of the ticket #2961

ntf: Limit the logger buffer [#2961]

Document with recorded changes attached. Activate "Show Changes" to see

them [Edit/Track Changes/Show Changes]

Thanks

Canh



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] mdstest: fix tet_svc_subscr_VDEST_11() fail with TIPC transport [#2978]

2019-01-17 Thread Minh Hon Chau


Hi Thuan,

I think it's ok for sleep() to make the test simple, ack from me.

thanks

Minh

On 29/11/18 7:47 pm, thuan.tran wrote:

TIPC published events are not received in the order in which the MDS
services were installed. Service 600 got the published active role before
the standby role, even though the standby role was installed before the
active role.
The simplest and safest solution is to add a 1s sleep before changing
the vdest role to active.
---
  src/mds/apitest/mdstipc_api.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/src/mds/apitest/mdstipc_api.c b/src/mds/apitest/mdstipc_api.c
index 22a4386..34dcd3b 100644
--- a/src/mds/apitest/mdstipc_api.c
+++ b/src/mds/apitest/mdstipc_api.c
@@ -1847,6 +1847,7 @@ void tet_svc_subscr_VDEST_11()
printf("\nFail to subscribing for the service 500\n");
FAIL = 1;
}
+   sleep(1);
/* verifying the rem svc ver from 600 and 700*/
printf("\nChanging the role of vdest to active");
if (vdest_change_role(1001, V_DEST_RL_ACTIVE) != NCSCC_RC_SUCCESS) {



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/2] ntf: Limit the logger buffer [#2961]

2019-01-09 Thread Minh Hon Chau


Hi Canh,

ack with comments, please see with [M]

Thanks

Minh

On 9/1/19 8:22 pm, Canh Van Truong wrote:

When writing a notification fails with TRY_AGAIN in the callback, the
notification is pushed back onto the list. If this happens for a long time,
the list grows very large. This causes NTFD to take a long time to write all
the notifications in the list, and requests from NTFA that arrive during this
time may time out.

The patch does:
- Limit the logger buffer
- Provide an environment variable so that the user can set the value of the limit
- Return a TRY_AGAIN error when the buffer limit is reached, and write all the
  notifications in the buffer to the log file. The current notification is not
  written to the log file.
---
  src/ntf/README|  10 
  src/ntf/ntfd/NtfAdmin.cc  |  44 +-
  src/ntf/ntfd/NtfLogger.cc | 149 +++---
  src/ntf/ntfd/NtfLogger.h  |  11 +++-
  src/ntf/ntfd/ntfd.conf|  10 
  src/ntf/ntfd/ntfs.h   |   2 +
  6 files changed, 147 insertions(+), 79 deletions(-)

diff --git a/src/ntf/README b/src/ntf/README
index 6dd5173e1..5bf670647 100644
--- a/src/ntf/README
+++ b/src/ntf/README
@@ -233,6 +233,16 @@ NTFSV_ENV_CACHE_SIZE
  The size of the notification cache in the NTF server processes running on the 
Controller nodes.
  The default value is 1 notification.
  
+NTFSV_LOGGER_BUFFER_CAPACITY

+
+The logger buffer is used to store the notification if writing notification
+to log file fail. This variable is set for limit of logger buffer size in
+NTFD. If the logger buffer is full and NTFD receives new notification,
+the TRY_AGAIN  error is returned to user. The limit should be set with relevant
+value to avoid congestion in NTFD. Because if this value is set too big and
+writing notification is fail for long time, NTF has to write a big number of
+notifications whenever handling sending notification request and that will 
delay
+to handle other requests come to NTFD. The value of variable is from 10 to 
5000.
  
  for debug see DEBUG.
  
diff --git a/src/ntf/ntfd/NtfAdmin.cc b/src/ntf/ntfd/NtfAdmin.cc

index 2cb99457c..6c2d69b43 100644
--- a/src/ntf/ntfd/NtfAdmin.cc
+++ b/src/ntf/ntfd/NtfAdmin.cc
@@ -193,19 +193,32 @@ void NtfAdmin::processNotification(unsigned int clientId,
notificationId, notificationType,
(unsigned int)notificationMap.size());
  
-  // log the notification. Callback from SAF log will confirm later.

-  logger.log(notification, activeController());
-
-  /* send notification to standby */
-  sendNotificationUpdate(clientId, notification->getNotInfo());
+  if ((logger.isLoggerBufferFull() == true) &&
+  (logger.isAlarmNotification(notification) == true)) {
+NtfClient *client = getClient(clientId);
+MDS_DEST dest = client->getMdsDest();
+LOG_WA("The logger buffer is full. Check if there is issue in writing");
+if (activeController())
+  notfication_result_lib(SA_AIS_ERR_TRY_AGAIN, notificationId,
+ mdsCtxt, dest);
+  } else {
+/* send notification to standby */
+sendNotificationUpdate(clientId, notification->getNotInfo());
  
-  ClientMap::iterator pos;

-  for (pos = clientMap.begin(); pos != clientMap.end(); pos++) {
-NtfClient *client = pos->second;
-client->notificationReceived(clientId, notification, mdsCtxt);
+ClientMap::iterator pos;
+for (pos = clientMap.begin(); pos != clientMap.end(); pos++) {
+  NtfClient *client = pos->second;
+  client->notificationReceived(clientId, notification, mdsCtxt);
+}
}
  
-  /* remove notification if sent to all subscribers and logged */

+  // Log the notification. Callback from SAF log will confirm later.
+  if (activeController())
+logger.log(notification);
+  // Add the notification to Reader list
+  logger.addNotificationToReaderList(notification);
+
+  // Remove the notification if it is sent to all subscribers and logged
if (notification->isSubscriptionListEmpty() && notification->loggedOk()) {
  NotificationMap::iterator posNot;
  posNot = notificationMap.find(notificationId);


[M]: If ntfd decides to return TRY_AGAIN, then the notification should 
not be added for readers to read, for subscription checking, etc. 
I think it should look like this:


if (the buffer is not empty) {

    // try to flush all pending log

}

if (the buffer is still full) {

    // return try again

} else {

    // add this to buffer, checkpoint, add to reader lists,  as normal

}



@@ -341,9 +354,9 @@ void NtfAdmin::notificationReceivedColdSync(
TRACE_LEAVE();
  }
  /**
- * A cached notification is received in Cold Sync.
- * This cached notification will be marked as logged, and stored
- * only in NtfLogger class to serve the reader.
+ * A cached notifications are received in Cold Sync.
+ * This cached notifications are stored in NtfLogger
+ * class to serve the reader.
   *
   * @param clientId Node-wide unique id for the

Re: [devel] [PATCH 1/1] tests: test.sh should checkout release tag of googletest for stable [#2983]

2019-01-08 Thread Minh Hon Chau


Hi Thuan,

ack from me.

Thanks,

Minh

On 12/12/18 7:20 pm, thuan.tran wrote:

---
  test.sh | 1 +
  1 file changed, 1 insertion(+)

diff --git a/test.sh b/test.sh
index daf6293..ce90a62 100755
--- a/test.sh
+++ b/test.sh
@@ -37,6 +37,7 @@ if [[ ! -f 
"$OSAF_TEST_WORKDIR/googletest/googlemock/lib/libgmock.la" ||
  fi
  
  cd "$OSAF_TEST_WORKDIR/googletest"

+git checkout `git tag | grep "release" | tail -n 1`
  autoreconf -vi
  ./configure --with-pthreads
  make -j "$no_of_processors"



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1/1] amfd: add node to failover_list before calling SetState [#2963]

2018-11-16 Thread Minh Hon Chau


Hi Gary, ack (code review only). Thanks Minh

On 16/11/18 5:38 pm, Gary Lee wrote:

node must be added to failover_list before SetState() is called.
If the state is 'end', then it will be deleted by SetState().
Otherwise, we will leave a node in 'End' state mistakenly in
failover_list.
---
  src/amf/amfd/ckpt_dec.cc | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfd/ckpt_dec.cc b/src/amf/amfd/ckpt_dec.cc
index 022fa8f4b..a46f6d306 100644
--- a/src/amf/amfd/ckpt_dec.cc
+++ b/src/amf/amfd/ckpt_dec.cc
@@ -2990,8 +2990,12 @@ static uint32_t dec_node_failover_state(AVD_CL_CB *cb, 
NCS_MBCSV_CB_DEC *dec) {
  node->node_name.c_str());
  auto new_node = std::make_shared(cb,
node->node_info.nodeId);
-new_node->SetState(state);
+// node must be added to failover_list before SetState() is called.
+// If the state is 'end', then it will be deleted by SetState().
+// Otherwise, we will leave a node in 'End' state mistakenly in
+// failover_list.
  cb->failover_list[node->node_info.nodeId] = new_node;
+new_node->SetState(state);
}
  
return NCSCC_RC_SUCCESS;



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 0/5] Review Request for ntf: add new test cases in ntf apitest [#2958]

2018-11-15 Thread Minh Hon Chau


Hi Mohan,

I have pushed the patches.

Thanks

Minh

On 14/11/18 6:19 pm, mo...@hasolutions.in wrote:

Hi minh,
I republished those patches.
please check it and push into the community.
 Thanks
Mohan
High Availability Solutions Pvt. Ltd.
www.hasolutions.in

- Original Message -
Subject: [PATCH 0/5] Review Request for ntf: add new test cases in
ntf apitest [#2958]
From: "Mohan Kanakam" 
Date: 11/14/18 12:42 pm
To: minh.c...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net, "Mohan Kanakam"


Summary: ntf: add new test case of API saNtfInitialize() of
apitest v2 [#2958]
Review request for Ticket(s): 2958
Peer Reviewer(s):minh
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-2958
Base revision: f8a6848a1cdbff0b518c3db951e4689e260226c7
Personal repository: git://git.code.sf.net/u/mohan-hasoln/review


Impacted area Impact y/n

Docs n
Build system n
RPM/packaging n
Configuration files n
Startup scripts n
SAF services n
OpenSAF services n
Core libraries n
Samples n
Tests y
Other n


Comments (indicate scope for each "y" above):
-
*** EXPLAIN/COMMENT THE PATCH SERIES HERE ***

revision f26d2ed56b5a091163cf9d2af75fe0d818b546e5
Author: Mohan Kanakam 
Date: Wed, 14 Nov 2018 11:46:16 +0530

ntf: add new test case of API saNtfNotificationReadFinalize() of
apitest [#2958]



revision 4ebfc2e5a06c36cc8324533f67486532edba139e
Author: Mohan Kanakam 
Date: Wed, 14 Nov 2018 11:44:44 +0530

ntf: add new test case of API saNtfFinalize() of apitest [#2958]



revision f23c17b01252e4d858f5e47e0b6f1d66abc9a74e
Author: Mohan Kanakam 
Date: Wed, 14 Nov 2018 11:42:31 +0530

ntf: add new test case of API saNtfDispatch() of apitest v2 [#2958]



revision b60b353c7c7abb12d15ef4547a578d10649da229
Author: Mohan Kanakam 
Date: Wed, 14 Nov 2018 11:40:02 +0530

ntf: add new test case of API saNtfSelectionObjectGet() of apitest
[#2958]



revision 3a01c0e3b2771b3b8b39747f9497178708b1c1f3
Author: Mohan Kanakam 
Date: Wed, 14 Nov 2018 11:37:27 +0530

ntf: add new test case of API saNtfInitialize() of apitest v2 [#2958]



Complete diffstat:
--
src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++
src/ntf/apitest/tet_saNtfFinalize.cc | 7 +++
src/ntf/apitest/tet_saNtfInitialize.cc | 8 
src/ntf/apitest/tet_saNtfNotificationReadFinalize.cc | 7 +++
src/ntf/apitest/tet_saNtfSelectionObjectGet.cc | 11 +++
5 files changed, 43 insertions(+)


Testing Commands:
-
./ntftest
Testing, Expected Results:
--
13 PASSED saNtfInitialize with NULL pointer to handle AND NULLptr
to callbacks and nullptr to version
5 PASSED saNtfSelectionObjectGet Finalized handle
SA_AIS_ERR_BAD_HANDLE
4 PASSED saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE
6 PASSED saNtfFinalize SA_AIS_ERR_BAD_HANDLE - unintilized handle
2 PASSED saNtfNotificationReadFinalize SA_AIS_ERR_BAD_HANDLE

Conditions of Submission:
-
Ack from maintainers

Arch Built Started Linux distro
---
mips n n
mips64 n n
x86 n n
x86_64 y y
powerpc n n
powerpc64 n n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any
checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many
blank entries
that need proper data filled in.

___ You have failed to nominate the proper persons for review and
push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your
headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your
commits.

___ You have incorrectly put/left internal data in your comments/files
(i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build
tests.
Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be
removed.

___ You have needlessly changed whitespace or added whitespace crimes
like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into

Re: [devel] [PATCH 0/5] Review Request for ntf: add new test cases in apitest [#2958]

2018-11-13 Thread Minh Hon Chau


Hi Mohan,

ack for series with minor comments in the sub-patches.

Thanks

Minh

On 9/11/18 11:55 pm, Mohan Kanakam wrote:

Summary: ntf: add new test case of API saNtfInitialize() of apitest [#2958]
Review request for Ticket(s): 2958
Peer Reviewer(s):minh
Pull request to: *** LIST THE PERSON WITH PUSH ACCESS HERE ***
Affected branch(es): develop
Development branch: ticket-2958
Base revision: f8a6848a1cdbff0b518c3db951e4689e260226c7
Personal repository: git://git.code.sf.net/u/mohan-hasoln/review


Impacted area   Impact y/n

  Docsn
  Build systemn
  RPM/packaging   n
  Configuration files n
  Startup scripts n
  SAF servicesn
  OpenSAF servicesn
  Core libraries  n
  Samples n
  Tests   y
  Other   n


Comments (indicate scope for each "y" above):
-
*** EXPLAIN/COMMENT THE PATCH SERIES HERE ***

revision ebef37fa591d4e63ee7de9ea4098000a3256b208
Author: Mohan Kanakam 
Date:   Fri, 9 Nov 2018 17:19:08 +0530

ntf: add new test case of API saNtfNotificationReadFinalize() of apitest [#2958]



revision c716e7524263131754a26b61cd305988d97a206c
Author: Mohan Kanakam 
Date:   Fri, 9 Nov 2018 17:16:38 +0530

ntf: add new test case of API saNtfFinalize() of apitest [#2958]



revision cf271ad8784141560fdc6e616c5920b2fe975928
Author: Mohan Kanakam 
Date:   Fri, 9 Nov 2018 17:14:20 +0530

ntf: add new test case of API saNtfDispatch() of apitest [#2958]



revision 84f64a6aaa9b45ef55261764a52386030bfd0830
Author: Mohan Kanakam 
Date:   Fri, 9 Nov 2018 17:12:01 +0530

ntf: add new test case of API saNtfSelectionObjectGet() of apitest [#2958]



revision 176eb07f6e212334517af33879d085932324d4ef
Author: Mohan Kanakam 
Date:   Fri, 9 Nov 2018 17:08:57 +0530

ntf: add new test case of API saNtfInitialize() of apitest [#2958]



Complete diffstat:
--
  src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++
  src/ntf/apitest/tet_saNtfFinalize.cc |  7 +++
  src/ntf/apitest/tet_saNtfInitialize.cc   |  8 
  src/ntf/apitest/tet_saNtfNotificationReadFinalize.cc |  7 +++
  src/ntf/apitest/tet_saNtfSelectionObjectGet.cc   | 11 +++
  5 files changed, 43 insertions(+)


Testing Commands:
-
./ntftest

Testing, Expected Results:
--
13  PASSED   saNtfInitialize with NULL pointer to handle AND NULL callbacks and 
unintilized version
5  PASSED   saNtfSelectionObjectGet Finalized handle SA_AIS_ERR_BAD_HANDLE
4  PASSED   saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE
6  PASSED   saNtfFinalize SA_AIS_ERR_BAD_HANDLE - unintilized handle
2  PASSED   saNtfNotificationReadFinalize SA_AIS_ERR_BAD_HANDLE

Conditions of Submission:
-
Ack from maintainers

Arch  Built StartedLinux distro
---
mipsn  n
mips64  n  n
x86 n  n
x86_64  y  y
powerpc n  n
powerpc64   n  n


Reviewer Checklist:
---
[Submitters: make sure that your review doesn't trigger any checkmarks!]


Your checkin has not passed review because (see checked entries):

___ Your RR template is generally incomplete; it has too many blank entries
 that need proper data filled in.

___ You have failed to nominate the proper persons for review and push.

___ Your patches do not have proper short+long header

___ You have grammar/spelling in your header that is unacceptable.

___ You have exceeded a sensible line length in your headers/comments/text.

___ You have failed to put in a proper Trac Ticket # into your commits.

___ You have incorrectly put/left internal data in your comments/files
 (i.e. internal bug tracking tool IDs, product names etc)

___ You have not given any evidence of testing beyond basic build tests.
 Demonstrate some level of runtime or other sanity testing.

___ You have ^M present in some of your files. These have to be removed.

___ You have needlessly changed whitespace or added whitespace crimes
 like trailing spaces, or spaces before tabs.

___ You have mixed real technical changes with whitespace and other
 cosmetic code cleanup changes. These have to be separate commits.

___ You need to refactor your submission into logical chunks; there is
 too much content into a single commit.

___ You have extraneous garbage in your review (merge commits etc)

___ You have giant attachments which should never have been sent;
 Instead you should place your content in a public tree to be pulled.

___ You have too many commits attached to an e-mail; resend as threaded
 commits, or place in a public tree for a pull.

___ You have resent this content multiple times without a clear indication
 of what has changed

Re: [devel] [PATCH 1/5] ntf: add new test case of API saNtfInitialize() of apitest [#2958]

2018-11-13 Thread Minh Hon Chau


Hi Mohan,

A minor comment, we could use nullptr instead.

Thanks

Minh

On 9/11/18 11:55 pm, Mohan Kanakam wrote:

---
  src/ntf/apitest/tet_saNtfInitialize.cc | 8 
  1 file changed, 8 insertions(+)

diff --git a/src/ntf/apitest/tet_saNtfInitialize.cc 
b/src/ntf/apitest/tet_saNtfInitialize.cc
index 8538193..c1442dc 100644
--- a/src/ntf/apitest/tet_saNtfInitialize.cc
+++ b/src/ntf/apitest/tet_saNtfInitialize.cc
@@ -117,6 +117,11 @@ void saNtfInitialize_12(void) {
test_validate(rc, SA_AIS_ERR_VERSION);
  }
  
+void saNtfInitialize_13(void) {

+  rc = NtfTest::saNtfInitialize(NULL, NULL, NULL);
+  test_validate(rc, SA_AIS_ERR_INVALID_PARAM);
+}
+
  __attribute__((constructor)) static void saNtfInitialize_constructor(void) {
test_suite_add(1, "Life cycle, initialize, API 1");
test_case_add(1, saNtfInitialize_01, "saNtfInitialize SA_AIS_OK");
@@ -142,4 +147,7 @@ __attribute__((constructor)) static void 
saNtfInitialize_constructor(void) {
"saNtfInitialize with major version set to lower");
test_case_add(1, saNtfInitialize_12,
"saNtfInitialize with version A.0.0");
+  test_case_add(1, saNtfInitialize_13,
+  "saNtfInitialize with NULL pointer to handle AND NULL callbacks"
+  " and unintilized version");
  }



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 3/5] ntf: add new test case of API saNtfDispatch() of apitest [#2958]

2018-11-13 Thread Minh Hon Chau


Hi Mohan,

A minor comment, I think you meant to add saNtfDispatch_04.

Thanks

Minh

On 9/11/18 11:55 pm, Mohan Kanakam wrote:

---
  src/ntf/apitest/tet_saNtfDispatch.cc | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/src/ntf/apitest/tet_saNtfDispatch.cc 
b/src/ntf/apitest/tet_saNtfDispatch.cc
index 5fea4ef..81a722c 100644
--- a/src/ntf/apitest/tet_saNtfDispatch.cc
+++ b/src/ntf/apitest/tet_saNtfDispatch.cc
@@ -40,6 +40,14 @@ void saNtfDispatch_03(void) {
test_validate(rc, SA_AIS_ERR_INVALID_PARAM);
  }
  
+void saNtfDispatch_04(void) {

+  safassert(NtfTest::saNtfInitialize(, , ),
+  SA_AIS_OK);
+  safassert(NtfTest::saNtfFinalize(ntfHandle), SA_AIS_OK);
+  rc = NtfTest::saNtfDispatch(ntfHandle, SA_DISPATCH_ALL);
+  test_validate(rc, SA_AIS_ERR_BAD_HANDLE);
+}
+
  __attribute__((constructor)) static void saNtfDispatch_constructor(void) {
test_suite_add(4, "Life cycle, dispatch, API 4");
test_case_add(4, saNtfDispatch_01,
@@ -48,4 +56,6 @@ __attribute__((constructor)) static void 
saNtfDispatch_constructor(void) {
"saNtfDispatch - invalid handle SA_AIS_ERR_BAD_HANDLE");
test_case_add(4, saNtfDispatch_03,
"saNtfDispatch - zero flag SA_AIS_ERR_INVALID_PARAM");
+  test_case_add(4, saNtfDispatch_03,
+  "saNtfDispatch - Fianlized handle SA_AIS_ERR_BAD_HANDLE");
  }



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] Review Request for amf: update PR [#2918]

2018-11-13 Thread Minh Hon Chau


Hi Gary, ack from me. Thanks/Minh

On 14/11/18 1:28 pm, Gary Lee wrote:


Hi

A small update to the AMF PR for #2918.

  * Renumbered 2.2.18 Excessive assignments to 2.2.19
  * Added 2.2.18 Network partitioning
  * Added timers to Section 3.3

https://sourceforge.net/p/opensaf/tickets/_discuss/thread/cae26fce/0d37/attachment/OpenSAF_AMF_PR_new.odt.gz

Thanks

Gary



___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] Review Request for amf: Update PR [#2929]

2018-11-12 Thread Minh Hon Chau


Hi Nagu,

It's not the recovery defined in the specification; I mean a new attribute.

Thanks,

Minh

On 12/11/18 9:53 pm, Nagendra Kumar wrote:

Hi Minh,
Thanks for your response.

In the future, I think we can make it a configurable recovery method, so it is
up to applications to choose.

You mean recommended recovery option? But how will it work?


Thanks
-Nagendra
High Availability Solutions
www.hasolutions.in
cont...@hasolutions.in
Hyderabad, India: +91-9866424860   |   Delaware, USA: +1 508-422-7725

-Original Message-
From: Minh Hon Chau [mailto:minh.c...@dektech.com.au]
Sent: 12 November 2018 16:16
To: Nagendra Kumar; 'Hans Nordeback'; 'Gary Lee'
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: Review Request for amf: Update PR [#2929]

Hi Nagu,

I agree with you that we can do it for 2N. However, the active workload
has to be exclusive (only one at a time), so mutual active assignments may
cause some sort of corruption in applications. But it also depends on how
the applications' internal logic is implemented. So rebooting the node is
the safe choice for now. In the future, I think we can make it a
configurable recovery method, so it is up to applications to choose.

Thanks,

Minh

On 12/11/18 7:49 pm, Nagendra Kumar wrote:

Hi Minh,
Ack from me.
Btw, why did you opt to remove assignments and restart admin operation for Nway
Act and No Red?
The same could have been done in 2N by removing the assignments, restarting, and
then providing fresh assignments.

Thanks
-Nagendra
High Availability Solutions
www.hasolutions.in
cont...@hasolutions.in
Hyderabad, India: +91-9866424860   |   Delaware, USA: +1 508-422-7725

-Original Message-
From: Minh Hon Chau [mailto:minh.c...@dektech.com.au]
Sent: 12 November 2018 08:04
To: Hans Nordeback; Nagendra Kumar; Gary Lee
Cc: opensaf-devel@lists.sourceforge.net
Subject: Review Request for amf: Update PR [#2929]

Hi all,

Document update for #2929 in item 2.2.18 to be reviewed.

https://sourceforge.net/p/opensaf/tickets/2929/attachment/OpenSAF_AMF_PR_2929.odt

Thanks,

Minh








___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] Review Request for amf: Update PR [#2929]

2018-11-12 Thread Minh Hon Chau


Hi Nagu,

I agree with you that we can do it for 2N. However, the active workload
has to be exclusive (only one at a time), so mutual active assignments may
cause some sort of corruption in applications. But it also depends on how
the applications' internal logic is implemented. So rebooting the node is
the safe choice for now. In the future, I think we can make it a
configurable recovery method, so it is up to applications to choose.


Thanks,

Minh

On 12/11/18 7:49 pm, Nagendra Kumar wrote:

Hi Minh,
Ack from me.
Btw, why did you opt to remove assignments and restart admin operation for Nway
Act and No Red?
The same could have been done in 2N by removing the assignments, restarting, and
then providing fresh assignments.

Thanks
-Nagendra
High Availability Solutions
www.hasolutions.in
cont...@hasolutions.in
Hyderabad, India: +91-9866424860   |   Delaware, USA: +1 508-422-7725

-Original Message-
From: Minh Hon Chau [mailto:minh.c...@dektech.com.au]
Sent: 12 November 2018 08:04
To: Hans Nordeback; Nagendra Kumar; Gary Lee
Cc: opensaf-devel@lists.sourceforge.net
Subject: Review Request for amf: Update PR [#2929]

Hi all,

Document update for #2929 in item 2.2.18 to be reviewed.

https://sourceforge.net/p/opensaf/tickets/2929/attachment/OpenSAF_AMF_PR_2929.odt

Thanks,

Minh






___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

1 2 3 4 >

1 - 100 of 394 matches

Mail list logo