Ack,
Mathi.

> -----Original Message-----
> From: Anders Widell [mailto:[email protected]]
> Sent: Friday, December 09, 2016 6:20 PM
> To: Mathivanan Naickan Palanivelu
> Cc: [email protected]
> Subject: [PATCH 1 of 1] clm: Send MDS messages asynchronously to avoid
> blocking the main thread [#2220]
> 
>  osaf/services/saf/clmsv/nodeagent/cb.h   |    1 +
>  osaf/services/saf/clmsv/nodeagent/evt.h  |   22 ++-
>  osaf/services/saf/clmsv/nodeagent/main.c |  198 +++++++++++++++--------
> -------
>  3 files changed, 117 insertions(+), 104 deletions(-)
> 
> 
> Send the join request message asynchronously to the CLM server. This
> avoids blocking the CLM main thread for a potentially very long time in case
> no reply is received (e.g. because of message loss or because the CLM server
> died before being able to send the reply). The result is a faster recovery 
> from
> such error scenarios.
> 
> diff --git a/osaf/services/saf/clmsv/nodeagent/cb.h
> b/osaf/services/saf/clmsv/nodeagent/cb.h
> --- a/osaf/services/saf/clmsv/nodeagent/cb.h
> +++ b/osaf/services/saf/clmsv/nodeagent/cb.h
> @@ -56,6 +56,7 @@ typedef struct clmna_cb_t {
>    NODE_INFO node_info;
>    tmr_t scale_out_retry_tmr;
>    bool is_scale_out_retry_tmr_running;
> +  bool try_again_received;
>    bool nid_started;       /**< true if started by NID */
>    void* election_starter;
>  } CLMNA_CB;
> diff --git a/osaf/services/saf/clmsv/nodeagent/evt.h
> b/osaf/services/saf/clmsv/nodeagent/evt.h
> --- a/osaf/services/saf/clmsv/nodeagent/evt.h
> +++ b/osaf/services/saf/clmsv/nodeagent/evt.h
> @@ -25,17 +25,29 @@ typedef NCS_IPC_MSG CLMNA_MBX_MSG;  typedef
> enum clmna_evt_type {
>    CLMNA_EVT_INVALID = 0,
>    CLMNA_EVT_CHANGE_MSG,
> +  CLMNA_EVT_JOIN_RESPONSE,
>    CLMNA_EVT_MAX_MSG
>  } CLMNA_EVT_TYPE;
> 
> +struct Change {
> +  bool caused_by_timer_expiry;
> +  NCSMDS_CHG change;
> +  NODE_ID node_id;
> +  MDS_SVC_ID svc_id;
> +};
> +
> +struct JoinResponse {
> +  SaAisErrorT rc;
> +  SaNameT node_name;
> +};
> +
>  typedef struct clmna_evt_tags {
>    CLMNA_MBX_MSG next;
>    CLMNA_EVT_TYPE type;
> -  bool caused_by_timer_expiry;
> -  NCSMDS_CHG change;
> -  NODE_ID node_id;
> -  MDS_SVC_ID svc_id;
> +  union {
> +    struct Change change;
> +    struct JoinResponse join_response;
> +  };
>  } CLMNA_EVT;
> 
> -
>  #endif
> diff --git a/osaf/services/saf/clmsv/nodeagent/main.c
> b/osaf/services/saf/clmsv/nodeagent/main.c
> --- a/osaf/services/saf/clmsv/nodeagent/main.c
> +++ b/osaf/services/saf/clmsv/nodeagent/main.c
> @@ -18,6 +18,8 @@
> 
>  #define _GNU_SOURCE
>  #include <errno.h>
> +#include <inttypes.h>
> +#include <stdint.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <stdbool.h>
> @@ -55,8 +57,8 @@ static NCS_SEL_OBJ usr1_sel_obj;
> 
> 
>  #define CLMNA_MDS_SUB_PART_VERSION   1
> -#define CLMS_NODEUP_WAIT_TIME 60000
>  #define CLMNA_SCALE_OUT_RETRY_TIME 100
> +#define CLMNA_JOIN_RETRY_TIME 3000
>  #define CLMNA_SVC_PVT_SUBPART_VERSION  1  #define
> CLMNA_WRT_CLMS_SUBPART_VER_AT_MIN_MSG_FMT 1  #define
> CLMNA_WRT_CLMS_SUBPART_VER_AT_MAX_MSG_FMT 1 @@ -75,7
> +77,11 @@ static MDS_CLIENT_MSG_FORMAT_VER
> 
>  static uint32_t clmna_mds_enc(struct ncsmds_callback_info *info);  static
> uint32_t clmna_mds_callback(struct ncsmds_callback_info *info); -
> SaAisErrorT clmna_process_dummyup_msg(bool caused_by_timer_expiry);
> +static void clmna_process_dummyup_msg(void); static void
> +clmna_handle_join_response(SaAisErrorT error, const SaNameT*
> +node_name); static void start_scale_out_retry_tmr(int64_t timeout);
> +static void stop_scale_out_retry_tmr(void); static uint32_t
> +clmna_mds_msg_send(CLMSV_MSG * i_msg);
> 
>  static uint32_t clmna_mds_cpy(struct ncsmds_callback_info *info)  { @@ -
> 173,6 +179,16 @@ static uint32_t clmna_mds_dec_flat(struc
> 
>  static uint32_t clmna_mds_rcv(struct ncsmds_callback_info *mds_cb_info)
> {
> +     CLMSV_MSG* msg = (CLMSV_MSG*) mds_cb_info-
> >info.receive.i_msg;
> +     CLMNA_EVT* evt = calloc(1, sizeof(CLMNA_EVT));
> +     evt->type = CLMNA_EVT_JOIN_RESPONSE;
> +     evt->join_response.rc = msg->info.api_resp_info.rc;
> +     evt->join_response.node_name = msg-
> >info.api_resp_info.param.node_name;
> +     free(msg);
> +     if (m_NCS_IPC_SEND(&clmna_cb->mbx, evt,
> NCS_IPC_PRIORITY_VERY_HIGH) !=
> +         NCSCC_RC_SUCCESS) {
> +             LOG_ER("IPC send to mailbox failed: %s", __FUNCTION__);
> +     }
>       return NCSCC_RC_SUCCESS;
>  }
> 
> @@ -181,10 +197,7 @@ static void clmna_handle_mds_change_evt(  {
>       if ((change == NCSMDS_NEW_ACTIVE || change == NCSMDS_UP)
> &&
>           svc_id == NCSMDS_SVC_ID_CLMS && clmna_cb->server_synced
> == false) {
> -             if (clmna_process_dummyup_msg(caused_by_timer_expiry)
> != SA_AIS_OK) {
> -                     /* NID will anyway stop and retry */
> -                     LOG_ER("Exiting");
> -             }
> +             clmna_process_dummyup_msg();
>       }
> 
>       if (caused_by_timer_expiry == false && @@ -206,6 +219,40 @@
> static void clmna_handle_mds_change_evt(
>       }
>  }
> 
> +static void clmna_handle_join_response(SaAisErrorT error, const
> +SaNameT* node_name) {
> +     if (error == SA_AIS_ERR_NOT_EXIST) {
> +             LOG_ER("%s is not a configured node",
> +                    node_name->value);
> +     } else if (error == SA_AIS_ERR_EXIST) {
> +             LOG_ER("%s is already up. Specify a unique name in"
> PKGSYSCONFDIR "/node_name",
> +                    node_name->value);
> +     } else if (error == SA_AIS_ERR_TRY_AGAIN) {
> +             if (clmna_cb->try_again_received) {
> +                     LOG_IN("Re-trying to scale out %s", node_name-
> >value);
> +             } else {
> +                     // Avoid spamming the log with more than one
> message at NOTICE priority.
> +                     clmna_cb->try_again_received = true;
> +                     LOG_NO("%s has been queued for scale-out",
> node_name->value);
> +             }
> +             stop_scale_out_retry_tmr();
> +
>       start_scale_out_retry_tmr(CLMNA_SCALE_OUT_RETRY_TIME);
> +     } else if (error == SA_AIS_OK) {
> +             stop_scale_out_retry_tmr();
> +             if (clmna_cb->server_synced == false) {
> +                     NODE_INFO self_node = clmna_cb->node_info;
> +                     clmna_cb->server_synced = true;
> +                     LOG_NO("%s Joined cluster, nodeid=%x",
> node_name->value, self_node.node_id);
> +                     if (clmna_cb->nid_started &&
> +                         nid_notify("CLMNA", NCSCC_RC_SUCCESS, NULL)
> != NCSCC_RC_SUCCESS) {
> +                             LOG_ER("nid notify failed");
> +                     }
> +             }
> +     } else {
> +             LOG_ER("Received unexpected join response error code
> %d", (int) error);
> +     }
> +}
> +
>  static uint32_t clmna_mds_svc_evt(struct ncsmds_callback_info
> *mds_cb_info)  {
>       TRACE_ENTER2("%d", mds_cb_info->info.svc_evt.i_change);
> @@ -229,6 +276,8 @@ static uint32_t clmna_mds_svc_evt(struct
>               case NCSMDS_SVC_ID_CLMS:
>                       // if CLMS dies, then we have to send nodeup again
>                       clmna_cb->server_synced = false;
> +                     clmna_cb->clms_mds_dest = 0;
> +                     clmna_cb->try_again_received = false;
>                       break;
>               default:
>                       break;
> @@ -240,10 +289,10 @@ static uint32_t clmna_mds_svc_evt(struct
> 
>       evt = calloc(1, sizeof(CLMNA_EVT));
>       evt->type = CLMNA_EVT_CHANGE_MSG;
> -     evt->caused_by_timer_expiry = false;
> -     evt->change = mds_cb_info->info.svc_evt.i_change;
> -     evt->node_id = mds_cb_info->info.svc_evt.i_node_id;
> -     evt->svc_id = mds_cb_info->info.svc_evt.i_svc_id;
> +     evt->change.caused_by_timer_expiry = false;
> +     evt->change.change = mds_cb_info->info.svc_evt.i_change;
> +     evt->change.node_id = mds_cb_info->info.svc_evt.i_node_id;
> +     evt->change.svc_id = mds_cb_info->info.svc_evt.i_svc_id;
>       if (m_NCS_IPC_SEND(&clmna_cb->mbx, evt,
> NCS_IPC_PRIORITY_VERY_HIGH) !=
>           NCSCC_RC_SUCCESS) {
>               LOG_ER("IPC send to mailbox failed: %s", __FUNCTION__);
> @@ -462,7 +511,7 @@ static int get_node_info(NODE_INFO *node
>       return 0;
>  }
> 
> -static uint32_t clmna_mds_msg_sync_send(CLMSV_MSG * i_msg,
> CLMSV_MSG ** o_msg, SaTimeT timeout)
> +static uint32_t clmna_mds_msg_send(CLMSV_MSG * i_msg)
>  {
>       NCSMDS_INFO mds_info;
>       uint32_t rc = NCSCC_RC_SUCCESS;
> @@ -476,19 +525,14 @@ static uint32_t clmna_mds_msg_sync_send(
>       /* Fill the send structure */
>       mds_info.info.svc_send.i_msg = (NCSCONTEXT)i_msg;
>       mds_info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMS;
> -     mds_info.info.svc_send.i_sendtype = MDS_SENDTYPE_SNDRSP;
> +     mds_info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
>       mds_info.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH;
> -     /* fill the sub send rsp strcuture */
> -     mds_info.info.svc_send.info.sndrsp.i_time_to_wait = timeout;
>       /* timeto wait in 10ms FIX!!! */
> -     mds_info.info.svc_send.info.sndrsp.i_to_dest = clmna_cb-
> >clms_mds_dest;
> +     /* fill the sub send strcuture */
> +     mds_info.info.svc_send.info.snd.i_to_dest = clmna_cb-
> >clms_mds_dest;
> 
>       /* send the message */
> -     if (NCSCC_RC_SUCCESS == (rc = ncsmds_api(&mds_info))) {
> -             /* Retrieve the response and take ownership of the memory
> */
> -             *o_msg = (CLMSV_MSG *)
> mds_info.info.svc_send.info.sndrsp.o_rsp;
> -             mds_info.info.svc_send.info.sndrsp.o_rsp = NULL;
> -     } else
> -             TRACE("clma_mds_msg_sync_send FAILED");
> +     if (NCSCC_RC_SUCCESS != (rc = ncsmds_api(&mds_info)))
> +             TRACE("clma_mds_msg_send FAILED");
> 
>       TRACE_LEAVE();
>       return rc;
> @@ -502,10 +546,10 @@ static void scale_out_tmr_exp(void *arg)
>               CLMNA_EVT *evt = calloc(1, sizeof(CLMNA_EVT));
>               if (evt != NULL) {
>                       evt->type = CLMNA_EVT_CHANGE_MSG;
> -                     evt->caused_by_timer_expiry = true;
> -                     evt->change = NCSMDS_UP;
> -                     evt->node_id = 0;
> -                     evt->svc_id = NCSMDS_SVC_ID_CLMS;
> +                     evt->change.caused_by_timer_expiry = true;
> +                     evt->change.change = NCSMDS_UP;
> +                     evt->change.node_id = 0;
> +                     evt->change.svc_id = NCSMDS_SVC_ID_CLMS;
>                       if (m_NCS_IPC_SEND(&clmna_cb->mbx, evt,
>                               NCS_IPC_PRIORITY_VERY_HIGH) !=
> NCSCC_RC_SUCCESS)
>                               LOG_ER("IPC send to mailbox failed: %s",
> @@ -522,7 +566,7 @@ static void scale_out_tmr_exp(void *arg)
>       TRACE_LEAVE();
>  }
> 
> -static void start_scale_out_retry_tmr(void)
> +static void start_scale_out_retry_tmr(int64_t timeout)
>  {
>       TRACE_ENTER();
>       if (clmna_cb->scale_out_retry_tmr == NULL) { @@ -533,7 +577,7 @@
> static void start_scale_out_retry_tmr(vo
>       if (clmna_cb->scale_out_retry_tmr != NULL &&
>               clmna_cb->is_scale_out_retry_tmr_running == false) {
>               m_NCS_TMR_START(clmna_cb->scale_out_retry_tmr,
> -                     CLMNA_SCALE_OUT_RETRY_TIME,
> scale_out_tmr_exp, NULL);
> +                     timeout, scale_out_tmr_exp, NULL);
>               if (clmna_cb->scale_out_retry_tmr != NULL) {
>                       clmna_cb->is_scale_out_retry_tmr_running = true;
>               }
> @@ -541,89 +585,40 @@ static void start_scale_out_retry_tmr(vo
>       TRACE_LEAVE();
>  }
> 
> -SaAisErrorT clmna_process_dummyup_msg(bool caused_by_timer_expiry)
> +static void stop_scale_out_retry_tmr(void)
>  {
> -     uint32_t rc = NCSCC_RC_SUCCESS;
> -     SaAisErrorT error = SA_AIS_OK;
> -     CLMSV_MSG msg, *o_msg = NULL;
> -     NODE_INFO self_node = clmna_cb->node_info;
> +     TRACE_ENTER();
> +     if (clmna_cb->scale_out_retry_tmr != NULL &&
> +         clmna_cb->is_scale_out_retry_tmr_running == true) {
> +             m_NCS_TMR_STOP(clmna_cb->scale_out_retry_tmr);
> +             clmna_cb->is_scale_out_retry_tmr_running = false;
> +     }
> +     TRACE_LEAVE();
> +}
> 
> +static void clmna_process_dummyup_msg(void) {
>       if (clmna_cb->clms_mds_dest != 0) {
> +             CLMSV_MSG msg;
> +             NODE_INFO self_node = clmna_cb->node_info;
>               msg.evt_type = CLMSV_CLMA_TO_CLMS_API_MSG;
>               msg.info.api_info.type = CLMSV_CLUSTER_JOIN_REQ;
>               msg.info.api_info.param.nodeup_info.node_id =
> self_node.node_id;
>               msg.info.api_info.param.nodeup_info.node_name =
> self_node.node_name;
> -             rc = clmna_mds_msg_sync_send(&msg, &o_msg,
> CLMS_NODEUP_WAIT_TIME);
> +             stop_scale_out_retry_tmr();
> +             start_scale_out_retry_tmr(CLMNA_JOIN_RETRY_TIME);
> +             uint32_t rc = clmna_mds_msg_send(&msg);
>               switch (rc) {
>               case NCSCC_RC_SUCCESS:
> -                     error = SA_AIS_OK;
>                       break;
>               case NCSCC_RC_REQ_TIMOUT:
> -                     error = SA_AIS_ERR_TIMEOUT;
>                       LOG_ER("Send timed out. Check status(network,
> process) of ACTIVE controller");
> -                     goto done;
> +                     break;
>               default:
> -                     LOG_ER("Send failed: %u. Check network
> connectivity", rc);
> -                     goto done;
> +                     LOG_ER("Send failed: %" PRIu32 " Check network
> connectivity", rc);
> +                     break;
>               }
> -
> -             if (o_msg != NULL) {
> -                     error = o_msg->info.api_resp_info.rc;
> -             } else
> -                     error = SA_AIS_ERR_NO_RESOURCES;
> -
> -             if (error == SA_AIS_ERR_NOT_EXIST) {
> -                     LOG_ER("%s is not a configured node",
> -                             o_msg-
> >info.api_resp_info.param.node_name.value);
> -                     free(o_msg);
> -                     rc = error; /* For now, just pass on the error to nid.
> -                                  * This is not needed in future when node
> local
> -                                  * cluster management policy based
> decisions can be made.
> -                                  */
> -                     goto done;
> -             } else if (error == SA_AIS_ERR_EXIST) {
> -                     LOG_ER("%s is already up. Specify a unique name in"
> PKGSYSCONFDIR "/node_name",
> -                             o_msg-
> >info.api_resp_info.param.node_name.value);
> -                     free(o_msg);
> -                     rc = error; /* This is not needed in future when node
> local
> -                                  * cluster management policy based
> decisions can be made.
> -                                  * For now, just pass on the error to nid.
> -                                  */
> -                     goto done;
> -             } else if (error == SA_AIS_ERR_TRY_AGAIN) {
> -                     if (caused_by_timer_expiry) {
> -                             LOG_IN("Re-trying to scale out %s",
> -                                     o_msg->info.api_resp_info.param.
> -                                     node_name.value);
> -                     } else {
> -                             LOG_NO("%s has been queued for scale-
> out",
> -                                     o_msg->info.api_resp_info.param.
> -                                     node_name.value);
> -                     }
> -                     start_scale_out_retry_tmr();
> -                     free(o_msg);
> -                     goto retry;
> -             }
> -
> -             if (error == SA_AIS_OK) {
> -                     clmna_cb->server_synced = true;
> -                     LOG_NO("%s Joined cluster, nodeid=%x",
> -                             o_msg-
> >info.api_resp_info.param.node_name.value,
> -                             self_node.node_id);
> -
> -             }
> -
> -             if (o_msg != NULL)
> -                     free(o_msg);
> -
>       }
> -done:
> -     if (clmna_cb->nid_started &&
> -             nid_notify("CLMNA", rc, NULL) != NCSCC_RC_SUCCESS) {
> -             LOG_ER("nid notify failed");
> -     }
> -retry:
> -     return rc;
>  }
> 
>  void clmna_process_mbx(SYSF_MBX *mbx)
> @@ -638,10 +633,14 @@ void clmna_process_mbx(SYSF_MBX *mbx)
>       }
>       switch (msg->type) {
>       case CLMNA_EVT_CHANGE_MSG:
> -             clmna_handle_mds_change_evt(msg-
> >caused_by_timer_expiry,
> -                     msg->change,
> -                     msg->node_id,
> -                     msg->svc_id);
> +             clmna_handle_mds_change_evt(msg-
> >change.caused_by_timer_expiry,
> +                     msg->change.change,
> +                     msg->change.node_id,
> +                     msg->change.svc_id);
> +             break;
> +     case CLMNA_EVT_JOIN_RESPONSE:
> +             clmna_handle_join_response(msg->join_response.rc,
> +                                        &msg-
> >join_response.node_name);
>               break;
>       default:
>               TRACE("Invalid message type");
> @@ -681,6 +680,7 @@ int main(int argc, char *argv[])
>       clmna_cb->server_synced = false;
>       clmna_cb->scale_out_retry_tmr = NULL;
>       clmna_cb->is_scale_out_retry_tmr_running = false;
> +     clmna_cb->try_again_received = false;
> 
>       /* Determine how this process was started, by NID or AMF */
>       if (getenv("SA_AMF_COMPONENT_NAME") == NULL)

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most 
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to