A question:

I noticed that clms_send_is_member_info() is called twice from 
clms_imm_node_unlock(), but you only modified the first call (the one 
without PLM). Shouldn't the PLM case also be updated? And is a similar
change applicable to clms_imm_node_shutdown()?
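
For clarity, here is roughly what I had in mind for the PLM branch (and,
if applicable, clms_imm_node_shutdown()) -- only a sketch, assuming those
call sites look like the non-PLM one in your patch and have the same
ais_rc variable in scope:

    rc = clms_send_is_member_info(clms_cb, nodeop->node_id, nodeop->member, true);
    if (rc != NCSCC_RC_SUCCESS) {
            /* As in the non-PLM branch: don't abort the admin operation,
             * just record the failure so the IMM reply carries TIMEOUT. */
            TRACE("clms_send_is_member_info failed %u", rc);
            ais_rc = SA_AIS_ERR_TIMEOUT;
    }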

regards,

Anders Widell


On 04/03/2017 10:09 AM, [email protected] wrote:
>   src/clm/clmd/clms_evt.c   |   2 +-
>   src/clm/clmd/clms_imm.c   |  11 +++++------
>   src/clm/clmd/clms_mbcsv.c |   2 +-
>   src/clm/clmd/clms_util.c  |  16 ++++++++++------
>   4 files changed, 17 insertions(+), 14 deletions(-)
>
>
> In this scenario, the user first performs a CLM LOCK operation on a
> payload node and then restarts the node immediately. While the node is
> rejoining, the user performs an UNLOCK operation on it, which times
> out. After this, CLM rejects every admin operation on this payload
> with BAD_OP, indicating that an admin operation is already in
> progress.
>
> During the unlock operation, when CLM tries to send the membership
> status to the clients on the node being unlocked, MDS returns failure
> for one client. CLMS then neither continues with the remaining
> clients nor replies to the IMM client, which is why the unlock
> operation times out. CLM also fails to clear the internal parameters
> related to the admin operation, so subsequent admin operations on
> this node are rejected with BAD_OP. The traces give no clue as to why
> MDS returned failure.
>
> In general this can happen whenever CLMS is sending a message to a
> set of clients and one of them goes down. Since CLMS has not yet
> processed the corresponding DOWN event, it will still try to send the
> message to that client, and MDS will return failure. The issue has
> been reproduced on this basis, and this patch addresses it: with the
> patch, CLM sends the membership status to the remaining clients even
> if MDS returns failure for one of them. In this situation CLMS also
> returns TIMEOUT to the user and clears the internal admin-op
> parameters so that subsequent operations can proceed.
>
> diff --git a/src/clm/clmd/clms_evt.c b/src/clm/clmd/clms_evt.c
> --- a/src/clm/clmd/clms_evt.c
> +++ b/src/clm/clmd/clms_evt.c
> @@ -137,7 +137,7 @@ CLMS_CLIENT_INFO *clms_client_get_by_id(
>       rec = (CLMS_CLIENT_INFO *) ncs_patricia_tree_get(&clms_cb->client_db, (uint8_t *)&client_id_net);
>   
>       if (NULL == rec)
> -             TRACE("client_id: %u lookup failed", client_id);
> +             TRACE("client_id: %u not found", client_id);
>   
>       return rec;
>   }
> diff --git a/src/clm/clmd/clms_imm.c b/src/clm/clmd/clms_imm.c
> --- a/src/clm/clmd/clms_imm.c
> +++ b/src/clm/clmd/clms_imm.c
> @@ -2221,16 +2221,15 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST
>   {
>       uint32_t rc = NCSCC_RC_SUCCESS;
>       TRACE_ENTER2("Node name %s to unlock", nodeop->node_name.value);
> -
> +     SaAisErrorT ais_rc = SA_AIS_OK;
>       if (nodeop->admin_state == SA_CLM_ADMIN_UNLOCKED) {
> -             LOG_ER("Node is already in an unlocked state");
> +             LOG_NO("Node is already in an unlocked state");
>               nodeop->admin_op = 0;
>               (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, nodeop->curr_admin_inv,
>                                                         SA_AIS_ERR_NO_OP);
>               rc = NCSCC_RC_FAILURE;
>               goto done;
>       }
> -
>       if (((nodeop->admin_state == SA_CLM_ADMIN_LOCKED) || (nodeop->admin_state == SA_CLM_ADMIN_SHUTTING_DOWN))) {
>   
>               if (clms_cb->reg_with_plm == SA_FALSE) {
> @@ -2259,9 +2258,9 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST
>                               clms_node_join_ntf(clms_cb, nodeop);
>   
>                               rc = clms_send_is_member_info(clms_cb, nodeop->node_id, nodeop->member, true);
> -                             if(rc != NCSCC_RC_SUCCESS) {
> +                             if (rc != NCSCC_RC_SUCCESS) {
>                                       TRACE("clms_send_is_member_info failed %u", rc);
> -                                     goto done;
> +                                     ais_rc = SA_AIS_ERR_TIMEOUT;
>                               }
>                               nodeop->change = SA_CLM_NODE_NO_CHANGE;
>                       }
> @@ -2322,7 +2321,7 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST
>       nodeop->admin_op = 0;
>   
>       /* Send node join notification */
> -     (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, nodeop->curr_admin_inv, SA_AIS_OK);
> +     (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, nodeop->curr_admin_inv, ais_rc);
>       clms_node_admin_state_change_ntf(clms_cb, nodeop, SA_CLM_ADMIN_UNLOCKED);
>    done:
>       TRACE_LEAVE();
> diff --git a/src/clm/clmd/clms_mbcsv.c b/src/clm/clmd/clms_mbcsv.c
> --- a/src/clm/clmd/clms_mbcsv.c
> +++ b/src/clm/clmd/clms_mbcsv.c
> @@ -103,7 +103,7 @@ static CLMS_CKPT_HDLR ckpt_data_handler[
>   static uint32_t ckpt_proc_cluster_rec(CLMS_CB * cb, CLMS_CKPT_REC * data)
>   {
>       CLMSV_CKPT_CLUSTER_INFO *param = &data->param.cluster_rec;
> -
> +     TRACE_ENTER();
>       osaf_cluster->num_nodes = param->num_nodes;
>       osaf_cluster->init_time = param->init_time;
>       cb->cluster_view_num = param->cluster_view_num;
> diff --git a/src/clm/clmd/clms_util.c b/src/clm/clmd/clms_util.c
> --- a/src/clm/clmd/clms_util.c
> +++ b/src/clm/clmd/clms_util.c
> @@ -396,6 +396,7 @@ SaClmClusterNotificationT_4 *clms_notbuf
>   
>       if (num == 0) {
>               LOG_ER("Zero num of node's changed");
> +             TRACE_LEAVE();
>               return NULL;
>       }
>   
> @@ -462,6 +463,7 @@ SaClmClusterNotificationT_4 *clms_notbuf
>       num = clms_nodedb_lookup(1);
>   
>       if (num == 0) {
> +             TRACE_LEAVE();
>               return NULL;
>       }
>   
> @@ -1149,14 +1151,13 @@ uint32_t clms_send_is_member_info(CLMS_C
>       CLMSV_MSG msg;
>       SaClmNodeIdT local_node_id;
>       uint32_t rc = NCSCC_RC_SUCCESS;
> +     bool send_failed = false;
>       
>       TRACE_ENTER();
>       
>       client = clms_client_getnext_by_id(0);
>       while(client != NULL) {
> -
>               local_node_id = m_NCS_NODE_ID_FROM_MDS_DEST(client->mds_dest);
> -
>               if(local_node_id == node_id) {
>                       msg.evt_type = CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG;
>                       msg.info.is_member_info.is_member = member;
> @@ -1164,16 +1165,19 @@ uint32_t clms_send_is_member_info(CLMS_C
>                       msg.info.is_member_info.client_id = client->client_id;
>                       rc = clms_mds_msg_send(cb, &msg, &client->mds_dest, 0,
>                                               MDS_SEND_PRIORITY_MEDIUM, NCSMDS_SVC_ID_CLMA);
> +                     /* A client may go down while a message is being sent to another client.
> +                        For such a client MDS will return failure; continue with the remaining clients. */
>                       if (rc != NCSCC_RC_SUCCESS) {
> -                             TRACE_LEAVE2("clms_mds_msg_send failed rc = %u", (unsigned int)rc);
> -                             return rc;
> +                             TRACE_2("clms_mds_msg_send failed for client '%u', rc = %u",
> +                                     client->client_id, (unsigned int)rc);
> +                             send_failed = true;
>                       }
>               }
> -
>               client = clms_client_getnext_by_id(client->client_id);
>       }
> -
>       TRACE_LEAVE();
> +     if (send_failed)
> +             return NCSCC_RC_FAILURE;
>       return rc;
>   }
>   
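
By the way, the send-loop pattern described above looks right to me.
Condensed to its core it is something like this -- a sketch based on the
quoted diff (msg is filled in per client as in the patch), not the actual
function body:

    /* Deliver the is-member info to every client on the node; one failed
     * send (e.g. a client that just went down but whose DOWN event has
     * not been processed yet) must not abort delivery to the rest. */
    bool send_failed = false;
    CLMS_CLIENT_INFO *client = clms_client_getnext_by_id(0);
    while (client != NULL) {
            if (m_NCS_NODE_ID_FROM_MDS_DEST(client->mds_dest) == node_id) {
                    if (clms_mds_msg_send(cb, &msg, &client->mds_dest, 0,
                                          MDS_SEND_PRIORITY_MEDIUM,
                                          NCSMDS_SVC_ID_CLMA) != NCSCC_RC_SUCCESS)
                            send_failed = true;  /* remember, but keep iterating */
            }
            client = clms_client_getnext_by_id(client->client_id);
    }
    return send_failed ? NCSCC_RC_FAILURE : NCSCC_RC_SUCCESS;

The caller can then map NCSCC_RC_FAILURE to SA_AIS_ERR_TIMEOUT in the
admin-operation reply and clear the admin-op state, as the unlock path
in the patch does.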

