A question: I noticed that clms_send_is_member_info() is called twice from clms_imm_node_unlock(), but you only modified the first call (the one without PLM). Shouldn't the PLM case also be updated? Is a similar change also applicable in clms_imm_node_shutdown?
regards, Anders Widell On 04/03/2017 10:09 AM, [email protected] wrote: > src/clm/clmd/clms_evt.c | 2 +- > src/clm/clmd/clms_imm.c | 11 +++++------ > src/clm/clmd/clms_mbcsv.c | 2 +- > src/clm/clmd/clms_util.c | 16 ++++++++++------ > 4 files changed, 17 insertions(+), 14 deletions(-) > > > In this problem, first user performs CLM lock operation on payload and > restarts it immediately. > When node was joining, user performs UNLOCK operation on it. Operation gets > timed out. > After this, CLM rejects any admin operation on this payload with BAD_OP > indicating > that an admin operation is already going on. > > For unlock operation when CLM tries to send membership status to clients on > the node being > unlocked, MDS returns failure for a client. CLMS does not continue with > remaining clients and it does > not reply to IMM client also. This is the reason for the unlock operation getting > timed out. Also > CLM does not clear internal parameter related to admin operation. Due to this > subsequent admin > operations on this node are rejected with BAD_OP. There is no clue in traces > why MDS returned failure. > > Generally this can happen when CLMS is trying to send message to a bunch of > clients and > one of them goes down. Since CLMS has not processed this DOWN event, it will > try to send > message to this client for which MDS will return failure. Currently the issue is > reproduced > on this basis. This patch is based on this. With the patch CLM will send > membership status to remaining > clients even if MDS returns failure for a client. Also in such a situation, > CLMS will return > TIMEOUT to the user and it will clear internal admin op params so that > subsequent operation will > continue. 
> > diff --git a/src/clm/clmd/clms_evt.c b/src/clm/clmd/clms_evt.c > --- a/src/clm/clmd/clms_evt.c > +++ b/src/clm/clmd/clms_evt.c > @@ -137,7 +137,7 @@ CLMS_CLIENT_INFO *clms_client_get_by_id( > rec = (CLMS_CLIENT_INFO *) ncs_patricia_tree_get(&clms_cb->client_db, > (uint8_t *)&client_id_net); > > if (NULL == rec) > - TRACE("client_id: %u lookup failed", client_id); > + TRACE("client_id: %u not found", client_id); > > return rec; > } > diff --git a/src/clm/clmd/clms_imm.c b/src/clm/clmd/clms_imm.c > --- a/src/clm/clmd/clms_imm.c > +++ b/src/clm/clmd/clms_imm.c > @@ -2221,16 +2221,15 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST > { > uint32_t rc = NCSCC_RC_SUCCESS; > TRACE_ENTER2("Node name %s to unlock", nodeop->node_name.value); > - > + SaAisErrorT ais_rc = SA_AIS_OK; > if (nodeop->admin_state == SA_CLM_ADMIN_UNLOCKED) { > - LOG_ER("Node is already in an unlocked state"); > + LOG_NO("Node is already in an unlocked state"); > nodeop->admin_op = 0; > (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, > nodeop->curr_admin_inv, > SA_AIS_ERR_NO_OP); > rc = NCSCC_RC_FAILURE; > goto done; > } > - > if (((nodeop->admin_state == SA_CLM_ADMIN_LOCKED) || > (nodeop->admin_state == SA_CLM_ADMIN_SHUTTING_DOWN))) { > > if (clms_cb->reg_with_plm == SA_FALSE) { > @@ -2259,9 +2258,9 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST > clms_node_join_ntf(clms_cb, nodeop); > > rc = clms_send_is_member_info(clms_cb, > nodeop->node_id, nodeop->member, true); > - if(rc != NCSCC_RC_SUCCESS) { > + if (rc != NCSCC_RC_SUCCESS) { > TRACE("clms_send_is_member_info failed > %u", rc); > - goto done; > + ais_rc = SA_AIS_ERR_TIMEOUT; > } > nodeop->change = SA_CLM_NODE_NO_CHANGE; > } > @@ -2322,7 +2321,7 @@ uint32_t clms_imm_node_unlock(CLMS_CLUST > nodeop->admin_op = 0; > > /* Send node join notification */ > - (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, > nodeop->curr_admin_inv, SA_AIS_OK); > + (void)immutil_saImmOiAdminOperationResult(clms_cb->immOiHandle, > 
nodeop->curr_admin_inv, ais_rc); > clms_node_admin_state_change_ntf(clms_cb, nodeop, > SA_CLM_ADMIN_UNLOCKED); > done: > TRACE_LEAVE(); > diff --git a/src/clm/clmd/clms_mbcsv.c b/src/clm/clmd/clms_mbcsv.c > --- a/src/clm/clmd/clms_mbcsv.c > +++ b/src/clm/clmd/clms_mbcsv.c > @@ -103,7 +103,7 @@ static CLMS_CKPT_HDLR ckpt_data_handler[ > static uint32_t ckpt_proc_cluster_rec(CLMS_CB * cb, CLMS_CKPT_REC * data) > { > CLMSV_CKPT_CLUSTER_INFO *param = &data->param.cluster_rec; > - > + TRACE_ENTER(); > osaf_cluster->num_nodes = param->num_nodes; > osaf_cluster->init_time = param->init_time; > cb->cluster_view_num = param->cluster_view_num; > diff --git a/src/clm/clmd/clms_util.c b/src/clm/clmd/clms_util.c > --- a/src/clm/clmd/clms_util.c > +++ b/src/clm/clmd/clms_util.c > @@ -396,6 +396,7 @@ SaClmClusterNotificationT_4 *clms_notbuf > > if (num == 0) { > LOG_ER("Zero num of node's changed"); > + TRACE_LEAVE(); > return NULL; > } > > @@ -462,6 +463,7 @@ SaClmClusterNotificationT_4 *clms_notbuf > num = clms_nodedb_lookup(1); > > if (num == 0) { > + TRACE_LEAVE(); > return NULL; > } > > @@ -1149,14 +1151,13 @@ uint32_t clms_send_is_member_info(CLMS_C > CLMSV_MSG msg; > SaClmNodeIdT local_node_id; > uint32_t rc = NCSCC_RC_SUCCESS; > + bool send_failed = false; > > TRACE_ENTER(); > > client = clms_client_getnext_by_id(0); > while(client != NULL) { > - > local_node_id = m_NCS_NODE_ID_FROM_MDS_DEST(client->mds_dest); > - > if(local_node_id == node_id) { > msg.evt_type = CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG; > msg.info.is_member_info.is_member = member; > @@ -1164,16 +1165,19 @@ uint32_t clms_send_is_member_info(CLMS_C > msg.info.is_member_info.client_id = client->client_id; > rc = clms_mds_msg_send(cb, &msg, &client->mds_dest, 0, > MDS_SEND_PRIORITY_MEDIUM, > NCSMDS_SVC_ID_CLMA); > + /* A client may go down while sending message to > another client. For such > + a client MDS will return failure. 
Continue for other > clients.*/ > if (rc != NCSCC_RC_SUCCESS) { > - TRACE_LEAVE2("clms_mds_msg_send failed rc = > %u", (unsigned int)rc); > - return rc; > + TRACE_2("clms_mds_msg_send failed for client > '%u', rc = %u", > + client->client_id, (unsigned > int)rc); > + send_failed = true; > } > } > - > client = clms_client_getnext_by_id(client->client_id); > } > - > TRACE_LEAVE(); > + if (send_failed) > + return NCSCC_RC_FAILURE; > return rc; > } > ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
