Hi Vu I have looked at this: "[Vu] Lock mutex here to protect the common resource using in `lga_find_hdl_rec_by_regid`, so I think it would be better if we move above mutex lock/unlock to `lga_find_hdl_rec_by_regid()` function."
The lga_find_hdl_rec_by_regid() function is actually taking a pointer to a lga_cb structure as an in-parameter and is using that in-parameter when accessing data of lga_cb_t. This is not a good way of handling global data actually even worse than handling them directly. Because of this it is not a good idea to lock the cb_lock inside of this function, the cb_lock is a global parameter as well! Also this function is used only by the lga_lgs_msg_proc() function so it could be moved to the lga_mds.c file and made static but I have not done that. I have left the cb_lock as is and removed the TBD comment I have added a note to the lga_find_hdl_rec_by_regid() function saying that it is not thread safe if the lga:cb_t pointer points to the global lga_cb structure. I will send the update patch for official review since Mathi has not yet given any comment. Thanks Lennart > -----Original Message----- > From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] > Sent: den 5 maj 2016 10:49 > To: mathi.naic...@oracle.com; Lennart Lund > Cc: opensaf-devel@lists.sourceforge.net > Subject: RE: [PATCH 1 of 1] log: Deadlock in log agent makes client hang > [#1805] > > Hi Lennart, > > Ack with few comments, start with [Vu]. > > Regards, Vu. > > > >-----Original Message----- > >From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] > >Sent: Thursday, May 05, 2016 9:56 AM > >To: mathi.naic...@oracle.com; lennart.l...@ericsson.com; > >vu.m.ngu...@dektech.com.au > >Cc: opensaf-devel@lists.sourceforge.net > >Subject: [PATCH 1 of 1] log: Deadlock in log agent makes client hang > [#1805] > > > > osaf/libs/agents/saf/lga/lga.h | 1 + > > osaf/libs/agents/saf/lga/lga_api.c | 12 ++++ > > osaf/libs/agents/saf/lga/lga_mds.c | 10 ++- > > osaf/libs/agents/saf/lga/lga_state.c | 87 > ++++++++++++------------------------ > > osaf/libs/agents/saf/lga/lga_state.h | 3 - > > osaf/libs/agents/saf/lga/lga_util.c | 23 +++++++++ > > 6 files changed, 72 insertions(+), 64 deletions(-) > > > > > >Remove deadlock when two mutexes conflict in MDS and log agent. > >Make sure that cb_lock is not taken when calling any MDS API > >Some cleaning of code e.g. removing "dead" code and moving functions > from > >lga_state.c > >that are not related to lga_state.c > > > >diff --git a/osaf/libs/agents/saf/lga/lga.h > b/osaf/libs/agents/saf/lga/lga.h > >--- a/osaf/libs/agents/saf/lga/lga.h > >+++ b/osaf/libs/agents/saf/lga/lga.h > >@@ -122,6 +122,7 @@ typedef struct { > > > > /* lga_saf_api.c */ > > extern lga_cb_t lga_cb; > >+bool is_lgs_state(lgs_state_t state); > > > > /* lga_mds.c */ > > extern uint32_t lga_mds_init(lga_cb_t *cb); > >diff --git a/osaf/libs/agents/saf/lga/lga_api.c > >b/osaf/libs/agents/saf/lga/lga_api.c > >--- a/osaf/libs/agents/saf/lga/lga_api.c > >+++ b/osaf/libs/agents/saf/lga/lga_api.c > >@@ -44,6 +44,18 @@ lga_cb_t lga_cb = { > > .lgs_state = LGS_START > > }; > > > >+bool is_lgs_state(lgs_state_t state) > >+{ > >+ bool rc = false; > >+ > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >+ if (state == lga_cb.lgs_state) > >+ rc = true; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >+ > >+ return rc; > >+} > [Vu] This function is only called in lga_api.c. Should make it static? > >+ > > static void populate_open_params(lgsv_stream_open_req_t > *open_param, > > const SaNameT > *logStreamName, > > > lga_client_hdl_rec_t *hdl_rec, > >diff --git a/osaf/libs/agents/saf/lga/lga_mds.c > >b/osaf/libs/agents/saf/lga/lga_mds.c > >--- a/osaf/libs/agents/saf/lga/lga_mds.c > >+++ b/osaf/libs/agents/saf/lga/lga_mds.c > >@@ -16,6 +16,7 @@ > > */ > > > > #include <stdlib.h> > >+#include <saf_error.h> > > #include "lga.h" > > #include "lga_state.h" > > > >@@ -446,8 +447,9 @@ static uint32_t lga_lgs_msg_proc(lga_cb_ > > { > > > lga_client_hdl_rec_t *lga_hdl_rec; > > > >- > TRACE_2("LGSV_LGS_WRITE_LOG_CBK: inv = > >%d, error = %d", > >- > (int)lgsv_msg->info.cbk_info.inv, > >(int)lgsv_msg->info.cbk_info.write_cbk.error); > >+ > TRACE_2("LGSV_LGS_WRITE_LOG_CBK: inv_id = > >%d, cbk_error %s", > >+ > (int)lgsv_msg->info.cbk_info.inv, > >+ > saf_error((int)lgsv_msg- > >>info.cbk_info.write_cbk.error)); > > > > /** Create the chan hdl record > here before > > ** queing this message onto the priority queue > >@@ -590,6 +592,7 @@ static uint32_t lga_mds_rcv(struct ncsmd > > lgsv_msg_t *lgsv_msg = (lgsv_msg_t > *)mds_cb_info->info.receive.i_msg; > > uint32_t rc; > > > >+ /* TBD Probably incorrect usage of this mutex here */ > > osaf_mutex_lock_ordie(&lga_cb.cb_lock); > [Vu] Lock mutex here to protect the common resource using in > `lga_find_hdl_rec_by_regid`, > so I think it would be better if we move above mutex lock/unlock to > `lga_find_hdl_rec_by_regid()` function. > > > > /* process the message */ > >@@ -930,7 +933,8 @@ static uint32_t lga_mds_dec(struct ncsmd > > > TRACE_2("LGSV_LGS_CBK_MSG"); > > switch (msg->info.cbk_info.type) > { > > case > LGSV_WRITE_LOG_CALLBACK_IND: > >- TRACE_2("decode > writelog message"); > >+ TRACE_2("decode > writelog message, > >lgs_client_id=%d", > >+ msg- > >info.cbk_info.lgs_client_id); > > total_bytes += > lga_dec_write_cbk_msg(uba, > >msg); > > break; > > default: > >diff --git a/osaf/libs/agents/saf/lga/lga_state.c > >b/osaf/libs/agents/saf/lga/lga_state.c > >--- a/osaf/libs/agents/saf/lga/lga_state.c > >+++ b/osaf/libs/agents/saf/lga/lga_state.c > >@@ -28,13 +28,6 @@ > > * Common data > > */ > > > >-/* And critical sections. Clients APIs must not pass recovery 2 state > check if > >- * recovery 2 thread is started but state has not yet been changed by the > >- * thread. Also the thread must not start recovery if an API is executing > and > >- * has passed the recovery 2 check > >- */ > >-static pthread_mutex_t lga_recov2_lock = PTHREAD_MUTEX_INITIALIZER; > >- > > /* Selection object for terminating recovery thread for state 2 (after > timeout) > > * NOTE: Only for recovery2_thread > > */ > >@@ -228,7 +221,10 @@ static int initialize_one_client(lga_cli > > > > TRACE_ENTER(); > > > >- if (p_client->initialized_flag == true) { > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >+ bool initialized_flag = p_client->initialized_flag; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >+ if (initialized_flag == true) { > > /* The client is already initialized */ > > rc = 1; > > goto done; > >@@ -242,11 +238,13 @@ static int initialize_one_client(lga_cli > > /* Restore the client Id with the one returned by the LGS and > > * set the initialized flag > > */ > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > > p_client->lgs_client_id = client_id; > > p_client->initialized_flag = true; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > > > > done: > >- TRACE_LEAVE2("rc = %d", rc); > >+ TRACE_LEAVE2("rc = %d, client_id = %d", rc, client_id); > > return rc; > > } > > > >@@ -267,7 +265,10 @@ static int recover_one_stream(lga_log_st > > > > TRACE_ENTER(); > > > >- if (p_stream->recovered_flag == true) { > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >+ bool recovered_flag = p_stream->recovered_flag; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >+ if (recovered_flag == true) { > > /* The stream is already recovered */ > > rc = 1; > > goto done; > >@@ -282,8 +283,10 @@ static int recover_one_stream(lga_log_st > > /* Restore the stream Id with the Id returned by the LGS and > > * set the recovered flag > > */ > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > > p_stream->lgs_log_stream_id = stream_id; > > p_stream->recovered_flag = true; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > > > > done: > > TRACE_LEAVE2("rc = %d", rc); > >@@ -588,11 +591,17 @@ int lga_recover_one_client(lga_client_hd > > > > TRACE_ENTER(); > > > >- /* Synchronize b/w client/mds thread */ > >+ /* TBD Incorrect usage of the cb_lock mutex removed here. > Could > >cause > >+ * a deadlock together with the MDS internal > gl_mds_library_mutex > >+ * Check mutex handling in log agent in general > >+ */ > > osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >- if (p_client->recovered_flag == true) { > >+ bool recovered_flag = p_client->recovered_flag; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >+ if (recovered_flag == true) { > > /* Client is already recovered */ > >- TRACE("\t Already recovered"); > >+ TRACE("\t Already recovered, cilent_id=%d", > >+ p_client->lgs_client_id); > [Vu] Accessing to p_client here is not safe. > > goto done; > > } > > > >@@ -620,35 +629,20 @@ int lga_recover_one_client(lga_client_hd > > p_stream = p_stream->next; > > } > > > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > > p_client->recovered_flag = true; > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > > done: > >- osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > > TRACE_LEAVE(); > > return rc; > > } > > > >-/** > >- * Free the memory allocated for one client handle with mutext protection. > >- * > >- * @param p_client_hdl > >- * @return void > >+/* Clients APIs must not pass recovery 2 state check if > >+ * recovery 2 thread is started but state has not yet been changed by the > >+ * thread. Also the thread must not start recovery if an API is executing > and > >+ * has passed the recovery 2 check > > */ > >-void lga_free_client_hdl(lga_client_hdl_rec_t **p_client_hdl) > >-{ > >- lga_client_hdl_rec_t *client_hdl = NULL; > >- > >- /* Synchronize b/w client & mds thread */ > >- osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >- > >- client_hdl = *p_client_hdl; > >- if (client_hdl == NULL) goto done; > >- > >- free(client_hdl); > >- client_hdl = NULL; > >- > >-done: > >- osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >-} > >+static pthread_mutex_t lga_recov2_lock = PTHREAD_MUTEX_INITIALIZER; > > > > void lga_recovery2_lock(void) > > { > >@@ -660,29 +654,6 @@ void lga_recovery2_unlock(void) > > osaf_mutex_unlock_ordie(&lga_recov2_lock); > > } > > > >-//> > >-// Handling lga_cb.lgs_state and lga_state in thread safe > >-//< > >- > >-void set_lgs_state(lgs_state_t state) > >-{ > >- osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >- lga_cb.lgs_state = state; > >- osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >-} > >- > >-bool is_lgs_state(lgs_state_t state) > >-{ > >- bool rc = false; > >- > >- osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >- if (state == lga_cb.lgs_state) > >- rc = true; > >- osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >- > >- return rc; > >-} > >- > > typedef struct { > > lga_state_t state; > > pthread_mutex_t lock; > >diff --git a/osaf/libs/agents/saf/lga/lga_state.h > >b/osaf/libs/agents/saf/lga/lga_state.h > >--- a/osaf/libs/agents/saf/lga/lga_state.h > >+++ b/osaf/libs/agents/saf/lga/lga_state.h > >@@ -30,11 +30,8 @@ void lga_serv_recov1state_set(void); > > int lga_recover_one_client(lga_client_hdl_rec_t *p_client); > > void lga_recovery2_lock(void); > > void lga_recovery2_unlock(void); > >-void lga_free_client_hdl(lga_client_hdl_rec_t **p_client_hdl); > > > >-void set_lgs_state(lgs_state_t state); > > void set_lga_state(lga_state_t state); > >-bool is_lgs_state(lgs_state_t state); > > bool is_lga_state(lga_state_t state); > > > > > >diff --git a/osaf/libs/agents/saf/lga/lga_util.c > >b/osaf/libs/agents/saf/lga/lga_util.c > >--- a/osaf/libs/agents/saf/lga/lga_util.c > >+++ b/osaf/libs/agents/saf/lga/lga_util.c > >@@ -273,6 +273,29 @@ static uint32_t lga_hdl_cbk_dispatch_blo > > } > > > > /** > >+ * Free the memory allocated for one client handle with mutex protection. > >+ * > >+ * @param p_client_hdl > >+ * @return void > >+ */ > >+static void lga_free_client_hdl(lga_client_hdl_rec_t **p_client_hdl) > >+{ > >+ lga_client_hdl_rec_t *client_hdl = NULL; > >+ > >+ /* Synchronize b/w client & mds thread */ > >+ osaf_mutex_lock_ordie(&lga_cb.cb_lock); > >+ > >+ client_hdl = *p_client_hdl; > >+ if (client_hdl == NULL) goto done; > >+ > >+ free(client_hdl); > >+ client_hdl = NULL; > >+ > >+done: > >+ osaf_mutex_unlock_ordie(&lga_cb.cb_lock); > >+} > >+ > >+/** > > * Initiate the agent when first used. > > * Start NCS service > > * Register with MDS ------------------------------------------------------------------------------ Find and fix application performance issues faster with Applications Manager Applications Manager provides deep performance insights into multiple tiers of your business applications. It resolves application problems quickly and reduces your MTTR. Get your free trial! https://ad.doubleclick.net/ddm/clk/302982198;130105516;z _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel