Hi Lennart, Is it possible for following case happens?
Client thread comes to point send_Finalize_msg() @lga_api, then LOG service up, recovery thread starts and comes to the point right after "while (pclient != NULL)" in recovery2_thread @lga_state.c Then, the context is switching to the client thread and it deletes the resource while recovery2_thread is running? Regards, Vu. Quoting Lennart Lund <lennart.l...@ericsson.com>: > Hi Vu, > > /* Recover clients one at a time */ > rc = recover_one_client(p_client); > + if (p_client == NULL) goto done; > + > [Lennart] Why is this needed? This check is done in "while > (p_client != NULL) { " > and p_client cannot become NULL until "p_client = p_client->next; " > is done as > the last line in the while loop. > > [Vu] It is my intention. I am afraid if the context is switched to > client thread > right after the recovery thread enters the while loop. > If it is the case, p_client will be NULL after done recover_one_client(). > > [Lennart] I don't really understand your explanation. What is > happening in the client thread when we are in recovery state 2 that > can set p_client == NULL? The client thread should not be able to do > anything with the data involved when recovering client in the > recovery thread? > > Regards > Lennart > >> -----Original Message----- >> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] >> Sent: den 1 april 2016 14:14 >> To: Lennart Lund >> Cc: mathi.naic...@oracle.com; opensaf-devel@lists.sourceforge.net >> Subject: Re: [PATCH 1 of 1] log: miss mutex protection for common resource >> in log agent [#1705] >> >> Hi Lennart, >> >> Please see my comment inline. >> >> Regards, Vu >> >> Quoting Lennart Lund <lennart.l...@ericsson.com>: >> >> > Hi Vu >> > >> > Ack with comments >> > >> > See my comments inline [Lennart]: >> > >> > Thanks >> > Lennart >> > >> >> -----Original Message----- >> >> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au] >> >> Sent: den 23 mars 2016 08:16 >> >> To: mathi.naic...@oracle.com; Lennart Lund >> >> Cc: opensaf-devel@lists.sourceforge.net >> >> Subject: [PATCH 1 of 1] log: miss mutex protection for common resource >> in >> >> log agent [#1705] >> >> >> >> osaf/libs/agents/saf/lga/lga_state.c | 41 >> >> +++++++++++++++++++++++++++++++---- >> >> osaf/libs/agents/saf/lga/lga_state.h | 1 + >> >> osaf/libs/agents/saf/lga/lga_util.c | 30 +++++++++++-------------- >> >> 3 files changed, 50 insertions(+), 22 deletions(-) >> >> >> >> >> >> There was an race condition between client thread and recovery thread >> >> accessing to common resource `client_list`. >> >> >> >> Add mutex protection. >> >> >> >> diff --git a/osaf/libs/agents/saf/lga/lga_state.c >> >> b/osaf/libs/agents/saf/lga/lga_state.c >> >> --- a/osaf/libs/agents/saf/lga/lga_state.c >> >> +++ b/osaf/libs/agents/saf/lga/lga_state.c >> >> @@ -355,8 +355,11 @@ static void *recovery2_thread(void *dumm >> >> /* We have been signaled to exit >> >> */ >> >> goto done; >> >> } >> >> + >> >> /* Recover clients one at a time */ >> >> rc = recover_one_client(p_client); >> >> + if (p_client == NULL) goto done; >> >> + >> > [Lennart] Why is this needed? This check is done in "while (p_client >> > != NULL) { " and p_client cannot become NULL until "p_client = >> > p_client->next; " is done as the last line in the while loop. >> > >> [Vu] It is my intention. I am afraid if the context is switched to >> client thread >> right after the recovery thread enters the while loop. >> If it is the case, p_client will be NULL after done recover_one_client(). >> >> >> TRACE("\t Client %d is recovered", p_client- >> >> >lgs_client_id); >> >> if (rc == -1) { >> >> TRACE("%s recover_one_client >> >> Fail Deleting cllient (id %d)", >> >> @@ -593,6 +596,15 @@ int recover_one_client(lga_client_hdl_re >> >> >> >> TRACE_ENTER(); >> >> >> >> + if (p_client == NULL) { >> >> + /** >> >> + * Probably the client_list has been deleted by >> >> client thread >> >> + * in context calling saLogFinalize(). So, give up >> >> recovering. >> >> + */ >> >> + rc = -1; >> >> + goto done; >> >> + } >> >> + >> >> /* This function may be called both in the recovery thread and >> >> in the >> >> * client thread. A possible scenario is that the recovery 2 >> >> thread >> >> * times out and calls this function in recovery mode 2 while >> >> there >> >> @@ -658,13 +670,32 @@ uint32_t delete_one_client( >> >> lga_client_hdl_rec_t *rm_node >> >> ) >> >> { >> >> - TRACE_ENTER2(); >> >> - uint32_t ncs_rc; >> >> + return lga_hdl_rec_del(list_head, rm_node); >> >> +} >> >> >> >> +/** >> >> + * Free the memory allocated for one client handle with mutext >> protection. >> >> + * >> >> + * @param p_client_hdl >> >> + * @return void >> >> + */ >> >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl) >> >> +{ >> >> + lga_client_hdl_rec_t *client_hdl; >> >> + >> >> + /** >> >> + * To prevent race condition b/w client thread and recovery >> >> thread >> >> + * accessing to common resource `client_list`. [#1705] >> >> + */ >> >> osaf_mutex_lock_ordie(&lga_recov_mutex); >> >> - ncs_rc = lga_hdl_rec_del(list_head, rm_node); >> >> + >> >> + client_hdl = *p_client_hdl; >> >> + if (client_hdl == NULL) goto done; >> >> + >> >> + free(client_hdl); >> >> + client_hdl = NULL; >> >> + >> >> +done: >> >> osaf_mutex_unlock_ordie(&lga_recov_mutex); >> >> >> >> - TRACE_LEAVE(); >> >> - return ncs_rc; >> >> } >> >> diff --git a/osaf/libs/agents/saf/lga/lga_state.h >> >> b/osaf/libs/agents/saf/lga/lga_state.h >> >> --- a/osaf/libs/agents/saf/lga/lga_state.h >> >> +++ b/osaf/libs/agents/saf/lga/lga_state.h >> >> @@ -32,6 +32,7 @@ uint32_t delete_one_client( >> >> lga_client_hdl_rec_t **list_head, >> >> lga_client_hdl_rec_t *rm_node >> >> ); >> >> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl); >> >> >> >> #ifdef __cplusplus >> >> } >> >> diff --git a/osaf/libs/agents/saf/lga/lga_util.c >> >> b/osaf/libs/agents/saf/lga/lga_util.c >> >> --- a/osaf/libs/agents/saf/lga/lga_util.c >> >> +++ b/osaf/libs/agents/saf/lga/lga_util.c >> >> @@ -19,6 +19,7 @@ >> >> #include <syslog.h> >> >> #include "lga.h" >> >> #include "osaf_poll.h" >> >> +#include "lga_state.h" >> >> >> >> /* Variables used during startup/shutdown only */ >> >> static pthread_mutex_t lga_lock = PTHREAD_MUTEX_INITIALIZER; >> >> @@ -449,11 +450,9 @@ void lga_hdl_list_del(lga_client_hdl_rec >> >> /** clean up the channel records for this lga-client >> >> **/ >> >> lga_log_stream_hdl_rec_list_del(&client_hdl- >> >> >stream_list); >> >> - /** remove the association with hdl-mngr >> >> - **/ >> >> - free(client_hdl); >> >> - client_hdl = 0; >> >> + free_client_hdl(&client_hdl); >> >> } >> >> + >> >> TRACE_LEAVE(); >> >> } >> >> >> >> @@ -536,38 +535,35 @@ uint32_t lga_hdl_rec_del(lga_client_hdl_ >> >> if (list_iter == rm_node) { >> >> *list_head = rm_node->next; >> >> >> >> - /** detach & release the IPC >> >> - **/ >> >> + /* detach & release the IPC */ >> >> m_NCS_IPC_DETACH(&rm_node->mbx, >> >> lga_clear_mbx, NULL); >> >> m_NCS_IPC_RELEASE(&rm_node->mbx, NULL); >> >> >> >> ncshm_give_hdl(rm_node->local_hdl); >> >> ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, >> >> rm_node->local_hdl); >> >> - /** Free the channel records off this hdl >> >> - **/ >> >> + >> >> + /* Free the channel records off this hdl */ >> >> lga_log_stream_hdl_rec_list_del(&rm_node- >> >> >stream_list); >> >> + free_client_hdl(&rm_node); >> >> >> >> - /** free the hdl rec >> >> - **/ >> >> - free(rm_node); >> >> rc = NCSCC_RC_SUCCESS; >> >> goto out; >> >> - } else { /* find the rec */ >> >> + } else { >> >> + /* find the rec */ >> >> while (NULL != list_iter) { >> >> if (list_iter->next == rm_node) { >> >> list_iter->next = >> >> rm_node->next; >> >> >> >> - /** detach & release the IPC */ >> >> + /* detach & release >> >> the IPC */ >> >> >> >> m_NCS_IPC_DETACH(&rm_node->mbx, lga_clear_mbx, >> NULL); >> >> >> >> m_NCS_IPC_RELEASE(&rm_node->mbx, NULL); >> >> >> >> >> >> ncshm_give_hdl(rm_node->local_hdl); >> >> >> >> ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, rm_node- >> >> >local_hdl); >> >> - /** Free the channel records off this lga_hdl */ >> >> + >> >> + /* Free the channel >> >> records off this lga_hdl */ >> >> >> >> lga_log_stream_hdl_rec_list_del(&rm_node->stream_list); >> >> - >> >> - /** free the hdl rec */ >> >> - free(rm_node); >> >> + >> >> free_client_hdl(&rm_node); >> >> >> >> rc = >> >> NCSCC_RC_SUCCESS; >> >> goto out; >> ------------------------------------------------------------------------------ Transform Data into Opportunity. Accelerate data analysis in your applications with Intel Data Analytics Acceleration Library. Click to learn more. http://pubads.g.doubleclick.net/gampad/clk?id=278785471&iu=/4140 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel