Hi Lennart,

Please see my comment inline.

Regards, Vu

Quoting Lennart Lund <lennart.l...@ericsson.com>:

> Hi Vu
>
> Ack with comments
>
> See my comments inline [Lennart]:
>
> Thanks
> Lennart
>
>> -----Original Message-----
>> From: Vu Minh Nguyen [mailto:vu.m.ngu...@dektech.com.au]
>> Sent: den 23 mars 2016 08:16
>> To: mathi.naic...@oracle.com; Lennart Lund
>> Cc: opensaf-devel@lists.sourceforge.net
>> Subject: [PATCH 1 of 1] log: miss mutex protection for common resource in
>> log agent [#1705]
>>
>>  osaf/libs/agents/saf/lga/lga_state.c |  41
>> +++++++++++++++++++++++++++++++----
>>  osaf/libs/agents/saf/lga/lga_state.h |   1 +
>>  osaf/libs/agents/saf/lga/lga_util.c  |  30 +++++++++++--------------
>>  3 files changed, 50 insertions(+), 22 deletions(-)
>>
>>
>> There was an race condition between client thread and recovery thread
>> accessing to common resource `client_list`.
>>
>> Add mutex protection.
>>
>> diff --git a/osaf/libs/agents/saf/lga/lga_state.c
>> b/osaf/libs/agents/saf/lga/lga_state.c
>> --- a/osaf/libs/agents/saf/lga/lga_state.c
>> +++ b/osaf/libs/agents/saf/lga/lga_state.c
>> @@ -355,8 +355,11 @@ static void *recovery2_thread(void *dumm
>>                      /* We have been signaled to exit
>> */
>>                      goto done;
>>              }
>> +
>>              /* Recover clients one at a time */
>>              rc = recover_one_client(p_client);
>> +            if (p_client == NULL) goto done;
>> +
> [Lennart] Why is this needed? This check is done in "while (p_client  
> != NULL) { " and p_client cannot become NULL until "p_client =  
> p_client->next; " is done as the last line in the while loop.
>
[Vu] It is my intention. I am afraid if the context is switched to  
client thread
right after the recovery thread enters the while loop.
If it is the case, p_client will be NULL after done recover_one_client().

>>              TRACE("\t Client %d is recovered", p_client-
>> >lgs_client_id);
>>              if (rc == -1) {
>>                      TRACE("%s recover_one_client
>> Fail Deleting cllient (id %d)",
>> @@ -593,6 +596,15 @@ int recover_one_client(lga_client_hdl_re
>>
>>      TRACE_ENTER();
>>
>> +    if (p_client == NULL) {
>> +            /**
>> +             * Probably the client_list has been deleted by
>> client thread
>> +             * in context calling saLogFinalize(). So, give up
>> recovering.
>> +             */
>> +            rc = -1;
>> +            goto done;
>> +    }
>> +
>>      /* This function may be called both in the recovery thread and
>> in the
>>       * client thread. A possible scenario is that the recovery 2
>> thread
>>       * times out and calls this function in recovery mode 2 while
>> there
>> @@ -658,13 +670,32 @@ uint32_t delete_one_client(
>>      lga_client_hdl_rec_t *rm_node
>>      )
>>  {
>> -    TRACE_ENTER2();
>> -    uint32_t ncs_rc;
>> +    return lga_hdl_rec_del(list_head, rm_node);
>> +}
>>
>> +/**
>> + * Free the memory allocated for one client handle with mutext protection.
>> + *
>> + * @param p_client_hdl
>> + * @return void
>> + */
>> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl)
>> +{
>> +    lga_client_hdl_rec_t *client_hdl;
>> +
>> +    /**
>> +     * To prevent race condition b/w client thread and recovery
>> thread
>> +     * accessing to common resource `client_list`. [#1705]
>> +     */
>>      osaf_mutex_lock_ordie(&lga_recov_mutex);
>> -    ncs_rc = lga_hdl_rec_del(list_head, rm_node);
>> +
>> +    client_hdl = *p_client_hdl;
>> +    if (client_hdl == NULL) goto done;
>> +
>> +    free(client_hdl);
>> +    client_hdl = NULL;
>> +
>> +done:
>>      osaf_mutex_unlock_ordie(&lga_recov_mutex);
>>
>> -    TRACE_LEAVE();
>> -    return ncs_rc;
>>  }
>> diff --git a/osaf/libs/agents/saf/lga/lga_state.h
>> b/osaf/libs/agents/saf/lga/lga_state.h
>> --- a/osaf/libs/agents/saf/lga/lga_state.h
>> +++ b/osaf/libs/agents/saf/lga/lga_state.h
>> @@ -32,6 +32,7 @@ uint32_t delete_one_client(
>>      lga_client_hdl_rec_t **list_head,
>>      lga_client_hdl_rec_t *rm_node
>>      );
>> +void free_client_hdl(lga_client_hdl_rec_t **p_client_hdl);
>>
>>  #ifdef      __cplusplus
>>  }
>> diff --git a/osaf/libs/agents/saf/lga/lga_util.c
>> b/osaf/libs/agents/saf/lga/lga_util.c
>> --- a/osaf/libs/agents/saf/lga/lga_util.c
>> +++ b/osaf/libs/agents/saf/lga/lga_util.c
>> @@ -19,6 +19,7 @@
>>  #include <syslog.h>
>>  #include "lga.h"
>>  #include "osaf_poll.h"
>> +#include "lga_state.h"
>>
>>  /* Variables used during startup/shutdown only */
>>  static pthread_mutex_t lga_lock = PTHREAD_MUTEX_INITIALIZER;
>> @@ -449,11 +450,9 @@ void lga_hdl_list_del(lga_client_hdl_rec
>>      /** clean up the channel records for this lga-client
>>           **/
>>              lga_log_stream_hdl_rec_list_del(&client_hdl-
>> >stream_list);
>> -    /** remove the association with hdl-mngr
>> -         **/
>> -            free(client_hdl);
>> -            client_hdl = 0;
>> +            free_client_hdl(&client_hdl);
>>      }
>> +
>>      TRACE_LEAVE();
>>  }
>>
>> @@ -536,38 +535,35 @@ uint32_t lga_hdl_rec_del(lga_client_hdl_
>>      if (list_iter == rm_node) {
>>              *list_head = rm_node->next;
>>
>> -    /** detach & release the IPC
>> -         **/
>> +            /* detach & release the IPC */
>>              m_NCS_IPC_DETACH(&rm_node->mbx,
>> lga_clear_mbx, NULL);
>>              m_NCS_IPC_RELEASE(&rm_node->mbx, NULL);
>>
>>              ncshm_give_hdl(rm_node->local_hdl);
>>              ncshm_destroy_hdl(NCS_SERVICE_ID_LGA,
>> rm_node->local_hdl);
>> -    /** Free the channel records off this hdl
>> -         **/
>> +
>> +            /* Free the channel records off this hdl */
>>              lga_log_stream_hdl_rec_list_del(&rm_node-
>> >stream_list);
>> +            free_client_hdl(&rm_node);
>>
>> -    /** free the hdl rec
>> -         **/
>> -            free(rm_node);
>>              rc = NCSCC_RC_SUCCESS;
>>              goto out;
>> -    } else {                /* find the rec */
>> +    } else {
>> +            /* find the rec */
>>              while (NULL != list_iter) {
>>                      if (list_iter->next == rm_node) {
>>                              list_iter->next =
>> rm_node->next;
>>
>> -            /** detach & release the IPC */
>> +                            /* detach & release
>> the IPC */
>>
>>      m_NCS_IPC_DETACH(&rm_node->mbx, lga_clear_mbx, NULL);
>>
>>      m_NCS_IPC_RELEASE(&rm_node->mbx, NULL);
>>
>>
>>      ncshm_give_hdl(rm_node->local_hdl);
>>
>>      ncshm_destroy_hdl(NCS_SERVICE_ID_LGA, rm_node-
>> >local_hdl);
>> -            /** Free the channel records off this lga_hdl  */
>> +
>> +                            /* Free the channel
>> records off this lga_hdl  */
>>
>>      lga_log_stream_hdl_rec_list_del(&rm_node->stream_list);
>> -
>> -            /** free the hdl rec */
>> -                            free(rm_node);
>> +
>>      free_client_hdl(&rm_node);
>>
>>                              rc =
>> NCSCC_RC_SUCCESS;
>>                              goto out;



------------------------------------------------------------------------------
Transform Data into Opportunity.
Accelerate data analysis in your applications with
Intel Data Analytics Acceleration Library.
Click to learn more.
http://pubads.g.doubleclick.net/gampad/clk?id=278785471&iu=/4140
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to