I think I made these changes due to a problem  that we were having on our
Palmetto system.  Just a quick look leads me to believe that the change to
PINT_context_test_all should not have freed the op_id; I suspect that this
may prevent a cleanup of it later down the road.  I will look at it closer
to see if that is the case.  The change to PINT_context_test is okay, I
think.

Becky

On Thu, Jun 30, 2011 at 11:38 AM, Becky Ligon <[email protected]> wrote:

> I remember making this change but I'm not certain if I found the problem or
> was asked to add this to the repository.  Seems like Sam Lang was helping
> someone with a problem like this and then I patched it.  In any case, I'll
> track it down and see what I can do.
>
> Becky
>
>
> On Thu, Jun 30, 2011 at 11:20 AM, Benjamin Severs <
> [email protected]> wrote:
>
>> On Wednesday, June 29, 2011 08:49:46 Michael Moore wrote:
>> > Bart,
>> >
>> > This doesn't look like a leak in the "can't free memory because there
>> isn't
>> > a reference to it" sense. However, there is definitely memory usage
>> growth,
>> > although, I haven't been able to replicate usage with quite as steep a
>> > curve as your graph. That graph was based on system memory usage? Would
>> > you mind running the same test and collecting the output from
>> > /proc/<PID>/statm at some interval while the test is running where PID
>> is
>> > the server process' PID? Heap profiling so far makes it look like the
>> > increased usage is coming from libraries not due to memory allocations
>> > within OrangeFS but no definitive answer yet.
>> >
>> > Has anyone seen this behavior in the past?
>> >
>> > Thanks,
>> > Michael
>> >
>>
>> I believe I tracked the commit down which introduced the leak.  I can
>> revert
>> this commit and the leak appears to go away.  I haven't yet figured out
>> what is
>> actually leaking though.
>>
>> The commit is:
>> Author: bligon <bligon>
>> Date:   Fri Nov 5 16:09:27 2010 +0000
>>
>>    Corrects problem where the server tries to cancel a timed-out i/o
>> operation but
>>    before the cancel can happen, the operation finishes; so, the cancel
>> can't
>>    find the op id and subsequently caused a segfault.'
>>
>>
>> diff --git a/src/common/mgmt/pint-context.c
>> b/src/common/mgmt/pint-context.c
>> index 0a07256..cda038c 100644
>> --- a/src/common/mgmt/pint-context.c
>> +++ b/src/common/mgmt/pint-context.c
>> @@ -422,10 +422,17 @@ int PINT_context_test_all(PINT_context_id
>> context_id,
>>
>>     for(; i < *count; ++i)
>>     {
>> +        struct PINT_op_entry *op_entry;
>>         qentry = user_ptrs[i];
>>         ctx_entry = PINT_queue_entry_object(
>>             qentry, struct PINT_context_queue_entry, qentry);
>>         op_ids[i] = ctx_entry->op_id;
>> +        op_entry = id_gen_safe_lookup(ctx_entry->op_id);
>> +        if (op_entry)
>> +        {
>> +           id_gen_safe_unregister(op_entry->op.id);
>> +           free(op_entry);
>> +        }
>>         uptr = ctx_entry->user_ptr;
>>         errors[i] = ctx_entry->result;
>>         user_ptrs[i] = uptr;
>> @@ -519,11 +526,19 @@ int PINT_context_test(PINT_context_id context_id,
>>                                     &op_id, &qentry, microsecs);
>>     if(ret == 0)
>>     {
>> +        struct PINT_op_entry *op_entry;
>>         entry = PINT_queue_entry_object(
>>             qentry,
>>             struct PINT_context_queue_entry, qentry);
>> +        op_entry = id_gen_safe_lookup(entry->op_id);
>> +        if (op_entry)
>> +        {
>> +           id_gen_safe_unregister(op_entry->op.id);
>> +           free(op_entry);
>> +        }
>>         *user_ptr = entry->user_ptr;
>>         *error = entry->result;
>> +
>>         free(entry);
>>     }
>>
>> diff --git a/src/common/mgmt/pint-context.h
>> b/src/common/mgmt/pint-context.h
>> index 78dfb4b..147be5c 100644
>> --- a/src/common/mgmt/pint-context.h
>> +++ b/src/common/mgmt/pint-context.h
>> @@ -9,6 +9,7 @@
>>
>>  #include "pint-op.h"
>>  #include "pint-queue.h"
>> +#include "quickhash.h"
>>
>>  enum PINT_context_type
>>  {
>> @@ -18,6 +19,19 @@ enum PINT_context_type
>>
>>  typedef PVFS_id_gen_t PINT_context_id;
>>
>> +struct PINT_op_entry
>> +{
>> +   void *user_ptr;
>> +   PINT_operation_t op;
>> +   PVFS_id_gen_t wq_id;
>> +   PINT_context_id ctx_id;
>> +   PVFS_error error;
>> +
>> +   struct qhash_head link;
>> +};
>> +
>> +
>> +
>>  typedef int (*PINT_completion_callback)(PINT_context_id ctx_id,
>>                                         int count,
>>                                         PINT_op_id *op_ids,
>> diff --git a/src/common/mgmt/pint-mgmt.c b/src/common/mgmt/pint-mgmt.c
>> index fa52d8d..120a5a1 100644
>> --- a/src/common/mgmt/pint-mgmt.c
>> +++ b/src/common/mgmt/pint-mgmt.c
>> @@ -42,17 +42,6 @@ struct PINT_worker_s
>>     struct qlist_head link;
>>  };
>>
>> -struct PINT_op_entry
>> -{
>> -    void *user_ptr;
>> -    PINT_operation_t op;
>> -    PVFS_id_gen_t wq_id;
>> -    PINT_context_id ctx_id;
>> -    PVFS_error error;
>> -
>> -    struct qhash_head link;
>> -};
>> -
>>  struct PINT_manager_s
>>  {
>>     PINT_context_id context;
>> @@ -634,12 +623,15 @@ int PINT_manager_ctx_post(PINT_manager_t manager,
>>         return ret;
>>     }
>>
>> -    id_gen_fast_register(&op_entry->op.id, op_entry);
>> -
>>     gossip_debug(GOSSIP_MGMT_DEBUG,
>>                  "[MGMT]: manager ops: adding op id: %llu\n",
>>                  llu(op_entry->op.id));
>>     gen_mutex_lock(&manager->mutex);
>> +    ret = id_gen_safe_register(&op_entry->op.id, op_entry);
>> +    if (ret < 0)
>> +    {
>> +       return ret;
>> +    }
>>     qhash_add(manager->ops, &op_entry->op.id, &op_entry->link);
>>     manager->op_count++;
>>     gen_mutex_unlock(&manager->mutex);
>> @@ -657,6 +649,7 @@ int PINT_manager_ctx_post(PINT_manager_t manager,
>>          * operation and return the error
>>          */
>>         gen_mutex_lock(&manager->mutex);
>> +        id_gen_safe_unregister(op_entry->op.id);
>>         qhash_search_and_remove(manager->ops, &op_entry->op.id);
>>         gen_mutex_unlock(&manager->mutex);
>>
>> @@ -1047,7 +1040,7 @@ int PINT_manager_test_op(PINT_manager_t manager,
>>     struct PINT_op_entry *entry;
>>     PINT_context_id context;
>>
>> -    entry = id_gen_fast_lookup(op_id);
>> +    entry = id_gen_safe_lookup(op_id);
>>     if(!entry)
>>     {
>>         return -PVFS_EINVAL;
>> @@ -1143,13 +1136,17 @@ int PINT_manager_cancel(PINT_manager_t manager,
>>                        PINT_queue_id,
>>                        PINT_operation_t *);
>>
>> -    entry = id_gen_fast_lookup(op_id);
>> +    entry = id_gen_safe_lookup(op_id);
>>     if(!entry)
>>     {
>>         return -PVFS_EINVAL;
>>     }
>>     ret = PINT_manager_find_worker(
>>         manager, NULL, NULL, NULL, entry->wq_id, &worker, &queue_id);
>> +    if (ret != 0)
>> +    {
>> +       return ret;
>> +    }
>>     context = entry->ctx_id;
>>     op = &entry->op;
>>
>> @@ -1215,7 +1212,7 @@ int PINT_manager_wait_op(PINT_manager_t manager,
>>     int msecs_remaining;
>>     struct timeval last, now;
>>
>> -    entry = id_gen_fast_lookup(op_id);
>> +    entry = id_gen_safe_lookup(op_id);
>>     if(!entry)
>>     {
>>         return -PVFS_EINVAL;
>> @@ -1451,9 +1448,6 @@ int PINT_manager_complete_op(PINT_manager_t manager,
>>      * since blocking calls never add the op
>>      * to the table
>>      */
>> -
>> -    /* free the op entry */
>> -    free(entry);
>>     return ret;
>>  }
>>
>>
>> --
>> Benjamin Severs
>> _______________________________________________
>> Pvfs2-developers mailing list
>> [email protected]
>> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
>>
>
>
>
> --
> Becky Ligon
> OrangeFS Support and Development
> Omnibond Systems
> Anderson, South Carolina
>
>
>


-- 
Becky Ligon
OrangeFS Support and Development
Omnibond Systems
Anderson, South Carolina
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to