I remember making this change but I'm not certain if I found the problem or
was asked to add this to the repository.  Seems like Sam Lang was helping
someone with a problem like this and then I patched it.  In any case, I'll
track it down and see what I can do.

Becky

On Thu, Jun 30, 2011 at 11:20 AM, Benjamin Severs
<[email protected]>wrote:

> On Wednesday, June 29, 2011 08:49:46 Michael Moore wrote:
> > Bart,
> >
> > This doesn't look like a leak in the "can't free memory because there
> isn't
> > a reference to it" sense. However, there is definitely memory usage
> growth,
> > although, I haven't been able to replicate usage with quite as steep a
> > curve as your graph. That graph was based on system memory usage? Would
> > you mind running the same test and collecting the output from
> > /proc/<PID>/statm at some interval while the test is running where PID is
> > the server process' PID? Heap profiling so far makes it look like the
> > increased usage is coming from libraries not due to memory allocations
> > within OrangeFS but no definitive answer yet.
> >
> > Has anyone seen this behavior in the past?
> >
> > Thanks,
> > Michael
> >
>
> I believe I tracked the commit down which introduced the leak.  I can
> revert
> this commit and the leak appears to go away.  I haven't yet figured out
> what is
> actually leaking though.
>
> The commit is:
> Author: bligon <bligon>
> Date:   Fri Nov 5 16:09:27 2010 +0000
>
>    Corrects problem where the server tries to cancel a timed-out i/o
> operation but
>    before the cancel can happen, the operation finishes; so, the cancel
> can't
>    find the op id and subsequently caused a segfault.'
>
>
> diff --git a/src/common/mgmt/pint-context.c
> b/src/common/mgmt/pint-context.c
> index 0a07256..cda038c 100644
> --- a/src/common/mgmt/pint-context.c
> +++ b/src/common/mgmt/pint-context.c
> @@ -422,10 +422,17 @@ int PINT_context_test_all(PINT_context_id context_id,
>
>     for(; i < *count; ++i)
>     {
> +        struct PINT_op_entry *op_entry;
>         qentry = user_ptrs[i];
>         ctx_entry = PINT_queue_entry_object(
>             qentry, struct PINT_context_queue_entry, qentry);
>         op_ids[i] = ctx_entry->op_id;
> +        op_entry = id_gen_safe_lookup(ctx_entry->op_id);
> +        if (op_entry)
> +        {
> +           id_gen_safe_unregister(op_entry->op.id);
> +           free(op_entry);
> +        }
>         uptr = ctx_entry->user_ptr;
>         errors[i] = ctx_entry->result;
>         user_ptrs[i] = uptr;
> @@ -519,11 +526,19 @@ int PINT_context_test(PINT_context_id context_id,
>                                     &op_id, &qentry, microsecs);
>     if(ret == 0)
>     {
> +        struct PINT_op_entry *op_entry;
>         entry = PINT_queue_entry_object(
>             qentry,
>             struct PINT_context_queue_entry, qentry);
> +        op_entry = id_gen_safe_lookup(entry->op_id);
> +        if (op_entry)
> +        {
> +           id_gen_safe_unregister(op_entry->op.id);
> +           free(op_entry);
> +        }
>         *user_ptr = entry->user_ptr;
>         *error = entry->result;
> +
>         free(entry);
>     }
>
> diff --git a/src/common/mgmt/pint-context.h
> b/src/common/mgmt/pint-context.h
> index 78dfb4b..147be5c 100644
> --- a/src/common/mgmt/pint-context.h
> +++ b/src/common/mgmt/pint-context.h
> @@ -9,6 +9,7 @@
>
>  #include "pint-op.h"
>  #include "pint-queue.h"
> +#include "quickhash.h"
>
>  enum PINT_context_type
>  {
> @@ -18,6 +19,19 @@ enum PINT_context_type
>
>  typedef PVFS_id_gen_t PINT_context_id;
>
> +struct PINT_op_entry
> +{
> +   void *user_ptr;
> +   PINT_operation_t op;
> +   PVFS_id_gen_t wq_id;
> +   PINT_context_id ctx_id;
> +   PVFS_error error;
> +
> +   struct qhash_head link;
> +};
> +
> +
> +
>  typedef int (*PINT_completion_callback)(PINT_context_id ctx_id,
>                                         int count,
>                                         PINT_op_id *op_ids,
> diff --git a/src/common/mgmt/pint-mgmt.c b/src/common/mgmt/pint-mgmt.c
> index fa52d8d..120a5a1 100644
> --- a/src/common/mgmt/pint-mgmt.c
> +++ b/src/common/mgmt/pint-mgmt.c
> @@ -42,17 +42,6 @@ struct PINT_worker_s
>     struct qlist_head link;
>  };
>
> -struct PINT_op_entry
> -{
> -    void *user_ptr;
> -    PINT_operation_t op;
> -    PVFS_id_gen_t wq_id;
> -    PINT_context_id ctx_id;
> -    PVFS_error error;
> -
> -    struct qhash_head link;
> -};
> -
>  struct PINT_manager_s
>  {
>     PINT_context_id context;
> @@ -634,12 +623,15 @@ int PINT_manager_ctx_post(PINT_manager_t manager,
>         return ret;
>     }
>
> -    id_gen_fast_register(&op_entry->op.id, op_entry);
> -
>     gossip_debug(GOSSIP_MGMT_DEBUG,
>                  "[MGMT]: manager ops: adding op id: %llu\n",
>                  llu(op_entry->op.id));
>     gen_mutex_lock(&manager->mutex);
> +    ret = id_gen_safe_register(&op_entry->op.id, op_entry);
> +    if (ret < 0)
> +    {
> +       return ret;
> +    }
>     qhash_add(manager->ops, &op_entry->op.id, &op_entry->link);
>     manager->op_count++;
>     gen_mutex_unlock(&manager->mutex);
> @@ -657,6 +649,7 @@ int PINT_manager_ctx_post(PINT_manager_t manager,
>          * operation and return the error
>          */
>         gen_mutex_lock(&manager->mutex);
> +        id_gen_safe_unregister(op_entry->op.id);
>         qhash_search_and_remove(manager->ops, &op_entry->op.id);
>         gen_mutex_unlock(&manager->mutex);
>
> @@ -1047,7 +1040,7 @@ int PINT_manager_test_op(PINT_manager_t manager,
>     struct PINT_op_entry *entry;
>     PINT_context_id context;
>
> -    entry = id_gen_fast_lookup(op_id);
> +    entry = id_gen_safe_lookup(op_id);
>     if(!entry)
>     {
>         return -PVFS_EINVAL;
> @@ -1143,13 +1136,17 @@ int PINT_manager_cancel(PINT_manager_t manager,
>                        PINT_queue_id,
>                        PINT_operation_t *);
>
> -    entry = id_gen_fast_lookup(op_id);
> +    entry = id_gen_safe_lookup(op_id);
>     if(!entry)
>     {
>         return -PVFS_EINVAL;
>     }
>     ret = PINT_manager_find_worker(
>         manager, NULL, NULL, NULL, entry->wq_id, &worker, &queue_id);
> +    if (ret != 0)
> +    {
> +       return ret;
> +    }
>     context = entry->ctx_id;
>     op = &entry->op;
>
> @@ -1215,7 +1212,7 @@ int PINT_manager_wait_op(PINT_manager_t manager,
>     int msecs_remaining;
>     struct timeval last, now;
>
> -    entry = id_gen_fast_lookup(op_id);
> +    entry = id_gen_safe_lookup(op_id);
>     if(!entry)
>     {
>         return -PVFS_EINVAL;
> @@ -1451,9 +1448,6 @@ int PINT_manager_complete_op(PINT_manager_t manager,
>      * since blocking calls never add the op
>      * to the table
>      */
> -
> -    /* free the op entry */
> -    free(entry);
>     return ret;
>  }
>
>
> --
> Benjamin Severs
> _______________________________________________
> Pvfs2-developers mailing list
> [email protected]
> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
>



-- 
Becky Ligon
OrangeFS Support and Development
Omnibond Systems
Anderson, South Carolina
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to