Ben: Do you have a copy of the original fix, "pint-mgmt-segv-fix.patch" that Sam sent to you? If so, can you send it to me?
Thanks, Becky On Thu, Jun 30, 2011 at 12:08 PM, Becky Ligon <[email protected]> wrote: > I think I made these changes due to a problem that we were having on our > Palmetto system. Just a quick look leads me to believe that the change to > PINT_context_test_all should not have freed the op_id; I suspect that this > may prevent a cleanup of it later down the road. I will look at it closer > to see if that is the case. The change to PINT_context_test is okay, I > think. > > Becky > > > On Thu, Jun 30, 2011 at 11:38 AM, Becky Ligon <[email protected]> wrote: > >> I remember making this change but I'm not certain if I found the problem >> or was asked to add this to the repository. Seems like Sam Lang was helping >> someone with a problem like this and then I patched it. In any case, I'll >> track it down and see what I can do. >> >> Becky >> >> >> On Thu, Jun 30, 2011 at 11:20 AM, Benjamin Severs < >> [email protected]> wrote: >> >>> On Wednesday, June 29, 2011 08:49:46 Michael Moore wrote: >>> > Bart, >>> > >>> > This doesn't look like a leak in the "can't free memory because there >>> isn't >>> > a reference to it" sense. However, there is definitely memory usage >>> growth, >>> > although, I haven't been able to replicate usage with quite as steep a >>> > curve as your graph. That graph was based on system memory usage? Would >>> > you mind running the same test and collecting the output from >>> > /proc/<PID>/statm at some interval while the test is running where PID >>> is >>> > the server process' PID? Heap profiling so far makes it look like the >>> > increased usage is coming from libraries not due to memory allocations >>> > within OrangeFS but no definitive answer yet. >>> > >>> > Has anyone seen this behavior in the past? >>> > >>> > Thanks, >>> > Michael >>> > >>> >>> I believe I tracked the commit down which introduced the leak. I can >>> revert >>> this commit and the leak appears to go away. I haven't yet figured out >>> what is >>> actually leaking though. >>> >>> The commit is: >>> Author: bligon <bligon> >>> Date: Fri Nov 5 16:09:27 2010 +0000 >>> >>> Corrects problem where the server tries to cancel a timed-out i/o >>> operation but >>> before the cancel can happen, the operation finishes; so, the cancel >>> can't >>> find the op id and subsequently caused a segfault.' >>> >>> >>> diff --git a/src/common/mgmt/pint-context.c >>> b/src/common/mgmt/pint-context.c >>> index 0a07256..cda038c 100644 >>> --- a/src/common/mgmt/pint-context.c >>> +++ b/src/common/mgmt/pint-context.c >>> @@ -422,10 +422,17 @@ int PINT_context_test_all(PINT_context_id >>> context_id, >>> >>> for(; i < *count; ++i) >>> { >>> + struct PINT_op_entry *op_entry; >>> qentry = user_ptrs[i]; >>> ctx_entry = PINT_queue_entry_object( >>> qentry, struct PINT_context_queue_entry, qentry); >>> op_ids[i] = ctx_entry->op_id; >>> + op_entry = id_gen_safe_lookup(ctx_entry->op_id); >>> + if (op_entry) >>> + { >>> + id_gen_safe_unregister(op_entry->op.id); >>> + free(op_entry); >>> + } >>> uptr = ctx_entry->user_ptr; >>> errors[i] = ctx_entry->result; >>> user_ptrs[i] = uptr; >>> @@ -519,11 +526,19 @@ int PINT_context_test(PINT_context_id context_id, >>> &op_id, &qentry, microsecs); >>> if(ret == 0) >>> { >>> + struct PINT_op_entry *op_entry; >>> entry = PINT_queue_entry_object( >>> qentry, >>> struct PINT_context_queue_entry, qentry); >>> + op_entry = id_gen_safe_lookup(entry->op_id); >>> + if (op_entry) >>> + { >>> + id_gen_safe_unregister(op_entry->op.id); >>> + free(op_entry); >>> + } >>> *user_ptr = entry->user_ptr; >>> *error = entry->result; >>> + >>> free(entry); >>> } >>> >>> diff --git a/src/common/mgmt/pint-context.h >>> b/src/common/mgmt/pint-context.h >>> index 78dfb4b..147be5c 100644 >>> --- a/src/common/mgmt/pint-context.h >>> +++ b/src/common/mgmt/pint-context.h >>> @@ -9,6 +9,7 @@ >>> >>> #include "pint-op.h" >>> #include "pint-queue.h" >>> +#include "quickhash.h" >>> >>> enum PINT_context_type >>> { >>> @@ -18,6 +19,19 @@ enum PINT_context_type >>> >>> typedef PVFS_id_gen_t PINT_context_id; >>> >>> +struct PINT_op_entry >>> +{ >>> + void *user_ptr; >>> + PINT_operation_t op; >>> + PVFS_id_gen_t wq_id; >>> + PINT_context_id ctx_id; >>> + PVFS_error error; >>> + >>> + struct qhash_head link; >>> +}; >>> + >>> + >>> + >>> typedef int (*PINT_completion_callback)(PINT_context_id ctx_id, >>> int count, >>> PINT_op_id *op_ids, >>> diff --git a/src/common/mgmt/pint-mgmt.c b/src/common/mgmt/pint-mgmt.c >>> index fa52d8d..120a5a1 100644 >>> --- a/src/common/mgmt/pint-mgmt.c >>> +++ b/src/common/mgmt/pint-mgmt.c >>> @@ -42,17 +42,6 @@ struct PINT_worker_s >>> struct qlist_head link; >>> }; >>> >>> -struct PINT_op_entry >>> -{ >>> - void *user_ptr; >>> - PINT_operation_t op; >>> - PVFS_id_gen_t wq_id; >>> - PINT_context_id ctx_id; >>> - PVFS_error error; >>> - >>> - struct qhash_head link; >>> -}; >>> - >>> struct PINT_manager_s >>> { >>> PINT_context_id context; >>> @@ -634,12 +623,15 @@ int PINT_manager_ctx_post(PINT_manager_t manager, >>> return ret; >>> } >>> >>> - id_gen_fast_register(&op_entry->op.id, op_entry); >>> - >>> gossip_debug(GOSSIP_MGMT_DEBUG, >>> "[MGMT]: manager ops: adding op id: %llu\n", >>> llu(op_entry->op.id)); >>> gen_mutex_lock(&manager->mutex); >>> + ret = id_gen_safe_register(&op_entry->op.id, op_entry); >>> + if (ret < 0) >>> + { >>> + return ret; >>> + } >>> qhash_add(manager->ops, &op_entry->op.id, &op_entry->link); >>> manager->op_count++; >>> gen_mutex_unlock(&manager->mutex); >>> @@ -657,6 +649,7 @@ int PINT_manager_ctx_post(PINT_manager_t manager, >>> * operation and return the error >>> */ >>> gen_mutex_lock(&manager->mutex); >>> + id_gen_safe_unregister(op_entry->op.id); >>> qhash_search_and_remove(manager->ops, &op_entry->op.id); >>> gen_mutex_unlock(&manager->mutex); >>> >>> @@ -1047,7 +1040,7 @@ int PINT_manager_test_op(PINT_manager_t manager, >>> struct PINT_op_entry *entry; >>> PINT_context_id context; >>> >>> - entry = id_gen_fast_lookup(op_id); >>> + entry = id_gen_safe_lookup(op_id); >>> if(!entry) >>> { >>> return -PVFS_EINVAL; >>> @@ -1143,13 +1136,17 @@ int PINT_manager_cancel(PINT_manager_t manager, >>> PINT_queue_id, >>> PINT_operation_t *); >>> >>> - entry = id_gen_fast_lookup(op_id); >>> + entry = id_gen_safe_lookup(op_id); >>> if(!entry) >>> { >>> return -PVFS_EINVAL; >>> } >>> ret = PINT_manager_find_worker( >>> manager, NULL, NULL, NULL, entry->wq_id, &worker, &queue_id); >>> + if (ret != 0) >>> + { >>> + return ret; >>> + } >>> context = entry->ctx_id; >>> op = &entry->op; >>> >>> @@ -1215,7 +1212,7 @@ int PINT_manager_wait_op(PINT_manager_t manager, >>> int msecs_remaining; >>> struct timeval last, now; >>> >>> - entry = id_gen_fast_lookup(op_id); >>> + entry = id_gen_safe_lookup(op_id); >>> if(!entry) >>> { >>> return -PVFS_EINVAL; >>> @@ -1451,9 +1448,6 @@ int PINT_manager_complete_op(PINT_manager_t >>> manager, >>> * since blocking calls never add the op >>> * to the table >>> */ >>> - >>> - /* free the op entry */ >>> - free(entry); >>> return ret; >>> } >>> >>> >>> -- >>> Benjamin Severs >>> _______________________________________________ >>> Pvfs2-developers mailing list >>> [email protected] >>> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers >>> >> >> >> >> -- >> Becky Ligon >> OrangeFS Support and Development >> Omnibond Systems >> Anderson, South Carolina >> >> >> > > > -- > Becky Ligon > OrangeFS Support and Development > Omnibond Systems > Anderson, South Carolina > > > -- Becky Ligon OrangeFS Support and Development Omnibond Systems Anderson, South Carolina
_______________________________________________ Pvfs2-developers mailing list [email protected] http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
