+Valentine

Jeff,

Hcoll uses the PML as an "OOB" to bootstrap itself. When a communicator is 
destroyed, by the time we destroy the hcoll module, the communicator context is 
no longer valid and any pending operations that rely on its existence will 
fail. In particular, we have a non-blocking synchronization barrier that may be 
in progress when the communicator is destroyed. Registering the delete callback 
allows us to finish these operations because the context is still valid inside 
of this callback. The commented out code is the "prototype" protocol that 
attempted to handle this scenario in an entirely different (and more complex) 
way. It is not needed now. We don't want to introduce solutions that are OMPI 
specific, because we need to be able to integrate hcoll into other runtimes. We 
considered approaching the community about changing the comm destroy flow in 
OMPI to keep the context alive long enough to complete our synchronization 
barriers, but then the solution is tied to a particular MPI.  

Is this a reasonable way to go about this?

Josh

-----Original Message-----
From: devel [mailto:devel-boun...@open-mpi.org] On Behalf Of Jeff Squyres 
(jsquyres)
Sent: Thursday, January 09, 2014 6:46 AM
To: <de...@open-mpi.org>
Cc: Devendar Bureddy; Mike Dubman
Subject: [OMPI devel] hcoll destruction via MPI attribute

Mike / Devendar --

I'd like to understand the use case for this a bit more, from the perspective 
of: is there more infrastructure that we need to provide in the coll framework? 
 Can you explain what you're trying to do, and when you need your cleanup 
action(s) to run?

I ask because this seems a little odd (use an MPI attribute to trigger 
something during shutdown).  Also, you commented out a bunch of code that 
doesn't seem to have been replaced with anything -- the attribute delete 
function simply calls hcoll_group_destroy_notify(); it doesn't do any of the 
things that the now-commented-out code in the hcoll module destructor does.

Thanks.




On Jan 9, 2014, at 6:27 AM, <svn-commit-mai...@open-mpi.org> wrote:

> Author: miked (Mike Dubman)
> Date: 2014-01-09 06:27:24 EST (Thu, 09 Jan 2014) New Revision: 30175
> URL: https://svn.open-mpi.org/trac/ompi/changeset/30175
> 
> Log:
> Create a comm keyval for hcoll component with delete callback function.
> Set comm attribute with keyval.
> Wait for pending hcoll module tasks in comm delete callback where PML 
> still valid on the communicator. safely destroy hcoll context during 
> hcoll module destructor.
> 
> Author: Devendar Bureddy
> reviewed by miked
> 
> cmr=v1.7.4:reviewer=ompi-rm1.7
> 
> Text files modified: 
>   trunk/ompi/mca/coll/hcoll/coll_hcoll.h        |     1                       
>                   
>   trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c |    51 
> ++++++++++++++++++++++++++++++++++++--- 
>   2 files changed, 48 insertions(+), 4 deletions(-)
> 
> Modified: trunk/ompi/mca/coll/hcoll/coll_hcoll.h
> ==============================================================================
> --- trunk/ompi/mca/coll/hcoll/coll_hcoll.h    Thu Jan  9 00:56:29 2014        
> (r30174)
> +++ trunk/ompi/mca/coll/hcoll/coll_hcoll.h    2014-01-09 06:27:24 EST (Thu, 
> 09 Jan 2014)      (r30175)
> @@ -19,6 +19,7 @@
> #include "ompi/mca/pml/pml.h"
> #include "ompi/mca/coll/base/coll_tags.h"
> #include "ompi/communicator/communicator.h"
> +#include "ompi/attribute/attribute.h"
> #include "ompi/op/op.h"
> 
> #include "orte/runtime/orte_globals.h"
> 
> Modified: trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c
> ==============================================================================
> --- trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c     Thu Jan  9 00:56:29 
> 2014        (r30174)
> +++ trunk/ompi/mca/coll/hcoll/coll_hcoll_module.c     2014-01-09 06:27:24 EST 
> (Thu, 09 Jan 2014)      (r30175)
> @@ -10,6 +10,8 @@
> #include "ompi_config.h"
> #include "coll_hcoll.h"
> 
> +int hcoll_comm_attr_keyval;
> +
> /*
>  * Initial query function that is invoked during MPI_INIT, allowing
>  * this module to indicate what level of thread support it provides.
> @@ -57,6 +59,7 @@
>     am = &mca_coll_hcoll_component.active_modules;
> 
>     if (hcoll_module->comm == &ompi_mpi_comm_world.comm){
> +#if 0
>         /* If we get here then we are detroying MPI_COMM_WORLD now. So,
>          * it is safe to destory all the other communicators and corresponding
>          * hcoll contexts that could still be on the "active_modules" list.
> @@ -86,6 +89,10 @@
>                                   (rte_grp_handle_t)hcoll_module->comm,
>                                   &context_destroyed);
>         }
> +#endif
> +        if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, 
> &hcoll_comm_attr_keyval, 0)) {
> +            HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed");
> +        }
>     }
> 
>     OBJ_RELEASE(hcoll_module->previous_barrier_module);
> @@ -104,6 +111,11 @@
>     OBJ_RELEASE(hcoll_module->previous_ibcast_module);
>     OBJ_RELEASE(hcoll_module->previous_iallreduce_module);
>     OBJ_RELEASE(hcoll_module->previous_iallgather_module);
> +    context_destroyed = 0;
> +    hcoll_destroy_context(hcoll_module->hcoll_context,
> +                                  (rte_grp_handle_t)hcoll_module->comm,
> +                                  &context_destroyed);
> +    assert(context_destroyed);
> 
>     mca_coll_hcoll_module_clear(hcoll_module);
> }
> @@ -142,7 +154,18 @@
>     return OMPI_SUCCESS;
> }
> 
> +/*
> +** Communicator free callback
> +*/
> +int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, 
> +void *extra) {
> +
> +    mca_coll_hcoll_module_t *hcoll_module = 
> + (mca_coll_hcoll_module_t*) attr_val;
> 
> +    hcoll_group_destroy_notify(hcoll_module->hcoll_context);
> +    return OMPI_SUCCESS;
> +
> +}
> /*
>  * Initialize module on the communicator  */ @@ -150,6 +173,7 @@
>                                         struct ompi_communicator_t 
> *comm) {
>     mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) 
> module;
> +    int ret;
>     hcoll_module->comm = comm;
>     if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
>         HCOL_ERROR("coll_hcol: __save_coll_handlers failed"); @@ 
> -166,6 +190,7 @@
>         return OMPI_ERROR;
>     }
> 
> +#if 0
>     if (comm != &ompi_mpi_comm_world.comm){
>         mca_coll_hcoll_module_list_item_wrapper_t *mw =
>             OBJ_NEW(mca_coll_hcoll_module_list_item_wrapper_t);
> @@ -174,6 +199,13 @@
>         opal_list_append(&mca_coll_hcoll_component.active_modules,
>                          (opal_list_item_t*)mw);
>     }
> +#endif
> +
> +    ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, 
> hcoll_comm_attr_keyval, (void *)hcoll_module, false);
> +    if (OMPI_SUCCESS != ret) {
> +        HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed");
> +        return OMPI_ERROR;
> +    }
> 
>     return OMPI_SUCCESS;
> }
> @@ -197,7 +229,7 @@
>     if (ompi_mpi_finalized){
>         hcoll_rte_p2p_disabled_notify();
>     }
> -
> +#if 0
>     item = opal_list_get_first(am);
>     while (item != opal_list_get_end(am)){
>         item_next = opal_list_get_next(item); @@ -225,10 +257,9 @@
>         }
>         item = item_next;
>     }
> +#endif
> 
> -    if (!ompi_mpi_finalized) {
> -        (*hcoll_progress_fn)();
> -    }
> +    (*hcoll_progress_fn)();
>     OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,-1);
>     return OMPI_SUCCESS;
> }
> @@ -245,6 +276,9 @@
>     mca_coll_base_module_t *module;
>     mca_coll_hcoll_module_t *hcoll_module;
>     static bool libhcoll_initialized = false;
> +    ompi_attribute_fn_ptr_union_t del_fn;
> +    ompi_attribute_fn_ptr_union_t copy_fn;
> +    int err;
>     int rc;
> 
>     *priority = 0;
> @@ -269,6 +303,15 @@
>             HCOL_VERBOSE(0,"Hcol library init failed");
>             return NULL;
>         }
> +
> +        copy_fn.attr_communicator_copy_fn = 
> (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN;
> +        del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn; 
> +        err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, 
> &hcoll_comm_attr_keyval, NULL ,0, NULL);
> +        if (OMPI_SUCCESS != err) {
> +            HCOL_VERBOSE(0,"Hcol comm keyval create failed");
> +            return NULL;
> +        }
> +
>         libhcoll_initialized = true;
>     }
>     hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t); 
> _______________________________________________
> svn-full mailing list
> svn-f...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full


--
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to