Not quite right. There still is no topology information at collective selection time for either graph or dist graph.
-Nathan On Tue, Sep 30, 2014 at 02:03:27PM +0900, Gilles Gouaillardet wrote: > Nathan, > > here is a revision of the previously attached patch, and that supports > graph and dist graph. > > Cheers, > > Gilles > > On 2014/09/30 0:05, Nathan Hjelm wrote: > > An equivalent change would need to be made for graph and dist graph as > well. That will take a little more work. Also, I was avoiding changing > anything in topo for 1.8. > > -Nathan > > On Mon, Sep 29, 2014 at 08:02:41PM +0900, Gilles Gouaillardet wrote: > > Nathan, > > why not just make the topology information available at that point as you > described it ? > > the attached patch does this, could you please review it ? > > Cheers, > > Gilles > > On 2014/09/26 2:50, Nathan Hjelm wrote: > > On Tue, Aug 26, 2014 at 07:03:24PM +0300, Lisandro Dalcin wrote: > > I finally managed to track down some issues in mpi4py's test suite > using Open MPI 1.8+. The code below should be enough to reproduce the > problem. Run it under valgrind to make sense of my following > diagnostics. > > In this code I'm creating a 2D, periodic Cartesian topology out of > COMM_SELF. In this case, the process in COMM_SELF has 4 logical in/out > links to itself. So we have size=1 but indegree=outdegree=4. However, > in ompi/mca/coll/basic/coll_basic_module.c, "size * 2" request are > being allocated to manage communication: > > if (OMPI_COMM_IS_INTER(comm)) { > size = ompi_comm_remote_size(comm); > } else { > size = ompi_comm_size(comm); > } > basic_module->mccb_num_reqs = size * 2; > basic_module->mccb_reqs = (ompi_request_t**) > malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); > > I guess you have to also special-case for topologies and allocate > indegree+outdegree requests (not sure about this number, just > guessing). > > > I wish this was possible but the topology information is not available > at that point. We may be able to change that but I don't see the work > completing anytime soon. I committed an alternative fix as r32796 and > CMR'd it to 1.8.3. I can confirm that the attached reproducer no longer > produces a SEGV. Let me know if you run into any more issues. > > > -Nathan > > _______________________________________________ > devel mailing list > de...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel > Link to this post: > http://www.open-mpi.org/community/lists/devel/2014/09/15915.php > > Index: ompi/mca/topo/base/topo_base_cart_create.c > =================================================================== > --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32807) > +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy) > @@ -163,10 +163,18 @@ > return MPI_ERR_INTERN; > } > > + assert(NULL == new_comm->c_topo); > + assert(!(new_comm->c_flags & OMPI_COMM_CART)); > + new_comm->c_topo = topo; > + new_comm->c_topo->mtc.cart = cart; > + new_comm->c_topo->reorder = reorder; > + new_comm->c_flags |= OMPI_COMM_CART; > ret = ompi_comm_enable(old_comm, new_comm, > new_rank, num_procs, topo_procs); > if (OMPI_SUCCESS != ret) { > /* something wrong happened during setting the communicator */ > + new_comm->c_topo = NULL; > + new_comm->c_flags &= ~OMPI_COMM_CART; > ompi_comm_free (&new_comm); > free(topo_procs); > if(NULL != cart->periods) free(cart->periods); > @@ -176,10 +184,6 @@ > return ret; > } > > - new_comm->c_topo = topo; > - new_comm->c_topo->mtc.cart = cart; > - new_comm->c_topo->reorder = reorder; > - new_comm->c_flags |= OMPI_COMM_CART; > *comm_topo = new_comm; > > if( MPI_UNDEFINED == new_rank ) { > Index: ompi/mca/coll/basic/coll_basic_module.c > =================================================================== > --- ompi/mca/coll/basic/coll_basic_module.c (revision 32807) > +++ ompi/mca/coll/basic/coll_basic_module.c (working copy) > @@ -13,6 +13,8 @@ > * Copyright (c) 2012 Sandia National Laboratories. All rights > reserved. > * Copyright (c) 2013 Los Alamos National Security, LLC. All rights > * reserved. > + * Copyright (c) 2014 Research Organization for Information Science > + * and Technology (RIST). All rights reserved. > * $COPYRIGHT$ > * > * Additional copyrights may follow > @@ -28,6 +30,7 @@ > #include "mpi.h" > #include "ompi/mca/coll/coll.h" > #include "ompi/mca/coll/base/base.h" > +#include "ompi/mca/topo/topo.h" > #include "coll_basic.h" > > > @@ -70,6 +73,15 @@ > } else { > size = ompi_comm_size(comm); > } > + if (comm->c_flags & OMPI_COMM_CART) { > + int cart_size; > + assert (NULL != comm->c_topo); > + comm->c_topo->topo.cart.cartdim_get(comm, &cart_size); > + cart_size *= 2; > + if (cart_size > size) { > + size = cart_size; > + } > + } > basic_module->mccb_num_reqs = size * 2; > basic_module->mccb_reqs = (ompi_request_t**) > malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); > > _______________________________________________ > devel mailing list > de...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel > Link to this post: > http://www.open-mpi.org/community/lists/devel/2014/09/15929.php > > _______________________________________________ > devel mailing list > de...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel > Link to this post: > http://www.open-mpi.org/community/lists/devel/2014/09/15930.php > Index: ompi/mca/topo/base/topo_base_cart_create.c > =================================================================== > --- ompi/mca/topo/base/topo_base_cart_create.c (revision 32815) > +++ ompi/mca/topo/base/topo_base_cart_create.c (working copy) > @@ -163,10 +163,18 @@ > return MPI_ERR_INTERN; > } > > + assert(NULL == new_comm->c_topo); > + assert(!(new_comm->c_flags & OMPI_COMM_CART)); > + new_comm->c_topo = topo; > + new_comm->c_topo->mtc.cart = cart; > + new_comm->c_topo->reorder = reorder; > + new_comm->c_flags |= OMPI_COMM_CART; > ret = ompi_comm_enable(old_comm, new_comm, > new_rank, num_procs, topo_procs); > if (OMPI_SUCCESS != ret) { > /* something wrong happened during setting the communicator */ > + new_comm->c_topo = NULL; > + new_comm->c_flags &= ~OMPI_COMM_CART; > ompi_comm_free (&new_comm); > free(topo_procs); > if(NULL != cart->periods) free(cart->periods); > @@ -176,10 +184,6 @@ > return ret; > } > > - new_comm->c_topo = topo; > - new_comm->c_topo->mtc.cart = cart; > - new_comm->c_topo->reorder = reorder; > - new_comm->c_flags |= OMPI_COMM_CART; > *comm_topo = new_comm; > > if( MPI_UNDEFINED == new_rank ) { > Index: ompi/mca/coll/basic/coll_basic_module.c > =================================================================== > --- ompi/mca/coll/basic/coll_basic_module.c (revision 32815) > +++ ompi/mca/coll/basic/coll_basic_module.c (working copy) > @@ -13,6 +13,8 @@ > * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. > * Copyright (c) 2013 Los Alamos National Security, LLC. All rights > * reserved. > + * Copyright (c) 2014 Research Organization for Information Science > + * and Technology (RIST). All rights reserved. > * $COPYRIGHT$ > * > * Additional copyrights may follow > @@ -28,6 +30,8 @@ > #include "mpi.h" > #include "ompi/mca/coll/coll.h" > #include "ompi/mca/coll/base/base.h" > +#include "ompi/mca/topo/topo.h" > +#include "ompi/mca/topo/base/base.h" > #include "coll_basic.h" > > > @@ -70,7 +74,36 @@ > } else { > size = ompi_comm_size(comm); > } > - basic_module->mccb_num_reqs = size * 2; > + size *= 2; > + if (OMPI_COMM_IS_CART(comm)) { > + int cart_size; > + mca_topo_base_comm_cart_2_1_0_t *cart; > + assert (NULL != comm->c_topo); > + cart = comm->c_topo->mtc.cart; > + cart_size = cart->ndims * 4; > + if (cart_size > size) { > + size = cart_size; > + } > + } else if (OMPI_COMM_IS_GRAPH(comm)) { > + int rank, degree; > + assert (NULL != comm->c_topo); > + rank = ompi_comm_rank (comm); > + mca_topo_base_graph_neighbors_count (comm, rank, °ree); > + degree *= 2; > + if (degree > size) { > + size = degree; > + } > + } else if (OMPI_COMM_IS_DIST_GRAPH(comm)) { > + int dist_graph_size; > + mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph; > + assert (NULL != comm->c_topo); > + dist_graph = comm->c_topo->mtc.dist_graph; > + dist_graph_size = dist_graph->indegree + dist_graph->outdegree; > + if (dist_graph_size > size) { > + size = dist_graph_size; > + } > + } > + basic_module->mccb_num_reqs = size; > basic_module->mccb_reqs = (ompi_request_t**) > malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); > > _______________________________________________ > devel mailing list > de...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/devel > Link to this post: > http://www.open-mpi.org/community/lists/devel/2014/09/15941.php
pgp_NxQZWFh88.pgp
Description: PGP signature