Hi Ralph

I have found a bug in the 'grpcomm' : 'hier'. This bug creates an infinite loop in mpi_finalize. In this module, the barrier is executed as an allgather with a data length of zero. This allgather function can enter an infinite loop, depending on the order in which the ranks execute.

In orte/mca/grpcomm/hier/grpcomm_hier_module.c:402, this allgather function needs to send/receive a piece of data: before the receive loop, the receive counter is reset and "allgather_recv()" is registered; the "allgather_recv" function is not unregistered after the receive loop. The receive counter is incremented by "allgather_recv". This allgather function is used many times, so if the data is received before the receive counter is reset, the receive loop will wait for data forever.

I have fixed this with a patch (I reset the counter after the receive loop):
--- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c Fri Dec 04 19:59:26 2009 +0100 +++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c Mon Dec 07 16:35:40 2009 +0100
@@ -251,7 +251,7 @@
}

static opal_buffer_t allgather_buf;
-static int allgather_num_recvd;
+static int allgather_num_recvd=0;

static void process_msg(int fd, short event, void *data)
{
@@ -366,7 +366,6 @@
        /* now receive the final result. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
-        allgather_num_recvd = 0;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, ORTE_RML_NON_PERSISTENT, allgather_recv, NULL);
        if (rc != ORTE_SUCCESS) {
@@ -375,6 +374,7 @@
        }

        ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, 1);
+       allgather_num_recvd = 0;

        /* copy payload to the caller's buffer */
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather_buf))) {
@@ -394,7 +394,6 @@
        /* wait to receive their data. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
-        allgather_num_recvd = 0;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, ORTE_RML_NON_PERSISTENT, allgather_recv, NULL);
        if (rc != ORTE_SUCCESS) {
@@ -403,6 +402,7 @@
        }

        ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, num_local_peers);
+       allgather_num_recvd = 0;

        /* take the recv'd data and use one of the base collectives
         * to exchange it with all other local_rank=0 procs in a scalable

Another way is to unregister "allgather_recv()" after the receive loop:
--- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c Fri Dec 04 19:59:26 2009 +0100 +++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c Mon Dec 07 17:46:13 2009 +0100
@@ -375,7 +375,7 @@
        }

        ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, 1);
-
+ if(ORTE_SUCCESS!= (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,ORTE_RML_TAG_ALLGATHER))) { ORTE_ERROR_LOG(rc); return rc;}
        /* copy payload to the caller's buffer */
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather_buf))) {
            ORTE_ERROR_LOG(rc);
@@ -403,7 +403,7 @@
        }

        ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, num_local_peers);
- + if(ORTE_SUCCESS!= (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,ORTE_RML_TAG_ALLGATHER))) { ORTE_ERROR_LOG(rc); return rc;}
        /* take the recv'd data and use one of the base collectives
         * to exchange it with all other local_rank=0 procs in a scalable
         * manner - the exact collective will depend upon the number of

Do you have a preference between the two?

Thanks
Damien

Reply via email to