Hi Ralph
I have found a bug in the 'grpcomm' : 'hier'. This bug create a
infinite loop in mpi_finalize. In this module: the barrier is executed
as an allgather with data length of zero. This allgather function can go
to an infinite loop , depend of rank execution order.
In orte/mca/grpcomm/hier/grpcomm_hier_module.c:402, this allgather
function need send/recv a piece of data:
- before the receive loop : the receive counter is reseted and the
"allgather_recv()" resistered.
- the "allgather_recv" function is not unregistered after the receive
loop. The receive counter is incremented by "allgather_recv".
This allgather function is use many times. So when the data is received
before reseting the receive counter , the receive loop will always wait
data.
I have fix this with a patch( I reset the counter after the receive loop):
--- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c Fri Dec 04
19:59:26 2009 +0100
+++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c Mon Dec 07
16:35:40 2009 +0100
@@ -251,7 +251,7 @@
}
static opal_buffer_t allgather_buf;
-static int allgather_num_recvd;
+static int allgather_num_recvd=0;
static void process_msg(int fd, short event, void *data)
{
@@ -366,7 +366,6 @@
/* now receive the final result. Be sure to do this in
* a manner that allows us to return without being in a recv!
*/
- allgather_num_recvd = 0;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_ALLGATHER,
ORTE_RML_NON_PERSISTENT,
allgather_recv, NULL);
if (rc != ORTE_SUCCESS) {
@@ -375,6 +374,7 @@
}
ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, 1);
+ allgather_num_recvd = 0;
/* copy payload to the caller's buffer */
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf,
&allgather_buf))) {
@@ -394,7 +394,6 @@
/* wait to receive their data. Be sure to do this in
* a manner that allows us to return without being in a recv!
*/
- allgather_num_recvd = 0;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_ALLGATHER,
ORTE_RML_NON_PERSISTENT,
allgather_recv, NULL);
if (rc != ORTE_SUCCESS) {
@@ -403,6 +402,7 @@
}
ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, num_local_peers);
+ allgather_num_recvd = 0;
/* take the recv'd data and use one of the base collectives
* to exchange it with all other local_rank=0 procs in a scalable
An another way is to unregister "allgather_recv()" after the receive loop.
--- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c Fri Dec 04
19:59:26 2009 +0100
+++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c Mon Dec 07
17:46:13 2009 +0100
@@ -375,7 +375,7 @@
}
ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, 1);
-
+ if(ORTE_SUCCESS!= (rc =
orte_rml.recv_cancel(ORTE_NAME_WILDCARD,ORTE_RML_TAG_ALLGATHER))) {
ORTE_ERROR_LOG(rc); return rc;}
/* copy payload to the caller's buffer */
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf,
&allgather_buf))) {
ORTE_ERROR_LOG(rc);
@@ -403,7 +403,7 @@
}
ORTE_PROGRESSED_WAIT(false, allgather_num_recvd, num_local_peers);
-
+ if(ORTE_SUCCESS!= (rc =
orte_rml.recv_cancel(ORTE_NAME_WILDCARD,ORTE_RML_TAG_ALLGATHER))) {
ORTE_ERROR_LOG(rc); return rc;}
/* take the recv'd data and use one of the base collectives
* to exchange it with all other local_rank=0 procs in a scalable
* manner - the exact collective will depend upon the number of
Do you have any preferences ?
Thanks
Damien