Folks,
Since r32672 (trunk), grpcomm/rcd is the default module.
the attached spawn.c test program is a trimmed version of the
spawn_with_env_vars.c test case
from the ibm test suite.
when invoked on two nodes :
- the program hangs with -np 2
- the program can crash with -np > 2
error message is
[node0:30701] [[42913,0],0] TWO RECEIVES WITH SAME PEER [[42913,0],1]
AND TAG -33 - ABORTING
here is my full command line (from node0) :
mpirun -host node0,node1 -np 2 --oversubscribe --mca btl tcp,self --mca
coll ^ml ./spawn
a simple workaround is to add the following extra parameter to the
mpirun command line :
--mca grpcomm_rcd_priority 0
my understanding is that the race condition occurs when all the
processes call MPI_Finalize()
internally, the pmix module will have mpirun/orted issue two ALLGATHER
involving mpirun and orted
(one for job 1 aka the parent, and one for job 2 aka the spawned tasks)
the error message is very explicit : this is not (currently) supported
i wrote the attached rml.patch which is really a workaround and not a fix :
in this case, each job will invoke an ALLGATHER but with a different tag
/* that works for a limited number of jobs only */
i did not commit this patch since this is not a fix, could someone
(Ralph ?) please review the issue and comment ?
Cheers,
Gilles
/*
* $HEADER$
*
* Program to test MPI_Comm_spawn with environment variables.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "mpi.h"
/*
 * Parent side of the test.
 *
 * Checks that `cmd` can be opened on every rank of MPI_COMM_WORLD, then
 * collectively spawns `count` copies of it, synchronizes with the children
 * over the resulting inter-communicator and disconnects from them.
 *
 * cmd   - path of the executable to probe and spawn
 * rank  - rank of the caller in MPI_COMM_WORLD
 * count - the incoming value is ignored; it is overwritten with the number
 *         of ranks on which `cmd` was found (equal to the communicator size
 *         when the spawn is attempted)
 */
static void do_parent(char *cmd, int rank, int count)
{
    int *errcode;
    MPI_Comm child_inter;
    FILE *fp;
    int found;
    int size;

    /* First, see if cmd exists on all ranks */
    fp = fopen(cmd, "r");
    if (NULL == fp) {
        found = 0;
    } else {
        fclose(fp);
        found = 1;
    }
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Allreduce(&found, &count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (count != size) {
        /* At least one rank cannot see cmd: rank 0 aborts the job */
        if (rank == 0) {
            MPI_Abort(MPI_COMM_WORLD, 77);
        }
        return;
    }

    /* Now try the spawn, since cmd was found on every rank */
    errcode = malloc(sizeof(int) * count);
    if (NULL == errcode) {
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    /* Fill all `count` ints (not just `count` bytes) with -1 */
    memset(errcode, -1, sizeof(int) * count);
    MPI_Comm_spawn(cmd, MPI_ARGV_NULL, count, MPI_INFO_NULL, 0,
                   MPI_COMM_WORLD, &child_inter, errcode);

    /* Clean up */
    MPI_Barrier(child_inter);
    MPI_Comm_disconnect(&child_inter);
    free(errcode);
}
/*
 * Child side of the test: meet the parents at a barrier on the
 * inter-communicator, then disconnect from it.
 *
 * parent - inter-communicator passed in by the caller
 */
static void do_target(MPI_Comm parent)
{
    MPI_Comm to_parent = parent;

    /* Synchronize with the parent job before tearing the link down */
    MPI_Barrier(to_parent);
    MPI_Comm_disconnect(&to_parent);
}
/*
 * Entry point. The same executable acts as both the parent (started by
 * mpirun) and the child (started by MPI_Comm_spawn); the role is chosen
 * from the result of MPI_Comm_get_parent.
 */
int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Comm parent;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Check to see if we *were* spawned -- because this is a test, we
       can only assume the existence of this one executable.  Hence, we
       both mpirun it and spawn it. */
    parent = MPI_COMM_NULL;
    MPI_Comm_get_parent(&parent);
    if (parent != MPI_COMM_NULL) {
        do_target(parent);
    } else {
        do_parent(argv[0], rank, size);
    }

    /* Every rank except 0 waits 3 seconds before finalizing.
       sleep() requires <unistd.h>. */
    if (0 < rank) {
        sleep(3);
    }
    MPI_Finalize();

    /* All done */
    return 0;
}
Index: orte/mca/grpcomm/brks/grpcomm_brks.c
===================================================================
--- orte/mca/grpcomm/brks/grpcomm_brks.c (revision 32688)
+++ orte/mca/grpcomm/brks/grpcomm_brks.c (working copy)
@@ -6,6 +6,8 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -111,6 +113,7 @@
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t
distance) {
orte_process_name_t peer_send, peer_recv;
opal_buffer_t *send_buf;
+ orte_rml_tag_t tag;
int rc;
peer_send.jobid = ORTE_PROC_MY_NAME->jobid;
@@ -174,8 +177,14 @@
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_send)));
+ if (1 != coll->sig->sz || ORTE_VPID_WILDCARD !=
coll->sig->signature[0].vpid) {
+ tag = ORTE_RML_TAG_ALLGATHER;
+ } else {
+ tag = ORTE_RML_TAG_JOB_ALLGATHER +
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) %
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+ }
+
if (0 > (rc = orte_rml.send_buffer_nb(&peer_send, send_buf,
- -ORTE_RML_TAG_ALLGATHER,
+ -tag,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
@@ -189,7 +198,7 @@
/* setup recv for distance data */
orte_rml.recv_buffer_nb(&peer_recv,
- -ORTE_RML_TAG_ALLGATHER,
+ -tag,
ORTE_RML_NON_PERSISTENT,
brks_allgather_recv_dist, NULL);
Index: orte/mca/grpcomm/rcd/grpcomm_rcd.c
===================================================================
--- orte/mca/grpcomm/rcd/grpcomm_rcd.c (revision 32688)
+++ orte/mca/grpcomm/rcd/grpcomm_rcd.c (working copy)
@@ -6,6 +6,8 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -105,6 +107,7 @@
static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t
distance) {
orte_process_name_t peer;
opal_buffer_t *send_buf;
+ orte_rml_tag_t tag;
int rc;
peer.jobid = ORTE_PROC_MY_NAME->jobid;
@@ -163,8 +166,14 @@
ORTE_NAME_PRINT(&peer)));
+ if (1 != coll->sig->sz || ORTE_VPID_WILDCARD !=
coll->sig->signature[0].vpid) {
+ tag = ORTE_RML_TAG_ALLGATHER;
+ } else {
+ tag = ORTE_RML_TAG_JOB_ALLGATHER +
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) %
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+ }
+
if (0 > (rc = orte_rml.send_buffer_nb(&peer, send_buf,
- -ORTE_RML_TAG_ALLGATHER,
+ -tag,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
@@ -178,7 +187,7 @@
/* setup recv for distance data */
orte_rml.recv_buffer_nb(&peer,
- -ORTE_RML_TAG_ALLGATHER,
+ -tag,
ORTE_RML_NON_PERSISTENT,
rcd_allgather_recv_dist, NULL);
Index: orte/mca/grpcomm/direct/grpcomm_direct.c
===================================================================
--- orte/mca/grpcomm/direct/grpcomm_direct.c (revision 32688)
+++ orte/mca/grpcomm/direct/grpcomm_direct.c (working copy)
@@ -6,6 +6,8 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -70,6 +72,7 @@
*/
static int init(void)
{
+ orte_rml_tag_t tag;
OBJ_CONSTRUCT(&tracker, opal_list_t);
/* post the receives */
@@ -81,6 +84,12 @@
ORTE_RML_TAG_ALLGATHER,
ORTE_RML_PERSISTENT,
allgather_recv, NULL);
+ for (tag=ORTE_RML_TAG_JOB_ALLGATHER; tag<ORTE_RML_TAG_MAX; tag++) {
+ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+ tag,
+ ORTE_RML_PERSISTENT,
+ allgather_recv, NULL);
+ }
/* setup recv for barrier release */
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_COLL_RELEASE,
@@ -125,6 +134,7 @@
int rc, ret;
opal_buffer_t *relay;
orte_job_t *jdata;
+ orte_rml_tag_t tag;
uint64_t nprocs;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
@@ -189,8 +199,15 @@
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* send the info to the HNP for tracking */
+
+ if (1 != coll->sig->sz || ORTE_VPID_WILDCARD !=
coll->sig->signature[0].vpid) {
+ tag = ORTE_RML_TAG_ALLGATHER;
+ } else {
+ tag = ORTE_RML_TAG_JOB_ALLGATHER +
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) %
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+ }
+
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, relay,
- ORTE_RML_TAG_ALLGATHER,
+ tag,
orte_rml_send_callback, NULL);
return rc;
}
Index: orte/mca/rml/rml_types.h
===================================================================
--- orte/mca/rml/rml_types.h (revision 32688)
+++ orte/mca/rml/rml_types.h (working copy)
@@ -12,6 +12,8 @@
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -152,6 +154,8 @@
/* global collective ID request */
#define ORTE_RML_TAG_FULL_COLL_ID 51
+#define ORTE_RML_TAG_JOB_ALLGATHER 52
+
#define ORTE_RML_TAG_MAX 100