Folks,

Since r32672 (trunk), grpcomm/rcd is the default module.
The attached spawn.c test program is a trimmed version of the
spawn_with_env_vars.c test case
from the ibm test suite.

When invoked on two nodes:
- the program hangs with -np 2
- the program can crash with -np > 2
The error message is:
[node0:30701] [[42913,0],0] TWO RECEIVES WITH SAME PEER [[42913,0],1]
AND TAG -33 - ABORTING

here is my full command line (from node0) :

mpirun -host node0,node1 -np 2 --oversubscribe --mca btl tcp,self --mca
coll ^ml ./spawn

a simple workaround is to add the following extra parameter to the
mpirun command line :
--mca grpcomm_rcd_priority 0

My understanding is that the race condition occurs when all the
processes call MPI_Finalize().
Internally, the pmix module will have mpirun/orted issue two ALLGATHERs
involving mpirun and orted
(one for job 1, aka the parent, and one for job 2, aka the spawned tasks).
The error message is very explicit: this is not (currently) supported.

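I wrote the attached rml.patch, which is really a workaround and not a fix:
with it, each job invokes its ALLGATHER with a different tag
/* that works for a limited number of jobs only: the tag is derived from the
   local jobid modulo (ORTE_RML_TAG_MAX - ORTE_RML_TAG_JOB_ALLGATHER), so
   tags repeat once more jobs than that exist */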

i did not commit this patch since this is not a fix, could someone
(Ralph ?) please review the issue and comment ?


Cheers,

Gilles

/*
 * $HEADER$
 *
 * Program to test MPI_Comm_spawn with environment variables.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "mpi.h"

/*
 * Parent-side half of the test.
 *
 * Checks that `cmd` can be opened for reading on every rank of
 * MPI_COMM_WORLD, then collectively spawns one child process per parent
 * rank, synchronizes with the children over the resulting
 * inter-communicator and disconnects from them.
 *
 * cmd   - path of the executable to probe and spawn
 * rank  - caller's rank in MPI_COMM_WORLD
 * count - used as scratch storage: it is overwritten with the number of
 *         ranks that could open `cmd`; the value passed in is not read
 *
 * If `cmd` is missing on at least one rank, rank 0 aborts the job with
 * exit code 77 and the other ranks return without spawning.
 */
static void do_parent(char *cmd, int rank, int count)
{
    int *errcode;
    MPI_Comm child_inter;
    FILE *fp;
    int found;
    int size;

    /* First, see if cmd exists on all ranks */

    fp = fopen(cmd, "r");
    if (NULL == fp) {
        found = 0;
    } else {
        fclose(fp);
        found = 1;
    }
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    /* Sum of the per-rank "found" flags: equals size only if every rank
       could open cmd */
    MPI_Allreduce(&found, &count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (count != size) {
        if (rank == 0) {
            MPI_Abort(MPI_COMM_WORLD, 77);
        }
        return;
    }

    /* Now try the spawn, since cmd was found everywhere
       (count == size from here on) */

    errcode = malloc(sizeof(*errcode) * (size_t) count);
    if (NULL == errcode) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* Pre-fill the whole array (every byte of every int) so that each
       entry reads as -1 until MPI_Comm_spawn stores a real error code */
    memset(errcode, -1, sizeof(*errcode) * (size_t) count);
    MPI_Comm_spawn(cmd, MPI_ARGV_NULL, count, MPI_INFO_NULL, 0,
                   MPI_COMM_WORLD, &child_inter, errcode);

    /* Clean up */
    MPI_Barrier(child_inter);

    MPI_Comm_disconnect(&child_inter);
    free(errcode);
}


/*
 * Child-side half of the test.
 *
 * Runs in a process that has a parent communicator: it meets the parent
 * job in a barrier on that communicator and then disconnects from it.
 *
 * parent - communicator connecting this process to the job that spawned
 *          it (passed by value; only the local copy is disconnected)
 */
static void do_target(MPI_Comm parent)
{
    MPI_Comm *link_to_parent = &parent;

    /* Rendezvous with the other side of the communicator before
       tearing the connection down */
    MPI_Barrier(*link_to_parent);

    MPI_Comm_disconnect(link_to_parent);
}


/*
 * Entry point of the spawn test.
 *
 * The same executable plays both roles: when it has no parent
 * communicator it acts as the parent job and spawns copies of itself
 * (argv[0]); when it has one, it acts as a spawned child.
 *
 * Always returns 0; failures detected in do_parent() end the job through
 * MPI_Abort instead.
 */
int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Comm parent;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Check to see if we *were* spawned -- because this is a test, we
       can only assume the existence of this one executable.  Hence, we
       both mpirun it and spawn it. */

    parent = MPI_COMM_NULL;
    MPI_Comm_get_parent(&parent);
    if (parent != MPI_COMM_NULL) {
        do_target(parent);
    } else {
        do_parent(argv[0], rank, size);
    }

    /* Every rank except 0 waits three seconds before finalizing.
       NOTE(review): presumably this staggers MPI_Finalize across ranks
       to expose the reported race -- confirm before changing the delay.
       sleep() is declared in <unistd.h>. */
    if (0 < rank) {
        sleep(3);
    }

    MPI_Finalize();

    /* All done */

    return 0;
}
Index: orte/mca/grpcomm/brks/grpcomm_brks.c
===================================================================
--- orte/mca/grpcomm/brks/grpcomm_brks.c        (revision 32688)
+++ orte/mca/grpcomm/brks/grpcomm_brks.c        (working copy)
@@ -6,6 +6,8 @@
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
  *                         rights reserved.
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2014      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -111,6 +113,7 @@
 static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t 
distance) {
     orte_process_name_t peer_send, peer_recv;
     opal_buffer_t *send_buf;
+    orte_rml_tag_t tag;
     int rc;

     peer_send.jobid = ORTE_PROC_MY_NAME->jobid;
@@ -174,8 +177,14 @@
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(&peer_send)));

+    if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != 
coll->sig->signature[0].vpid) {
+        tag = ORTE_RML_TAG_ALLGATHER;
+    } else {
+        tag = ORTE_RML_TAG_JOB_ALLGATHER + 
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % 
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+    }
+
     if (0 > (rc = orte_rml.send_buffer_nb(&peer_send, send_buf,
-                                          -ORTE_RML_TAG_ALLGATHER,
+                                          -tag,
                                           orte_rml_send_callback, NULL))) {
         ORTE_ERROR_LOG(rc);
         OBJ_RELEASE(send_buf);
@@ -189,7 +198,7 @@

     /* setup recv for distance data */
     orte_rml.recv_buffer_nb(&peer_recv,
-                            -ORTE_RML_TAG_ALLGATHER,
+                            -tag,
                             ORTE_RML_NON_PERSISTENT,
                             brks_allgather_recv_dist, NULL);

Index: orte/mca/grpcomm/rcd/grpcomm_rcd.c
===================================================================
--- orte/mca/grpcomm/rcd/grpcomm_rcd.c  (revision 32688)
+++ orte/mca/grpcomm/rcd/grpcomm_rcd.c  (working copy)
@@ -6,6 +6,8 @@
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
  *                         rights reserved.
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2014      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -105,6 +107,7 @@
 static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t 
distance) {
     orte_process_name_t peer;
     opal_buffer_t *send_buf;
+    orte_rml_tag_t tag;
     int rc;

     peer.jobid = ORTE_PROC_MY_NAME->jobid;
@@ -163,8 +166,14 @@
                          ORTE_NAME_PRINT(&peer)));


+    if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != 
coll->sig->signature[0].vpid) {
+        tag = ORTE_RML_TAG_ALLGATHER;
+    } else {
+        tag = ORTE_RML_TAG_JOB_ALLGATHER + 
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % 
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+    }
+
     if (0 > (rc = orte_rml.send_buffer_nb(&peer, send_buf,
-                                          -ORTE_RML_TAG_ALLGATHER,
+                                          -tag,
                                           orte_rml_send_callback, NULL))) {
         ORTE_ERROR_LOG(rc);
         OBJ_RELEASE(send_buf);
@@ -178,7 +187,7 @@

     /* setup recv for distance data */
     orte_rml.recv_buffer_nb(&peer,
-                            -ORTE_RML_TAG_ALLGATHER,
+                            -tag,
                             ORTE_RML_NON_PERSISTENT,
                             rcd_allgather_recv_dist, NULL);

Index: orte/mca/grpcomm/direct/grpcomm_direct.c
===================================================================
--- orte/mca/grpcomm/direct/grpcomm_direct.c    (revision 32688)
+++ orte/mca/grpcomm/direct/grpcomm_direct.c    (working copy)
@@ -6,6 +6,8 @@
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
  *                         rights reserved.
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2014      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -70,6 +72,7 @@
  */
 static int init(void)
 {
+    orte_rml_tag_t tag;
     OBJ_CONSTRUCT(&tracker, opal_list_t);

     /* post the receives */
@@ -81,6 +84,12 @@
                             ORTE_RML_TAG_ALLGATHER,
                             ORTE_RML_PERSISTENT,
                             allgather_recv, NULL);
+    for (tag=ORTE_RML_TAG_JOB_ALLGATHER; tag<ORTE_RML_TAG_MAX; tag++) {
+        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                tag,
+                                ORTE_RML_PERSISTENT,
+                                allgather_recv, NULL);
+    }
     /* setup recv for barrier release */
     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                             ORTE_RML_TAG_COLL_RELEASE,
@@ -125,6 +134,7 @@
     int rc, ret;
     opal_buffer_t *relay;
     orte_job_t *jdata;
+    orte_rml_tag_t tag;
     uint64_t nprocs;

     OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
@@ -189,8 +199,15 @@
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

     /* send the info to the HNP for tracking */
+
+    if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != 
coll->sig->signature[0].vpid) {
+        tag = ORTE_RML_TAG_ALLGATHER;
+    } else {
+        tag = ORTE_RML_TAG_JOB_ALLGATHER + 
ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % 
(ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER);
+    }
+
     rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, relay,
-                                 ORTE_RML_TAG_ALLGATHER,
+                                 tag,
                                  orte_rml_send_callback, NULL);
     return rc;
 }
Index: orte/mca/rml/rml_types.h
===================================================================
--- orte/mca/rml/rml_types.h    (revision 32688)
+++ orte/mca/rml/rml_types.h    (working copy)
@@ -12,6 +12,8 @@
  * Copyright (c) 2007-2012 Los Alamos National Security, LLC.  All rights
  *                         reserved. 
  * Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  * 
  * Additional copyrights may follow
@@ -152,6 +154,8 @@
 /* global collective ID request */
 #define ORTE_RML_TAG_FULL_COLL_ID           51

+#define ORTE_RML_TAG_JOB_ALLGATHER          52
+
 #define ORTE_RML_TAG_MAX                   100


Reply via email to