I have a very simple program that spawns a number of slaves, but I am getting erratic results from it. It seems that all the slave processes are spawned, but not all of them complete MPI_Init() before the main program ends. In addition, I get the following error messages, for which I haven't been able to find any documentation:
[turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/soh_base_get_proc_soh.c at line 80 [turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/oob_base_xcast.c at line 108 [turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/rmgr_base_stage_gate.c at line 276 [turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/soh_base_get_proc_soh.c at line 80 [turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/oob_base_xcast.c at line 108 [turkana:26736] [0,0,0] ORTE_ERROR_LOG: Not found in file base/rmgr_base_stage_gate.c at line 276 I am using openmpi 1.1 on FC4 on a dual AMD Athlon machine. My program is as follows: #include <mpi.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <string.h> int main(int ac, char *av[]) { int rank, size; char name[MPI_MAX_PROCESSOR_NAME]; int nameLen; int n = 5, i; int slave = 0; int errs[5]; char *args[] = { av[0], "-W", NULL}; MPI_Comm intercomm; int err; memset(name, sizeof(name), 0); for(i=1; i<ac; i++){ if (strcmp(av[i],"-W") == 0){ slave = 1; } } fprintf(stderr, "%s before MPI_Init() in %d\n", slave?"slave":"master", getpid()); MPI_Init(&ac, &av); fprintf(stderr, "%s after MPI_Init() in %d\n", slave?"slave":"master", getpid()); if (!slave){ err = MPI_Comm_spawn(av[0], args, n, MPI_INFO_NULL, 0, MPI_COMM_SELF, &intercomm, errs); if (err){ fprintf(stderr, "MPI_Comm_spawn generated error %d.\n", err); } } else { fprintf(stderr, "%s before MPI_Comm_get_parent() in %d\n", slave?"slave":"master", getpid()); MPI_Comm_get_parent(&intercomm); } MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); fprintf(stderr, "%s %d (%s) of %d\n", slave?"slave":"master", rank, name, size); MPI_Finalize(); return 0; }