Hi Slurm gurus! We are seeing an issue when launching jobs with large rank
counts on our IB cluster using PMI2, and could use your help. When the jobs
fail, the first line of output seems to have the most useful information:
srun: error: mpi/pmi2: failed to send temp kvs to compute nodes
One of our Mellanox friends stripped down the test case to just include
the PMI2 startup code from MPI to help try to isolate the issue
further. This code just takes one argument which is how many bytes to
put in the message. When we start up this test code on 1534 nodes with
PPN=24, 436 byte messages will pass, but 437 byte messages will fail.
(Maybe those numbers will help someone figure this out!)
The only notable Slurm configuration option that we have changed is
MessageTimeout=60, but it had no effect on this issue. We are looking
for advice on how to proceed in debugging/troubleshooting this issue
further.
I have attached the test program to this message.
We are using RHEL 6.5 x86_64 and Slurm 14.03.10 on this system.
Andy
--
Andy Riebs
Hewlett-Packard Company
High Performance Computing
+1 404 648 9024
My opinions are not necessarily those of HP
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/time.h>
#include <string.h>
#include <pmi2.h>
/* Rank of this process; written by PMI2_Init() in test_init(). */
static int rank;
/*
 * Second output of PMI2_Init() in test_init() (the job size in the PMI2
 * API); test_get() uses it as its loop bound.
 */
static int nnodes;
/* Job id string written by PMI2_Job_GetId() in test_init() and passed to PMI2_KVS_Get(). */
static char jid[1024];
/*
 * Print the wall-clock time elapsed since *base, labelled with msg.
 *
 * Only rank 0 produces output; on every other rank this is a no-op (the
 * clock is not even read).
 */
static void print_delta(struct timeval *base, char *msg)
{
    if (rank == 0) {
        struct timeval now, elapsed;
        double seconds;

        gettimeofday(&now, 0);
        timersub(&now, base, &elapsed);
        seconds = elapsed.tv_sec + elapsed.tv_usec / 1000000.0;
        printf("delta(%s): %1.6lf seconds\n", msg, seconds);
    }
}
/*
 * Initialise PMI2 and cache the job parameters in the file-scope globals
 * rank, nnodes and jid.
 *
 * Exits with status 1 if either PMI2 call fails.
 */
void test_init(void)
{
    int spawned, appnum;
    int ret;

    ret = PMI2_Init(&spawned, &nnodes, &rank, &appnum);
    if (ret != PMI2_SUCCESS) {
        printf("Failed to init PMI2\n");
        exit(1);
    }

    /*
     * jid is later handed to PMI2_KVS_Get(), so a failure here must not be
     * silently ignored.
     */
    ret = PMI2_Job_GetId(jid, sizeof(jid));
    if (ret != PMI2_SUCCESS) {
        printf("Failed to get PMI2 job id\n");
        exit(1);
    }
    fflush(stdout);
}
/*
 * Shut down the PMI2 session. The status returned by PMI2_Finalize() is
 * deliberately discarded.
 */
void test_fini()
{
    (void)PMI2_Finalize();
}
/*
 * Publish one key/value pair for this rank and run a KVS fence, timing the
 * fence on rank 0.
 *
 * The key is "r<rank>id<id>", where id is a per-process counter that is
 * incremented on every call. The value is vlen 'A' characters; vlen is
 * clamped to [0, sizeof(val) - 1] so the value always stays NUL-terminated
 * inside the buffer.
 *
 * Returns the id used for this call (the pre-increment counter value).
 * Exits with status 1 if the put or the fence fails.
 */
int test_fence(int vlen)
{
    char key[PMI2_MAX_KEYLEN], val[PMI2_MAX_VALLEN];
    static int id;
    size_t fill;
    int ret;
    struct timeval t;

    snprintf(key, sizeof(key), "r%did%d", rank, id);

    /* Leave room for the terminating NUL: the value is passed as a C string. */
    if (vlen < 0) {
        fill = 0;
    } else if ((size_t)vlen >= sizeof(val)) {
        fill = sizeof(val) - 1;
    } else {
        fill = (size_t)vlen;
    }
    memset(val, 0, sizeof(val));
    memset(val, 'A', fill);

    ret = PMI2_KVS_Put(key, val);
    if (ret != PMI2_SUCCESS) {
        fprintf(stderr, "rank %d: PMI2_KVS_Put(%s) failed (%d)\n",
                rank, key, ret);
        exit(1);
    }

    gettimeofday(&t, NULL);
    ret = PMI2_KVS_Fence();
    if (ret != PMI2_SUCCESS) {
        fprintf(stderr, "rank %d: PMI2_KVS_Fence failed (%d)\n", rank, ret);
        exit(1);
    }

    /* val is reused as scratch space for the timing label. */
    snprintf(val, sizeof(val), "fence_%d vlen %d", id, (int)fill);
    print_delta(&t, val);
    return id++;
}
/*
 * Fetch the value published under key "r<i>id<id>" for every i in
 * [0, nnodes), and print the total elapsed time on rank 0.
 *
 * id   - the id returned by the matching test_fence() call.
 * vlen - value length, used only in the timing label.
 *
 * Exits with status 1 if any lookup fails.
 */
void test_get(int id, int vlen)
{
    char key[PMI2_MAX_KEYLEN], val[PMI2_MAX_VALLEN];
    int ret;
    int i;
    int len;
    struct timeval t;

    gettimeofday(&t, NULL);
    for (i = 0; i < nnodes; i++) {
        /*
         * Key on the loop index, not on our own rank, so that each
         * iteration reads a different peer's entry instead of re-reading
         * this process's own key nnodes times.
         */
        snprintf(key, sizeof(key), "r%did%d", i, id);
        ret = PMI2_KVS_Get(jid, PMI2_ID_NULL, key, val, sizeof(val), &len);
        if (ret != PMI2_SUCCESS) {
            fprintf(stderr, "rank %d: PMI2_KVS_Get(%s) failed (%d)\n",
                    rank, key, ret);
            exit(1);
        }
    }

    /* val is reused as scratch space for the timing label. */
    snprintf(val, sizeof(val), "get_%d vlen %d", id, vlen);
    print_delta(&t, val);
}
/*
 * Usage: pmi_allgather2 size
 *
 * size is the number of bytes each rank puts in its KVS value. It must be
 * a decimal integer in [0, PMI2_MAX_VALLEN): the upper bound is exclusive
 * because the value buffer also has to hold the terminating NUL.
 */
int main(int argc, char **argv)
{
    char *end;
    long len;

    if (argc != 2) {
        printf("pmi_allgather2 size\n");
        exit(1);
    }

    /* strtol (unlike atoi) lets us reject non-numeric or trailing garbage. */
    len = strtol(argv[1], &end, 10);

    /* Initialise first so that only rank 0 reports a bad argument. */
    test_init();
    if (end == argv[1] || *end != '\0' ||
        len < 0 || len >= PMI2_MAX_VALLEN) {
        if (rank == 0) {
            printf("size (%s) must be between [%d, %d)\n",
                   argv[1], 0, PMI2_MAX_VALLEN);
        }
        test_fini();
        exit(1);
    }

    test_get(test_fence((int)len), (int)len);
    test_fini();
    return 0;
}