Hi,
I think that I have found a bug in the implementation of the GM
collective routines included in OpenMPI. The version of the GM
software is 2.0.30 for the PCI64 cards.
Sometimes, when I broadcast a vector of 1024 integers using the
MPI_Bcast call, some processors receive a bad packet. Usually, the
difference from the original packet is only 1 bit, but it is enough
to break the communication.
I obtain the same problem when I use version 1.4.1 or 1.4.2.
Could you help me? Thanks.
Best regards,
José I. Aliaga
=================================================
COMPILATION COMMAND
=================================================
mpicc test_comm.c -o test_comm
=================================================
EXECUTION COMMAND
=================================================
mpirun --mca btl gm,sm,self -np 8 -machinefile mach_file test_comm 1 10 1000
=================================================
SOME EXECUTION ERRORS
=================================================
## EXECUTION 1 ##
[2] receives 3039 when it must receive 7135. Its subtraction is 4096.
[2] receives 7142 when it must receive 7143. Its subtraction is 1.
[2,411th] (Bcast of 7 en 8) 1024 integers with 2 errors.
## EXECUTION 2 ##
[5] receives 7142 when it must receive 7136. Its subtraction is 6.
[5,277th] (Bcast of 0 en 8) 1024 integers with 1 errors.
[1] receives 7138 when it must receive 7140. Its subtraction is 2.
[1,385th] (Bcast of 4 en 8) 1024 integers with 1 errors.
## EXECUTION 3 ##
[5] receives 3038 when it must receive 7134. Its subtraction is 4096.
[5] receives 7141 when it must receive 7142. Its subtraction is 1.
[5,479th] (Bcast of 6 en 8) 1024 integers with 2 errors.
## EXECUTION 4 ##
[3] receives 3034 when it must receive 7130. Its subtraction is 4096.
[3] receives 7140 when it must receive 7138. Its subtraction is 2.
[3,539th] (Bcast of 2 en 8) 1024 integers with 2 errors.
## EXECUTION 5 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,135th] (Bcast of 7 en 8) 1024 integers with 2 errors.
## EXECUTION 6 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,246th] (Bcast of 7 en 8) 1024 integers with 2 errors.
## EXECUTION 7 ##
[2] receives 7128 when it must receive 3032. Its subtraction is 4096.
[2] receives 3047 when it must receive 3040. Its subtraction is 7.
[2,232th] (Bcast of 0 en 8) 1024 integers with 2 errors.
## EXECUTION 8 ##
[3] receives 3036 when it must receive 7132. Its subtraction is 4096.
[3] receives 7139 when it must receive 7140. Its subtraction is 1.
[3,344th] (Bcast of 4 en 8) 1024 integers with 2 errors.
=================================================
SOURCE CODE --> test_comm.c
=================================================
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
/*
 * Allocate a vector of tam integers.
 *
 * tam: number of elements requested; must not be negative.
 *
 * Returns a pointer to zero-initialized storage that the caller releases
 * with free().  The function never returns NULL: when tam is negative or
 * the allocation fails, it reports the calling rank on stderr and aborts
 * the whole MPI job.
 */
int *CreateVector (int tam) {
  int *ptr = NULL;

  if (tam >= 0) {
    /* Ask for at least one element: calloc(0, ...) may legally return NULL,
       which must not be mistaken for an out-of-memory condition.  calloc
       also rejects element counts whose byte size would overflow. */
    size_t count = (tam > 0) ? (size_t) tam : 1;
    ptr = calloc(count, sizeof *ptr);
  }

  if (ptr == NULL) {
    int my_id = -1;

    /* The rank is only needed for the diagnostic, so query it here. */
    MPI_Comm_rank(MPI_COMM_WORLD, &my_id);
    fprintf(stderr, "ERROR MEMORIA (%d)\n", my_id);
    /* Abort every rank: a lone exit() would leave the other processes
       blocked in their next collective call. */
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    exit(EXIT_FAILURE);   /* fallback in case MPI_Abort returns */
  }
  return ptr;
}
/*
 * Fill the first tam entries of vec.
 *
 * When inic is non-negative the entries become inic, inic + P, inic + 2P,
 * ... where P is the size of MPI_COMM_WORLD.  When inic is negative every
 * entry is set to -1.
 */
void InitVector (int *vec, int tam, int inic) {
  int nranks, idx, next;

  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (inic < 0) {
    /* Negative seed: mark every slot with the -1 placeholder. */
    for (idx = 0; idx < tam; idx++) {
      vec[idx] = -1;
    }
    return;
  }

  /* Arithmetic progression starting at inic with stride nranks. */
  next = inic;
  for (idx = 0; idx < tam; idx++) {
    vec[idx] = next;
    next += nranks;
  }
}
/*
 * Check the first tam entries of vec against the expected progression
 * inic, inic + P, inic + 2P, ... where P is the size of MPI_COMM_WORLD.
 *
 * Every mismatching entry is reported on stdout together with the calling
 * rank, the received value, the expected value and their absolute
 * difference.
 *
 * Returns the number of mismatching entries (0 when the vector is correct).
 */
int CompareVector (int *vec, int tam, int inic) {
  int i, expected = inic, numprocs, my_id, errors = 0;

  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_id);

  for (i = 0; i < tam; i++) {
    if (vec[i] != expected) {
      int diff = (expected > vec[i]) ? (expected - vec[i])
                                     : (vec[i] - expected);

      /* The format string must stay on one source line: a raw newline
         inside a string literal does not compile. */
      printf("[%d] receives %d when it must receive %d. Its subtraction is %d.\n",
             my_id, vec[i], expected, diff);
      errors++;
    }
    expected += numprocs;
  }
  return errors;
}
/*
 * Parse a decimal integer command-line argument.
 *
 * Stores the value in *out and returns 1 when text is a complete decimal
 * number inside [min, max]; returns 0 otherwise and leaves *out untouched.
 */
static int ParseIntArg (const char *text, long min, long max, int *out) {
  char *end = NULL;
  long value;

  errno = 0;
  value = strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE)
    return 0;
  if (value < min || value > max)
    return 0;
  *out = (int) value;
  return 1;
}

/*
 * Broadcast stress test.
 *
 * Usage: test_comm <pos1> <pos2> <nexecs>
 *
 * For every message size 2^pos1, 2^(pos1+1), ..., 2^pos2 (in integers) and
 * for nexecs repetitions, each rank in turn acts as the root of an
 * MPI_Bcast.  After each broadcast every rank checks the received data and
 * prints one line per corrupted broadcast.
 *
 * Returns 0 on success, EXIT_FAILURE when the arguments are missing or
 * invalid.
 */
int main (int argc, char **argv) {
  /* Largest exponent for which 1 << pos is a representable positive int. */
  const long max_pos = (long) (sizeof(int) * CHAR_BIT) - 2;
  int pos1 = 0, pos2 = 0, nexecs = 0;
  int my_id, numprocs;
  int pos, k, root, tam, errors;
  int *mess = NULL;

  MPI_Init (&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_id);

  /* Validate after MPI_Init, which may strip launcher-specific arguments. */
  if (argc < 4 ||
      !ParseIntArg(argv[1], 0, max_pos, &pos1) ||
      !ParseIntArg(argv[2], 0, max_pos, &pos2) ||
      !ParseIntArg(argv[3], 0, INT_MAX, &nexecs)) {
    if (my_id == 0)
      fprintf(stderr,
              "Usage: %s <log2 min size> <log2 max size> <repetitions>\n",
              (argc > 0) ? argv[0] : "test_comm");
    MPI_Finalize ();
    return EXIT_FAILURE;
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* One buffer sized for the largest message is reused for every size. */
  mess = CreateVector(1 << pos2);
  InitVector(mess, 1 << pos2, -1);

  /* Iterate over the exponent rather than doubling tam, so the size is
     never shifted past the range of int. */
  for (pos = pos1; pos <= pos2; pos++) {
    tam = 1 << pos;
    for (k = 0; k < nexecs; k++) {
      for (root = 0; root < numprocs; root++) {
        /* Only the root holds valid data; the others start from -1 so
           stale contents cannot hide a failed broadcast. */
        InitVector(mess, tam, (my_id == root) ? root : -1);
        MPI_Bcast(mess, tam, MPI_INT, root, MPI_COMM_WORLD);
        errors = CompareVector (mess, tam, root);
        if (errors > 0)
          printf ("[%d,%dth] (Bcast of %d in %d) %d integers with %d errors\n",
                  my_id, k, root, numprocs, tam, errors);
      }
    }
  }

  free (mess); mess = NULL;
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize ();
  return 0;
}