I'm running into issues trying to use GPUs in a multiprocessor system
with the latest release candidate (1.7rc8).
Specifically, it looks like the Open MPI code still assumes that all
GPUs are on the same IOH, as described in this message from a few
months ago:

http://www.open-mpi.org/community/lists/users/2012/07/19879.php

I couldn't determine what happened to the ticket mentioned in that thread.

For the moment, I'm just constraining myself to using the GPUs
attached to one processor, but obviously that's less than ideal :).
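
For reference, the workaround amounts to roughly the sketch below; the
assumption that devices 0 and 1 share an IOH comes from the peer-access
output further down, and pickSameIohDevice is just an illustrative name
(it would slot into the test program pasted at the end):

// Hedged sketch: keep both ranks on the GPU pair that reports peer
// access (0 and 1 in the output below), so all traffic stays on one
// IOH.  The device numbering is specific to this machine.
void pickSameIohDevice(int rank) {
  int sameIohDevices[2] = {0, 1};
  cudaSetDevice(sameIohDevices[rank % 2]);
}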

Curiously, the eager send path doesn't seem to have the same issue -
if I adjust btl_smcuda_eager_limit up, sends work up to that
threshold.  Unfortunately, if I increase it beyond 10 megabytes I
start seeing bus errors.
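
(For concreteness, the eager-limit experiment was along the lines of the
command below; the 8 MB value is just an example of a limit below the
~10 megabyte point where the bus errors start:

  mpirun -mca btl_smcuda_eager_limit 8388608 -n 2 ./a.out)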

I can manually break up my own sends to stay below the eager limit, but
that seems non-optimal.
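
For the record, the chunking I have in mind looks roughly like the
sketch below; sendInChunks and the 4 MB chunk size are my own
placeholders, chosen to stay under the eager limit:

// Rough sketch of splitting one device-buffer send into pieces that
// each fit under btl_smcuda_eager_limit.  Not from Open MPI; the
// receiver needs a matching loop of Recvs.
void sendInChunks(char* devBuffer, size_t bytes, int dst, int tag) {
  const size_t kChunk = 4 * 1024 * 1024;  // keep below the eager limit
  for (size_t off = 0; off < bytes; off += kChunk) {
    size_t n = (bytes - off < kChunk) ? (bytes - off) : kChunk;
    MPI::COMM_WORLD.Send(devBuffer + off, static_cast<int>(n),
                         MPI::BYTE, dst, tag);
  }
}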

Any other recommendations?

Thanks,

R

The testing code and output are pasted below.

---

#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <mpi.h>


#define CUDA_SAFE_CALL(call) {                                               \
    cudaError_t err = call;                                                  \
    if (cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString(err));                \
        exit(EXIT_FAILURE);                                                  \
    } }

// Receive progressively larger device buffers from rank `src` on GPU 0.
void recv(int src) {
  CUDA_SAFE_CALL(cudaSetDevice(0));
  for (int bSize = 1; bSize < 100e6; bSize *= 2) {
    fprintf(stderr, "Recv: %d\n", bSize);
    void* buffer;
    CUDA_SAFE_CALL(cudaMalloc(&buffer, bSize));
    auto world = MPI::COMM_WORLD;
    world.Recv(buffer, bSize, MPI::BYTE, src, 0);
    CUDA_SAFE_CALL(cudaFree(buffer));
  }
}

// Send progressively larger device buffers to rank `dst` from GPU 2
// (per the peer-access output, a GPU that cannot peer with GPU 0).
void send(int dst) {
  CUDA_SAFE_CALL(cudaSetDevice(2));
  for (int bSize = 1; bSize < 100e6; bSize *= 2) {
    fprintf(stderr, "Send: %d\n", bSize);
    void* buffer;
    CUDA_SAFE_CALL(cudaMalloc(&buffer, bSize));
    auto world = MPI::COMM_WORLD;
    world.Send(buffer, bSize, MPI::BYTE, dst, 0);
    CUDA_SAFE_CALL(cudaFree(buffer));
  }
}

// Print the peer-access capability between each pair of the first three GPUs.
void checkPeerAccess() {
  fprintf(stderr, "Access capability: gpu -> gpu\n");
  for (int a = 0; a < 3; ++a) {
    for (int b = a + 1; b < 3; ++b) {
      int res;
      CUDA_SAFE_CALL(cudaDeviceCanAccessPeer(&res, a, b));
      fprintf(stderr, "%d <-> %d: %d\n", a, b, res);
    }
  }
}

int main() {
  MPI::Init_thread(MPI::THREAD_MULTIPLE);
  if (MPI::COMM_WORLD.Get_rank() == 0) {
    checkPeerAccess();
    recv(1);   // rank 0 receives on GPU 0
  } else {
    send(0);   // rank 1 sends from GPU 2
  }
  MPI::Finalize();
  return 0;
}

output from running:
mpirun -mca btl_smcuda_eager_limit 64 -n 2 ./a.out
Access capability: gpu -> gpu
0 <-> 1: 1
0 <-> 2: 0
1 <-> 2: 0
Send: 1
Recv: 1
Send: 2
Recv: 2
Send: 4
Recv: 4
Send: 8
Recv: 8
Send: 16
Recv: 16
--------------------------------------------------------------------------
The call to cuIpcOpenMemHandle failed. This is an unrecoverable error
and will cause the program to abort.
  cuIpcOpenMemHandle return value:   217
  address: 0x2300200000
Check the cuda.h file for what the return value means. Perhaps a reboot
of the node will clear the problem.
--------------------------------------------------------------------------
