zeroshade commented on code in PR #488:
URL: https://github.com/apache/arrow-nanoarrow/pull/488#discussion_r1617991915


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -32,67 +71,63 @@ static void ArrowDeviceCudaDeallocator(struct ArrowBufferAllocator* allocator,
   struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
       (struct ArrowDeviceCudaAllocatorPrivate*)allocator->private_data;
 
-  int prev_device = 0;
-  // Not ideal: we have no place to communicate any errors here
-  cudaGetDevice(&prev_device);
-  cudaSetDevice((int)allocator_private->device_id);
-
   switch (allocator_private->device_type) {
     case ARROW_DEVICE_CUDA:
-      cudaFree(allocator_private->allocated_ptr);
+      cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
       break;
     case ARROW_DEVICE_CUDA_HOST:
-      cudaFreeHost(allocator_private->allocated_ptr);
+      cuMemFreeHost(allocator_private->allocated_ptr);
       break;
     default:
       break;
   }

Review Comment:
   I assume we're punting on handling Async scenarios for the time being, right?
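   
   For reference, a minimal sketch of what a stream-ordered free could look like once async is handled (hypothetical: assumes the allocator private data grows a `cu_stream` member, and `cuMemFreeAsync` requires a CUDA 11.2+ driver):
   
   ```c
   #include <cuda.h>

   // Hypothetical sketch, not part of this PR: a stream-ordered deallocation.
   // The free is enqueued on `stream` and only happens after all prior work
   // on that stream completes, so the host never has to block here.
   static void ArrowDeviceCudaFreeAsync(CUdeviceptr ptr, CUstream stream) {
     cuMemFreeAsync(ptr, stream);  // requires CUDA 11.2+
   }
   ```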



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -32,67 +71,63 @@ static void ArrowDeviceCudaDeallocator(struct ArrowBufferAllocator* allocator,
   struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
       (struct ArrowDeviceCudaAllocatorPrivate*)allocator->private_data;
 
-  int prev_device = 0;
-  // Not ideal: we have no place to communicate any errors here
-  cudaGetDevice(&prev_device);
-  cudaSetDevice((int)allocator_private->device_id);
-
   switch (allocator_private->device_type) {
     case ARROW_DEVICE_CUDA:
-      cudaFree(allocator_private->allocated_ptr);
+      cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
       break;
     case ARROW_DEVICE_CUDA_HOST:
-      cudaFreeHost(allocator_private->allocated_ptr);
+      cuMemFreeHost(allocator_private->allocated_ptr);
       break;
     default:
       break;
   }
 
-  cudaSetDevice(prev_device);
   ArrowFree(allocator_private);
 }
 
 static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowDevice* device,
                                                     struct ArrowBuffer* buffer,
                                                     int64_t size_bytes) {
-  int prev_device = 0;
-  cudaError_t result = cudaGetDevice(&prev_device);
-  if (result != cudaSuccess) {
-    return EINVAL;
-  }

Review Comment:
   Does the context itself manage the current device id by pushing and popping it?
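   
   For anyone following along: in the driver API a `CUcontext` is bound to a device when it is created, so pushing a context implicitly selects its device and popping restores whatever was current before. A minimal sketch of the pattern (error handling mostly elided):
   
   ```c
   #include <cuda.h>

   // Minimal sketch of the driver-API context stack. Pushing a context also
   // selects the device it was created on; popping makes the previously
   // current context (and its device) current again.
   static CUresult RunOnContext(CUcontext ctx) {
     CUresult err = cuCtxPushCurrent(ctx);  // ctx's device is now current
     if (err != CUDA_SUCCESS) return err;
     // ... driver API calls here target ctx's device ...
     CUcontext popped = NULL;  // receives ctx back from the pop
     return cuCtxPopCurrent(&popped);
   }
   ```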



##########
extensions/nanoarrow_device/CMakeLists.txt:
##########
@@ -119,7 +119,7 @@ else()
   if(NANOARROW_DEVICE_WITH_CUDA)
     find_package(CUDAToolkit REQUIRED)
     set(NANOARROW_DEVICE_SOURCES_CUDA src/nanoarrow/nanoarrow_device_cuda.c)
-    set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cudart_static)
+    set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cuda_driver)
     set(NANOARROW_DEVICE_DEFS_CUDA "NANOARROW_DEVICE_WITH_CUDA")
   endif()

Review Comment:
   Is there anything in the documentation that needs to be updated accordingly (lists of dependencies, description of options, etc.)?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct ArrowDevice* device,
 
   device_array->device_id = device->device_id;
   device_array->device_type = device->device_type;
-  device_array->sync_event = &private_data->sync_event;
+  device_array->sync_event = &private_data->cu_event;
 
-  cudaSetDevice(prev_device);
   return NANOARROW_OK;
 }
 
 // TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
 // no good way to incorporate that just yet
 
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
-                                                struct ArrowBufferView src,
-                                                struct ArrowDevice* device_dst,
-                                                struct ArrowBuffer* dst) {
-  struct ArrowBuffer tmp;
-  enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice* device_src,
+                                                        struct ArrowBufferView src,
+                                                        struct ArrowDevice* device_dst,
+                                                        struct ArrowBufferView dst,
+                                                        int* n_pop_context,
+                                                        struct ArrowError* error) {
   if (device_src->device_type == ARROW_DEVICE_CPU &&
       device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToDevice;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data, (size_t)src.size_bytes),
+        "cuMemcpyHtoD", error);
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpytoD", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyDeviceToDevice;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+                     (CUdeviceptr)src.data.data, src_private->cu_context,
+                     (size_t)src.size_bytes),
+        "cuMemcpyPeer", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CPU) {
-    ArrowBufferInit(&tmp);
-    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
-    tmp.size_bytes = src.size_bytes;
-    memcpy_kind = cudaMemcpyDeviceToHost;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpyDtoH", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CPU &&
              device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToHost;
+    // TODO: Synchronize device_src?
+    memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);

Review Comment:
   You could potentially use `cuCtxSynchronize` though...?
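   
   i.e. a sketch of what that could look like, reusing this PR's macro (assuming the CUDA_HOST device's private data carries a `cu_context` the way the CUDA one does; not what the PR currently does):
   
   ```c
   // Sketch only: drain outstanding work on the context that owns the pinned
   // destination buffer before overwriting it from the host. Assumes
   // device_dst->private_data is an ArrowDeviceCudaPrivate with a cu_context.
   struct ArrowDeviceCudaPrivate* dst_private =
       (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
   NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
                                "cuCtxPushCurrent", error);
   (*n_pop_context)++;
   NANOARROW_CUDA_RETURN_NOT_OK(cuCtxSynchronize(), "cuCtxSynchronize", error);
   memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
   ```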



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -105,19 +140,19 @@ static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowDevice* device,
   buffer->allocator =
       ArrowBufferDeallocator(&ArrowDeviceCudaDeallocator, allocator_private);
 
-  cudaSetDevice(prev_device);
+  cuCtxPopCurrent(&unused);
   return NANOARROW_OK;
 }
 
 struct ArrowDeviceCudaArrayPrivate {
   struct ArrowArray parent;
-  cudaEvent_t sync_event;
+  CUevent cu_event;
 };

Review Comment:
   This will need to be exposed somehow so that a producer can get access to this in order to record it on a stream, or otherwise manage and use the event so that a consumer can benefit.
   
   If we're not exposing this event anywhere yet (since you're creating it privately and not accepting a user-provided event), then we should probably just leave it null for now and not bother trying to create and destroy an event until we are also exposing it.
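   
   For context, the producer-side pattern the event exists for is roughly this (sketch; `producer_stream` is hypothetical, nothing in this PR exposes one):
   
   ```c
   #include <cuda.h>

   // Sketch of the producer-side use of the event: record it on the stream
   // that produced the buffers so a consumer can order its reads after the
   // producer's writes. `producer_stream` is hypothetical here.
   static CUresult MarkDataReady(CUevent ev, CUstream producer_stream) {
     // A consumer would then call cuEventSynchronize(ev) to block the host,
     // or cuStreamWaitEvent(consumer_stream, ev, 0) to stay on-device.
     return cuEventRecord(ev, producer_stream);
   }
   ```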



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -32,67 +71,63 @@ static void ArrowDeviceCudaDeallocator(struct ArrowBufferAllocator* allocator,
   struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
       (struct ArrowDeviceCudaAllocatorPrivate*)allocator->private_data;
 
-  int prev_device = 0;
-  // Not ideal: we have no place to communicate any errors here
-  cudaGetDevice(&prev_device);
-  cudaSetDevice((int)allocator_private->device_id);
-
   switch (allocator_private->device_type) {
     case ARROW_DEVICE_CUDA:
-      cudaFree(allocator_private->allocated_ptr);
+      cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
       break;
     case ARROW_DEVICE_CUDA_HOST:
-      cudaFreeHost(allocator_private->allocated_ptr);
+      cuMemFreeHost(allocator_private->allocated_ptr);
       break;
     default:
       break;
   }
 
-  cudaSetDevice(prev_device);

Review Comment:
   Are we always guaranteed that this has already been called or that we know we're using the correct device?
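   
   If not, the deallocator could defensively make the owning context current itself, e.g. (sketch; assumes `allocator_private` kept a `cu_context` the way `ArrowDeviceCudaPrivate` does, whereas this PR only stores a `device_id`):
   
   ```c
   // Defensive sketch, not what this PR does: push the owning context around
   // the free. Errors still have nowhere to be reported from a deallocator.
   CUcontext unused;
   cuCtxPushCurrent(allocator_private->cu_context);
   cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
   cuCtxPopCurrent(&unused);
   ```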



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct ArrowDevice* device,
 
   device_array->device_id = device->device_id;
   device_array->device_type = device->device_type;
-  device_array->sync_event = &private_data->sync_event;
+  device_array->sync_event = &private_data->cu_event;
 
-  cudaSetDevice(prev_device);
   return NANOARROW_OK;
 }
 
 // TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
 // no good way to incorporate that just yet
 
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
-                                                struct ArrowBufferView src,
-                                                struct ArrowDevice* device_dst,
-                                                struct ArrowBuffer* dst) {
-  struct ArrowBuffer tmp;
-  enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice* device_src,
+                                                        struct ArrowBufferView src,
+                                                        struct ArrowDevice* device_dst,
+                                                        struct ArrowBufferView dst,
+                                                        int* n_pop_context,
+                                                        struct ArrowError* error) {
   if (device_src->device_type == ARROW_DEVICE_CPU &&
       device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToDevice;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data, (size_t)src.size_bytes),
+        "cuMemcpyHtoD", error);
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpytoD", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyDeviceToDevice;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+                     (CUdeviceptr)src.data.data, src_private->cu_context,
+                     (size_t)src.size_bytes),
+        "cuMemcpyPeer", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CPU) {
-    ArrowBufferInit(&tmp);
-    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
-    tmp.size_bytes = src.size_bytes;
-    memcpy_kind = cudaMemcpyDeviceToHost;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpyDtoH", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CPU &&
              device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToHost;
+    // TODO: Synchronize device_src?
+    memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);

Review Comment:
   Synchronizing wouldn't be limited to the CPU/CUDA_HOST cases: if we need to synchronize, we'd need to synchronize for *all* cases.
   
   But as I mentioned in the comment above, since we don't expose the event currently, you'd create a deadlock if you tried to synchronize, since nothing can mark the event as recorded and completed.
   
   Also, since the event is at the top level of the `ArrowDeviceArray`, I'd say that if we *are* going to synchronize, we shouldn't synchronize at this level but rather above this on the call stack. And until we start using cuMemcpyAsync or other async calls, we don't need to bother attempting to manage synchronization yet; we can punt on that for now.
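   
   i.e. something like this at the device-array level rather than per buffer (sketch; assumes the `ArrowDeviceArray` from the C device interface, and is only meaningful once a producer actually records the event):
   
   ```c
   #include <cuda.h>
   // (plus the nanoarrow device header for struct ArrowDeviceArray)

   // Sketch of synchronizing above the buffer-copy level: wait once on the
   // array-level sync_event before any per-buffer copies, instead of inside
   // each copy branch. Only meaningful once a producer records the event.
   static CUresult WaitForArray(struct ArrowDeviceArray* array) {
     if (array->sync_event == NULL) return CUDA_SUCCESS;  // nothing to wait on
     return cuEventSynchronize(*(CUevent*)array->sync_event);
   }
   ```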



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -32,67 +71,63 @@ static void ArrowDeviceCudaDeallocator(struct ArrowBufferAllocator* allocator,
   struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
       (struct ArrowDeviceCudaAllocatorPrivate*)allocator->private_data;
 
-  int prev_device = 0;
-  // Not ideal: we have no place to communicate any errors here
-  cudaGetDevice(&prev_device);
-  cudaSetDevice((int)allocator_private->device_id);
-
   switch (allocator_private->device_type) {
     case ARROW_DEVICE_CUDA:
-      cudaFree(allocator_private->allocated_ptr);
+      cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
       break;
     case ARROW_DEVICE_CUDA_HOST:
-      cudaFreeHost(allocator_private->allocated_ptr);
+      cuMemFreeHost(allocator_private->allocated_ptr);
       break;
     default:
       break;
   }
 
-  cudaSetDevice(prev_device);
   ArrowFree(allocator_private);
 }
 
 static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowDevice* device,
                                                     struct ArrowBuffer* buffer,
                                                     int64_t size_bytes) {
-  int prev_device = 0;
-  cudaError_t result = cudaGetDevice(&prev_device);
-  if (result != cudaSuccess) {
-    return EINVAL;
-  }
+  struct ArrowDeviceCudaPrivate* private_data =
+      (struct ArrowDeviceCudaPrivate*)device->private_data;
 
-  result = cudaSetDevice((int)device->device_id);
-  if (result != cudaSuccess) {
-    cudaSetDevice(prev_device);
-    return EINVAL;
-  }
+  NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(private_data->cu_context),
+                               "cuCtxPushCurrent", NULL);
+  CUcontext unused;  // needed for cuCtxPopCurrent()
 
   struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
       (struct ArrowDeviceCudaAllocatorPrivate*)ArrowMalloc(
           sizeof(struct ArrowDeviceCudaAllocatorPrivate));
   if (allocator_private == NULL) {
-    cudaSetDevice(prev_device);
+    cuCtxPopCurrent(&unused);
     return ENOMEM;
   }
 
+  CUresult err;
   void* ptr = NULL;
+  const char* op = "";
   switch (device->device_type) {
-    case ARROW_DEVICE_CUDA:
-      result = cudaMalloc(&ptr, (int64_t)size_bytes);
+    case ARROW_DEVICE_CUDA: {
+      CUdeviceptr dptr = 0;
+      err = cuMemAlloc(&dptr, (size_t)size_bytes);
+      ptr = (void*)dptr;
+      op = "cuMemAlloc";
       break;
+    }
     case ARROW_DEVICE_CUDA_HOST:
-      result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+      err = cuMemAllocHost(&ptr, (size_t)size_bytes);
+      op = "cuMemAllocHost";
       break;
     default:
+      cuCtxPopCurrent(&unused);
       ArrowFree(allocator_private);
-      cudaSetDevice(prev_device);
       return EINVAL;
   }
 
-  if (result != cudaSuccess) {
+  if (err != CUDA_SUCCESS) {
+    cuCtxPopCurrent(&unused);
     ArrowFree(allocator_private);
-    cudaSetDevice(prev_device);
-    return ENOMEM;
+    return EIO;

Review Comment:
   Since this function only returns an error code but doesn't allow populating an error, is it okay that we're swallowing this error here? Should you leave a TODO comment so that we can remember to improve this?
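   
   For what it's worth, a sketch of what surfacing it could look like if this path ever grows an `ArrowError*` parameter (hypothetical; the current signature has none):
   
   ```c
   // Hypothetical sketch: the current signature has no ArrowError* to fill.
   if (err != CUDA_SUCCESS) {
     const char* msg = NULL;
     cuGetErrorString(err, &msg);  // static string describing the CUresult
     ArrowErrorSet(error, "%s failed: %s", op, msg ? msg : "unknown error");
     cuCtxPopCurrent(&unused);
     ArrowFree(allocator_private);
     return EIO;
   }
   ```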



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
