paleolimbot commented on code in PR #488:
URL: https://github.com/apache/arrow-nanoarrow/pull/488#discussion_r1621076875


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct ArrowDevice* device,
 
   device_array->device_id = device->device_id;
   device_array->device_type = device->device_type;
-  device_array->sync_event = &private_data->sync_event;
+  device_array->sync_event = &private_data->cu_event;
 
-  cudaSetDevice(prev_device);
   return NANOARROW_OK;
 }
 
 // TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
 // no good way to incorporate that just yet
 
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
-                                                struct ArrowBufferView src,
-                                                struct ArrowDevice* device_dst,
-                                                struct ArrowBuffer* dst) {
-  struct ArrowBuffer tmp;
-  enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice* device_src,
+                                                        struct ArrowBufferView src,
+                                                        struct ArrowDevice* device_dst,
+                                                        struct ArrowBufferView dst,
+                                                        int* n_pop_context,
+                                                        struct ArrowError* error) {
   if (device_src->device_type == ARROW_DEVICE_CPU &&
       device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToDevice;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data, (size_t)src.size_bytes),
+        "cuMemcpyHtoD", error);
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpytoD", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyDeviceToDevice;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+                     (CUdeviceptr)src.data.data, src_private->cu_context,
+                     (size_t)src.size_bytes),
+        "cuMemcpyPeer", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CPU) {
-    ArrowBufferInit(&tmp);
-    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
-    tmp.size_bytes = src.size_bytes;
-    memcpy_kind = cudaMemcpyDeviceToHost;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpyDtoH", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CPU &&
              device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToHost;
+    // TODO: Synchronize device_src?
+    memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);

Review Comment:
   I looked into this and I am almost positive that calling `cuCtxSynchronize()` before the memcpy is the right thing to do; however, it should also be done *once* before copying a bunch of buffers (as you suggested). We really just need to test this properly, which will happen naturally when we implement async buffer copying, since that should produce a not-fully-synchronized array with a non-null sync event to test against.
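   To make the "synchronize once, then copy a batch" idea concrete, here is a minimal sketch; the helper name `CopyArrayBuffersDtoH` and the raw buffer arrays are hypothetical and not part of this PR. It only illustrates paying `cuCtxSynchronize()` once per array rather than once per buffer:

   ```c
   #include <cuda.h>

   // Hypothetical helper (not in this PR): copy every device buffer of an
   // array to host after a single synchronize of the source context.
   static CUresult CopyArrayBuffersDtoH(CUcontext src_ctx, CUdeviceptr* src_bufs,
                                        void** dst_bufs, const size_t* sizes,
                                        int n_buffers) {
     CUresult err = cuCtxPushCurrent(src_ctx);
     if (err != CUDA_SUCCESS) return err;

     // One synchronize for the whole batch instead of one per memcpy
     err = cuCtxSynchronize();
     for (int i = 0; err == CUDA_SUCCESS && i < n_buffers; i++) {
       err = cuMemcpyDtoH(dst_bufs[i], src_bufs[i], sizes[i]);
     }

     // Always balance the push, but report the first failure
     CUcontext unused;
     CUresult pop_err = cuCtxPopCurrent(&unused);
     return err != CUDA_SUCCESS ? err : pop_err;
   }
   ```

   Once async copies land, the `cuCtxSynchronize()` above could presumably be replaced by waiting on the array's non-null sync event (e.g., via `cuEventSynchronize()`).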


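   The diff also leans on `NANOARROW_CUDA_RETURN_NOT_OK` to translate a failing `CUresult` into a nanoarrow error code. The real definition lives elsewhere in the PR; a hedged guess at its general shape, for readers of just this hunk:

   ```c
   #include <errno.h>
   #include <cuda.h>
   #include "nanoarrow.h"  // assumed include, for ArrowErrorSet()

   // Hypothetical shape only; see the PR for the actual macro. It names the
   // failing call, records the CUDA error name on the ArrowError, and returns.
   #define NANOARROW_CUDA_RETURN_NOT_OK(expr, fn_name, error)            \
     do {                                                                \
       const CUresult _result = (expr);                                  \
       if (_result != CUDA_SUCCESS) {                                    \
         const char* _name = NULL;                                       \
         cuGetErrorName(_result, &_name);                                \
         ArrowErrorSet((error), "%s failed: %s", (fn_name),              \
                       _name ? _name : "<unknown>");                     \
         return EINVAL;                                                  \
       }                                                                 \
     } while (0)
   ```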
