paleolimbot commented on code in PR #488:
URL: https://github.com/apache/arrow-nanoarrow/pull/488#discussion_r1618048277


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct ArrowDevice* device,
 
   device_array->device_id = device->device_id;
   device_array->device_type = device->device_type;
-  device_array->sync_event = &private_data->sync_event;
+  device_array->sync_event = &private_data->cu_event;
 
-  cudaSetDevice(prev_device);
   return NANOARROW_OK;
 }
 
 // TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
 // no good way to incorporate that just yet
 
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
-                                                struct ArrowBufferView src,
-                                                struct ArrowDevice* device_dst,
-                                                struct ArrowBuffer* dst) {
-  struct ArrowBuffer tmp;
-  enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice* device_src,
+                                                        struct ArrowBufferView src,
+                                                        struct ArrowDevice* device_dst,
+                                                        struct ArrowBufferView dst,
+                                                        int* n_pop_context,
+                                                        struct ArrowError* error) {
   if (device_src->device_type == ARROW_DEVICE_CPU &&
       device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToDevice;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data, (size_t)src.size_bytes),
+        "cuMemcpyHtoD", error);
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpytoD", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CUDA) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyDeviceToDevice;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+    struct ArrowDeviceCudaPrivate* dst_private =
+        (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+                     (CUdeviceptr)src.data.data, src_private->cu_context,
+                     (size_t)src.size_bytes),
+        "cuMemcpyPeer", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
              device_dst->device_type == ARROW_DEVICE_CPU) {
-    ArrowBufferInit(&tmp);
-    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
-    tmp.size_bytes = src.size_bytes;
-    memcpy_kind = cudaMemcpyDeviceToHost;
+    struct ArrowDeviceCudaPrivate* src_private =
+        (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+    NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+                                 "cuCtxPushCurrent", error);
+    (*n_pop_context)++;
+    NANOARROW_CUDA_RETURN_NOT_OK(
+        cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+                     (size_t)src.size_bytes),
+        "cuMemcpyDtoH", error);
 
   } else if (device_src->device_type == ARROW_DEVICE_CPU &&
              device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
-    NANOARROW_RETURN_NOT_OK(
-        ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
-    memcpy_kind = cudaMemcpyHostToHost;
+    // TODO: Synchronize device_src?
+    memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);

Review Comment:
   That's a great point (synchronization of the source must have happened before this function is called). I was/am worried that `cudaMemcpy()` might have been flushing something from the device to the page-locked memory that a straight `memcpy()` wouldn't be doing. I'll look into `cuCtxSynchronize()` to see if that's doing what I think it is (or whether it should be called before any of this happens anyway).
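   
   For concreteness, something like the following is the kind of check I have in mind: a rough sketch, not part of this PR and untested (`SyncThenMemcpy` is just an illustrative name), that drains the source context before the plain `memcpy()` so that any in-flight device writes to the page-locked buffer have landed before the CPU reads it:
   
   ```c
   #include <string.h>
   
   #include <cuda.h>
   
   // Hypothetical helper: make the source context current, wait for all
   // outstanding work in it to finish, then copy out of the pinned buffer
   static CUresult SyncThenMemcpy(CUcontext src_ctx, void* dst, const void* src,
                                  size_t size_bytes) {
     CUcontext unused = NULL;
     CUresult result = cuCtxPushCurrent(src_ctx);
     if (result != CUDA_SUCCESS) return result;
   
     // Blocks until all preceding work in the current context has completed,
     // including async kernels/copies that may still be writing to src
     result = cuCtxSynchronize();
     cuCtxPopCurrent(&unused);
     if (result != CUDA_SUCCESS) return result;
   
     memcpy(dst, src, size_bytes);
     return CUDA_SUCCESS;
   }
   ```
   
   If a context-wide sync turns out to be too heavy a hammer, waiting on the array's `cu_event` with `cuEventSynchronize()` might be the narrower option.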


