paleolimbot commented on code in PR #488:
URL: https://github.com/apache/arrow-nanoarrow/pull/488#discussion_r1618048277
##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct
ArrowDevice* device,
device_array->device_id = device->device_id;
device_array->device_type = device->device_type;
- device_array->sync_event = &private_data->sync_event;
+ device_array->sync_event = &private_data->cu_event;
- cudaSetDevice(prev_device);
return NANOARROW_OK;
}
// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there
is
// no good way to incorporate that just yet
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
- struct ArrowBufferView src,
- struct ArrowDevice* device_dst,
- struct ArrowBuffer* dst) {
- struct ArrowBuffer tmp;
- enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice*
device_src,
+ struct ArrowBufferView
src,
+ struct ArrowDevice*
device_dst,
+ struct ArrowBufferView
dst,
+ int* n_pop_context,
+ struct ArrowError*
error) {
if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToDevice;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data,
(size_t)src.size_bytes),
+ "cuMemcpyHtoD", error);
+
+ } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+ device_dst->device_type == ARROW_DEVICE_CUDA &&
+ device_src->device_id == device_dst->device_id) {
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
+ "cuMemcpytoD", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyDeviceToDevice;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+ (CUdeviceptr)src.data.data, src_private->cu_context,
+ (size_t)src.size_bytes),
+ "cuMemcpyPeer", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CPU) {
- ArrowBufferInit(&tmp);
- NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
- tmp.size_bytes = src.size_bytes;
- memcpy_kind = cudaMemcpyDeviceToHost;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
+ "cuMemcpyDtoH", error);
} else if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToHost;
+ // TODO: Synchronize device_src?
+ memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
Review Comment:
That's a great point (synchronization of the source must have happened
before this function is called). I was/am worried that `cudaMemcpy()` might
have been flushing something from the device to the page-locked memory that a
straight `memcpy()` wouldn't be doing. I'll look into `cuCtxSynchronize()` to
see if that's doing what I think it is (or whether it should be called before
any of this happens anyway).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]