paleolimbot commented on code in PR #488:
URL: https://github.com/apache/arrow-nanoarrow/pull/488#discussion_r1621076875
##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -161,108 +190,142 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct
ArrowDevice* device,
device_array->device_id = device->device_id;
device_array->device_type = device->device_type;
- device_array->sync_event = &private_data->sync_event;
+ device_array->sync_event = &private_data->cu_event;
- cudaSetDevice(prev_device);
return NANOARROW_OK;
}
// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there
is
// no good way to incorporate that just yet
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
- struct ArrowBufferView src,
- struct ArrowDevice* device_dst,
- struct ArrowBuffer* dst) {
- struct ArrowBuffer tmp;
- enum cudaMemcpyKind memcpy_kind;
-
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice*
device_src,
+ struct ArrowBufferView
src,
+ struct ArrowDevice*
device_dst,
+ struct ArrowBufferView
dst,
+ int* n_pop_context,
+ struct ArrowError*
error) {
if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToDevice;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data,
(size_t)src.size_bytes),
+ "cuMemcpyHtoD", error);
+
+ } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+ device_dst->device_type == ARROW_DEVICE_CUDA &&
+ device_src->device_id == device_dst->device_id) {
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
+        "cuMemcpyDtoD", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyDeviceToDevice;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+ (CUdeviceptr)src.data.data, src_private->cu_context,
+ (size_t)src.size_bytes),
+ "cuMemcpyPeer", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CPU) {
- ArrowBufferInit(&tmp);
- NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
- tmp.size_bytes = src.size_bytes;
- memcpy_kind = cudaMemcpyDeviceToHost;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
+ "cuMemcpyDtoH", error);
} else if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToHost;
+ // TODO: Synchronize device_src?
+ memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
Review Comment:
I looked into this and I am almost positive that calling
`cuCtxSynchronize()` before memcpy is the right thing to do; however, it should
also be done *once* before copying a bunch of buffers (as you suggested). We
really just need to test this properly (which will happen naturally when we
implement async buffer copying, since that should result in a
not-fully-synchronized array with a non-null sync event to properly test on).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]