This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9410bd3a refactor(extensions/nanoarrow_device): Migrate CUDA device
implementation to use the driver API (#488)
9410bd3a is described below
commit 9410bd3afbd6695b05e80b7f23d145ee38da176c
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Jun 4 09:51:03 2024 -0300
refactor(extensions/nanoarrow_device): Migrate CUDA device implementation
to use the driver API (#488)
Closes #246.
This PR doesn't change much about the existing implementation (and I
think there are some things that need to be changed!), it just
eliminates the dependency on the runtime library. The driver API is a
better fit here anyway since we're doing very low-level things!
This isn't tested in CI yet (working on that here:
https://github.com/apache/arrow-nanoarrow/pull/490 ).
---
extensions/nanoarrow_device/CMakeLists.txt | 3 +-
.../src/nanoarrow/nanoarrow_device.c | 19 +-
.../src/nanoarrow/nanoarrow_device.h | 4 +-
.../src/nanoarrow/nanoarrow_device_cuda.c | 397 +++++++++++++--------
.../src/nanoarrow/nanoarrow_device_cuda_test.cc | 78 +++-
.../src/nanoarrow/nanoarrow_device_metal.cc | 7 +-
.../src/nanoarrow/nanoarrow_device_metal_test.cc | 2 +-
.../src/nanoarrow/nanoarrow_device_test.cc | 2 +-
8 files changed, 340 insertions(+), 172 deletions(-)
diff --git a/extensions/nanoarrow_device/CMakeLists.txt
b/extensions/nanoarrow_device/CMakeLists.txt
index 9667303a..d985ea4c 100644
--- a/extensions/nanoarrow_device/CMakeLists.txt
+++ b/extensions/nanoarrow_device/CMakeLists.txt
@@ -119,7 +119,7 @@ else()
if(NANOARROW_DEVICE_WITH_CUDA)
find_package(CUDAToolkit REQUIRED)
set(NANOARROW_DEVICE_SOURCES_CUDA src/nanoarrow/nanoarrow_device_cuda.c)
- set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cudart_static)
+ set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cuda_driver)
set(NANOARROW_DEVICE_DEFS_CUDA "NANOARROW_DEVICE_WITH_CUDA")
endif()
@@ -138,6 +138,7 @@ else()
${NANOARROW_DEVICE_DEFS_CUDA})
target_link_libraries(nanoarrow_device PUBLIC ${NANOARROW_DEVICE_LIBS_METAL}
${NANOARROW_DEVICE_LIBS_CUDA})
+ target_compile_definitions(nanoarrow_device PUBLIC
"$<$<CONFIG:Debug>:NANOARROW_DEBUG>")
install(TARGETS nanoarrow_device DESTINATION lib)
install(FILES src/nanoarrow/nanoarrow_device.h DESTINATION include/nanoarrow)
diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c
index 3896283f..3acf6908 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c
@@ -159,13 +159,18 @@ struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType
device_type, int64_t devi
ArrowErrorCode ArrowDeviceArrayInit(struct ArrowDevice* device,
struct ArrowDeviceArray* device_array,
- struct ArrowArray* array) {
+ struct ArrowArray* array, void*
sync_event) {
if (device->array_init != NULL) {
- return device->array_init(device, device_array, array);
- } else {
- ArrowDeviceArrayInitDefault(device, device_array, array);
- return NANOARROW_OK;
+ return device->array_init(device, device_array, array, sync_event);
+ }
+
+ // Handling a sync event is not supported in the default constructor
+ if (sync_event != NULL) {
+ return EINVAL;
}
+
+ ArrowDeviceArrayInitDefault(device, device_array, array);
+ return NANOARROW_OK;
}
ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
@@ -224,7 +229,7 @@ static int ArrowDeviceBasicArrayStreamGetNext(struct
ArrowDeviceArrayStream* arr
struct ArrowArray tmp;
NANOARROW_RETURN_NOT_OK(
private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
- int result = ArrowDeviceArrayInit(private_data->device, device_array, &tmp);
+ int result = ArrowDeviceArrayInit(private_data->device, device_array, &tmp,
NULL);
if (result != NANOARROW_OK) {
ArrowArrayRelease(&tmp);
return result;
@@ -449,7 +454,7 @@ ArrowErrorCode ArrowDeviceArrayViewCopy(struct
ArrowDeviceArrayView* src,
return result;
}
- result = ArrowDeviceArrayInit(device_dst, dst, &tmp);
+ result = ArrowDeviceArrayInit(device_dst, dst, &tmp, NULL);
if (result != NANOARROW_OK) {
ArrowArrayRelease(&tmp);
return result;
diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h
index 96a2baeb..47af82bd 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h
@@ -177,7 +177,7 @@ struct ArrowDevice {
/// device_array->sync_event (if sync_event applies to this device type).
ArrowErrorCode (*array_init)(struct ArrowDevice* device,
struct ArrowDeviceArray* device_array,
- struct ArrowArray* array);
+ struct ArrowArray* array, void* sync_event);
/// \brief Move an ArrowDeviceArray between devices without copying buffers
///
@@ -243,7 +243,7 @@ struct ArrowDeviceArrayView {
/// initialize an ArrowDeviceArray.
ArrowErrorCode ArrowDeviceArrayInit(struct ArrowDevice* device,
struct ArrowDeviceArray* device_array,
- struct ArrowArray* array);
+ struct ArrowArray* array, void*
sync_event);
/// \brief Initialize an ArrowDeviceArrayView
///
diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c
index df6ab682..8b010319 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c
@@ -15,10 +15,62 @@
// specific language governing permissions and limitations
// under the License.
-#include <cuda_runtime_api.h>
+#include <cuda.h>
#include "nanoarrow_device.h"
+static inline void ArrowDeviceCudaSetError(CUresult err, const char* op,
+ struct ArrowError* error) {
+ if (error == NULL) {
+ return;
+ }
+
+ const char* name = NULL;
+ CUresult err_result = cuGetErrorName(err, &name);
+ if (err_result != CUDA_SUCCESS || name == NULL) {
+ name = "name unknown";
+ }
+
+ const char* description = NULL;
+ err_result = cuGetErrorString(err, &description);
+ if (err_result != CUDA_SUCCESS || description == NULL) {
+ description = "description unknown";
+ }
+
+ ArrowErrorSet(error, "[%s][%s] %s", op, name, description);
+}
+
+#define _NANOARROW_CUDA_RETURN_NOT_OK_IMPL(NAME, EXPR, OP, ERROR) \
+ do { \
+ CUresult NAME = (EXPR); \
+ if (NAME != CUDA_SUCCESS) { \
+ ArrowDeviceCudaSetError(NAME, OP, ERROR); \
+ return EIO; \
+ } \
+ } while (0)
+
+#define NANOARROW_CUDA_RETURN_NOT_OK(EXPR, OP, ERROR)
\
+ _NANOARROW_CUDA_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(cuda_err_,
__COUNTER__), EXPR, \
+ OP, ERROR)
+
+#if defined(NANOARROW_DEBUG)
+#define _NANOARROW_CUDA_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \
+ do { \
+ const CUresult NAME = (EXPR); \
+ if (NAME != CUDA_SUCCESS) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \
+ } while (0)
+#define NANOARROW_CUDA_ASSERT_OK(EXPR)
\
+ _NANOARROW_CUDA_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_,
__COUNTER__), EXPR, \
+ #EXPR)
+#else
+#define NANOARROW_CUDA_ASSERT_OK(EXPR) (void)(EXPR)
+#endif
+
+struct ArrowDeviceCudaPrivate {
+ CUdevice cu_device;
+ CUcontext cu_context;
+};
+
struct ArrowDeviceCudaAllocatorPrivate {
ArrowDeviceType device_type;
int64_t device_id;
@@ -32,67 +84,63 @@ static void ArrowDeviceCudaDeallocator(struct
ArrowBufferAllocator* allocator,
struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
(struct ArrowDeviceCudaAllocatorPrivate*)allocator->private_data;
- int prev_device = 0;
- // Not ideal: we have no place to communicate any errors here
- cudaGetDevice(&prev_device);
- cudaSetDevice((int)allocator_private->device_id);
-
switch (allocator_private->device_type) {
case ARROW_DEVICE_CUDA:
- cudaFree(allocator_private->allocated_ptr);
+ cuMemFree((CUdeviceptr)allocator_private->allocated_ptr);
break;
case ARROW_DEVICE_CUDA_HOST:
- cudaFreeHost(allocator_private->allocated_ptr);
+
NANOARROW_CUDA_ASSERT_OK(cuMemFreeHost(allocator_private->allocated_ptr));
break;
default:
break;
}
- cudaSetDevice(prev_device);
ArrowFree(allocator_private);
}
static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowDevice* device,
struct ArrowBuffer* buffer,
int64_t size_bytes) {
- int prev_device = 0;
- cudaError_t result = cudaGetDevice(&prev_device);
- if (result != cudaSuccess) {
- return EINVAL;
- }
+ struct ArrowDeviceCudaPrivate* private_data =
+ (struct ArrowDeviceCudaPrivate*)device->private_data;
- result = cudaSetDevice((int)device->device_id);
- if (result != cudaSuccess) {
- cudaSetDevice(prev_device);
- return EINVAL;
- }
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(private_data->cu_context),
+ "cuCtxPushCurrent", NULL);
+ CUcontext unused; // needed for cuCtxPopCurrent()
struct ArrowDeviceCudaAllocatorPrivate* allocator_private =
(struct ArrowDeviceCudaAllocatorPrivate*)ArrowMalloc(
sizeof(struct ArrowDeviceCudaAllocatorPrivate));
if (allocator_private == NULL) {
- cudaSetDevice(prev_device);
+ NANOARROW_CUDA_ASSERT_OK(cuCtxPopCurrent(&unused));
return ENOMEM;
}
+ CUresult err;
void* ptr = NULL;
+ const char* op = "";
switch (device->device_type) {
- case ARROW_DEVICE_CUDA:
- result = cudaMalloc(&ptr, (int64_t)size_bytes);
+ case ARROW_DEVICE_CUDA: {
+ CUdeviceptr dptr = 0;
+ err = cuMemAlloc(&dptr, (size_t)size_bytes);
+ ptr = (void*)dptr;
+ op = "cuMemAlloc";
break;
+ }
case ARROW_DEVICE_CUDA_HOST:
- result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+ err = cuMemAllocHost(&ptr, (size_t)size_bytes);
+ op = "cuMemAllocHost";
break;
default:
+ cuCtxPopCurrent(&unused);
ArrowFree(allocator_private);
- cudaSetDevice(prev_device);
return EINVAL;
}
- if (result != cudaSuccess) {
+ if (err != CUDA_SUCCESS) {
+ NANOARROW_CUDA_ASSERT_OK(cuCtxPopCurrent(&unused));
ArrowFree(allocator_private);
- cudaSetDevice(prev_device);
- return ENOMEM;
+ return EIO;
}
allocator_private->device_id = device->device_id;
@@ -105,19 +153,23 @@ static ArrowErrorCode
ArrowDeviceCudaAllocateBuffer(struct ArrowDevice* device,
buffer->allocator =
ArrowBufferDeallocator(&ArrowDeviceCudaDeallocator, allocator_private);
- cudaSetDevice(prev_device);
+ NANOARROW_CUDA_ASSERT_OK(cuCtxPopCurrent(&unused));
return NANOARROW_OK;
}
struct ArrowDeviceCudaArrayPrivate {
struct ArrowArray parent;
- cudaEvent_t sync_event;
+ CUevent cu_event;
};
static void ArrowDeviceCudaArrayRelease(struct ArrowArray* array) {
struct ArrowDeviceCudaArrayPrivate* private_data =
(struct ArrowDeviceCudaArrayPrivate*)array->private_data;
- cudaEventDestroy(private_data->sync_event);
+
+ if (private_data->cu_event != NULL) {
+ NANOARROW_CUDA_ASSERT_OK(cuEventDestroy(private_data->cu_event));
+ }
+
ArrowArrayRelease(&private_data->parent);
ArrowFree(private_data);
array->release = NULL;
@@ -125,7 +177,13 @@ static void ArrowDeviceCudaArrayRelease(struct ArrowArray*
array) {
static ArrowErrorCode ArrowDeviceCudaArrayInit(struct ArrowDevice* device,
struct ArrowDeviceArray*
device_array,
- struct ArrowArray* array) {
+ struct ArrowArray* array,
+ void* sync_event) {
+ struct ArrowDeviceCudaPrivate* device_private =
+ (struct ArrowDeviceCudaPrivate*)device->private_data;
+ // One can create an event with cuEventCreate(&cu_event, CU_EVENT_DEFAULT);
+ // Requires cuCtxPushCurrent() + cuEventCreate() + cuCtxPopCurrent()
+
struct ArrowDeviceCudaArrayPrivate* private_data =
(struct ArrowDeviceCudaArrayPrivate*)ArrowMalloc(
sizeof(struct ArrowDeviceCudaArrayPrivate));
@@ -133,26 +191,6 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct
ArrowDevice* device,
return ENOMEM;
}
- int prev_device = 0;
- cudaError_t result = cudaGetDevice(&prev_device);
- if (result != cudaSuccess) {
- ArrowFree(private_data);
- return EINVAL;
- }
-
- result = cudaSetDevice((int)device->device_id);
- if (result != cudaSuccess) {
- cudaSetDevice(prev_device);
- ArrowFree(private_data);
- return EINVAL;
- }
-
- cudaError_t error = cudaEventCreate(&private_data->sync_event);
- if (error != cudaSuccess) {
- ArrowFree(private_data);
- return EINVAL;
- }
-
memset(device_array, 0, sizeof(struct ArrowDeviceArray));
device_array->array = *array;
device_array->array.private_data = private_data;
@@ -161,72 +199,99 @@ static ArrowErrorCode ArrowDeviceCudaArrayInit(struct
ArrowDevice* device,
device_array->device_id = device->device_id;
device_array->device_type = device->device_type;
- device_array->sync_event = &private_data->sync_event;
- cudaSetDevice(prev_device);
+ if (sync_event != NULL) {
+ private_data->cu_event = *((CUevent*)sync_event);
+ device_array->sync_event = sync_event;
+ } else {
+ private_data->cu_event = NULL;
+ device_array->sync_event = NULL;
+ }
+
return NANOARROW_OK;
}
// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there
is
// no good way to incorporate that just yet
-static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
- struct ArrowBufferView src,
- struct ArrowDevice* device_dst,
- struct ArrowBuffer* dst) {
- struct ArrowBuffer tmp;
- enum cudaMemcpyKind memcpy_kind;
+static ArrowErrorCode ArrowDeviceCudaBufferCopyInternal(struct ArrowDevice*
device_src,
+ struct ArrowBufferView
src,
+ struct ArrowDevice*
device_dst,
+ struct ArrowBufferView
dst,
+ int* n_pop_context,
+ struct ArrowError*
error) {
+ // Note: the device_src/sync event must be synchronized before calling these
methods,
+ // even though the cuMemcpyXXX() functions may do this automatically in some
cases.
if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToDevice;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyHtoD((CUdeviceptr)dst.data.data, src.data.data,
(size_t)src.size_bytes),
+ "cuMemcpyHtoD", error);
+
+ } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+ device_dst->device_type == ARROW_DEVICE_CUDA &&
+ device_src->device_id == device_dst->device_id) {
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(dst_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoD((CUdeviceptr)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
        "cuMemcpyDtoD", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CUDA) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyDeviceToDevice;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+ struct ArrowDeviceCudaPrivate* dst_private =
+ (struct ArrowDeviceCudaPrivate*)device_dst->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyPeer((CUdeviceptr)dst.data.data, dst_private->cu_context,
+ (CUdeviceptr)src.data.data, src_private->cu_context,
+ (size_t)src.size_bytes),
+ "cuMemcpyPeer", error);
} else if (device_src->device_type == ARROW_DEVICE_CUDA &&
device_dst->device_type == ARROW_DEVICE_CPU) {
- ArrowBufferInit(&tmp);
- NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
- tmp.size_bytes = src.size_bytes;
- memcpy_kind = cudaMemcpyDeviceToHost;
+ struct ArrowDeviceCudaPrivate* src_private =
+ (struct ArrowDeviceCudaPrivate*)device_src->private_data;
+
+ NANOARROW_CUDA_RETURN_NOT_OK(cuCtxPushCurrent(src_private->cu_context),
+ "cuCtxPushCurrent", error);
+ (*n_pop_context)++;
+ NANOARROW_CUDA_RETURN_NOT_OK(
+ cuMemcpyDtoH((void*)dst.data.data, (CUdeviceptr)src.data.data,
+ (size_t)src.size_bytes),
+ "cuMemcpyDtoH", error);
} else if (device_src->device_type == ARROW_DEVICE_CPU &&
device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToHost;
+ memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
} else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- NANOARROW_RETURN_NOT_OK(
- ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
- memcpy_kind = cudaMemcpyHostToHost;
+ memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
} else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
device_dst->device_type == ARROW_DEVICE_CPU) {
- ArrowBufferInit(&tmp);
- NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
- tmp.size_bytes = src.size_bytes;
- memcpy_kind = cudaMemcpyHostToHost;
+ memcpy((void*)dst.data.data, src.data.data, (size_t)src.size_bytes);
} else {
return ENOTSUP;
}
- cudaError_t result =
- cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
memcpy_kind);
- if (result != cudaSuccess) {
- ArrowBufferReset(&tmp);
- return EINVAL;
- }
-
- ArrowBufferMove(&tmp, dst);
return NANOARROW_OK;
}
@@ -234,35 +299,49 @@ static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct
ArrowDevice* device_src,
struct ArrowBufferView src,
struct ArrowDevice* device_dst,
struct ArrowBufferView dst) {
- enum cudaMemcpyKind memcpy_kind;
+ int n_pop_context = 0;
+ struct ArrowError error;
+
+ int result = ArrowDeviceCudaBufferCopyInternal(device_src, src, device_dst,
dst,
+ &n_pop_context, &error);
+ for (int i = 0; i < n_pop_context; i++) {
+ CUcontext unused;
+ NANOARROW_CUDA_ASSERT_OK(cuCtxPopCurrent(&unused));
+ }
- if (device_src->device_type == ARROW_DEVICE_CPU &&
- device_dst->device_type == ARROW_DEVICE_CUDA) {
- memcpy_kind = cudaMemcpyHostToDevice;
- } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
- device_dst->device_type == ARROW_DEVICE_CUDA) {
- memcpy_kind = cudaMemcpyDeviceToDevice;
- } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
- device_dst->device_type == ARROW_DEVICE_CPU) {
- memcpy_kind = cudaMemcpyDeviceToHost;
- } else if (device_src->device_type == ARROW_DEVICE_CPU &&
- device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- memcpy_kind = cudaMemcpyHostToHost;
- } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
- device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
- memcpy_kind = cudaMemcpyHostToHost;
- } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
- device_dst->device_type == ARROW_DEVICE_CPU) {
- memcpy_kind = cudaMemcpyHostToHost;
- } else {
- return ENOTSUP;
+ return result;
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+ struct ArrowBufferView src,
+ struct ArrowDevice* device_dst,
+ struct ArrowBuffer* dst) {
+ struct ArrowBuffer tmp;
+
+ switch (device_dst->device_type) {
+ case ARROW_DEVICE_CUDA:
+ case ARROW_DEVICE_CUDA_HOST:
+ NANOARROW_RETURN_NOT_OK(
+ ArrowDeviceCudaAllocateBuffer(device_dst, &tmp, src.size_bytes));
+ break;
+ case ARROW_DEVICE_CPU:
+ ArrowBufferInit(&tmp);
+ NANOARROW_RETURN_NOT_OK(ArrowBufferResize(&tmp, src.size_bytes, 0));
+ break;
+ default:
+ return ENOTSUP;
}
- cudaError_t result = cudaMemcpy((void*)dst.data.as_uint8, src.data.as_uint8,
- dst.size_bytes, memcpy_kind);
- if (result != cudaSuccess) {
- return EINVAL;
+ struct ArrowBufferView tmp_view;
+ tmp_view.data.data = tmp.data;
+ tmp_view.size_bytes = tmp.size_bytes;
+ int result = ArrowDeviceCudaBufferCopy(device_src, src, device_dst,
tmp_view);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&tmp);
+ return result;
}
+
+ ArrowBufferMove(&tmp, dst);
return NANOARROW_OK;
}
@@ -279,13 +358,9 @@ static ArrowErrorCode ArrowDeviceCudaSynchronize(struct
ArrowDevice* device,
}
// Memory for cuda_event is owned by the ArrowArray member of the
ArrowDeviceArray
- cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
- cudaError_t result = cudaEventSynchronize(*cuda_event);
-
- if (result != cudaSuccess) {
- ArrowErrorSet(error, "cudaEventSynchronize() failed: %s",
cudaGetErrorString(result));
- return EINVAL;
- }
+ CUevent* cuda_event = (CUevent*)sync_event;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuEventSynchronize(*cuda_event),
"cuEventSynchronize",
+ error);
return NANOARROW_OK;
}
@@ -318,7 +393,11 @@ static ArrowErrorCode ArrowDeviceCudaArrayMove(struct
ArrowDevice* device_src,
}
static void ArrowDeviceCudaRelease(struct ArrowDevice* device) {
- // No private_data to release
+ struct ArrowDeviceCudaPrivate* private_data =
+ (struct ArrowDeviceCudaPrivate*)device->private_data;
+ NANOARROW_CUDA_ASSERT_OK(cuDevicePrimaryCtxRelease(private_data->cu_device));
+ ArrowFree(device->private_data);
+ device->release = NULL;
}
static ArrowErrorCode ArrowDeviceCudaInitDevice(struct ArrowDevice* device,
@@ -334,16 +413,19 @@ static ArrowErrorCode ArrowDeviceCudaInitDevice(struct
ArrowDevice* device,
return EINVAL;
}
- int n_devices;
- cudaError_t result = cudaGetDeviceCount(&n_devices);
- if (result != cudaSuccess) {
- ArrowErrorSet(error, "cudaGetDeviceCount() failed: %s",
cudaGetErrorString(result));
- return EINVAL;
- }
+ CUdevice cu_device;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuDeviceGet(&cu_device, device_id),
"cuDeviceGet", error);
- if (device_id < 0 || device_id >= n_devices) {
- ArrowErrorSet(error, "CUDA device_id must be between 0 and %d", n_devices
- 1);
- return EINVAL;
+ CUcontext cu_context;
+ NANOARROW_CUDA_RETURN_NOT_OK(cuDevicePrimaryCtxRetain(&cu_context,
cu_device),
+ "cuDevicePrimaryCtxRetain", error);
+
+ struct ArrowDeviceCudaPrivate* private_data =
+ (struct ArrowDeviceCudaPrivate*)ArrowMalloc(sizeof(struct
ArrowDeviceCudaPrivate));
+ if (private_data == NULL) {
+ NANOARROW_CUDA_ASSERT_OK(cuDevicePrimaryCtxRelease(cu_device));
+ ArrowErrorSet(error, "out of memory");
+ return ENOMEM;
}
device->device_type = device_type;
@@ -355,36 +437,73 @@ static ArrowErrorCode ArrowDeviceCudaInitDevice(struct
ArrowDevice* device,
device->buffer_copy = &ArrowDeviceCudaBufferCopy;
device->synchronize_event = &ArrowDeviceCudaSynchronize;
device->release = &ArrowDeviceCudaRelease;
- device->private_data = NULL;
+
+ private_data->cu_device = cu_device;
+ private_data->cu_context = cu_context;
+ device->private_data = private_data;
return NANOARROW_OK;
}
struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t
device_id) {
+ CUresult err;
int n_devices;
- cudaError_t result = cudaGetDeviceCount(&n_devices);
- if (result != cudaSuccess) {
- return NULL;
- }
+
static struct ArrowDevice* devices_singleton = NULL;
if (devices_singleton == NULL) {
+ err = cuInit(0);
+ if (err != CUDA_SUCCESS) {
+ return NULL;
+ }
+
+ err = cuDeviceGetCount(&n_devices);
+ if (err != CUDA_SUCCESS) {
+ return NULL;
+ }
+
+ if (n_devices == 0) {
+ return NULL;
+ }
+
devices_singleton =
(struct ArrowDevice*)ArrowMalloc(2 * n_devices * sizeof(struct
ArrowDevice));
+ if (devices_singleton == NULL) {
+ return NULL;
+ }
+
+ int result = NANOARROW_OK;
+ memset(devices_singleton, 0, 2 * n_devices * sizeof(struct ArrowDevice));
for (int i = 0; i < n_devices; i++) {
- int result =
+ result =
ArrowDeviceCudaInitDevice(devices_singleton + i, ARROW_DEVICE_CUDA,
i, NULL);
if (result != NANOARROW_OK) {
- ArrowFree(devices_singleton);
- devices_singleton = NULL;
+ break;
}
result = ArrowDeviceCudaInitDevice(devices_singleton + n_devices + i,
ARROW_DEVICE_CUDA_HOST, i, NULL);
if (result != NANOARROW_OK) {
- ArrowFree(devices_singleton);
- devices_singleton = NULL;
+ break;
+ }
+ }
+
+ if (result != NANOARROW_OK) {
+ for (int i = 0; i < n_devices; i++) {
+ if (devices_singleton[i].release != NULL) {
+ devices_singleton[i].release(&(devices_singleton[i]));
+ }
}
+
+ ArrowFree(devices_singleton);
+ devices_singleton = NULL;
+ return NULL;
+ }
+
+ } else {
+ err = cuDeviceGetCount(&n_devices);
+ if (err != CUDA_SUCCESS) {
+ return NULL;
}
}
diff --git
a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda_test.cc
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda_test.cc
index 9751b8eb..8d5e8f4b 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda_test.cc
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda_test.cc
@@ -17,12 +17,45 @@
#include <errno.h>
-#include <cuda_runtime_api.h>
+#include <cuda.h>
#include <gtest/gtest.h>
#include "nanoarrow_device.h"
#include "nanoarrow_device_cuda.h"
+class CudaTemporaryContext {
+ public:
+ CudaTemporaryContext(int device_id) : initialized_(false) {
+ CUresult err = cuDeviceGet(&device_, device_id);
+ if (err != CUDA_SUCCESS) {
+ return;
+ }
+
+ err = cuDevicePrimaryCtxRetain(&context_, device_);
+ if (err != CUDA_SUCCESS) {
+ return;
+ }
+
+ cuCtxPushCurrent(context_);
+ initialized_ = true;
+ }
+
+ bool valid() { return initialized_; }
+
+ ~CudaTemporaryContext() {
+ if (initialized_) {
+ CUcontext unused;
+ cuCtxPopCurrent(&unused);
+ cuDevicePrimaryCtxRelease(device_);
+ }
+ }
+
+ private:
+ bool initialized_;
+ CUdevice device_;
+ CUcontext context_;
+};
+
TEST(NanoarrowDeviceCuda, GetDevice) {
struct ArrowDevice* cuda = ArrowDeviceCuda(ARROW_DEVICE_CUDA, 0);
ASSERT_NE(cuda, nullptr);
@@ -40,6 +73,8 @@ TEST(NanoarrowDeviceCuda, GetDevice) {
TEST(NanoarrowDeviceCuda, DeviceCudaBufferInit) {
struct ArrowDevice* cpu = ArrowDeviceCpu();
struct ArrowDevice* gpu = ArrowDeviceCuda(ARROW_DEVICE_CUDA, 0);
+ ASSERT_NE(gpu, nullptr);
+
struct ArrowBuffer buffer_gpu;
struct ArrowBuffer buffer;
uint8_t data[] = {0x01, 0x02, 0x03, 0x04, 0x05};
@@ -69,6 +104,8 @@ TEST(NanoarrowDeviceCuda, DeviceCudaBufferInit) {
TEST(NanoarrowDeviceCuda, DeviceCudaHostBufferInit) {
struct ArrowDevice* cpu = ArrowDeviceCpu();
struct ArrowDevice* gpu = ArrowDeviceCuda(ARROW_DEVICE_CUDA_HOST, 0);
+ ASSERT_NE(gpu, nullptr);
+
struct ArrowBuffer buffer_gpu;
struct ArrowBuffer buffer;
uint8_t data[] = {0x01, 0x02, 0x03, 0x04, 0x05};
@@ -99,21 +136,26 @@ TEST(NanoarrowDeviceCuda, DeviceCudaHostBufferInit) {
TEST(NanoarrowDeviceCuda, DeviceCudaBufferCopy) {
struct ArrowDevice* cpu = ArrowDeviceCpu();
struct ArrowDevice* gpu = ArrowDeviceCuda(ARROW_DEVICE_CUDA, 0);
+ ASSERT_NE(gpu, nullptr);
+
uint8_t data[] = {0x01, 0x02, 0x03, 0x04, 0x05};
struct ArrowBufferView cpu_view = {data, sizeof(data)};
- void* gpu_dest;
- cudaError_t result = cudaMalloc(&gpu_dest, sizeof(data));
- struct ArrowBufferView gpu_view = {gpu_dest, sizeof(data)};
- if (result != cudaSuccess) {
- GTEST_FAIL() << "cudaMalloc(&gpu_dest) failed";
+ CudaTemporaryContext ctx(0);
+ ASSERT_TRUE(ctx.valid());
+
+ CUdeviceptr gpu_dest;
+ CUresult result = cuMemAlloc(&gpu_dest, sizeof(data));
+ struct ArrowBufferView gpu_view = {reinterpret_cast<void*>(gpu_dest),
sizeof(data)};
+ if (result != CUDA_SUCCESS) {
+ GTEST_FAIL() << "cuMemAlloc() failed";
}
- void* gpu_dest2;
- result = cudaMalloc(&gpu_dest2, sizeof(data));
- struct ArrowBufferView gpu_view2 = {gpu_dest2, sizeof(data)};
- if (result != cudaSuccess) {
- GTEST_FAIL() << "cudaMalloc(&gpu_dest2) failed";
+ CUdeviceptr gpu_dest2;
+ result = cuMemAlloc(&gpu_dest2, sizeof(data));
+  struct ArrowBufferView gpu_view2 = {reinterpret_cast<void*>(gpu_dest2),
sizeof(data)};
+ if (result != CUDA_SUCCESS) {
+ GTEST_FAIL() << "cuMemAlloc() failed";
}
// CPU -> GPU
@@ -131,14 +173,14 @@ TEST(NanoarrowDeviceCuda, DeviceCudaBufferCopy) {
EXPECT_EQ(memcmp(cpu_dest, data, sizeof(data)), 0);
// Clean up
- result = cudaFree(gpu_dest);
- if (result != cudaSuccess) {
- GTEST_FAIL() << "cudaFree(gpu_dest) failed";
+ result = cuMemFree(gpu_dest);
+ if (result != CUDA_SUCCESS) {
+ GTEST_FAIL() << "cuMemFree() failed";
}
- result = cudaFree(gpu_dest2);
- if (result != cudaSuccess) {
- GTEST_FAIL() << "cudaFree(gpu_dest2) failed";
+ result = cuMemFree(gpu_dest2);
+ if (result != CUDA_SUCCESS) {
+ GTEST_FAIL() << "cuMemFree() failed";
}
}
@@ -168,7 +210,7 @@ TEST_P(StringTypeParameterizedTestFixture,
ArrowDeviceCudaArrayViewString) {
ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
- ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array), NANOARROW_OK);
+ ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array, nullptr),
NANOARROW_OK);
ArrowDeviceArrayViewInit(&device_array_view);
ArrowArrayViewInitFromType(&device_array_view.array_view, string_type);
diff --git
a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc
index e456fd47..43f06489 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc
@@ -153,7 +153,8 @@ static void ArrowDeviceMetalArrayRelease(struct ArrowArray*
array) {
static ArrowErrorCode ArrowDeviceMetalArrayInit(struct ArrowDevice* device,
struct ArrowDeviceArray*
device_array,
- struct ArrowArray* array) {
+ struct ArrowArray* array,
+ void* sync_event) {
struct ArrowDeviceMetalArrayPrivate* private_data =
(struct ArrowDeviceMetalArrayPrivate*)ArrowMalloc(
sizeof(struct ArrowDeviceMetalArrayPrivate));
@@ -161,8 +162,8 @@ static ArrowErrorCode ArrowDeviceMetalArrayInit(struct
ArrowDevice* device,
return ENOMEM;
}
- auto mtl_device = reinterpret_cast<MTL::Device*>(device->private_data);
- private_data->event = mtl_device->newSharedEvent();
+ // One can create a new event with mtl_device->newSharedEvent();
+ private_data->event = sync_event;
memset(device_array, 0, sizeof(struct ArrowDeviceArray));
device_array->array = *array;
diff --git
a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal_test.cc
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal_test.cc
index 2579d2fc..30a5c7f4 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal_test.cc
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal_test.cc
@@ -228,7 +228,7 @@ TEST_P(StringTypeParameterizedTestFixture,
ArrowDeviceMetalArrayViewString) {
ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
- ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array), NANOARROW_OK);
+ ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array, nullptr),
NANOARROW_OK);
ArrowDeviceArrayViewInit(&device_array_view);
ArrowArrayViewInitFromType(&device_array_view.array_view, string_type);
diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_test.cc
b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_test.cc
index 8ed39a24..8a904559 100644
--- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_test.cc
+++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_test.cc
@@ -81,7 +81,7 @@ TEST_P(StringTypeParameterizedTestFixture,
ArrowDeviceCpuArrayViewString) {
ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
- ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array), NANOARROW_OK);
+ ASSERT_EQ(ArrowDeviceArrayInit(cpu, &device_array, &array, nullptr),
NANOARROW_OK);
ArrowDeviceArrayViewInit(&device_array_view);
ArrowArrayViewInitFromType(&device_array_view.array_view, string_type);