(tvm) branch main updated: [REFACTOR][CUDA] Phase out cuda_common.h (#19770)

tqchen Mon, 15 Jun 2026 06:59:15 -0700

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 5550fb33cd [REFACTOR][CUDA] Phase out cuda_common.h (#19770)
5550fb33cd is described below

commit 5550fb33cd4e80290e79f937b58532fd9597538a
Author: Tianqi Chen <[email protected]>
AuthorDate: Mon Jun 15 09:58:53 2026 -0400

    [REFACTOR][CUDA] Phase out cuda_common.h (#19770)
    
    ## Summary
    
    The CUDA runtime can rely on the shared tvm-ffi CUDA error helper
    instead of carrying a TVM-local common header. This keeps CUDA error
    handling aligned with the FFI CUDA support and removes the remaining
    reason for src/backend/cuda/runtime/cuda_common.h.
    
    Main changes:
    
    - Move CUDA workspace thread-local state into cuda_device_api.cc.
    - Replace CUDA_CALL use sites with TVM_FFI_CHECK_CUDA_ERROR from
    tvm/ffi/extra/cuda/base.h.
    - Inline the remaining CUDA driver checks at their call sites and delete
    cuda_common.h.
---
 src/backend/cuda/runtime/cuda_common.h             |  69 ----------
 src/backend/cuda/runtime/cuda_device_api.cc        | 140 ++++++++++++---------
 src/backend/cuda/runtime/cuda_module.cc            |  16 ++-
 .../extra/contrib/cublas/cublas_json_runtime.cc    |   4 +-
 src/runtime/extra/contrib/cublas/cublas_utils.cc   |   5 +-
 src/runtime/extra/contrib/cudnn/conv_backward.cc   |   4 +-
 src/runtime/extra/contrib/cudnn/conv_forward.cc    |   2 +-
 .../contrib/cudnn/cudnn_frontend/attention.cc      |   4 +-
 .../extra/contrib/cudnn/cudnn_json_runtime.cc      |   4 +-
 src/runtime/extra/contrib/cudnn/cudnn_utils.cc     |   2 +-
 src/runtime/extra/contrib/cudnn/cudnn_utils.h      |   3 +-
 src/runtime/extra/contrib/cudnn/softmax.cc         |   2 +-
 src/runtime/extra/contrib/curand/curand.cc         |   2 +-
 .../cutlass/fp16_group_gemm_runner_sm100.cuh       |   3 +-
 .../cutlass/fp16_group_gemm_runner_sm90.cuh        |   3 +-
 .../fp8_groupwise_scaled_gemm_runner_sm100.cuh     |   4 +-
 .../fp8_groupwise_scaled_gemm_runner_sm90.cuh      |   4 +-
 ...p8_groupwise_scaled_group_gemm_runner_sm100.cuh |   4 +-
 src/runtime/extra/contrib/cutlass/gemm_runner.cuh  |   3 +-
 src/runtime/extra/contrib/nvshmem/dist_gemm.cu     |   7 +-
 src/runtime/extra/contrib/nvshmem/init.cc          |   7 +-
 .../extra/contrib/nvshmem/memory_allocator.cc      |   2 +-
 .../extra/contrib/tensorrt/tensorrt_calibrator.h   |  19 +--
 src/runtime/extra/contrib/thrust/thrust.cu         |   5 +-
 .../extra/disco/cuda_ipc/cuda_ipc_memory.cc        |  39 +++---
 src/runtime/extra/disco/nccl/nccl_context.h        |  17 ++-
 src/runtime/vm/cuda/cuda_graph_builtin.cc          |  18 +--
 tests/python/relax/test_vm_builtin.py              |   2 +-
 28 files changed, 182 insertions(+), 212 deletions(-)

diff --git a/src/backend/cuda/runtime/cuda_common.h 
b/src/backend/cuda/runtime/cuda_common.h
deleted file mode 100644
index 183a2e8702..0000000000
--- a/src/backend/cuda/runtime/cuda_common.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda_common.h
- * \brief Common utilities for CUDA
- */
-#ifndef TVM_RUNTIME_CUDA_CUDA_COMMON_H_
-#define TVM_RUNTIME_CUDA_CUDA_COMMON_H_
-
-#include <cuda_runtime.h>
-#include <tvm/ffi/function.h>
-
-#include <string>
-
-#include "../../../runtime/workspace_pool.h"
-
-namespace tvm {
-namespace runtime {
-
-#define CUDA_DRIVER_CALL(x)                                             \
-  {                                                                     \
-    CUresult result = x;                                                \
-    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
-      const char* msg;                                                  \
-      cuGetErrorName(result, &msg);                                     \
-      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
-    }                                                                   \
-  }
-
-#ifndef CUDA_CALL
-#define CUDA_CALL(func)                                               \
-  {                                                                   \
-    cudaError_t e = (func);                                           \
-    TVM_FFI_ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-        << "CUDA: " << cudaGetErrorString(e);                         \
-  }
-#endif
-
-/*! \brief Thread local workspace */
-class CUDAThreadEntry {
- public:
-  /*! \brief thread local pool*/
-  WorkspacePool pool;
-  /*! \brief constructor */
-  CUDAThreadEntry();
-  // get the threadlocal workspace
-  static CUDAThreadEntry* ThreadLocal();
-};
-
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CUDA_CUDA_COMMON_H_
diff --git a/src/backend/cuda/runtime/cuda_device_api.cc 
b/src/backend/cuda/runtime/cuda_device_api.cc
index 969f40a081..68ae39de56 100644
--- a/src/backend/cuda/runtime/cuda_device_api.cc
+++ b/src/backend/cuda/runtime/cuda_device_api.cc
@@ -24,6 +24,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/device_api.h>
@@ -32,14 +33,26 @@
 
 #include <cstring>
 
-#include "cuda_common.h"
+#include "../../../runtime/workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
 
+#ifndef CUDA_DRIVER_CALL
+#define CUDA_DRIVER_CALL(x)                                             \
+  {                                                                     \
+    CUresult result = x;                                                \
+    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+      const char* msg;                                                  \
+      cuGetErrorName(result, &msg);                                     \
+      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
+    }                                                                   \
+  }
+#endif
+
 class CUDADeviceAPI final : public DeviceAPI {
  public:
-  void SetDevice(Device dev) final { CUDA_CALL(cudaSetDevice(dev.device_id)); }
+  void SetDevice(Device dev) final { 
TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); }
   void GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) final {
     int value = 0;
     switch (kind) {
@@ -50,23 +63,27 @@ class CUDADeviceAPI final : public DeviceAPI {
         break;
       }
       case kMaxThreadsPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, 
cudaDevAttrMaxThreadsPerBlock, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, 
dev.device_id));
         break;
       }
       case kWarpSize: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, 
dev.device_id));
         break;
       }
       case kMaxSharedMemoryPerBlock: {
-        CUDA_CALL(
+        TVM_FFI_CHECK_CUDA_ERROR(
             cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, 
dev.device_id));
         break;
       }
       case kComputeVersion: {
         std::ostringstream os;
-        CUDA_CALL(cudaDeviceGetAttribute(&value, 
cudaDevAttrComputeCapabilityMajor, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, 
dev.device_id));
         os << value << ".";
-        CUDA_CALL(cudaDeviceGetAttribute(&value, 
cudaDevAttrComputeCapabilityMinor, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, 
dev.device_id));
         os << value;
         *rv = os.str();
         return;
@@ -79,18 +96,23 @@ class CUDADeviceAPI final : public DeviceAPI {
         return;
       }
       case kMaxClockRate: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, 
dev.device_id));
         break;
       }
       case kMultiProcessorCount: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, 
cudaDevAttrMultiProcessorCount, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, 
dev.device_id));
         break;
       }
       case kMaxThreadDimensions: {
         int dims[3];
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, 
dev.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, 
dev.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, 
dev.device_id));
 
         std::stringstream ss;  // use json string to return multiple int 
values;
         ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
@@ -98,7 +120,8 @@ class CUDADeviceAPI final : public DeviceAPI {
         return;
       }
       case kMaxRegistersPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, 
cudaDevAttrMaxRegistersPerBlock, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, 
dev.device_id));
         break;
       }
       case kGcnArch:
@@ -112,20 +135,21 @@ class CUDADeviceAPI final : public DeviceAPI {
       case kL2CacheSizeBytes: {
         // Get size of device l2 cache size in bytes.
         int l2_size = 0;
-        CUDA_CALL(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, 
dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, 
dev.device_id));
         *rv = l2_size;
         return;
       }
       case kTotalGlobalMemory: {
         cudaDeviceProp prop;
-        CUDA_CALL(cudaGetDeviceProperties(&prop, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 
dev.device_id));
         int64_t total_global_memory = prop.totalGlobalMem;
         *rv = total_global_memory;
         return;
       }
       case kAvailableGlobalMemory: {
         size_t free_mem, total_mem;
-        CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
         *rv = static_cast<int64_t>(free_mem);
         return;
       }
@@ -139,14 +163,14 @@ class CUDADeviceAPI final : public DeviceAPI {
     void* ret;
     if (dev.device_type == kDLCUDAHost) {
       VLOG(1) << "allocating " << nbytes << "bytes on host";
-      CUDA_CALL(cudaMallocHost(&ret, nbytes));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMallocHost(&ret, nbytes));
     } else {
-      CUDA_CALL(cudaSetDevice(dev.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
       size_t free_mem, total_mem;
-      CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
       VLOG(1) << "allocating " << nbytes << " bytes on device, with " << 
free_mem
               << " bytes currently free out of " << total_mem << " bytes 
available";
-      CUDA_CALL(cudaMalloc(&ret, nbytes));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ret, nbytes));
     }
     return ret;
   }
@@ -172,11 +196,11 @@ class CUDADeviceAPI final : public DeviceAPI {
 
     if (dev.device_type == kDLCUDAHost) {
       VLOG(1) << "freeing host memory";
-      CUDA_CALL(cudaFreeHost(ptr));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFreeHost(ptr));
     } else {
-      CUDA_CALL(cudaSetDevice(dev.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
       VLOG(1) << "freeing device memory";
-      CUDA_CALL(cudaFree(ptr));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFree(ptr));
     }
   }
 
@@ -203,17 +227,17 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
 
     if (dev_from.device_type == kDLCUDA && dev_to.device_type == kDLCUDA) {
-      CUDA_CALL(cudaSetDevice(dev_from.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id));
       if (dev_from.device_id == dev_to.device_id) {
         GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
       } else {
         cudaMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, 
size, cu_stream);
       }
     } else if (dev_from.device_type == kDLCUDA && dev_to.device_type == 
kDLCPU) {
-      CUDA_CALL(cudaSetDevice(dev_from.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id));
       GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
     } else if (dev_from.device_type == kDLCPU && dev_to.device_type == 
kDLCUDA) {
-      CUDA_CALL(cudaSetDevice(dev_to.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_to.device_id));
       GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
     } else {
       TVM_FFI_THROW(InternalError) << "expect copy from/to GPU or between GPU";
@@ -222,40 +246,40 @@ class CUDADeviceAPI final : public DeviceAPI {
 
  public:
   TVMStreamHandle CreateStream(Device dev) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t retval;
-    CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, 
cudaStreamNonBlocking));
     return static_cast<TVMStreamHandle>(retval);
   }
 
   void FreeStream(Device dev, TVMStreamHandle stream) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
-    CUDA_CALL(cudaStreamDestroy(cu_stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(cu_stream));
   }
 
   void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle 
event_dst) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
     cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
     cudaEvent_t evt;
-    CUDA_CALL(cudaEventCreate(&evt));
-    CUDA_CALL(cudaEventRecord(evt, src_stream));
-    CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
-    CUDA_CALL(cudaEventDestroy(evt));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&evt));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(evt, src_stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamWaitEvent(dst_stream, evt, 0));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(evt));
   }
 
   void StreamSync(Device dev, TVMStreamHandle stream) final {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
-    CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
+    
TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
   }
 
   void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final {
-    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size);
+    return ThreadLocalWorkspacePool()->AllocWorkspace(dev, size);
   }
 
   void FreeWorkspace(Device dev, void* data) final {
-    CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data);
+    ThreadLocalWorkspacePool()->FreeWorkspace(dev, data);
   }
 
   bool SupportsDevicePointerArithmeticsOnHost() final { return true; }
@@ -268,17 +292,17 @@ class CUDADeviceAPI final : public DeviceAPI {
   }
 
  private:
+  static WorkspacePool* ThreadLocalWorkspacePool();
+
   static void GPUCopy(const void* from, void* to, size_t size, cudaMemcpyKind 
kind,
                       cudaStream_t stream) {
-    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpyAsync(to, from, size, kind, stream));
   }
 };
 
-CUDAThreadEntry::CUDAThreadEntry() : pool(kDLCUDA, CUDADeviceAPI::Global()) {}
-
-CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
-  static thread_local CUDAThreadEntry inst;
-  return &inst;
+WorkspacePool* CUDADeviceAPI::ThreadLocalWorkspacePool() {
+  static thread_local WorkspacePool pool(kDLCUDA, CUDADeviceAPI::Global());
+  return &pool;
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -301,28 +325,28 @@ class CUDATimerNode : public TimerNode {
     // This initial cudaEventRecord is sometimes pretty slow (~100us). Does
     // cudaEventRecord do some stream synchronization?
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     stream_ = TVMFFIEnvGetStream(kDLCUDA, device_id);
-    CUDA_CALL(cudaEventRecord(start_, static_cast<cudaStream_t>(stream_)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(start_, 
static_cast<cudaStream_t>(stream_)));
   }
   virtual void Stop() {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
-    CUDA_CALL(cudaEventRecord(stop_, static_cast<cudaStream_t>(stream_)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(stop_, 
static_cast<cudaStream_t>(stream_)));
   }
   virtual int64_t SyncAndGetElapsedNanos() {
-    CUDA_CALL(cudaEventSynchronize(stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventSynchronize(stop_));
     float milliseconds = 0;
-    CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start_, 
stop_));
     return milliseconds * 1e6;
   }
   virtual ~CUDATimerNode() {
-    CUDA_CALL(cudaEventDestroy(start_));
-    CUDA_CALL(cudaEventDestroy(stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(start_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(stop_));
   }
   CUDATimerNode() {
-    CUDA_CALL(cudaEventCreate(&start_));
-    CUDA_CALL(cudaEventCreate(&stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&start_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&stop_));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.cuda.CUDATimerNode", 
CUDATimerNode, TimerNode);
 
@@ -340,7 +364,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 TVM_RUNTIME_DLL ffi::String GetCudaFreeMemory() {
   size_t free_mem, total_mem;
-  CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
   std::stringstream ss;
   ss << "Current CUDA memory is " << free_mem << " bytes free out of " << 
total_mem
      << " bytes on device";
@@ -355,14 +379,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         // TODO(tvm-team): remove once confirms all dep such as flashinfer
         // migrated to TVMFFIEnvGetStream
         int device_id;
-        CUDA_CALL(cudaGetDevice(&device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
         return static_cast<void*>(TVMFFIEnvGetStream(kDLCUDA, device_id));
       });
 }
 
 TVM_RUNTIME_DLL int GetCudaDeviceCount() {
   int count;
-  CUDA_CALL(cudaGetDeviceCount(&count));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceCount(&count));
   return count;
 }
 
diff --git a/src/backend/cuda/runtime/cuda_module.cc 
b/src/backend/cuda/runtime/cuda_module.cc
index 8ea734da69..5a598c87cc 100644
--- a/src/backend/cuda/runtime/cuda_module.cc
+++ b/src/backend/cuda/runtime/cuda_module.cc
@@ -28,6 +28,7 @@
 #include <cuda_runtime.h>
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/extra/module.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
@@ -41,11 +42,22 @@
 #include "../../../runtime/pack_args.h"
 #include "../../../runtime/thread_storage_scope.h"
 #include "../../../support/bytes_io.h"
-#include "cuda_common.h"
 
 namespace tvm {
 namespace runtime {
 
+#ifndef CUDA_DRIVER_CALL
+#define CUDA_DRIVER_CALL(x)                                             \
+  {                                                                     \
+    CUresult result = x;                                                \
+    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+      const char* msg;                                                  \
+      cuGetErrorName(result, &msg);                                     \
+      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
+    }                                                                   \
+  }
+#endif
+
 // Maximum number of GPU supported in CUDAModule (file-local).
 static constexpr const int kMaxNumGPUs = 32;
 
@@ -204,7 +216,7 @@ class CUDAWrappedFunc {
   // invoke the function with void arguments
   void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     ThreadWorkLoad wl = launch_param_config_.Extract(args);
 
     if (fcache_[device_id] == nullptr) {
diff --git a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc 
b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
index 6520753e11..34ba853505 100644
--- a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
+++ b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
@@ -24,6 +24,7 @@
 
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/tensor.h>
@@ -32,7 +33,6 @@
 #include <string>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../json/json_node.h"
 #include "../json/json_runtime.h"
 #include "cublas_utils.h"
@@ -89,7 +89,7 @@ class CublasJSONRuntime : public JSONRuntimeBase {
     }
 
     if (device_id == -1) {
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     }
     auto* entry_ptr = 
tvm::contrib::CuBlasLtThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
     cudaStream_t stream = 
static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id));
diff --git a/src/runtime/extra/contrib/cublas/cublas_utils.cc 
b/src/runtime/extra/contrib/cublas/cublas_utils.cc
index b8c239f13a..dcd115026c 100644
--- a/src/runtime/extra/contrib/cublas/cublas_utils.cc
+++ b/src/runtime/extra/contrib/cublas/cublas_utils.cc
@@ -23,10 +23,9 @@
 #include "cublas_utils.h"
 
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace contrib {
 
@@ -51,7 +50,7 @@ CuBlasThreadEntry* CuBlasThreadEntry::ThreadLocal(DLDevice 
curr_device) {
 CuBlasLtThreadEntry::CuBlasLtThreadEntry() {
   CHECK_CUBLAS_ERROR(cublasLtCreate(&handle));
   CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&matmul_pref_desc));
-  CUDA_CALL(cudaMalloc(&workspace_ptr, workspace_size));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&workspace_ptr, workspace_size));
 }
 
 CuBlasLtThreadEntry::~CuBlasLtThreadEntry() {
diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc 
b/src/runtime/extra/contrib/cudnn/conv_backward.cc
index bfc65baaff..ca711f4e78 100644
--- a/src/runtime/extra/contrib/cudnn/conv_backward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc
@@ -67,7 +67,7 @@ void BackwardDataFindAlgo(int format, int dims, int groups, 
const int pad[], con
                           const int dx_dim[], const std::string& data_dtype,
                           const std::string& conv_dtype, bool verbose, 
ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = 
CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> dy_dim_int64(full_dims);
@@ -146,7 +146,7 @@ void BackwardFilterFindAlgo(int format, int dims, int 
groups, const int pad[], c
                             const int dw_dim[], const std::string& data_dtype,
                             const std::string& conv_dtype, bool verbose, 
ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = 
CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> x_dim_int64(full_dims);
diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc 
b/src/runtime/extra/contrib/cudnn/conv_forward.cc
index 6c6fd7eb40..befbddd788 100644
--- a/src/runtime/extra/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc
@@ -112,7 +112,7 @@ void FindAlgo(int format, int dims, int groups, const int 
pad[], const int strid
               const std::string& data_dtype, const std::string& conv_dtype, 
bool verbose,
               ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = 
CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> x_dim_int64(full_dims);
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc 
b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
index f31a38bd47..54d16b5a39 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
@@ -24,10 +24,10 @@
 
 #include "./attention.h"
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/runtime/device_api.h>
 
-#include "../../../../../backend/cuda/runtime/cuda_common.h"
 #include "../cudnn_utils.h"
 
 namespace tvm {
@@ -99,7 +99,7 @@ void CuDNNSDPARunnerNode::Init(int64_t batch, int64_t 
seq_len, int64_t num_heads
   TVM_FFI_ICHECK(stats == nullptr);
   o->set_output(true).set_dim({batch, num_heads, seq_len, 
head_size_v}).set_stride(o_stride);
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = 
CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   CUDNN_FRONTEND_CALL(graph_->build(entry_ptr->handle, 
{cudnn_frontend::HeurMode_t::A}));
 }
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc 
b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
index a7cf1ec2b3..5b3756fd79 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
@@ -103,7 +103,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase {
 
   std::function<void()> GetConv2DExec(const JSONGraphNode& node) {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     auto* entry_ptr = 
tvm::contrib::CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
     auto op_name = node.GetOpName();
 
@@ -164,7 +164,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase {
     int algo = best_algo.cast<int>();
     std::function<void()> op_exec = [=, this]() {
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
       cudaStream_t stream = 
static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id));
       CUDNN_CALL(cudnnSetStream(entry_ptr->handle, stream));
 
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc 
b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
index 5793febf01..5c34d4a2b0 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
@@ -270,7 +270,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("tvm.contrib.cudnn.exists", []() -> bool {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     return CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}, 
false)->exists();
   });
 }
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.h 
b/src/runtime/extra/contrib/cudnn/cudnn_utils.h
index 81849c0f0c..cef82134ad 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.h
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.h
@@ -26,12 +26,11 @@
 
 #include <cudnn.h>
 #include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/device_api.h>
 
 #include <string>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace contrib {
 
diff --git a/src/runtime/extra/contrib/cudnn/softmax.cc 
b/src/runtime/extra/contrib/cudnn/softmax.cc
index d494fb3349..f0562dcba0 100644
--- a/src/runtime/extra/contrib/cudnn/softmax.cc
+++ b/src/runtime/extra/contrib/cudnn/softmax.cc
@@ -41,7 +41,7 @@ void softmax_impl(cudnnSoftmaxAlgorithm_t alg, 
ffi::PackedArgs args, ffi::Any* r
   if (axis < 0) axis += ndim;
   TVM_FFI_ICHECK(axis >= 0 && axis < ndim);
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = 
CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   entry_ptr->softmax_entry.data_type = 
CuDNNDataType::DLTypeToCuDNNType(x->dtype);
 
diff --git a/src/runtime/extra/contrib/curand/curand.cc 
b/src/runtime/extra/contrib/curand/curand.cc
index 5f8ff07583..b2fc753865 100644
--- a/src/runtime/extra/contrib/curand/curand.cc
+++ b/src/runtime/extra/contrib/curand/curand.cc
@@ -17,11 +17,11 @@
  * under the License.
  */
 #include <curand.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/base.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "./helper_cuda_kernels.h"
 
 namespace tvm {
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh 
b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
index c30b34d0f4..0a3068c5d9 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh 
b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
index dcb26b9071..552b46b0a0 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
index 9c1962ad98..26eb35e34e 100644
--- 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
+++ 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -24,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
index 1955e47759..9307e33d7f 100644
--- 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
+++ 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -24,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
index a516f31d3c..01e9ca7b8a 100644
--- 
a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
+++ 
b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
@@ -17,14 +17,14 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh 
b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
index 58e1c9fbd0..174699bcec 100644
--- a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
+++ b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu 
b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
index d975e14038..26cc55a442 100644
--- a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
+++ b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
@@ -19,12 +19,11 @@
 #include <nvshmem.h>
 #include <nvshmemx.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/disco_worker.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace runtime {
 
@@ -62,7 +61,7 @@ TVMStreamHandle stream_create() {
   DiscoWorker* worker = ThreadLocalDiscoWorker::Get()->worker;
   TVM_FFI_ICHECK(worker != nullptr) << "NVSHMEM stream creation failed: worker 
is not initialized";
   cudaStream_t retval;
-  CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, 
cudaStreamNonBlocking));
   return static_cast<TVMStreamHandle>(retval);
 }
 
@@ -100,7 +99,7 @@ void transfer_to_peers_reduce_scatter(Tensor semaphore, 
Tensor gemm_out, Tensor
                    stream);
     } else {
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
       TVMStreamHandle main_stream = TVMFFIEnvGetStream(kDLCUDA, device_id);
       copy_to_peer(get_pointer(staging_buffer, ffi::Shape{my_rank, 0, 0}), 
to_rank,
                    get_pointer(gemm_out, ffi::Shape{to_rank * LOCAL_M, 0}), 
LOCAL_M * N * 2,
diff --git a/src/runtime/extra/contrib/nvshmem/init.cc 
b/src/runtime/extra/contrib/nvshmem/init.cc
index 698f3b6802..aeb71bb6a1 100644
--- a/src/runtime/extra/contrib/nvshmem/init.cc
+++ b/src/runtime/extra/contrib/nvshmem/init.cc
@@ -20,14 +20,13 @@
 #include <nvshmem.h>
 #include <nvshmemx.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/disco_worker.h>
 #include <tvm/runtime/logging.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace runtime {
 
@@ -66,7 +65,7 @@ void InitNVSHMEM(ffi::Shape uid_64, int num_workers, int 
worker_id_start) {
   nvshmemx_set_attr_uniqueid_args(worker_id, num_workers, &uid, &attr);
   nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
   int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
-  CUDA_CALL(cudaSetDevice(mype_node));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(mype_node));
   if (worker != nullptr) {
     if (worker->default_device.device_type == DLDeviceType::kDLCPU) {
       worker->default_device = Device{DLDeviceType::kDLCUDA, mype_node};
@@ -153,7 +152,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("runtime.nvshmem.cumodule_init", NVSHMEMXCumoduleInit)
       .def("runtime.disco.nvshmem.barrier_all_on_current_stream", []() {
         int device_id;
-        CUDA_CALL(cudaGetDevice(&device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
         TVMStreamHandle stream = TVMFFIEnvGetStream(kDLCUDA, device_id);
         NVSHMEMBarrierAllOnStream(stream);
       });
diff --git a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc 
b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
index ee9afd0eca..cb6e3520c8 100644
--- a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
@@ -18,13 +18,13 @@
  */
 #include <nvshmem.h>
 #include <nvshmemx.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/memory/memory_manager.h>
 
 #include <thread>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../memory/pooled_allocator.h"
 #include "../../disco/utils.h"
 
diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h 
b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
index 4d92afb234..408d50cc7e 100755
--- a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
+++ b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
@@ -23,10 +23,11 @@
 #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
 #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <string>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "NvInfer.h"
 
 namespace tvm {
@@ -46,7 +47,7 @@ class TensorRTCalibrator : public 
nvinfer1::IInt8EntropyCalibrator2 {
     }
     // Free buffers
     for (size_t i = 0; i < buffers_.size(); ++i) {
-      CUDA_CALL(cudaFree(buffers_[i]));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFree(buffers_[i]));
     }
   }
 
@@ -55,8 +56,9 @@ class TensorRTCalibrator : public 
nvinfer1::IInt8EntropyCalibrator2 {
     std::vector<float*> data_host(bindings.size(), nullptr);
     for (size_t i = 0; i < bindings.size(); ++i) {
       data_host[i] = new float[batch_size_ * binding_sizes[i]];
-      CUDA_CALL(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
-                           batch_size_ * binding_sizes[i] * sizeof(float), 
cudaMemcpyDeviceToHost));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(static_cast<void*>(data_host[i]), 
bindings[i],
+                                          batch_size_ * binding_sizes[i] * 
sizeof(float),
+                                          cudaMemcpyDeviceToHost));
     }
     data_.push_back(data_host);
     data_sizes_.push_back(binding_sizes);
@@ -73,9 +75,10 @@ class TensorRTCalibrator : public 
nvinfer1::IInt8EntropyCalibrator2 {
     TVM_FFI_ICHECK_EQ(input_names_.size(), nbBindings);
     for (size_t i = 0; i < input_names_.size(); ++i) {
       TVM_FFI_ICHECK_EQ(input_names_[i], names[i]);
-      CUDA_CALL(cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
-                           batch_size_ * 
data_sizes_[num_batches_calibrated_][i] * sizeof(float),
-                           cudaMemcpyHostToDevice));
+      TVM_FFI_CHECK_CUDA_ERROR(
+          cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
+                     batch_size_ * data_sizes_[num_batches_calibrated_][i] * 
sizeof(float),
+                     cudaMemcpyHostToDevice));
       bindings[i] = buffers_[i];
     }
     num_batches_calibrated_++;
@@ -120,7 +123,7 @@ class TensorRTCalibrator : public 
nvinfer1::IInt8EntropyCalibrator2 {
     const int num_inputs = data_sizes_[0].size();
     buffers_.assign(num_inputs, nullptr);
     for (int i = 0; i < num_inputs; ++i) {
-      CUDA_CALL(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float)));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&buffers_[i], data_sizes_[0][i] * 
sizeof(float)));
     }
   }
 };
diff --git a/src/runtime/extra/contrib/thrust/thrust.cu 
b/src/runtime/extra/contrib/thrust/thrust.cu
index b361323231..f534e1fc76 100644
--- a/src/runtime/extra/contrib/thrust/thrust.cu
+++ b/src/runtime/extra/contrib/thrust/thrust.cu
@@ -33,6 +33,7 @@
 #include <thrust/sort.h>
 #include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/logging.h>
@@ -41,8 +42,6 @@
 #include <functional>
 #include <memory>
 #include <vector>
-
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 namespace tvm {
 namespace contrib {
 
@@ -95,7 +94,7 @@ class WorkspaceMemoryResource : public 
thrust::mr::memory_resource<void*> {
 
 auto get_thrust_exec_policy(WorkspaceMemoryResource* memory_resouce) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, 
device_id));
   return thrust::cuda::par_nosync(memory_resouce).on(stream);
 }
diff --git a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc 
b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
index 109ce6ed37..426557b7b7 100644
--- a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
+++ b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
@@ -18,13 +18,13 @@
  */
 
 #include <cuda_runtime.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/cuda_ipc_memory.h>
 #include <tvm/runtime/memory/memory_manager.h>
 
 #include "../../../../../3rdparty/tensorrt_llm/custom_allreduce_kernels.h"
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../memory/pooled_allocator.h"
 #include "../nccl/nccl_context.h"
 
@@ -45,20 +45,22 @@ using tvm::runtime::memory::Buffer;
 std::vector<cudaIpcMemHandle_t> 
AllGatherIPCHandles(nccl::CCLThreadLocalContext* ctx,
                                                     cudaIpcMemHandle_t 
local_handle) {
   void *d_src, *d_dst;
-  CUDA_CALL(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE));
-  CUDA_CALL(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * 
ctx->worker->num_workers));
-  CUDA_CALL(cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, 
cudaMemcpyHostToDevice));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * 
ctx->worker->num_workers));
+  TVM_FFI_CHECK_CUDA_ERROR(
+      cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, 
cudaMemcpyHostToDevice));
   NCCL_CALL(ncclAllGather(d_src, d_dst, CUDA_IPC_HANDLE_SIZE, ncclChar, 
ctx->global_comm,
                           /*stream=*/nullptr));
   std::vector<char> serial_handles(CUDA_IPC_HANDLE_SIZE * 
ctx->worker->num_workers, 0);
-  CUDA_CALL(cudaMemcpy(serial_handles.data(), d_dst,
-                       CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, 
cudaMemcpyDefault));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(serial_handles.data(), d_dst,
+                                      CUDA_IPC_HANDLE_SIZE * 
ctx->worker->num_workers,
+                                      cudaMemcpyDefault));
   std::vector<cudaIpcMemHandle_t> handles(ctx->worker->num_workers);
   for (int i = 0; i < ctx->worker->num_workers; ++i) {
     memcpy(handles[i].reserved, &serial_handles[i * CUDA_IPC_HANDLE_SIZE], 
CUDA_IPC_HANDLE_SIZE);
   }
-  CUDA_CALL(cudaFree(d_src));
-  CUDA_CALL(cudaFree(d_dst));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_src));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_dst));
   return handles;
 }
 
@@ -115,7 +117,7 @@ class CUDAIPCMemoryAllocator final : public 
memory::PooledAllocator {
 
   void DeviceFreeDataSpace(Device dev, void* ptr) final {
     TVM_FFI_ICHECK(dev.device_type == kDLCUDA);
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get();
     auto it = ipc_memory_map_.find(ptr);
     TVM_FFI_ICHECK(it != ipc_memory_map_.end());
@@ -146,20 +148,20 @@ class CUDAIPCMemoryAllocator final : public 
memory::PooledAllocator {
     // Alloc local buffer
     TVM_FFI_ICHECK(dev.device_type == kDLCUDA);
     void* ptr;
-    CUDA_CALL(cudaSetDevice(dev.device_id));
-    CUDA_CALL(cudaMalloc(&ptr, size));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ptr, size));
     // Reset allocated memory to zero when required.
     // We explicitly synchronize after memset, to make sure memset finishes
     // before using all-gather to exchange IPC handles.
     // This is important to ensure the memory reset get ordered
     // before any other peers read the memory.
     if (reset_memory_to_zero) {
-      CUDA_CALL(cudaMemset(ptr, 0, size));
-      CUDA_CALL(cudaDeviceSynchronize());
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemset(ptr, 0, size));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
     // Create ipc handle
     cudaIpcMemHandle_t local_handle;
-    CUDA_CALL(cudaIpcGetMemHandle(&local_handle, ptr));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaIpcGetMemHandle(&local_handle, ptr));
     // All-gather IPC handles.
     nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get();
     std::vector<cudaIpcMemHandle_t> handles = AllGatherIPCHandles(ctx, 
local_handle);
@@ -170,8 +172,9 @@ class CUDAIPCMemoryAllocator final : public 
memory::PooledAllocator {
         comm_ptrs[node_id] = ptr;
       } else {
         uint8_t* foreign_buffer;
-        
CUDA_CALL(cudaIpcOpenMemHandle(reinterpret_cast<void**>(&foreign_buffer), 
handles[node_id],
-                                       cudaIpcMemLazyEnablePeerAccess));
+        
TVM_FFI_CHECK_CUDA_ERROR(cudaIpcOpenMemHandle(reinterpret_cast<void**>(&foreign_buffer),
+                                                      handles[node_id],
+                                                      
cudaIpcMemLazyEnablePeerAccess));
         comm_ptrs[node_id] = foreign_buffer;
       }
     }
@@ -183,10 +186,10 @@ class CUDAIPCMemoryAllocator final : public 
memory::PooledAllocator {
     for (int i = 0; i < static_cast<int>(comm_ptrs.size()); ++i) {
       if (i != worker_id) {
         // Free ipc handle.
-        CUDA_CALL(cudaIpcCloseMemHandle(comm_ptrs[i]));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaIpcCloseMemHandle(comm_ptrs[i]));
       } else {
         // Free local buffer.
-        CUDA_CALL(cudaFree(comm_ptrs[i]));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaFree(comm_ptrs[i]));
       }
     }
   }
diff --git a/src/runtime/extra/disco/nccl/nccl_context.h 
b/src/runtime/extra/disco/nccl/nccl_context.h
index 3747fec434..7a99be0897 100644
--- a/src/runtime/extra/disco/nccl/nccl_context.h
+++ b/src/runtime/extra/disco/nccl/nccl_context.h
@@ -35,8 +35,7 @@
 #endif
 #if TVM_NCCL_RCCL_SWITCH == 0
 #include <nccl.h>
-
-#include "../../../../backend/cuda/runtime/cuda_common.h"
+#include <tvm/ffi/extra/cuda/base.h>
 #else
 #include <rccl/rccl.h>
 
@@ -62,10 +61,16 @@ namespace nccl {
 
 using deviceStream_t = cudaStream_t;
 const constexpr DLDeviceType TVM_DISCO_DEVICE_TYPE = DLDeviceType::kDLCUDA;
-inline void SetDevice(int device_id) { CUDA_CALL(cudaSetDevice(device_id)); }
-inline void StreamSynchronize(deviceStream_t stream) { 
CUDA_CALL(cudaStreamSynchronize(stream)); }
-inline void StreamCreate(deviceStream_t* stream) { 
CUDA_CALL(cudaStreamCreate(stream)); }
-inline void StreamDestroy(deviceStream_t stream) { 
CUDA_CALL(cudaStreamDestroy(stream)); }
+inline void SetDevice(int device_id) { 
TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(device_id)); }
+inline void StreamSynchronize(deviceStream_t stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+}
+inline void StreamCreate(deviceStream_t* stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(stream));
+}
+inline void StreamDestroy(deviceStream_t stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
+}
 
 #else
 
diff --git a/src/runtime/vm/cuda/cuda_graph_builtin.cc 
b/src/runtime/vm/cuda/cuda_graph_builtin.cc
index 58348fb695..b51b287d81 100644
--- a/src/runtime/vm/cuda/cuda_graph_builtin.cc
+++ b/src/runtime/vm/cuda/cuda_graph_builtin.cc
@@ -24,11 +24,11 @@
 
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/vm/vm.h>
 
-#include "../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../support/utils.h"
 namespace tvm {
 namespace runtime {
@@ -85,7 +85,7 @@ struct CUDAGraphCapturedState {
 
   ~CUDAGraphCapturedState() {
     if (exec) {
-      CUDA_CALL(cudaGraphExecDestroy(exec));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGraphExecDestroy(exec));
     }
   }
 
@@ -100,7 +100,7 @@ struct CUDAGraphCapturedState {
 
 class ScopedCUDAStream {
  public:
-  ScopedCUDAStream() { CUDA_CALL(cudaStreamCreate(&stream_)); }
+  ScopedCUDAStream() { TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); }
   ~ScopedCUDAStream() { cudaStreamDestroy(stream_); }
   ScopedCUDAStream(const ScopedCUDAStream&) = delete;
   ScopedCUDAStream(ScopedCUDAStream&&) = delete;
@@ -116,11 +116,11 @@ class ScopedCUDAStream {
 class CUDACaptureStream {
  public:
   explicit CUDACaptureStream(cudaGraph_t* graph) : output_graph_(graph) {
-    CUDA_CALL(cudaGetDevice(&device_id_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id_));
     TVM_FFI_CHECK_SAFE_CALL(
         TVMFFIEnvSetStream(kDLCUDA, device_id_, capture_stream_,
                            
reinterpret_cast<TVMFFIStreamHandle*>(&prev_default_stream_)));
-    CUDA_CALL(cudaStreamBeginCapture(capture_stream_, 
cudaStreamCaptureModeGlobal));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamBeginCapture(capture_stream_, 
cudaStreamCaptureModeGlobal));
   }
   ~CUDACaptureStream() noexcept(false) {
     cudaStreamEndCapture(capture_stream_, output_graph_);
@@ -158,8 +158,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode {
       // Launch CUDA graph
       const auto& [states, exec] = it->second;
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
-      CUDA_CALL(
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(
           cudaGraphLaunch(exec, 
static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id))));
       return states;
     }
@@ -190,8 +190,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode {
 
     CUDAGraphCapturedState entry;
     entry.states = capture_func_rv.cast<ffi::ObjectRef>();
-    CUDA_CALL(cudaGraphInstantiate(&entry.exec, graph, NULL, NULL, 0));
-    CUDA_CALL(cudaGraphDestroy(graph));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGraphInstantiate(&entry.exec, graph, NULL, 
NULL, 0));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
 
     ffi::ObjectRef states = entry.states;
 
diff --git a/tests/python/relax/test_vm_builtin.py 
b/tests/python/relax/test_vm_builtin.py
index f46d618104..677bf0b4a3 100644
--- a/tests/python/relax/test_vm_builtin.py
+++ b/tests/python/relax/test_vm_builtin.py
@@ -77,7 +77,7 @@ def test_alloc_tensor_raises_out_of_memory(target, dev):
     built = tvm.compile(Module, target=target)
     vm = relax.VirtualMachine(built, dev)
 
-    with pytest.raises(Exception, match="CUDA: out of memory"):
+    with pytest.raises(Exception, match="CUDA.*out of memory"):
         vm["main"]()

(tvm) branch main updated: [REFACTOR][CUDA] Phase out cuda_common.h (#19770)

Reply via email to