This is an automated email from the ASF dual-hosted git repository.

tlopex pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 52e45477de [DOC] Unify CUDA naming (#18797)
52e45477de is described below

commit 52e45477de6f3a639480c854b7e38cf0a4a3173c
Author: Ruslan Baratov <[email protected]>
AuthorDate: Thu Feb 19 21:04:00 2026 +0800

    [DOC] Unify CUDA naming (#18797)
    
    Fix CUDA naming in documentation and comments
    
    - Cuda -> CUDA
    - cuda -> CUDA
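    
    A minimal sketch (illustration only, not part of this change) of how any
    remaining lowercase "cuda" spellings in Python comments could be located,
    assuming it is run from the repository root:
    
        import pathlib
        import re
    
        # Flag comment lines that still spell the API name in lowercase; the
        # word-boundary pattern and the "#"-comment-only heuristic are
        # simplifying assumptions for illustration.
        pattern = re.compile(r"\bcuda\b")
        for path in pathlib.Path(".").rglob("*.py"):
            text = path.read_text(encoding="utf-8", errors="ignore")
            for lineno, line in enumerate(text.splitlines(), start=1):
                stripped = line.strip()
                if stripped.startswith("#") and pattern.search(stripped):
                    print(f"{path}:{lineno}: {stripped}")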
---
 cmake/config.cmake                                 |  2 +-
 cmake/modules/CUDA.cmake                           |  2 +-
 cmake/utils/FindCUDA.cmake                         |  2 +-
 docs/install/docker.rst                            |  2 +-
 include/tvm/s_tir/meta_schedule/postproc.h         |  2 +-
 include/tvm/tir/function.h                         |  2 +-
 python/tvm/contrib/cutlass/gemm_profiler.py        |  2 +-
 .../msc/framework/tensorrt/codegen/sources.py      |  2 +-
 python/tvm/contrib/nvcc.py                         | 22 +++++++++---------
 python/tvm/contrib/xcode.py                        |  4 ++--
 python/tvm/runtime/device.py                       | 26 +++++++++++-----------
 python/tvm/runtime/executable.py                   |  2 +-
 .../schedule/cuda/layout_transform.py              |  2 +-
 python/tvm/testing/plugin.py                       |  2 +-
 python/tvm/testing/utils.py                        |  4 ++--
 python/tvm/topi/gpu/sort.py                        |  2 +-
 src/runtime/contrib/cublas/cublas_utils.h          |  2 +-
 src/runtime/contrib/nvshmem/memory_allocator.cc    |  2 +-
 src/runtime/contrib/papi/papi.cc                   |  2 +-
 src/runtime/cuda/cuda_module.h                     |  4 ++--
 .../postproc/rewrite_unbound_block.cc              |  2 +-
 src/s_tir/meta_schedule/schedule_rule/auto_bind.cc |  2 +-
 src/target/opt/build_cuda_off.cc                   |  2 +-
 src/target/opt/build_cuda_on.cc                    |  4 ++--
 src/target/source/codegen_cuda.cc                  |  2 +-
 src/target/source/codegen_cuda.h                   |  2 +-
 src/target/source/literal/cuda_half_t.h            |  2 +-
 src/target/source/literal/cuda_int8_t.h            |  2 +-
 tests/python/contrib/test_msc/test_runner.py       |  4 ++--
 tests/python/relax/test_relax_operators.py         |  2 +-
 .../test_tir_transform_device_kernel_launch.py     |  4 ++--
 tests/scripts/task_show_node_info.sh               |  2 +-
 32 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index fca79ee2f1..ed6910b786 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -45,7 +45,7 @@
 # Possible values:
 # - ON: enable CUDA with cmake's auto search
 # - OFF: disable CUDA
-# - /path/to/cuda: use specific path to cuda toolkit
+# - /path/to/cuda: use specific path to CUDA toolkit
 set(USE_CUDA OFF)
 
 # Whether to enable NCCL support:
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 3cd07a316f..61caf9c2fd 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -19,7 +19,7 @@
 find_cuda(${USE_CUDA} ${USE_CUDNN})
 
 if(CUDA_FOUND)
-  # always set the includedir when cuda is available
+  # always set the includedir when CUDA is available
   # avoid global retrigger of cmake
   include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
 endif(CUDA_FOUND)
diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake
index c62506cf41..d6732fc2cf 100644
--- a/cmake/utils/FindCUDA.cmake
+++ b/cmake/utils/FindCUDA.cmake
@@ -22,7 +22,7 @@
 #   find_cuda(${USE_CUDA} ${USE_CUDNN})
 #
 # - When USE_CUDA=ON, use auto search
-# - When USE_CUDA=/path/to/cuda-path, use the cuda path
+# - When USE_CUDA=/path/to/cuda-path, use the CUDA path
 # - When USE_CUDNN=ON, use auto search
 # - When USE_CUDNN=/path/to/cudnn-path, use the cudnn path
 #
diff --git a/docs/install/docker.rst b/docs/install/docker.rst
index 2557bf290e..4c68b92b9c 100644
--- a/docs/install/docker.rst
+++ b/docs/install/docker.rst
@@ -22,7 +22,7 @@ Docker Images
 We provide docker utility scripts to help developers to setup development environment.
 They are also helpful run through TVM demo and tutorials.
 We need `docker <https://docs.docker.com/engine/installation/>`_ and
-`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use cuda.
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use CUDA.
 
 Get a tvm source distribution or clone the GitHub repo to get the auxiliary scripts
 
diff --git a/include/tvm/s_tir/meta_schedule/postproc.h b/include/tvm/s_tir/meta_schedule/postproc.h
index 43612b0509..abd4a39bbb 100644
--- a/include/tvm/s_tir/meta_schedule/postproc.h
+++ b/include/tvm/s_tir/meta_schedule/postproc.h
@@ -137,7 +137,7 @@ class Postproc : public runtime::ObjectRef {
   TVM_DLL static Postproc RewriteReductionBlock();
   /*!
    * \brief Create a postprocessor that adds thread binding to unbound blocks
-   * \param max_threadblocks The max number of threadblocks in the cuda device.
+   * \param max_threadblocks The max number of threadblocks in the CUDA device.
    * \return The postprocessor created.
    */
   TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblocks);
diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h
index 956254bbeb..97dfbb1330 100644
--- a/include/tvm/tir/function.h
+++ b/include/tvm/tir/function.h
@@ -297,7 +297,7 @@ namespace attr {
  *   The size of the shared memory that may be allocated internally by
  *   the kernel.  For example, exposed as the
  *   CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute in
- *   cuda.
+ *   CUDA.
  *
  *   Defined as "tir.use_dyn_shared_memory".
  *
diff --git a/python/tvm/contrib/cutlass/gemm_profiler.py b/python/tvm/contrib/cutlass/gemm_profiler.py
index e89e7defbf..992f941c3c 100644
--- a/python/tvm/contrib/cutlass/gemm_profiler.py
+++ b/python/tvm/contrib/cutlass/gemm_profiler.py
@@ -48,7 +48,7 @@ class GemmProfilerEmitter(object):
   {                                                                     \\
     cudaError_t error = status;                                         \\
     if (error != cudaSuccess) {                                         \\
-      std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \\
+      std::cerr << "Got bad CUDA status: " << cudaGetErrorString(error) \\
                 << " at line: " << __LINE__ << std::endl;               \\
       exit(EXIT_FAILURE);                                               \\
     }                                                                   \\
diff --git a/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py b/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
index a5df42f78b..cbf84eb4c5 100644
--- a/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
+++ b/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
@@ -64,7 +64,7 @@ using namespace nvinfer1;
   do {                                                   \\
     auto ret = (status);                                 \\
     if (ret != 0) {                                      \\
-      std::cout << "Cuda failure: " << ret << std::endl; \\
+      std::cout << "CUDA failure: " << ret << std::endl; \\
       abort();                                           \\
     }                                                    \\
   } while (0)
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index e608bc2810..a869c3834a 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -38,18 +38,18 @@ from . import utils
 def compile_cuda(
     code, target_format=None, arch=None, options=None, path_target=None, compiler="nvcc"
 ):
-    """Compile cuda code with NVCC or NVRTC.
+    """Compile CUDA code with NVCC or NVRTC.
 
     Parameters
     ----------
     code : str
-        The cuda code.
+        The CUDA code.
 
     target_format : str
         The target format of the compiler ("ptx", "cubin", or "fatbin").
 
     arch : str
-        The cuda architecture.
+        The CUDA architecture.
 
     options : str or list of str
         The additional options.
@@ -78,7 +78,7 @@ def compile_cuda(
     elif compiler == "nvrtc":
         result = _compile_cuda_nvrtc(code, target_format, arch, options, path_target, use_nvshmem)
     else:
-        raise ValueError(f"cuda compiler must be 'nvcc' or 'nvrtc', got: {compiler}")
+        raise ValueError(f"CUDA compiler must be 'nvcc' or 'nvrtc', got: {compiler}")
 
     return result
 
@@ -623,12 +623,12 @@ def _link_nvshmem_nvrtc(binary_buf, nvshmem_lib_path):
 
 
 def find_cuda_path():
-    """Utility function to find cuda path
+    """Utility function to find CUDA path
 
     Returns
     -------
     path : str
-        Path to cuda root.
+        Path to CUDA root.
     """
     if "CUDA_PATH" in os.environ:
         return os.environ["CUDA_PATH"]
@@ -641,23 +641,23 @@ def find_cuda_path():
     cuda_path = "/usr/local/cuda"
     if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
         return cuda_path
-    raise RuntimeError("Cannot find cuda path")
+    raise RuntimeError("Cannot find CUDA path")
 
 
 def get_cuda_version(cuda_path=None):
-    """Utility function to get cuda version
+    """Utility function to get CUDA version
 
     Parameters
     ----------
     cuda_path : Optional[str]
 
-        Path to cuda root.  If None is passed, will use
+        Path to CUDA root.  If None is passed, will use
         `find_cuda_path()` as default.
 
     Returns
     -------
     version : float
-        The cuda version
+        The CUDA version
 
     """
     if cuda_path is None:
@@ -683,7 +683,7 @@ def get_cuda_version(cuda_path=None):
         release_fields = [s.strip() for s in release_line.split(",")]
         version_str = [f[1:] for f in release_fields if f.startswith("V")][0]
         return tuple(int(field) for field in version_str.split("."))
-    raise RuntimeError("Cannot read cuda version file")
+    raise RuntimeError("Cannot read CUDA version file")
 
 
 def find_nvshmem_paths() -> Tuple[str, str]:
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index adfc2dcd84..bd0be14fb3 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -107,12 +107,12 @@ create_dylib.output_format = "dylib"
 
 
 def compile_metal(code, path_target=None, sdk="macosx", min_os_version=None):
-    """Compile metal with CLI tool from env.
+    """Compile Metal with CLI tool from env.
 
     Parameters
     ----------
     code : str
-        The cuda code.
+        The Metal code.
 
     path_target : str, optional
         Output file.
diff --git a/python/tvm/runtime/device.py b/python/tvm/runtime/device.py
index b8a3db15f3..768ebf63a7 100644
--- a/python/tvm/runtime/device.py
+++ b/python/tvm/runtime/device.py
@@ -40,7 +40,7 @@ class Device(tvm_ffi.core.Device):
 
         Returns True if TVM has support for the device, if the
         physical device is present, and the device is accessible
-        through appropriate drivers (e.g. cuda/vulkan).
+        through appropriate drivers (e.g. CUDA/Vulkan).
 
         Returns
         -------
@@ -54,7 +54,7 @@ class Device(tvm_ffi.core.Device):
     def max_threads_per_block(self):
         """Maximum number of threads on each block.
 
-        Returns device value for cuda, metal, rocm, opencl, and vulkan
+        Returns device value for CUDA, Metal, ROCm, OpenCL, and Vulkan
         devices.  Returns remote device value for RPC devices.
         Returns None for all other devices.
 
@@ -70,8 +70,8 @@ class Device(tvm_ffi.core.Device):
     def warp_size(self):
         """Number of threads that execute concurrently.
 
-        Returns device value for cuda, rocm, and vulkan.  Returns
-        1 for metal and opencl devices, regardless of the physical
+        Returns device value for CUDA, ROCm, and Vulkan.  Returns
+        1 for Metal and OpenCL devices, regardless of the physical
         device.  Returns remote device value for RPC devices.  Returns
         None for all other devices.
 
@@ -87,7 +87,7 @@ class Device(tvm_ffi.core.Device):
     def max_shared_memory_per_block(self):
         """Total amount of shared memory per block in bytes.
 
-        Returns device value for cuda, rocm, opencl, and vulkan.
+        Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
         Returns remote device value for RPC devices.  Returns None for
         all other devices.
 
@@ -106,8 +106,8 @@ class Device(tvm_ffi.core.Device):
         Returns maximum API version (e.g. CUDA/OpenCL/Vulkan)
         supported by the device.
 
-        Returns device value for cuda, rocm, opencl, and
-        vulkan. Returns remote device value for RPC devices.  Returns
+        Returns device value for CUDA, ROCm, OpenCL, and
+        Vulkan. Returns remote device value for RPC devices.  Returns
         None for all other devices.
 
         Returns
@@ -122,7 +122,7 @@ class Device(tvm_ffi.core.Device):
     def device_name(self):
         """Return the vendor-specific name of device.
 
-        Returns device value for cuda, rocm, opencl, and vulkan.
+        Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
         Returns remote device value for RPC devices.  Returns None for
         all other devices.
 
@@ -138,7 +138,7 @@ class Device(tvm_ffi.core.Device):
     def max_clock_rate(self):
         """Return the max clock frequency of device (kHz).
 
-        Returns device value for cuda, rocm, and opencl.  Returns
+        Returns device value for CUDA, ROCm, and OpenCL.  Returns
         remote device value for RPC devices.  Returns None for all
         other devices.
 
@@ -154,7 +154,7 @@ class Device(tvm_ffi.core.Device):
     def multi_processor_count(self):
         """Return the number of compute units in the device.
 
-        Returns device value for cuda, rocm, and opencl.  Returns
+        Returns device value for CUDA, ROCm, and OpenCL.  Returns
         remote device value for RPC devices.  Returns None for all
         other devices.
 
@@ -170,7 +170,7 @@ class Device(tvm_ffi.core.Device):
     def max_thread_dimensions(self):
         """Return the maximum size of each thread axis
 
-        Returns device value for cuda, rocm, opencl, and vulkan.
+        Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
         Returns remote device value for RPC devices.  Returns None for
         all other devices.
 
@@ -186,10 +186,10 @@ class Device(tvm_ffi.core.Device):
     def api_version(self):
         """Returns version number of the SDK used to compile TVM.
 
-        For example, CUDA_VERSION for cuda or VK_HEADER_VERSION for
+        For example, CUDA_VERSION for CUDA or VK_HEADER_VERSION for
         Vulkan.
 
-        Returns device value for cuda, rocm, opencl, and vulkan.
+        Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
         Returns remote device value for RPC devices.  Returns None for
         all other devices.
 
diff --git a/python/tvm/runtime/executable.py b/python/tvm/runtime/executable.py
index a57c1b6231..c8c588eb44 100644
--- a/python/tvm/runtime/executable.py
+++ b/python/tvm/runtime/executable.py
@@ -52,7 +52,7 @@ class Executable:
         """Just-in-time compile and link the modules.
 
         The Executable returned by tvm.compile may not be directly
-        runnable as they may contain cuda source files and objects that
+        runnable as they may contain CUDA source files and objects that
         are yet to be compiled and linked.
         This function helps to create a runtime.Module for these cases.
 
diff --git a/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
index 93e0bec754..4d759d3a21 100644
--- a/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
+++ b/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""layout_transform scheduling rule for cuda."""
+"""layout_transform scheduling rule for CUDA."""
 
 import math
 from collections import deque
diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py
index a2995b9489..281f4bb946 100644
--- a/python/tvm/testing/plugin.py
+++ b/python/tvm/testing/plugin.py
@@ -49,7 +49,7 @@ except ImportError:
 MARKERS = {
     "gpu": "mark a test as requiring a gpu",
     "tensorcore": "mark a test as requiring a tensorcore",
-    "cuda": "mark a test as requiring cuda",
+    "cuda": "mark a test as requiring CUDA",
     "opencl": "mark a test as requiring opencl",
     "rocm": "mark a test as requiring rocm",
     "vulkan": "mark a test as requiring vulkan",
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 9648fca1b0..92e108b72a 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1197,7 +1197,7 @@ def requires_nvcc_version(major_version, minor_version=0, release_version=0):
     installed version of NVCC is at least `(major_version,
     minor_version, release_version)`.
 
-    This also marks the test as requiring a cuda support.
+    This also marks the test as requiring a CUDA support.
 
     Parameters
     ----------
@@ -1240,7 +1240,7 @@ def requires_cuda_compute_version(major_version, minor_version=0):
     compute architecture of the GPU is at least `(major_version,
     minor_version)`.
 
-    This also marks the test as requiring a cuda support.
+    This also marks the test as requiring a CUDA support.
 
     Parameters
     ----------
diff --git a/python/tvm/topi/gpu/sort.py b/python/tvm/topi/gpu/sort.py
index 41e0c9cda7..b4c3f43413 100644
--- a/python/tvm/topi/gpu/sort.py
+++ b/python/tvm/topi/gpu/sort.py
@@ -579,7 +579,7 @@ def _sort_common(
 
     with T.serial(0, cast(upper_lim - lower_lim, target_dtype)) as l2_width:
         width = 2 << (l2_width + lower_lim)
-        # Define and launch the cuda kernel
+        # Define and launch the CUDA kernel
         target = tvm.target.Target.current()
         if "vulkan" in str(target):
             ntx = max_threads
diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h
index 12260a78ef..9c99a83250 100644
--- a/src/runtime/contrib/cublas/cublas_utils.h
+++ b/src/runtime/contrib/cublas/cublas_utils.h
@@ -122,7 +122,7 @@ inline cudaDataType_t GetCudaDataType(DLDataType type) {
         return CUDA_R_16BF;
     }
   }
-  LOG(FATAL) << "Unsupported cuda type";
+  LOG(FATAL) << "Unsupported CUDA type";
 }
 
 /*! \brief Execute matrix multiply followed by the specified epilogue, using cuBLASLt. */
diff --git a/src/runtime/contrib/nvshmem/memory_allocator.cc b/src/runtime/contrib/nvshmem/memory_allocator.cc
index 5893d04ac3..c53935f8bc 100644
--- a/src/runtime/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/contrib/nvshmem/memory_allocator.cc
@@ -76,7 +76,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
   void* DeviceAllocDataSpace(Device dev, size_t size, size_t alignment,
                              DLDataType type_hint) final {
     ICHECK_EQ(dev.device_type, DLDeviceType::kDLCUDA)
-        << "nvshmem can only allocate cuda device memory space.";
+        << "nvshmem can only allocate CUDA device memory space.";
     ICHECK(type_hint.code == DLDataTypeCode::kDLInt || type_hint.code == DLDataTypeCode::kDLUInt ||
            type_hint.code == DLDataTypeCode::kDLFloat)
         << "nvshmem can only allocate tensor with int, usingned int or float data types.";
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 91af80de37..917fe1930e 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -88,7 +88,7 @@ int component_for_device(Device dev) {
 /*! \brief MetricCollectorNode for PAPI metrics.
  *
  * PAPI (Performance Application Programming Interface) collects metrics on a
- * variety of platforms including cpu, cuda and rocm.
+ * variety of platforms including CPU, CUDA and ROCm.
  *
  * PAPI is avaliable at https://github.com/icl-utk-edu/papi.
  */
diff --git a/src/runtime/cuda/cuda_module.h b/src/runtime/cuda/cuda_module.h
index 935758f062..58776dd7e5 100644
--- a/src/runtime/cuda/cuda_module.h
+++ b/src/runtime/cuda/cuda_module.h
@@ -39,12 +39,12 @@ namespace runtime {
 static constexpr const int kMaxNumGPUs = 32;
 
 /*!
- * \brief create a cuda module from data.
+ * \brief create a CUDA module from data.
  *
  * \param data The module data, can be ptx, cubin
  * \param fmt The format of the data, can be "ptx", "cubin"
  * \param fmap The map function information map of each function.
- * \param cuda_source Optional, cuda source file
+ * \param cuda_source Optional, CUDA source file
  */
 ffi::Module CUDAModuleCreate(std::string data, std::string fmt,
                              ffi::Map<ffi::String, FunctionInfo> fmap, std::string cuda_source);
diff --git a/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc b/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
index 266d9ae641..578c3ad5ca 100644
--- a/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
+++ b/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
@@ -109,7 +109,7 @@ class RewriteUnboundBlockNode : public PostprocNode {
  public:
   /*! \brief The max number of threads per block from Target */
   int max_threads_per_block_ = -1;
-  /*! \brief The max number of threadblocks in the cuda device */
+  /*! \brief The max number of threadblocks in the CUDA device */
   int max_threadblocks_ = -1;
 
   static void RegisterReflection() {
diff --git a/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc b/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
index d645200572..7b650643ec 100644
--- a/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
@@ -53,7 +53,7 @@ class AutoBindNode : public ScheduleRuleNode {
  public:
   /*! \brief The max number of threads per block from Target */
   int64_t max_threads_per_block_ = -1;
-  /*! \brief The max number of threadblocks in the cuda device */
+  /*! \brief The max number of threadblocks in the CUDA device */
   int64_t max_threadblocks_ = -1;
   /*! \brief thread_extents Candidates of thread axis extent. */
   ffi::Array<Integer> thread_extents_;
diff --git a/src/target/opt/build_cuda_off.cc b/src/target/opt/build_cuda_off.cc
index f35f69a1a2..339d07fd73 100644
--- a/src/target/opt/build_cuda_off.cc
+++ b/src/target/opt/build_cuda_off.cc
@@ -18,7 +18,7 @@
  */
 
 /*!
- *  Optional module when build cuda is switched to off
+ *  Optional module when build CUDA is switched to off
  */
 #include "../../runtime/cuda/cuda_module.h"
 namespace tvm {
diff --git a/src/target/opt/build_cuda_on.cc b/src/target/opt/build_cuda_on.cc
index 88960594d0..8cc1472172 100644
--- a/src/target/opt/build_cuda_on.cc
+++ b/src/target/opt/build_cuda_on.cc
@@ -18,8 +18,8 @@
  */
 
 /*!
- *  Build cuda modules from source.
- *  requires cuda to be available.
+ *  Build CUDA modules from source.
+ *  requires CUDA to be available.
  *
  * \file build_cuda.cc
  */
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 650c3e3d96..32f0907ee2 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -149,7 +149,7 @@ void CodeGenCUDA::PrintFunctionSignature(const ffi::String& function_name, const
   } else if (calling_conv == CallingConv::kDefault) {
     os << "extern \"C\" __device__ ";
   } else {
-    LOG(FATAL) << "Unsupported calling convention for cuda codegen: " << calling_conv;
+    LOG(FATAL) << "Unsupported calling convention for CUDA codegen: " << calling_conv;
   }
   CodeGenC::PrintFunctionSignature(function_name, func, os);
 }
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
index 02fc0603a5..4ff20f0c91 100644
--- a/src/target/source/codegen_cuda.h
+++ b/src/target/source/codegen_cuda.h
@@ -19,7 +19,7 @@
 
 /*!
  * \file codegen_cuda.h
- * \brief Utility to generate cuda code
+ * \brief Utility to generate CUDA code
  */
 #ifndef TVM_TARGET_SOURCE_CODEGEN_CUDA_H_
 #define TVM_TARGET_SOURCE_CODEGEN_CUDA_H_
diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h
index 682845e9e7..78ee0298be 100644
--- a/src/target/source/literal/cuda_half_t.h
+++ b/src/target/source/literal/cuda_half_t.h
@@ -19,7 +19,7 @@
 
 /*!
  * \file cuda_half_t.h
- * \brief half_t (fp16) definition for cuda codegen.
+ * \brief half_t (fp16) definition for CUDA codegen.
  */
 #ifndef TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_
 #define TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_
diff --git a/src/target/source/literal/cuda_int8_t.h b/src/target/source/literal/cuda_int8_t.h
index ce166ea8f3..4d9a260979 100644
--- a/src/target/source/literal/cuda_int8_t.h
+++ b/src/target/source/literal/cuda_int8_t.h
@@ -19,7 +19,7 @@
 
 /*!
  * \file cuda_int8_t.h
- * \brief Extra int8 intrisic for cuda codegen.
+ * \brief Extra int8 intrisic for CUDA codegen.
  */
 #ifndef TVM_TARGET_SOURCE_LITERAL_CUDA_INT8_T_H_
 #define TVM_TARGET_SOURCE_LITERAL_CUDA_INT8_T_H_
diff --git a/tests/python/contrib/test_msc/test_runner.py b/tests/python/contrib/test_msc/test_runner.py
index 14c872bedd..be3140f72a 100644
--- a/tests/python/contrib/test_msc/test_runner.py
+++ b/tests/python/contrib/test_msc/test_runner.py
@@ -91,7 +91,7 @@ def test_tvm_runner_cpu(training):
 @tvm.testing.requires_cuda
 @pytest.mark.parametrize("training", [True, False])
 def test_tvm_runner_cuda(training):
-    """Test runner for tvm on cuda"""
+    """Test runner for tvm on CUDA"""
 
     _test_from_torch(TVMRunner, "cuda", training=training)
 
@@ -106,7 +106,7 @@ def test_torch_runner_cpu(training):
 @tvm.testing.requires_cuda
 @pytest.mark.parametrize("training", [True, False])
 def test_torch_runner_cuda(training):
-    """Test runner for torch on cuda"""
+    """Test runner for torch on CUDA"""
 
     _test_from_torch(TorchRunner, "cuda", training=training, atol=1e-1, rtol=1e-1)
 
diff --git a/tests/python/relax/test_relax_operators.py b/tests/python/relax/test_relax_operators.py
index 897082dd79..c0417ccaab 100644
--- a/tests/python/relax/test_relax_operators.py
+++ b/tests/python/relax/test_relax_operators.py
@@ -54,7 +54,7 @@ def run_cpu(mod, func_name, *args, exec_mode):
 
 
 def test_unique(exec_mode):
-    # TODO(prakalp): also add test for compiling and running on cuda device.
+    # TODO(prakalp): also add test for compiling and running on CUDA device.
     data_numpy = np.random.randint(0, 16, (16, 16))
     data = tvm.runtime.tensor(data_numpy)
     result, result_sorted = run_cpu(InputModule, "foo", data, exec_mode=exec_mode)
diff --git a/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py b/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
index 32ec677167..57d2bca3b4 100644
--- a/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
+++ b/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
@@ -26,7 +26,7 @@ def test_lower_device_kernel_launch():
     The "tir.kernel_launch_params" determines which parameters belong
     to the runtime, and which below to the device-side PrimFunc.
     Parameters that are required prior to launching a kernel (e.g. the
-    number of Cuda threads to use) are stored in the
+    number of CUDA threads to use) are stored in the
     `"tir.kernel_launch_params"` attribute, and are used by the
     runtime prior in order to launch the generated kernel.
     """
@@ -126,7 +126,7 @@ def test_collect_launch_parameter():
     The "tir.kernel_launch_params" determines which parameters belong
     to the runtime, and which below to the device-side PrimFunc.
     Parameters that are required prior to launching a kernel (e.g. the
-    number of Cuda threads to use) are stored in the
+    number of CUDA threads to use) are stored in the
     `"tir.kernel_launch_params"` attribute, and are used by the
     runtime prior in order to launch the generated kernel.
     """
diff --git a/tests/scripts/task_show_node_info.sh b/tests/scripts/task_show_node_info.sh
index 8569a1e0d9..af095ed453 100755
--- a/tests/scripts/task_show_node_info.sh
+++ b/tests/scripts/task_show_node_info.sh
@@ -41,4 +41,4 @@ echo "===== RUNNER INFO ====="
 df --human-readable
 lscpu
 free
-nvidia-smi 2>/dev/null || echo "cuda not found"
+nvidia-smi 2>/dev/null || echo "CUDA not found"
