This is an automated email from the ASF dual-hosted git repository.
tlopex pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 52e45477de [DOC] Unify CUDA naming (#18797)
52e45477de is described below
commit 52e45477de6f3a639480c854b7e38cf0a4a3173c
Author: Ruslan Baratov <[email protected]>
AuthorDate: Thu Feb 19 21:04:00 2026 +0800
[DOC] Unify CUDA naming (#18797)
Fix CUDA naming in documentation and comments
- Cuda -> CUDA
- cuda -> CUDA
---
cmake/config.cmake | 2 +-
cmake/modules/CUDA.cmake | 2 +-
cmake/utils/FindCUDA.cmake | 2 +-
docs/install/docker.rst | 2 +-
include/tvm/s_tir/meta_schedule/postproc.h | 2 +-
include/tvm/tir/function.h | 2 +-
python/tvm/contrib/cutlass/gemm_profiler.py | 2 +-
.../msc/framework/tensorrt/codegen/sources.py | 2 +-
python/tvm/contrib/nvcc.py | 22 +++++++++---------
python/tvm/contrib/xcode.py | 4 ++--
python/tvm/runtime/device.py | 26 +++++++++++-----------
python/tvm/runtime/executable.py | 2 +-
.../schedule/cuda/layout_transform.py | 2 +-
python/tvm/testing/plugin.py | 2 +-
python/tvm/testing/utils.py | 4 ++--
python/tvm/topi/gpu/sort.py | 2 +-
src/runtime/contrib/cublas/cublas_utils.h | 2 +-
src/runtime/contrib/nvshmem/memory_allocator.cc | 2 +-
src/runtime/contrib/papi/papi.cc | 2 +-
src/runtime/cuda/cuda_module.h | 4 ++--
.../postproc/rewrite_unbound_block.cc | 2 +-
src/s_tir/meta_schedule/schedule_rule/auto_bind.cc | 2 +-
src/target/opt/build_cuda_off.cc | 2 +-
src/target/opt/build_cuda_on.cc | 4 ++--
src/target/source/codegen_cuda.cc | 2 +-
src/target/source/codegen_cuda.h | 2 +-
src/target/source/literal/cuda_half_t.h | 2 +-
src/target/source/literal/cuda_int8_t.h | 2 +-
tests/python/contrib/test_msc/test_runner.py | 4 ++--
tests/python/relax/test_relax_operators.py | 2 +-
.../test_tir_transform_device_kernel_launch.py | 4 ++--
tests/scripts/task_show_node_info.sh | 2 +-
32 files changed, 60 insertions(+), 60 deletions(-)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index fca79ee2f1..ed6910b786 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -45,7 +45,7 @@
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
-# - /path/to/cuda: use specific path to cuda toolkit
+# - /path/to/cuda: use specific path to CUDA toolkit
set(USE_CUDA OFF)
# Whether to enable NCCL support:
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 3cd07a316f..61caf9c2fd 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -19,7 +19,7 @@
find_cuda(${USE_CUDA} ${USE_CUDNN})
if(CUDA_FOUND)
- # always set the includedir when cuda is available
+ # always set the includedir when CUDA is available
# avoid global retrigger of cmake
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
endif(CUDA_FOUND)
diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake
index c62506cf41..d6732fc2cf 100644
--- a/cmake/utils/FindCUDA.cmake
+++ b/cmake/utils/FindCUDA.cmake
@@ -22,7 +22,7 @@
# find_cuda(${USE_CUDA} ${USE_CUDNN})
#
# - When USE_CUDA=ON, use auto search
-# - When USE_CUDA=/path/to/cuda-path, use the cuda path
+# - When USE_CUDA=/path/to/cuda-path, use the CUDA path
# - When USE_CUDNN=ON, use auto search
# - When USE_CUDNN=/path/to/cudnn-path, use the cudnn path
#
diff --git a/docs/install/docker.rst b/docs/install/docker.rst
index 2557bf290e..4c68b92b9c 100644
--- a/docs/install/docker.rst
+++ b/docs/install/docker.rst
@@ -22,7 +22,7 @@ Docker Images
We provide docker utility scripts to help developers set up the development
environment.
They are also helpful to run through TVM demos and tutorials.
We need `docker <https://docs.docker.com/engine/installation/>`_ and
-`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use cuda.
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use CUDA.
Get a tvm source distribution or clone the GitHub repo to get the auxiliary
scripts
diff --git a/include/tvm/s_tir/meta_schedule/postproc.h b/include/tvm/s_tir/meta_schedule/postproc.h
index 43612b0509..abd4a39bbb 100644
--- a/include/tvm/s_tir/meta_schedule/postproc.h
+++ b/include/tvm/s_tir/meta_schedule/postproc.h
@@ -137,7 +137,7 @@ class Postproc : public runtime::ObjectRef {
TVM_DLL static Postproc RewriteReductionBlock();
/*!
* \brief Create a postprocessor that adds thread binding to unbound blocks
- * \param max_threadblocks The max number of threadblocks in the cuda device.
+ * \param max_threadblocks The max number of threadblocks in the CUDA device.
* \return The postprocessor created.
*/
TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblocks);
diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h
index 956254bbeb..97dfbb1330 100644
--- a/include/tvm/tir/function.h
+++ b/include/tvm/tir/function.h
@@ -297,7 +297,7 @@ namespace attr {
* The size of the shared memory that may be allocated internally by
* the kernel. For example, exposed as the
* CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute in
- * cuda.
+ * CUDA.
*
* Defined as "tir.use_dyn_shared_memory".
*
diff --git a/python/tvm/contrib/cutlass/gemm_profiler.py b/python/tvm/contrib/cutlass/gemm_profiler.py
index e89e7defbf..992f941c3c 100644
--- a/python/tvm/contrib/cutlass/gemm_profiler.py
+++ b/python/tvm/contrib/cutlass/gemm_profiler.py
@@ -48,7 +48,7 @@ class GemmProfilerEmitter(object):
{ \\
cudaError_t error = status; \\
if (error != cudaSuccess) { \\
- std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \\
+ std::cerr << "Got bad CUDA status: " << cudaGetErrorString(error) \\
<< " at line: " << __LINE__ << std::endl; \\
exit(EXIT_FAILURE); \\
} \\
diff --git a/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py b/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
index a5df42f78b..cbf84eb4c5 100644
--- a/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
+++ b/python/tvm/contrib/msc/framework/tensorrt/codegen/sources.py
@@ -64,7 +64,7 @@ using namespace nvinfer1;
do { \\
auto ret = (status); \\
if (ret != 0) { \\
- std::cout << "Cuda failure: " << ret << std::endl; \\
+ std::cout << "CUDA failure: " << ret << std::endl; \\
abort(); \\
} \\
} while (0)
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index e608bc2810..a869c3834a 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -38,18 +38,18 @@ from . import utils
def compile_cuda(
code, target_format=None, arch=None, options=None, path_target=None, compiler="nvcc"
):
- """Compile cuda code with NVCC or NVRTC.
+ """Compile CUDA code with NVCC or NVRTC.
Parameters
----------
code : str
- The cuda code.
+ The CUDA code.
target_format : str
The target format of the compiler ("ptx", "cubin", or "fatbin").
arch : str
- The cuda architecture.
+ The CUDA architecture.
options : str or list of str
The additional options.
@@ -78,7 +78,7 @@ def compile_cuda(
elif compiler == "nvrtc":
result = _compile_cuda_nvrtc(code, target_format, arch, options, path_target, use_nvshmem)
else:
- raise ValueError(f"cuda compiler must be 'nvcc' or 'nvrtc', got: {compiler}")
+ raise ValueError(f"CUDA compiler must be 'nvcc' or 'nvrtc', got: {compiler}")
return result
@@ -623,12 +623,12 @@ def _link_nvshmem_nvrtc(binary_buf, nvshmem_lib_path):
def find_cuda_path():
- """Utility function to find cuda path
+ """Utility function to find CUDA path
Returns
-------
path : str
- Path to cuda root.
+ Path to CUDA root.
"""
if "CUDA_PATH" in os.environ:
return os.environ["CUDA_PATH"]
@@ -641,23 +641,23 @@ def find_cuda_path():
cuda_path = "/usr/local/cuda"
if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
return cuda_path
- raise RuntimeError("Cannot find cuda path")
+ raise RuntimeError("Cannot find CUDA path")
def get_cuda_version(cuda_path=None):
- """Utility function to get cuda version
+ """Utility function to get CUDA version
Parameters
----------
cuda_path : Optional[str]
- Path to cuda root. If None is passed, will use
+ Path to CUDA root. If None is passed, will use
`find_cuda_path()` as default.
Returns
-------
version : float
- The cuda version
+ The CUDA version
"""
if cuda_path is None:
@@ -683,7 +683,7 @@ def get_cuda_version(cuda_path=None):
release_fields = [s.strip() for s in release_line.split(",")]
version_str = [f[1:] for f in release_fields if f.startswith("V")][0]
return tuple(int(field) for field in version_str.split("."))
- raise RuntimeError("Cannot read cuda version file")
+ raise RuntimeError("Cannot read CUDA version file")
def find_nvshmem_paths() -> Tuple[str, str]:
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index adfc2dcd84..bd0be14fb3 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -107,12 +107,12 @@ create_dylib.output_format = "dylib"
def compile_metal(code, path_target=None, sdk="macosx", min_os_version=None):
- """Compile metal with CLI tool from env.
+ """Compile Metal with CLI tool from env.
Parameters
----------
code : str
- The cuda code.
+ The Metal code.
path_target : str, optional
Output file.
diff --git a/python/tvm/runtime/device.py b/python/tvm/runtime/device.py
index b8a3db15f3..768ebf63a7 100644
--- a/python/tvm/runtime/device.py
+++ b/python/tvm/runtime/device.py
@@ -40,7 +40,7 @@ class Device(tvm_ffi.core.Device):
Returns True if TVM has support for the device, if the
physical device is present, and the device is accessible
- through appropriate drivers (e.g. cuda/vulkan).
+ through appropriate drivers (e.g. CUDA/Vulkan).
Returns
-------
@@ -54,7 +54,7 @@ class Device(tvm_ffi.core.Device):
def max_threads_per_block(self):
"""Maximum number of threads on each block.
- Returns device value for cuda, metal, rocm, opencl, and vulkan
+ Returns device value for CUDA, Metal, ROCm, OpenCL, and Vulkan
devices. Returns remote device value for RPC devices.
Returns None for all other devices.
@@ -70,8 +70,8 @@ class Device(tvm_ffi.core.Device):
def warp_size(self):
"""Number of threads that execute concurrently.
- Returns device value for cuda, rocm, and vulkan. Returns
- 1 for metal and opencl devices, regardless of the physical
+ Returns device value for CUDA, ROCm, and Vulkan. Returns
+ 1 for Metal and OpenCL devices, regardless of the physical
device. Returns remote device value for RPC devices. Returns
None for all other devices.
@@ -87,7 +87,7 @@ class Device(tvm_ffi.core.Device):
def max_shared_memory_per_block(self):
"""Total amount of shared memory per block in bytes.
- Returns device value for cuda, rocm, opencl, and vulkan.
+ Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.
@@ -106,8 +106,8 @@ class Device(tvm_ffi.core.Device):
Returns maximum API version (e.g. CUDA/OpenCL/Vulkan)
supported by the device.
- Returns device value for cuda, rocm, opencl, and
- vulkan. Returns remote device value for RPC devices. Returns
+ Returns device value for CUDA, ROCm, OpenCL, and
+ Vulkan. Returns remote device value for RPC devices. Returns
None for all other devices.
Returns
@@ -122,7 +122,7 @@ class Device(tvm_ffi.core.Device):
def device_name(self):
"""Return the vendor-specific name of device.
- Returns device value for cuda, rocm, opencl, and vulkan.
+ Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.
@@ -138,7 +138,7 @@ class Device(tvm_ffi.core.Device):
def max_clock_rate(self):
"""Return the max clock frequency of device (kHz).
- Returns device value for cuda, rocm, and opencl. Returns
+ Returns device value for CUDA, ROCm, and OpenCL. Returns
remote device value for RPC devices. Returns None for all
other devices.
@@ -154,7 +154,7 @@ class Device(tvm_ffi.core.Device):
def multi_processor_count(self):
"""Return the number of compute units in the device.
- Returns device value for cuda, rocm, and opencl. Returns
+ Returns device value for CUDA, ROCm, and OpenCL. Returns
remote device value for RPC devices. Returns None for all
other devices.
@@ -170,7 +170,7 @@ class Device(tvm_ffi.core.Device):
def max_thread_dimensions(self):
"""Return the maximum size of each thread axis
- Returns device value for cuda, rocm, opencl, and vulkan.
+ Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.
@@ -186,10 +186,10 @@ class Device(tvm_ffi.core.Device):
def api_version(self):
"""Returns version number of the SDK used to compile TVM.
- For example, CUDA_VERSION for cuda or VK_HEADER_VERSION for
+ For example, CUDA_VERSION for CUDA or VK_HEADER_VERSION for
Vulkan.
- Returns device value for cuda, rocm, opencl, and vulkan.
+ Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.
diff --git a/python/tvm/runtime/executable.py b/python/tvm/runtime/executable.py
index a57c1b6231..c8c588eb44 100644
--- a/python/tvm/runtime/executable.py
+++ b/python/tvm/runtime/executable.py
@@ -52,7 +52,7 @@ class Executable:
"""Just-in-time compile and link the modules.
The Executable returned by tvm.compile may not be directly
- runnable as they may contain cuda source files and objects that
+ runnable as they may contain CUDA source files and objects that
are yet to be compiled and linked.
This function helps to create a runtime.Module for these cases.
diff --git a/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
index 93e0bec754..4d759d3a21 100644
--- a/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
+++ b/python/tvm/s_tir/meta_schedule/schedule/cuda/layout_transform.py
@@ -14,7 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""layout_transform scheduling rule for cuda."""
+"""layout_transform scheduling rule for CUDA."""
import math
from collections import deque
diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py
index a2995b9489..281f4bb946 100644
--- a/python/tvm/testing/plugin.py
+++ b/python/tvm/testing/plugin.py
@@ -49,7 +49,7 @@ except ImportError:
MARKERS = {
"gpu": "mark a test as requiring a gpu",
"tensorcore": "mark a test as requiring a tensorcore",
- "cuda": "mark a test as requiring cuda",
+ "cuda": "mark a test as requiring CUDA",
"opencl": "mark a test as requiring opencl",
"rocm": "mark a test as requiring rocm",
"vulkan": "mark a test as requiring vulkan",
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 9648fca1b0..92e108b72a 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1197,7 +1197,7 @@ def requires_nvcc_version(major_version, minor_version=0, release_version=0):
installed version of NVCC is at least `(major_version,
minor_version, release_version)`.
- This also marks the test as requiring a cuda support.
+ This also marks the test as requiring CUDA support.
Parameters
----------
@@ -1240,7 +1240,7 @@ def requires_cuda_compute_version(major_version, minor_version=0):
compute architecture of the GPU is at least `(major_version,
minor_version)`.
- This also marks the test as requiring a cuda support.
+ This also marks the test as requiring CUDA support.
Parameters
----------
diff --git a/python/tvm/topi/gpu/sort.py b/python/tvm/topi/gpu/sort.py
index 41e0c9cda7..b4c3f43413 100644
--- a/python/tvm/topi/gpu/sort.py
+++ b/python/tvm/topi/gpu/sort.py
@@ -579,7 +579,7 @@ def _sort_common(
with T.serial(0, cast(upper_lim - lower_lim, target_dtype)) as l2_width:
width = 2 << (l2_width + lower_lim)
- # Define and launch the cuda kernel
+ # Define and launch the CUDA kernel
target = tvm.target.Target.current()
if "vulkan" in str(target):
ntx = max_threads
diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h
index 12260a78ef..9c99a83250 100644
--- a/src/runtime/contrib/cublas/cublas_utils.h
+++ b/src/runtime/contrib/cublas/cublas_utils.h
@@ -122,7 +122,7 @@ inline cudaDataType_t GetCudaDataType(DLDataType type) {
return CUDA_R_16BF;
}
}
- LOG(FATAL) << "Unsupported cuda type";
+ LOG(FATAL) << "Unsupported CUDA type";
}
/*! \brief Execute matrix multiply followed by the specified epilogue, using cuBLASLt. */
diff --git a/src/runtime/contrib/nvshmem/memory_allocator.cc b/src/runtime/contrib/nvshmem/memory_allocator.cc
index 5893d04ac3..c53935f8bc 100644
--- a/src/runtime/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/contrib/nvshmem/memory_allocator.cc
@@ -76,7 +76,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
void* DeviceAllocDataSpace(Device dev, size_t size, size_t alignment,
DLDataType type_hint) final {
ICHECK_EQ(dev.device_type, DLDeviceType::kDLCUDA)
- << "nvshmem can only allocate cuda device memory space.";
+ << "nvshmem can only allocate CUDA device memory space.";
ICHECK(type_hint.code == DLDataTypeCode::kDLInt || type_hint.code == DLDataTypeCode::kDLUInt ||
type_hint.code == DLDataTypeCode::kDLFloat)
<< "nvshmem can only allocate tensor with int, unsigned int or float data types.";
diff --git a/src/runtime/contrib/papi/papi.cc b/src/runtime/contrib/papi/papi.cc
index 91af80de37..917fe1930e 100644
--- a/src/runtime/contrib/papi/papi.cc
+++ b/src/runtime/contrib/papi/papi.cc
@@ -88,7 +88,7 @@ int component_for_device(Device dev) {
/*! \brief MetricCollectorNode for PAPI metrics.
*
* PAPI (Performance Application Programming Interface) collects metrics on a
- * variety of platforms including cpu, cuda and rocm.
+ * variety of platforms including CPU, CUDA and ROCm.
*
* PAPI is available at https://github.com/icl-utk-edu/papi.
*/
diff --git a/src/runtime/cuda/cuda_module.h b/src/runtime/cuda/cuda_module.h
index 935758f062..58776dd7e5 100644
--- a/src/runtime/cuda/cuda_module.h
+++ b/src/runtime/cuda/cuda_module.h
@@ -39,12 +39,12 @@ namespace runtime {
static constexpr const int kMaxNumGPUs = 32;
/*!
- * \brief create a cuda module from data.
+ * \brief create a CUDA module from data.
*
* \param data The module data, can be ptx, cubin
* \param fmt The format of the data, can be "ptx", "cubin"
* \param fmap The map function information map of each function.
- * \param cuda_source Optional, cuda source file
+ * \param cuda_source Optional, CUDA source file
*/
ffi::Module CUDAModuleCreate(std::string data, std::string fmt,
ffi::Map<ffi::String, FunctionInfo> fmap,
std::string cuda_source);
diff --git a/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc b/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
index 266d9ae641..578c3ad5ca 100644
--- a/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
+++ b/src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
@@ -109,7 +109,7 @@ class RewriteUnboundBlockNode : public PostprocNode {
public:
/*! \brief The max number of threads per block from Target */
int max_threads_per_block_ = -1;
- /*! \brief The max number of threadblocks in the cuda device */
+ /*! \brief The max number of threadblocks in the CUDA device */
int max_threadblocks_ = -1;
static void RegisterReflection() {
diff --git a/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc b/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
index d645200572..7b650643ec 100644
--- a/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
@@ -53,7 +53,7 @@ class AutoBindNode : public ScheduleRuleNode {
public:
/*! \brief The max number of threads per block from Target */
int64_t max_threads_per_block_ = -1;
- /*! \brief The max number of threadblocks in the cuda device */
+ /*! \brief The max number of threadblocks in the CUDA device */
int64_t max_threadblocks_ = -1;
/*! \brief thread_extents Candidates of thread axis extent. */
ffi::Array<Integer> thread_extents_;
diff --git a/src/target/opt/build_cuda_off.cc b/src/target/opt/build_cuda_off.cc
index f35f69a1a2..339d07fd73 100644
--- a/src/target/opt/build_cuda_off.cc
+++ b/src/target/opt/build_cuda_off.cc
@@ -18,7 +18,7 @@
*/
/*!
- * Optional module when build cuda is switched to off
+ * Optional module when build CUDA is switched to off
*/
#include "../../runtime/cuda/cuda_module.h"
namespace tvm {
diff --git a/src/target/opt/build_cuda_on.cc b/src/target/opt/build_cuda_on.cc
index 88960594d0..8cc1472172 100644
--- a/src/target/opt/build_cuda_on.cc
+++ b/src/target/opt/build_cuda_on.cc
@@ -18,8 +18,8 @@
*/
/*!
- * Build cuda modules from source.
- * requires cuda to be available.
+ * Build CUDA modules from source.
+ * requires CUDA to be available.
*
* \file build_cuda.cc
*/
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 650c3e3d96..32f0907ee2 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -149,7 +149,7 @@ void CodeGenCUDA::PrintFunctionSignature(const ffi::String& function_name, const
} else if (calling_conv == CallingConv::kDefault) {
os << "extern \"C\" __device__ ";
} else {
- LOG(FATAL) << "Unsupported calling convention for cuda codegen: " << calling_conv;
+ LOG(FATAL) << "Unsupported calling convention for CUDA codegen: " << calling_conv;
}
CodeGenC::PrintFunctionSignature(function_name, func, os);
}
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
index 02fc0603a5..4ff20f0c91 100644
--- a/src/target/source/codegen_cuda.h
+++ b/src/target/source/codegen_cuda.h
@@ -19,7 +19,7 @@
/*!
* \file codegen_cuda.h
- * \brief Utility to generate cuda code
+ * \brief Utility to generate CUDA code
*/
#ifndef TVM_TARGET_SOURCE_CODEGEN_CUDA_H_
#define TVM_TARGET_SOURCE_CODEGEN_CUDA_H_
diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h
index 682845e9e7..78ee0298be 100644
--- a/src/target/source/literal/cuda_half_t.h
+++ b/src/target/source/literal/cuda_half_t.h
@@ -19,7 +19,7 @@
/*!
* \file cuda_half_t.h
- * \brief half_t (fp16) definition for cuda codegen.
+ * \brief half_t (fp16) definition for CUDA codegen.
*/
#ifndef TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_
#define TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_
diff --git a/src/target/source/literal/cuda_int8_t.h b/src/target/source/literal/cuda_int8_t.h
index ce166ea8f3..4d9a260979 100644
--- a/src/target/source/literal/cuda_int8_t.h
+++ b/src/target/source/literal/cuda_int8_t.h
@@ -19,7 +19,7 @@
/*!
* \file cuda_int8_t.h
- * \brief Extra int8 intrisic for cuda codegen.
+ * \brief Extra int8 intrinsic for CUDA codegen.
*/
#ifndef TVM_TARGET_SOURCE_LITERAL_CUDA_INT8_T_H_
#define TVM_TARGET_SOURCE_LITERAL_CUDA_INT8_T_H_
diff --git a/tests/python/contrib/test_msc/test_runner.py b/tests/python/contrib/test_msc/test_runner.py
index 14c872bedd..be3140f72a 100644
--- a/tests/python/contrib/test_msc/test_runner.py
+++ b/tests/python/contrib/test_msc/test_runner.py
@@ -91,7 +91,7 @@ def test_tvm_runner_cpu(training):
@tvm.testing.requires_cuda
@pytest.mark.parametrize("training", [True, False])
def test_tvm_runner_cuda(training):
- """Test runner for tvm on cuda"""
+ """Test runner for tvm on CUDA"""
_test_from_torch(TVMRunner, "cuda", training=training)
@@ -106,7 +106,7 @@ def test_torch_runner_cpu(training):
@tvm.testing.requires_cuda
@pytest.mark.parametrize("training", [True, False])
def test_torch_runner_cuda(training):
- """Test runner for torch on cuda"""
+ """Test runner for torch on CUDA"""
_test_from_torch(TorchRunner, "cuda", training=training, atol=1e-1, rtol=1e-1)
diff --git a/tests/python/relax/test_relax_operators.py b/tests/python/relax/test_relax_operators.py
index 897082dd79..c0417ccaab 100644
--- a/tests/python/relax/test_relax_operators.py
+++ b/tests/python/relax/test_relax_operators.py
@@ -54,7 +54,7 @@ def run_cpu(mod, func_name, *args, exec_mode):
def test_unique(exec_mode):
- # TODO(prakalp): also add test for compiling and running on cuda device.
+ # TODO(prakalp): also add test for compiling and running on CUDA device.
data_numpy = np.random.randint(0, 16, (16, 16))
data = tvm.runtime.tensor(data_numpy)
result, result_sorted = run_cpu(InputModule, "foo", data, exec_mode=exec_mode)
diff --git a/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py b/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
index 32ec677167..57d2bca3b4 100644
--- a/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
+++ b/tests/python/tir-transform/test_tir_transform_device_kernel_launch.py
@@ -26,7 +26,7 @@ def test_lower_device_kernel_launch():
The "tir.kernel_launch_params" determines which parameters belong
to the runtime, and which belong to the device-side PrimFunc.
Parameters that are required prior to launching a kernel (e.g. the
- number of Cuda threads to use) are stored in the
+ number of CUDA threads to use) are stored in the
`"tir.kernel_launch_params"` attribute, and are used by the
runtime in order to launch the generated kernel.
"""
@@ -126,7 +126,7 @@ def test_collect_launch_parameter():
The "tir.kernel_launch_params" determines which parameters belong
to the runtime, and which belong to the device-side PrimFunc.
Parameters that are required prior to launching a kernel (e.g. the
- number of Cuda threads to use) are stored in the
+ number of CUDA threads to use) are stored in the
`"tir.kernel_launch_params"` attribute, and are used by the
runtime in order to launch the generated kernel.
"""
diff --git a/tests/scripts/task_show_node_info.sh b/tests/scripts/task_show_node_info.sh
index 8569a1e0d9..af095ed453 100755
--- a/tests/scripts/task_show_node_info.sh
+++ b/tests/scripts/task_show_node_info.sh
@@ -41,4 +41,4 @@ echo "===== RUNNER INFO ====="
df --human-readable
lscpu
free
-nvidia-smi 2>/dev/null || echo "cuda not found"
+nvidia-smi 2>/dev/null || echo "CUDA not found"