This is an automated email from the ASF dual-hosted git repository.

ruihangl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git


The following commit(s) were added to refs/heads/main by this push:
     new 65b5e905 [Feature] Support AMD HIP for cpp extension (#460)
65b5e905 is described below

commit 65b5e90576185cb6300f43bc1307158dc99afb54
Author: DarkSharpness <[email protected]>
AuthorDate: Thu Feb 19 01:54:32 2026 +0800

    [Feature] Support AMD HIP for cpp extension (#460)
    
    Related issue: #458.
    
    I'm not familiar with AMD at all, and most of the code was generated by
    Claude Code. I've only cleaned it up a little and tried the following
    example on one AMD machine, so it still needs review from AMD experts.
    
    ```python
    import torch
    from tvm_ffi import Module
    import tvm_ffi.cpp
    
    # define the C++/HIP source code
    cpp_source = '''
    #include <hip/hip_runtime.h>
    
    __global__ void add_one_kernel(const float* __restrict__ x,
                                   float* __restrict__ y,
                                   int64_t n) {
        int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) y[i] = x[i] + 1.0f;
    }
    
    void add_one_hip(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
        int64_t n = x.size(0);
        const float* x_ptr = static_cast<const float*>(x.data_ptr());
        float* y_ptr = static_cast<float*>(y.data_ptr());
        constexpr int threads = 256;
        int blocks = (int)((n + threads - 1) / threads);
        hipStream_t stream = 0;  // default stream; replace if your runtime provides a stream
        hipLaunchKernelGGL(add_one_kernel,
                           dim3(blocks), dim3(threads),
                           0, stream,
                           x_ptr, y_ptr, n);
    }
    '''
    
    # compile the cpp source code and load the module
    mod: Module = tvm_ffi.cpp.load_inline(
        name="hello",
        cuda_sources=cpp_source,
        functions="add_one_hip",
    )
    
    # use the function from the loaded module to perform the computation
    x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32, device="cuda")
    y = torch.empty_like(x, device="cuda")
    mod.add_one_hip(x, y)
    torch.testing.assert_close(x + 1, y)
    ```
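
    The backend is auto-detected (HIP when a ROCm installation is found, CUDA
    otherwise), but it can also be pinned explicitly through the new `backend`
    argument or the `TVM_FFI_GPU_BACKEND` environment variable added in this
    patch. A minimal sketch of those overrides is shown below; it reuses the
    `cpp_source` string from the example above, and the module name and
    architecture list are only illustrative:

    ```python
    import os

    import tvm_ffi.cpp

    # Option 1: force the backend via the environment variable read by the
    # auto-detection helper. Set it before the first build, since the detected
    # backend is cached with functools.lru_cache.
    os.environ["TVM_FFI_GPU_BACKEND"] = "hip"  # or "cuda"
    # Optionally pin the architectures passed to hipcc as --offload-arch flags.
    os.environ["TVM_FFI_ROCM_ARCH_LIST"] = "gfx90a gfx942"  # illustrative values

    # Option 2: pass backend= explicitly to load_inline/build_inline/build/load,
    # which skips auto-detection entirely.
    mod = tvm_ffi.cpp.load_inline(
        name="hello_hip",            # hypothetical module name
        cuda_sources=cpp_source,     # HIP sources are still passed via cuda_sources
        functions="add_one_hip",
        backend="hip",
    )
    ```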
    
    ---------
    
    Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 python/tvm_ffi/cpp/extension.py | 173 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 155 insertions(+), 18 deletions(-)

diff --git a/python/tvm_ffi/cpp/extension.py b/python/tvm_ffi/cpp/extension.py
index b558fb09..af30348c 100644
--- a/python/tvm_ffi/cpp/extension.py
+++ b/python/tvm_ffi/cpp/extension.py
@@ -27,13 +27,39 @@ import sys
 from collections.abc import Mapping, Sequence
 from contextlib import nullcontext
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 from tvm_ffi.libinfo import find_dlpack_include_path, find_include_path, find_libtvm_ffi
 from tvm_ffi.module import Module, load_module
 from tvm_ffi.utils import FileLock
 
 IS_WINDOWS = sys.platform == "win32"
+BACKEND_STR = Literal["cuda", "hip"]
+
+
+@functools.lru_cache
+def _detect_gpu_backend() -> BACKEND_STR:
+    """Auto-detect whether to use CUDA or HIP (ROCm).
+
+    Returns 'hip' if ROCm/HIP is available, 'cuda' otherwise.
+    """
+    # Check environment variable override first
+    backend = os.environ.get("TVM_FFI_GPU_BACKEND", "").lower()
+    if backend in ("cuda", "hip"):
+        return backend  # type: ignore[return-value]
+    try:
+        _find_rocm_home()
+        return "hip"
+    except RuntimeError:
+        return "cuda"
+
+
+def _resolve_gpu_backend(backend: str | None) -> BACKEND_STR:
+    if backend is not None:
+        if backend in ("cuda", "hip"):
+            return backend  # type: ignore[return-value]
+        raise ValueError(f"Invalid backend: {backend}. Supported backends are 'cuda' and 'hip'.")
+    return _detect_gpu_backend()
 
 
 def _hash_sources(
@@ -177,6 +203,72 @@ def _get_cuda_target() -> str:
             )
 
 
+@functools.lru_cache
+def _find_rocm_home() -> str:
+    """Find the ROCm install path."""
+    # Guess #1: check environment variables
+    rocm_home = os.environ.get("ROCM_HOME") or os.environ.get("ROCM_PATH")
+    if rocm_home is None:
+        hipcc_path = shutil.which("hipcc")
+        # Guess #2: find hipcc in PATH and resolve ROCm home from it
+        if hipcc_path is not None:
+            rocm_home = str(Path(hipcc_path).resolve().parent.parent)
+            if Path(rocm_home).name == "hip":
+                rocm_home = str(Path(rocm_home).parent)
+        else:
+            # Guess #3: use default installation path
+            rocm_home = "/opt/rocm"
+            if not Path(rocm_home).exists():
+                raise RuntimeError(
+                    "Could not find ROCm installation. Please set ROCM_HOME 
environment variable."
+                )
+    return rocm_home
+
+
+def _get_rocm_target() -> list[str]:
+    """Get the ROCm target architecture flags (--offload-arch=gfxXXXX)."""
+    if "TVM_FFI_ROCM_ARCH_LIST" in os.environ:
+        arch_list = os.environ["TVM_FFI_ROCM_ARCH_LIST"].split()  # e.g., "gfx90a gfx942"
+        return [f"--offload-arch={arch}" for arch in arch_list]
+    # Try rocm_agent_enumerator
+    try:
+        agent_enum = str(Path(_find_rocm_home()) / "bin" / "rocm_agent_enumerator")
+        if not Path(agent_enum).exists():
+            agent_enum = "rocm_agent_enumerator"
+        status = subprocess.run(args=[agent_enum], capture_output=True, check=True, text=True)
+        archs = list(
+            dict.fromkeys(
+                line.strip()
+                for line in status.stdout.strip().split("\n")
+                if line.strip() and line.strip() != "gfx000"
+            )
+        )
+        if archs:
+            return [f"--offload-arch={arch}" for arch in archs]
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        pass
+    # Try rocminfo
+    try:
+        status = subprocess.run(args=["rocminfo"], capture_output=True, 
check=True, text=True)
+        archs = list(
+            dict.fromkeys(
+                line.split(":")[-1].strip()
+                for line in status.stdout.split("\n")
+                if "Name:" in line
+                and "gfx" in line.lower()
+                and line.split(":")[-1].strip() != "gfx000"
+            )
+        )
+        if archs:
+            return [f"--offload-arch={arch}" for arch in archs]
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        pass
+    raise RuntimeError(
+        "Could not detect ROCm GPU architecture automatically. "
+        "Please set TVM_FFI_ROCM_ARCH_LIST environment variable (e.g. 'gfx90a 
gfx942')."
+    )
+
+
 def _run_command_in_dev_prompt(
     args: list[str],
     cwd: str | os.PathLike[str],
@@ -242,7 +334,6 @@ def _run_command_in_dev_prompt(
 
 def _generate_ninja_build(  # noqa: PLR0915, PLR0912
     name: str,
-    with_cuda: bool,
     extra_cflags: Sequence[str],
     extra_cuda_cflags: Sequence[str],
     extra_ldflags: Sequence[str],
@@ -250,8 +341,13 @@ def _generate_ninja_build(  # noqa: PLR0915, PLR0912
     cpp_files: Sequence[str],
     cuda_files: Sequence[str],
     embed_cubin: Mapping[str, bytes] | None = None,
+    backend: str | None = None,
 ) -> str:
     """Generate the content of build.ninja for building the module."""
+    with_hip = backend == "hip"
+    with_cuda = backend == "cuda"
+    with_backend = with_hip or with_cuda
+
     default_include_paths = [find_include_path(), find_dlpack_include_path()]
     tvm_ffi_lib = Path(find_libtvm_ffi())
     tvm_ffi_lib_path = str(tvm_ffi_lib.parent)
@@ -280,11 +376,20 @@ def _generate_ninja_build(  # noqa: PLR0915, PLR0912
         ]
     else:
         default_cflags = ["-std=c++17", "-fPIC", "-O2"]
-        default_cuda_cflags = ["-Xcompiler", "-fPIC", "-std=c++17", "-O2"]
+        default_cuda_cflags = ["-std=c++17", "-O2"]
         default_ldflags = ["-shared", f"-L{tvm_ffi_lib_path}", "-ltvm_ffi"]
 
+        if with_hip:
+            rocm_home = _find_rocm_home()
+            default_cuda_cflags += ["-fPIC", "-D__HIP_PLATFORM_AMD__=1", 
"-fno-gpu-rdc"]
+            default_cuda_cflags += _get_rocm_target()
+            default_include_paths.append(str(Path(rocm_home) / "include"))
+            default_ldflags += [
+                f"-L{Path(rocm_home) / 'lib'!s}",
+                "-lamdhip64",
+            ]
         if with_cuda:
-            # determine the compute capability of the current GPU
+            default_cuda_cflags = ["-Xcompiler", "-fPIC", *default_cuda_cflags]
             default_cuda_cflags += [_get_cuda_target()]
             default_ldflags += [
                 "-L{}".format(str(Path(_find_cuda_home()) / "lib64")),
@@ -308,8 +413,11 @@ def _generate_ninja_build(  # noqa: PLR0915, PLR0912
     ninja.append("ninja_required_version = 1.3")
     ninja.append("cxx = {}".format(os.environ.get("CXX", "cl" if IS_WINDOWS 
else "c++")))
     ninja.append("cflags = {}".format(" ".join(cflags)))
-    if with_cuda:
-        ninja.append("nvcc = {}".format(str(Path(_find_cuda_home()) / "bin" / 
"nvcc")))
+    if with_backend:
+        if with_hip:
+            ninja.append("nvcc = {}".format(str(Path(_find_rocm_home()) / 
"bin" / "hipcc")))
+        if with_cuda:
+            ninja.append("nvcc = {}".format(str(Path(_find_cuda_home()) / 
"bin" / "nvcc")))
         ninja.append("cuda_cflags = {}".format(" ".join(cuda_cflags)))
     ninja.append("ldflags = {}".format(" ".join(ldflags)))
 
@@ -325,13 +433,16 @@ def _generate_ninja_build(  # noqa: PLR0915, PLR0912
         ninja.append("  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out")
     ninja.append("")
 
-    if with_cuda:
+    if with_backend:
         ninja.append("rule compile_cuda")
         ninja.append("  depfile = $out.d")
         ninja.append("  deps = gcc")
-        ninja.append(
-            "  command = $nvcc --generate-dependencies-with-compile 
--dependency-output $out.d $cuda_cflags -c $in -o $out"
-        )
+        if with_hip:
+            ninja.append("  command = $nvcc $cuda_cflags -c $in -o $out")
+        else:
+            ninja.append(
+                "  command = $nvcc  --generate-dependencies-with-compile 
--dependency-output $out.d $cuda_cflags -c $in -o $out"
+            )
         ninja.append("")
 
     # Add rules for object merging and cubin embedding (Unix only)
@@ -482,7 +593,7 @@ def _str_seq2list(seq: Sequence[str] | str | None) -> list[str]:
         return list(seq)
 
 
-def _build_impl(
+def _build_impl(  # noqa: PLR0913
     name: str,
     cpp_files: Sequence[str] | str | None,
     cuda_files: Sequence[str] | str | None,
@@ -493,15 +604,17 @@ def _build_impl(
     build_directory: str | None,
     need_lock: bool = True,
     embed_cubin: Mapping[str, bytes] | None = None,
+    backend: str | None = None,
 ) -> str:
     """Real implementation of build function."""
     # need to resolve the path to make it unique
     cpp_path_list = [str(Path(p).resolve()) for p in _str_seq2list(cpp_files)]
     cuda_path_list = [str(Path(p).resolve()) for p in _str_seq2list(cuda_files)]
     with_cpp = bool(cpp_path_list)
-    with_cuda = bool(cuda_path_list)
-    assert with_cpp or with_cuda, "Either cpp_files or cuda_files must be provided."
+    with_backend = bool(cuda_path_list)
+    assert with_cpp or with_backend, "Either cpp_files or cuda_files must be provided."
 
+    resolved_backend = _resolve_gpu_backend(backend) if with_backend else None
     extra_ldflags_list = list(extra_ldflags) if extra_ldflags is not None else []
     extra_cflags_list = list(extra_cflags) if extra_cflags is not None else []
     extra_cuda_cflags_list = list(extra_cuda_cflags) if extra_cuda_cflags is not None else []
@@ -541,7 +654,6 @@ def _build_impl(
     # generate build.ninja
     ninja_source = _generate_ninja_build(
         name=name,
-        with_cuda=with_cuda,
         extra_cflags=extra_cflags_list,
         extra_cuda_cflags=extra_cuda_cflags_list,
         extra_ldflags=extra_ldflags_list,
@@ -549,6 +661,7 @@ def _build_impl(
         cpp_files=cpp_path_list,
         cuda_files=cuda_path_list,
         embed_cubin=embed_cubin,
+        backend=resolved_backend,
     )
 
     # may not hold lock when build_directory is specified, prevent deadlock
@@ -562,7 +675,7 @@ def _build_impl(
         return str((build_dir / f"{name}{ext}").resolve())
 
 
-def build_inline(
+def build_inline(  # noqa: PLR0913
     name: str,
     *,
     cpp_sources: Sequence[str] | str | None = None,
@@ -574,6 +687,7 @@ def build_inline(
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     embed_cubin: Mapping[str, bytes] | None = None,
+    backend: str | None = None,
 ) -> str:
     """Compile and build a C++/CUDA module from inline source code.
 
@@ -648,6 +762,10 @@ def build_inline(
         the macro `TVM_FFI_EMBED_CUBIN_GET_KERNEL(name, kernel_name)` defined in the `tvm/ffi/extra/cuda/cubin_launcher.h` header.
         See the `examples/cubin_launcher` directory for examples how to use cubin launcher to launch CUBIN kernels in TVM-FFI.
 
+    backend
+        The GPU backend to use. It can be "cuda" or "hip".
+        If not specified, the backend will be automatically determined based on the available GPU and the provided source code.
+
     Returns
     -------
     lib_path: str
@@ -702,7 +820,7 @@ def build_inline(
 
     cuda_source_list = _str_seq2list(cuda_sources)
     cuda_source = "\n".join(cuda_source_list)
-    with_cuda = bool(cuda_source_list)
+    with_backend = bool(cuda_source_list)
     del cuda_source_list
 
     extra_ldflags_list = list(extra_ldflags) if extra_ldflags is not None else []
@@ -753,13 +871,13 @@ def build_inline(
     with FileLock(str(build_dir / "lock")):
         # write source files if they do not already exist
         _maybe_write(cpp_file, cpp_source)
-        if with_cuda:
+        if with_backend:
             _maybe_write(cuda_file, cuda_source)
 
         return _build_impl(
             name=name,
             cpp_files=[cpp_file] if with_cpp else [],
-            cuda_files=[cuda_file] if with_cuda else [],
+            cuda_files=[cuda_file] if with_backend else [],
             extra_cflags=extra_cflags_list,
             extra_cuda_cflags=extra_cuda_cflags_list,
             extra_ldflags=extra_ldflags_list,
@@ -767,6 +885,7 @@ def build_inline(
             build_directory=str(build_dir),
             need_lock=False,  # already hold the lock
             embed_cubin=embed_cubin,
+            backend=backend,
         )
 
 
@@ -783,6 +902,7 @@ def load_inline(  # noqa: PLR0913
     build_directory: str | None = None,
     embed_cubin: Mapping[str, bytes] | None = None,
     keep_module_alive: bool = True,
+    backend: str | None = None,
 ) -> Module:
     """Compile, build and load a C++/CUDA module from inline source code.
 
@@ -859,6 +979,10 @@ def load_inline(  # noqa: PLR0913
         Whether to keep the module alive. If True, the module will be kept alive
         for the duration of the program until libtvm_ffi.so is unloaded.
 
+    backend
+        The GPU backend to use. It can be "cuda" or "hip".
+        If not specified, the backend will be automatically determined based on the available GPU and the provided source code.
+
     Returns
     -------
     mod: Module
@@ -919,6 +1043,7 @@ def load_inline(  # noqa: PLR0913
             extra_include_paths=extra_include_paths,
             build_directory=build_directory,
             embed_cubin=embed_cubin,
+            backend=backend,
         ),
         keep_module_alive=keep_module_alive,
     )
@@ -934,6 +1059,7 @@ def build(
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
+    backend: str | None = None,
 ) -> str:
     """Compile and build a C++/CUDA module from source files.
 
@@ -997,6 +1123,10 @@ def build(
         cache directory is ``~/.cache/tvm-ffi``. You can also set the ``TVM_FFI_CACHE_DIR`` environment variable to
         specify the cache directory.
 
+    backend
+        The GPU backend to use. It can be "cuda" or "hip".
+        If not specified, the backend will be automatically determined based on the available GPU and the provided source code.
+
     Returns
     -------
     lib_path: str
@@ -1060,6 +1190,7 @@ def build(
         extra_include_paths=extra_include_paths,
         build_directory=build_directory,
         need_lock=True,
+        backend=backend,
     )
 
 
@@ -1074,6 +1205,7 @@ def load(
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     keep_module_alive: bool = True,
+    backend: str | None = None,
 ) -> Module:
     """Compile, build and load a C++/CUDA module from source files.
 
@@ -1141,6 +1273,10 @@ def load(
         Whether to keep the module alive. If True, the module will be kept alive
         for the duration of the program until libtvm_ffi.so is unloaded.
 
+    backend
+        The GPU backend to use. It can be "cuda" or "hip".
+        If not specified, the backend will be automatically determined based on the available GPU and the provided source code.
+
     Returns
     -------
     mod: Module
@@ -1205,6 +1341,7 @@ def load(
             extra_ldflags=extra_ldflags,
             extra_include_paths=extra_include_paths,
             build_directory=build_directory,
+            backend=backend,
         ),
         keep_module_alive=keep_module_alive,
     )
