This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new cfadc1aff2 [CUDA][FFI] Extend kernel launch config to support 
Programmatic Dependent Launch and cuLaunchCooperativeKernel (#18604)
cfadc1aff2 is described below

commit cfadc1aff28631f8b1c02935280032e9979c90ac
Author: senhtry <[email protected]>
AuthorDate: Sat Dec 27 22:14:53 2025 +0800

    [CUDA][FFI] Extend kernel launch config to support Programmatic Dependent 
Launch and cuLaunchCooperativeKernel (#18604)
    
    This patch adds support for **Programmatic Dependent Kernel Launch
    (PDL)** in the TVM CUDA FFI layer. PDL enables launching dependent
    kernels on the GPU without host intervention, improving performance and
    expressiveness for dynamic CUDA workloads.
    
    Refer to NVIDIA documentation for PDL semantics:
    
    
https://docs.nvidia.com/cuda/cuda-programming-guide/03-advanced/advanced-host-programming.html#programmatic-dependent-kernel-launch
    
    In addition, this patch extends the CUDA FFI layer to support
    cooperative kernel launches via cuLaunchCooperativeKernel. Cooperative
    kernels allow grid-wide synchronization and are required for certain
    multi-stage or producer–consumer GPU workloads. When a kernel is marked
    for cooperative launch, it will be dispatched using
    cuLaunchCooperativeKernel instead of the standard cuLaunchKernel.
    
    Refer to NVIDIA documentation for cooperative kernel usage:
    
    
https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/cooperative-groups.html#when-to-use-cudalaunchcooperativekernel
---
 src/runtime/cuda/cuda_module.cc    | 33 ++++++++++++++++++++++++++++++---
 src/runtime/meta_data.h            |  4 ++++
 src/runtime/thread_storage_scope.h | 12 ++++++++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 3fee6b55f2..f07996c68b 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -200,9 +200,36 @@ class CUDAWrappedFunc {
       }
     }
     CUstream strm = static_cast<CUstream>(TVMFFIEnvGetStream(kDLCUDA, 
device_id));
-    CUresult result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), 
wl.grid_dim(1),
-                                     wl.grid_dim(2), wl.block_dim(0), 
wl.block_dim(1),
-                                     wl.block_dim(2), wl.dyn_shmem_size, strm, 
void_args, nullptr);
+    CUresult result;
+
+    if (launch_param_config_.use_programtic_dependent_launch()) {
+      CUlaunchConfig config{};
+      CUlaunchAttribute attribute[1]{};
+      attribute[0].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
+      attribute[0].value.programmaticStreamSerializationAllowed = 1;
+
+      config.attrs = attribute;
+      config.numAttrs = 1;
+      config.hStream = strm;
+      config.gridDimX = wl.grid_dim(0);
+      config.gridDimY = wl.grid_dim(1);
+      config.gridDimZ = wl.grid_dim(2);
+      config.blockDimX = wl.block_dim(0);
+      config.blockDimY = wl.block_dim(1);
+      config.blockDimZ = wl.block_dim(2);
+      config.sharedMemBytes = wl.dyn_shmem_size;
+
+      result = cuLaunchKernelEx(&config, fcache_[device_id], void_args, 
nullptr);
+    } else if (launch_param_config_.use_cooperative_launch()) {
+      result = cuLaunchCooperativeKernel(fcache_[device_id], wl.grid_dim(0), 
wl.grid_dim(1),
+                                         wl.grid_dim(2), wl.block_dim(0), 
wl.block_dim(1),
+                                         wl.block_dim(2), wl.dyn_shmem_size, 
strm, void_args);
+    } else {
+      result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), 
wl.grid_dim(1), wl.grid_dim(2),
+                              wl.block_dim(0), wl.block_dim(1), 
wl.block_dim(2), wl.dyn_shmem_size,
+                              strm, void_args, nullptr);
+    }
+
     if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {
       const char* msg;
       cuGetErrorName(result, &msg);
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 85b83289f4..aceb97b583 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -48,6 +48,10 @@ namespace launch_param {
 
 /*! \brief A tag to specify whether or not dynamic shared memory is used */
 constexpr const char* kUseDynamicSharedMemoryTag = "tir.use_dyn_shared_memory";
+/*! \brief A tag to specify whether or not to use programmatic dependent launch */
+constexpr const char* kUseProgramaticDependentLaunch = 
"tir.use_programtic_dependent_launch";
+/*! \brief A tag to specify whether or not to use cooperative launch */
+constexpr const char* kUseCooperativeLaunch = "tir.use_cooperative_launch";
 
 }  // namespace launch_param
 
diff --git a/src/runtime/thread_storage_scope.h 
b/src/runtime/thread_storage_scope.h
index 914fe67819..c2cd792220 100644
--- a/src/runtime/thread_storage_scope.h
+++ b/src/runtime/thread_storage_scope.h
@@ -247,6 +247,10 @@ class LaunchParamConfig {
         ICHECK_EQ(i, launch_param_tags.size() - 1)
             << "kUseDynamicSharedMemoryTag should be the last tag in 
launch_param_tags.";
         use_dyn_shared_memory_ = true;
+      } else if (tag == launch_param::kUseProgramaticDependentLaunch) {
+        use_programmatic_dependent_launch_ = true;
+      } else if (tag == launch_param::kUseCooperativeLaunch) {
+        use_cooperative_launch_ = true;
       } else {
         ThreadScope ts = ThreadScope::Create(tag);
         arg_index_map_.push_back(ts.rank * 3 + ts.dim_index);
@@ -281,6 +285,10 @@ class LaunchParamConfig {
   // return the work dim
   size_t work_dim() const { return work_dim_; }
 
+  bool use_programtic_dependent_launch() const { return 
use_programmatic_dependent_launch_; }
+
+  bool use_cooperative_launch() const { return use_cooperative_launch_; }
+
  private:
   /*! \brief base axis */
   size_t base_;
@@ -290,6 +298,10 @@ class LaunchParamConfig {
   std::vector<uint32_t> arg_index_map_;
   /*! \brief Whether or not use dynamic shared memory. */
   bool use_dyn_shared_memory_{false};
+  /*! \brief Whether or not to use programmatic dependent launch. */
+  bool use_programmatic_dependent_launch_{false};
+  /*! \brief Whether or not to use cooperative launch. */
+  bool use_cooperative_launch_{false};
 };
 
 }  // namespace runtime

Reply via email to