Author: Srivarshitha M
Date: 2026-04-08T10:07:08+09:00
New Revision: 24833808df27fe0b27b4491617dae746dec301ba

URL: 
https://github.com/llvm/llvm-project/commit/24833808df27fe0b27b4491617dae746dec301ba
DIFF: 
https://github.com/llvm/llvm-project/commit/24833808df27fe0b27b4491617dae746dec301ba.diff

LOG: [CIR][CUDA][HIP] Support stream per thread kernel launch (#188004)

Related: #175871, #179278

When `-fgpu-default-stream=per-thread` is specified, CUDA and HIP
kernels should be launched using the per-thread stream variants of the
launch API instead of the default `cudaLaunchKernel`/`hipLaunchKernel`.

This PR implements that by selecting the correct launch function name in
`emitDeviceStubBodyNew`:
For CUDA: `cudaLaunchKernel_ptsz`
For HIP: `hipLaunchKernel_spt`

This matches the behavior of the OG CodeGen implementation in
`CGCUDANV.cpp`.

Added: 
    

Modified: 
    clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
    clang/test/CIR/CodeGenCUDA/kernel-call.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp 
b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 8b8e99023eceb..1bae5e470aadd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -150,10 +150,14 @@ void 
CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
 
   // The default stream is usually stream 0 (the legacy default stream).
   // For per-thread default stream, we need a 
diff erent LaunchKernel function.
-  StringRef kernelLaunchAPI = "LaunchKernel";
+  std::string kernelLaunchAPI = "LaunchKernel";
   if (cgm.getLangOpts().GPUDefaultStream ==
-      LangOptions::GPUDefaultStreamKind::PerThread)
-    cgm.errorNYI("CUDA/HIP Stream per thread");
+      LangOptions::GPUDefaultStreamKind::PerThread) {
+    if (cgm.getLangOpts().HIP)
+      kernelLaunchAPI += "_spt";
+    else if (cgm.getLangOpts().CUDA)
+      kernelLaunchAPI += "_ptsz";
+  }
 
   std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
   const IdentifierInfo &launchII =

diff  --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu 
b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
index 2d37b6eef73af..230bcdfe6e22c 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
@@ -14,6 +14,14 @@
 // RUN:   -emit-cir %s -x cuda -fcuda-is-device -o %t.device.cir
 // RUN: FileCheck --input-file=%t.device.cir %s --check-prefix=DEVICE
 
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN:   -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM 
\
+// RUN:   -emit-cir %s -x cuda -o - | FileCheck %s --check-prefix=CUDA-PTH
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fhip-new-launch-api \
+// RUN:   -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \
+// RUN:   -emit-cir %s -x hip -o - | FileCheck %s --check-prefix=HIP-PTH
+
 #include "Inputs/cuda.h"
 
 
@@ -55,6 +63,8 @@
 // Check cudaLaunchKernel is called with all 6 arguments:
 // func ptr, gridDim, blockDim, args, sharedMem, stream
 // CUDA-NEW: cir.call @cudaLaunchKernel({{.*}}) : (!cir.ptr<!void>{{.*}}, 
!rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}}, 
!cir.ptr<!rec_cudaStream>{{.*}}) -> (!u32i {llvm.noundef})
+// CUDA-PTH: cir.call @cudaLaunchKernel_ptsz
+
 //
 // HIP-NEW: cir.global constant external @_Z6kernelif = 
#cir.global_view<@_Z21__device_stub__kernelif> : !cir.func<(!s32i, !cir.float)>
 // HIP-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelif
@@ -62,6 +72,8 @@
 // HIP-NEW: cir.call @__hipPopCallConfiguration({{.*}}) : 
(!cir.ptr<!rec_dim3>, !cir.ptr<!rec_dim3>, !cir.ptr<!u64i>, 
!cir.ptr<!cir.ptr<!rec_hipStream>>) -> !s32i
 // HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr<!cir.func<(!s32i, 
!cir.float)>>
 // HIP-NEW: cir.call @hipLaunchKernel({{.*}}) : (!cir.ptr<!void> {{.*}}, 
!rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}}, 
!cir.ptr<!rec_hipStream>{{.*}}) -> (!u32i {llvm.noundef})
+// HIP-PTH: cir.call @hipLaunchKernel_spt
+
 __global__ void kernel(int x, float y) {}
 
 // ===----------------------------------------------------------------------===


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to