[clang] [llvm] [Offload] Unify the kernel argument passing (PR #205224)

via cfe-commits Mon, 22 Jun 2026 17:02:46 -0700

llvmorg-github-actions[bot] wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-backend-amdgpu

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
Currently we have two conflicting methods of passing kernel arguments, a
flat pointer + size and an array of pointers. We recently decided to
move the offload API to the latter because it is more generic and lets
you construct the other formats.

This PR primarily just changes the format and the one existing core use.
The uses should be simplier now. Future changes will change the OpenMP
argument parsing.


---

Patch is 23.10 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/205224.diff


10 Files Affected:

- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+13-16) 
- (modified) clang/test/CodeGenCUDA/offload_via_llvm.cu (+28-21) 
- (modified) offload/include/Shared/APITypes.h (+5-7) 
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+19-28) 
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+5-5) 
- (modified) offload/plugins-nextgen/common/src/RecordReplay.cpp (+1-1) 
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+3-43) 
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+4-1) 
- (modified) offload/plugins-nextgen/level_zero/include/L0Kernel.h (+3-2) 
- (modified) offload/plugins-nextgen/level_zero/src/L0Kernel.cpp (+2-34) 


``````````diff
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 17b1963684428..e1f8645bd9f2c 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -342,44 +342,41 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
     emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-/// CUDA passes the arguments with a level of indirection. For example, a
-/// (void*, short, void*) is passed as {void **, short *, void **} to the 
launch
-/// function. For the LLVM/offload launch we flatten the arguments into the
-/// struct directly. In addition, we include the size of the arguments, thus
-/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
-/// nullptr}. The last nullptr needs to be initialized to an array of pointers
-/// pointing to the arguments if we want to offload to the host.
+/// Build the input as a sized array of pointers so that it can be launched by
+/// the offloading runtime.
 Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
                                                       FunctionArgList &Args) {
   SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
   for (auto &Arg : Args)
     ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
   llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
+  llvm::Type *KernelArgsPtrsTy = llvm::ArrayType::get(PtrTy, Args.size());
 
-  auto *Int64Ty = CGF.Builder.getInt64Ty();
-  KernelLaunchParamsTypes.push_back(Int64Ty);
-  KernelLaunchParamsTypes.push_back(PtrTy);
+  auto *Int32Ty = CGF.Builder.getInt32Ty();
+  KernelLaunchParamsTypes.push_back(Int32Ty);
   KernelLaunchParamsTypes.push_back(PtrTy);
 
   llvm::StructType *KernelLaunchParamsTy =
       llvm::StructType::create(KernelLaunchParamsTypes);
   Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
       KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
+  Address KernelArgsPtrs = CGF.CreateTempAllocaWithoutCast(
+      KernelArgsPtrsTy, CharUnits::fromQuantity(16), "kernel_args_ptrs");
   Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
       KernelLaunchParamsTy, CharUnits::fromQuantity(16),
       "kernel_launch_params");
 
-  auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
-  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
+  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int32Ty, Args.size()),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
-  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
+  CGF.Builder.CreateStore(KernelArgsPtrs.emitRawPointer(CGF),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
-  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
-                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
 
   for (unsigned i = 0; i < Args.size(); ++i) {
     auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
-    CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, 
i));
+    Address ArgAddr = CGF.Builder.CreateStructGEP(KernelArgs, i);
+    CGF.Builder.CreateStore(ArgVal, ArgAddr);
+    CGF.Builder.CreateStore(ArgAddr.emitRawPointer(CGF),
+                            CGF.Builder.CreateConstArrayGEP(KernelArgsPtrs, 
i));
   }
 
   return KernelLaunchParams;
diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu 
b/clang/test/CodeGenCUDA/offload_via_llvm.cu
index 62942d8dc0755..b13a64c81b775 100644
--- a/clang/test/CodeGenCUDA/offload_via_llvm.cu
+++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu
@@ -15,6 +15,7 @@
 // HST-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
 // HST-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
 // HST-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16
+// HST-NEXT:    [[KERNEL_ARGS_PTRS:%.*]] = alloca [4 x ptr], align 16
 // HST-NEXT:    [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16
 // HST-NEXT:    [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8
 // HST-NEXT:    [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8
@@ -25,27 +26,33 @@
 // HST-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
 // HST-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
 // HST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr 
[[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0
-// HST-NEXT:    store i64 16, ptr [[TMP4]], align 16
+// HST-NEXT:    store i32 4, ptr [[TMP4]], align 16
 // HST-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr 
[[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1
-// HST-NEXT:    store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8
-// HST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr 
[[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2
-// HST-NEXT:    store ptr null, ptr [[TMP6]], align 4
-// HST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4
-// HST-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 0
-// HST-NEXT:    store i32 [[TMP7]], ptr [[TMP8]], align 16
+// HST-NEXT:    store ptr [[KERNEL_ARGS_PTRS]], ptr [[TMP5]], align 4
+// HST-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// HST-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 0
+// HST-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 16
+// HST-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr 
[[KERNEL_ARGS_PTRS]], i32 0, i32 0
+// HST-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 16
 // HST-NEXT:    [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2
 // HST-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 1
 // HST-NEXT:    store i16 [[TMP9]], ptr [[TMP10]], align 4
-// HST-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
-// HST-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 2
-// HST-NEXT:    store ptr [[TMP11]], ptr [[TMP12]], align 8
-// HST-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
-// HST-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 3
-// HST-NEXT:    store ptr [[TMP13]], ptr [[TMP14]], align 4
-// HST-NEXT:    [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr 
[[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
-// HST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[SHMEM_SIZE]], align 4
-// HST-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 4
-// HST-NEXT:    [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef 
@_Z18__device_stub__fooisPvS_, ptr noundef byval([[STRUCT_DIM3]]) align 4 
[[GRID_DIM]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[BLOCK_DIM]], ptr 
noundef [[KERNEL_LAUNCH_PARAMS]], i32 noundef [[TMP16]], ptr noundef [[TMP17]])
+// HST-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr 
[[KERNEL_ARGS_PTRS]], i32 0, i32 1
+// HST-NEXT:    store ptr [[TMP10]], ptr [[TMP11]], align 4
+// HST-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
+// HST-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 2
+// HST-NEXT:    store ptr [[TMP12]], ptr [[TMP13]], align 8
+// HST-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr 
[[KERNEL_ARGS_PTRS]], i32 0, i32 2
+// HST-NEXT:    store ptr [[TMP13]], ptr [[TMP14]], align 8
+// HST-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
+// HST-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr 
[[KERNEL_ARGS]], i32 0, i32 3
+// HST-NEXT:    store ptr [[TMP15]], ptr [[TMP16]], align 4
+// HST-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr 
[[KERNEL_ARGS_PTRS]], i32 0, i32 3
+// HST-NEXT:    store ptr [[TMP16]], ptr [[TMP17]], align 4
+// HST-NEXT:    [[TMP18:%.*]] = call i32 @__llvmPopCallConfiguration(ptr 
[[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
+// HST-NEXT:    [[TMP19:%.*]] = load i32, ptr [[SHMEM_SIZE]], align 4
+// HST-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[STREAM]], align 4
+// HST-NEXT:    [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef 
@_Z18__device_stub__fooisPvS_, ptr noundef byval([[STRUCT_DIM3]]) align 4 
[[GRID_DIM]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[BLOCK_DIM]], ptr 
noundef [[KERNEL_LAUNCH_PARAMS]], i32 noundef [[TMP19]], ptr noundef [[TMP20]]) 
#[[ATTR3:[0-9]+]]
 // HST-NEXT:    br label %[[SETUP_END:.*]]
 // HST:       [[SETUP_END]]:
 // HST-NEXT:    ret void
@@ -72,15 +79,15 @@ __global__ void foo(int, short, void *, void *) {}
 // HST-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
 // HST-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
 // HST-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR]], align 4
-// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 
dereferenceable(12) [[AGG_TMP]], i32 noundef 3, i32 noundef 1, i32 noundef 1)
-// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 
dereferenceable(12) [[AGG_TMP1]], i32 noundef 7, i32 noundef 1, i32 noundef 1)
-// HST-NEXT:    [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(ptr 
noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef 
byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null)
+// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 
dereferenceable(12) [[AGG_TMP]], i32 noundef 3, i32 noundef 1, i32 noundef 1) 
#[[ATTR4:[0-9]+]]
+// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 
dereferenceable(12) [[AGG_TMP1]], i32 noundef 7, i32 noundef 1, i32 noundef 1) 
#[[ATTR4]]
+// HST-NEXT:    [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(ptr 
noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef 
byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null) 
#[[ATTR4]]
 // HST-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0
 // HST-NEXT:    br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label 
%[[KCALL_CONFIGOK:.*]]
 // HST:       [[KCALL_CONFIGOK]]:
 // HST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
 // HST-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
-// HST-NEXT:    call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 
noundef signext 1, ptr noundef [[TMP0]], ptr noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
+// HST-NEXT:    call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 
noundef signext 1, ptr noundef [[TMP0]], ptr noundef [[TMP1]]) #[[ATTR4]]
 // HST-NEXT:    br label %[[KCALL_END]]
 // HST:       [[KCALL_END]]:
 // HST-NEXT:    ret void
diff --git a/offload/include/Shared/APITypes.h 
b/offload/include/Shared/APITypes.h
index 948c12a27107e..47d8c49bf7ef2 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -123,14 +123,12 @@ static_assert(sizeof(KernelArgsTy) ==
                    4 * sizeof(void **) + 2 * sizeof(int64_t *)),
               "Invalid struct size");
 
-/// Flat array of kernel launch parameters and their total size.
+/// Array of pointers to kernel launch arguments and the size of that array.
 struct KernelLaunchParamsTy {
-  /// Size of the Data array.
-  size_t Size = 0;
-  /// Flat array of kernel parameters.
-  void *Data = nullptr;
-  /// Ptrs to the Data entries. Only strictly required for the host plugin.
-  void **Ptrs = nullptr;
+  /// Number of kernel arguments in \p Args.
+  uint32_t NumArgs = 0;
+  /// Array of \p NumArgs pointers, each pointing at one argument's value.
+  void **Args = nullptr;
 };
 
 /// The outcome of a kernel replay.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index bb07dcc5c91fe..50b0feaf231f2 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -4207,36 +4207,27 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy 
&GenericDevice,
 
   // Copy explicit arguments.
   size_t ExplicitEnd = 0;
-  if (KernelArgs.Flags.IsPtrArgs) {
-    if (KernelArgs.ArgPtrs) {
-      const auto &ArgMDs = KernelInfo.ArgMDs;
-
-      // ArgMDs might also contain hidden implicit arguments, so we can't check
-      // if user-provided NumArgs matches exactly.
-      if (KernelArgs.NumArgs > ArgMDs.size())
-        return Plugin::error(
-            ErrorCode::INVALID_ARGUMENT,
-            "number of arguments (%u) exceeds the number of arguments "
-            "expected by the kernel (%zu)",
-            KernelArgs.NumArgs, ArgMDs.size());
-
-      for (size_t I = 0; I < KernelArgs.NumArgs; I++) {
-        auto [Offset, Size] = ArgMDs[I];
-        std::memcpy(utils::advancePtr(AllArgs, Offset), KernelArgs.ArgPtrs[I],
-                    Size);
-      }
+  if (LaunchParams.Args) {
+    const auto &ArgMDs = KernelInfo.ArgMDs;
+    uint32_t NumArgs = LaunchParams.NumArgs;
 
-      if (KernelArgs.NumArgs) {
-        auto [Offset, Size] = ArgMDs[KernelArgs.NumArgs - 1];
-        ExplicitEnd = Offset + Size;
-      }
+    if (NumArgs > ArgMDs.size())
+      return Plugin::error(
+          ErrorCode::INVALID_ARGUMENT,
+          "number of arguments (%u) exceeds the number of arguments "
+          "expected by the kernel (%zu)",
+          NumArgs, ArgMDs.size());
+
+    for (size_t I = 0; I < NumArgs; I++) {
+      auto [Offset, Size] = ArgMDs[I];
+      std::memcpy(utils::advancePtr(AllArgs, Offset), LaunchParams.Args[I],
+                  Size);
+    }
+
+    if (NumArgs) {
+      auto [Offset, Size] = ArgMDs[NumArgs - 1];
+      ExplicitEnd = Offset + Size;
     }
-  } else {
-    // TODO: We should expose the args memory manager alloc to the common part
-    // as alternative to copying them twice.
-    if (LaunchParams.Size)
-      std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
-    ExplicitEnd = LaunchParams.Size;
   }
 
   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp 
b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 0e0e1163d6e39..afc87e99f0783 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -297,13 +297,13 @@ Error GenericKernelTy::launch(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
 
   KernelLaunchParamsTy LaunchParams;
 
-  // Kernel languages (.IsCUDA) don't use indirection, whereas dispatching with
-  // an array of kernel argument pointers (.IsPtrArgs) uses KernelArgs.ArgPtrs
-  // and KernelArgs.ArgSizes directly.
+  // Kernel languages do not use the OpenMP indirection and argument parsing.
   if (KernelArgs.Flags.IsCUDA) {
     LaunchParams =
         *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
-  } else if (!KernelArgs.Flags.IsPtrArgs) {
+  } else if (KernelArgs.Flags.IsPtrArgs) {
+    LaunchParams = KernelLaunchParamsTy{KernelArgs.NumArgs, 
KernelArgs.ArgPtrs};
+  } else {
     LaunchParams =
         prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
                     Args, Ptrs, *KernelLaunchEnvOrErr, KernelArgs.Version);
@@ -377,7 +377,7 @@ GenericKernelTy::prepareArgs(GenericDeviceTy 
&GenericDevice, void **ArgPtrs,
   for (uint32_t I = 0; I < NumArgs; ++I)
     Ptrs[I] = &Args[I];
 
-  return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
+  return KernelLaunchParamsTy{NumArgs, &Ptrs[0]};
 }
 
 uint32_t
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp 
b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index ca6c5e7d98e45..436ee6308c0db 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -292,7 +292,7 @@ Error NativeRecordReplayTy::recordDescImpl(
 
   json::Array JsonArgPtrs;
   for (uint32_t I = 0; I < KernelArgs.NumArgs; ++I)
-    JsonArgPtrs.push_back((intptr_t)(*(void **)LaunchParams.Ptrs[I]));
+    JsonArgPtrs.push_back((intptr_t)(*(void **)LaunchParams.Args[I]));
   JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
 
   json::Array JsonArgOffsets;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp 
b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 51e2bdb0c01dc..c696150fcb696 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -133,8 +133,7 @@ struct CUDAKernelTy : public GenericKernelTy {
     // Set the static block memory size required by the kernel.
     StaticBlockMemSize = SharedMemSize;
 
-    // Retrieve the size of the arguments.
-    return initArgsSize();
+    return Plugin::success();
   }
 
   /// Launch the CUDA kernel function.
@@ -164,32 +163,11 @@ struct CUDAKernelTy : public GenericKernelTy {
                               uint32_t DynBlockMemSize) const override;
 
 private:
-  /// Initialize the size of the arguments.
-  Error initArgsSize() {
-    CUresult Res;
-    size_t ArgOffset, ArgSize;
-    size_t Arg = 0;
-
-    ArgsSize = 0;
-
-    // Find the last argument to know the total size of the arguments.
-    while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
-           CUDA_SUCCESS)
-      ArgsSize = ArgOffset + ArgSize;
-
-    if (Res != CUDA_ERROR_INVALID_VALUE)
-      return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
-    return Plugin::success();
-  }
-
   /// The CUDA kernel function to execute.
   CUfunction Func;
   /// The maximum amount of dynamic shared memory per thread group. By default,
   /// this is set to 48 KB.
   mutable uint32_t MaxDynBlockMemSize = 49152;
-
-  /// The size of the kernel arguments.
-  size_t ArgsSize;
 };
 
 /// Class wrapping a CUDA stream reference. These are the objects handled by 
the
@@ -1472,28 +1450,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy 
&GenericDevice,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
 
-  void **KernelParams = nullptr;
-  if (KernelArgs.Flags.IsPtrArgs) {
-    KernelParams = KernelArgs.ArgPtrs;
-  } else {
-    // The args size passed in LaunchParams may have tail padding,
-    // which is not accepted by the CUDA driver.
-    if (ArgsSize > LaunchParams.Size)
-      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                           "mismatch in kernel arguments");
-  }
-
   CUstream Stream;
   if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  size_t ConfigArgsSize = ArgsSize;
-  // Valid only for a contiguous buffer passed through LaunchParams.
-  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
-                    CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&ConfigArgsSize),
-                    CU_LAUNCH_PARAM_END};
-
   // If we are running an RPC server we want to wake up the server thread
   // whenever there is a kernel running and let it sleep otherwise.
   if (GenericDevice.getRPCServer())
@@ -1520,8 +1480,8 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy 
&GenericDevice,
                                  DynBlockMemSize, Stream,
                                  &CoopAttr,       1};
 
-  CUresult Res = cuLaunchKernelEx(&LaunchConfig, Func, KernelParams,
-                                  KernelParams ? nullptr : Config);
+  CUresult Res = cuLaunchKernelEx(&LaunchConfig, Func, LaunchParams.Args,
+                                  /*extra=*/nullptr);
 
   // Register a callback to indicate when the kernel is complete.
   if (GenericDevice.getRPCServer())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp 
b/offload/plugins-nextgen/host/src/rtl.cpp
index 7c676bd82c801..8dd2399325162 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -99,7 +99,10 @@ struct GenELF64KernelTy : public GenericKernelTy {
                            "cooperative kernel launch not supported for host");
     // TODO: The data will need to be copied locally if we ever support
     //       asynchronous kernel launches in the host interface.
-    Func(LaunchParams.Data);
+    llvm::SmallVector<void *, 16> Buffer(LaunchParams.NumArgs);
+    for (uint32_t I = 0; I < LaunchParams.NumArgs; ++I)
+      Buffer[I] = *reinterpret_cast<void **>(LaunchParams.Args[I]);
+    Func(LaunchParams.NumArgs ? Buffer.data() : nullptr);
     return Plugin::success();
   }
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h 
b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 131f5cd8403e6..fb84c761d3ff1 10...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/205224
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [Offload] Unify the kernel argument passing (PR #205224)

Reply via email to