[clang] [AMDGPU][SPIRV] Correctly lower huge device function arguments (PR #176921)

Steffen Larsen via cfe-commits Wed, 21 Jan 2026 01:10:27 -0800

https://github.com/steffenlarsen updated 
https://github.com/llvm/llvm-project/pull/176921


>From 42c62b74dc5b988a66a08adee5e0030759a6d1b1 Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <[email protected]>
Date: Tue, 20 Jan 2026 06:37:43 -0600
Subject: [PATCH 1/4] [AMDGPU][SPIRV] Correctly lower huge device function
 arguments

In the ABIInfo implementations for both the SPIRV and AMDGPU targets,
the lowering of arguments too large to fit into registers is currently
prone to integer overflows when determining the number of needed
registers for the arguments. This causes arguments so large that they
need more registers than an `unsigned` can represent to look like they
fit into the available registers. To avoid this, the function for
determining the required number of registers is changed to return a
64-bit unsigned integer value instead.

Note that the SPIR-V target currently trips the verifier due to a check
that arguments passed by value don't exceed the representable size. This
also affects other targets, such as x86 and is outside the scope of
these changes.
See https://github.com/llvm/llvm-project/issues/118207.

Signed-off-by: Steffen Holst Larsen <[email protected]>
---
 clang/lib/CodeGen/Targets/AMDGPU.cpp           | 18 +++++++++---------
 clang/lib/CodeGen/Targets/SPIR.cpp             | 16 ++++++++--------
 .../device-function-huge-byval-arg.hip         | 17 +++++++++++++++++
 3 files changed, 34 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/device-function-huge-byval-arg.hip

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4bc9557b26b52..8c1e8c58e67ef 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -24,7 +24,7 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
 private:
   static const unsigned MaxNumRegsForArgsRet = 16;
 
-  unsigned numRegsForType(QualType Ty) const;
+  uint64_t numRegsForType(QualType Ty) const;
 
   bool isHomogeneousAggregateBaseType(QualType Ty) const override;
   bool isHomogeneousAggregateSmallEnough(const Type *Base,
@@ -78,20 +78,20 @@ bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
 }
 
 /// Estimate number of registers the type will use when passed in registers.
-unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
-  unsigned NumRegs = 0;
+uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const {
+  uint64_t NumRegs = 0;
 
   if (const VectorType *VT = Ty->getAs<VectorType>()) {
     // Compute from the number of elements. The reported size is based on the
     // in-memory size, which includes the padding 4th element for 3-vectors.
     QualType EltTy = VT->getElementType();
-    unsigned EltSize = getContext().getTypeSize(EltTy);
+    uint64_t EltSize = getContext().getTypeSize(EltTy);
 
     // 16-bit element vectors should be passed as packed.
     if (EltSize == 16)
       return (VT->getNumElements() + 1) / 2;
 
-    unsigned EltNumRegs = (EltSize + 31) / 32;
+    uint64_t EltNumRegs = (EltSize + 31) / 32;
     return EltNumRegs * VT->getNumElements();
   }
 
@@ -247,7 +247,7 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, 
bool Variadic,
       return DefaultABIInfo::classifyArgumentType(Ty);
 
     // Pack aggregates <= 8 bytes into single VGPR or pair.
-    uint64_t Size = getContext().getTypeSize(Ty);
+    unsigned Size = getContext().getTypeSize(Ty);
     if (Size <= 64) {
       unsigned NumRegs = (Size + 31) / 32;
       NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
@@ -264,7 +264,7 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, 
bool Variadic,
     }
 
     if (NumRegsLeft > 0) {
-      unsigned NumRegs = numRegsForType(Ty);
+      uint64_t NumRegs = numRegsForType(Ty);
       if (NumRegsLeft >= NumRegs) {
         NumRegsLeft -= NumRegs;
         return ABIArgInfo::getDirect();
@@ -281,8 +281,8 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, 
bool Variadic,
   // Otherwise just do the default thing.
   ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
   if (!ArgInfo.isIndirect()) {
-    unsigned NumRegs = numRegsForType(Ty);
-    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
+    uint64_t NumRegs = numRegsForType(Ty);
+    NumRegsLeft -= std::min(NumRegs, uint64_t{NumRegsLeft});
   }
 
   return ArgInfo;
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp 
b/clang/lib/CodeGen/Targets/SPIR.cpp
index ba90ab3e67053..61ea677292492 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -49,7 +49,7 @@ class AMDGCNSPIRVABIInfo : public SPIRVABIInfo {
   static constexpr unsigned MaxNumRegsForArgsRet = 16; // 16 32-bit registers
   mutable unsigned NumRegsLeft = 0;
 
-  unsigned numRegsForType(QualType Ty) const;
+  uint64_t numRegsForType(QualType Ty) const;
 
   bool isHomogeneousAggregateBaseType(QualType Ty) const override {
     return true;
@@ -234,21 +234,21 @@ RValue SPIRVABIInfo::EmitVAArg(CodeGenFunction &CGF, 
Address VAListAddr,
                           /*AllowHigherAlign=*/true, Slot);
 }
 
-unsigned AMDGCNSPIRVABIInfo::numRegsForType(QualType Ty) const {
+uint64_t AMDGCNSPIRVABIInfo::numRegsForType(QualType Ty) const {
   // This duplicates the AMDGPUABI computation.
-  unsigned NumRegs = 0;
+  uint64_t NumRegs = 0;
 
   if (const VectorType *VT = Ty->getAs<VectorType>()) {
     // Compute from the number of elements. The reported size is based on the
     // in-memory size, which includes the padding 4th element for 3-vectors.
     QualType EltTy = VT->getElementType();
-    unsigned EltSize = getContext().getTypeSize(EltTy);
+    uint64_t EltSize = getContext().getTypeSize(EltTy);
 
     // 16-bit element vectors should be passed as packed.
     if (EltSize == 16)
       return (VT->getNumElements() + 1) / 2;
 
-    unsigned EltNumRegs = (EltSize + 31) / 32;
+    uint64_t EltNumRegs = (EltSize + 31) / 32;
     return EltNumRegs * VT->getNumElements();
   }
 
@@ -355,8 +355,8 @@ ABIArgInfo 
AMDGCNSPIRVABIInfo::classifyArgumentType(QualType Ty) const {
   if (!isAggregateTypeForABI(Ty)) {
     ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
     if (!ArgInfo.isIndirect()) {
-      unsigned NumRegs = numRegsForType(Ty);
-      NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
+      uint64_t NumRegs = numRegsForType(Ty);
+      NumRegsLeft -= std::min(NumRegs, uint64_t{NumRegsLeft});
     }
 
     return ArgInfo;
@@ -401,7 +401,7 @@ ABIArgInfo 
AMDGCNSPIRVABIInfo::classifyArgumentType(QualType Ty) const {
   }
 
   if (NumRegsLeft > 0) {
-    unsigned NumRegs = numRegsForType(Ty);
+    uint64_t NumRegs = numRegsForType(Ty);
     if (NumRegsLeft >= NumRegs) {
       NumRegsLeft -= NumRegs;
       return ABIArgInfo::getDirect();
diff --git a/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip 
b/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip
new file mode 100644
index 0000000000000..adc2a0af738a9
--- /dev/null
+++ b/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -disable-llvm-verifier -o - %s | FileCheck %s 
--check-prefix=CHECK-AMDGCNSPIRV
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-AMDGCN
+
+// NOTE: The verifier is currently disabled for the spirv64 case as it 
complains
+//       about the 'byref' arguments being too large. This is currently a
+//       problem for all targets that lower large arguments to 'byref'
+//       arguments.
+
+#define __device__ __attribute__((device))
+
+typedef struct {
+  long data[6871947673600];
+} huge_struct;
+
+// CHECK-AMDGCNSPIRV: @_Z9printBits11huge_struct(ptr noundef 
byref(%struct.huge_struct)
+// CHECK-AMDGCN: @_Z9printBits11huge_struct(i16
+__device__ void printBits(huge_struct X) {}

>From 7283c6df4bfdad1e062f2efc615e92868d60321c Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <[email protected]>
Date: Tue, 20 Jan 2026 07:49:20 -0600
Subject: [PATCH 2/4] Expand tests to include large returns and call sites

Signed-off-by: Steffen Holst Larsen <[email protected]>
---
 .../device-function-huge-arg-ret.hip          | 34 +++++++++++++++++++
 .../device-function-huge-byval-arg.hip        | 17 ----------
 2 files changed, 34 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
 delete mode 100644 clang/test/CodeGenHIP/device-function-huge-byval-arg.hip

diff --git a/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip 
b/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
new file mode 100644
index 0000000000000..81cc6502cbd9e
--- /dev/null
+++ b/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -disable-llvm-verifier -o - %s | FileCheck %s 
--check-prefix=CHECK-AMDGCNSPIRV
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -disable-llvm-verifier -o - %s | FileCheck %s 
--check-prefix=CHECK-AMDGCN
+
+// NOTE: The verifier is currently disabled as it complains about the 'byref'
+//       arguments being too large. This is currently a problem for all targets
+//       that lower large arguments to 'byref' arguments.
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+typedef struct {
+  long data[6871947673600];
+} huge_struct;
+
+// CHECK-AMDGCNSPIRV: define spir_func void @_Z21deviceFuncWithHugeRetv(ptr 
dead_on_unwind noalias writable sret(%struct.huge_struct) align 8
+// CHECK-AMDGCN: define dso_local void @_Z21deviceFuncWithHugeRetv(ptr 
addrspace(5) dead_on_unwind noalias writable sret(%struct.huge_struct) align 8
+__device__ huge_struct deviceFuncWithHugeRet() { return {}; }
+
+// CHECK-AMDGCNSPIRV: define spir_func void 
@_Z21deviceFuncWithHugeArg11huge_struct(ptr noundef byref(%struct.huge_struct) 
align 8
+// CHECK-AMDGCN: define dso_local void 
@_Z21deviceFuncWithHugeArg11huge_struct(i16
+__device__ void deviceFuncWithHugeArg(huge_struct X) {}
+
+__device__ void deviceCaller() {
+  // CHECK-AMDGCNSPIRV: call spir_func addrspace(4) void 
@_Z21deviceFuncWithHugeRetv(ptr dead_on_unwind writable 
sret(%struct.huge_struct) align 8
+  // CHECK-AMDGCN: call void @_Z21deviceFuncWithHugeRetv(ptr addrspace(5) 
dead_on_unwind writable sret(%struct.huge_struct) align 8
+  huge_struct X = deviceFuncWithHugeRet();
+  // CHECK-AMDGCNSPIRV: call spir_func addrspace(4) void 
@_Z21deviceFuncWithHugeArg11huge_struct(ptr noundef byref(%struct.huge_struct) 
align 8
+  // CHECK-AMDGCN: call void @_Z21deviceFuncWithHugeArg11huge_struct(i16
+  deviceFuncWithHugeArg(X);
+}
+
+// CHECK-AMDGCNSPIRV: define spir_kernel void 
@_Z21globalFuncWithHugeArg11huge_struct(ptr addrspace(2) noundef 
byref(%struct.huge_struct) align 8
+// CHECK-AMDGCN: define dso_local amdgpu_kernel void 
@_Z21globalFuncWithHugeArg11huge_struct(ptr addrspace(4) noundef 
byref(%struct.huge_struct) align 8
+__global__ void globalFuncWithHugeArg(huge_struct X) {}
diff --git a/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip 
b/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip
deleted file mode 100644
index adc2a0af738a9..0000000000000
--- a/clang/test/CodeGenHIP/device-function-huge-byval-arg.hip
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -disable-llvm-verifier -o - %s | FileCheck %s 
--check-prefix=CHECK-AMDGCNSPIRV
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-AMDGCN
-
-// NOTE: The verifier is currently disabled for the spirv64 case as it 
complains
-//       about the 'byref' arguments being too large. This is currently a
-//       problem for all targets that lower large arguments to 'byref'
-//       arguments.
-
-#define __device__ __attribute__((device))
-
-typedef struct {
-  long data[6871947673600];
-} huge_struct;
-
-// CHECK-AMDGCNSPIRV: @_Z9printBits11huge_struct(ptr noundef 
byref(%struct.huge_struct)
-// CHECK-AMDGCN: @_Z9printBits11huge_struct(i16
-__device__ void printBits(huge_struct X) {}

>From be1e5c8dae30c65a549c451ebd3c2f875d1ae879 Mon Sep 17 00:00:00 2001
From: Steffen Larsen <[email protected]>
Date: Wed, 21 Jan 2026 09:35:13 +0100
Subject: [PATCH 3/4] Revert getTypeSize mistake

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 8c1e8c58e67ef..7ba32b92cfd55 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -247,7 +247,7 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, 
bool Variadic,
       return DefaultABIInfo::classifyArgumentType(Ty);
 
     // Pack aggregates <= 8 bytes into single VGPR or pair.
-    unsigned Size = getContext().getTypeSize(Ty);
+    uint64_t Size = getContext().getTypeSize(Ty);
     if (Size <= 64) {
       unsigned NumRegs = (Size + 31) / 32;
       NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

>From 9cf7728ab609e778fef03c4fa2c289cecaf6c04b Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <[email protected]>
Date: Wed, 21 Jan 2026 03:10:05 -0600
Subject: [PATCH 4/4] Fix test expectation for byref call in AMDGCN case

Signed-off-by: Steffen Holst Larsen <[email protected]>
---
 clang/test/CodeGenHIP/device-function-huge-arg-ret.hip | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip 
b/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
index 81cc6502cbd9e..4dc42493a291c 100644
--- a/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
+++ b/clang/test/CodeGenHIP/device-function-huge-arg-ret.hip
@@ -17,7 +17,7 @@ typedef struct {
 __device__ huge_struct deviceFuncWithHugeRet() { return {}; }
 
 // CHECK-AMDGCNSPIRV: define spir_func void 
@_Z21deviceFuncWithHugeArg11huge_struct(ptr noundef byref(%struct.huge_struct) 
align 8
-// CHECK-AMDGCN: define dso_local void 
@_Z21deviceFuncWithHugeArg11huge_struct(i16
+// CHECK-AMDGCN: define dso_local void 
@_Z21deviceFuncWithHugeArg11huge_struct(ptr addrspace(5) noundef 
byref(%struct.huge_struct) align 8
 __device__ void deviceFuncWithHugeArg(huge_struct X) {}
 
 __device__ void deviceCaller() {
@@ -25,7 +25,7 @@ __device__ void deviceCaller() {
   // CHECK-AMDGCN: call void @_Z21deviceFuncWithHugeRetv(ptr addrspace(5) 
dead_on_unwind writable sret(%struct.huge_struct) align 8
   huge_struct X = deviceFuncWithHugeRet();
   // CHECK-AMDGCNSPIRV: call spir_func addrspace(4) void 
@_Z21deviceFuncWithHugeArg11huge_struct(ptr noundef byref(%struct.huge_struct) 
align 8
-  // CHECK-AMDGCN: call void @_Z21deviceFuncWithHugeArg11huge_struct(i16
+  // CHECK-AMDGCN: call void @_Z21deviceFuncWithHugeArg11huge_struct(ptr 
addrspace(5) noundef byref(%struct.huge_struct) align 8
   deviceFuncWithHugeArg(X);
 }
 

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AMDGPU][SPIRV] Correctly lower huge device function arguments (PR #176921)

Reply via email to