[clang] [llvm] [mlir] [AMDGPU] Verify AMDGPU required workgroup size matches flat workgroup size (PR #200989)

via cfe-commits Mon, 01 Jun 2026 19:52:20 -0700

llvmorg-github-actions[bot] wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-analysis

Author: Shilei Tian (shiltian)

<details>
<summary>Changes</summary>



---

Patch is 58.50 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/200989.diff


25 Files Affected:

- (modified) clang/include/clang/Basic/DiagnosticSemaKinds.td (+3) 
- (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (+5-5) 
- (modified) clang/lib/Sema/SemaDeclAttr.cpp (+35) 
- (modified) clang/test/CodeGenOpenCL/amdgpu-attrs.cl (+3-4) 
- (modified) clang/test/SemaOpenCL/amdgpu-attrs.cl (+3) 
- (modified) llvm/docs/AMDGPUUsage.rst (+3) 
- (modified) llvm/lib/IR/Verifier.cpp (+78) 
- (modified) 
llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll (+7-7) 
- (modified) 
llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll (+1-1) 
- (modified) 
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll 
(+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll (+2-1) 
- (modified) llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll (+11-7) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/zext-lid.ll (+4-2) 
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-negative-cases.ll 
(+8-8) 
- (modified) 
llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave32-optimizations.ll (+12-12) 
- (modified) 
llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave64-optimizations.ll (+7-7) 
- (added) llvm/test/Verifier/AMDGPU/reqd-work-group-size.ll (+78) 
- (modified) mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp 
(+35) 
- (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+11) 


``````````diff
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 077aace321264..75d84b4129e8e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3697,6 +3697,9 @@ def err_swift_abi_parameter_wrong_type : Error<
 def err_attribute_argument_invalid : Error<
   "%0 attribute argument is invalid: %select{max must be 0 since min is 0|"
   "min must not be greater than max}1">;
+def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
+  "'amdgpu_flat_work_group_size' attribute must match "
+  "'reqd_work_group_size' product">;
 def err_attribute_argument_is_zero : Error<
   "%0 attribute must be greater than 0">;
 def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index a3a596bb9d822..7b37f3f7f9b6e 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -731,15 +731,15 @@ void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
   auto Eval = [&](Expr *E) {
     return E->EvaluateKnownConstInt(getContext()).getExtValue();
   };
-  if (FlatWGS) {
+  if (ReqdWGS) {
+    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
+                Eval(ReqdWGS->getZDim());
+  } else if (FlatWGS) {
     Min = Eval(FlatWGS->getMin());
     Max = Eval(FlatWGS->getMax());
   }
-  if (ReqdWGS && Min == 0 && Max == 0)
-    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
-                Eval(ReqdWGS->getZDim());
 
-  if (Min != 0) {
+  if (Min != 0 || ReqdWGS) {
     assert(Min <= Max && "Min must be less than or equal Max");
 
     if (MinThreadsVal)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ae04d3855f01c..c1b95a9dac24d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8439,6 +8439,40 @@ static bool isKernelDecl(Decl *D) {
          D->hasAttr<CUDAGlobalAttr>();
 }
 
+static void checkAMDGPUReqdWorkGroupSize(Sema &S, Decl *D) {
+  if (!S.Context.getTargetInfo().getTriple().isAMDGPU())
+    return;
+
+  const auto *Flat = D->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
+  const auto *Reqd = D->getAttr<ReqdWorkGroupSizeAttr>();
+  if (!Flat || !Reqd)
+    return;
+
+  auto Eval = [&](Expr *E) -> std::optional<uint64_t> {
+    if (E->isValueDependent())
+      return std::nullopt;
+    std::optional<llvm::APSInt> V = E->getIntegerConstantExpr(S.Context);
+    if (!V)
+      return std::nullopt;
+    return V->getZExtValue();
+  };
+
+  std::optional<uint64_t> X = Eval(Reqd->getXDim());
+  std::optional<uint64_t> Y = Eval(Reqd->getYDim());
+  std::optional<uint64_t> Z = Eval(Reqd->getZDim());
+  std::optional<uint64_t> Min = Eval(Flat->getMin());
+  std::optional<uint64_t> Max = Eval(Flat->getMax());
+  if (!X || !Y || !Z || !Min || !Max)
+    return;
+
+  uint64_t Product = *X * *Y * *Z;
+  if (*Min != Product || *Max != Product) {
+    S.Diag(Flat->getLocation(),
+           diag::err_attribute_amdgpu_flat_work_group_size_mismatch);
+    D->setInvalidDecl();
+  }
+}
+
 void Sema::ProcessDeclAttributeList(
     Scope *S, Decl *D, const ParsedAttributesView &AttrList,
     const ProcessDeclAttributeOptions &Options) {
@@ -8502,6 +8536,7 @@ void Sema::ProcessDeclAttributeList(
       D->setInvalidDecl();
     }
   }
+  checkAMDGPUReqdWorkGroupSize(*this, D);
 
   // CUDA/HIP: restrict explicit CUDA target attributes on deduction guides.
   //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl 
b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
index 606566e070c41..8251658edb631 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -134,9 +134,9 @@ __attribute__((reqd_work_group_size(32, 2, 1))) // 
expected-no-diagnostics
 kernel void reqd_work_group_size_32_2_1() {
 // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1() 
[[FLAT_WORK_GROUP_SIZE_64_64:#[0-9]+]]
 }
-__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(16, 
128))) // expected-no-diagnostics
-kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
-// CHECK: define{{.*}} amdgpu_kernel void 
@reqd_work_group_size_32_2_1_flat_work_group_size_16_128() 
[[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
+__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(64, 
64))) // expected-no-diagnostics
+kernel void reqd_work_group_size_32_2_1_flat_work_group_size_64_64() {
+// CHECK: define{{.*}} amdgpu_kernel void 
@reqd_work_group_size_32_2_1_flat_work_group_size_64_64() 
[[FLAT_WORK_GROUP_SIZE_64_64]]
 }
 
 __attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
@@ -203,7 +203,6 @@ kernel void default_kernel() {
 
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}} 
"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_64_64]] = {{.*}} 
"amdgpu-flat-work-group-size"="64,64"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_16_128]] = {{.*}} 
"amdgpu-flat-work-group-size"="16,128"
 
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}} 
"amdgpu-flat-work-group-size"="1,256"  "amdgpu-waves-per-eu"="2"
 
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl 
b/clang/test/SemaOpenCL/amdgpu-attrs.cl
index a6177798f4819..9321c2f83e01c 100644
--- a/clang/test/SemaOpenCL/amdgpu-attrs.cl
+++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -51,6 +51,9 @@ __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void 
kernel_waves_per_eu_0_4()
 __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void 
kernel_flat_work_group_size_64_32() {} // expected-error 
{{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be 
greater than max}}
 __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void 
kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' 
attribute argument is invalid: min must not be greater than max}}
 
+__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(16, 
128))) kernel void 
kernel_reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {} // 
expected-error {{'amdgpu_flat_work_group_size' attribute must match 
'reqd_work_group_size' product}}
+__attribute__((amdgpu_flat_work_group_size(16, 128), reqd_work_group_size(32, 
2, 1))) kernel void 
kernel_flat_work_group_size_16_128_reqd_work_group_size_32_2_1() {} // 
expected-error {{'amdgpu_flat_work_group_size' attribute must match 
'reqd_work_group_size' product}}
+
 __attribute__((amdgpu_waves_per_eu(2, 4, 8))) kernel void 
kernel_waves_per_eu_2_4_8() {} // expected-error {{'amdgpu_waves_per_eu' 
attribute takes no more than 2 arguments}}
 
 __attribute__((amdgpu_flat_work_group_size(0, 0))) kernel void 
kernel_flat_work_group_size_0_0() {}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 61841716792ef..03ee0a0709443 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2249,6 +2249,9 @@ The AMDGPU backend supports the following LLVM IR 
attributes.
      "amdgpu-flat-work-group-size"="min,max"          Specify the minimum and 
maximum flat work group sizes that
                                                       will be specified when 
the kernel is dispatched. Generated
                                                       by the 
``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
+                                                      If the 
``reqd_work_group_size`` metadata is present, the product
+                                                      of its three workgroup 
size dimensions must match both ``min``
+                                                      and ``max``.
                                                       The IR implied default 
value is 1,1024. Clang may emit this attribute
                                                       with more restrictive 
bounds depending on language defaults.
                                                       If the actual block or 
workgroup size exceeds the limit at any point during
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 96f228f88a46c..072008a01c955 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -123,10 +123,12 @@
 #include "llvm/Support/ModRef.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Coroutines/CoroInstr.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <optional>
 #include <queue>
@@ -645,6 +647,7 @@ class Verifier : public InstVisitor<Verifier>, 
VerifierSupport {
   void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
                            const Value *V, bool IsIntrinsic, bool IsInlineAsm);
   void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
+  void verifyAMDGPUReqdWorkGroupSize(const Function &F);
   void verifyUnknownProfileMetadata(MDNode *MD);
   void visitConstantExprsRecursively(const Constant *EntryC);
   void visitConstantExpr(const ConstantExpr *CE);
@@ -2814,6 +2817,80 @@ void Verifier::verifyFunctionMetadata(
   }
 }
 
+void Verifier::verifyAMDGPUReqdWorkGroupSize(const Function &F) {
+  if (!TT.isAMDGPU())
+    return;
+
+  MDNode *ReqdWorkGroupSize = F.getMetadata("reqd_work_group_size");
+  if (!ReqdWorkGroupSize)
+    return;
+
+  Check(ReqdWorkGroupSize->getNumOperands() == 3,
+        "reqd_work_group_size must have exactly three operands", &F,
+        ReqdWorkGroupSize);
+  if (ReqdWorkGroupSize->getNumOperands() != 3)
+    return;
+
+  uint64_t Product = 1;
+  for (unsigned I = 0; I != 3; ++I) {
+    ConstantInt *C =
+        mdconst::dyn_extract<ConstantInt>(ReqdWorkGroupSize->getOperand(I));
+    Check(C, "reqd_work_group_size operands must be integer constants", &F,
+          ReqdWorkGroupSize);
+    if (!C)
+      return;
+
+    const APInt &Value = C->getValue();
+    Check(Value.getActiveBits() <= 64,
+          "reqd_work_group_size operands must fit in 64 bits", &F,
+          ReqdWorkGroupSize);
+    if (Value.getActiveBits() > 64)
+      return;
+
+    uint64_t Dim = Value.getZExtValue();
+    Check(Dim == 0 || Product <= std::numeric_limits<uint64_t>::max() / Dim,
+          "reqd_work_group_size product must fit in 64 bits", &F,
+          ReqdWorkGroupSize);
+    if (Dim != 0 && Product > std::numeric_limits<uint64_t>::max() / Dim)
+      return;
+    Product *= Dim;
+  }
+
+  Attribute FlatWorkGroupSize = 
F.getFnAttribute("amdgpu-flat-work-group-size");
+  Check(FlatWorkGroupSize.isValid(),
+        "reqd_work_group_size requires amdgpu-flat-work-group-size", &F,
+        ReqdWorkGroupSize);
+  if (!FlatWorkGroupSize.isValid())
+    return;
+
+  Check(FlatWorkGroupSize.isStringAttribute(),
+        "amdgpu-flat-work-group-size must be a string attribute", &F);
+  if (!FlatWorkGroupSize.isStringAttribute())
+    return;
+
+  auto ParseUnsigned = [](StringRef S, uint64_t &Value) {
+    S = S.trim();
+    return !S.empty() && !S.starts_with("-") && !S.getAsInteger(0, Value);
+  };
+
+  StringRef AttrValue = FlatWorkGroupSize.getValueAsString();
+  std::pair<StringRef, StringRef> Values = AttrValue.split(',');
+  uint64_t Min = 0;
+  uint64_t Max = 0;
+  bool Parsed = !Values.second.contains(',') &&
+                ParseUnsigned(Values.first, Min) &&
+                ParseUnsigned(Values.second, Max);
+  Check(Parsed,
+        "amdgpu-flat-work-group-size must be a pair of unsigned integers", &F);
+  if (!Parsed)
+    return;
+
+  Check(Min == Product && Max == Product,
+        "amdgpu-flat-work-group-size must equal the product of "
+        "reqd_work_group_size operands",
+        &F, ReqdWorkGroupSize);
+}
+
 void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
   if (EntryC->getNumOperands() == 0)
     return;
@@ -3284,6 +3361,7 @@ void Verifier::visitFunction(const Function &F) {
   F.getAllMetadata(MDs);
   assert(F.hasMetadata() != MDs.empty() && "Bit out-of-sync");
   verifyFunctionMetadata(MDs);
+  verifyAMDGPUReqdWorkGroupSize(F);
 
   // Check validity of the personality function
   if (F.hasPersonalityFn()) {
diff --git 
a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll 
b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
index fbd855b0d691b..96d14dcde2ebc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -67,7 +67,7 @@ define amdgpu_kernel void @workitem_id_z_singlethreaded() #2 {
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_x_singlethreaded_md() "amdgpu-no-wwm" 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_x_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" "amdgpu-no-wwm" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -75,7 +75,7 @@ define amdgpu_kernel void @workitem_id_x_singlethreaded_md() 
"amdgpu-no-wwm" !re
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -83,7 +83,7 @@ define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
!reqd_work_group_si
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -91,7 +91,7 @@ define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
!reqd_work_group_si
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimx':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimx() 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimx() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -99,7 +99,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimx() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimy':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimy() 
!reqd_work_group_size !2 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimy() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !2 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -107,7 +107,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimy() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimz':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() 
!reqd_work_group_size !3 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !3 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -115,7 +115,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) 
!reqd_work_group_size !4 {
+define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !4 {
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %id.z, ptr %o
   ret void
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
index d69515591ecee..c66ef8f64df25 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
@@ -1234,7 +1234,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workitem.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.z() #1
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" }
 attributes #1 = { nounwind readnone speculatable willreturn }
 
 !llvm.module.flags = !{!6}
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
index 915139b590fd4..8fcb3b6892a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
@@ -2,27 +2,27 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=legalizer -o - 
%s | FileCheck -check-prefix=GCN %s
 
 --- |
-  define amdgpu_kernel void @test_workitem_id_x_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_x_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_y_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_y_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_z_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_z_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_x_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_x_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_y_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_y_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_z_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_z_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index ef933fc7fdd4a..f84cdc32f1e86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -141,7 +141,7 @@ define void @test_workitem_id_z_func(ptr addrspace(1) %out) 
#1 {
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
@@ -163,7 +163,7 @@ define amdgpu_kernel void 
@test_reqd_workgroup_size_x_only(ptr %out) !reqd_work_
 ; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !1 {
   %id.x = call i32 @llvm.amdgcn...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/200989
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [AMDGPU] Verify AMDGPU required workgroup size matches flat workgroup size (PR #200989)

Reply via email to