[clang] [llvm] [mlir] [AMDGPU] Verify AMDGPU required workgroup size matches flat workgroup size (PR #200989)

Shilei Tian via cfe-commits Wed, 03 Jun 2026 05:59:40 -0700

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/200989


>From 895d82ba3d9fd9f40ca2214add7a475f314aabe0 Mon Sep 17 00:00:00 2001
From: Shilei Tian <[email protected]>
Date: Mon, 1 Jun 2026 22:40:30 -0400
Subject: [PATCH] [AMDGPU] Verify AMDGPU required workgroup size matches flat
 workgroup size

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  3 +
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 10 +--
 clang/lib/Sema/SemaDeclAttr.cpp               | 35 ++++++++
 clang/test/CodeGenOpenCL/amdgpu-attrs.cl      |  7 +-
 clang/test/SemaOpenCL/amdgpu-attrs.cl         |  3 +
 llvm/docs/AMDGPUUsage.rst                     |  3 +
 llvm/lib/IR/Verifier.cpp                      | 82 +++++++++++++++++
 .../AMDGPU/workitem-intrinsics.ll             | 14 +--
 .../irtranslator-call-implicit-args.ll        |  2 +-
 .../legalize-amdgcn.workitem.id.mir           | 12 +--
 .../GlobalISel/llvm.amdgcn.workitem.id.ll     |  6 +-
 .../CodeGen/AMDGPU/call-reqd-group-size.ll    | 12 +--
 .../AMDGPU/hsa-metadata-from-llvm-ir-full.ll  |  3 +-
 .../CodeGen/AMDGPU/implicit-arg-v5-opt.ll     | 18 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll |  6 +-
 .../CodeGen/AMDGPU/reqd-work-group-size.ll    | 10 +--
 .../AMDGPU/simplifydemandedbits-recursion.ll  |  2 +-
 .../CodeGen/AMDGPU/uniform-load-from-tid.ll   | 16 ++--
 llvm/test/CodeGen/AMDGPU/zext-lid.ll          |  6 +-
 .../AMDGPU/mbcnt-negative-cases.ll            | 16 ++--
 .../AMDGPU/mbcnt-wave32-optimizations.ll      | 24 ++---
 .../AMDGPU/mbcnt-wave64-optimizations.ll      | 14 +--
 .../Verifier/AMDGPU/reqd-work-group-size.ll   | 89 +++++++++++++++++++
 .../ROCDL/ROCDLToLLVMIRTranslation.cpp        | 35 ++++++++
 mlir/test/Target/LLVMIR/rocdl-invalid.mlir    | 19 ++++
 mlir/test/Target/LLVMIR/rocdl.mlir            | 11 +++
 26 files changed, 372 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/reqd-work-group-size.ll
 create mode 100644 mlir/test/Target/LLVMIR/rocdl-invalid.mlir

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 077aace321264..75d84b4129e8e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3697,6 +3697,9 @@ def err_swift_abi_parameter_wrong_type : Error<
 def err_attribute_argument_invalid : Error<
   "%0 attribute argument is invalid: %select{max must be 0 since min is 0|"
   "min must not be greater than max}1">;
+def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
+  "'amdgpu_flat_work_group_size' attribute must match "
+  "'reqd_work_group_size' product">;
 def err_attribute_argument_is_zero : Error<
   "%0 attribute must be greater than 0">;
 def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index a3a596bb9d822..7b37f3f7f9b6e 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -731,15 +731,15 @@ void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
   auto Eval = [&](Expr *E) {
     return E->EvaluateKnownConstInt(getContext()).getExtValue();
   };
-  if (FlatWGS) {
+  if (ReqdWGS) {
+    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
+                Eval(ReqdWGS->getZDim());
+  } else if (FlatWGS) {
     Min = Eval(FlatWGS->getMin());
     Max = Eval(FlatWGS->getMax());
   }
-  if (ReqdWGS && Min == 0 && Max == 0)
-    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
-                Eval(ReqdWGS->getZDim());
 
-  if (Min != 0) {
+  if (Min != 0 || ReqdWGS) {
     assert(Min <= Max && "Min must be less than or equal Max");
 
     if (MinThreadsVal)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ae04d3855f01c..c1b95a9dac24d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8439,6 +8439,40 @@ static bool isKernelDecl(Decl *D) {
          D->hasAttr<CUDAGlobalAttr>();
 }
 
+static void checkAMDGPUReqdWorkGroupSize(Sema &S, Decl *D) {
+  if (!S.Context.getTargetInfo().getTriple().isAMDGPU())
+    return;
+
+  const auto *Flat = D->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
+  const auto *Reqd = D->getAttr<ReqdWorkGroupSizeAttr>();
+  if (!Flat || !Reqd)
+    return;
+
+  auto Eval = [&](Expr *E) -> std::optional<uint64_t> {
+    if (E->isValueDependent())
+      return std::nullopt;
+    std::optional<llvm::APSInt> V = E->getIntegerConstantExpr(S.Context);
+    if (!V)
+      return std::nullopt;
+    return V->getZExtValue();
+  };
+
+  std::optional<uint64_t> X = Eval(Reqd->getXDim());
+  std::optional<uint64_t> Y = Eval(Reqd->getYDim());
+  std::optional<uint64_t> Z = Eval(Reqd->getZDim());
+  std::optional<uint64_t> Min = Eval(Flat->getMin());
+  std::optional<uint64_t> Max = Eval(Flat->getMax());
+  if (!X || !Y || !Z || !Min || !Max)
+    return;
+
+  uint64_t Product = *X * *Y * *Z;
+  if (*Min != Product || *Max != Product) {
+    S.Diag(Flat->getLocation(),
+           diag::err_attribute_amdgpu_flat_work_group_size_mismatch);
+    D->setInvalidDecl();
+  }
+}
+
 void Sema::ProcessDeclAttributeList(
     Scope *S, Decl *D, const ParsedAttributesView &AttrList,
     const ProcessDeclAttributeOptions &Options) {
@@ -8502,6 +8536,7 @@ void Sema::ProcessDeclAttributeList(
       D->setInvalidDecl();
     }
   }
+  checkAMDGPUReqdWorkGroupSize(*this, D);
 
   // CUDA/HIP: restrict explicit CUDA target attributes on deduction guides.
   //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl 
b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
index 606566e070c41..8251658edb631 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -134,9 +134,9 @@ __attribute__((reqd_work_group_size(32, 2, 1))) // 
expected-no-diagnostics
 kernel void reqd_work_group_size_32_2_1() {
 // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1() 
[[FLAT_WORK_GROUP_SIZE_64_64:#[0-9]+]]
 }
-__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(16, 
128))) // expected-no-diagnostics
-kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
-// CHECK: define{{.*}} amdgpu_kernel void 
@reqd_work_group_size_32_2_1_flat_work_group_size_16_128() 
[[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
+__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(64, 
64))) // expected-no-diagnostics
+kernel void reqd_work_group_size_32_2_1_flat_work_group_size_64_64() {
+// CHECK: define{{.*}} amdgpu_kernel void 
@reqd_work_group_size_32_2_1_flat_work_group_size_64_64() 
[[FLAT_WORK_GROUP_SIZE_64_64]]
 }
 
 __attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
@@ -203,7 +203,6 @@ kernel void default_kernel() {
 
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}} 
"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_64_64]] = {{.*}} 
"amdgpu-flat-work-group-size"="64,64"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_16_128]] = {{.*}} 
"amdgpu-flat-work-group-size"="16,128"
 
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}} 
"amdgpu-flat-work-group-size"="1,256"  "amdgpu-waves-per-eu"="2"
 
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl 
b/clang/test/SemaOpenCL/amdgpu-attrs.cl
index a6177798f4819..9321c2f83e01c 100644
--- a/clang/test/SemaOpenCL/amdgpu-attrs.cl
+++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -51,6 +51,9 @@ __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void 
kernel_waves_per_eu_0_4()
 __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void 
kernel_flat_work_group_size_64_32() {} // expected-error 
{{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be 
greater than max}}
 __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void 
kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' 
attribute argument is invalid: min must not be greater than max}}
 
+__attribute__((reqd_work_group_size(32, 2, 1), amdgpu_flat_work_group_size(16, 
128))) kernel void 
kernel_reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {} // 
expected-error {{'amdgpu_flat_work_group_size' attribute must match 
'reqd_work_group_size' product}}
+__attribute__((amdgpu_flat_work_group_size(16, 128), reqd_work_group_size(32, 
2, 1))) kernel void 
kernel_flat_work_group_size_16_128_reqd_work_group_size_32_2_1() {} // 
expected-error {{'amdgpu_flat_work_group_size' attribute must match 
'reqd_work_group_size' product}}
+
 __attribute__((amdgpu_waves_per_eu(2, 4, 8))) kernel void 
kernel_waves_per_eu_2_4_8() {} // expected-error {{'amdgpu_waves_per_eu' 
attribute takes no more than 2 arguments}}
 
 __attribute__((amdgpu_flat_work_group_size(0, 0))) kernel void 
kernel_flat_work_group_size_0_0() {}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 61841716792ef..03ee0a0709443 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2249,6 +2249,9 @@ The AMDGPU backend supports the following LLVM IR 
attributes.
      "amdgpu-flat-work-group-size"="min,max"          Specify the minimum and 
maximum flat work group sizes that
                                                       will be specified when 
the kernel is dispatched. Generated
                                                       by the 
``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
+                                                      If the 
``reqd_work_group_size`` metadata is present, the product
+                                                      of its three workgroup 
size dimensions must match both ``min``
+                                                      and ``max``.
                                                       The IR implied default 
value is 1,1024. Clang may emit this attribute
                                                       with more restrictive 
bounds depending on language defaults.
                                                       If the actual block or 
workgroup size exceeds the limit at any point during
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c9639d1420bfc..53e5e858219c5 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -123,10 +123,12 @@
 #include "llvm/Support/ModRef.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Coroutines/CoroInstr.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <optional>
 #include <queue>
@@ -645,6 +647,7 @@ class Verifier : public InstVisitor<Verifier>, 
VerifierSupport {
   void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
                            const Value *V, bool IsIntrinsic, bool IsInlineAsm);
   void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
+  void verifyAMDGPUReqdWorkGroupSize(const Function &F);
   void verifyUnknownProfileMetadata(MDNode *MD);
   void visitConstantExprsRecursively(const Constant *EntryC);
   void visitConstantExpr(const ConstantExpr *CE);
@@ -2810,10 +2813,88 @@ void Verifier::verifyFunctionMetadata(
             "expected a constant integer operand for !kcfi_type", MD);
       Check(cast<ConstantInt>(C)->getBitWidth() == 32,
             "expected a 32-bit integer constant operand for !kcfi_type", MD);
+    } else if (Pair.first == Context.getMDKindID("reqd_work_group_size")) {
+      MDNode *MD = Pair.second;
+      Check(MD->getNumOperands() == 3,
+            "reqd_work_group_size must have exactly three operands", MD);
+      if (MD->getNumOperands() != 3)
+        continue;
+
+      uint64_t Product = 1;
+      for (unsigned I = 0; I != 3; ++I) {
+        ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD->getOperand(I));
+        Check(C, "reqd_work_group_size operands must be integer constants", 
MD);
+        if (!C)
+          break;
+
+        const APInt &Value = C->getValue();
+        Check(Value.getActiveBits() <= 64,
+              "reqd_work_group_size operands must fit in 64 bits", MD);
+        if (Value.getActiveBits() > 64)
+          break;
+
+        uint64_t Dim = Value.getZExtValue();
+        Check(Dim == 0 || Product <= std::numeric_limits<uint64_t>::max() / 
Dim,
+              "reqd_work_group_size product must fit in 64 bits", MD);
+        if (Dim != 0 && Product > std::numeric_limits<uint64_t>::max() / Dim)
+          break;
+        Product *= Dim;
+      }
     }
   }
 }
 
+void Verifier::verifyAMDGPUReqdWorkGroupSize(const Function &F) {
+  if (!TT.isAMDGPU())
+    return;
+
+  MDNode *ReqdWorkGroupSize = F.getMetadata("reqd_work_group_size");
+  if (!ReqdWorkGroupSize || ReqdWorkGroupSize->getNumOperands() != 3)
+    return;
+
+  uint64_t Product = 1;
+  for (const MDOperand &Op : ReqdWorkGroupSize->operands()) {
+    ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Op);
+    if (!C || C->getValue().getActiveBits() > 64)
+      return;
+    uint64_t Dim = C->getZExtValue();
+    if (Dim != 0 && Product > std::numeric_limits<uint64_t>::max() / Dim)
+      return;
+    Product *= Dim;
+  }
+
+  Attribute FlatWorkGroupSize = 
F.getFnAttribute("amdgpu-flat-work-group-size");
+  if (!FlatWorkGroupSize.isValid()) {
+    CheckFailed("reqd_work_group_size requires amdgpu-flat-work-group-size", 
&F,
+                ReqdWorkGroupSize);
+    return;
+  }
+
+  if (!FlatWorkGroupSize.isStringAttribute()) {
+    CheckFailed("amdgpu-flat-work-group-size must be a string attribute", &F);
+    return;
+  }
+
+  StringRef AttrValue = FlatWorkGroupSize.getValueAsString();
+  std::pair<StringRef, StringRef> Values = AttrValue.split(',');
+  uint64_t Min = 0;
+  uint64_t Max = 0;
+  bool Parsed = !Values.second.contains(',') &&
+                llvm::to_integer(Values.first.trim(), Min) &&
+                llvm::to_integer(Values.second.trim(), Max);
+  if (!Parsed) {
+    CheckFailed("amdgpu-flat-work-group-size must be a pair of unsigned "
+                "integers",
+                &F);
+    return;
+  }
+
+  Check(Min == Product && Max == Product,
+        "amdgpu-flat-work-group-size must equal the product of "
+        "reqd_work_group_size operands",
+        &F, ReqdWorkGroupSize);
+}
+
 void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
   if (EntryC->getNumOperands() == 0)
     return;
@@ -3284,6 +3365,7 @@ void Verifier::visitFunction(const Function &F) {
   F.getAllMetadata(MDs);
   assert(F.hasMetadata() != MDs.empty() && "Bit out-of-sync");
   verifyFunctionMetadata(MDs);
+  verifyAMDGPUReqdWorkGroupSize(F);
 
   // Check validity of the personality function
   if (F.hasPersonalityFn()) {
diff --git 
a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll 
b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
index fbd855b0d691b..96d14dcde2ebc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -67,7 +67,7 @@ define amdgpu_kernel void @workitem_id_z_singlethreaded() #2 {
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_x_singlethreaded_md() "amdgpu-no-wwm" 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_x_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" "amdgpu-no-wwm" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -75,7 +75,7 @@ define amdgpu_kernel void @workitem_id_x_singlethreaded_md() 
"amdgpu-no-wwm" !re
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -83,7 +83,7 @@ define amdgpu_kernel void @workitem_id_y_singlethreaded_md() 
!reqd_work_group_si
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_singlethreaded_md':
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
"amdgpu-flat-work-group-size"="1,1" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -91,7 +91,7 @@ define amdgpu_kernel void @workitem_id_z_singlethreaded_md() 
!reqd_work_group_si
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimx':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimx() 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimx() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -99,7 +99,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimx() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimy':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimy() 
!reqd_work_group_size !2 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimy() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !2 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -107,7 +107,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimy() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 
'workitem_id_x_not_singlethreaded_dimz':
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() 
!reqd_work_group_size !3 {
+define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() 
"amdgpu-flat-work-group-size"="2,2" !reqd_work_group_size !3 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, ptr addrspace(1) undef
   ret void
@@ -115,7 +115,7 @@ define amdgpu_kernel void 
@workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
 
 ; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
 ; CHECK-NOT: DIVERGENT
-define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) 
!reqd_work_group_size !4 {
+define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !4 {
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %id.z, ptr %o
   ret void
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
index d69515591ecee..c66ef8f64df25 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
@@ -1234,7 +1234,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workitem.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.z() #1
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" }
 attributes #1 = { nounwind readnone speculatable willreturn }
 
 !llvm.module.flags = !{!6}
diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
index 915139b590fd4..8fcb3b6892a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
@@ -2,27 +2,27 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=legalizer -o - 
%s | FileCheck -check-prefix=GCN %s
 
 --- |
-  define amdgpu_kernel void @test_workitem_id_x_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_x_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_y_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_y_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_z_unpacked() 
!reqd_work_group_size !0 {
+  define amdgpu_kernel void @test_workitem_id_z_unpacked() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_x_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_x_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_y_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_y_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
-  define amdgpu_kernel void @test_workitem_id_z_packed() !reqd_work_group_size 
!0 {
+  define amdgpu_kernel void @test_workitem_id_z_packed() 
"amdgpu-flat-work-group-size"="8192,8192" !reqd_work_group_size !0 {
     ret void
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index ef933fc7fdd4a..f84cdc32f1e86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -141,7 +141,7 @@ define void @test_workitem_id_z_func(ptr addrspace(1) %out) 
#1 {
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
@@ -163,7 +163,7 @@ define amdgpu_kernel void 
@test_reqd_workgroup_size_x_only(ptr %out) !reqd_work_
 ; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
@@ -184,7 +184,7 @@ define amdgpu_kernel void 
@test_reqd_workgroup_size_y_only(ptr %out) !reqd_work_
 
 ; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
 ; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @test_reqd_workgroup_size_z_only(ptr %out) 
!reqd_work_group_size !2 {
+define amdgpu_kernel void @test_reqd_workgroup_size_z_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !2 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll 
b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index 024d3e0a684a9..b8ae6fdb37eb6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -7,7 +7,7 @@
 
 declare hidden void @callee() #0
 
-define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="4096,4096" !reqd_work_group_size !0 {
 ; CHECK-LABEL: known_x_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -42,7 +42,7 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) 
!reqd_work_group_siz
 }
 ; CHECK: .amdhsa_system_vgpr_workitem_id 2
 
-define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="4096,4096" !reqd_work_group_size !1 {
 ; CHECK-LABEL: known_y_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -75,7 +75,7 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) 
!reqd_work_group_siz
 }
 ; CHECK: .amdhsa_system_vgpr_workitem_id 2
 
-define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) 
!reqd_work_group_size !2 {
+define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="4096,4096" !reqd_work_group_size !2 {
 ; CHECK-LABEL: known_z_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -108,7 +108,7 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) 
!reqd_work_group_siz
 }
 ; CHECK: .amdhsa_system_vgpr_workitem_id 1
 
-define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) 
!reqd_work_group_size !3 {
+define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !3 {
 ; CHECK-LABEL: known_yz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -141,7 +141,7 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) 
%out) !reqd_work_group_si
 }
 ; CHECK: .amdhsa_system_vgpr_workitem_id 0
 
-define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) 
!reqd_work_group_size !4 {
+define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !4 {
 ; CHECK-LABEL: known_xz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -175,7 +175,7 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) 
%out) !reqd_work_group_si
 ; CHECK: .amdhsa_system_vgpr_workitem_id 1
 
 
-define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) 
!reqd_work_group_size !5 {
+define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) 
"amdgpu-flat-work-group-size"="1,1" !reqd_work_group_size !5 {
 ; CHECK-LABEL: known_xyz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
index 834b3812673bc..506f663e0cda2 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @test_vec_type_hint_unknown(i32 
%a) #0
 ; CHECK-NEXT:       - 4
 ; CHECK:          .symbol:         test_reqd_wgs_vec_type_hint.kd
 ; CHECK:          .vec_type_hint:  int
-define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) #0
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) #4
     !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
     !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5
     !reqd_work_group_size !6 {
@@ -1769,6 +1769,7 @@ attributes #0 = { optnone noinline 
"amdgpu-no-completion-action" "amdgpu-no-defa
 attributes #1 = { optnone noinline "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" 
"runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
 attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
 attributes #3 = { optnone noinline "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" }
+attributes #4 = { optnone noinline "amdgpu-flat-work-group-size"="8,8" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-implicitarg-num-bytes"="56" }
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll 
b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index d5dfb505cd7ca..c2d1d085fd53d 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -388,7 +388,7 @@ define i32 @get_grid_dims_i32() #2 {
   ret i32 %grid.dims
 }
 
-define i16 @get_grid_dims_reqd_work_group_size_1d() #2 !reqd_work_group_size 
!2 {
+define i16 @get_grid_dims_reqd_work_group_size_1d() #3 !reqd_work_group_size 
!2 {
 ; GCN-LABEL: @get_grid_dims_reqd_work_group_size_1d(
 ; GCN-NEXT:    ret i16 1
 ;
@@ -398,7 +398,7 @@ define i16 @get_grid_dims_reqd_work_group_size_1d() #2 
!reqd_work_group_size !2
   ret i16 %grid.dims
 }
 
-define i16 @get_grid_dims_reqd_work_group_size_2d() #2 !reqd_work_group_size 
!3 {
+define i16 @get_grid_dims_reqd_work_group_size_2d() #4 !reqd_work_group_size 
!3 {
 ; GCN-LABEL: @get_grid_dims_reqd_work_group_size_2d(
 ; GCN-NEXT:    ret i16 2
 ;
@@ -408,7 +408,7 @@ define i16 @get_grid_dims_reqd_work_group_size_2d() #2 
!reqd_work_group_size !3
   ret i16 %grid.dims
 }
 
-define i16 @get_grid_dims_reqd_work_group_size_2d_weird() #2 
!reqd_work_group_size !5 {
+define i16 @get_grid_dims_reqd_work_group_size_2d_weird() #5 
!reqd_work_group_size !5 {
 ; GCN-LABEL: @get_grid_dims_reqd_work_group_size_2d_weird(
 ; GCN-NEXT:    ret i16 2
 ;
@@ -418,7 +418,7 @@ define i16 @get_grid_dims_reqd_work_group_size_2d_weird() 
#2 !reqd_work_group_si
   ret i16 %grid.dims
 }
 
-define i16 @get_grid_dims_reqd_work_group_size_3d() #2 !reqd_work_group_size 
!0 {
+define i16 @get_grid_dims_reqd_work_group_size_3d() #6 !reqd_work_group_size 
!0 {
 ; GCN-LABEL: @get_grid_dims_reqd_work_group_size_3d(
 ; GCN-NEXT:    ret i16 3
 ;
@@ -428,7 +428,7 @@ define i16 @get_grid_dims_reqd_work_group_size_3d() #2 
!reqd_work_group_size !0
   ret i16 %grid.dims
 }
 
-define i16 @get_grid_dims_reqd_work_group_size_3d_weird() #2 
!reqd_work_group_size !4 {
+define i16 @get_grid_dims_reqd_work_group_size_3d_weird() #3 
!reqd_work_group_size !4 {
 ; GCN-LABEL: @get_grid_dims_reqd_work_group_size_3d_weird(
 ; GCN-NEXT:    ret i16 3
 ;
@@ -438,7 +438,7 @@ define i16 @get_grid_dims_reqd_work_group_size_3d_weird() 
#2 !reqd_work_group_si
   ret i16 %grid.dims
 }
 
-define i1 @get_grid_dims_i1_reqd_work_group_size() #2 !reqd_work_group_size !3 
{
+define i1 @get_grid_dims_i1_reqd_work_group_size() #4 !reqd_work_group_size !3 
{
 ; GCN-LABEL: @get_grid_dims_i1_reqd_work_group_size(
 ; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; GCN-NEXT:    [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[IMPLICITARG_PTR]], i64 64
@@ -471,9 +471,13 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #1
 
 !llvm.module.flags = !{!1}
 
-attributes #0 = { nounwind "uniform-work-group-size" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="256,256" 
"uniform-work-group-size" }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { nounwind }
+attributes #3 = { nounwind "amdgpu-flat-work-group-size"="64,64" }
+attributes #4 = { nounwind "amdgpu-flat-work-group-size"="128,128" }
+attributes #5 = { nounwind "amdgpu-flat-work-group-size"="32,32" }
+attributes #6 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
 !0 = !{i32 8, i32 16, i32 2}
 !1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index eaee8ec73fe41..4bcea613bef53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -75,7 +75,7 @@ define amdgpu_kernel void @test_workitem_id_z(ptr 
addrspace(1) %out) #1 {
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
!reqd_work_group_size !0 {
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !0 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
@@ -97,7 +97,7 @@ define amdgpu_kernel void 
@test_reqd_workgroup_size_x_only(ptr %out) !reqd_work_
 ; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 
 ; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
!reqd_work_group_size !1 {
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
@@ -118,7 +118,7 @@ define amdgpu_kernel void 
@test_reqd_workgroup_size_y_only(ptr %out) !reqd_work_
 
 ; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
 ; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @test_reqd_workgroup_size_z_only(ptr %out) 
!reqd_work_group_size !2 {
+define amdgpu_kernel void @test_reqd_workgroup_size_z_only(ptr %out) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !2 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll 
b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index b79d6bcc39b3c..575a1a699d65c 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S 
-passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | 
FileCheck -enable-var-scope %s
 
-; CHECK-LABEL: @invalid_reqd_work_group_size(
+; CHECK-LABEL: @no_reqd_work_group_size(
 ; CHECK: load i16,
-define amdgpu_kernel void @invalid_reqd_work_group_size(ptr addrspace(1) %out) 
#0 !reqd_work_group_size !1 {
+define amdgpu_kernel void @no_reqd_work_group_size(ptr addrspace(1) %out) #0 {
   %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) 
%dispatch.ptr, i64 4
   %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
@@ -246,7 +246,7 @@ define i32 @func_group_size_x(ptr addrspace(1) %out) #0 
!reqd_work_group_size !0
 
 ; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
 ; CHECK: %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, 
%bb1 ], [ 1, %bb ]
-define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size 
!0 {
+define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #4 !reqd_work_group_size 
!0 {
 bb:
   %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
   switch i32 %arg, label %bb25 [
@@ -444,13 +444,13 @@ declare i32 @llvm.umin.i32(i32, i32) #1
 declare i32 @llvm.smin.i32(i32, i32) #1
 declare i32 @llvm.umax.i32(i32, i32) #1
 
-attributes #0 = { nounwind "uniform-work-group-size" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="256,256" 
"uniform-work-group-size" }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { nounwind "uniform-work-group-size" }
 attributes #3 = { nounwind }
+attributes #4 = { nounwind readnone speculatable 
"amdgpu-flat-work-group-size"="256,256" }
 
 !0 = !{i32 8, i32 16, i32 2}
-!1 = !{i32 8, i32 16}
 !2 = !{i64 8, i64 16, i64 2}
 !3 = !{i16 8, i16 16, i16 2}
 
diff --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll 
b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
index d041699bcc9e6..77b87371d586b 100644
--- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fmuladd.f32(float, float, float) #0
 
-define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly 
%arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) 
noalias nocapture %arg2, float %arg3, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4, 
i1 %c5) local_unnamed_addr !reqd_work_group_size !0 {
+define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly 
%arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) 
noalias nocapture %arg2, float %arg3, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4, 
i1 %c5) local_unnamed_addr "amdgpu-flat-work-group-size"="128,128" 
!reqd_work_group_size !0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll 
b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index f54e0019514f7..bdc875dd0608a 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -11,7 +11,7 @@
 ; OPT-LABEL: @lshr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
 ; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4, !amdgpu.uniform
-define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -31,7 +31,7 @@ entry:
 ; OPT-LABEL: @ashr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
 ; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4, !amdgpu.uniform
-define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = ashr i32 %lid, 5
@@ -51,7 +51,7 @@ entry:
 ; OPT-LABEL: @and_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
 ; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4, !amdgpu.uniform
-define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %lid, -32
@@ -85,7 +85,7 @@ entry:
 
 ; OPT-LABEL: @lshr_threadid_2d
 ; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
-define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="130,130" 
!reqd_work_group_size !1 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -105,7 +105,7 @@ entry:
 ; OPT-LABEL: @lshr_threadid_3d
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
 ; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4, !amdgpu.uniform
-define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="128,128" 
!reqd_work_group_size !2 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -124,7 +124,7 @@ entry:
 
 ; OPT-LABEL: @high_id_uniform
 ; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%zid.zext, !amdgpu.uniform
-define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="128,128" 
!reqd_work_group_size !2 {
 entry:
   %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
   %zid.zext = zext nneg i32 %zid to i64
@@ -143,7 +143,7 @@ entry:
 ; OPT-LABEL: @lshr_threadid_1d_uneven
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
 ; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4, !amdgpu.uniform
-define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 
%in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !3 {
+define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 
%in, ptr addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="65,65" 
!reqd_work_group_size !3 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -160,7 +160,7 @@ entry:
 
 ; OPT-LABEL: @and_threadid_2d
 ; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 
%div4{{$}}
-define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr 
addrspace(1) align 4 %out) "amdgpu-flat-work-group-size"="130,130" 
!reqd_work_group_size !1 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %lid, -32
diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll 
b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
index 395862c58f4eb..f6922d4e5ed7e 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
@@ -21,7 +21,7 @@ bb:
 
 ; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
 ; O2-NOT: and_b32
-define amdgpu_kernel void @zext_grp_size_32x4x1(ptr addrspace(1) nocapture 
%arg) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @zext_grp_size_32x4x1(ptr addrspace(1) nocapture 
%arg) #5 !reqd_work_group_size !0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 31
@@ -42,7 +42,7 @@ bb:
 
 ; When EarlyCSE is not run this call produces a range max with 0 active bits,
 ; which is a special case as an AssertZext from width 0 is invalid.
-define amdgpu_kernel void @zext_grp_size_1x1x1(ptr addrspace(1) nocapture 
%arg) #0 !reqd_work_group_size !1 {
+define amdgpu_kernel void @zext_grp_size_1x1x1(ptr addrspace(1) nocapture 
%arg) #6 !reqd_work_group_size !1 {
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 1
   store i32 %tmp1, ptr addrspace(1) %arg, align 4
@@ -102,6 +102,8 @@ attributes #1 = { nounwind 
"amdgpu-flat-work-group-size"="512,512" }
 attributes #2 = { nounwind readnone speculatable }
 attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind }
+attributes #5 = { nounwind "amdgpu-flat-work-group-size"="128,128" }
+attributes #6 = { nounwind "amdgpu-flat-work-group-size"="1,1" }
 
 !0 = !{i32 32, i32 4, i32 1}
 !1 = !{i32 1, i32 1, i32 1}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-negative-cases.ll 
b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-negative-cases.ll
index 36c7cfd388bf8..6e3f012ea2c66 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-negative-cases.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-negative-cases.ll
@@ -38,9 +38,9 @@ entry:
 ; =============================================================================
 
 ; Test with partial mask
-define i32 @test_mbcnt_partial_mask() !reqd_work_group_size !0 {
+define i32 @test_mbcnt_partial_mask() "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_partial_mask(
-; CHECK-SAME: ) !reqd_work_group_size [[META0:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 65535, i32 0)
 ; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[A]])
@@ -53,9 +53,9 @@ entry:
 }
 
 ; Test with non-zero base
-define i32 @test_mbcnt_non_zero_base() !reqd_work_group_size !0 {
+define i32 @test_mbcnt_non_zero_base() "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_non_zero_base(
-; CHECK-SAME: ) !reqd_work_group_size [[META0]] {
+; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 5)
 ; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[A]])
@@ -85,9 +85,9 @@ entry:
 }
 
 ; Test with work group size = not a wave multiple (48)
-define i32 @test_mbcnt_hi_copy_non_wave_multiple(i32 %val) 
!reqd_work_group_size !1 {
+define i32 @test_mbcnt_hi_copy_non_wave_multiple(i32 %val) 
"amdgpu-flat-work-group-size"="48,48" !reqd_work_group_size !1 {
 ; CHECK-LABEL: define i32 @test_mbcnt_hi_copy_non_wave_multiple(
-; CHECK-SAME: i32 [[VAL:%.*]]) !reqd_work_group_size [[META1:![0-9]+]] {
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR1:[0-9]+]] !reqd_work_group_size 
[[META1:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 
[[VAL]])
 ; CHECK-NEXT:    ret i32 [[RESULT]]
@@ -98,9 +98,9 @@ entry:
 }
 
 ; Test with zero mask
-define i32 @test_mbcnt_hi_copy_zero_mask(i32 %val) !reqd_work_group_size !0 {
+define i32 @test_mbcnt_hi_copy_zero_mask(i32 %val) 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_hi_copy_zero_mask(
-; CHECK-SAME: i32 [[VAL:%.*]]) !reqd_work_group_size [[META0]] {
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] !reqd_work_group_size [[META0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 
[[VAL]])
 ; CHECK-NEXT:    ret i32 [[RESULT]]
diff --git 
a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave32-optimizations.ll 
b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave32-optimizations.ll
index e0afa3e876ec2..fdb8ca7e5c8eb 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave32-optimizations.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave32-optimizations.ll
@@ -9,7 +9,7 @@
 ; =============================================================================
 
 ; Test with work group size = wave size (32)
-define i32 @test_mbcnt_lo_simple_wave32() !reqd_work_group_size !0 {
+define i32 @test_mbcnt_lo_simple_wave32() 
"amdgpu-flat-work-group-size"="32,32" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_lo_simple_wave32(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
@@ -26,9 +26,9 @@ entry:
 ; =============================================================================
 
 ; Test with work group size = 2 * wave size (64)
-define i32 @test_mbcnt_lo_bitmask_64() !reqd_work_group_size !1 {
+define i32 @test_mbcnt_lo_bitmask_64() "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !1 {
 ; CHECK-LABEL: define i32 @test_mbcnt_lo_bitmask_64(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META1:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !reqd_work_group_size [[META1:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 64) i32 
@llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], 31
@@ -40,9 +40,9 @@ entry:
 }
 
 ; Test with work group size = 3 * wave size (96)
-define i32 @test_mbcnt_lo_bitmask_96() !reqd_work_group_size !2 {
+define i32 @test_mbcnt_lo_bitmask_96() "amdgpu-flat-work-group-size"="96,96" 
!reqd_work_group_size !2 {
 ; CHECK-LABEL: define i32 @test_mbcnt_lo_bitmask_96(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !reqd_work_group_size [[META2:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 96) i32 
@llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], 31
@@ -54,9 +54,9 @@ entry:
 }
 
 ; Test with work group size = 0.75 * wave size (48)
-define i32 @test_mbcnt_lo_bitmask_48() !reqd_work_group_size !3 {
+define i32 @test_mbcnt_lo_bitmask_48() "amdgpu-flat-work-group-size"="48,48" 
!reqd_work_group_size !3 {
 ; CHECK-LABEL: define i32 @test_mbcnt_lo_bitmask_48(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META3:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !reqd_work_group_size [[META3:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 48) i32 
@llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[A:%.*]] = and i32 [[TMP0]], 31
@@ -72,7 +72,7 @@ entry:
 ; =============================================================================
 
 ; Test with mask = wave32 range
-define i32 @test_mbcnt_hi_copy_basic(i32 %val) !reqd_work_group_size !0 {
+define i32 @test_mbcnt_hi_copy_basic(i32 %val) 
"amdgpu-flat-work-group-size"="32,32" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_hi_copy_basic(
 ; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] !reqd_work_group_size [[META0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
@@ -84,7 +84,7 @@ entry:
 }
 
 ; Test with partial mask
-define i32 @test_mbcnt_hi_copy_partial_mask(i32 %val) !reqd_work_group_size !0 
{
+define i32 @test_mbcnt_hi_copy_partial_mask(i32 %val) 
"amdgpu-flat-work-group-size"="32,32" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_hi_copy_partial_mask(
 ; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] !reqd_work_group_size [[META0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
@@ -100,7 +100,7 @@ entry:
 ; =============================================================================
 
 ; Test with work group size = wave size (32)
-define i32 @test_mbcnt_full_pattern_wave32() !reqd_work_group_size !0 {
+define i32 @test_mbcnt_full_pattern_wave32() 
"amdgpu-flat-work-group-size"="32,32" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave32(
 ; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
@@ -114,9 +114,9 @@ entry:
 }
 
 ; Test with work group size = 0.75 * wave size (48)
-define i32 @test_mbcnt_full_pattern_wave32_partial() !reqd_work_group_size !3 {
+define i32 @test_mbcnt_full_pattern_wave32_partial() 
"amdgpu-flat-work-group-size"="48,48" !reqd_work_group_size !3 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave32_partial(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META3]] {
+; CHECK-SAME: ) #[[ATTR3]] !reqd_work_group_size [[META3]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 48) i32 
@llvm.amdgcn.workitem.id.x()
diff --git 
a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave64-optimizations.ll 
b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave64-optimizations.ll
index 3da06b6692ecc..7d2255c3ab488 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave64-optimizations.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-wave64-optimizations.ll
@@ -9,7 +9,7 @@
 ; =============================================================================
 
 ; Test with work group size = wave size (64)
-define i32 @test_mbcnt_full_pattern_wave64_basic() !reqd_work_group_size !0 {
+define i32 @test_mbcnt_full_pattern_wave64_basic() 
"amdgpu-flat-work-group-size"="64,64" !reqd_work_group_size !0 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave64_basic(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
@@ -24,9 +24,9 @@ entry:
 }
 
 ; Test with work group size = 2 * wave size (128)
-define i32 @test_mbcnt_full_pattern_wave64_128() !reqd_work_group_size !1 {
+define i32 @test_mbcnt_full_pattern_wave64_128() 
"amdgpu-flat-work-group-size"="128,128" !reqd_work_group_size !1 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave64_128(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META1:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !reqd_work_group_size [[META1:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 128) i32 
@llvm.amdgcn.workitem.id.x()
@@ -40,9 +40,9 @@ entry:
 }
 
 ; Test with multidimensional work group where X dimension matches pattern
-define i32 @test_mbcnt_full_pattern_wave64_multidim() !reqd_work_group_size !2 
{
+define i32 @test_mbcnt_full_pattern_wave64_multidim() 
"amdgpu-flat-work-group-size"="128,128" !reqd_work_group_size !2 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave64_multidim(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META2:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1]] !reqd_work_group_size [[META2:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 64) i32 
@llvm.amdgcn.workitem.id.x()
@@ -55,9 +55,9 @@ entry:
 }
 
 ; Test with work group size = 0.75 * wave size (48)
-define i32 @test_mbcnt_full_pattern_wave64_partial() !reqd_work_group_size !3 {
+define i32 @test_mbcnt_full_pattern_wave64_partial() 
"amdgpu-flat-work-group-size"="48,48" !reqd_work_group_size !3 {
 ; CHECK-LABEL: define i32 @test_mbcnt_full_pattern_wave64_partial(
-; CHECK-SAME: ) #[[ATTR0]] !reqd_work_group_size [[META3:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !reqd_work_group_size [[META3:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call range(i32 0, 48) i32 
@llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Verifier/AMDGPU/reqd-work-group-size.ll 
b/llvm/test/Verifier/AMDGPU/reqd-work-group-size.ll
new file mode 100644
index 0000000000000..e46fdcdbf3f62
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/reqd-work-group-size.ll
@@ -0,0 +1,89 @@
+; RUN: split-file %s %t
+
+; RUN: not llvm-as %t/missing-flat.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=MISSING
+; RUN: not llvm-as %t/broad-flat.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=BROAD
+; RUN: not llvm-as %t/wrong-flat.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=WRONG
+; RUN: not llvm-as %t/malformed-flat.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=MALFORMED-FLAT
+; RUN: not llvm-as %t/malformed-reqd.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=MALFORMED-REQD
+; RUN: not llvm-as %t/non-integer-reqd.ll --disable-output 2>&1 | FileCheck %s 
--check-prefix=NON-INTEGER-REQD
+; RUN: llvm-as %t/valid.ll --disable-output 2>&1 | count 0
+; RUN: llvm-as %t/spir.ll --disable-output 2>&1 | count 0
+
+; MISSING: reqd_work_group_size requires amdgpu-flat-work-group-size
+; BROAD: amdgpu-flat-work-group-size must equal the product of 
reqd_work_group_size operands
+; WRONG: amdgpu-flat-work-group-size must equal the product of 
reqd_work_group_size operands
+; MALFORMED-FLAT: amdgpu-flat-work-group-size must be a pair of unsigned 
integers
+; MALFORMED-REQD: reqd_work_group_size must have exactly three operands
+; NON-INTEGER-REQD: reqd_work_group_size operands must be integer constants
+
+;--- missing-flat.ll
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @missing_flat() !reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
+
+;--- broad-flat.ll
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @broad_flat() "amdgpu-flat-work-group-size"="16,128" 
!reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
+
+;--- wrong-flat.ll
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @wrong_flat() 
"amdgpu-flat-work-group-size"="128,128" !reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
+
+;--- malformed-flat.ll
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @malformed_flat() "amdgpu-flat-work-group-size"="64" 
!reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
+
+;--- malformed-reqd.ll
+target triple = "spirv64-unknown-unknown"
+
+define spir_kernel void @malformed_reqd() !reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2}
+
+;--- non-integer-reqd.ll
+target triple = "spirv64-unknown-unknown"
+
+define spir_kernel void @non_integer_reqd() !reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{!"32", i32 2, i32 1}
+
+;--- valid.ll
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @valid() "amdgpu-flat-work-group-size"="64,64" 
!reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
+
+;--- spir.ll
+target triple = "spirv64-unknown-unknown"
+
+define spir_kernel void @spir() !reqd_work_group_size !0 {
+  ret void
+}
+
+!0 = !{i32 32, i32 2, i32 1}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index e1168e75f10da..42393218f8705 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstdint>
 
 using namespace mlir;
 using namespace mlir::LLVM;
@@ -174,14 +175,48 @@ class ROCDLDialectLLVMIRTranslationInterface
       if (!value)
         return op->emitOpError(Twine(attribute.getName()) +
                                " must be a dense i32 array attribute");
+      if (value.asArrayRef().size() != 3)
+        return op->emitOpError(Twine(attribute.getName()) +
+                               " must contain exactly three values");
+
+      uint64_t FlatWorkGroupSize = 1;
       SmallVector<llvm::Metadata *, 3> metadata;
       llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
       for (int32_t i : value.asArrayRef()) {
+        FlatWorkGroupSize *= static_cast<uint32_t>(i);
         llvm::Constant *constant = llvm::ConstantInt::get(i32, i);
         metadata.push_back(llvm::ConstantAsMetadata::get(constant));
       }
       llvm::Function *llvmFunc =
           moduleTranslation.lookupFunction(func.getName());
+      llvm::SmallString<16> expectedFlatWorkGroupSize;
+      llvm::raw_svector_ostream attrValueStream(expectedFlatWorkGroupSize);
+      attrValueStream << FlatWorkGroupSize << "," << FlatWorkGroupSize;
+
+      StringRef flatAttrName =
+          dialect->getFlatWorkGroupSizeAttrHelper().getName();
+      if (auto flatAttr =
+              dyn_cast_if_present<StringAttr>(op->getAttr(flatAttrName))) {
+        if (flatAttr.getValue() != expectedFlatWorkGroupSize)
+          return op->emitOpError(Twine(flatAttrName) +
+                                 " must match rocdl.reqd_work_group_size");
+      }
+
+      StringRef maxFlatAttrName =
+          dialect->getMaxFlatWorkGroupSizeAttrHelper().getName();
+      if (auto maxFlatAttr =
+              dyn_cast_if_present<IntegerAttr>(op->getAttr(maxFlatAttrName))) {
+        llvm::SmallString<16> expectedMaxFlatWorkGroupSize;
+        llvm::raw_svector_ostream maxAttrValueStream(
+            expectedMaxFlatWorkGroupSize);
+        maxAttrValueStream << "1," << maxFlatAttr.getInt();
+        if (expectedMaxFlatWorkGroupSize != expectedFlatWorkGroupSize)
+          return op->emitOpError(Twine(maxFlatAttrName) +
+                                 " must match rocdl.reqd_work_group_size");
+      }
+
+      llvmFunc->addFnAttr("amdgpu-flat-work-group-size",
+                          expectedFlatWorkGroupSize);
       llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
       llvmFunc->setMetadata("reqd_work_group_size", node);
     }
diff --git a/mlir/test/Target/LLVMIR/rocdl-invalid.mlir 
b/mlir/test/Target/LLVMIR/rocdl-invalid.mlir
new file mode 100644
index 0000000000000..3a3cff5f6226a
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/rocdl-invalid.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// expected-error @below {{rocdl.flat_work_group_size must match 
rocdl.reqd_work_group_size}}
+llvm.func @reqd_work_group_size_flat_work_group_size_mismatch()
+    attributes {rocdl.kernel,
+      rocdl.flat_work_group_size = "16,128",
+      rocdl.reqd_work_group_size = array<i32: 32, 2, 1>} {
+  llvm.return
+}
+
+// -----
+
+// expected-error @below {{rocdl.max_flat_work_group_size must match 
rocdl.reqd_work_group_size}}
+llvm.func @reqd_work_group_size_max_flat_work_group_size_mismatch()
+    attributes {rocdl.kernel,
+      rocdl.max_flat_work_group_size = 128 : index,
+      rocdl.reqd_work_group_size = array<i32: 32, 2, 1>} {
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir 
b/mlir/test/Target/LLVMIR/rocdl.mlir
index 0a3127b868a3d..6b1c03345e61b 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -128,6 +128,15 @@ llvm.func @known_block_sizes()
   llvm.return
 }
 
+llvm.func @known_block_sizes_from_reqd()
+    attributes {rocdl.kernel,
+      rocdl.reqd_work_group_size = array<i32: 8, 4, 2>} {
+  // CHECK-LABEL: amdgpu_kernel void @known_block_sizes_from_reqd()
+  // CHECK: #[[$REQD_BLOCK_SIZE_ATTRS:[0-9]+]]
+  // CHECK: !reqd_work_group_size ![[$REQD_BLOCK_SIZE:[0-9]+]]
+  llvm.return
+}
+
 llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, 
rocdl.uniform_work_group_size = false} {
   // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
   // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
@@ -2235,7 +2244,9 @@ llvm.func @rocdl_dot_fp8_family(%i32: i32, %f32: f32) -> 
f32 {
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { 
"amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { 
"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { 
"amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: attributes #[[$REQD_BLOCK_SIZE_ATTRS]] = { 
"amdgpu-flat-work-group-size"="64,64" "uniform-work-group-size" }
 // CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { 
"amdgpu-flat-work-group-size"="1,256" }
 // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}
+// CHECK-DAG: ![[$REQD_BLOCK_SIZE]] = !{i32 8, i32 4, i32 2}
 // CHECK-DAG: attributes #[[$KERNEL_WAVES_PER_EU_ATTR]] = { 
"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" 
"uniform-work-group-size" }
 // CHECK-DAG: attributes #[[$KERNEL_UNSAFE_FP_ATOMICS_ATTR]] = { 
"amdgpu-flat-work-group-size"="1,256" "amdgpu-unsafe-fp-atomics"="true" 
"uniform-work-group-size" }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [AMDGPU] Verify AMDGPU required workgroup size matches flat workgroup size (PR #200989)

Reply via email to