Author: Matt Arsenault
Date: 2026-03-09T11:47:49+01:00
New Revision: 47e31088f3ac829060b653afef2272d0b220f3bd

URL: 
https://github.com/llvm/llvm-project/commit/47e31088f3ac829060b653afef2272d0b220f3bd
DIFF: 
https://github.com/llvm/llvm-project/commit/47e31088f3ac829060b653afef2272d0b220f3bd.diff

LOG: clang/AMDGPU: Fix workgroup size builtins for nonuniform work group sizes 
(#185098)

These were assuming uniform work group sizes. Emit the v4 and v5
sequences to take the remainder group for the nonuniform case.

Currently the device libs uses this builtin on the legacy ABI path with
the same sequence to calculate the remainder, and fully implements the
v5 path. If you perform a franken-build of the library with the updated
builtin, the result is worse. The duplicate sequence does not fully fold out.
However, it does not appear to be wrong. The relevant conformance tests still
pass.

Added: 
    clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size.cl

Modified: 
    clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
    clang/test/CodeGen/amdgpu-abi-version.c
    clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
    clang/test/CodeGenOpenCL/builtins-amdgcn.cl
    clang/test/Headers/gpuintrin.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 72d5cb8040119..4258bfeea1c35 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -77,6 +77,122 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
   return Call;
 }
 
+static llvm::Intrinsic::ID getAMDGPUWorkGroupID(CodeGenFunction &CGF,
+                                                unsigned Index) {
+  switch (Index) {
+  case 0:
+    return llvm::Intrinsic::amdgcn_workgroup_id_x;
+  case 1:
+    return llvm::Intrinsic::amdgcn_workgroup_id_y;
+  case 2:
+    return llvm::Intrinsic::amdgcn_workgroup_id_z;
+  default:
+    llvm_unreachable("unhandled index");
+  }
+}
+
+static void setNoundefInvariantLoad(llvm::LoadInst *Ld) {
+  Ld->setMetadata(llvm::LLVMContext::MD_noundef,
+                  llvm::MDNode::get(Ld->getContext(), {}));
+  Ld->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ld->getContext(), {}));
+}
+
+static void addMaxWorkGroupSizeRangeMetadata(CodeGenFunction &CGF,
+                                             llvm::LoadInst *GroupSize) {
+  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
+  llvm::MDNode *RNode = MDHelper.createRange(
+      APInt(16, 1), APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 
1));
+  GroupSize->setMetadata(llvm::LLVMContext::MD_range, RNode);
+  setNoundefInvariantLoad(GroupSize);
+}
+
+static Value *emitAMDGPUWorkGroupSizeV5(CodeGenFunction &CGF, unsigned Index) {
+  llvm::Value *ImplicitArgPtr = EmitAMDGPUImplicitArgPtr(CGF);
+
+  // offsetof(amdhsa_implicit_kernarg_v5, block_count[Index])
+  unsigned BlockCountOffset = 0 + Index * 4;
+  // offsetof(amdhsa_implicit_kernarg_v5, group_size[Index])
+  unsigned GroupSizeOffset = 12 + Index * 2;
+  // offsetof(amdhsa_implicit_kernarg_v5, remainder[Index])
+  unsigned RemainderOffset = 18 + Index * 2;
+
+  if (CGF.CGM.getLangOpts().OffloadUniformBlock) {
+    // Indexing the implicit kernarg segment.
+    llvm::Value *GroupSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
+        CGF.Int8Ty, ImplicitArgPtr, GroupSizeOffset);
+    llvm::LoadInst *GroupSize = CGF.Builder.CreateLoad(
+        Address(GroupSizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+
+    addMaxWorkGroupSizeRangeMetadata(CGF, GroupSize);
+
+    return CGF.Builder.CreateZExt(GroupSize, CGF.Int32Ty);
+  }
+
+  llvm::Value *BlockCountGEP = CGF.Builder.CreateConstGEP1_64(
+      CGF.Int8Ty, ImplicitArgPtr, BlockCountOffset);
+  llvm::LoadInst *BlockCount = CGF.Builder.CreateLoad(
+      Address(BlockCountGEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
+  setNoundefInvariantLoad(BlockCount);
+
+  llvm::Value *WorkgroupID =
+      CGF.Builder.CreateIntrinsic(getAMDGPUWorkGroupID(CGF, Index), {});
+  llvm::Value *IsFull = CGF.Builder.CreateICmpULT(WorkgroupID, BlockCount);
+
+  llvm::Value *StructOffset = CGF.Builder.CreateSelect(
+      IsFull, ConstantInt::get(CGF.Int32Ty, GroupSizeOffset),
+      ConstantInt::get(CGF.Int32Ty, RemainderOffset));
+
+  llvm::Value *SizeGEP =
+      CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, ImplicitArgPtr, StructOffset);
+  llvm::LoadInst *Size = CGF.Builder.CreateLoad(
+      Address(SizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  addMaxWorkGroupSizeRangeMetadata(CGF, Size);
+  setNoundefInvariantLoad(Size);
+
+  return CGF.Builder.CreateZExt(Size, CGF.Int32Ty);
+}
+
+static Value *emitAMDGPUWorkGroupSizeV4(CodeGenFunction &CGF, unsigned Index) {
+  llvm::Value *DispatchPtr = EmitAMDGPUDispatchPtr(CGF);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  llvm::Value *GroupSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
+      CGF.Int8Ty, DispatchPtr, 4 + Index * 2);
+  llvm::LoadInst *GroupSizeLD = CGF.Builder.CreateLoad(
+      Address(GroupSizeGEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+
+  addMaxWorkGroupSizeRangeMetadata(CGF, GroupSizeLD);
+
+  llvm::Value *GroupSize = CGF.Builder.CreateZExt(GroupSizeLD, CGF.Int32Ty);
+
+  if (CGF.CGM.getLangOpts().OffloadUniformBlock)
+    return GroupSize;
+
+  llvm::Value *WorkgroupID =
+      CGF.Builder.CreateIntrinsic(getAMDGPUWorkGroupID(CGF, Index), {});
+
+  llvm::Value *GridSizeGEP = CGF.Builder.CreateConstInBoundsGEP1_64(
+      CGF.Int8Ty, DispatchPtr, 12 + Index * 4);
+  llvm::LoadInst *GridSize = CGF.Builder.CreateLoad(
+      Address(GridSizeGEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
+
+  llvm::MDBuilder MDB(CGF.getLLVMContext());
+
+  // Known non-zero.
+  GridSize->setMetadata(llvm::LLVMContext::MD_range,
+                        MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  GridSize->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                        llvm::MDNode::get(CGF.getLLVMContext(), {}));
+
+  llvm::Value *Mul = CGF.Builder.CreateMul(WorkgroupID, GroupSize);
+  llvm::Value *Remainder = CGF.Builder.CreateSub(GridSize, Mul);
+
+  llvm::Value *IsPartial = CGF.Builder.CreateICmpULT(Remainder, GroupSize);
+
+  return CGF.Builder.CreateSelect(IsPartial, Remainder, GroupSize);
+}
+
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 /// Emit code based on Code Object ABI version.
 /// COV_4    : Emit code to use dispatch ptr
@@ -85,11 +201,9 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
 ///            and use its value for COV_4 or COV_5+ approach. It is used for
 ///            compiling device libraries in an ABI-agnostic way.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  llvm::LoadInst *LD;
-
   auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
 
-  // Do not emit __oclc_ABI_version references with non-empty environment.
+  // Do not emit __oclc_ABI_version references with non-empty environment.
   if (Cov == CodeObjectVersionKind::COV_None &&
       CGF.getTarget().getTriple().hasEnvironment())
     Cov = CodeObjectVersionKind::COV_6;
@@ -114,41 +228,14 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, 
unsigned Index) {
         ABIVersion,
         llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
 
-    // Indexing the implicit kernarg segment.
-    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
-        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
-
-    // Indexing the HSA kernel_dispatch_packet struct.
-    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
-        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
-
-    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
-    LD = CGF.Builder.CreateLoad(
-        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
-  } else {
-    Value *GEP = nullptr;
-    if (Cov >= CodeObjectVersionKind::COV_5) {
-      // Indexing the implicit kernarg segment.
-      GEP = CGF.Builder.CreateConstGEP1_32(
-          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
-    } else {
-      // Indexing the HSA kernel_dispatch_packet struct.
-      GEP = CGF.Builder.CreateConstGEP1_32(
-          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
-    }
-    LD = CGF.Builder.CreateLoad(
-        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+    llvm::Value *V5Impl = emitAMDGPUWorkGroupSizeV5(CGF, Index);
+    llvm::Value *V4Impl = emitAMDGPUWorkGroupSizeV4(CGF, Index);
+    return CGF.Builder.CreateSelect(IsCOV5, V5Impl, V4Impl);
   }
 
-  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
-  llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
-      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
-  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
-  LD->setMetadata(llvm::LLVMContext::MD_noundef,
-                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
-  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
-                  llvm::MDNode::get(CGF.getLLVMContext(), {}));
-  return LD;
+  return Cov >= CodeObjectVersionKind::COV_5
+             ? emitAMDGPUWorkGroupSizeV5(CGF, Index)
+             : emitAMDGPUWorkGroupSizeV4(CGF, Index);
 }
 
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.

diff  --git a/clang/test/CodeGen/amdgpu-abi-version.c 
b/clang/test/CodeGen/amdgpu-abi-version.c
index 9b7011f36f523..2cfab3e8e3e0c 100644
--- a/clang/test/CodeGen/amdgpu-abi-version.c
+++ b/clang/test/CodeGen/amdgpu-abi-version.c
@@ -11,22 +11,41 @@
 // LLVM-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
 // LLVM-NEXT:    [[TMP1:%.*]] = icmp sge i32 [[TMP0]], 500
 // LLVM-NEXT:    [[TMP2:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// LLVM-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], 
i32 12
-// LLVM-NEXT:    [[TMP4:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// LLVM-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP4]], 
i32 4
-// LLVM-NEXT:    [[TMP6:%.*]] = select i1 [[TMP1]], ptr addrspace(4) [[TMP3]], 
ptr addrspace(4) [[TMP5]]
-// LLVM-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 2, 
!range [[RNG1:![0-9]+]], !invariant.load [[META2:![0-9]+]], !noundef [[META2]]
-// LLVM-NEXT:    [[CONV:%.*]] = zext i16 [[TMP7]] to i32
-// LLVM-NEXT:    ret i32 [[CONV]]
+// LLVM-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], 
i64 0
+// LLVM-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, 
!invariant.load [[META1:![0-9]+]], !noundef [[META1]]
+// LLVM-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// LLVM-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]]
+// LLVM-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i32 12, i32 18
+// LLVM-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP2]], i32 [[TMP7]]
+// LLVM-NEXT:    [[TMP9:%.*]] = load i16, ptr addrspace(4) [[TMP8]], align 2, 
!range [[RNG2:![0-9]+]], !invariant.load [[META1]], !noundef [[META1]]
+// LLVM-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
+// LLVM-NEXT:    [[TMP11:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LLVM-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP11]], i64 4
+// LLVM-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(4) [[TMP12]], align 
2, !range [[RNG2]], !invariant.load [[META1]], !noundef [[META1]]
+// LLVM-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP13]] to i32
+// LLVM-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// LLVM-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP11]], i64 12
+// LLVM-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(4) [[TMP16]], align 
4, !range [[RNG3:![0-9]+]], !invariant.load [[META1]]
+// LLVM-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP15]], [[TMP14]]
+// LLVM-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP17]], [[TMP18]]
+// LLVM-NEXT:    [[TMP20:%.*]] = icmp ult i32 [[TMP19]], [[TMP14]]
+// LLVM-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 
[[TMP14]]
+// LLVM-NEXT:    [[TMP22:%.*]] = select i1 [[TMP1]], i32 [[TMP10]], i32 
[[TMP21]]
+// LLVM-NEXT:    ret i32 [[TMP22]]
 //
 // LLVMENV-LABEL: define dso_local i32 @foo(
 // LLVMENV-SAME: ) #[[ATTR0:[0-9]+]] {
 // LLVMENV-NEXT:  [[ENTRY:.*:]]
 // LLVMENV-NEXT:    [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// LLVMENV-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) 
[[TMP0]], i32 12
-// LLVMENV-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 
2, !range [[RNG1:![0-9]+]], !invariant.load [[META2:![0-9]+]], !noundef 
[[META2]]
-// LLVMENV-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
-// LLVMENV-NEXT:    ret i32 [[CONV]]
+// LLVMENV-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) 
[[TMP0]], i64 0
+// LLVMENV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !invariant.load [[META1:![0-9]+]], !noundef [[META1]]
+// LLVMENV-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// LLVMENV-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// LLVMENV-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 12, i32 18
+// LLVMENV-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP0]], i32 [[TMP5]]
+// LLVMENV-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 
2, !range [[RNG2:![0-9]+]], !invariant.load [[META1]], !noundef [[META1]]
+// LLVMENV-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+// LLVMENV-NEXT:    ret i32 [[TMP8]]
 //
 int foo() { return __builtin_amdgcn_workgroup_size_x(); }
 //.
@@ -37,12 +56,13 @@ int foo() { return __builtin_amdgcn_workgroup_size_x(); }
 // LLVMENV: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind 
speculatable willreturn memory(none) }
 //.
 // LLVM: [[META0:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// LLVM: [[RNG1]] = !{i16 1, i16 1025}
-// LLVM: [[META2]] = !{}
+// LLVM: [[META1]] = !{}
+// LLVM: [[RNG2]] = !{i16 1, i16 1025}
+// LLVM: [[RNG3]] = !{i32 1, i32 0}
 //.
 // LLVMENV: [[META0:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// LLVMENV: [[RNG1]] = !{i16 1, i16 1025}
-// LLVMENV: [[META2]] = !{}
+// LLVMENV: [[META1]] = !{}
+// LLVMENV: [[RNG2]] = !{i16 1, i16 1025}
 //.
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add 
tests below this line:
 // CHECK: {{.*}}

diff  --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu 
b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index 2d3730cdfc8c9..b7f597b989242 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN:     -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s 
\
 // RUN:     | FileCheck -check-prefix=PRECOV5 %s
@@ -12,24 +13,100 @@
 
 #include "Inputs/cuda.h"
 
-// PRECOV5-LABEL: test_get_workgroup_size
-// PRECOV5: call align 4 dereferenceable(64) ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
-// PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
-// PRECOV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
-// PRECOV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
-// PRECOV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 
-// COV5-LABEL: test_get_workgroup_size
-// COV5: call align 8 dereferenceable(256) ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
-// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
-// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
-// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
-// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 
+// PRECOV5-LABEL: define dso_local void @_Z23test_get_workgroup_sizeiPi(
+// PRECOV5-SAME: i32 noundef [[D:%.*]], ptr noundef [[OUT:%.*]]) 
#[[ATTR0:[0-9]+]] {
+// PRECOV5-NEXT:  [[ENTRY:.*:]]
+// PRECOV5-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// PRECOV5-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// PRECOV5-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[D_ADDR]] to ptr
+// PRECOV5-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// PRECOV5-NEXT:    store i32 [[D]], ptr [[D_ADDR_ASCAST]], align 4
+// PRECOV5-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// PRECOV5-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// PRECOV5-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// PRECOV5-NEXT:      i32 0, label %[[SW_BB:.*]]
+// PRECOV5-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// PRECOV5-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// PRECOV5-NEXT:    ]
+// PRECOV5:       [[SW_BB]]:
+// PRECOV5-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// PRECOV5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP1]], i64 4
+// PRECOV5-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[TMP2]], align 
2, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef 
[[META4]]
+// PRECOV5-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+// PRECOV5-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// PRECOV5-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+// PRECOV5-NEXT:    br label %[[SW_EPILOG:.*]]
+// PRECOV5:       [[SW_BB1]]:
+// PRECOV5-NEXT:    [[TMP6:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// PRECOV5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP6]], i64 6
+// PRECOV5-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 
2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// PRECOV5-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP8]] to i32
+// PRECOV5-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// PRECOV5-NEXT:    store i32 [[TMP9]], ptr [[TMP10]], align 4
+// PRECOV5-NEXT:    br label %[[SW_EPILOG]]
+// PRECOV5:       [[SW_BB2]]:
+// PRECOV5-NEXT:    [[TMP11:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// PRECOV5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr 
addrspace(4) [[TMP11]], i64 8
+// PRECOV5-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(4) [[TMP12]], 
align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// PRECOV5-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP13]] to i32
+// PRECOV5-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// PRECOV5-NEXT:    store i32 [[TMP14]], ptr [[TMP15]], align 4
+// PRECOV5-NEXT:    br label %[[SW_EPILOG]]
+// PRECOV5:       [[SW_DEFAULT]]:
+// PRECOV5-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// PRECOV5-NEXT:    store i32 0, ptr [[TMP16]], align 4
+// PRECOV5-NEXT:    br label %[[SW_EPILOG]]
+// PRECOV5:       [[SW_EPILOG]]:
+// PRECOV5-NEXT:    ret void
+//
+// COV5-LABEL: define dso_local void @_Z23test_get_workgroup_sizeiPi(
+// COV5-SAME: i32 noundef [[D:%.*]], ptr noundef [[OUT:%.*]]) 
#[[ATTR0:[0-9]+]] {
+// COV5-NEXT:  [[ENTRY:.*:]]
+// COV5-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// COV5-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// COV5-NEXT:    [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[D_ADDR]] to ptr
+// COV5-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[OUT_ADDR]] to ptr
+// COV5-NEXT:    store i32 [[D]], ptr [[D_ADDR_ASCAST]], align 4
+// COV5-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// COV5-NEXT:    [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
+// COV5-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// COV5-NEXT:      i32 0, label %[[SW_BB:.*]]
+// COV5-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// COV5-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// COV5-NEXT:    ]
+// COV5:       [[SW_BB]]:
+// COV5-NEXT:    [[TMP1:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COV5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP1]], i64 12
+// COV5-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[TMP2]], align 2, 
!range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]]
+// COV5-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+// COV5-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// COV5-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+// COV5-NEXT:    br label %[[SW_EPILOG:.*]]
+// COV5:       [[SW_BB1]]:
+// COV5-NEXT:    [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COV5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP6]], i64 14
+// COV5-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, 
!range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// COV5-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP8]] to i32
+// COV5-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// COV5-NEXT:    store i32 [[TMP9]], ptr [[TMP10]], align 4
+// COV5-NEXT:    br label %[[SW_EPILOG]]
+// COV5:       [[SW_BB2]]:
+// COV5-NEXT:    [[TMP11:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COV5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP11]], i64 16
+// COV5-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(4) [[TMP12]], align 
2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
+// COV5-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP13]] to i32
+// COV5-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// COV5-NEXT:    store i32 [[TMP14]], ptr [[TMP15]], align 4
+// COV5-NEXT:    br label %[[SW_EPILOG]]
+// COV5:       [[SW_DEFAULT]]:
+// COV5-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// COV5-NEXT:    store i32 0, ptr [[TMP16]], align 4
+// COV5-NEXT:    br label %[[SW_EPILOG]]
+// COV5:       [[SW_EPILOG]]:
+// COV5-NEXT:    ret void
+//
 __device__ void test_get_workgroup_size(int d, int *out)
 {
   switch (d) {
@@ -41,3 +118,10 @@ __device__ void test_get_workgroup_size(int d, int *out)
 }
 
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
+//.
+// PRECOV5: [[RNG3]] = !{i16 1, i16 1025}
+// PRECOV5: [[META4]] = !{}
+//.
+// COV5: [[RNG3]] = !{i16 1, i16 1025}
+// COV5: [[META4]] = !{}
+//.

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size.cl
new file mode 100644
index 0000000000000..9b4cdfa08176f
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size.cl
@@ -0,0 +1,627 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown 
-mcode-object-version=5 -target-cpu tahiti -emit-llvm -o - %s | FileCheck 
-check-prefixes=NONUNIFORM-V5  %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -cl-uniform-work-group-size -triple 
amdgcn-unknown-unknown -mcode-object-version=5 -target-cpu tahiti -emit-llvm -o 
- %s | FileCheck -check-prefixes=UNIFORM-V5  %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown 
-mcode-object-version=4 -target-cpu tahiti -emit-llvm -o - %s | FileCheck 
-check-prefixes=NONUNIFORM-V4  %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -cl-uniform-work-group-size -triple 
amdgcn-unknown-unknown -mcode-object-version=4 -target-cpu tahiti -emit-llvm -o 
- %s | FileCheck -check-prefixes=UNIFORM-V4  %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown 
-mcode-object-version=none -emit-llvm -o - %s | FileCheck 
-check-prefixes=NONUNIFORM-UNKNOWN  %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -cl-uniform-work-group-size -triple 
amdgcn-unknown-unknown -mcode-object-version=none -emit-llvm -o - %s | 
FileCheck -check-prefixes=UNIFORM-UNKNOWN  %s
+
+
+// NONUNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_x(
+// NONUNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// NONUNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], 
align 8, !invariant.load [[META7:![0-9]+]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP2:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-V5-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], [[TMP1]]
+// NONUNIFORM-V5-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 12, i64 18
+// NONUNIFORM-V5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 [[TMP4]]
+// NONUNIFORM-V5-NEXT:    [[TMP6:%.*]] = load i16, ptr addrspace(4) [[TMP5]], 
align 2, !range [[RNG8:![0-9]+]], !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP7:%.*]] = zext nneg i16 [[TMP6]] to i32
+// NONUNIFORM-V5-NEXT:    ret i32 [[TMP7]]
+//
+// UNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_x(
+// UNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// UNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 12
+// UNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7:![0-9]+]], !invariant.load [[META8:![0-9]+]], !noundef 
[[META8]]
+// UNIFORM-V5-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V5-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-V4-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_x(
+// NONUNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// NONUNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 4
+// NONUNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7:![0-9]+]], !invariant.load [[META8:![0-9]+]], !noundef 
[[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-V4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 12
+// NONUNIFORM-V4-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], 
align 4, !range [[RNG9:![0-9]+]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-V4-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+// NONUNIFORM-V4-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP8]], i32 [[TMP3]])
+// NONUNIFORM-V4-NEXT:    ret i32 [[TMP9]]
+//
+// UNIFORM-V4-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_x(
+// UNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// UNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 4
+// UNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7:![0-9]+]], !invariant.load [[META8:![0-9]+]], !noundef 
[[META8]]
+// UNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V4-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-UNKNOWN-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_x(
+// NONUNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// NONUNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) 
[[TMP2]], align 8, !invariant.load [[META6:![0-9]+]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 12, i64 
18
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 [[TMP6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) 
[[TMP7]], align 2, !range [[RNG7:![0-9]+]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP9:%.*]] = zext nneg i16 [[TMP8]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP10:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP10]], i64 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP12:%.*]] = load i16, ptr addrspace(4) 
[[TMP11]], align 4, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP13:%.*]] = zext nneg i16 [[TMP12]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP10]], i64 12
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(4) 
[[TMP14]], align 4, !range [[RNG8:![0-9]+]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP4]], [[TMP13]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP18:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP17]], i32 [[TMP13]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP19:%.*]] = select i1 [[TMP1]], i32 
[[TMP9]], i32 [[TMP18]]
+// NONUNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP19]]
+//
+// UNIFORM-UNKNOWN-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_x(
+// UNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// UNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP2]], i64 12
+// UNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i16, ptr addrspace(4) 
[[TMP3]], align 4, !range [[RNG6:![0-9]+]], !invariant.load [[META7:![0-9]+]], 
!noundef [[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP5]], i64 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) 
[[TMP6]], align 4, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV:%.*]] = select i1 [[TMP1]], i16 [[TMP4]], 
i16 [[TMP7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = zext nneg i16 [[DOTV]] to i32
+// UNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP8]]
+//
+// Exercise the X-dimension workgroup-size builtin. Per the checks above,
+// the uniform-work-group-size build folds to a single i16 load from the
+// dispatch packet / implicit kernarg area, while the nonuniform build also
+// selects the remainder (partial) group size via workgroup.id.x.
+unsigned int test_get_workgroup_size_x()
+{
+    return __builtin_amdgcn_workgroup_size_x();
+}
+
+// NONUNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_y(
+// NONUNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// NONUNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 4
+// NONUNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], 
align 4, !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP3:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-V5-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// NONUNIFORM-V5-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 14, i64 20
+// NONUNIFORM-V5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 [[TMP5]]
+// NONUNIFORM-V5-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], 
align 2, !range [[RNG8]], !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP8:%.*]] = zext nneg i16 [[TMP7]] to i32
+// NONUNIFORM-V5-NEXT:    ret i32 [[TMP8]]
+//
+// UNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_y(
+// UNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 14
+// UNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V5-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V5-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-V4-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_y(
+// NONUNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// NONUNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 6
+// NONUNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-V4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 16
+// NONUNIFORM-V4-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], 
align 4, !range [[RNG9]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-V4-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+// NONUNIFORM-V4-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP8]], i32 [[TMP3]])
+// NONUNIFORM-V4-NEXT:    ret i32 [[TMP9]]
+//
+// UNIFORM-V4-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_y(
+// UNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 6
+// UNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V4-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-UNKNOWN-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_y(
+// NONUNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// NONUNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) 
[[TMP3]], align 4, !invariant.load [[META6]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i64 14, i64 
20
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 [[TMP7]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP9:%.*]] = load i16, ptr addrspace(4) 
[[TMP8]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP10:%.*]] = zext nneg i16 [[TMP9]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP11:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP11]], i64 6
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(4) 
[[TMP12]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP14:%.*]] = zext nneg i16 [[TMP13]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP11]], i64 16
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(4) 
[[TMP15]], align 4, !range [[RNG8]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP5]], [[TMP14]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP17]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP19:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP18]], i32 [[TMP14]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP20:%.*]] = select i1 [[TMP1]], i32 
[[TMP10]], i32 [[TMP19]]
+// NONUNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP20]]
+//
+// UNIFORM-UNKNOWN-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_y(
+// UNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP2]], i64 14
+// UNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i16, ptr addrspace(4) 
[[TMP3]], align 2, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP5]], i64 6
+// UNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) 
[[TMP6]], align 2, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV:%.*]] = select i1 [[TMP1]], i16 [[TMP4]], 
i16 [[TMP7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = zext nneg i16 [[DOTV]] to i32
+// UNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP8]]
+//
+// Exercise the Y-dimension workgroup-size builtin. As checked above, the
+// uniform case loads the i16 size directly (implicitarg offset 14 on v5,
+// dispatch-packet offset 6 on v4); the nonuniform case additionally
+// computes the remainder group via workgroup.id.y.
+unsigned int test_get_workgroup_size_y()
+{
+    return __builtin_amdgcn_workgroup_size_y();
+}
+
+// NONUNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_z(
+// NONUNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// NONUNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 8
+// NONUNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], 
align 8, !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP3:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-V5-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// NONUNIFORM-V5-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 16, i64 22
+// NONUNIFORM-V5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 [[TMP5]]
+// NONUNIFORM-V5-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], 
align 2, !range [[RNG8]], !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP8:%.*]] = zext nneg i16 [[TMP7]] to i32
+// NONUNIFORM-V5-NEXT:    ret i32 [[TMP8]]
+//
+// UNIFORM-V5-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_z(
+// UNIFORM-V5-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-V5-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 16
+// UNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 8, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V5-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V5-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-V4-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_z(
+// NONUNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// NONUNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 8
+// NONUNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-V4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 20
+// NONUNIFORM-V4-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], 
align 4, !range [[RNG9]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-V4-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+// NONUNIFORM-V4-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP8]], i32 [[TMP3]])
+// NONUNIFORM-V4-NEXT:    ret i32 [[TMP9]]
+//
+// UNIFORM-V4-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_z(
+// UNIFORM-V4-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-V4-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 8
+// UNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// UNIFORM-V4-NEXT:    ret i32 [[TMP3]]
+//
+// NONUNIFORM-UNKNOWN-LABEL: define dso_local range(i32 0, 1025) i32 
@test_get_workgroup_size_z(
+// NONUNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// NONUNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 8
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) 
[[TMP3]], align 8, !invariant.load [[META6]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i64 16, i64 
22
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 [[TMP7]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP9:%.*]] = load i16, ptr addrspace(4) 
[[TMP8]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP10:%.*]] = zext nneg i16 [[TMP9]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP11:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP11]], i64 8
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(4) 
[[TMP12]], align 4, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP14:%.*]] = zext nneg i16 [[TMP13]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP11]], i64 20
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(4) 
[[TMP15]], align 4, !range [[RNG8]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP5]], [[TMP14]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP17]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP19:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP18]], i32 [[TMP14]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP20:%.*]] = select i1 [[TMP1]], i32 
[[TMP10]], i32 [[TMP19]]
+// NONUNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP20]]
+//
+// UNIFORM-UNKNOWN-LABEL: define dso_local range(i32 1, 1025) i32 
@test_get_workgroup_size_z(
+// UNIFORM-UNKNOWN-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// UNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*:]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP2]], i64 16
+// UNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i16, ptr addrspace(4) 
[[TMP3]], align 8, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP5]], i64 8
+// UNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) 
[[TMP6]], align 4, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV:%.*]] = select i1 [[TMP1]], i16 [[TMP4]], 
i16 [[TMP7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = zext nneg i16 [[DOTV]] to i32
+// UNIFORM-UNKNOWN-NEXT:    ret i32 [[TMP8]]
+//
+// Exercise the Z-dimension workgroup-size builtin. As checked above, the
+// uniform case loads the i16 size directly (implicitarg offset 16 on v5,
+// dispatch-packet offset 8 on v4); the nonuniform case additionally
+// computes the remainder group via workgroup.id.z.
+unsigned int test_get_workgroup_size_z()
+{
+    return __builtin_amdgcn_workgroup_size_z();
+}
+
+// NONUNIFORM-V5-LABEL: define dso_local void @test_get_workgroup_size(
+// NONUNIFORM-V5-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef 
writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR4:[0-9]+]] {
+// NONUNIFORM-V5-NEXT:  [[ENTRY:.*]]:
+// NONUNIFORM-V5-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// NONUNIFORM-V5-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NONUNIFORM-V5-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NONUNIFORM-V5-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// NONUNIFORM-V5-NEXT:    ]
+// NONUNIFORM-V5:       [[SW_BB]]:
+// NONUNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], 
align 8, !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP2:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-V5-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], [[TMP1]]
+// NONUNIFORM-V5-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 12, i64 18
+// NONUNIFORM-V5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 [[TMP4]]
+// NONUNIFORM-V5-NEXT:    [[TMP6:%.*]] = load i16, ptr addrspace(4) [[TMP5]], 
align 2, !range [[RNG8]], !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[NARROW:%.*]] = add nuw nsw i16 [[TMP6]], 1
+// NONUNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V5:       [[SW_BB1]]:
+// NONUNIFORM-V5-NEXT:    [[TMP7:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP7]], i64 4
+// NONUNIFORM-V5-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[TMP8]], 
align 4, !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP10:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-V5-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP9]]
+// NONUNIFORM-V5-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 14, i64 20
+// NONUNIFORM-V5-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP7]], i64 [[TMP12]]
+// NONUNIFORM-V5-NEXT:    [[TMP14:%.*]] = load i16, ptr addrspace(4) 
[[TMP13]], align 2, !range [[RNG8]], !invariant.load [[META7]], !noundef 
[[META7]]
+// NONUNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V5:       [[SW_BB2]]:
+// NONUNIFORM-V5-NEXT:    [[TMP15:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-V5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP15]], i64 8
+// NONUNIFORM-V5-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(4) 
[[TMP16]], align 8, !invariant.load [[META7]], !noundef [[META7]]
+// NONUNIFORM-V5-NEXT:    [[TMP18:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-V5-NEXT:    [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP17]]
+// NONUNIFORM-V5-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i64 16, i64 22
+// NONUNIFORM-V5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP15]], i64 [[TMP20]]
+// NONUNIFORM-V5-NEXT:    [[TMP22:%.*]] = load i16, ptr addrspace(4) 
[[TMP21]], align 2, !range [[RNG8]], !invariant.load [[META7]], !noundef 
[[META7]]
+// NONUNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V5:       [[SW_EPILOG]]:
+// NONUNIFORM-V5-NEXT:    [[DOTSINK_SHRUNK:%.*]] = phi i16 [ [[NARROW]], 
%[[SW_BB]] ], [ [[TMP22]], %[[SW_BB2]] ], [ [[TMP14]], %[[SW_BB1]] ], [ 0, 
%[[ENTRY]] ]
+// NONUNIFORM-V5-NEXT:    [[DOTSINK:%.*]] = zext i16 [[DOTSINK_SHRUNK]] to i32
+// NONUNIFORM-V5-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) [[OUT]], 
align 4, !tbaa [[INT_TBAA3:![0-9]+]]
+// NONUNIFORM-V5-NEXT:    ret void
+//
+// UNIFORM-V5-LABEL: define dso_local void @test_get_workgroup_size(
+// UNIFORM-V5-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef writeonly 
captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR2:[0-9]+]] {
+// UNIFORM-V5-NEXT:  [[ENTRY:.*]]:
+// UNIFORM-V5-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// UNIFORM-V5-NEXT:      i32 0, label %[[SW_BB:.*]]
+// UNIFORM-V5-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// UNIFORM-V5-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// UNIFORM-V5-NEXT:    ]
+// UNIFORM-V5:       [[SW_BB]]:
+// UNIFORM-V5-NEXT:    [[TMP0:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 12
+// UNIFORM-V5-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V5-NEXT:    [[NARROW:%.*]] = add nuw nsw i16 [[TMP2]], 1
+// UNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V5:       [[SW_BB1]]:
+// UNIFORM-V5-NEXT:    [[TMP3:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP3]], i64 14
+// UNIFORM-V5-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], 
align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V5:       [[SW_BB2]]:
+// UNIFORM-V5-NEXT:    [[TMP6:%.*]] = tail call align 8 dereferenceable(256) 
ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-V5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP6]], i64 16
+// UNIFORM-V5-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], 
align 8, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V5-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V5:       [[SW_EPILOG]]:
+// UNIFORM-V5-NEXT:    [[DOTSINK_SHRUNK:%.*]] = phi i16 [ [[NARROW]], 
%[[SW_BB]] ], [ [[TMP8]], %[[SW_BB2]] ], [ [[TMP5]], %[[SW_BB1]] ], [ 0, 
%[[ENTRY]] ]
+// UNIFORM-V5-NEXT:    [[DOTSINK:%.*]] = zext i16 [[DOTSINK_SHRUNK]] to i32
+// UNIFORM-V5-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) [[OUT]], align 
4, !tbaa [[INT_TBAA3:![0-9]+]]
+// UNIFORM-V5-NEXT:    ret void
+//
+// NONUNIFORM-V4-LABEL: define dso_local void @test_get_workgroup_size(
+// NONUNIFORM-V4-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef 
writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR4:[0-9]+]] {
+// NONUNIFORM-V4-NEXT:  [[ENTRY:.*]]:
+// NONUNIFORM-V4-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// NONUNIFORM-V4-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NONUNIFORM-V4-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NONUNIFORM-V4-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// NONUNIFORM-V4-NEXT:    ]
+// NONUNIFORM-V4:       [[SW_BB]]:
+// NONUNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 4
+// NONUNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP3:%.*]] = zext nneg i16 [[TMP2]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-V4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 12
+// NONUNIFORM-V4-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], 
align 4, !range [[RNG9]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-V4-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+// NONUNIFORM-V4-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP8]], i32 [[TMP3]])
+// NONUNIFORM-V4-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[TMP9]], 1
+// NONUNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V4:       [[SW_BB1]]:
+// NONUNIFORM-V4-NEXT:    [[TMP10:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP10]], i64 6
+// NONUNIFORM-V4-NEXT:    [[TMP12:%.*]] = load i16, ptr addrspace(4) 
[[TMP11]], align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef 
[[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP13:%.*]] = zext nneg i16 [[TMP12]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP14:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-V4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP10]], i64 16
+// NONUNIFORM-V4-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(4) 
[[TMP15]], align 4, !range [[RNG9]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP14]], [[TMP13]]
+// NONUNIFORM-V4-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP17]]
+// NONUNIFORM-V4-NEXT:    [[TMP19:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP18]], i32 [[TMP13]])
+// NONUNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V4:       [[SW_BB2]]:
+// NONUNIFORM-V4-NEXT:    [[TMP20:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-V4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP20]], i64 8
+// NONUNIFORM-V4-NEXT:    [[TMP22:%.*]] = load i16, ptr addrspace(4) 
[[TMP21]], align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef 
[[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP23:%.*]] = zext nneg i16 [[TMP22]] to i32
+// NONUNIFORM-V4-NEXT:    [[TMP24:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-V4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP20]], i64 20
+// NONUNIFORM-V4-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(4) 
[[TMP25]], align 4, !range [[RNG9]], !invariant.load [[META8]]
+// NONUNIFORM-V4-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP24]], [[TMP23]]
+// NONUNIFORM-V4-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]]
+// NONUNIFORM-V4-NEXT:    [[TMP29:%.*]] = tail call i32 @llvm.umin.i32(i32 
[[TMP28]], i32 [[TMP23]])
+// NONUNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-V4:       [[SW_EPILOG]]:
+// NONUNIFORM-V4-NEXT:    [[DOTSINK:%.*]] = phi i32 [ [[ADD]], %[[SW_BB]] ], [ 
[[TMP29]], %[[SW_BB2]] ], [ [[TMP19]], %[[SW_BB1]] ], [ 0, %[[ENTRY]] ]
+// NONUNIFORM-V4-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) [[OUT]], 
align 4, !tbaa [[INT_TBAA3:![0-9]+]]
+// NONUNIFORM-V4-NEXT:    ret void
+//
+// UNIFORM-V4-LABEL: define dso_local void @test_get_workgroup_size(
+// UNIFORM-V4-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef writeonly 
captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR2:[0-9]+]] {
+// UNIFORM-V4-NEXT:  [[ENTRY:.*]]:
+// UNIFORM-V4-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// UNIFORM-V4-NEXT:      i32 0, label %[[SW_BB:.*]]
+// UNIFORM-V4-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// UNIFORM-V4-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// UNIFORM-V4-NEXT:    ]
+// UNIFORM-V4:       [[SW_BB]]:
+// UNIFORM-V4-NEXT:    [[TMP0:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP0]], i64 4
+// UNIFORM-V4-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V4-NEXT:    [[NARROW:%.*]] = add nuw nsw i16 [[TMP2]], 1
+// UNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V4:       [[SW_BB1]]:
+// UNIFORM-V4-NEXT:    [[TMP3:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP3]], i64 6
+// UNIFORM-V4-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], 
align 2, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V4:       [[SW_BB2]]:
+// UNIFORM-V4-NEXT:    [[TMP6:%.*]] = tail call align 4 dereferenceable(64) 
ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-V4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP6]], i64 8
+// UNIFORM-V4-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], 
align 4, !range [[RNG7]], !invariant.load [[META8]], !noundef [[META8]]
+// UNIFORM-V4-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-V4:       [[SW_EPILOG]]:
+// UNIFORM-V4-NEXT:    [[DOTSINK_SHRUNK:%.*]] = phi i16 [ [[NARROW]], 
%[[SW_BB]] ], [ [[TMP8]], %[[SW_BB2]] ], [ [[TMP5]], %[[SW_BB1]] ], [ 0, 
%[[ENTRY]] ]
+// UNIFORM-V4-NEXT:    [[DOTSINK:%.*]] = zext i16 [[DOTSINK_SHRUNK]] to i32
+// UNIFORM-V4-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) [[OUT]], align 
4, !tbaa [[INT_TBAA3:![0-9]+]]
+// UNIFORM-V4-NEXT:    ret void
+//
+// NONUNIFORM-UNKNOWN-LABEL: define dso_local void @test_get_workgroup_size(
+// NONUNIFORM-UNKNOWN-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef 
writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR4:[0-9]+]] {
+// NONUNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*]]:
+// NONUNIFORM-UNKNOWN-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// NONUNIFORM-UNKNOWN-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NONUNIFORM-UNKNOWN-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NONUNIFORM-UNKNOWN-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// NONUNIFORM-UNKNOWN-NEXT:    ]
+// NONUNIFORM-UNKNOWN:       [[SW_BB]]:
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) 
[[TMP2]], align 8, !invariant.load [[META6]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.x()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 12, i64 
18
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP2]], i64 [[TMP6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = load i16, ptr addrspace(4) 
[[TMP7]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP9:%.*]] = zext nneg i16 [[TMP8]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP10:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP10]], i64 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP12:%.*]] = load i16, ptr addrspace(4) 
[[TMP11]], align 4, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP13:%.*]] = zext nneg i16 [[TMP12]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP10]], i64 12
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(4) 
[[TMP14]], align 4, !range [[RNG8]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP4]], [[TMP13]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP18:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP17]], i32 [[TMP13]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP19:%.*]] = select i1 [[TMP1]], i32 
[[TMP9]], i32 [[TMP18]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[TMP19]], 1
+// NONUNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-UNKNOWN:       [[SW_BB1]]:
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP22:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP22]], i64 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(4) 
[[TMP23]], align 4, !invariant.load [[META6]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP25:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.y()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP26:%.*]] = icmp ult i32 [[TMP25]], 
[[TMP24]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 14, 
i64 20
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP22]], i64 [[TMP27]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP29:%.*]] = load i16, ptr addrspace(4) 
[[TMP28]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP30:%.*]] = zext nneg i16 [[TMP29]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP31:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP31]], i64 6
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP33:%.*]] = load i16, ptr addrspace(4) 
[[TMP32]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP34:%.*]] = zext nneg i16 [[TMP33]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP31]], i64 16
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP36:%.*]] = load i32, ptr addrspace(4) 
[[TMP35]], align 4, !range [[RNG8]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP25]], [[TMP34]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP36]], [[TMP37]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP39:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP38]], i32 [[TMP34]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP40:%.*]] = select i1 [[TMP21]], i32 
[[TMP30]], i32 [[TMP39]]
+// NONUNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-UNKNOWN:       [[SW_BB2]]:
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP41:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP42:%.*]] = icmp sgt i32 [[TMP41]], 499
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP43:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP43]], i64 8
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP45:%.*]] = load i32, ptr addrspace(4) 
[[TMP44]], align 8, !invariant.load [[META6]], !noundef [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP46:%.*]] = tail call i32 
@llvm.amdgcn.workgroup.id.z()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP47:%.*]] = icmp ult i32 [[TMP46]], 
[[TMP45]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 16, 
i64 22
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP43]], i64 [[TMP48]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP50:%.*]] = load i16, ptr addrspace(4) 
[[TMP49]], align 2, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP51:%.*]] = zext nneg i16 [[TMP50]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP52:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP52]], i64 8
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP54:%.*]] = load i16, ptr addrspace(4) 
[[TMP53]], align 4, !range [[RNG7]], !invariant.load [[META6]], !noundef 
[[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP55:%.*]] = zext nneg i16 [[TMP54]] to i32
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw i8, 
ptr addrspace(4) [[TMP52]], i64 20
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP57:%.*]] = load i32, ptr addrspace(4) 
[[TMP56]], align 4, !range [[RNG8]], !invariant.load [[META6]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP58:%.*]] = mul i32 [[TMP46]], [[TMP55]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP57]], [[TMP58]]
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP60:%.*]] = tail call i32 
@llvm.umin.i32(i32 [[TMP59]], i32 [[TMP55]])
+// NONUNIFORM-UNKNOWN-NEXT:    [[TMP61:%.*]] = select i1 [[TMP42]], i32 
[[TMP51]], i32 [[TMP60]]
+// NONUNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// NONUNIFORM-UNKNOWN:       [[SW_EPILOG]]:
+// NONUNIFORM-UNKNOWN-NEXT:    [[DOTSINK:%.*]] = phi i32 [ [[ADD]], %[[SW_BB]] 
], [ [[TMP61]], %[[SW_BB2]] ], [ [[TMP40]], %[[SW_BB1]] ], [ 0, %[[ENTRY]] ]
+// NONUNIFORM-UNKNOWN-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) 
[[OUT]], align 4, !tbaa [[INT_TBAA2:![0-9]+]]
+// NONUNIFORM-UNKNOWN-NEXT:    ret void
+//
+// UNIFORM-UNKNOWN-LABEL: define dso_local void @test_get_workgroup_size(
+// UNIFORM-UNKNOWN-SAME: i32 noundef [[D:%.*]], ptr addrspace(1) noundef 
writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR2:[0-9]+]] {
+// UNIFORM-UNKNOWN-NEXT:  [[ENTRY:.*]]:
+// UNIFORM-UNKNOWN-NEXT:    switch i32 [[D]], label %[[SW_EPILOG:.*]] [
+// UNIFORM-UNKNOWN-NEXT:      i32 0, label %[[SW_BB:.*]]
+// UNIFORM-UNKNOWN-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// UNIFORM-UNKNOWN-NEXT:      i32 2, label %[[SW_BB2:.*]]
+// UNIFORM-UNKNOWN-NEXT:    ]
+// UNIFORM-UNKNOWN:       [[SW_BB]]:
+// UNIFORM-UNKNOWN-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP0]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP2:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP2]], i64 12
+// UNIFORM-UNKNOWN-NEXT:    [[TMP4:%.*]] = load i16, ptr addrspace(4) 
[[TMP3]], align 4, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP5:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP5]], i64 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) 
[[TMP6]], align 4, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV7:%.*]] = select i1 [[TMP1]], i16 [[TMP4]], 
i16 [[TMP7]]
+// UNIFORM-UNKNOWN-NEXT:    [[NARROW:%.*]] = add nuw nsw i16 [[DOTV7]], 1
+// UNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-UNKNOWN:       [[SW_BB1]]:
+// UNIFORM-UNKNOWN-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP10:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP10]], i64 14
+// UNIFORM-UNKNOWN-NEXT:    [[TMP12:%.*]] = load i16, ptr addrspace(4) 
[[TMP11]], align 2, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP13:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP13]], i64 6
+// UNIFORM-UNKNOWN-NEXT:    [[TMP15:%.*]] = load i16, ptr addrspace(4) 
[[TMP14]], align 2, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV6:%.*]] = select i1 [[TMP9]], i16 [[TMP12]], 
i16 [[TMP15]]
+// UNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-UNKNOWN:       [[SW_BB2]]:
+// UNIFORM-UNKNOWN-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(4) 
@__oclc_ABI_version, align 4
+// UNIFORM-UNKNOWN-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], 499
+// UNIFORM-UNKNOWN-NEXT:    [[TMP18:%.*]] = tail call align 8 
dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP18]], i64 16
+// UNIFORM-UNKNOWN-NEXT:    [[TMP20:%.*]] = load i16, ptr addrspace(4) 
[[TMP19]], align 8, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[TMP21:%.*]] = tail call align 4 
dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// UNIFORM-UNKNOWN-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP21]], i64 8
+// UNIFORM-UNKNOWN-NEXT:    [[TMP23:%.*]] = load i16, ptr addrspace(4) 
[[TMP22]], align 4, !range [[RNG6]], !invariant.load [[META7]], !noundef 
[[META7]]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTV:%.*]] = select i1 [[TMP17]], i16 [[TMP20]], 
i16 [[TMP23]]
+// UNIFORM-UNKNOWN-NEXT:    br label %[[SW_EPILOG]]
+// UNIFORM-UNKNOWN:       [[SW_EPILOG]]:
+// UNIFORM-UNKNOWN-NEXT:    [[DOTSINK_SHRUNK:%.*]] = phi i16 [ [[NARROW]], 
%[[SW_BB]] ], [ [[DOTV]], %[[SW_BB2]] ], [ [[DOTV6]], %[[SW_BB1]] ], [ 0, 
%[[ENTRY]] ]
+// UNIFORM-UNKNOWN-NEXT:    [[DOTSINK:%.*]] = zext i16 [[DOTSINK_SHRUNK]] to 
i32
+// UNIFORM-UNKNOWN-NEXT:    store i32 [[DOTSINK]], ptr addrspace(1) [[OUT]], 
align 4, !tbaa [[INT_TBAA2:![0-9]+]]
+// UNIFORM-UNKNOWN-NEXT:    ret void
+//
+void test_get_workgroup_size(int d, global int *out)
+{
+       switch (d) {
+       case 0: *out = __builtin_amdgcn_workgroup_size_x() + 1; break;
+       case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
+       case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
+       default: *out = 0;
+       }
+}
+
+
+//.
+// NONUNIFORM-V5: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// NONUNIFORM-V5: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// NONUNIFORM-V5: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// NONUNIFORM-V5: [[META6]] = !{!"Simple C/C++ TBAA"}
+// NONUNIFORM-V5: [[META7]] = !{}
+// NONUNIFORM-V5: [[RNG8]] = !{i16 1, i16 1025}
+//.
+// UNIFORM-V5: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// UNIFORM-V5: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// UNIFORM-V5: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// UNIFORM-V5: [[META6]] = !{!"Simple C/C++ TBAA"}
+// UNIFORM-V5: [[RNG7]] = !{i16 1, i16 1025}
+// UNIFORM-V5: [[META8]] = !{}
+//.
+// NONUNIFORM-V4: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// NONUNIFORM-V4: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// NONUNIFORM-V4: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// NONUNIFORM-V4: [[META6]] = !{!"Simple C/C++ TBAA"}
+// NONUNIFORM-V4: [[RNG7]] = !{i16 1, i16 1025}
+// NONUNIFORM-V4: [[META8]] = !{}
+// NONUNIFORM-V4: [[RNG9]] = !{i32 1, i32 0}
+//.
+// UNIFORM-V4: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// UNIFORM-V4: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// UNIFORM-V4: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// UNIFORM-V4: [[META6]] = !{!"Simple C/C++ TBAA"}
+// UNIFORM-V4: [[RNG7]] = !{i16 1, i16 1025}
+// UNIFORM-V4: [[META8]] = !{}
+//.
+// NONUNIFORM-UNKNOWN: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// NONUNIFORM-UNKNOWN: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// NONUNIFORM-UNKNOWN: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], 
i64 0}
+// NONUNIFORM-UNKNOWN: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NONUNIFORM-UNKNOWN: [[META6]] = !{}
+// NONUNIFORM-UNKNOWN: [[RNG7]] = !{i16 1, i16 1025}
+// NONUNIFORM-UNKNOWN: [[RNG8]] = !{i32 1, i32 0}
+//.
+// UNIFORM-UNKNOWN: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// UNIFORM-UNKNOWN: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// UNIFORM-UNKNOWN: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// UNIFORM-UNKNOWN: [[META5]] = !{!"Simple C/C++ TBAA"}
+// UNIFORM-UNKNOWN: [[RNG6]] = !{i16 1, i16 1025}
+// UNIFORM-UNKNOWN: [[META7]] = !{}
+//.

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 376105cb6594c..dc5333c92d439 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1137,24 +1137,6 @@ void test_get_local_id(int d, global int *out)
 // CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
 // CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
 
-// CHECK-LABEL: @test_get_workgroup_size(
-// CHECK: {{.*}}call align 8 dereferenceable(256){{.*}} ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
-// CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 12
-// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 14
-// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 16
-// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 8, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-void test_get_workgroup_size(int d, global int *out)
-{
-       switch (d) {
-       case 0: *out = __builtin_amdgcn_workgroup_size_x() + 1; break;
-       case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
-       case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
-       default: *out = 0;
-       }
-}
-
 // CHECK-LABEL: @test_get_grid_size(
 // CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 // CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 %{{.+}}
@@ -1382,5 +1364,4 @@ void test_set_fpenv(unsigned long env) {
 }
 
 // CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0}
-// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
 // CHECK-DAG: attributes #[[$NOUNWIND_READONLY_NOPOISON]] = { convergent 
mustprogress nocallback nocreateundeforpoison nofree nounwind willreturn 
memory(none) }

diff  --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 565481ab0a971..17c1699ee5c36 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -84,12 +84,17 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 12
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]]
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG2:![0-9]+]], !invariant.load [[META3:![0-9]+]]
 // AMDGPU-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i32 12
-// AMDGPU-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 
2, !range [[RNG5:![0-9]+]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP5]] to i32
-// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
+// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i64 0
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// AMDGPU-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 12, i32 18
+// AMDGPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP3]], i32 [[TMP8]]
+// AMDGPU-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(4) [[TMP9]], align 
2, !range [[RNG4:![0-9]+]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
+// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[TMP11]]
 // AMDGPU-NEXT:    ret i32 [[DIV]]
 //
 //
@@ -98,12 +103,17 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 16
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG3]], !invariant.load [[META4]]
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG2]], !invariant.load [[META3]]
 // AMDGPU-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i32 14
-// AMDGPU-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 
2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP5]] to i32
-// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
+// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i64 4
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+// AMDGPU-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 14, i32 20
+// AMDGPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP3]], i32 [[TMP8]]
+// AMDGPU-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(4) [[TMP9]], align 
2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
+// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[TMP11]]
 // AMDGPU-NEXT:    ret i32 [[DIV]]
 //
 //
@@ -112,12 +122,17 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 20
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG3]], !invariant.load [[META4]]
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !range [[RNG2]], !invariant.load [[META3]]
 // AMDGPU-NEXT:    [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i32 16
-// AMDGPU-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 
2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP5]] to i32
-// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
+// AMDGPU-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], 
i64 8
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+// AMDGPU-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], [[TMP5]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 16, i32 22
+// AMDGPU-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP3]], i32 [[TMP8]]
+// AMDGPU-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(4) [[TMP9]], align 
2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
+// AMDGPU-NEXT:    [[DIV:%.*]] = udiv i32 [[TMP2]], [[TMP11]]
 // AMDGPU-NEXT:    ret i32 [[DIV]]
 //
 //
@@ -212,30 +227,45 @@ __gpu_kernel void foo() {
 // AMDGPU-SAME: ) #[[ATTR0]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 12
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 
2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
-// AMDGPU-NEXT:    ret i32 [[CONV]]
+// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i64 0
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+// AMDGPU-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// AMDGPU-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 12, i32 18
+// AMDGPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP0]], i32 [[TMP5]]
+// AMDGPU-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 
2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+// AMDGPU-NEXT:    ret i32 [[TMP8]]
 //
 //
 // AMDGPU-LABEL: define internal i32 @__gpu_num_threads_y(
 // AMDGPU-SAME: ) #[[ATTR0]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 14
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 
2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
-// AMDGPU-NEXT:    ret i32 [[CONV]]
+// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i64 4
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+// AMDGPU-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// AMDGPU-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 14, i32 20
+// AMDGPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP0]], i32 [[TMP5]]
+// AMDGPU-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 
2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+// AMDGPU-NEXT:    ret i32 [[TMP8]]
 //
 //
 // AMDGPU-LABEL: define internal i32 @__gpu_num_threads_z(
 // AMDGPU-SAME: ) #[[ATTR0]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i32 16
-// AMDGPU-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 
2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
-// AMDGPU-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
-// AMDGPU-NEXT:    ret i32 [[CONV]]
+// AMDGPU-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], 
i64 8
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 
4, !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+// AMDGPU-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], [[TMP2]]
+// AMDGPU-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 16, i32 22
+// AMDGPU-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TMP0]], i32 [[TMP5]]
+// AMDGPU-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 
2, !range [[RNG4]], !invariant.load [[META3]], !noundef [[META3]]
+// AMDGPU-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+// AMDGPU-NEXT:    ret i32 [[TMP8]]
 //
 //
 // AMDGPU-LABEL: define internal i32 @__gpu_num_threads(
@@ -1378,7 +1408,7 @@ __gpu_kernel void foo() {
 // SPIRV-NEXT:    ret void
 //
 //.
-// AMDGPU: [[RNG3]] = !{i32 1, i32 0}
-// AMDGPU: [[META4]] = !{}
-// AMDGPU: [[RNG5]] = !{i16 1, i16 1025}
+// AMDGPU: [[RNG2]] = !{i32 1, i32 0}
+// AMDGPU: [[META3]] = !{}
+// AMDGPU: [[RNG4]] = !{i16 1, i16 1025}
 //.


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to