https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/185066

This wasn't strictly using the LangOpt field for this property,
and hardcoding the OpenCL version. It was also going out of its
way to annotate only specific calling conventions. Just
unconditionally emit it on all functions. The uniform work group
assumption must hold in the entire module, not just the entrypoint.
This also theoretically saves work for the attributor propagation.

This will avoid the need to repeat the same logic in builtin function
codegen.

>From e203da79c27dc7e6212847a7ed978bc824db9479 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Fri, 6 Mar 2026 18:23:30 +0100
Subject: [PATCH] clang: Simplify emission of uniform-work-group-size attribute

This wasn't strictly using the LangOpt field for this property,
and hardcoding the OpenCL version. It was also going out of its
way to annotate only specific calling conventions. Just
unconditionally emit it on all functions. The uniform work group
assumption must hold in the entire module, not just the entrypoint.
This also theoretically saves work for the attributor propagation.

This will avoid the need to repeat the same logic in builtin function
codegen.
---
 clang/include/clang/Options/Options.td        |  2 +-
 clang/lib/CodeGen/CGCall.cpp                  | 28 +++-------
 clang/test/CodeGenCUDA/convergent.cu          | 53 ++++++++++---------
 .../CodeGenCUDA/incomplete-func-ptr-type.cu   |  2 +-
 clang/test/CodeGenHIP/default-attributes.hip  |  6 +--
 clang/test/CodeGenHIP/hip_weak_alias.cpp      |  4 +-
 .../cl20-device-side-enqueue-attributes.cl    |  2 +-
 clang/test/CodeGenOpenCL/convergent.cl        | 18 +++----
 8 files changed, 51 insertions(+), 64 deletions(-)

diff --git a/clang/include/clang/Options/Options.td 
b/clang/include/clang/Options/Options.td
index fe7169423b6bf..bc590c42fd7d5 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -1035,7 +1035,7 @@ def b : JoinedOrSeparate<["-"], "b">, 
Flags<[LinkerInput]>,
   Group<Link_Group>;
 
 defm offload_uniform_block : BoolFOption<"offload-uniform-block",
-  LangOpts<"OffloadUniformBlock">, Default<"LangOpts->CUDA">,
+  LangOpts<"OffloadUniformBlock">, Default<"LangOpts->CUDA || 
(LangOpts->OpenCL && LangOpts->OpenCLVersion <= 120)">,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Assume">,
   NegFlag<SetFalse, [], [ClangOption, CC1Option], "Don't assume">,
   BothFlags<[], [ClangOption], " that kernels are launched with uniform block 
sizes (default true for CUDA/HIP and false otherwise)">>;
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 04b27925bab8e..6dfd0f915190a 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2611,28 +2611,12 @@ void CodeGenModule::ConstructAttributeList(StringRef 
Name,
                                  NumElemsParam);
     }
 
-    if (DeviceKernelAttr::isOpenCLSpelling(
-            TargetDecl->getAttr<DeviceKernelAttr>()) &&
-        CallingConv != CallingConv::CC_C &&
-        CallingConv != CallingConv::CC_SpirFunction) {
-      // Check CallingConv to avoid adding uniform-work-group-size attribute to
-      // OpenCL Kernel Stub
-      if (getLangOpts().OpenCLVersion <= 120) {
-        // OpenCL v1.2 Work groups are always uniform
-        FuncAttrs.addAttribute("uniform-work-group-size");
-      } else {
-        // OpenCL v2.0 Work groups may be whether uniform or not.
-        // '-cl-uniform-work-group-size' compile option gets a hint
-        // to the compiler that the global work-size be a multiple of
-        // the work-group size specified to clEnqueueNDRangeKernel
-        // (i.e. work groups are uniform).
-        if (getLangOpts().OffloadUniformBlock)
-          FuncAttrs.addAttribute("uniform-work-group-size");
-      }
-    }
-
-    if (TargetDecl->hasAttr<CUDAGlobalAttr>() &&
-        getLangOpts().OffloadUniformBlock)
+    // OpenCL v2.0 work groups may or may not be uniform.
+    // The '-cl-uniform-work-group-size' compile option hints
+    // to the compiler that the global work-size is a multiple of
+    // the work-group size specified to clEnqueueNDRangeKernel
+    // (i.e. work groups are uniform).
+    if (getLangOpts().OffloadUniformBlock)
       FuncAttrs.addAttribute("uniform-work-group-size");
 
     if (TargetDecl->hasAttr<ArmLocallyStreamingAttr>())
diff --git a/clang/test/CodeGenCUDA/convergent.cu 
b/clang/test/CodeGenCUDA/convergent.cu
index bb034ee4ff442..97a24dc20d841 100644
--- a/clang/test/CodeGenCUDA/convergent.cu
+++ b/clang/test/CodeGenCUDA/convergent.cu
@@ -36,27 +36,27 @@ __host__ __device__ [[clang::noconvergent]] float 
aliasf1(int) asm("somethingels
 // DEVICE-NEXT:    call void @_Z3bazv() #[[ATTR4:[0-9]+]]
 // DEVICE-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", "=l"() 
#[[ATTR5:[0-9]+]], !srcloc [[META2:![0-9]+]]
 // DEVICE-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
-// DEVICE-NEXT:    call void asm sideeffect "trap", ""() #[[ATTR4]], !srcloc 
[[META3:![0-9]+]]
-// DEVICE-NEXT:    call void asm sideeffect "nop", ""() #[[ATTR6:[0-9]+]], 
!srcloc [[META4:![0-9]+]]
+// DEVICE-NEXT:    call void asm sideeffect "trap", ""() #[[ATTR6:[0-9]+]], 
!srcloc [[META3:![0-9]+]]
+// DEVICE-NEXT:    call void asm sideeffect "nop", ""() #[[ATTR7:[0-9]+]], 
!srcloc [[META4:![0-9]+]]
 // DEVICE-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
 // DEVICE-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 
noundef [[TMP1]]) #[[ATTR4]]
 // DEVICE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
-// DEVICE-NEXT:    [[CALL1:%.*]] = call contract noundef float 
@somethingelse(i32 noundef [[TMP2]]) #[[ATTR6]]
+// DEVICE-NEXT:    [[CALL1:%.*]] = call contract noundef float 
@somethingelse(i32 noundef [[TMP2]]) #[[ATTR8:[0-9]+]]
 // DEVICE-NEXT:    ret void
 //
 // HOST-LABEL: define dso_local void @_Z3barv(
 // HOST-SAME: ) #[[ATTR0:[0-9]+]] {
 // HOST-NEXT:  [[ENTRY:.*:]]
 // HOST-NEXT:    [[X:%.*]] = alloca i32, align 4
-// HOST-NEXT:    call void @_Z3bazv()
-// HOST-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", 
"=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR2:[0-9]+]], !srcloc [[META1:![0-9]+]]
+// HOST-NEXT:    call void @_Z3bazv() #[[ATTR2:[0-9]+]]
+// HOST-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", 
"=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META1:![0-9]+]]
 // HOST-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
-// HOST-NEXT:    call void asm sideeffect "trap", 
"~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META2:![0-9]+]]
-// HOST-NEXT:    call void asm sideeffect "nop", 
"~{dirflag},~{fpsr},~{flags}"() #[[ATTR3]], !srcloc [[META3:![0-9]+]]
+// HOST-NEXT:    call void asm sideeffect "trap", 
"~{dirflag},~{fpsr},~{flags}"() #[[ATTR4:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// HOST-NEXT:    call void asm sideeffect "nop", 
"~{dirflag},~{fpsr},~{flags}"() #[[ATTR4]], !srcloc [[META3:![0-9]+]]
 // HOST-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
-// HOST-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 
noundef [[TMP1]])
+// HOST-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 
noundef [[TMP1]]) #[[ATTR2]]
 // HOST-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
-// HOST-NEXT:    [[CALL1:%.*]] = call contract noundef float 
@somethingelse(i32 noundef [[TMP2]])
+// HOST-NEXT:    [[CALL1:%.*]] = call contract noundef float 
@somethingelse(i32 noundef [[TMP2]]) #[[ATTR2]]
 // HOST-NEXT:    ret void
 //
 __host__ __device__ void bar() {
@@ -71,27 +71,30 @@ __host__ __device__ void bar() {
 
 
 //.
-// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" }
-// DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
+// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
+// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
+// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "uniform-work-group-size" }
+// DEVICE: attributes #[[ATTR4]] = { convergent nounwind 
"uniform-work-group-size" }
 // DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
-// DEVICE: attributes #[[ATTR6]] = { nounwind }
+// DEVICE: attributes #[[ATTR6]] = { convergent nounwind }
+// DEVICE: attributes #[[ATTR7]] = { nounwind }
+// DEVICE: attributes #[[ATTR8]] = { nounwind "uniform-work-group-size" }
 //.
-// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone 
"min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
}
-// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
}
-// HOST: attributes #[[ATTR2]] = { nounwind memory(none) }
-// HOST: attributes #[[ATTR3]] = { nounwind }
+// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone 
"min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
"uniform-work-group-size" }
+// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
"uniform-work-group-size" }
+// HOST: attributes #[[ATTR2]] = { "uniform-work-group-size" }
+// HOST: attributes #[[ATTR3]] = { nounwind memory(none) }
+// HOST: attributes #[[ATTR4]] = { nounwind }
 //.
 // DEVICE: [[META0:![0-9]+]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 // DEVICE: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// DEVICE: [[META2]] = !{i64 3120}
-// DEVICE: [[META3]] = !{i64 3155}
-// DEVICE: [[META4]] = !{i64 3206}
+// DEVICE: [[META2]] = !{i64 3174}
+// DEVICE: [[META3]] = !{i64 3209}
+// DEVICE: [[META4]] = !{i64 3260}
 //.
 // HOST: [[META0:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// HOST: [[META1]] = !{i64 3120}
-// HOST: [[META2]] = !{i64 3155}
-// HOST: [[META3]] = !{i64 3206}
+// HOST: [[META1]] = !{i64 3174}
+// HOST: [[META2]] = !{i64 3209}
+// HOST: [[META3]] = !{i64 3260}
 //.
diff --git a/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu 
b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
index bd1da1f05c1eb..f4641fd242b4d 100644
--- a/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
+++ b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
@@ -16,7 +16,7 @@ int main(int argc, char ** argv) {
 // CHECK:   store ptr @_Z4kern7TempValIjE, ptr %fptr, align 8
   return 0;
 }
-// CHECK:  define dso_local void @_Z19__device_stub__kern7TempValIjE(i32 
%in_val.coerce) #1 {
+// CHECK:  define dso_local void @_Z19__device_stub__kern7TempValIjE(i32 
%in_val.coerce) #{{[0-9]+}} {
 // CHECK:  %2 = call i32 @hipLaunchByPtr(ptr @_Z4kern7TempValIjE)
 
 // CHECK:  define internal void @__hip_register_globals(ptr %0) {
diff --git a/clang/test/CodeGenHIP/default-attributes.hip 
b/clang/test/CodeGenHIP/default-attributes.hip
index faccebbdad7e7..d5b5aa0c84ace 100644
--- a/clang/test/CodeGenHIP/default-attributes.hip
+++ b/clang/test/CodeGenHIP/default-attributes.hip
@@ -33,10 +33,10 @@ __global__ void kernel() {
  extern_func();
 }
 //.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline 
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline 
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
+// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
 // OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline 
norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
-// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
+// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind 
"uniform-work-group-size" }
 //.
 // OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 
600}
 // OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
diff --git a/clang/test/CodeGenHIP/hip_weak_alias.cpp 
b/clang/test/CodeGenHIP/hip_weak_alias.cpp
index 480a278dc55d3..33ff74a1cf995 100644
--- a/clang/test/CodeGenHIP/hip_weak_alias.cpp
+++ b/clang/test/CodeGenHIP/hip_weak_alias.cpp
@@ -119,9 +119,9 @@ __host__ __device__ float __Four(float f) { return 2.0f * 
f; }
 __host__ __device__ int Four(void) __attribute__((weak, alias("_Z6__Fourv")));
 __host__ __device__ float Four(float f) __attribute__((weak, 
alias("_Z6__Fourf")));
 //.
-// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone 
"min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
}
+// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone 
"min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" 
"uniform-work-group-size" }
 //.
-// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"uniform-work-group-size" }
 //.
 // HOST: [[META0:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 //.
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl 
b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
index af6cc30fcd329..42630375aeb8f 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
@@ -198,7 +198,7 @@ kernel void device_side_enqueue(global float *a, global 
float *b, int i) {
 //.
 // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind 
optnone denormal_fpenv(float: preservesign) "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "uniform-work-group-size" }
 // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind 
willreturn memory(argmem: readwrite) }
-// SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone 
denormal_fpenv(float: preservesign) "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" }
+// SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone 
denormal_fpenv(float: preservesign) "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "uniform-work-group-size" }
 // SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison 
nofree nosync nounwind speculatable willreturn memory(none) }
 // SPIR32: attributes #[[ATTR4]] = { convergent nounwind denormal_fpenv(float: 
preservesign) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // SPIR32: attributes #[[ATTR5]] = { convergent nounwind 
"uniform-work-group-size" }
diff --git a/clang/test/CodeGenOpenCL/convergent.cl 
b/clang/test/CodeGenOpenCL/convergent.cl
index 99d9ee74e669b..cefa4d7f8ebe9 100644
--- a/clang/test/CodeGenOpenCL/convergent.cl
+++ b/clang/test/CodeGenOpenCL/convergent.cl
@@ -127,18 +127,18 @@ void test_not_unroll() {
 // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
 
 // CHECK-LABEL: @assume_convergent_asm
-// CHECK: tail call void asm sideeffect "s_barrier", ""() #6
+// CHECK: tail call void asm sideeffect "s_barrier", ""() #8
 kernel void assume_convergent_asm()
 {
   __asm__ volatile("s_barrier");
 }
 
 // CHECK: attributes #0 = { nofree noinline norecurse nounwind 
memory(readwrite, target_mem0: none, target_mem1: none) "
-// CHECK: attributes #1 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
-// CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} }
-// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} }
-// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
+// CHECK: attributes #1 = { convergent norecurse nounwind{{[^}]*}} }
+// CHECK: attributes #2 = { convergent nounwind{{[^}]*}} }
+// CHECK: attributes #3 = { convergent noduplicate nounwind{{[^}]*}} }
+// CHECK: attributes #4 = { alwaysinline convergent norecurse 
nounwind{{[^}]*}} }
+// CHECK: attributes #5 = { convergent nounwind "uniform-work-group-size" }
+// CHECK: attributes #6 = { nounwind "uniform-work-group-size" }
+// CHECK: attributes #7 = { convergent noduplicate nounwind 
"uniform-work-group-size" }
+// CHECK: attributes #8 = { convergent nounwind }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to