[clang] [llvm] AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic (PR #180170)

Matt Arsenault via cfe-commits Fri, 06 Feb 2026 03:32:40 -0800

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/180170


Exactly match the s_wait_event instruction. For some reason we already
had this instruction used through llvm.amdgcn.s.wait.event.export.ready,
but that hardcodes a specific value. This should really be a bitmask that
can combine multiple wait types.

gfx11 -> gfx12 broke compatibility in a weird way, by inverting the
interpretation of the bit but also shifting the used bit by 1. Simplify
the selection of the old intrinsic by just using the magic number 2, which
should satisfy both cases.

>From fecc0bfc218892d83ae7871d3ecde0ebd746e06c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Fri, 6 Feb 2026 09:21:08 +0100
Subject: [PATCH] AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic

Exactly match the s_wait_event instruction. For some reason we already
had this instruction used through llvm.amdgcn.s.wait.event.export.ready,
but that hardcodes a specific value. This should really be a bitmask that
can combine multiple wait types.

gfx11 -> gfx12 broke compatibility in a weird way, by inverting the
interpretation of the bit but also shifting the used bit by 1. Simplify
the selection of the old intrinsic by just using the magic number 2, which
should satisfy both cases.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.td   |  1 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  7 +++
 clang/lib/Sema/SemaAMDGPU.cpp                 | 24 +++++++++
 .../builtins-amdgcn-s-wait-event.cl           | 27 ++++++++++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 11 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  6 +--
 .../AMDGPU/llvm.amdgcn.s.wait.event.ll        | 51 +++++++++++++++----
 7 files changed, 114 insertions(+), 13 deletions(-)
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td 
b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 740d136f465c1..17f081a906364 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -339,6 +339,7 @@ def __builtin_amdgcn_image_bvh_intersect_ray_lh : 
AMDGPUBuiltin<"_ExtVector<4, u
 // TODO: This is a no-op in wave32. Should the builtin require wavefrontsize64?
 def __builtin_amdgcn_permlane64 : AMDGPUBuiltin<"unsigned int(unsigned int)", 
[Const], "gfx11-insts">;
 def __builtin_amdgcn_s_wait_event_export_ready : AMDGPUBuiltin<"void()", [], 
"gfx11-insts">;
+def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], 
"gfx11-insts">;
 
 
//===----------------------------------------------------------------------===//
 // WMMA builtins.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index af96b6cf02195..b81e0da956c35 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13907,6 +13907,13 @@ def note_amdgcn_load_lds_size_valid_value : Note<"size 
must be %select{1, 2, or
 
 def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a 
global or generic pointer">;
 
+def warn_amdgpu_s_wait_event_mask_no_effect_target :
+  Warning<"event mask has no effect for target">,
+  InGroup<DiagGroup<"amdgpu-wait-event-mask">>;
+
+def note_amdgpu_s_wait_event_suggested_value :
+  Note<"value of 2 valid for export_ready for gfx11 and gfx12+">;
+
 def warn_comparison_in_enum_initializer : Warning<
   "comparison operator '%0' is potentially a typo for a shift operator '%1'">,
   InGroup<DiagGroup<"enum-compare-typo">>;
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 4261e1849133f..cec8f9d2675e6 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -89,6 +89,30 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_s_setreg:
     return SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/0, /*Low=*/0,
                                            /*High=*/UINT16_MAX);
+  case AMDGPU::BI__builtin_amdgcn_s_wait_event: {
+    llvm::APSInt Result;
+    if (SemaRef.BuiltinConstantArg(TheCall, 0, Result))
+      return true;
+
+    bool IsGFX12Plus = Builtin::evaluateRequiredTargetFeatures(
+        "gfx12-insts", CallerFeatureMap);
+
+    // gfx11 -> gfx12 changed the interpretation of the bitmask. gfx12 inverted
+    // the interpretation for export_ready, but shifted the used bit by 1. Thus
+    // waiting for the export_ready event can use a value of 2 universally.
+    if (((IsGFX12Plus && !Result[1]) || (!IsGFX12Plus && Result[0])) ||
+        Result.getZExtValue() > 2) {
+      Expr *ArgExpr = TheCall->getArg(0);
+      SemaRef.targetDiag(ArgExpr->getExprLoc(),
+                         diag::warn_amdgpu_s_wait_event_mask_no_effect_target)
+          << ArgExpr->getSourceRange();
+      SemaRef.targetDiag(ArgExpr->getExprLoc(),
+                         diag::note_amdgpu_s_wait_event_suggested_value)
+          << ArgExpr->getSourceRange();
+    }
+
+    return false;
+  }
   case AMDGPU::BI__builtin_amdgcn_mov_dpp:
     return checkMovDPPFunctionCall(TheCall, 5, 1);
   case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl
new file mode 100644
index 0000000000000..1a9d40cf90cbe
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl
@@ -0,0 +1,27 @@
+// xUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1100 
-verify=ALL,GFX11 %s
+// RUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1200 
-verify=ALL,GFX12 %s
+
+void test(int x) {
+  // ALL-error@+1 {{argument to '__builtin_amdgcn_s_wait_event' must be a 
constant integer}}
+  __builtin_amdgcn_s_wait_event(x);
+
+  // GFX11-expected-no-diagnostics
+  // GFX12-warning@+2 {{event mask has no effect for target}}
+  // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(0); // 0 does nothing on gfx12
+
+  // GFX11-expected-no-diagnostics
+  // GFX12-warning@+2 {{event mask has no effect for target}}
+  // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(1); // 1 does nothing on gfx11
+
+  __builtin_amdgcn_s_wait_event(2); // expected-no-diagnostics
+
+  // ALL-warning@+2 {{event mask has no effect for target}}
+  // ALL-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(3);
+
+  // ALL-warning@+2 {{event mask has no effect for target}}
+  // ALL-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(-1);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a8eba9ed126b7..e35376ba404c0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2904,6 +2904,17 @@ class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> :
 def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst  = llvm_i32_ty,
                                                    data1 = llvm_v4i32_ty>;
 
+// Emit s_wait_event instruction. Note that between gfx11 and gfx12,
+// the bit for the export_ready event changed. gfx11 expects bit 0 to
+// be 0, and gfx12 expects bit 1 to be 1. Thus, an immediate value of
+// 2 can be used as the universal value for export_ready.
+def int_amdgcn_s_wait_event :
+  ClangBuiltin<"__builtin_amdgcn_s_wait_event">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 
IntrHasSideEffects]
+>;
+
+// Emits same instruction as s_wait_event, with a hardcoded immediate
+// value. FIXME: This should be removed
 def int_amdgcn_s_wait_event_export_ready :
   ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td 
b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d265440c03274..874249dc83c9f 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1835,7 +1835,7 @@ let SubtargetPredicate = isGFX10Plus in {
 let SubtargetPredicate = isGFX11Plus in {
 let OtherPredicates = [HasExportInsts] in
   def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16),
-                                 "$simm16"> {
+                                 "$simm16", [(int_amdgcn_s_wait_event 
timm:$simm16)]> {
                                    let hasSideEffects = 1;
                                  }
   def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16),
@@ -1958,9 +1958,7 @@ def : GCNPat<
   (S_SEXT_I32_I16 $src)
 >;
 
-let SubtargetPredicate = isNotGFX12Plus in
-  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 
0))>;
-let SubtargetPredicate = isGFX12Plus in
+let SubtargetPredicate = isGFX11Plus in
   def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 
2))>;
 
 // The first 10 bits of the mode register are the core FP mode on all
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
index 27a8b35467218..0656671fac0df 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
@@ -1,14 +1,47 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck 
-check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck 
-check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck 
-check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck 
-check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck 
-check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck 
-check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck 
-check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck 
-check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_wait_event:
-; GFX11: s_wait_event 0x0
-; GFX12: s_wait_event 0x2
-
-define amdgpu_ps void @test_wait_event() {
+; GCN-LABEL: {{^}}test_wait_event_export_ready:
+; GCN: s_wait_event 0x2
+define amdgpu_ps void @test_wait_event_export_ready() {
 entry:
   call void @llvm.amdgcn.s.wait.event.export.ready()
   ret void
 }
+
+; GCN-LABEL: {{^}}test_wait_event_0:
+; GCN: s_wait_event 0x0
+define amdgpu_ps void @test_wait_event_0() {
+  call void @llvm.amdgcn.s.wait.event(i16 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_1:
+; GCN: s_wait_event 0x1
+define amdgpu_ps void @test_wait_event_1() {
+  call void @llvm.amdgcn.s.wait.event(i16 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_2:
+; GCN: s_wait_event 0x2
+define amdgpu_ps void @test_wait_event_2() {
+  call void @llvm.amdgcn.s.wait.event(i16 2)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_3:
+; GCN: s_wait_event 0x3
+define amdgpu_ps void @test_wait_event_3() {
+  call void @llvm.amdgcn.s.wait.event(i16 3)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_max:
+; GCN: s_wait_event 0xffff
+define amdgpu_ps void @test_wait_event_max() {
+  call void @llvm.amdgcn.s.wait.event(i16 -1)
+  ret void
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic (PR #180170)

Reply via email to