https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/180170
Exactly match the s_wait_event instruction. For some reason we already had this instruction used through llvm.amdgcn.s.wait.event.export.ready, but that hardcodes a specific value. This should really be a bitmask that can combine multiple wait types. gfx11 -> gfx12 broke compatibility in a weird way, by inverting the interpretation of the bit but also shifting the used bit by 1. Simplify the selection of the old intrinsic by just using the magic number 2, which should satisfy both cases. >From fecc0bfc218892d83ae7871d3ecde0ebd746e06c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Fri, 6 Feb 2026 09:21:08 +0100 Subject: [PATCH] AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic Exactly match the s_wait_event instruction. For some reason we already had this instruction used through llvm.amdgcn.s.wait.event.export.ready, but that hardcodes a specific value. This should really be a bitmask that can combine multiple wait types. gfx11 -> gfx12 broke compatibility in a weird way, by inverting the interpretation of the bit but also shifting the used bit by 1. Simplify the selection of the old intrinsic by just using the magic number 2, which should satisfy both cases. 
--- clang/include/clang/Basic/BuiltinsAMDGPU.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 7 +++ clang/lib/Sema/SemaAMDGPU.cpp | 24 +++++++++ .../builtins-amdgcn-s-wait-event.cl | 27 ++++++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 ++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 6 +-- .../AMDGPU/llvm.amdgcn.s.wait.event.ll | 51 +++++++++++++++---- 7 files changed, 114 insertions(+), 13 deletions(-) create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 740d136f465c1..17f081a906364 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -339,6 +339,7 @@ def __builtin_amdgcn_image_bvh_intersect_ray_lh : AMDGPUBuiltin<"_ExtVector<4, u // TODO: This is a no-op in wave32. Should the builtin require wavefrontsize64? def __builtin_amdgcn_permlane64 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "gfx11-insts">; def __builtin_amdgcn_s_wait_event_export_ready : AMDGPUBuiltin<"void()", [], "gfx11-insts">; +def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], "gfx11-insts">; //===----------------------------------------------------------------------===// // WMMA builtins. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index af96b6cf02195..b81e0da956c35 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13907,6 +13907,13 @@ def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">; +def warn_amdgpu_s_wait_event_mask_no_effect_target : + Warning<"event mask has no effect for target">, + InGroup<DiagGroup<"amdgpu-wait-event-mask">>; + +def note_amdgpu_s_wait_event_suggested_value : + Note<"value of 2 valid for export_ready for gfx11 and gfx12+">; + def warn_comparison_in_enum_initializer : Warning< "comparison operator '%0' is potentially a typo for a shift operator '%1'">, InGroup<DiagGroup<"enum-compare-typo">>; diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 4261e1849133f..cec8f9d2675e6 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -89,6 +89,30 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_s_setreg: return SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/0, /*Low=*/0, /*High=*/UINT16_MAX); + case AMDGPU::BI__builtin_amdgcn_s_wait_event: { + llvm::APSInt Result; + if (SemaRef.BuiltinConstantArg(TheCall, 0, Result)) + return true; + + bool IsGFX12Plus = Builtin::evaluateRequiredTargetFeatures( + "gfx12-insts", CallerFeatureMap); + + // gfx11 -> gfx12 changed the interpretation of the bitmask. gfx12 inverted + // the interpretation for export_ready, but shifted the used bit by 1. Thus + // waiting for the export_ready event can use a value of 2 universally. 
+ if (((IsGFX12Plus && !Result[1]) || (!IsGFX12Plus && Result[0])) || + Result.getZExtValue() > 2) { + Expr *ArgExpr = TheCall->getArg(0); + SemaRef.targetDiag(ArgExpr->getExprLoc(), + diag::warn_amdgpu_s_wait_event_mask_no_effect_target) + << ArgExpr->getSourceRange(); + SemaRef.targetDiag(ArgExpr->getExprLoc(), + diag::note_amdgpu_s_wait_event_suggested_value) + << ArgExpr->getSourceRange(); + } + + return false; + } case AMDGPU::BI__builtin_amdgcn_mov_dpp: return checkMovDPPFunctionCall(TheCall, 5, 1); case AMDGPU::BI__builtin_amdgcn_mov_dpp8: diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl new file mode 100644 index 0000000000000..1a9d40cf90cbe --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl @@ -0,0 +1,27 @@ +// xUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1100 -verify=ALL,GFX11 %s +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1200 -verify=ALL,GFX12 %s + +void test(int x) { + // ALL-error@+1 {{argument to '__builtin_amdgcn_s_wait_event' must be a constant integer}} + __builtin_amdgcn_s_wait_event(x); + + // GFX11-expected-no-diagnostics + // GFX12-warning@+2 {{event mask has no effect for target}} + // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}} + __builtin_amdgcn_s_wait_event(0); // 0 does nothing on gfx12 + + // GFX11-expected-no-diagnostics + // GFX12-warning@+2 {{event mask has no effect for target}} + // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}} + __builtin_amdgcn_s_wait_event(1); // 1 does nothing on gfx11 + + __builtin_amdgcn_s_wait_event(2); // expected-no-diagnostics + + // ALL-warning@+2 {{event mask has no effect for target}} + // ALL-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}} + __builtin_amdgcn_s_wait_event(3); + + // ALL-warning@+2 {{event mask has no effect for target}} + // ALL-note@+1 {{value of 2 valid for export_ready for 
gfx11 and gfx12+}} + __builtin_amdgcn_s_wait_event(-1); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a8eba9ed126b7..e35376ba404c0 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2904,6 +2904,17 @@ class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> : def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty, data1 = llvm_v4i32_ty>; +// Emit s_wait_event instruction. Note that between gfx11 and gfx12, +// the bit for the export_ready event changed. gfx11 expects bit 0 to +// be 0, and gfx12 expects bit 1 to be 1. Thus, an immediate value of +// 2 can be used as the universal value for export_ready. +def int_amdgcn_s_wait_event : + ClangBuiltin<"__builtin_amdgcn_s_wait_event">, + Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects] +>; + +// Emits same instruction as s_wait_event, with a hardcoded immediate +// value. FIXME: This should be removed def int_amdgcn_s_wait_event_export_ready : ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index d265440c03274..874249dc83c9f 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1835,7 +1835,7 @@ let SubtargetPredicate = isGFX10Plus in { let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasExportInsts] in def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16), - "$simm16"> { + "$simm16", [(int_amdgcn_s_wait_event timm:$simm16)]> { let hasSideEffects = 1; } def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16), @@ -1958,9 +1958,7 @@ def : GCNPat< (S_SEXT_I32_I16 $src) >; -let SubtargetPredicate = isNotGFX12Plus in - def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>; 
-let SubtargetPredicate = isGFX12Plus in +let SubtargetPredicate = isGFX11Plus in def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>; // The first 10 bits of the mode register are the core FP mode on all diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll index 27a8b35467218..0656671fac0df 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll @@ -1,14 +1,47 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}test_wait_event: -; GFX11: s_wait_event 0x0 -; GFX12: s_wait_event 0x2 - -define amdgpu_ps void @test_wait_event() { +; GCN-LABEL: {{^}}test_wait_event_export_ready: +; GCN: s_wait_event 0x2 +define amdgpu_ps void @test_wait_event_export_ready() { entry: call void @llvm.amdgcn.s.wait.event.export.ready() ret void } + +; GCN-LABEL: {{^}}test_wait_event_0: +; GCN: s_wait_event 0x0 +define amdgpu_ps void @test_wait_event_0() { + call void @llvm.amdgcn.s.wait.event(i16 0) + ret void +} + +; GCN-LABEL: {{^}}test_wait_event_1: +; GCN: s_wait_event 0x1 +define amdgpu_ps void @test_wait_event_1() { + call void @llvm.amdgcn.s.wait.event(i16 1) + ret void +} + 
+; GCN-LABEL: {{^}}test_wait_event_2: +; GCN: s_wait_event 0x2 +define amdgpu_ps void @test_wait_event_2() { + call void @llvm.amdgcn.s.wait.event(i16 2) + ret void +} + +; GCN-LABEL: {{^}}test_wait_event_3: +; GCN: s_wait_event 0x3 +define amdgpu_ps void @test_wait_event_3() { + call void @llvm.amdgcn.s.wait.event(i16 3) + ret void +} + +; GCN-LABEL: {{^}}test_wait_event_max: +; GCN: s_wait_event 0xffff +define amdgpu_ps void @test_wait_event_max() { + call void @llvm.amdgcn.s.wait.event(i16 -1) + ret void +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
