https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/199622
>From d03a68a37b3c775ae68d21eef81871fc36cf55bd Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Mon, 25 May 2026 13:14:43 +0530 Subject: [PATCH 1/2] [Clang][AMDGPU] Add ``amdgcn_av("none")`` attribute for atomic expressions Add a statement attribute that suppresses MakeAvailable/MakeVisible cache operations on AMDGPU atomic instructions while preserving memory ordering (waits). The attribute takes a string argument specifying the mode. Currently "none" is the only supported mode. The resulting atomic or fence instruction carries !mmra !{!"amdgcn-av", !"none"} metadata. Assisted-By: Claude Opus 4.6 --- clang/include/clang/Basic/Attr.td | 6 + clang/include/clang/Basic/AttrDocs.td | 23 ++++ clang/include/clang/Basic/DiagnosticGroups.td | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 4 + clang/lib/CodeGen/CGAtomic.cpp | 9 +- clang/lib/CodeGen/CGBuiltin.cpp | 68 +++++---- clang/lib/CodeGen/CGStmt.cpp | 5 + clang/lib/CodeGen/CodeGenFunction.h | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + clang/lib/CodeGen/Targets/AMDGPU.cpp | 6 + clang/lib/Sema/SemaStmtAttr.cpp | 57 ++++++++ .../CodeGen/AMDGPU/amdgcn-av-non-atomic.cpp | 43 ++++++ .../CodeGen/AMDGPU/amdgcn-av-none-attr.cpp | 130 ++++++++++++++++++ 13 files changed, 331 insertions(+), 27 deletions(-) create mode 100644 clang/test/CodeGen/AMDGPU/amdgcn-av-non-atomic.cpp create mode 100644 clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr.cpp diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 7f7e9489782a7..53fa4c8ee9036 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2522,6 +2522,12 @@ def AMDGPUMaxNumWorkGroups : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGCNAV : StmtAttr, TargetSpecificAttr<TargetAMDGPU> { + let Spellings = [Clang<"amdgcn_av">]; + let Args = [StringArgument<"Mode">]; + let Documentation = [AMDGCNAVDocs]; +} + def BPFPreserveAccessIndex : 
InheritableAttr, TargetSpecificAttr<TargetBPF> { let Spellings = [Clang<"preserve_access_index">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index dab778d4047aa..b77d71d7202fc 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3577,6 +3577,29 @@ An error will be given if: }]; } +def AMDGCNAVDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +This attribute controls MakeAvailable and MakeVisible cache operations on +AMDGPU synchronization operations. It takes a string argument specifying the +mode. + +When placed on a statement containing a C/C++ atomic builtin call, the +resulting atomic or fence instruction will carry ``!mmra !{!"amdgcn-av", +!"<mode>"}`` metadata. + +The supported modes are: + +- ``"none"``: Skip cache writeback (on release) and cache invalidation (on + acquire), while preserving memory ordering (waits). + +.. code-block:: c++ + + [[clang::amdgcn_av("none")]] __atomic_thread_fence(__ATOMIC_SEQ_CST); + [[clang::amdgcn_av("none")]] __atomic_fetch_add(ptr, 1, __ATOMIC_ACQ_REL); + }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 8031f99419bdc..51787935e1280 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1968,3 +1968,5 @@ def ExperimentalOption : DiagGroup<"experimental-option">; // Warnings about unguarded usages of AMDGPU target specific constructs def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">; + +def AMDGCNAVNonAtomic : DiagGroup<"amdgcn-av-non-atomic">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 
a3b575b7ee63a..e314052dd9397 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -14259,6 +14259,10 @@ def note_amdgcn_unguarded_builtin_silence def err_amdgcn_global_or_flat_pointer_required : Error<"builtin requires a global or generic pointer">; +def warn_amdgcn_av_requires_atomic : Warning< + "%0 attribute only applies to atomic operations">, + InGroup<AMDGCNAVNonAtomic>; + def err_amdgcn_dmask_has_too_many_bits_set : Error<"dmask argument cannot have more bits set than there are elements " "in return type">; diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 270965b109943..92ee116ca8dc0 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -632,6 +632,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, llvm::LoadInst *Load = CGF.Builder.CreateLoad(Ptr); Load->setAtomic(Order, Scope); Load->setVolatile(E->isVolatile()); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Load, E); CGF.maybeAttachRangeForLoad(Load, E->getValueType(), E->getExprLoc()); auto *I = CGF.Builder.CreateStore(Load, Dest); CGF.addInstToCurrentSourceAtom(I, Load); @@ -649,6 +650,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, llvm::StoreInst *Store = CGF.Builder.CreateStore(LoadVal1, Ptr); Store->setAtomic(Order, Scope); Store->setVolatile(E->isVolatile()); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Store, E); CGF.addInstToCurrentSourceAtom(Store, LoadVal1); return; } @@ -795,6 +797,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, CGF.Builder.CreateStore(CGF.Builder.getInt8(0), Ptr); Store->setAtomic(Order, Scope); Store->setVolatile(E->isVolatile()); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Store, E); CGF.addInstToCurrentSourceAtom(Store, nullptr); return; } @@ -1586,6 +1589,7 @@ llvm::Value *AtomicInfo::EmitAtomicLoadOp(llvm::AtomicOrdering AO, 
Addr = castToAtomicIntPointer(Addr); llvm::LoadInst *Load = CGF.Builder.CreateLoad(Addr, "atomic-load"); Load->setAtomic(AO); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Load); // Other decoration. if (IsVolatile) @@ -1753,6 +1757,7 @@ std::pair<llvm::Value *, llvm::Value *> AtomicInfo::EmitAtomicCompareExchangeOp( // Other decoration. Inst->setVolatile(LVal.isVolatileQualified()); Inst->setWeak(IsWeak); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Inst); // Okay, turn that back into the original value type. auto *PreviousVal = CGF.Builder.CreateExtractValue(Inst, /*Idxs=*/0); @@ -2108,8 +2113,10 @@ void CodeGenFunction::EmitAtomicStore(RValue rvalue, LValue dest, else if (AO == llvm::AtomicOrdering::AcquireRelease) AO = llvm::AtomicOrdering::Release; // Initializations don't need to be atomic. - if (!isInit) + if (!isInit) { store->setAtomic(AO); + getTargetHooks().setTargetAtomicMetadata(*this, *store); + } // Other decoration. if (IsVolatile) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 682b125890fe1..54b1807a30739 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5663,24 +5663,28 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Value *Order = EmitScalarExpr(E->getArg(0)); if (isa<llvm::ConstantInt>(Order)) { int ord = cast<llvm::ConstantInt>(Order)->getZExtValue(); + llvm::FenceInst *Fence = nullptr; switch (ord) { case 0: // memory_order_relaxed default: // invalid order break; case 1: // memory_order_consume case 2: // memory_order_acquire - Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID); + Fence = Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID); break; case 3: // memory_order_release - Builder.CreateFence(llvm::AtomicOrdering::Release, SSID); + Fence = Builder.CreateFence(llvm::AtomicOrdering::Release, SSID); break; case 4: // memory_order_acq_rel - Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID); 
+ Fence = Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID); break; case 5: // memory_order_seq_cst - Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID); + Fence = Builder.CreateFence( + llvm::AtomicOrdering::SequentiallyConsistent, SSID); break; } + if (Fence) + getTargetHooks().setTargetAtomicMetadata(*this, *Fence); return RValue::get(nullptr); } @@ -5695,23 +5699,29 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB); Builder.SetInsertPoint(AcquireBB); - Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID); + getTargetHooks().setTargetAtomicMetadata( + *this, *Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID)); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(1), AcquireBB); SI->addCase(Builder.getInt32(2), AcquireBB); Builder.SetInsertPoint(ReleaseBB); - Builder.CreateFence(llvm::AtomicOrdering::Release, SSID); + getTargetHooks().setTargetAtomicMetadata( + *this, *Builder.CreateFence(llvm::AtomicOrdering::Release, SSID)); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(3), ReleaseBB); Builder.SetInsertPoint(AcqRelBB); - Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID); + getTargetHooks().setTargetAtomicMetadata( + *this, + *Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID)); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(4), AcqRelBB); Builder.SetInsertPoint(SeqCstBB); - Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID); + getTargetHooks().setTargetAtomicMetadata( + *this, *Builder.CreateFence( + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(5), SeqCstBB); @@ -5729,40 +5739,43 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, SyncScope SS = ScopeModel->isValid(Scp->getZExtValue()) ? 
ScopeModel->map(Scp->getZExtValue()) : ScopeModel->map(ScopeModel->getFallBackValue()); + llvm::FenceInst *Fence = nullptr; switch (Ord->getZExtValue()) { case 0: // memory_order_relaxed default: // invalid order break; case 1: // memory_order_consume case 2: // memory_order_acquire - Builder.CreateFence( + Fence = Builder.CreateFence( llvm::AtomicOrdering::Acquire, getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS, llvm::AtomicOrdering::Acquire, getLLVMContext())); break; case 3: // memory_order_release - Builder.CreateFence( + Fence = Builder.CreateFence( llvm::AtomicOrdering::Release, getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS, llvm::AtomicOrdering::Release, getLLVMContext())); break; case 4: // memory_order_acq_rel - Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, - getTargetHooks().getLLVMSyncScopeID( - getLangOpts(), SS, - llvm::AtomicOrdering::AcquireRelease, - getLLVMContext())); + Fence = Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, + getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, + llvm::AtomicOrdering::AcquireRelease, + getLLVMContext())); break; case 5: // memory_order_seq_cst - Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, - getTargetHooks().getLLVMSyncScopeID( - getLangOpts(), SS, - llvm::AtomicOrdering::SequentiallyConsistent, - getLLVMContext())); + Fence = Builder.CreateFence( + llvm::AtomicOrdering::SequentiallyConsistent, + getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, llvm::AtomicOrdering::SequentiallyConsistent, + getLLVMContext())); break; } + if (Fence) + getTargetHooks().setTargetAtomicMetadata(*this, *Fence); return RValue::get(nullptr); } @@ -5821,9 +5834,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, SyncScope SS = ScopeModel->isValid(Scp->getZExtValue()) ? 
ScopeModel->map(Scp->getZExtValue()) : ScopeModel->map(ScopeModel->getFallBackValue()); - Builder.CreateFence(Ordering, - getTargetHooks().getLLVMSyncScopeID( - getLangOpts(), SS, Ordering, getLLVMContext())); + llvm::FenceInst *Fence = Builder.CreateFence( + Ordering, getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, Ordering, getLLVMContext())); + getTargetHooks().setTargetAtomicMetadata(*this, *Fence); Builder.CreateBr(ContBB); } else { llvm::DenseMap<unsigned, llvm::BasicBlock *> BBs; @@ -5837,9 +5851,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, SI->addCase(Builder.getInt32(Scp), B); Builder.SetInsertPoint(B); - Builder.CreateFence(Ordering, getTargetHooks().getLLVMSyncScopeID( - getLangOpts(), ScopeModel->map(Scp), - Ordering, getLLVMContext())); + llvm::FenceInst *Fence = Builder.CreateFence( + Ordering, getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), ScopeModel->map(Scp), Ordering, + getLLVMContext())); + getTargetHooks().setTargetAtomicMetadata(*this, *Fence); Builder.CreateBr(ContBB); } } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 30756180ebafa..8a8d73c093f8b 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -778,6 +778,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { bool noinline = false; bool alwaysinline = false; bool noconvergent = false; + StringRef amdgcnAVMode; HLSLControlFlowHintAttr::Spelling flattenOrBranch = HLSLControlFlowHintAttr::SpellingNotCalculated; const CallExpr *musttail = nullptr; @@ -815,6 +816,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { case attr::Atomic: AA = cast<AtomicAttr>(A); break; + case attr::AMDGCNAV: + amdgcnAVMode = cast<AMDGCNAVAttr>(A)->getMode(); + break; case attr::HLSLControlFlowHint: { flattenOrBranch = cast<HLSLControlFlowHintAttr>(A)->getSemanticSpelling(); } break; @@ -824,6 +828,7 @@ void CodeGenFunction::EmitAttributedStmt(const 
AttributedStmt &S) { SaveAndRestore save_noinline(InNoInlineAttributedStmt, noinline); SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline); SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent); + SaveAndRestore save_amdgcnav(AMDGCNAVMode, amdgcnAVMode); SaveAndRestore save_musttail(MustTailCall, musttail); SaveAndRestore save_flattenOrBranch(HLSLControlFlowAttr, flattenOrBranch); CGAtomicOptionsRAII AORAII(CGM, AA); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index aeace0d789a61..9db9ae8397493 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -624,6 +624,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// True if the current statement has noconvergent attribute. bool InNoConvergentAttributedStmt = false; + /// The mode string from the amdgcn_av attribute on the current statement, + /// or empty if the attribute is not present. + StringRef AMDGCNAVMode; + /// HLSL Branch attribute. 
HLSLControlFlowHintAttr::Spelling HLSLControlFlowAttr = HLSLControlFlowHintAttr::SpellingNotCalculated; diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 21f32b12c4fd1..5def78f019833 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1891,6 +1891,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, FenceInst *Fence = Builder.CreateFence(AO, SSID); if (E->getNumArgs() > 2) AddAMDGPUFenceAddressSpaceMMRA(Fence, E); + getTargetHooks().setTargetAtomicMetadata(*this, *Fence); return Fence; } case AMDGPU::BI__builtin_amdgcn_atomic_inc32: diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index a3a596bb9d822..1c9ab22e41435 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -10,6 +10,7 @@ #include "TargetInfo.h" #include "clang/AST/DeclCXX.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/Support/AMDGPUAddrSpace.h" using namespace clang; @@ -560,6 +561,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata( AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange); } + if (!CGF.AMDGCNAVMode.empty()) { + llvm::MMRAMetadata::appendTags(AtomicInst, + {{"amdgcn-av", CGF.AMDGCNAVMode}}); + } + if (!RMW) return; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 58d5332565d10..b13b1d2256a92 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -351,6 +351,61 @@ static Attr *handleMustTailAttr(Sema &S, Stmt *St, const ParsedAttr &A, return ::new (S.Context) MustTailAttr(S.Context, A); } +/// Return true if St is an atomic operation or fence builtin call. 
+static bool isAtomicOp(const Stmt *St) {
+  const Expr *E = dyn_cast<Expr>(St);
+  if (!E)
+    return false;
+
+  E = E->IgnoreParenCasts();
+
+  if (isa<AtomicExpr>(E))
+    return true;
+
+  const CallExpr *CE = dyn_cast<CallExpr>(E);
+  if (!CE)
+    return false;
+
+  unsigned BuiltinID = CE->getBuiltinCallee();
+  switch (BuiltinID) {
+  case Builtin::BI__atomic_thread_fence:
+  case Builtin::BI__atomic_signal_fence:
+  case Builtin::BI__c11_atomic_thread_fence:
+  case Builtin::BI__c11_atomic_signal_fence:
+  case Builtin::BI__scoped_atomic_thread_fence:
+    return true;
+  default:
+    break;
+  }
+
+  // Check for target-specific fence builtins. Compare via getIdentifier():
+  // getName() asserts when the callee has no simple identifier (operators).
+  if (const FunctionDecl *FD = CE->getDirectCallee())
+    if (const IdentifierInfo *II = FD->getIdentifier())
+      return II->isStr("__builtin_amdgcn_fence");
+
+  return false;
+}
+
+static Attr *handleAMDGCNAVAttr(Sema &S, Stmt *St, const ParsedAttr &A,
+                                SourceRange Range) {
+  StringRef Mode;
+  if (!S.checkStringLiteralArgumentAttr(A, 0, Mode))
+    return nullptr;
+  // "none" is the only mode accepted; anything else is diagnosed and dropped.
+  if (Mode != "none") {
+    S.Diag(A.getLoc(), diag::warn_attribute_type_not_supported) << A << Mode;
+    return nullptr;
+  }
+  // On a statement that is not an atomic op, warn and drop the attribute.
+  if (!isAtomicOp(St)) {
+    S.Diag(A.getLoc(), diag::warn_amdgcn_av_requires_atomic) << A;
+    return nullptr;
+  }
+
+  return ::new (S.Context) AMDGCNAVAttr(S.Context, A, Mode);
+}
+
 static Attr *handleLikely(Sema &S, Stmt *St, const ParsedAttr &A,
                           SourceRange Range) {
 
@@ -730,6 +785,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
     return handleNoInlineAttr(S, St, A, Range);
   case ParsedAttr::AT_MustTail:
     return handleMustTailAttr(S, St, A, Range);
+  case ParsedAttr::AT_AMDGCNAV:
+    return handleAMDGCNAVAttr(S, St, A, Range);
   case ParsedAttr::AT_Likely:
     return handleLikely(S, St, A, Range);
   case ParsedAttr::AT_Unlikely:
diff --git a/clang/test/CodeGen/AMDGPU/amdgcn-av-non-atomic.cpp b/clang/test/CodeGen/AMDGPU/amdgcn-av-non-atomic.cpp
new file mode 100644
index 0000000000000..1f193a865be97
--- /dev/null
+++ b/clang/test/CodeGen/AMDGPU/amdgcn-av-non-atomic.cpp
@@ -0,0 
+1,43 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 %s -emit-llvm -O0 -verify -o - \ +// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s + +// Test that [[clang::amdgcn_av("none")]] on non-atomic statements emits a +// warning and does NOT produce !mmra metadata. + +// CHECK-LABEL: define {{.*}} @_Z16test_plain_storePii( +// CHECK-NOT: !mmra +// CHECK: ret void +void test_plain_store(int *p, int val) { + [[clang::amdgcn_av("none")]] *p = val; // expected-warning {{'clang::amdgcn_av' attribute only applies to atomic operations}} +} + +// CHECK-LABEL: define {{.*}} @_Z15test_plain_callv( +// CHECK-NOT: !mmra +// CHECK: ret void +extern void foo(); +void test_plain_call() { + [[clang::amdgcn_av("none")]] foo(); // expected-warning {{'clang::amdgcn_av' attribute only applies to atomic operations}} +} + +// CHECK-LABEL: define {{.*}} @_Z18test_for_with_atomPi( +// CHECK-NOT: !mmra +// CHECK: ret void +void test_for_with_atom(int *p) { + // The attribute on a for loop should warn even if the body contains atomics. + [[clang::amdgcn_av("none")]] for (;;) { // expected-warning {{'clang::amdgcn_av' attribute only applies to atomic operations}} + __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST); + break; + } +} + +// The attribute on an if statement should warn even if the condition is atomic. 
+// CHECK-LABEL: define {{.*}} @_Z20test_if_atomic_condnPi( +// CHECK-NOT: !mmra +// CHECK: ret void +void test_if_atomic_condn(int *p) { + [[clang::amdgcn_av("none")]] if (__atomic_load_n(p, __ATOMIC_ACQUIRE)) { // expected-warning {{'clang::amdgcn_av' attribute only applies to atomic operations}} + } +} + +// CHECK-NOT: amdgcn-av diff --git a/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr.cpp b/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr.cpp new file mode 100644 index 0000000000000..1a489d94e7d29 --- /dev/null +++ b/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr.cpp @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \ +// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s + +// Test that [[clang::amdgcn_av("none")]] on atomic builtins emits +// !mmra !{!"amdgcn-av", !"none"} metadata on the resulting IR instructions. + +// CHECK-LABEL: define dso_local void @_Z21test_atomic_fetch_addPi( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4, !mmra [[META2:![0-9]+]], !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: ret void +// +void test_atomic_fetch_add(int *p) { + [[clang::amdgcn_av("none")]] __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST); +} + +// CHECK-LABEL: define dso_local void @_Z17test_atomic_storePi( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 42, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] release, align 4, !mmra [[META2]] +// CHECK-NEXT: ret void +// +void test_atomic_store(int *p) { + [[clang::amdgcn_av("none")]] __atomic_store_n(p, 42, __ATOMIC_RELEASE); +} + +// CHECK-LABEL: define dso_local void @_Z16test_atomic_loadPi( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] acquire, align 4, !mmra [[META2]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: ret void +// +void test_atomic_load(int *p) { + [[clang::amdgcn_av("none")]] __atomic_load_n(p, 
__ATOMIC_ACQUIRE); +} + +// CHECK-LABEL: define dso_local void @_Z19test_atomic_cmpxchgPi( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[EXPECTED:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: [[EXPECTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[EXPECTED]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 0, ptr [[EXPECTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[EXPECTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] seq_cst acquire, align 4, !mmra [[META2]] +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// CHECK-NEXT: br i1 [[TMP5]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]] +// CHECK: cmpxchg.store_expected: +// CHECK-NEXT: store i32 [[TMP4]], ptr [[EXPECTED_ASCAST]], align 4 +// CHECK-NEXT: br label [[CMPXCHG_CONTINUE]] +// CHECK: cmpxchg.continue: +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 +// CHECK-NEXT: ret void +// +void test_atomic_cmpxchg(int *p) { + int expected = 0; + [[clang::amdgcn_av("none")]] 
__atomic_compare_exchange_n(p, &expected, 1, false, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); +} + +// CHECK-LABEL: define dso_local void @_Z24test_atomic_thread_fencev( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: fence seq_cst, !mmra [[META2]] +// CHECK-NEXT: ret void +// +void test_atomic_thread_fence() { + [[clang::amdgcn_av("none")]] __atomic_thread_fence(__ATOMIC_SEQ_CST); +} + +// Verify that without the attribute, no !mmra metadata is emitted. +// CHECK-LABEL: define dso_local void @_Z12test_no_attrPi( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: ret void +// +void test_no_attr(int *p) { + __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST); +} +//. +// CHECK: [[META2]] = !{!"amdgcn-av", !"none"} +// CHECK: [[META4]] = !{} +//. 
>From 7b0d45be4858a0951a8597fee6777c90522227f9 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <[email protected]> Date: Mon, 22 Jun 2026 12:14:51 +0530 Subject: [PATCH 2/2] fix attr doc; add test that got left behind --- clang/include/clang/Basic/AttrDocs.td | 17 +++-- .../AMDGPU/amdgcn-av-none-attr-c-atomic.c | 72 +++++++++++++++++++ 2 files changed, 80 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr-c-atomic.c diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index b77d71d7202fc..51b39d273a9be 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3580,18 +3580,17 @@ An error will be given if: def AMDGCNAVDocs : Documentation { let Category = DocCatAMDGPUAttributes; let Content = [{ -This attribute controls MakeAvailable and MakeVisible cache operations on -AMDGPU synchronization operations. It takes a string argument specifying the -mode. -When placed on a statement containing a C/C++ atomic builtin call, the -resulting atomic or fence instruction will carry ``!mmra !{!"amdgcn-av", -!"<mode>"}`` metadata. +This attribute controls availability and visibility as described in the +`AMDGPU Memory Model <https://llvm.org/docs/AMDGPUMemoryModel.html>`__. +When placed on a statement containing a C/C++ atomic builtin call, the resulting +atomic or fence instruction carries the corresponding *AV Metadata*. -The supported modes are: +The attribute takes a string literal as an argument, which currently has only +one supported value: -- ``"none"``: Skip cache writeback (on release) and cache invalidation (on - acquire), while preserving memory ordering (waits). +- ``"none"``: Disable MakeAvailable and MakeVisible semantics on release and + acquire operations respectively. .. 
code-block:: c++ diff --git a/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr-c-atomic.c b/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr-c-atomic.c new file mode 100644 index 0000000000000..dc68addeeacd7 --- /dev/null +++ b/clang/test/CodeGen/AMDGPU/amdgcn-av-none-attr-c-atomic.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \ +// RUN: -triple=amdgcn-amd-amdhsa -std=c23 | FileCheck %s + +// Test that [[clang::amdgcn_av("none")]] works with _Atomic type +// qualifier operations, which go through CGExprScalar and CGAtomic +// codegen paths. + +// CHECK-LABEL: define dso_local void @test_atomic_rmw_add( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 seq_cst, align 4, !mmra [[META2:![0-9]+]], !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +// CHECK-NEXT: ret void +// +void test_atomic_rmw_add(_Atomic int *p) { + [[clang::amdgcn_av("none")]] *p += 1; +} + +// CHECK-LABEL: define dso_local void @test_atomic_store( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store atomic i32 42, ptr [[TMP0]] seq_cst, 
align 4, !mmra [[META2]] +// CHECK-NEXT: ret void +// +void test_atomic_store(_Atomic int *p) { + [[clang::amdgcn_av("none")]] *p = 42; +} + +// CHECK-LABEL: define dso_local void @test_atomic_load( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i32, ptr [[TMP0]] seq_cst, align 4, !mmra [[META2]] +// CHECK-NEXT: ret void +// +void test_atomic_load(_Atomic int *p) { + [[clang::amdgcn_av("none")]] (void)*p; +} + +// --- Negative control: no attribute --- + +// CHECK-LABEL: define dso_local void @test_no_attr( +// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +// CHECK-NEXT: ret void +// +void test_no_attr(_Atomic int *p) { + *p += 1; +} +//. +// CHECK: [[META2]] = !{!"amdgcn-av", !"none"} +// CHECK: [[META3]] = !{} +//. _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
