llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> Also breaks the long inheritance chains by making both `SIGfx10CacheControl` and `SIGfx12CacheControl` inherit from `SICacheControl` directly. With this patch, we now just have 3 `SICacheControl` implementations that each do their own thing, and there is no more code hidden 3 superclasses above (which made this code harder to read and maintain than it needed to be). --- Full diff: https://github.com/llvm/llvm-project/pull/168058.diff 1 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+38-120) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index b3c56c325e210..562d21a0cd6cf 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -404,7 +404,7 @@ class SICacheControl { /// Generates code sequences for the memory model of all GFX targets below /// GFX10. -class SIGfx6CacheControl : public SICacheControl { +class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl { Position Pos) const override; }; -class SIGfx10CacheControl : public SIGfx6CacheControl { +/// Generates code sequences for the memory model of GFX10/11. +class SIGfx10CacheControl final : public SICacheControl { public: - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, @@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl { bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); + } }; -class SIGfx12CacheControl : public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. @@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases // the behavior is the same if assuming GFX12.0 in CU mode. assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); @@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation < AMDGPUSubtarget::GFX10) return std::make_unique<SIGfx6CacheControl>(ST); - if (Generation < AMDGPUSubtarget::GFX11) - return std::make_unique<SIGfx10CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX12) - return std::make_unique<SIGfx11CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST); return std::make_unique<SIGfx12CacheControl>(ST); } @@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, } bool SIGfx10CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; @@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::AGENT: // Set the L0 and L1 cache policies to MISS_EVICT. // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); + // For GFX10, set GLC+DLC, for GFX11, only set GLC. + Changed |= + enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0)); break; case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of @@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); } + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( Changed |= enableCPolBits(MI, CPol::GLC); Changed |= enableCPolBits(MI, CPol::SLC); + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + return Changed; } @@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx11CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L0 and L1 cache policies to MISS_EVICT. - // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableCPolBits(MI, CPol::GLC); - break; - case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode all waves of a work-group are on the same CU, and so the L0 - // does not need to be bypassed. - if (!ST.isCuModeEnabled()) - Changed |= enableCPolBits(MI, CPol::GLC); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L0 and L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache coherent bypass control at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableCPolBits(MI, CPol::GLC); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableCPolBits(MI, CPol::DLC); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered, - /*AtomicsOnly=*/false); - return Changed; - } - - if (IsNonTemporal) { - // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT - // and L2 cache policy to STREAM. - // For stores setting both GLC and SLC configures L0 and L1 cache policy - // to MISS_EVICT and the L2 cache policy to STREAM. - if (Op == SIMemOp::STORE) - Changed |= enableCPolBits(MI, CPol::GLC); - Changed |= enableCPolBits(MI, CPol::SLC); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableCPolBits(MI, CPol::DLC); - return Changed; - } - - return Changed; -} - bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); `````````` </details> https://github.com/llvm/llvm-project/pull/168058 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
