[gem5-dev] [L] Change in gem5/gem5[develop]: gpu-compute,mem-ruby: Add support for GPU cache bypassing
VISHNU RAMADAS has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email ) ( 7 is the latest approved patch-set. No files were changed between the latest approved patch-set and the submitted one. ) Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing .. gpu-compute,mem-ruby: Add support for GPU cache bypassing The GPU cache models do not support cache bypassing when the GLC or SLC AMDGPU instruction modifiers are used in a load or store. This commit adds cache bypass support by introducing new transitions in the coherence protocol used by the GPU memory system. Now, instructions with the GLC bit set will not cache in the L1 and instructions with SLC bit set will not cache in L1 or L2. Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66991 Reviewed-by: Jason Lowe-Power Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/mem/packet.hh M src/mem/request.hh M src/mem/ruby/protocol/GPU_VIPER-TCC.sm M src/mem/ruby/protocol/GPU_VIPER-TCP.sm M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm M src/mem/ruby/protocol/RubySlicc_MemControl.sm M src/mem/ruby/protocol/RubySlicc_Types.sm M src/mem/ruby/slicc_interface/RubyRequest.hh 9 files changed, 337 insertions(+), 8 deletions(-) Approvals: Matt Sinclair: Looks good to me, but someone else must approve; Looks good to me, approved Jason Lowe-Power: Looks good to me, approved kokoro: Regressions pass diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbe..a80b918 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1101,6 +1101,16 @@ } /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. 
+ * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ +bool isGLCSet() const { return req->isGLCSet();} +bool isSLCSet() const { return req->isSLCSet();} + +/** * Check if packet corresponds to a given block-aligned address and * address space. * diff --git a/src/mem/request.hh b/src/mem/request.hh index 39d9d72..6a0cbc2 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1071,6 +1071,17 @@ bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); } + +/** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ +bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); } +bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); } + /** * Accessor functions for the memory space configuration flags and used by * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 032a64c..ae14247 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -56,8 +56,10 @@ enumeration(Event, desc="TCC Events") { // Requests coming from the Cores RdBlk, desc="RdBlk event"; +RdBypassEvict, desc="Bypass L2 on reads. 
Evict if cache block already allocated"; WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; +WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; Atomic, desc="Atomic Op"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; @@ -68,6 +70,7 @@ PrbInv, desc="Invalidating probe"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; +Bypass, desc="Bypass the entire L2 cache"; } // STATES @@ -107,6 +110,8 @@ NetDest Destination, desc="Data destination"; int numAtomics, desc="number remaining atomics"; int atomicDoneCnt, desc="number AtomicDones triggered"; +bool isGLCSet, desc="Bypass L1 Cache"; +bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -173,7 +178,6 @@ int functionalWrite(Addr addr, Packet *pkt) { int num_functional_writes := 0; - TBE tbe := TBEs.lookup(addr); if(is_valid(tbe)) { num_functional_writes := num_functional_writes + @@ -279,7 +283,11 @@ peek(responseFromNB_in, ResponseMsg,
[gem5-dev] [L] Change in gem5/gem5[develop]: gpu-compute,mem-ruby: Add support for GPU cache bypassing
VISHNU RAMADAS has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email ) Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing .. gpu-compute,mem-ruby: Add support for GPU cache bypassing The GPU cache models do not support cache bypassing when the GLC or SLC AMDGPU instruction modifiers are used in a load or store. This commit adds cache bypass support by introducing new transitions in the coherence protocol used by the GPU memory system. Now, instructions with the GLC bit set will not cache in the L1 and instructions with SLC bit set will not cache in L1 or L2. Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3 --- M src/mem/packet.hh M src/mem/request.hh M src/mem/ruby/protocol/GPU_VIPER-TCC.sm M src/mem/ruby/protocol/GPU_VIPER-TCP.sm M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm M src/mem/ruby/protocol/RubySlicc_MemControl.sm M src/mem/ruby/protocol/RubySlicc_Types.sm M src/mem/ruby/slicc_interface/RubyRequest.hh 9 files changed, 286 insertions(+), 8 deletions(-) diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbe..224a7b5 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -418,6 +418,11 @@ */ uint64_t htmTransactionUid; +/** + * Track whether the request has the GLC_BIT + * or SLC_BIT flags set to bypass caches. + */ + public: /** @@ -886,6 +891,10 @@ addr = req->getPaddr(); flags.set(VALID_ADDR); _isSecure = req->isSecure(); + +/** + * Set the flags to track original request's GLC and SLC bits + */ } /** @@ -1101,6 +1110,16 @@ } /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. 
+ */ +bool isGLCSet() const { return req->isGLCSet();} +bool isSLCSet() const { return req->isSLCSet();} + +/** * Check if packet corresponds to a given block-aligned address and * address space. * diff --git a/src/mem/request.hh b/src/mem/request.hh index 39d9d72..6a0cbc2 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1071,6 +1071,17 @@ bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); } + +/** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ +bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); } +bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); } + /** * Accessor functions for the memory space configuration flags and used by * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 032a64c..13acc02 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -56,8 +56,10 @@ enumeration(Event, desc="TCC Events") { // Requests coming from the Cores RdBlk, desc="RdBlk event"; +RdBypassEvict, desc="Bypass L2 on reads. 
Evict if cache block already allocated"; WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; +WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; Atomic, desc="Atomic Op"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; @@ -68,6 +70,7 @@ PrbInv, desc="Invalidating probe"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; +Bypass, desc="Bypass the entire L2 cache"; } // STATES @@ -107,6 +110,8 @@ NetDest Destination, desc="Data destination"; int numAtomics, desc="number remaining atomics"; int atomicDoneCnt, desc="number AtomicDones triggered"; +bool isGLCSet, desc="Bypass L1 Cache"; +bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -173,7 +178,6 @@ int functionalWrite(Addr addr, Packet *pkt) { int num_functional_writes := 0; - TBE tbe := TBEs.lookup(addr); if(is_valid(tbe)) { num_functional_writes := num_functional_writes + @@ -279,7 +283,9 @@ peek(responseFromNB_in, ResponseMsg,