[gem5-dev] [L] Change in gem5/gem5[develop]: gpu-compute,mem-ruby: Add support for GPU cache bypassing

2023-01-03 Thread VISHNU RAMADAS (Gerrit) via gem5-dev
VISHNU RAMADAS has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email )


(
7 is the latest approved patch-set.
No files were changed between the latest approved patch-set and the
submitted one.
)

Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing
..

gpu-compute,mem-ruby: Add support for GPU cache bypassing

The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds cache bypass support by introducing new transitions in the
coherence protocol used by the GPU memory system. Now, instructions with
the GLC bit set will not cache in the L1, and instructions with the SLC
bit set will not cache in the L1 or the L2.
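
As a rough illustration of how these bits travel through the memory
system (a sketch, not code from this patch; the requestor-side names
inst_has_glc/inst_has_slc are hypothetical, while GLC_BIT, SLC_BIT, and
the accessors follow the request.hh interface shown in the diff below):

    // Hedged sketch: propagate the AMDGPU GLC/SLC modifiers into a
    // gem5 Request so Ruby can select the right bypass transitions.
    RequestPtr req = std::make_shared<Request>(
        paddr, size, Request::Flags(0), requestorId);
    if (inst_has_glc)   // hypothetical decoded-instruction query
        req->setCacheCoherenceFlags(Request::GLC_BIT);
    if (inst_has_slc)   // hypothetical decoded-instruction query
        req->setCacheCoherenceFlags(Request::SLC_BIT);
    // Downstream, the protocol consults req->isGLCSet() and
    // req->isSLCSet() (added below) to decide which levels may allocate.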

Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66991
Reviewed-by: Jason Lowe-Power 
Maintainer: Matt Sinclair 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-TCC.sm
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_MemControl.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/RubyRequest.hh
9 files changed, 337 insertions(+), 8 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, but someone else must approve; Looks good to me, approved

  Jason Lowe-Power: Looks good to me, approved
  kokoro: Regressions pass




diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 9238dbe..a80b918 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1101,6 +1101,16 @@
 }

 /**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+bool isGLCSet() const { return req->isGLCSet(); }
+bool isSLCSet() const { return req->isSLCSet(); }
+
+/**
  * Check if packet corresponds to a given block-aligned address and
  * address space.
  *
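For example, a controller receiving such a packet could select the
bypass path along these lines (an illustrative sketch only; the handler
and the forwarding helpers are hypothetical names, not gem5 APIs):

    // Hypothetical consumer of the new packet accessors.
    void
    handleRequest(PacketPtr pkt)
    {
        if (pkt->isSLCSet()) {
            // System-level coherent: bypass both TCP (L1) and TCC (L2).
            forwardToMemory(pkt);
        } else if (pkt->isGLCSet()) {
            // Globally coherent: bypass the TCP only; the TCC may allocate.
            forwardToTCC(pkt);
        } else {
            allocateInTCP(pkt);
        }
    }
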
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 39d9d72..6a0cbc2 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -1071,6 +1071,17 @@

 bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }

+
+/**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+bool isGLCSet() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+bool isSLCSet() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
+
 /**
 * Accessor functions for the memory space configuration flags and used by
 * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 032a64c..ae14247 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -56,8 +56,10 @@
   enumeration(Event, desc="TCC Events") {
 // Requests coming from the Cores
 RdBlk,  desc="RdBlk event";
+RdBypassEvict,  desc="Bypass L2 on reads. Evict if cache block already allocated";
 WrVicBlk,   desc="L1 Write Through";
 WrVicBlkBack,   desc="L1 Write Through(dirty cache)";
+WrVicBlkEvict,  desc="L1 Write Through(dirty cache) and evict";
 Atomic, desc="Atomic Op";
 AtomicDone, desc="AtomicOps Complete";
 AtomicNotDone,  desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@
 PrbInv, desc="Invalidating probe";
 // Coming from Memory Controller
 WBAck,  desc="writethrough ack from memory";
+Bypass, desc="Bypass the entire L2 cache";
   }

   // STATES
@@ -107,6 +110,8 @@
 NetDest Destination, desc="Data destination";
 int numAtomics, desc="number remaining atomics";
 int atomicDoneCnt,  desc="number AtomicDones triggered";
+bool isGLCSet,  desc="Bypass L1 Cache";
+bool isSLCSet,  desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
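
Conceptually, the TCC uses these TBE bits to steer reads onto the new
bypass events. In C++-style pseudocode (a sketch of the intent, not the
literal SLICC from this patch):

    // Sketch: read-event selection at the TCC (L2). Only SLC requests
    // bypass the TCC; GLC-only requests may still allocate here.
    Event
    selectReadEvent(bool isSLCSet)
    {
        // Service the read without allocating in the L2, and evict any
        // copy already present (RdBypassEvict).
        return isSLCSet ? Event::RdBypassEvict : Event::RdBlk;
    }
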
@@ -173,7 +178,6 @@

   int functionalWrite(Addr addr, Packet *pkt) {
 int num_functional_writes := 0;
-
 TBE tbe := TBEs.lookup(addr);
 if(is_valid(tbe)) {
   num_functional_writes := num_functional_writes +
@@ -279,7 +283,11 @@
   peek(responseFromNB_in, ResponseMsg, 

[gem5-dev] [L] Change in gem5/gem5[develop]: gpu-compute,mem-ruby: Add support for GPU cache bypassing

2022-12-27 Thread VISHNU RAMADAS (Gerrit) via gem5-dev
VISHNU RAMADAS has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email )



Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing
..

gpu-compute,mem-ruby: Add support for GPU cache bypassing

The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds cache bypass support by introducing new transitions in the
coherence protocol used by the GPU memory system. Now, instructions with
the GLC bit set will not cache in the L1, and instructions with the SLC
bit set will not cache in the L1 or the L2.

Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
---
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-TCC.sm
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_MemControl.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/RubyRequest.hh
9 files changed, 286 insertions(+), 8 deletions(-)



diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 9238dbe..224a7b5 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -418,6 +418,11 @@
  */
 uint64_t htmTransactionUid;

+/**
+ * Track whether the request has the GLC_BIT
+ * or SLC_BIT flags set to bypass caches.
+ */
+
   public:

 /**
@@ -886,6 +891,10 @@
 addr = req->getPaddr();
 flags.set(VALID_ADDR);
 _isSecure = req->isSecure();
+
+/**
+ * Set the flags to track the original request's GLC and SLC bits.
+ */
 }

 /**
@@ -1101,6 +1110,16 @@
 }

 /**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+bool isGLCSet() const { return req->isGLCSet(); }
+bool isSLCSet() const { return req->isSLCSet(); }
+
+/**
  * Check if packet corresponds to a given block-aligned address and
  * address space.
  *
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 39d9d72..6a0cbc2 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -1071,6 +1071,17 @@

 bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }

+
+/**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+bool isGLCSet() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+bool isSLCSet() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
+
 /**
 * Accessor functions for the memory space configuration flags and used by
 * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 032a64c..13acc02 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -56,8 +56,10 @@
   enumeration(Event, desc="TCC Events") {
 // Requests coming from the Cores
 RdBlk,  desc="RdBlk event";
+RdBypassEvict,  desc="Bypass L2 on reads. Evict if cache block already allocated";
 WrVicBlk,   desc="L1 Write Through";
 WrVicBlkBack,   desc="L1 Write Through(dirty cache)";
+WrVicBlkEvict,  desc="L1 Write Through(dirty cache) and evict";
 Atomic, desc="Atomic Op";
 AtomicDone, desc="AtomicOps Complete";
 AtomicNotDone,  desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@
 PrbInv, desc="Invalidating probe";
 // Coming from Memory Controller
 WBAck,  desc="writethrough ack from memory";
+Bypass, desc="Bypass the entire L2 cache";
   }

   // STATES
@@ -107,6 +110,8 @@
 NetDest Destination, desc="Data destination";
 int numAtomics, desc="number remaining atomics";
 int atomicDoneCnt,  desc="number AtomicDones triggered";
+bool isGLCSet,  desc="Bypass L1 Cache";
+bool isSLCSet,  desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
@@ -173,7 +178,6 @@

   int functionalWrite(Addr addr, Packet *pkt) {
 int num_functional_writes := 0;
-
 TBE tbe := TBEs.lookup(addr);
 if(is_valid(tbe)) {
   num_functional_writes := num_functional_writes +
@@ -279,7 +283,9 @@
   peek(responseFromNB_in, ResponseMsg,