VISHNU RAMADAS has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email )
(
7 is the latest approved patch-set.
No files were changed between the latest approved patch-set and the
submitted one.
)Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing
......................................................................
gpu-compute,mem-ruby: Add support for GPU cache bypassing
The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds cache bypass support by introducing new transitions in the
coherence protocol used by the GPU memory system. Now, instructions with
the GLC bit set will not cache in the L1 and instructions with SLC bit
set will not cache in L1 or L2.
Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66991
Reviewed-by: Jason Lowe-Power <power...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
---
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-TCC.sm
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_MemControl.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/RubyRequest.hh
9 files changed, 337 insertions(+), 8 deletions(-)
Approvals:
Matt Sinclair: Looks good to me, but someone else must approve; Looks
good to me, approved
Jason Lowe-Power: Looks good to me, approved
kokoro: Regressions pass
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 9238dbe..a80b918 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1101,6 +1101,16 @@
}
/**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+ bool isGLCSet() const { return req->isGLCSet();}
+ bool isSLCSet() const { return req->isSLCSet();}
+
+ /**
* Check if packet corresponds to a given block-aligned address and
* address space.
*
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 39d9d72..6a0cbc2 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -1071,6 +1071,17 @@
bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }
+
+ /**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+ bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); }
+ bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); }
+
/**
* Accessor functions for the memory space configuration flags and
used by
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note
that
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 032a64c..ae14247 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -56,8 +56,10 @@
enumeration(Event, desc="TCC Events") {
// Requests coming from the Cores
RdBlk, desc="RdBlk event";
+ RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block
already allocated";
WrVicBlk, desc="L1 Write Through";
WrVicBlkBack, desc="L1 Write Through(dirty cache)";
+ WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict";
Atomic, desc="Atomic Op";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@
PrbInv, desc="Invalidating probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
+ Bypass, desc="Bypass the entire L2 cache";
}
// STATES
@@ -107,6 +110,8 @@
NetDest Destination, desc="Data destination";
int numAtomics, desc="number remaining atomics";
int atomicDoneCnt, desc="number AtomicDones triggered";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -173,7 +178,6 @@
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
-
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
@@ -279,7 +283,11 @@
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
- if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.isSLCSet) {
+ // If the SLC bit is set, the response needs to bypass the
cache
+ // and should not be allocated an entry.
+ trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:Data, in_msg.addr, cache_entry, tbe);
} else {
@@ -313,7 +321,18 @@
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.Type == CoherenceRequestType:WriteThrough) {
- if(WB) {
+ if (in_msg.isSLCSet) {
+ // The request should bypass the cache if SLC bit is set.
+ // If the cache entry exists already, then evict it.
+ // Else, perform a normal cache access.
+ // The cache entry is allocated only on response and
bypass is
+ // handled there
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry,
tbe);
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else if(WB) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry,
tbe);
} else {
@@ -326,7 +345,13 @@
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
- trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ if (in_msg.isSLCSet) {
+ // If SLC bit is set, the request needs to go directly to
memory.
+ // If a cache block already exists, then evict it.
+ trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ }
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -354,6 +379,8 @@
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -371,15 +398,46 @@
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
DPRINTF(RubySlicc, "%s\n", out_msg);
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
+ action(rb_bypassDone, "rb", desc="bypass L2 of read access") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination := tbe.Destination;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
action(rd_requestData, "r", desc="Miss in L2, pass on") {
if(tbe.Destination.count()==1){
@@ -391,6 +449,8 @@
out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
out_msg.Shared := false; // unneeded for this request
out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -407,6 +467,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -421,6 +484,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -434,6 +500,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -466,6 +535,8 @@
peek(coreRequestNetwork_in, CPURequestMsg) {
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type ==
CoherenceRequestType:Atomic){
tbe.Destination.add(in_msg.Requestor);
+ tbe.isGLCSet := in_msg.isGLCSet;
+ tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -505,6 +576,8 @@
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -520,6 +593,10 @@
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
@@ -534,6 +611,8 @@
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.Dirty := true;
out_msg.writeMask.orMask(in_msg.writeMask);
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -549,6 +628,10 @@
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
out_msg.MessageSize := MessageSizeType:Response_Control;
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -592,6 +675,10 @@
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
out_msg.Type := TriggerType:AtomicDone;
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
}
@@ -659,6 +746,54 @@
p_popRequestQueue;
}
+ transition(I, RdBypassEvict) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// an entry in state W. It evicts and invalidates the cache entry before
+// forwarding the request to global memory
+ transition(W, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// entry in state M. It evicts and invalidates the cache entry before
+// forwarding the request to main memory
+ transition(M, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// entry in state V. It invalidates the cache entry before forwarding the
+// request to global memory.
+ transition(V, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag arrives at
entry
+// in transient state. The request stalls until the pending transition is
complete.
+ transition({WI, IV}, RdBypassEvict) {
+ st_stallAndWaitRequest;
+ }
+
transition(V, Atomic, A) {TagArrayRead} {
p_profileHit;
i_invL2;
@@ -730,6 +865,31 @@
p_popRequestQueue;
}
+// Transition to be called when a write request with SLC bit set arrives
at an
+// entry with state V. The entry has to be evicted and invalidated before
the
+// request is forwarded to global memory
+ transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite,
DataArrayWrite} {
+ p_profileMiss;
+ ut_updateTag;
+ t_allocateTBE;
+ wt_writeThrough;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a write request with SLC bit set arrives
at an
+// entry with state W. The entry has to be evicted and invalidated before
the
+// request is forwarded to global memory.
+ transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite,
DataArrayWrite} {
+ p_profileMiss;
+ ut_updateTag;
+ wdb_writeDirtyBytes;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
@@ -764,6 +924,16 @@
pp_popProbeQueue;
}
+// Transition to be called when the response for a request with SLC bit set
+// arrives. The request has to be forwarded to the core that needs it while
+// making sure no entry is allocated.
+ transition(I, Bypass, I) {
+ rb_bypassDone;
+ pr_popResponseQueue;
+ wada_wakeUpAllDependentsAddr;
+ dt_deallocateTBE;
+ }
+
transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
ut_updateTag;
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 775a62b..3be1397 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -60,6 +60,7 @@
enumeration(Event, desc="TCP Events") {
// Core initiated
Load, desc="Load";
+ LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block
already allocated";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2(L1 is clean)";
Atomic, desc="Atomic";
@@ -256,8 +257,10 @@
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
- // disable L1 cache
- if (disableL1) {
+ if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+ // If L1 is disabled or requests have GLC or SLC flag set,
+ // then, the requests should not cache in the L1. The
response
+ // from L2/global memory should bypass the cache
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +287,23 @@
TBE tbe := TBEs.lookup(in_msg.LineAddress);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
- trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ if ((in_msg.isGLCSet || in_msg.isSLCSet) &&
is_valid(cache_entry)) {
+ // Read requests with GLC or SLC bit set should not cache in
the L1.
+ // They need to bypass the L1 and go to the L2. If an entry
exists
+ // in the L1, it needs to be evicted
+ trigger(Event:LoadBypassEvict, in_msg.LineAddress,
cache_entry, tbe);
+ }
+ else {
+ trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ }
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
- if(disableL1) {
+ if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+ // Write requests with GLC or SLC bit set, or when L1 is
disabled,
+ // should not cache in the L1. They need to perform a store
through
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry,
tbe);
} else {
if (is_valid(cache_entry) ||
L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +343,10 @@
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
+ peek(mandatoryQueue_in, RubyRequest) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
@@ -375,6 +392,8 @@
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -401,6 +420,8 @@
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -418,6 +439,11 @@
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
+ peek(mandatoryQueue_in, RubyRequest) {
+ out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
}
@@ -583,6 +609,17 @@
p_popMandatoryQueue;
}
+// Transition to be called when a load request with GLC or SLC flag set
arrives
+// at L1. This transition invalidates any existing entry and forwards the
+// request to L2.
+ transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
+ uu_profileDataMiss;
+ inv_invDone;
+ ic_invCache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+}
+
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 3b38e3b..57edef8 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -161,6 +161,8 @@
uint64_t probe_id, desc="probe id for lifetime profiling";
WriteMask writeMask, desc="outstanding write through mask";
int Len, desc="Length of memory request for DMA";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.L3Hit := tbe.L3Hit;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -512,6 +516,8 @@
out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
if(tbe.atomicData){
out_msg.WTRequestor := tbe.WTRequestor;
}
@@ -540,6 +546,8 @@
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -557,6 +565,8 @@
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -569,6 +579,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -624,6 +636,8 @@
out_msg.Type := MemoryRequestType:MEMORY_READ;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -739,6 +753,8 @@
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@
tbe.NumPendingAcks := 0;
tbe.Cached := in_msg.ForceShared;
tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ tbe.isGLCSet := in_msg.isGLCSet;
+ tbe.isSLCSet := in_msg.isSLCSet;
}
}
@@ -1004,6 +1028,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.DataBlk;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(ProtocolTrace, "%s\n", out_msg);
}
}
@@ -1104,6 +1130,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
@@ -1136,6 +1164,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index 46bab43..6ff19e9 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -138,6 +138,9 @@
bool NoWriteConflict, default="true", desc="write collided
with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, default="false", desc="GLC flag value in the request";
+ bool isSLCSet, default="false", desc="SLC flag value in the request";
+
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@
MachineID Requestor, desc="Requestor id for 3-hop requests";
bool NoAckNeeded, default="false", desc="For short circuting acks";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
return false;
@@ -248,6 +253,9 @@
int ProgramCounter, desc="PC that issues this request";
bool mispred, desc="tell TCP if the block should not be
bypassed";
+ bool isGLCSet, default="false", desc="GLC flag value in the request that
triggered response";
+ bool isSLCSet, default="false", desc="SLC flag value in the request that
triggered response";
+
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
@@ -277,6 +285,8 @@
bool wasValid, default="false", desc="Was block valid when evicted";
bool valid, default="false", desc="Is block valid";
bool validToInvalid, default="false", desc="Was block valid when
evicted";
+ bool isGLCSet, default="false", desc="GLC flag value in the request";
+ bool isSLCSet, default="false", desc="SLC flag value in the request";
bool functionalRead(Packet *pkt) {
return false;
@@ -321,6 +331,8 @@
TriggerType Type, desc="Type of trigger";
CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, default="false", desc="GLC flag value in the
request";
+ bool isSLCSet, default="false", desc="SLC flag value in the
request";
bool functionalRead(Packet *pkt) {
return false;
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index e8517a4..012b169 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -74,6 +74,8 @@
PrefetchBit Prefetch, desc="Is this a prefetch request";
bool ReadX, desc="Exclusive";
int Acks, desc="How many acks to expect";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
if ((MessageSize == MessageSizeType:Response_Data) ||
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm
b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 8d76f78..8ba9d93 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -177,6 +177,8 @@
int htmTransactionUid, desc="Used to identify the unique HTM
transaction that produced this request";
bool isTlbi, desc="Memory request is a TLB shootdown
(invalidation) operation";
Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown
operation that produced this request";
+ bool isGLCSet, default="false",desc="If flag is set, bypass
GPU L1 cache";
+ bool isSLCSet, default="false",desc="If flag is set, bypass
GPU L1 and L2 caches";
RequestPtr getRequestPtr();
}
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh
b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 2345c22..89ce834 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -79,6 +79,11 @@
bool m_isTlbi;
// Should be uint64, but SLICC complains about casts
Addr m_tlbiTransactionUid;
+ // GPU cache bypass flags. GLC bypasses L1 while SLC bypasses both L1
and
+ // L2 if set to true. They are set to false by default and they must be
+ // explicitly set to true in the program in order to bypass caches
+ bool m_isGLCSet;
+ bool m_isSLCSet;
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +104,13 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
/** RubyRequest for memory management commands */
@@ -120,6 +132,13 @@
m_tlbiTransactionUid(0)
{
assert(m_pkt->req->isMemMgmt());
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +167,13 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +203,14 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime) : Message(curTime) {}
--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Gerrit-Change-Number: 66991
Gerrit-PatchSet: 9
Gerrit-Owner: VISHNU RAMADAS <vrama...@wisc.edu>
Gerrit-Reviewer: Jason Lowe-Power <ja...@lowepower.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsinclair.w...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: VISHNU RAMADAS <vrama...@wisc.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org