VISHNU RAMADAS has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email )

Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing
......................................................................

gpu-compute,mem-ruby: Add support for GPU cache bypassing

The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds bypass support by introducing new transitions into the coherence
protocol used by the GPU memory system. With this change, loads and
stores with the GLC bit set are not cached in the L1 (TCP), and those
with the SLC bit set are not cached in either the L1 or the L2 (TCC).
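
For illustration, a minimal sketch of how a caller might tag a request
and how the new accessors are read (the constructor arguments and
variable names here are hypothetical, not part of this change):

    // Mark a GPU access as system-level coherent so it bypasses both
    // the TCP (L1) and the TCC (L2). GLC_BIT and SLC_BIT are the
    // CacheCoherenceFlags queried by the new accessors in request.hh.
    RequestPtr req = std::make_shared<Request>(paddr, size, flags,
                                               requestorId);
    req->setCacheCoherenceFlags(Request::SLC_BIT);

    // Ruby-side code can then query the bits through the packet:
    if (pkt->isSLCSet()) {
        // bypass both TCP and TCC
    } else if (pkt->isGLCSet()) {
        // bypass TCP only
    }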

Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
---
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-TCC.sm
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_MemControl.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/RubyRequest.hh
9 files changed, 286 insertions(+), 8 deletions(-)



diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 9238dbe..224a7b5 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -418,6 +418,11 @@
      */
     uint64_t htmTransactionUid;

+    /**
+     * Track whether the request has the GLC_BIT
+     * or SLC_BIT flags set to bypass caches.
+     */
+
   public:

     /**
@@ -886,6 +891,10 @@
             addr = req->getPaddr();
             flags.set(VALID_ADDR);
             _isSecure = req->isSecure();
+
+            /**
+             * Set the flags to track the original request's GLC and SLC bits
+             */
         }

         /**
@@ -1101,6 +1110,16 @@
     }

     /**
+     * Accessor functions for the cache bypass flags. The cache bypass
+     * can specify which levels in the hierarchy to bypass. If GLC_BIT
+     * is set, the requests are globally coherent and bypass TCP.
+     * If SLC_BIT is set, then the requests are system level coherent
+     * and bypass both TCP and TCC.
+     */
+    bool isGLCSet() const { return req->isGLCSet(); }
+    bool isSLCSet() const { return req->isSLCSet(); }
+
+    /**
      * Check if packet corresponds to a given block-aligned address and
      * address space.
      *
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 39d9d72..6a0cbc2 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -1071,6 +1071,17 @@

     bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }

+
+    /**
+     * Accessor functions for the cache bypass flags. The cache bypass
+     * can specify which levels in the hierarchy to bypass. If GLC_BIT
+     * is set, the requests are globally coherent and bypass TCP.
+     * If SLC_BIT is set, then the requests are system level coherent
+     * and bypass both TCP and TCC.
+     */
+    bool isGLCSet() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+    bool isSLCSet() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
+
     /**
      * Accessor functions for the memory space configuration flags and used by
      * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 032a64c..13acc02 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -56,8 +56,10 @@
   enumeration(Event, desc="TCC Events") {
     // Requests coming from the Cores
     RdBlk,                  desc="RdBlk event";
+    RdBypassEvict,          desc="Bypass L2 on reads. Evict if cache block already allocated";
     WrVicBlk,               desc="L1 Write Through";
     WrVicBlkBack,           desc="L1 Write Through(dirty cache)";
+    WrVicBlkEvict,          desc="L1 Write Through(dirty cache) and evict";
     Atomic,                 desc="Atomic Op";
     AtomicDone,             desc="AtomicOps Complete";
     AtomicNotDone,          desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@
     PrbInv,                 desc="Invalidating probe";
     // Coming from Memory Controller
     WBAck,                  desc="writethrough ack from memory";
+    Bypass,                 desc="Bypass the entire L2 cache";
   }

   // STATES
@@ -107,6 +110,8 @@
     NetDest Destination, desc="Data destination";
     int numAtomics,     desc="number remaining atomics";
     int atomicDoneCnt,  desc="number AtomicDones triggered";
+    bool isGLCSet,      desc="Bypass L1 Cache";
+    bool isSLCSet,      desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
@@ -173,7 +178,6 @@

   int functionalWrite(Addr addr, Packet *pkt) {
     int num_functional_writes := 0;
-
     TBE tbe := TBEs.lookup(addr);
     if(is_valid(tbe)) {
       num_functional_writes := num_functional_writes +
@@ -279,7 +283,9 @@
       peek(responseFromNB_in, ResponseMsg, block_on="addr") {
         TBE tbe := TBEs.lookup(in_msg.addr);
         Entry cache_entry := getCacheEntry(in_msg.addr);
-        if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+        if (in_msg.isSLCSet) {
+            trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
           if(presentOrAvail(in_msg.addr)) {
             trigger(Event:Data, in_msg.addr, cache_entry, tbe);
           } else {
@@ -313,7 +319,13 @@
         TBE tbe := TBEs.lookup(in_msg.addr);
         Entry cache_entry := getCacheEntry(in_msg.addr);
         if (in_msg.Type == CoherenceRequestType:WriteThrough) {
-            if(WB) {
+            if (in_msg.isSLCSet) {
+                if(presentOrAvail(in_msg.addr)) {
+                    trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry, tbe);
+                } else {
+                    trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+                }
+            } else if(WB) {
                 if(presentOrAvail(in_msg.addr)) {
                     trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
                 } else {
@@ -326,7 +338,11 @@
         } else if (in_msg.Type == CoherenceRequestType:Atomic) {
           trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
-          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+          if (in_msg.isSLCSet) {
+            trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+          }
         } else {
           DPRINTF(RubySlicc, "%s\n", in_msg);
           error("Unexpected Response Message to Core");
@@ -354,6 +370,8 @@
         out_msg.MessageSize := MessageSizeType:Response_Data;
         out_msg.Dirty := false;
         out_msg.State := CoherenceState:Shared;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
     }
@@ -371,6 +389,10 @@
       out_msg.Dirty := false;
       out_msg.State := CoherenceState:Shared;
       DPRINTF(RubySlicc, "%s\n", out_msg);
+      peek(responseFromNB_in, ResponseMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
     enqueue(unblockToNB_out, UnblockMsg, 1) {
       out_msg.addr := address;
@@ -380,6 +402,29 @@
     }
   }

+  action(rb_bypassDone, "rb", desc="bypass L2 on a read access") {
+    peek(responseFromNB_in, ResponseMsg) {
+        enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:TDSysResp;
+          out_msg.Sender := machineID;
+          out_msg.Destination := tbe.Destination;
+          out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.MessageSize := MessageSizeType:Response_Data;
+          out_msg.Dirty := false;
+          out_msg.State := CoherenceState:Shared;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+        enqueue(unblockToNB_out, UnblockMsg, 1) {
+          out_msg.addr := address;
+          out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
+          out_msg.MessageSize := MessageSizeType:Unblock_Control;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+    }
+  }

   action(rd_requestData, "r", desc="Miss in L2, pass on") {
     if(tbe.Destination.count()==1){
@@ -391,6 +436,8 @@
          out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
           out_msg.Shared := false; // unneeded for this request
           out_msg.MessageSize := in_msg.MessageSize;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           DPRINTF(RubySlicc, "%s\n", out_msg);
         }
       }
@@ -407,6 +454,9 @@
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Writeback_Control;
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+
       }
     }
   }
@@ -421,6 +471,9 @@
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Writeback_Control;
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+
       }
     }
   }
@@ -434,6 +487,9 @@
           out_msg.Sender := machineID;
           out_msg.MessageSize := in_msg.MessageSize;
           out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
+
         }
     }
   }
@@ -466,6 +522,8 @@
       peek(coreRequestNetwork_in, CPURequestMsg) {
        if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
           tbe.Destination.add(in_msg.Requestor);
+          tbe.isGLCSet := in_msg.isGLCSet;
+          tbe.isSLCSet := in_msg.isSLCSet;
         }
       }
     }
@@ -505,6 +563,8 @@
         out_msg.DataBlk := in_msg.DataBlk;
         out_msg.writeMask.orMask(in_msg.writeMask);
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -520,6 +580,10 @@
       out_msg.Dirty := true;
       out_msg.DataBlk := cache_entry.DataBlk;
       out_msg.writeMask.orMask(cache_entry.writeMask);
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }

@@ -534,6 +598,8 @@
         out_msg.Type := CoherenceRequestType:Atomic;
         out_msg.Dirty := true;
         out_msg.writeMask.orMask(in_msg.writeMask);
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -643,6 +709,7 @@
     p_profileHit;
     t_allocateTBE;
     wb_writeBack;
+    p_popRequestQueue;
   }

   transition(I, RdBlk, IV) {TagArrayRead} {
@@ -659,6 +726,46 @@
     p_popRequestQueue;
   }

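+  // An SLC read missed in the L2: forward the request to the directory
+  // without allocating a block.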
+  transition(I, RdBypassEvict) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // An SLC read hit a block with data that has not been written back:
+  // write it back, invalidate the L2 copy, and forward the read to the
+  // directory.
+  transition(W, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Same for a modified block: write the dirty data back, invalidate
+  // the L2 copy, and forward the read to the directory.
+  transition(M, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  transition(V, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  transition({WI, IV}, RdBypassEvict) {
+    st_stallAndWaitRequest;
+  }
+
   transition(V, Atomic, A) {TagArrayRead} {
     p_profileHit;
     i_invL2;
@@ -730,6 +837,25 @@
     p_popRequestQueue;
   }

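+  // SLC writethroughs must not leave a copy in the L2: write the data
+  // through and invalidate the block rather than keeping it cached.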
+  transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    t_allocateTBE;
+    wt_writeThrough;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
+  transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    wdb_writeDirtyBytes;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
   transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
     t_allocateTBE;
     wb_writeBack;
@@ -764,6 +890,13 @@
     pp_popProbeQueue;
   }

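+  // An SLC read response arrived from the NB: hand the data straight to
+  // the requesting TCP without allocating a block in the L2.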
+  transition(I, Bypass, I) {
+    rb_bypassDone;
+    pr_popResponseQueue;
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
+  }
+
   transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
     a_allocateBlock;
     ut_updateTag;
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 775a62b..808f514 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -60,6 +60,7 @@
   enumeration(Event, desc="TCP Events") {
     // Core initiated
     Load,           desc="Load";
+    LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
     Store,          desc="Store to L1 (L1 is dirty)";
     StoreThrough,   desc="Store directly to L2(L1 is clean)";
     Atomic,         desc="Atomic";
@@ -257,7 +258,7 @@
         TBE tbe := TBEs.lookup(in_msg.addr);
         if (in_msg.Type == CoherenceResponseType:TDSysResp) {
           // disable L1 cache
-          if (disableL1) {
+          if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
                  trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
           } else {
             if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +285,18 @@
         TBE tbe := TBEs.lookup(in_msg.LineAddress);
         DPRINTF(RubySlicc, "%s\n", in_msg);
         if (in_msg.Type == RubyRequestType:LD) {
-          trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+          if ((in_msg.isGLCSet || in_msg.isSLCSet) && is_valid(cache_entry)) {
+            trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
+          } else {
+            trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+          }
         } else if (in_msg.Type == RubyRequestType:ATOMIC ||
                    in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
                    in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
           trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
         } else if (in_msg.Type == RubyRequestType:ST) {
-          if(disableL1) {
+          if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
             trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
           } else {
           if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +336,10 @@
                               TCC_select_low_bit, TCC_select_num_bits));
       out_msg.MessageSize := MessageSizeType:Request_Control;
       out_msg.InitialRequestTime := curCycle();
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }

@@ -375,6 +385,8 @@
                               TCC_select_low_bit, TCC_select_num_bits));
           out_msg.MessageSize := MessageSizeType:Request_Control;
           out_msg.InitialRequestTime := curCycle();
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
       }
     }
@@ -401,6 +413,8 @@
       // forward inst sequence number to lower TCC
       peek(mandatoryQueue_in, RubyRequest) {
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -418,6 +432,11 @@
         out_msg.Type := CoherenceRequestType:Atomic;
         out_msg.InitialRequestTime := curCycle();
         out_msg.Shared := false;
+        peek(mandatoryQueue_in, RubyRequest) {
+          out_msg.instSeqNum := in_msg.instSeqNum;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
+        }
       }
     }
   }
@@ -583,6 +602,14 @@
     p_popMandatoryQueue;
   }

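+  // A GLC/SLC load hit a valid L1 block: invalidate the stale copy and
+  // reissue the read so it is serviced below the L1.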
+  transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
+    uu_profileDataMiss;
+    inv_invDone;
+    ic_invCache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+}
+
   transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
     t_allocateTBE;
     mru_updateMRU;
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 3b38e3b..57edef8 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -161,6 +161,8 @@
     uint64_t probe_id,        desc="probe id for lifetime profiling";
     WriteMask writeMask,    desc="outstanding write through mask";
     int Len,            desc="Length of memory request for DMA";
+    bool isGLCSet,      desc="Bypass L1 Cache";
+    bool isSLCSet,      desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@
       out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
       out_msg.OriginalResponder := tbe.LastSender;
       out_msg.L3Hit := tbe.L3Hit;
+      out_msg.isGLCSet := tbe.isGLCSet;
+      out_msg.isSLCSet := tbe.isSLCSet;
       DPRINTF(RubySlicc, "%s\n", out_msg);
     }
   }
@@ -512,6 +516,8 @@
         out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
         out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         if(tbe.atomicData){
           out_msg.WTRequestor := tbe.WTRequestor;
         }
@@ -540,6 +546,8 @@
         out_msg.InitialRequestTime := tbe.InitialRequestTime;
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
   }
@@ -557,6 +565,8 @@
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := curCycle();
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -569,6 +579,8 @@
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Writeback_Data;
         out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -624,6 +636,8 @@
           out_msg.Type := MemoryRequestType:MEMORY_READ;
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Request_Control;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
       }
     }
@@ -739,6 +753,8 @@
           out_msg.MessageSize := MessageSizeType:Control;
           out_msg.Destination := probe_dests;
           tbe.NumPendingAcks := out_msg.Destination.count();
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           DPRINTF(RubySlicc, "%s\n", out_msg);
           APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
           APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@
           out_msg.ReturnData := true;
           out_msg.MessageSize := MessageSizeType:Control;
           out_msg.Destination := probe_dests;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           tbe.NumPendingAcks := out_msg.Destination.count();
           DPRINTF(RubySlicc, "%s\n", (out_msg));
           APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@
           out_msg.ReturnData := false;
           out_msg.MessageSize := MessageSizeType:Control;
           out_msg.Destination := probe_dests;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           tbe.NumPendingAcks := out_msg.Destination.count();
           APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
           APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Writeback_Data;
         out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
       if (tbe.Dirty == false) {
           // have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@
       tbe.NumPendingAcks := 0;
       tbe.Cached := in_msg.ForceShared;
       tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.isGLCSet := in_msg.isGLCSet;
+      tbe.isSLCSet := in_msg.isSLCSet;
     }
   }

@@ -1004,6 +1028,8 @@
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Writeback_Data;
         out_msg.DataBlk := tbe.DataBlk;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(ProtocolTrace, "%s\n", out_msg);
       }
     }
@@ -1104,6 +1130,8 @@
             out_msg.Sender := machineID;
             out_msg.MessageSize := MessageSizeType:Writeback_Data;
             out_msg.DataBlk := victim_entry.DataBlk;
+            out_msg.isGLCSet := in_msg.isGLCSet;
+            out_msg.isSLCSet := in_msg.isSLCSet;
           }
           L3CacheMemory.deallocate(victim);
         }
@@ -1136,6 +1164,8 @@
             out_msg.Sender := machineID;
             out_msg.MessageSize := MessageSizeType:Writeback_Data;
             out_msg.DataBlk := victim_entry.DataBlk;
+            out_msg.isGLCSet := tbe.isGLCSet;
+            out_msg.isSLCSet := tbe.isSLCSet;
           }
           L3CacheMemory.deallocate(victim);
         }
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index 46bab43..dccb5f4 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -138,6 +138,9 @@
   bool NoWriteConflict, default="true", desc="write collided with CAB entry";
   int ProgramCounter,               desc="PC that accesses to this block";

+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";
+
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
     if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@
   MachineID Requestor,          desc="Requestor id for 3-hop requests";
   bool NoAckNeeded, default="false", desc="For short circuting acks";
   int ProgramCounter,           desc="PC that accesses to this block";
+  bool isGLCSet,                desc="Bypass L1 Cache";
+  bool isSLCSet,                desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     return false;
@@ -248,6 +253,9 @@
   int ProgramCounter,       desc="PC that issues this request";
   bool mispred,             desc="tell TCP if the block should not be bypassed";

+  bool isGLCSet, default="false", desc="GLC flag value in the request that triggered response";
+  bool isSLCSet, default="false", desc="SLC flag value in the request that triggered response";
+

   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index e8517a4..012b169 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -74,6 +74,8 @@
   PrefetchBit Prefetch,         desc="Is this a prefetch request";
   bool ReadX,                   desc="Exclusive";
   int Acks,                     desc="How many acks to expect";
+  bool isGLCSet,                desc="Bypass L1 Cache";
+  bool isSLCSet,                desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     if ((MessageSize == MessageSizeType:Response_Data) ||
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 8d76f78..7d51f80 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -177,6 +177,8 @@
   int htmTransactionUid, desc="Used to identify the unique HTM transaction that produced this request";
   bool isTlbi, desc="Memory request is a TLB shootdown (invalidation) operation";
   Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown operation that produced this request";
+  bool isGLCSet, default="false", desc="Flag that determines if request bypasses L1 or not";
+  bool isSLCSet, default="false", desc="Flag that determines if request bypasses both L1 and L2 or not";

   RequestPtr getRequestPtr();
 }
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 2345c22..9bba146 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -43,7 +43,6 @@

 #include <ostream>
 #include <vector>
-
 #include "mem/ruby/common/Address.hh"
 #include "mem/ruby/common/DataBlock.hh"
 #include "mem/ruby/common/WriteMask.hh"
@@ -79,6 +78,8 @@
     bool m_isTlbi;
     // Should be uint64, but SLICC complains about casts
     Addr m_tlbiTransactionUid;
+    bool m_isGLCSet;
+    bool m_isSLCSet;

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
         uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +100,13 @@
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     /** RubyRequest for memory management commands */
@@ -120,6 +128,13 @@
           m_tlbiTransactionUid(0)
     {
         assert(m_pkt->req->isMemMgmt());
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +163,13 @@
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +199,14 @@
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime) : Message(curTime) {}

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Gerrit-Change-Number: 66991
Gerrit-PatchSet: 1
Gerrit-Owner: VISHNU RAMADAS <vrama...@wisc.edu>
Gerrit-MessageType: newchange