VISHNU RAMADAS has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email )
(
7 is the latest approved patch-set.
No files were changed between the latest approved patch-set and the
submitted one.
)Change subject: gpu-compute,mem-ruby: Add support for GPU cache bypassing
......................................................................
gpu-compute,mem-ruby: Add support for GPU cache bypassing
The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds cache bypass support by introducing new transitions in the
coherence protocol used by the GPU memory system. Now, instructions with
the GLC bit set will not cache in the L1 and instructions with SLC bit
set will not cache in L1 or L2.
Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66991
Reviewed-by: Jason Lowe-Power <power...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
---
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-TCC.sm
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_MemControl.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/slicc_interface/RubyRequest.hh
9 files changed, 337 insertions(+), 8 deletions(-)
Approvals:
Matt Sinclair: Looks good to me, but someone else must approve; Looks
good to me, approved
Jason Lowe-Power: Looks good to me, approved
kokoro: Regressions pass
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 9238dbe..a80b918 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1101,6 +1101,16 @@
}
/**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+ bool isGLCSet() const { return req->isGLCSet();}
+ bool isSLCSet() const { return req->isSLCSet();}
+
+ /**
* Check if packet corresponds to a given block-aligned address and
* address space.
*
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 39d9d72..6a0cbc2 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -1071,6 +1071,17 @@
bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }
+
+ /**
+ * Accessor functions for the cache bypass flags. The cache bypass
+ * can specify which levels in the hierarchy to bypass. If GLC_BIT
+ * is set, the requests are globally coherent and bypass TCP.
+ * If SLC_BIT is set, then the requests are system level coherent
+ * and bypass both TCP and TCC.
+ */
+ bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); }
+ bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); }
+
/**
* Accessor functions for the memory space configuration flags and
used by
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note
that
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 032a64c..ae14247 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -56,8 +56,10 @@
enumeration(Event, desc="TCC Events") {
// Requests coming from the Cores
RdBlk, desc="RdBlk event";
+ RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block
already allocated";
WrVicBlk, desc="L1 Write Through";
WrVicBlkBack, desc="L1 Write Through(dirty cache)";
+ WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict";
Atomic, desc="Atomic Op";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@
PrbInv, desc="Invalidating probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
+ Bypass, desc="Bypass the entire L2 cache";
}
// STATES
@@ -107,6 +110,8 @@
NetDest Destination, desc="Data destination";
int numAtomics, desc="number remaining atomics";
int atomicDoneCnt, desc="number AtomicDones triggered";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -173,7 +178,6 @@
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
-
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
@@ -279,7 +283,11 @@
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
- if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.isSLCSet) {
+ // If the SLC bit is set, the response needs to bypass the
cache
+ // and should not be allocated an entry.
+ trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:Data, in_msg.addr, cache_entry, tbe);
} else {
@@ -313,7 +321,18 @@
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.Type == CoherenceRequestType:WriteThrough) {
- if(WB) {
+ if (in_msg.isSLCSet) {
+ // The request should bypass the cache if SLC bit is set.
+ // If the cache entry exists already, then evict it.
+ // Else, perform a normal cache access.
+ // The cache entry is allocated only on response and
bypass is
+ // handled there
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry,
tbe);
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else if(WB) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry,
tbe);
} else {
@@ -326,7 +345,13 @@
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
- trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ if (in_msg.isSLCSet) {
+ // If SLC bit is set, the request needs to go directly to
memory.
+ // If a cache block already exists, then evict it.
+ trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ }
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -354,6 +379,8 @@
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -371,15 +398,46 @@
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
DPRINTF(RubySlicc, "%s\n", out_msg);
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
+ action(rb_bypassDone, "rb", desc="bypass L2 of read access") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination := tbe.Destination;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
action(rd_requestData, "r", desc="Miss in L2, pass on") {
if(tbe.Destination.count()==1){
@@ -391,6 +449,8 @@
out_msg.Destination.add(mapAddressToMachine(address,
MachineType:Directory));
out_msg.Shared := false; // unneeded for this request
out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -407,6 +467,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -421,6 +484,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -434,6 +500,9 @@
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+
}
}
}
@@ -466,6 +535,8 @@
peek(coreRequestNetwork_in, CPURequestMsg) {
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type ==
CoherenceRequestType:Atomic){
tbe.Destination.add(in_msg.Requestor);
+ tbe.isGLCSet := in_msg.isGLCSet;
+ tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -505,6 +576,8 @@
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -520,6 +593,10 @@
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
@@ -534,6 +611,8 @@
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.Dirty := true;
out_msg.writeMask.orMask(in_msg.writeMask);
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -549,6 +628,10 @@
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
out_msg.MessageSize := MessageSizeType:Response_Control;
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -592,6 +675,10 @@
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
out_msg.Type := TriggerType:AtomicDone;
+ peek(responseFromNB_in, ResponseMsg) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
}
@@ -659,6 +746,54 @@
p_popRequestQueue;
}
+ transition(I, RdBypassEvict) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// an entry in state W. It evicts and invalidates the cache entry before
+// forwarding the request to global memory
+ transition(W, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// entry in state M. It evicts and invalidates the cache entry before
+// forwarding the request to main memory
+ transition(M, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag set arrives at
+// entry in state V. It invalidates the cache entry before forwarding the
+// request to global memory.
+ transition(V, RdBypassEvict, I) {TagArrayRead} {
+ p_profileMiss;
+ t_allocateTBE;
+ i_invL2;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a read request with SLC flag arrives at
entry
+// in transient state. The request stalls until the pending transition is
complete.
+ transition({WI, IV}, RdBypassEvict) {
+ st_stallAndWaitRequest;
+ }
+
transition(V, Atomic, A) {TagArrayRead} {
p_profileHit;
i_invL2;
@@ -730,6 +865,31 @@
p_popRequestQueue;
}
+// Transition to be called when a write request with SLC bit set arrives
at an
+// entry with state V. The entry has to be evicted and invalidated before
the
+// request is forwarded to global memory
+ transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite,
DataArrayWrite} {
+ p_profileMiss;
+ ut_updateTag;
+ t_allocateTBE;
+ wt_writeThrough;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+// Transition to be called when a write request with SLC bit set arrives
at an
+// entry with state W. The entry has to be evicted and invalidated before
the
+// request is forwarded to global memory.
+ transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite,
DataArrayWrite} {
+ p_profileMiss;
+ ut_updateTag;
+ wdb_writeDirtyBytes;
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
@@ -764,6 +924,16 @@
pp_popProbeQueue;
}
+// Transition to be called when the response for a request with SLC bit set
+// arrives. The request has to be forwarded to the core that needs it while
+// making sure no entry is allocated.
+ transition(I, Bypass, I) {
+ rb_bypassDone;
+ pr_popResponseQueue;
+ wada_wakeUpAllDependentsAddr;
+ dt_deallocateTBE;
+ }
+
transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
ut_updateTag;
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 775a62b..3be1397 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -60,6 +60,7 @@
enumeration(Event, desc="TCP Events") {
// Core initiated
Load, desc="Load";
+ LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block
already allocated";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2(L1 is clean)";
Atomic, desc="Atomic";
@@ -256,8 +257,10 @@
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
- // disable L1 cache
- if (disableL1) {
+ if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+ // If L1 is disabled or requests have GLC or SLC flag set,
+ // then, the requests should not cache in the L1. The
response
+ // from L2/global memory should bypass the cache
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +287,23 @@
TBE tbe := TBEs.lookup(in_msg.LineAddress);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
- trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ if ((in_msg.isGLCSet || in_msg.isSLCSet) &&
is_valid(cache_entry)) {
+ // Read requests with GLC or SLC bit set should not cache in
the L1.
+ // They need to bypass the L1 and go to the L2. If an entry
exists
+ // in the L1, it needs to be evicted
+ trigger(Event:LoadBypassEvict, in_msg.LineAddress,
cache_entry, tbe);
+ }
+ else {
+ trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ }
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
- if(disableL1) {
+ if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+ // Write requests with GLC or SLC bit set, or when L1 is
disabled,
+ // should not cache in the L1. They need to perform a store
through
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry,
tbe);
} else {
if (is_valid(cache_entry) ||
L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +343,10 @@
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
+ peek(mandatoryQueue_in, RubyRequest) {
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
@@ -375,6 +392,8 @@
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -401,6 +420,8 @@
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -418,6 +439,11 @@
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
+ peek(mandatoryQueue_in, RubyRequest) {
+ out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
+ }
}
}
}
@@ -583,6 +609,17 @@
p_popMandatoryQueue;
}
+// Transition to be called when a load request with GLC or SLC flag set
arrives
+// at L1. This transition invalidates any existing entry and forwards the
+// request to L2.
+ transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
+ uu_profileDataMiss;
+ inv_invDone;
+ ic_invCache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+}
+
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 3b38e3b..57edef8 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -161,6 +161,8 @@
uint64_t probe_id, desc="probe id for lifetime profiling";
WriteMask writeMask, desc="outstanding write through mask";
int Len, desc="Length of memory request for DMA";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.L3Hit := tbe.L3Hit;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -512,6 +516,8 @@
out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
if(tbe.atomicData){
out_msg.WTRequestor := tbe.WTRequestor;
}
@@ -540,6 +546,8 @@
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -557,6 +565,8 @@
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
out_msg.instSeqNum := in_msg.instSeqNum;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -569,6 +579,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -624,6 +636,8 @@
out_msg.Type := MemoryRequestType:MEMORY_READ;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -739,6 +753,8 @@
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@
tbe.NumPendingAcks := 0;
tbe.Cached := in_msg.ForceShared;
tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ tbe.isGLCSet := in_msg.isGLCSet;
+ tbe.isSLCSet := in_msg.isSLCSet;
}
}
@@ -1004,6 +1028,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.DataBlk;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(ProtocolTrace, "%s\n", out_msg);
}
}
@@ -1104,6 +1130,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
+ out_msg.isGLCSet := in_msg.isGLCSet;
+ out_msg.isSLCSet := in_msg.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
@@ -1136,6 +1164,8 @@
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
+ out_msg.isGLCSet := tbe.isGLCSet;
+ out_msg.isSLCSet := tbe.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index 46bab43..6ff19e9 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -138,6 +138,9 @@
bool NoWriteConflict, default="true", desc="write collided
with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, default="false", desc="GLC flag value in the request";
+ bool isSLCSet, default="false", desc="SLC flag value in the request";
+
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@
MachineID Requestor, desc="Requestor id for 3-hop requests";
bool NoAckNeeded, default="false", desc="For short circuting acks";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
return false;
@@ -248,6 +253,9 @@
int ProgramCounter, desc="PC that issues this request";
bool mispred, desc="tell TCP if the block should not be
bypassed";
+ bool isGLCSet, default="false", desc="GLC flag value in the request that
triggered response";
+ bool isSLCSet, default="false", desc="SLC flag value in the request that
triggered response";
+
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
@@ -277,6 +285,8 @@
bool wasValid, default="false", desc="Was block valid when evicted";
bool valid, default="false", desc="Is block valid";
bool validToInvalid, default="false", desc="Was block valid when
evicted";
+ bool isGLCSet, default="false", desc="GLC flag value in the request";
+ bool isSLCSet, default="false", desc="SLC flag value in the request";
bool functionalRead(Packet *pkt) {
return false;
@@ -321,6 +331,8 @@
TriggerType Type, desc="Type of trigger";
CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
int ProgramCounter, desc="PC that accesses to this block";
+ bool isGLCSet, default="false", desc="GLC flag value in the
request";
+ bool isSLCSet, default="false", desc="SLC flag value in the
request";
bool functionalRead(Packet *pkt) {
return false;
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index e8517a4..012b169 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -74,6 +74,8 @@
PrefetchBit Prefetch, desc="Is this a prefetch request";
bool ReadX, desc="Exclusive";
int Acks, desc="How many acks to expect";
+ bool isGLCSet, desc="Bypass L1 Cache";
+ bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
if ((MessageSize == MessageSizeType:Response_Data) ||
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm
b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 8d76f78..8ba9d93 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -177,6 +177,8 @@
int htmTransactionUid, desc="Used to identify the unique HTM
transaction that produced this request";
bool isTlbi, desc="Memory request is a TLB shootdown
(invalidation) operation";
Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown
operation that produced this request";
+ bool isGLCSet, default="false",desc="If flag is set, bypass
GPU L1 cache";
+ bool isSLCSet, default="false",desc="If flag is set, bypass
GPU L1 and L2 caches";
RequestPtr getRequestPtr();
}
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh
b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 2345c22..89ce834 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -79,6 +79,11 @@
bool m_isTlbi;
// Should be uint64, but SLICC complains about casts
Addr m_tlbiTransactionUid;
+ // GPU cache bypass flags. GLC bypasses L1 while SLC bypasses both L1
and
+ // L2 if set to true. They are set to false by default and they must be
+ // explicitly set to true in the program in order to bypass caches
+ bool m_isGLCSet;
+ bool m_isSLCSet;
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +104,13 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
/** RubyRequest for memory management commands */
@@ -120,6 +132,13 @@
m_tlbiTransactionUid(0)
{
assert(m_pkt->req->isMemMgmt());
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +167,13 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +203,14 @@
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
+ if (_pkt) {
+ m_isGLCSet = m_pkt->req->isGLCSet();
+ m_isSLCSet = m_pkt->req->isSLCSet();
+
+ } else {
+ m_isGLCSet = 0;
+ m_isSLCSet = 0;
+ }
}
RubyRequest(Tick curTime) : Message(curTime) {}
--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/66991?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Gerrit-Change-Number: 66991
Gerrit-PatchSet: 9
Gerrit-Owner: VISHNU RAMADAS <vrama...@wisc.edu>
Gerrit-Reviewer: Jason Lowe-Power <ja...@lowepower.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsinclair.w...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: VISHNU RAMADAS <vrama...@wisc.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org