Matthew Poremba has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/57410 )

Change subject: mem-ruby: Enhance MOESI_AMD DmaWrite
......................................................................

mem-ruby: Enhance MOESI_AMD DmaWrite

This enhances MOESI_AMD_Base-dir DmaWrite to enable partial writes. This
is currently done by assuming a full cache line, invalidating caches,
and transitioning back to unblocked state. The enhanced write supports
partial writes (i.e., smaller than cache line size) by first reading
memory, merging the modified data, and then writing back to memory.
Implementation of this mirrors that of DmaRead in terms of state. This
means for each DmaRead state (BDR_PM, BDR_Pm, and BDR_M) there is a
write analogue (BDW_PM, BDW_Pm, and BDW_M) and the BDW_P state is
removed. Furthermore, this enhanced DmaWrite ... actually writes data to
memory instead of relying on DirectoryEntry / backing store for correct
data.

There are two possible state transitions for DmaWrite now. (1) Memory
data arrives before probe response and (2) probe response arrives before
memory data. In case (1), probe data overwrites memory data and merges
the partial write using the TBE write mask then updates write mask to
'filled' state. In case (2), probe data is merged with the partial data
using the TBE write mask then updates write mask to 'filled' state. The
memory data will then be clobbered by copying the TBE data over the
response since the write mask is now full.

Change-Id: I1eebb882b464c4c5ee5fd60932fd38d271ada4d7
---
M src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
1 file changed, 102 insertions(+), 30 deletions(-)



diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 9176df5..3b38e3b 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -61,26 +61,28 @@
 {
   // STATES
state_declaration(State, desc="Directory states", default="Directory_State_U") {
-    U, AccessPermission:Backing_Store,                 desc="unblocked";
+    U, AccessPermission:Backing_Store,          desc="unblocked";
     BL, AccessPermission:Busy,                  desc="got L3 WB request";
// BL is Busy because it's possible for the data only to be in the network // in the WB, L3 has sent it and gone on with its business in possibly I
     // state.
- BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; - BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; - BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; - BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, no need for memory"; - BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; - BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; + BDW_M, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for memory"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; + BDW_PM, AccessPermission:Backing_Store, desc="DMA write, blocked 
waiting for probes and memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; + BDW_Pm, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, already got memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
   }

   // Events
@@ -579,7 +581,13 @@
           out_msg.Type := TriggerType:L3Hit;
         }
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
-        tbe.DataBlk := entry.DataBlk;
+
+ // tbe.DataBlk may have partial data (e.g., for DMA writes). Make sure + // not to clobber the data before merging it with the L3 cache data.
+        DataBlock tmpBlk := entry.DataBlk;
+        tmpBlk.copyPartial(tbe.DataBlk, tbe.writeMask);
+        tbe.DataBlk := tmpBlk;
+
         tbe.L3Hit := true;
         tbe.MemData := true;
         L3CacheMemory.deallocate(address);
@@ -937,7 +945,14 @@
         tbe.wtData := true;
         tbe.Dirty := true;
         tbe.DataBlk := in_msg.DataBlk;
-        tbe.writeMask.fillMask();
+ // DMAs can be partial cache line. This is indicated by a "Len" value
+        // which is non-zero. In this case make sure to set the writeMask
+        // appropriately. This can occur for either reads or writes.
+        if (in_msg.Len != 0) {
+ tbe.writeMask.setMask(addressOffset(in_msg.PhysicalAddress, address), in_msg.Len);
+        } else {
+          tbe.writeMask.fillMask();
+        }
       }
     }
   }
@@ -998,7 +1013,9 @@
     peek(memQueue_in, MemoryMsg) {
       if (tbe.wtData == true) {
// Keep the write-through data based on mask, but use the memory block
-        // for the masked-off data.
+ // for the masked-off data. If we received a probe with data, the mask
+        // will be filled and the tbe data will fully overwrite the memory
+        // data in the temp block.
         DataBlock tmpBlk := in_msg.DataBlk;
         tmpBlk.copyPartial(tbe.DataBlk, tbe.writeMask);
         tbe.DataBlk := tmpBlk;
@@ -1199,7 +1216,7 @@
   }

   // TRANSITIONS
- transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
       st_stallAndWaitRequest;
   }

@@ -1210,7 +1227,7 @@

// The exit state is always going to be U, so wakeUpDependents logic should be covered in all the
   // transitions which are flowing into U.
- transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){ + transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){
     sd_stallAndWaitRequest;
   }

@@ -1231,9 +1248,10 @@
     p_popRequestQueue;
   }

-  transition(U, DmaWrite, BDW_P) {L3TagArrayRead} {
+  transition(U, DmaWrite, BDW_PM) {L3TagArrayRead} {
     atd_allocateTBEforDMA;
-    da_sendResponseDmaAck;
+    qdr_queueDmaRdReq;
+    pr_profileL3HitMiss; //Must come after qdr_queueDmaRdReq
     icd_probeInvCoreDataForDMA;
     pd_popDmaRequestQueue;
   }
@@ -1308,15 +1326,15 @@
     pr_popResponseQueue;
   }

- transition({B, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + transition({B, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) {
     z_stall;
   }

- transition({U, BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { + transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) {
     pm_popMemQueue;
   }

- transition({U, BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) {
     rv_removeVicDirtyIgnore;
     w_sendResponseWBAck;
     p_popRequestQueue;
@@ -1337,6 +1355,11 @@
     pm_popMemQueue;
   }

+  transition(BDW_PM, MemData, BDW_Pm) {
+    mt_writeMemDataToTBE;
+    pm_popMemQueue;
+  }
+
   transition(BS_PM, MemData, BS_Pm) {} {
     mt_writeMemDataToTBE;
     pm_popMemQueue;
@@ -1356,6 +1379,10 @@
     ptl_popTriggerQueue;
   }

+  transition(BDW_PM, L3Hit, BDW_Pm) {
+    ptl_popTriggerQueue;
+  }
+
   transition(BS_PM, L3Hit, BS_Pm) {} {
     ptl_popTriggerQueue;
   }
@@ -1376,6 +1403,15 @@
     pm_popMemQueue;
   }

+  transition(BDW_M, MemData, U) {
+    mt_writeMemDataToTBE;
+    da_sendResponseDmaAck;
+    wd_writeBackData;
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
+    pm_popMemQueue;
+  }
+
   transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
     mt_writeMemDataToTBE;
     s_sendResponseS;
@@ -1427,7 +1463,7 @@
     ptl_popTriggerQueue;
   }

- transition({BDR_PM, BS_PM, BDW_P, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { + transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) {
     y_writeProbeDataToTBE;
     x_decrementAcks;
     o_checkForCompletion;
@@ -1438,6 +1474,10 @@
     pt_popTriggerQueue;
   }

+  transition(BDW_PM, ProbeAcksComplete, BDW_M) {
+    pt_popTriggerQueue;
+  }
+
   transition(BS_PM, ProbeAcksComplete, BS_M) {} {
     sf_setForwardReqTime;
     pt_popTriggerQueue;
@@ -1453,7 +1493,8 @@
     pt_popTriggerQueue;
   }

-  transition(BDW_P, ProbeAcksComplete, U) {
+  transition(BDR_Pm, ProbeAcksComplete, U) {
+    dd_sendResponseDmaData;
// Check for pending requests from the core we put to sleep while waiting
     // for a response
     wada_wakeUpAllDependentsAddr;
@@ -1461,8 +1502,9 @@
     pt_popTriggerQueue;
   }

-  transition(BDR_Pm, ProbeAcksComplete, U) {
-    dd_sendResponseDmaData;
+  transition(BDW_Pm, ProbeAcksComplete, U) {
+    da_sendResponseDmaAck;
+    wd_writeBackData;
// Check for pending requests from the core we put to sleep while waiting
     // for a response
     wada_wakeUpAllDependentsAddr;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/57410
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I1eebb882b464c4c5ee5fd60932fd38d271ada4d7
Gerrit-Change-Number: 57410
Gerrit-PatchSet: 1
Gerrit-Owner: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to