Hello Tuan Ta,

I'd like you to do a code review. Please visit

    https://gem5-review.googlesource.com/c/public/gem5/+/32858

to review the following change.


Change subject: mem-ruby: fix integration issues between GCN3 and VIPER
......................................................................

mem-ruby: fix integration issues between GCN3 and VIPER

This patch changes how the coalescer supports two different cases:
(1) using real CUs and (2) using tester threads.

This patch keeps the implementation of writeCompleteCallback in
GPUCoalescer instead of placing it in the VIPER-specific coalescer.

Change-Id: I6d54795ac60d7aa020bc94b853dc3673f59762ba
---
M configs/example/ruby_gpu_random_test.py
M src/cpu/testers/gpu_ruby_test/AddressManager.cc
M src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
M src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
M src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
M src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/GPU_VIPER-msg.sm
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/VIPERCoalescer.cc
M src/mem/ruby/system/VIPERCoalescer.hh
12 files changed, 197 insertions(+), 143 deletions(-)



diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py
index d40a942..7e79d9e 100644
--- a/configs/example/ruby_gpu_random_test.py
+++ b/configs/example/ruby_gpu_random_test.py
@@ -98,7 +98,97 @@
 assert(options.num_compute_units >= 1)
 n_cu = options.num_compute_units

-options.num_sqc = int((n_cu + options.cu_per_sqc - 1) // options.cu_per_sqc)
+#
+# Set up system size - 3 options
+#
+if (options.system_size == 0):
+    # 1 CU, 1 CPU, 1 SQC, 1 Scalar
+    options.wf_size = 1
+    options.wavefronts_per_cu = 1
+    options.num_cpus = 1
+    options.cu_per_sqc = 1
+    options.cu_per_scalar_cache = 1
+    options.num_compute_units = 1
+elif (options.system_size == 1):
+    # 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
+    options.wf_size = 16
+    options.wavefronts_per_cu = 4
+    options.num_cpus = 4
+    options.cu_per_sqc = 4
+    options.cu_per_scalar_cache = 4
+    options.num_compute_units = 4
+elif (options.system_size == 2):
+    # 8 CUs, 4 CPUs, 1 SQCs, 1 Scalars
+    options.wf_size = 32
+    options.wavefronts_per_cu = 4
+    options.num_cpus = 4
+    options.cu_per_sqc = 4
+    options.cu_per_scalar_cache = 4
+    options.num_compute_units = 8
+else:
+ print("Error: option system size '%s' not recognized", options.system_size)
+    sys.exit(1)
+
+#
+# set address range - 2 options
+#   level 0: small
+#   level 1: large
+# each location corresponds to a 4-byte piece of data
+#
+options.mem_size = '1024MB'
+num_atomic_locs = 10
+num_regular_locs_per_atomic_loc = 10000
+if (options.address_range == 1):
+    num_atomic_locs = 100
+    num_regular_locs_per_atomic_loc = 100000
+elif (options.address_range != 0):
+    print("Error: option address_range '%s' not recognized", \
+              options.address_range)
+    sys.exit(1)
+
+#
+# set episode length (# of actions per episode) - 3 options
+#   0: 10 actions
+#   1: 100 actions
+#   2: 500 actions
+#
+eps_length = 10
+if (options.episode_length == 1):
+    eps_length = 100
+elif (options.episode_length == 2):
+    eps_length = 500
+elif (options.episode_length != 0):
+    print("Error: option episode_length '%s' not recognized",
+              options.episode_length)
+    sys.exit(1)
+
+# set the Ruby's and tester's deadlock thresholds
+# the Ruby's deadlock detection is the primary check for deadlock.
+# the tester's deadlock threshold detection is a secondary check for deadlock
+# if there is a bug in RubyPort that causes a packet not to return to the
+# tester properly, the tester will throw a deadlock exception.
+# we set cache_deadlock_threshold < tester_deadlock_threshold to detect
+# deadlock caused by Ruby protocol first before one caused by the coalescer
+options.cache_deadlock_threshold = 100000000
+tester_deadlock_threshold = 1000000000
+
+# for now, we're testing only GPU protocol, so we set num_cpus to 0
+options.num_cpus = 0
+# number of CPUs and CUs
+n_CPUs = options.num_cpus
+n_CUs = options.num_compute_units
+# set test length, i.e., number of episodes per wavefront * #WFs
+# test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
+n_WFs = n_CUs * options.wavefronts_per_cu
+max_episodes = options.test_length * n_WFs
+# number of SQC and Scalar caches
+assert(n_CUs % options.cu_per_sqc == 0)
+n_SQCs = int(n_CUs/options.cu_per_sqc)
+options.num_sqc = n_SQCs
+assert(n_CUs % options.cu_per_scalar_cache == 0)
+assert(options.cu_per_scalar_cache != 0)
+n_Scalars = int(n_CUs/options.cu_per_scalar_cache)
+options.num_scalar_cache = n_Scalars

 if args:
      print("Error: script doesn't take any positional arguments")
diff --git a/src/cpu/testers/gpu_ruby_test/AddressManager.cc b/src/cpu/testers/gpu_ruby_test/AddressManager.cc
index 6ef52fc..3d9b161 100644
--- a/src/cpu/testers/gpu_ruby_test/AddressManager.cc
+++ b/src/cpu/testers/gpu_ruby_test/AddressManager.cc
@@ -38,6 +38,7 @@
 #include <algorithm>

 #include "base/intmath.hh"
+#include "base/logging.hh"
 #include "base/random.hh"
 #include "base/trace.hh"

@@ -314,8 +315,8 @@
             // increment the location's number of owners
             loc_prop.second++;
         } else {
-            panic("Location in loadStoreMap but wasn't picked "
-                            "in any action\n");
+            panic("Location in loadStoreMap but wasn't picked in any"
+                            " action\n");
         }
     }

@@ -376,7 +377,14 @@
     ExpectedValueSet::iterator it = expectedValues.find(val);

     if (it == expectedValues.end()) {
-      return false;
+        std::stringstream exp_val_ss;
+        for (auto& val : expectedValues) {
+            exp_val_ss << " " << val;
+        }
+
+        warn("Expected return values are:\n\t%s\n", exp_val_ss.str());
+
+        return false;
     }

     // erase this value b/c it's done
diff --git a/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc b/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
index 699776c..822fb26 100644
--- a/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
+++ b/src/cpu/testers/gpu_ruby_test/GpuWavefront.cc
@@ -81,14 +81,9 @@
             // for now, assert address is 4-byte aligned
             assert(address % load_size == 0);

-            Request *req = new Request(0,                     // asid
- address, // virtual addr - load_size, // size in bytes
-                                       0,                     // flags
-                                       tester->masterId(),  // port id
-                                       0,                     // pc
-                                       threadId,           // thread_id
-                                       0);
+            auto req = std::make_shared<Request>(address, load_size,
+                                                 0, tester->masterId(),
+                                                 0, threadId, nullptr);
             req->setPaddr(address);
             req->setReqInstSeqNum(tester->getActionSeqNum());
             // set protocol-specific flags
@@ -103,7 +98,7 @@
             pendingLdStCount++;

             if (!port->sendTimingReq(pkt)) {
-                retryPkts.push_back(pkt);
+                panic("Not expected failed sendTimingReq\n");
             }

             // insert an outstanding load
@@ -138,14 +133,9 @@
                     curEpisode->getEpisodeId(), printAddress(address),
                     new_value);

-            Request *req = new Request(0,                     // asid
- address, // virtual addr - sizeof(Value), // size in bytes
-                                       0,                     // flags
-                                       tester->masterId(),  // port id
-                                       0,                     // pc
-                                       threadId,           // thread_id
-                                       0);
+            auto req = std::make_shared<Request>(address, sizeof(Value),
+                                                 0, tester->masterId(), 0,
+                                                 threadId, nullptr);
             req->setPaddr(address);
             req->setReqInstSeqNum(tester->getActionSeqNum());
             // set protocol-specific flags
@@ -163,7 +153,7 @@
             pendingLdStCount++;

             if (!port->sendTimingReq(pkt)) {
-                retryPkts.push_back(pkt);
+                panic("Not expecting a failed sendTimingReq\n");
             }

             // add an outstanding store
@@ -198,14 +188,11 @@

         // must be aligned with store size
         assert(address % sizeof(Value) == 0);
-        Request *req = new Request(0,                     // asid
-                                   address,               // virtual addr
-                                   sizeof(Value),         // size in bytes
-                                   flags,                 // flags
-                                   tester->masterId(),  // port id
-                                   0,                     // pc
-                                   threadId,           // thread_id
-                                   new AtomicOpInc<Value>());
+        AtomicOpFunctor *amo_op = new AtomicOpInc<Value>();
+        auto req = std::make_shared<Request>(address, sizeof(Value),
+                                             flags, tester->masterId(),
+                                             0, threadId,
+                                             AtomicOpFunctorPtr(amo_op));
         req->setPaddr(address);
         req->setReqInstSeqNum(tester->getActionSeqNum());
         // set protocol-specific flags
@@ -217,7 +204,7 @@
         pkt->senderState = new ProtocolTester::SenderState(this);

         if (!port->sendTimingReq(pkt)) {
-            retryPkts.push_back(pkt);
+            panic("Not expecting failed sendTimingReq\n");
         }

         // increment the number of outstanding atomic ops
@@ -241,14 +228,9 @@
     assert(pendingLdStCount == 0);
     assert(pendingAtomicCount == 0);

-    Request *acq_req = new Request(0,
-                                   0,                     // vaddr
-                                   0,                     // request size
-                                   0,                     // flags
-                                   tester->masterId(),
-                                   0,
-                                   threadId,
-                                   0);
+    auto acq_req = std::make_shared<Request>(0, 0, 0,
+                                             tester->masterId(), 0,
+                                             threadId, nullptr);
     acq_req->setPaddr(0);
     acq_req->setReqInstSeqNum(tester->getActionSeqNum());
     acq_req->setFlags(Request::ACQUIRE);
@@ -262,7 +244,7 @@
     pendingFenceCount++;

     if (!port->sendTimingReq(pkt)) {
-        retryPkts.push_back(pkt);
+        panic("Not expecting failed sendTimingReq\n");
     }
 }

@@ -287,7 +269,7 @@
 {
     assert(pkt);
     MemCmd resp_cmd = pkt->cmd;
-    Addr addr = pkt->getAddr();
+    Addr addr = (resp_cmd == MemCmd::MemSyncResp) ? 0 : pkt->getAddr();

     DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
                     "Addr %s\n", this->getName(),
@@ -362,7 +344,7 @@

         // this Atomic is done
         pendingAtomicCount--;
-    } else if (resp_cmd == MemCmd::MessageResp) {
+    } else if (resp_cmd == MemCmd::WriteCompleteResp) {
         // write completion ACK
         assert(pendingLdStCount > 0);
         assert(pendingAtomicCount == 0);
@@ -376,7 +358,6 @@
     if (isTransactionDone) {
         // no need to keep senderState and request around
         delete pkt->senderState;
-        delete pkt->req;
     }

     delete pkt;
@@ -391,17 +372,7 @@
 }

 void
-GpuWavefront::setExtraRequestFlags(Request* req)
+GpuWavefront::setExtraRequestFlags(RequestPtr req)
 {
     // No extra request flag is set
 }
-
-void
-GpuWavefront::retry()
-{
-    while (retryPkts.size() > 0) {
-        if (!port->sendTimingReq(retryPkts.front()))
-            break;
-        retryPkts.pop_front();
-    }
-}
diff --git a/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh b/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
index e17da0e..fa9528b 100644
--- a/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
+++ b/src/cpu/testers/gpu_ruby_test/GpuWavefront.hh
@@ -52,9 +52,6 @@

     virtual void hitCallback(PacketPtr pkt);

-    // retry failed packets
-    void retry();
-
   protected:
     void issueLoadOps();
     void issueStoreOps();
@@ -64,11 +61,10 @@
     virtual void issueAcquireOp();
     virtual void issueReleaseOp();
     // set extra request flags that is specific to a target protocol
-    virtual void setExtraRequestFlags(Request* req);
+    virtual void setExtraRequestFlags(RequestPtr req);

   protected:
     int cuId;    // compute unit associated with this wavefront
-    std::deque<PacketPtr> retryPkts;
 };

 #endif /* CPU_TESTERS_PROTOCOL_TESTER_GPUWAVEFRONT_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc b/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
index 1a9389b..8235b44 100644
--- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.cc
@@ -50,7 +50,7 @@

 ProtocolTester::ProtocolTester(const Params *p)
       : MemObject(p),
-        _masterId(p->system->getMasterId(this, name())),
+        _masterId(p->system->getMasterId(this)),
         numCpuPorts(p->port_cpu_ports_connection_count),
         numVectorPorts(p->port_cu_vector_ports_connection_count),
         numSqcPorts(p->port_cu_sqc_ports_connection_count),
@@ -190,16 +190,13 @@
     for (int cu_id = 0; cu_id < numCus; ++cu_id) {
         vectorPortId = cu_id;
         sqcPortId = cu_id/numCusPerSqc;
-        // no scalar port if 'numCusPerScalar' is '0'
-        if (numCusPerScalar != 0)
-            scalarPortId = cu_id/numCusPerScalar;
+        scalarPortId = cu_id/numCusPerScalar;

         for (int i = 0; i < numWfsPerCu; ++i) {
             wfId = cu_id * numWfsPerCu + i;
             wfs[wfId]->attachThreadToPorts(this,
static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
                            static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
-                           !numCusPerScalar ? nullptr :
static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
             wfs[wfId]->scheduleWakeup();
             wfs[wfId]->scheduleDeadlockCheckEvent();
@@ -207,13 +204,13 @@
     }
 }

-BaseMasterPort &
-ProtocolTester::getMasterPort(const std::string & if_name, PortID idx)
+Port&
+ProtocolTester::getPort(const std::string &if_name, PortID idx)
 {
     if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
         if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
         // pass along to super class
-        return MemObject::getMasterPort(if_name, idx);
+        return MemObject::getPort(if_name, idx);
     } else {
         if (if_name == "cpu_ports") {
             if (idx > numCpuPorts)
@@ -245,8 +242,7 @@
         if (!sentExitSignal) {
             // all done
             inform("Total completed episodes: %d\n", nextEpisodeId - 1);
-            inform("Protocol Test: Passed!\n");
-            exitSimLoop("ProtocolTester completed!");
+            exitSimLoop("GPU Ruby Tester: Passed!");
             sentExitSignal = true;
         }
         return true;
@@ -292,17 +288,9 @@
         ccprintf(*(logFile->stream()), "%s", ss.str());
         logFile->stream()->flush();

-        // exit the sim loop
-        exitSimLoop("GPU Ruby Tester: Failed!", -1);
         sentExitSignal = true;
-    }
-}
-
-void ProtocolTester::retry()
-{
-    // FIXME for now, just let all wavefronts retry
-    for (auto wf : wfs) {
-        wf->retry();
+        // terminate the simulation
+        panic("GPU Ruby Tester: Failed!\n");
     }
 }

@@ -319,12 +307,6 @@
     return true;
 }

-void
-ProtocolTester::SeqPort::recvReqRetry()
-{
-    tester->retry();
-}
-
 ProtocolTester*
 ProtocolTesterParams::create()
 {
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh b/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
index ba3fdac..2264d1c 100644
--- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.hh
@@ -75,16 +75,13 @@
       public:
SeqPort(const std::string &_name, ProtocolTester *_tester, PortID _id,
                 PortID _index)
-            : MasterPort(_name, _tester, _id),
-              tester(_tester)
+            : MasterPort(_name, _tester, _id)
         {}

       protected:
         virtual bool recvTimingResp(PacketPtr pkt);
-        virtual void recvReqRetry();
-
-      private:
-          ProtocolTester* tester;
+        virtual void recvReqRetry()
+            { panic("%s does not expect a retry\n", name()); }
     };

     struct SenderState : public Packet::SenderState
@@ -110,8 +107,8 @@

     void init();
     MasterID masterId() { return _masterId; };
-    virtual BaseMasterPort &getMasterPort(const std::string &if_name,
-                                          PortID idx = InvalidPortID);
+    Port& getPort(const std::string &if_name,
+                  PortID idx=InvalidPortID) override;

     int getEpisodeLength() const { return episodeLength; }
     // return pointer to the address manager
@@ -125,8 +122,6 @@
     int getNextEpisodeID() { return nextEpisodeId++; }
     // get action sequence number
     int getActionSeqNum() { return actionCount++; }
-    // retry all failed request pkts
-    void retry();

     // dump error log into a file and exit the simulation
     void dumpErrorLog(std::stringstream& ss);
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index aafe5a4..511d584 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -495,7 +495,7 @@
       assert(false);
     } else {
       peek(responseToTCP_in, ResponseMsg) {
-        coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
+ coalescer.writeCompleteCallback(address, in_msg.instSeqNum, MachineType:L1Cache);
       }
     }
   }
diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm
index 6011748..04172ed 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm
@@ -44,6 +44,7 @@
                      Cycles, Cycles, Cycles);
   void writeCallback(Addr, MachineType, DataBlock,
                      Cycles, Cycles, Cycles, bool);
+  void writeCompleteCallback(Addr, uint64_t, MachineType);
   void evictionCallback(Addr);
   void recordCPReadCallBack(MachineID, MachineID);
   void recordCPWriteCallBack(MachineID, MachineID);
@@ -64,6 +65,6 @@
                      Cycles, Cycles, Cycles, bool);
   void atomicCallback(Addr, MachineType, DataBlock);
   void invTCPCallback(Addr);
-  void writeCompleteCallback(Addr, uint64_t);
+  void writeCompleteCallback(Addr, uint64_t, MachineType);
   void evictionCallback(Addr);
 }
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index 80bc19a..1a44e55 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -671,22 +671,8 @@
             // instruction.
             RubyPort::MemSlavePort* mem_slave_port = ss->port;

-            GPUDynInstPtr gpuDynInst = nullptr;
-
-            if (!m_usingRubyTester) {
-                // If this coalescer is connected to a real CU, we need
-                // to save the corresponding gpu dynamic instruction.
-                // CU will use that instruction to decrement wait counters
-                // in the issuing wavefront.
-                // For Ruby tester, gpuDynInst == nullptr
-                ComputeUnit::DataPort::SenderState* cu_state =
-                    safe_cast<ComputeUnit::DataPort::SenderState*>
-                        (ss->predecessor);
-                gpuDynInst = cu_state->_gpuDynInst;
-            }
-
             PendingWriteInst& inst = pendingWriteInsts[seqNum];
- inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
+            inst.addPendingReq(mem_slave_port, pkt, m_usingRubyTester);
         }

         return true;
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 401f70b..ca53c7f 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -38,6 +38,7 @@
 #include <unordered_map>

 #include "base/statistics.hh"
+#include "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/misc.hh"
 #include "mem/request.hh"
@@ -127,6 +128,7 @@
   public:
     PendingWriteInst()
         : numPendingStores(0),
+          numRequiredWriteCompleteAcks(0),
           originalPort(nullptr),
           gpuDynInstPtr(nullptr)
     {}
@@ -135,14 +137,35 @@
     {}

     void
-    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
+    addPendingReq(RubyPort::MemSlavePort* port, PacketPtr pkt,
                   bool usingRubyTester)
     {
         assert(port);
         originalPort = port;

-        if (!usingRubyTester) {
-            gpuDynInstPtr = inst;
+        RubyPort::SenderState* ss =
+                safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+        if (usingRubyTester) {
+            // If this coalescer is connected to a tester thread, we need
+            // to save the corresponding requesting thread.
+            // get the requesting thread from the original sender state
+            ProtocolTester::SenderState* senderState =
+                            safe_cast<ProtocolTester::SenderState*>
+                                (ss->predecessor);
+            testerThreadPtr = senderState->th;
+            numRequiredWriteCompleteAcks++;
+        } else {
+            // If this coalescer is connected to a real CU, we need
+            // to save the corresponding gpu dynamic instruction.
+            // CU will use that instruction to decrement wait counters
+            // in the issuing wavefront.
+            // For Ruby tester, gpuDynInst == nullptr
+            ComputeUnit::DataPort::SenderState* cu_state =
+                safe_cast<ComputeUnit::DataPort::SenderState*>
+                    (ss->predecessor);
+            gpuDynInstPtr = cu_state->_gpuDynInst;
+            numRequiredWriteCompleteAcks = 1;
         }

         numPendingStores++;
@@ -162,21 +185,29 @@
     ackWriteCompletion(bool usingRubyTester)
     {
         assert(numPendingStores == 0);
+        assert(numRequiredWriteCompleteAcks > 0);

-        // make a response packet
-        PacketPtr pkt = new Packet(std::make_shared<Request>(),
-                                   MemCmd::WriteCompleteResp);
+        for (int i = 0; i < numRequiredWriteCompleteAcks; ++i) {
+            // make a response packet
+            PacketPtr pkt = new Packet(std::make_shared<Request>(),
+                                       MemCmd::WriteCompleteResp);

-        if (!usingRubyTester) {
-            assert(gpuDynInstPtr);
-            ComputeUnit::DataPort::SenderState* ss =
-                    new ComputeUnit::DataPort::SenderState
-                                            (gpuDynInstPtr, 0, nullptr);
-            pkt->senderState = ss;
+            if (usingRubyTester) {
+                assert(testerThreadPtr);
+                ProtocolTester::SenderState* ss =
+                        new ProtocolTester::SenderState(testerThreadPtr);
+                pkt->senderState = ss;
+            } else {
+                assert(gpuDynInstPtr);
+                ComputeUnit::DataPort::SenderState* ss =
+                        new ComputeUnit::DataPort::SenderState
+ (gpuDynInstPtr, 0, nullptr);
+                pkt->senderState = ss;
+            }
+
+            // send the ack response to the requester
+            originalPort->sendTimingResp(pkt);
         }
-
-        // send the ack response to the requester
-        originalPort->sendTimingResp(pkt);
     }

     int
@@ -187,6 +218,11 @@
   private:
     // the number of stores waiting for writeCompleteCallback
     int numPendingStores;
+    // number of write-complete acks required by the requestor
+    // if the requestor is a CU, this number is always 1
+ // if the requestor is a tester thread, this number is equal to the number
+    // of store requests sent to the coalescer
+    int numRequiredWriteCompleteAcks;
     // The original port that sent one of packets associated with this
     // write instruction. We may have more than one packet per instruction,
     // which implies multiple ports per instruction. However, we need
@@ -196,6 +232,8 @@
     // similar to the originalPort, this gpuDynInstPtr is set only for
     // the first packet of this instruction.
     GPUDynInstPtr gpuDynInstPtr;
+ // if protocol tester is used, this points to the requesting tester thread
+    Thread* testerThreadPtr;
 };

 class GPUCoalescer : public RubyPort
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index eafce6d..a21cf33 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -180,9 +180,9 @@
                               dataBlock, crequest->getSeqNum());
     }

-    if (pkt->cmd == MemCmd::WriteReq) {
-        makeWriteCompletePkts(crequest);
-    }
+//    if (pkt->cmd == MemCmd::WriteReq) {
+//        makeWriteCompletePkts(crequest);
+//    }

     DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
              curTick(), m_version, "Coal", "Begin", "", "",
diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh
index 2f68c10..c83bda0 100644
--- a/src/mem/ruby/system/VIPERCoalescer.hh
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -57,7 +57,6 @@
     typedef VIPERCoalescerParams Params;
     VIPERCoalescer(const Params *);
     ~VIPERCoalescer();
-    void writeCompleteCallback(Addr address, uint64_t instSeqNum);
     void invTCPCallback(Addr address);
     RequestStatus makeRequest(PacketPtr pkt) override;
     void issueRequest(CoalescedRequest* crequest) override;
@@ -65,23 +64,11 @@
   private:
     void invTCP();

- // make write-complete response packets from original write request packets
-    void makeWriteCompletePkts(CoalescedRequest* crequest);
-
     // current cache invalidation packet
     // nullptr if there is no active cache invalidation request
     PacketPtr m_cache_inv_pkt;

     // number of remaining cache lines to be invalidated in TCP
     int m_num_pending_invs;
-
-    // a map of instruction sequence number and corresponding pending
-    // write-complete response packets. Each write-complete response
-    // corresponds to a pending store request that is waiting for
- // writeCompleteCallback. We may have multiple pending store requests per - // wavefront at a time. Each time writeCompleteCallback is called, an entry
-    // with a corresponding seqNum is popped off from map and returned to
-    // compute unit.
- std::unordered_map<uint64_t, std::vector<PacketPtr>> m_writeCompletePktMap;
 };
 #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/32858
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6d54795ac60d7aa020bc94b853dc3673f59762ba
Gerrit-Change-Number: 32858
Gerrit-PatchSet: 1
Gerrit-Owner: Bradford Beckmann <[email protected]>
Gerrit-Reviewer: Tuan Ta <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to