Pouya Fotouhi has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/28411 )

Change subject: mem-ruby: Getting rid of HSA segment and scope
......................................................................

mem-ruby: Getting rid of HSA segment and scope

These are the protocol and sequencer parts of the bigger GCN3 change.

Author: Tony Gutierrez <anthony.gutier...@amd.com>
Change-Id: I803b4cbb46eeab8462d9af80dd003940a9968b60
---
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_Exports.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/GPUCoalescer.py
M src/mem/ruby/system/VIPERCoalescer.py
8 files changed, 285 insertions(+), 399 deletions(-)



diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 9dffe0f..4047dc6 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -298,9 +298,7 @@
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
           } else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
-              if (in_msg.segment == HSASegment:SPILL) {
- trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
-              } else if (WB) {
+              if (WB) {
                 trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
               } else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index a66939c..6d04c76 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -137,7 +137,6 @@
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
   WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
-  HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
   int wfid,                         default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
   int ProgramCounter,               desc="PC that accesses to this block";
diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm
index 8e17f98..fe2c83b 100644
--- a/src/mem/ruby/protocol/RubySlicc_Exports.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm
@@ -91,26 +91,6 @@
   NotPresent, desc="block is NotPresent";
   Busy,       desc="block is in a transient state, currently invalid";
 }
-//HSA scopes
-enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
-  UNSPECIFIED, desc="Unspecified scope";
-  NOSCOPE,     desc="Explictly unscoped";
-  WAVEFRONT,   desc="Wavefront scope";
-  WORKGROUP,   desc="Workgroup scope";
-  DEVICE,      desc="Device scope";
-  SYSTEM,      desc="System scope";
-}
-
-// HSA segment types
-enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
-  GLOBAL,   desc="Global segment";
-  GROUP,    desc="Group segment";
-  PRIVATE,  desc="Private segment";
-  KERNARG,  desc="Kernarg segment";
-  READONLY, desc="Readonly segment";
-  SPILL,    desc="Spill segment";
-  ARG,      desc="Arg segment";
-}

 // TesterStatus
 enumeration(TesterStatus, desc="...") {
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index fd76289..95a093a 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -169,8 +169,6 @@
   WriteMask writeMask,       desc="Writethrough mask";
   DataBlock WTData,          desc="Writethrough data block";
   int wfid,                  desc="Writethrough wavefront";
-  HSAScope scope,            desc="HSA scope";
-  HSASegment segment,        desc="HSA segment";
   PacketPtr pkt,             desc="Packet associated with this request";
 }

diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index 4cea30f..19b50ea 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -63,58 +63,6 @@

 using namespace std;

-GPUCoalescer *
-RubyGPUCoalescerParams::create()
-{
-    return new GPUCoalescer(this);
-}
-
-HSAScope
-reqScopeToHSAScope(const RequestPtr &req)
-{
-    HSAScope accessScope = HSAScope_UNSPECIFIED;
-    if (req->isScoped()) {
-        if (req->isWavefrontScope()) {
-            accessScope = HSAScope_WAVEFRONT;
-        } else if (req->isWorkgroupScope()) {
-            accessScope = HSAScope_WORKGROUP;
-        } else if (req->isDeviceScope()) {
-            accessScope = HSAScope_DEVICE;
-        } else if (req->isSystemScope()) {
-            accessScope = HSAScope_SYSTEM;
-        } else {
-            fatal("Bad scope type");
-        }
-    }
-    return accessScope;
-}
-
-HSASegment
-reqSegmentToHSASegment(const RequestPtr &req)
-{
-    HSASegment accessSegment = HSASegment_GLOBAL;
-
-    if (req->isGlobalSegment()) {
-        accessSegment = HSASegment_GLOBAL;
-    } else if (req->isGroupSegment()) {
-        accessSegment = HSASegment_GROUP;
-    } else if (req->isPrivateSegment()) {
-        accessSegment = HSASegment_PRIVATE;
-    } else if (req->isKernargSegment()) {
-        accessSegment = HSASegment_KERNARG;
-    } else if (req->isReadonlySegment()) {
-        accessSegment = HSASegment_READONLY;
-    } else if (req->isSpillSegment()) {
-        accessSegment = HSASegment_SPILL;
-    } else if (req->isArgSegment()) {
-        accessSegment = HSASegment_ARG;
-    } else {
-        fatal("Bad segment type");
-    }
-
-    return accessSegment;
-}
-
 UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
     : coalescer(gc)
 {
@@ -154,6 +102,7 @@
 {
     for (auto iter = instMap.begin(); iter != instMap.end(); ) {
         if (iter->second.empty()) {
+ DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
             instMap.erase(iter++);
             coalescer->getMemSlavePort(0)->sendTokens(1);
         } else {
@@ -162,15 +111,27 @@
     }
 }

+bool
+UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
+ // iterate the instructions held in UncoalescedTable to see whether there
+    // are more requests to issue; if yes, not yet done; otherwise, done
+    for (auto& inst : instMap) {
+        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
+            ,inst.first, inst.second.size());
+        if (inst.first == instSeqNum) { return false; }
+    }
+
+    return true;
+}
+
 void
 UncoalescedTable::printRequestTable(std::stringstream& ss)
 {
-    ss << "UncoalescedTable contains " << instMap.size()
-       << " address entries." << std::endl;
+ ss << "Listing pending packets from " << instMap.size() << " instructions";
+
     for (auto& inst : instMap) {
-        ss << "Addr 0x" << std::hex << inst.first << std::dec
-           << " with " << inst.second.size() << " packets"
-           << std::endl;
+        ss << "\tAddr: " << printAddress(inst.first) << " with "
+           << inst.second.size() << " pending packets" << std::endl;
     }
 }

@@ -229,7 +190,6 @@
     assert(m_dataCache_ptr);

     m_runningGarnetStandalone = p->garnet_standalone;
-    assumingRfOCoherence = p->assume_rfo;
 }

 GPUCoalescer::~GPUCoalescer()
@@ -245,18 +205,9 @@
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                 std::stringstream ss;
                 printRequestTable(ss);
-                ss << "Outstanding requests: " << m_outstanding_count
-                   << std::endl;
-
-                panic("Possible Deadlock detected. Aborting!\n"
-                     "version: %d request.paddr: 0x%x coalescedTable: %d "
-                     "current time: %u issue_time: %d difference: %d\n"
-                     "Request Tables:\n %s", m_version,
-                      req->getFirstPkt()->getAddr(),
-                      coalescedTable.size(), cyclesToTicks(current_time),
-                      cyclesToTicks(req->getIssueTime()),
-                      cyclesToTicks(current_time - req->getIssueTime()),
-                      ss.str());
+                warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
+                     m_version, ss.str());
+                panic("Aborting due to deadlock!\n");
             }
         }
     }
@@ -274,21 +225,27 @@
 void
 GPUCoalescer::printRequestTable(std::stringstream& ss)
 {
-    uncoalescedTable.printRequestTable(ss);
+    ss << "Printing out " << coalescedTable.size()
+       << " outstanding requests in the coalesced table\n";

-    ss << "CoalescedTable contains " << coalescedTable.size()
-       << " address entries." << std::endl;
     for (auto& requestList : coalescedTable) {
-        ss << "Addr 0x" << std::hex << requestList.first << std::dec
-           << ": type-";
         for (auto& request : requestList.second) {
-            ss << RubyRequestType_to_string(request->getRubyType())
-               << " pkts-" << request->getPackets().size()
-               << " issued-" << request->getIssueTime() << " seqNum-"
-               << request->getSeqNum() << "; ";
+            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
+               << "\tInstruction sequence number: "
+               << request->getSeqNum() << "\n"
+               << "\t\tType: "
+               << RubyRequestType_to_string(request->getRubyType()) << "\n"
+               << "\t\tNumber of associated packets: "
+               << request->getPackets().size() << "\n"
+               << "\t\tIssue time: "
+               << request->getIssueTime() * clockPeriod() << "\n"
+               << "\t\tDifference from current tick: "
+               << (curCycle() - request->getIssueTime()) * clockPeriod();
         }
-        ss << std::endl;
     }
+
+    // print out packets waiting to be issued in uncoalesced table
+    uncoalescedTable.printRequestTable(ss);
 }

 void
@@ -378,6 +335,7 @@
     hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                 forwardRequestTime, firstResponseTime, isRegion);

+    // remove this crequest in coalescedTable
     delete crequest;
     coalescedTable.at(address).pop_front();

@@ -390,6 +348,36 @@
 }

 void
+GPUCoalescer::writeCompleteCallback(Addr address,
+                                    uint64_t instSeqNum,
+                                    MachineType mach)
+{
+    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
+            " instSeqNum = %d\n", address, instSeqNum);
+
+    assert(pendingWriteInsts.count(instSeqNum) == 1);
+    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
+
+    // check the uncoalescedTable to see whether all requests for the inst
+    // have been issued or not
+    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
+    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
+                    "reqsAllIssued=%d\n", instSeqNum,
+                    inst.getNumPendingStores()-1, reqsAllIssued);
+
+    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
+        // if the pending write instruction has received all write completion
+        // callbacks for its issued Ruby requests, we can now respond to
+        // the requesting CU in one response packet.
+        inst.ackWriteCompletion(m_usingRubyTester);
+
+        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
+                instSeqNum);
+        pendingWriteInsts.erase(instSeqNum);
+    }
+}
+
+void
 GPUCoalescer::readCallback(Addr address, DataBlock& data)
 {
     readCallback(address, MachineType_NULL, data);
@@ -468,7 +456,7 @@
 {
     PacketPtr pkt = crequest->getFirstPkt();
     Addr request_address = pkt->getAddr();
-    Addr request_line_address = makeLineAddress(request_address);
+ Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);

     RubyRequestType type = crequest->getRubyType();

@@ -507,20 +495,6 @@
                     "%s\n",
                     RubyRequestType_to_string(type));
         }
-
-        // If using the RubyTester, update the RubyTester sender state's
-        // subBlock with the recieved data.  The tester will later access
-        // this state.
-        // Note: RubyPort will access it's sender state before the
-        // RubyTester.
-        if (m_usingRubyTester) {
-            RubyPort::SenderState *requestSenderState =
-                safe_cast<RubyPort::SenderState*>(pkt->senderState);
-            RubyTester::SenderState* testerSenderState =
-                safe_cast<RubyTester::SenderState*>
-                    (requestSenderState->predecessor);
-            testerSenderState->subBlock.mergeFrom(data);
-        }
     }


@@ -557,8 +531,6 @@
     } else if (pkt->isWrite()) {
         req_type = RubyRequestType_ST;
     } else {
-        // Acquire and release packets will have been issued by
-        // makeRequest, so we do not need to check for it here.
         panic("Unsupported ruby packet type\n");
     }

@@ -570,71 +542,43 @@
 RequestStatus
 GPUCoalescer::makeRequest(PacketPtr pkt)
 {
-    // Check for GPU Barrier Kernel End or Kernel Begin
-    // Leave these to be handled by the child class
-    // Kernel End/Barrier = isFlush + isRelease
-    // Kernel Begin = isFlush + isAcquire
-    if (pkt->req->isKernel()) {
-        if (pkt->req->isAcquire()){
-            // This is a Kernel Begin leave handling to
-            // virtual xCoalescer::makeRequest
-            return RequestStatus_Issued;
-        }else if (pkt->req->isRelease()) {
-            // This is a Kernel End leave handling to
-            // virtual xCoalescer::makeRequest
-            // If we are here then we didn't call
-            // a virtual version of this function
-            // so we will also schedule the callback
-            int wf_id = 0;
-            if (pkt->req->hasContextId()) {
-                wf_id = pkt->req->contextId();
-            }
-            insertKernel(wf_id, pkt);
-            newKernelEnds.push_back(wf_id);
-            if (!issueEvent.scheduled()) {
-                schedule(issueEvent, curTick());
-            }
-            return RequestStatus_Issued;
+    // all packets must have valid instruction sequence numbers
+    assert(pkt->req->hasInstSeqNum());
+
+    if (pkt->cmd == MemCmd::MemSyncReq) {
+        // issue mem_sync requests immediately to the cache system without
+        // going through uncoalescedTable like normal LD/ST/Atomic requests
+        issueMemSyncRequest(pkt);
+    } else {
+        // otherwise, this must be either read or write command
+        assert(pkt->isRead() || pkt->isWrite());
+
+        // the pkt is temporarily stored in the uncoalesced table until
+        // it's picked for coalescing process later in this cycle or in a
+        // future cycle
+        uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
+                pkt->getAddr());
+
+        // we schedule an issue event here to process the uncoalesced table
+        // and try to issue Ruby request to cache system
+        if (!issueEvent.scheduled()) {
+            schedule(issueEvent, curTick());
         }
     }

-    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
-        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
-        (pkt->req->isRelease() || pkt->req->isAcquire())) {
-        if (assumingRfOCoherence) {
-            // If we reached here, this request must be a memFence
-            // and the protocol implements RfO, the coalescer can
-            // assume sequentially consistency and schedule the callback
-            // immediately.
-            // Currently the code implements fence callbacks
-            // by reusing the mechanism for kernel completions.
-            // This should be fixed.
-            int wf_id = 0;
-            if (pkt->req->hasContextId()) {
-                wf_id = pkt->req->contextId();
-            }
-            insertKernel(wf_id, pkt);
-            newKernelEnds.push_back(wf_id);
-            if (!issueEvent.scheduled()) {
-                schedule(issueEvent, curTick());
-            }
-            return RequestStatus_Issued;
-        } else {
-            // If not RfO, return issued here and let the child coalescer
-            // take care of it.
-            return RequestStatus_Issued;
-        }
-    }
-
-    uncoalescedTable.insertPacket(pkt);
-    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
-
-    if (!issueEvent.scheduled())
-        schedule(issueEvent, curTick());
-    // TODO: issue hardware prefetches here
+    // we always return RequestStatus_Issued in this coalescer
+    // b/c the coalescer's resource was checked earlier and the coalescer is
+    // queueing up aliased requests in its coalesced table
     return RequestStatus_Issued;
 }

+/**
+ * TODO: Figure out what to do with this code. This code may go away
+ *       and/or be merged into the VIPER coalescer once the VIPER
+ *       protocol is re-integrated with GCN3 codes.
+ */
+/*
 void
 GPUCoalescer::issueRequest(CoalescedRequest* crequest)
 {
@@ -728,7 +672,7 @@

     assert(m_mandatory_q_ptr);
     m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
-}
+}*/

 template <class KEY, class VALUE>
 std::ostream &
@@ -758,12 +702,6 @@
 {
 }

-void
-GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
-    DPRINTF(RubyStats, "Recorded statistic: %s\n",
-            SequencerRequestType_to_string(requestType));
-}
-
 bool
 GPUCoalescer::coalescePacket(PacketPtr pkt)
 {
@@ -817,6 +755,41 @@
         // be counted as outstanding requests.
         m_outstanding_count++;

+ // We track all issued or to-be-issued Ruby requests associated with
+        // write instructions. An instruction may have multiple Ruby
+        // requests.
+        if (pkt->cmd == MemCmd::WriteReq) {
+            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
+                    " the pending write instruction list\n", seqNum,
+                    line_addr);
+
+            RubyPort::SenderState* ss =
+                    safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+            // we need to save this port because it will be used to call
+            // back the requesting CU when we receive write
+            // complete callbacks for all issued Ruby requests of this
+            // instruction.
+            RubyPort::MemSlavePort* mem_slave_port = ss->port;
+
+            GPUDynInstPtr gpuDynInst = nullptr;
+
+            if (!m_usingRubyTester) {
+                // If this coalescer is connected to a real CU, we need
+                // to save the corresponding gpu dynamic instruction.
+                // CU will use that instruction to decrement wait counters
+                // in the issuing wavefront.
+                // For Ruby tester, gpuDynInst == nullptr
+                ComputeUnit::DataPort::SenderState* cu_state =
+                    safe_cast<ComputeUnit::DataPort::SenderState*>
+                        (ss->predecessor);
+                gpuDynInst = cu_state->_gpuDynInst;
+            }
+
+            PendingWriteInst& inst = pendingWriteInsts[seqNum];
+ inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
+        }
+
         return true;
     }

@@ -906,34 +879,6 @@
 }

 void
-GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
-{
-    if (myMachID == senderMachID) {
-        CP_TCPLdHits++;
-    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
-        CP_TCPLdTransfers++;
-    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
-        CP_TCCLdHits++;
-    } else {
-        CP_LdMiss++;
-    }
-}
-
-void
-GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
-{
-    if (myMachID == senderMachID) {
-        CP_TCPStHits++;
-    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
-        CP_TCPStTransfers++;
-    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
-        CP_TCCStHits++;
-    } else {
-        CP_StMiss++;
-    }
-}
-
-void
 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
 {
     for (auto& pkt : mylist) {
@@ -968,74 +913,6 @@
                                 Cycles firstResponseTime,
                                 bool success, bool isRegion)
 {
-    RubyRequestType type = crequest->getRubyType();
-    Cycles issued_time = crequest->getIssueTime();
-    Cycles completion_time = curCycle();
-    assert(completion_time >= issued_time);
-    Cycles total_lat = completion_time - issued_time;
-
-    // cache stats (valid for RfO protocol only)
-    if (mach == MachineType_TCP) {
-        if (type == RubyRequestType_LD) {
-            GPU_TCPLdHits++;
-        } else {
-            GPU_TCPStHits++;
-        }
-    } else if (mach == MachineType_L1Cache_wCC) {
-        if (type == RubyRequestType_LD) {
-            GPU_TCPLdTransfers++;
-        } else {
-            GPU_TCPStTransfers++;
-        }
-    } else if (mach == MachineType_TCC) {
-        if (type == RubyRequestType_LD) {
-            GPU_TCCLdHits++;
-        } else {
-            GPU_TCCStHits++;
-        }
-    } else  {
-        if (type == RubyRequestType_LD) {
-            GPU_LdMiss++;
-        } else {
-            GPU_StMiss++;
-        }
-    }
-
-    // Profile all access latency, even zero latency accesses
-    m_latencyHist.sample(total_lat);
-    m_typeLatencyHist[type]->sample(total_lat);
-
-    // Profile the miss latency for all non-zero demand misses
-    if (total_lat != Cycles(0)) {
-        m_missLatencyHist.sample(total_lat);
-        m_missTypeLatencyHist[type]->sample(total_lat);
-
-        if (mach != MachineType_NUM) {
-            m_missMachLatencyHist[mach]->sample(total_lat);
-            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
-
-            if ((issued_time <= initialRequestTime) &&
-                (initialRequestTime <= forwardRequestTime) &&
-                (forwardRequestTime <= firstResponseTime) &&
-                (firstResponseTime <= completion_time)) {
-
-                m_IssueToInitialDelayHist[mach]->sample(
-                    initialRequestTime - issued_time);
-                m_InitialToForwardDelayHist[mach]->sample(
-                    forwardRequestTime - initialRequestTime);
-                m_ForwardToFirstResponseDelayHist[mach]->sample(
-                    firstResponseTime - forwardRequestTime);
-                m_FirstResponseToCompletionDelayHist[mach]->sample(
-                    completion_time - firstResponseTime);
-            }
-        }
-
-    }
-
-    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
-             curTick(), m_version, "Coal",
-             success ? "Done" : "SC_Failed", "", "",
-             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
 }

 void
@@ -1083,74 +960,4 @@
             m_missTypeMachLatencyHist[i][j]->init(10);
         }
     }
-
-    // GPU cache stats
-    GPU_TCPLdHits
-        .name(name() + ".gpu_tcp_ld_hits")
-        .desc("loads that hit in the TCP")
-        ;
-    GPU_TCPLdTransfers
-        .name(name() + ".gpu_tcp_ld_transfers")
-        .desc("TCP to TCP load transfers")
-        ;
-    GPU_TCCLdHits
-        .name(name() + ".gpu_tcc_ld_hits")
-        .desc("loads that hit in the TCC")
-        ;
-    GPU_LdMiss
-        .name(name() + ".gpu_ld_misses")
-        .desc("loads that miss in the GPU")
-        ;
-
-    GPU_TCPStHits
-        .name(name() + ".gpu_tcp_st_hits")
-        .desc("stores that hit in the TCP")
-        ;
-    GPU_TCPStTransfers
-        .name(name() + ".gpu_tcp_st_transfers")
-        .desc("TCP to TCP store transfers")
-        ;
-    GPU_TCCStHits
-        .name(name() + ".gpu_tcc_st_hits")
-        .desc("stores that hit in the TCC")
-        ;
-    GPU_StMiss
-        .name(name() + ".gpu_st_misses")
-        .desc("stores that miss in the GPU")
-        ;
-
-    // CP cache stats
-    CP_TCPLdHits
-        .name(name() + ".cp_tcp_ld_hits")
-        .desc("loads that hit in the TCP")
-        ;
-    CP_TCPLdTransfers
-        .name(name() + ".cp_tcp_ld_transfers")
-        .desc("TCP to TCP load transfers")
-        ;
-    CP_TCCLdHits
-        .name(name() + ".cp_tcc_ld_hits")
-        .desc("loads that hit in the TCC")
-        ;
-    CP_LdMiss
-        .name(name() + ".cp_ld_misses")
-        .desc("loads that miss in the GPU")
-        ;
-
-    CP_TCPStHits
-        .name(name() + ".cp_tcp_st_hits")
-        .desc("stores that hit in the TCP")
-        ;
-    CP_TCPStTransfers
-        .name(name() + ".cp_tcp_st_transfers")
-        .desc("TCP to TCP store transfers")
-        ;
-    CP_TCCStHits
-        .name(name() + ".cp_tcc_st_hits")
-        .desc("stores that hit in the TCC")
-        ;
-    CP_StMiss
-        .name(name() + ".cp_st_misses")
-        .desc("stores that miss in the GPU")
-        ;
 }
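
The per-instruction write tracking added above in GPUCoalescer.cc can be summarized with the following minimal, self-contained sketch. All names are simplified stand-ins rather than the gem5 classes, and the areRequestsDone() check against the uncoalesced table is omitted for brevity; the point is only that the requester is acked once the last outstanding Ruby request for a given instruction sequence number reports completion.

// write_complete_sketch.cc -- illustrative stand-in, not gem5 code.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>

struct PendingWrite
{
    int numPendingStores = 0;    // Ruby requests still awaiting completion

    void addPendingReq() { ++numPendingStores; }

    // Returns true when the last outstanding request has completed.
    bool receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        return --numPendingStores == 0;
    }
};

class CoalescerSketch
{
  public:
    // Called whenever a Ruby request belonging to a write instruction
    // (identified by its sequence number) is issued.
    void issueWriteRequest(uint64_t instSeqNum)
    {
        pendingWrites[instSeqNum].addPendingReq();
    }

    // Rough analogue of writeCompleteCallback: the requester is acked
    // only when every issued request for this instruction is done.
    void writeComplete(uint64_t instSeqNum)
    {
        auto it = pendingWrites.find(instSeqNum);
        assert(it != pendingWrites.end());
        if (it->second.receiveWriteCompleteAck()) {
            std::cout << "write inst " << instSeqNum
                      << " completed at coalescer\n";
            pendingWrites.erase(it);
        }
    }

  private:
    std::unordered_map<uint64_t, PendingWrite> pendingWrites;
};

int main()
{
    CoalescerSketch c;
    // One store instruction (seqNum 42) coalesced into three Ruby requests.
    for (int i = 0; i < 3; ++i)
        c.issueWriteRequest(42);
    c.writeComplete(42);    // still 2 pending, no ack yet
    c.writeComplete(42);    // still 1 pending
    c.writeComplete(42);    // last ack -> instruction reported complete
    return 0;
}

The real PendingWriteInst additionally stores the requesting port and the GPUDynInstPtr so the final ack can be routed back to the CU; the sketch only models the counting.
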
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 32b3af4..31b6bfa 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -40,11 +40,11 @@
 #include <unordered_map>

 #include "base/statistics.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
 #include "mem/request.hh"
 #include "mem/ruby/common/Address.hh"
 #include "mem/ruby/common/Consumer.hh"
-#include "mem/ruby/protocol/HSAScope.hh"
-#include "mem/ruby/protocol/HSASegment.hh"
 #include "mem/ruby/protocol/PrefetchBit.hh"
 #include "mem/ruby/protocol/RubyAccessMode.hh"
 #include "mem/ruby/protocol/RubyRequestType.hh"
@@ -58,9 +58,6 @@

 class RubyGPUCoalescerParams;

-HSAScope reqScopeToHSAScope(const RequestPtr &req);
-HSASegment reqSegmentToHSASegment(const RequestPtr &req);
-
 // List of packets that belongs to a specific instruction.
 typedef std::list<PacketPtr> PerInstPackets;

@@ -79,6 +76,7 @@
     // instructions at the offset.
     PerInstPackets* getInstPackets(int offset);
     void updateResources();
+    bool areRequestsDone(const uint64_t instSeqNum);

     // Check if a packet hasn't been removed from instMap in too long.
     // Panics if a deadlock is detected and returns nothing otherwise.
@@ -121,6 +119,85 @@
     std::vector<PacketPtr> pkts;
 };

+// PendingWriteInst tracks the number of outstanding Ruby requests
+// per write instruction. Once all requests associated with one instruction
+// are completely done in Ruby, we call back the requester to mark
+// that this instruction is complete.
+class PendingWriteInst
+{
+  public:
+    PendingWriteInst()
+        : numPendingStores(0),
+          originalPort(nullptr),
+          gpuDynInstPtr(nullptr)
+    {}
+
+    ~PendingWriteInst()
+    {}
+
+    void
+    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
+                  bool usingRubyTester)
+    {
+        assert(port);
+        originalPort = port;
+
+        if (!usingRubyTester) {
+            gpuDynInstPtr = inst;
+        }
+
+        numPendingStores++;
+    }
+
+    // return true if no more ack is expected
+    bool
+    receiveWriteCompleteAck()
+    {
+        assert(numPendingStores > 0);
+        numPendingStores--;
+        return numPendingStores == 0;
+    }
+
+    // ack the original requester that this write instruction is complete
+    void
+    ackWriteCompletion(bool usingRubyTester)
+    {
+        assert(numPendingStores == 0);
+
+        // make a response packet
+        PacketPtr pkt = new Packet(new Request(), MemCmd::MessageResp);
+
+        if (!usingRubyTester) {
+            assert(gpuDynInstPtr);
+            ComputeUnit::DataPort::SenderState* ss =
+                    new ComputeUnit::DataPort::SenderState
+                                            (gpuDynInstPtr, 0, nullptr);
+            pkt->senderState = ss;
+        }
+
+        // send the ack response to the requester
+        originalPort->sendTimingResp(pkt);
+    }
+
+    int
+    getNumPendingStores() {
+        return numPendingStores;
+    }
+
+  private:
+    // the number of stores waiting for writeCompleteCallback
+    int numPendingStores;
+    // The original port that sent one of packets associated with this
+    // write instruction. We may have more than one packet per instruction,
+    // which implies multiple ports per instruction. However, we need
+    // only 1 of the ports to call back the CU. Therefore, here we keep
+    // track of the port that sent the first packet of this instruction.
+    RubyPort::MemSlavePort* originalPort;
+    // similar to the originalPort, this gpuDynInstPtr is set only for
+    // the first packet of this instruction.
+    GPUDynInstPtr gpuDynInstPtr;
+};
+
 class GPUCoalescer : public RubyPort
 {
   public:
@@ -137,6 +214,17 @@
     void collateStats();
     void regStats();

+    // each store request needs two callbacks:
+    //  (1) writeCallback is called when the store is received and processed
+    //      by TCP. This writeCallback does not guarantee the store is actually
+    //      completed at its destination cache or memory. writeCallback helps
+    //      release hardware resources (e.g., its entry in coalescedTable)
+    //      allocated for the store so that subsequent requests will not be
+    //      blocked unnecessarily due to hardware resource constraints.
+    //  (2) writeCompleteCallback is called when the store is fully completed
+    //      at its destination cache or memory. writeCompleteCallback
+    //      guarantees that the store is fully completed. This callback
+    //      will decrement hardware counters in the CU.
     void writeCallback(Addr address, DataBlock& data);

     void writeCallback(Addr address,
@@ -158,6 +246,10 @@
                        Cycles forwardRequestTime,
                        Cycles firstResponseTime);

+    void writeCompleteCallback(Addr address,
+                               uint64_t instSeqNum,
+                               MachineType mach);
+
     void readCallback(Addr address, DataBlock& data);

     void readCallback(Addr address,
@@ -178,18 +270,12 @@
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime,
                       bool isRegion);
-    /* atomics need their own callback because the data
-       might be const coming from SLICC */
+
     void atomicCallback(Addr address,
                         MachineType mach,
                         const DataBlock& data);

-    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
-    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
-
-    // Alternate implementations in VIPER Coalescer
-    virtual RequestStatus makeRequest(PacketPtr pkt);
-
+    RequestStatus makeRequest(PacketPtr pkt);
     int outstandingCount() const { return m_outstanding_count; }

     bool
@@ -214,7 +300,6 @@

     void insertKernel(int wavefront_id, PacketPtr pkt);

-    void recordRequestType(SequencerRequestType requestType);
     Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

     Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -248,15 +333,17 @@
     getFirstResponseToCompletionDelayHist(const MachineType t) const
     { return *m_FirstResponseToCompletionDelayHist[t]; }

-  // Changed to protected to enable inheritance by VIPER Coalescer
   protected:
     bool tryCacheAccess(Addr addr, RubyRequestType type,
                         Addr pc, RubyAccessMode access_mode,
                         int size, DataBlock*& data_ptr);
-    // Alternate implementations in VIPER Coalescer
-    virtual void issueRequest(CoalescedRequest* crequest);

-    void kernelCallback(int wavfront_id);
+    // since the two following issue functions are protocol-specific,
+    // they must be implemented in a derived coalescer
+    virtual void issueRequest(CoalescedRequest* crequest) = 0;
+    virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
+
+    void kernelCallback(int wavefront_id);

     void hitCallback(CoalescedRequest* crequest,
                      MachineType mach,
@@ -274,7 +361,6 @@
                            bool success, bool isRegion);
     void completeHitCallback(std::vector<PacketPtr> & mylist);

-
     virtual RubyRequestType getRequestType(PacketPtr pkt);

     // Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -286,8 +372,6 @@

     EventFunctionWrapper issueEvent;

-
-  // Changed to protected to enable inheritance by VIPER Coalescer
   protected:
     int m_max_outstanding_requests;
     int m_deadlock_threshold;
@@ -311,6 +395,11 @@
     // an address, the are serviced in age order.
     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;

+    // a map between an instruction sequence number and its PendingWriteInst;
+    // this is used to issue a final callback for each write when it is
+    // completely done in the memory system
+    std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
+
     // Global outstanding request count, across all request tables
     int m_outstanding_count;
     bool m_deadlock_check_scheduled;
@@ -327,26 +416,28 @@
     EventFunctionWrapper deadlockCheckEvent;
     bool assumingRfOCoherence;

-    // m5 style stats for TCP hit/miss counts
-    Stats::Scalar GPU_TCPLdHits;
-    Stats::Scalar GPU_TCPLdTransfers;
-    Stats::Scalar GPU_TCCLdHits;
-    Stats::Scalar GPU_LdMiss;
-
-    Stats::Scalar GPU_TCPStHits;
-    Stats::Scalar GPU_TCPStTransfers;
-    Stats::Scalar GPU_TCCStHits;
-    Stats::Scalar GPU_StMiss;
-
-    Stats::Scalar CP_TCPLdHits;
-    Stats::Scalar CP_TCPLdTransfers;
-    Stats::Scalar CP_TCCLdHits;
-    Stats::Scalar CP_LdMiss;
-
-    Stats::Scalar CP_TCPStHits;
-    Stats::Scalar CP_TCPStTransfers;
-    Stats::Scalar CP_TCCStHits;
-    Stats::Scalar CP_StMiss;
+// TODO - Need to update the following stats once the VIPER protocol
+//        is re-integrated.
+//    // m5 style stats for TCP hit/miss counts
+//    Stats::Scalar GPU_TCPLdHits;
+//    Stats::Scalar GPU_TCPLdTransfers;
+//    Stats::Scalar GPU_TCCLdHits;
+//    Stats::Scalar GPU_LdMiss;
+//
+//    Stats::Scalar GPU_TCPStHits;
+//    Stats::Scalar GPU_TCPStTransfers;
+//    Stats::Scalar GPU_TCCStHits;
+//    Stats::Scalar GPU_StMiss;
+//
+//    Stats::Scalar CP_TCPLdHits;
+//    Stats::Scalar CP_TCPLdTransfers;
+//    Stats::Scalar CP_TCCLdHits;
+//    Stats::Scalar CP_LdMiss;
+//
+//    Stats::Scalar CP_TCPStHits;
+//    Stats::Scalar CP_TCPStTransfers;
+//    Stats::Scalar CP_TCCStHits;
+//    Stats::Scalar CP_StMiss;

     //! Histogram for number of outstanding requests per cycle.
     Stats::Histogram m_outstandReqHist;
@@ -371,6 +462,21 @@
     std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
     std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

+// TODO - Need to update the following stats once the VIPER protocol
+//        is re-integrated.
+//    Stats::Distribution numHopDelays;
+//    Stats::Distribution tcpToTccDelay;
+//    Stats::Distribution tccToSdDelay;
+//    Stats::Distribution sdToSdDelay;
+//    Stats::Distribution sdToTccDelay;
+//    Stats::Distribution tccToTcpDelay;
+//
+//    Stats::Average avgTcpToTcc;
+//    Stats::Average avgTccToSd;
+//    Stats::Average avgSdToSd;
+//    Stats::Average avgSdToTcc;
+//    Stats::Average avgTccToTcp;
+
 private:
     // Private copy constructor and assignment operator
     GPUCoalescer(const GPUCoalescer& obj);
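
Since issueRequest() and issueMemSyncRequest() are now pure virtual (and RubyGPUCoalescer is marked abstract on the Python side below), every protocol-specific coalescer has to provide both hooks. A hedged sketch of that obligation, using placeholder types rather than the real gem5 ones, and not the real makeRequest() signature:

// derived_coalescer_sketch.cc -- hypothetical shapes only; the real derived
// coalescer is VIPERCoalescer in src/mem/ruby/system/.
#include <iostream>

struct CoalescedRequest {};          // stand-in for the gem5 class
struct Packet {};                    // stand-in for gem5's Packet
using PacketPtr = Packet*;

// Mirrors GPUCoalescer after this change: the two issue hooks are pure
// virtual, so the base class can no longer be instantiated by itself.
class GPUCoalescerSketch
{
  public:
    virtual ~GPUCoalescerSketch() = default;

    // Rough analogue of makeRequest(): memory-sync packets bypass the
    // uncoalesced table and go straight to the protocol; everything else
    // is coalesced first and issued later.
    void makeRequestSketch(PacketPtr pkt, bool isMemSync)
    {
        if (isMemSync) {
            issueMemSyncRequest(pkt);
        } else {
            issueRequest(nullptr /* coalesced request built elsewhere */);
        }
    }

  protected:
    virtual void issueRequest(CoalescedRequest* crequest) = 0;
    virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
};

// A protocol-specific coalescer (e.g. a VIPER-like one) must override both.
class ViperLikeCoalescer : public GPUCoalescerSketch
{
  protected:
    void issueRequest(CoalescedRequest*) override
    { std::cout << "issue coalesced LD/ST/Atomic request\n"; }

    void issueMemSyncRequest(PacketPtr) override
    { std::cout << "issue memory sync (fence) request\n"; }
};

int main()
{
    // GPUCoalescerSketch base;      // would not compile: abstract class
    ViperLikeCoalescer c;
    Packet p;
    c.makeRequestSketch(&p, true);   // fence path
    c.makeRequestSketch(&p, false);  // coalesced LD/ST path
    return 0;
}
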
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
index a114feb..a588b48 100644
--- a/src/mem/ruby/system/GPUCoalescer.py
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -39,6 +39,7 @@

 class RubyGPUCoalescer(RubyPort):
    type = 'RubyGPUCoalescer'
+   abstract = True
    cxx_class = 'GPUCoalescer'
    cxx_header = "mem/ruby/system/GPUCoalescer.hh"

@@ -47,8 +48,6 @@
"max requests (incl. prefetches) outstanding")
    max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
                                 "coalesced in a single cycle")
-   assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
-                           "Ownership coherence");

    icache = Param.RubyCache("")
    dcache = Param.RubyCache("")
diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py
index 85370f6..c9ddd6b 100644
--- a/src/mem/ruby/system/VIPERCoalescer.py
+++ b/src/mem/ruby/system/VIPERCoalescer.py
@@ -42,4 +42,3 @@
     cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
     max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
     max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
-    assume_rfo = False

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28411
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I803b4cbb46eeab8462d9af80dd003940a9968b60
Gerrit-Change-Number: 28411
Gerrit-PatchSet: 1
Gerrit-Owner: Pouya Fotouhi <pfoto...@ucdavis.edu>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
