Pouya Fotouhi has uploaded this change for review. (
https://gem5-review.googlesource.com/c/public/gem5/+/28411 )
Change subject: mem-ruby: Getting rid of HSA segment and scope
......................................................................
mem-ruby: Getting rid of HSA segment and scope
This is the protocol and sequencer part of the larger GCN3 change.
Author: Tony Gutierrez <anthony.gutier...@amd.com>
Change-Id: I803b4cbb46eeab8462d9af80dd003940a9968b60
---
M src/mem/ruby/protocol/GPU_VIPER-TCP.sm
M src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
M src/mem/ruby/protocol/RubySlicc_Exports.sm
M src/mem/ruby/protocol/RubySlicc_Types.sm
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/GPUCoalescer.py
M src/mem/ruby/system/VIPERCoalescer.py
8 files changed, 285 insertions(+), 399 deletions(-)
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 9dffe0f..4047dc6 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -298,9 +298,7 @@
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry,
tbe);
} else {
if (is_valid(cache_entry) ||
L1cache.cacheAvail(in_msg.LineAddress)) {
- if (in_msg.segment == HSASegment:SPILL) {
- trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry,
tbe);
- } else if (WB) {
+ if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress,
cache_entry, tbe);
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index a66939c..6d04c76 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -137,7 +137,6 @@
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA",
desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write
through";
- HSAScope scope, default="HSAScope_SYSTEM",
desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided
with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm
b/src/mem/ruby/protocol/RubySlicc_Exports.sm
index 8e17f98..fe2c83b 100644
--- a/src/mem/ruby/protocol/RubySlicc_Exports.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm
@@ -91,26 +91,6 @@
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
-//HSA scopes
-enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
- UNSPECIFIED, desc="Unspecified scope";
- NOSCOPE, desc="Explictly unscoped";
- WAVEFRONT, desc="Wavefront scope";
- WORKGROUP, desc="Workgroup scope";
- DEVICE, desc="Device scope";
- SYSTEM, desc="System scope";
-}
-
-// HSA segment types
-enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
- GLOBAL, desc="Global segment";
- GROUP, desc="Group segment";
- PRIVATE, desc="Private segment";
- KERNARG, desc="Kernarg segment";
- READONLY, desc="Readonly segment";
- SPILL, desc="Spill segment";
- ARG, desc="Arg segment";
-}
// TesterStatus
enumeration(TesterStatus, desc="...") {
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm
b/src/mem/ruby/protocol/RubySlicc_Types.sm
index fd76289..95a093a 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -169,8 +169,6 @@
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
- HSAScope scope, desc="HSA scope";
- HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}
diff --git a/src/mem/ruby/system/GPUCoalescer.cc
b/src/mem/ruby/system/GPUCoalescer.cc
index 4cea30f..19b50ea 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -63,58 +63,6 @@
using namespace std;
-GPUCoalescer *
-RubyGPUCoalescerParams::create()
-{
- return new GPUCoalescer(this);
-}
-
-HSAScope
-reqScopeToHSAScope(const RequestPtr &req)
-{
- HSAScope accessScope = HSAScope_UNSPECIFIED;
- if (req->isScoped()) {
- if (req->isWavefrontScope()) {
- accessScope = HSAScope_WAVEFRONT;
- } else if (req->isWorkgroupScope()) {
- accessScope = HSAScope_WORKGROUP;
- } else if (req->isDeviceScope()) {
- accessScope = HSAScope_DEVICE;
- } else if (req->isSystemScope()) {
- accessScope = HSAScope_SYSTEM;
- } else {
- fatal("Bad scope type");
- }
- }
- return accessScope;
-}
-
-HSASegment
-reqSegmentToHSASegment(const RequestPtr &req)
-{
- HSASegment accessSegment = HSASegment_GLOBAL;
-
- if (req->isGlobalSegment()) {
- accessSegment = HSASegment_GLOBAL;
- } else if (req->isGroupSegment()) {
- accessSegment = HSASegment_GROUP;
- } else if (req->isPrivateSegment()) {
- accessSegment = HSASegment_PRIVATE;
- } else if (req->isKernargSegment()) {
- accessSegment = HSASegment_KERNARG;
- } else if (req->isReadonlySegment()) {
- accessSegment = HSASegment_READONLY;
- } else if (req->isSpillSegment()) {
- accessSegment = HSASegment_SPILL;
- } else if (req->isArgSegment()) {
- accessSegment = HSASegment_ARG;
- } else {
- fatal("Bad segment type");
- }
-
- return accessSegment;
-}
-
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -154,6 +102,7 @@
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
+ DPRINTF(GPUCoalescer, "Returning token seqNum %d\n",
iter->first);
instMap.erase(iter++);
coalescer->getMemSlavePort(0)->sendTokens(1);
} else {
@@ -162,15 +111,27 @@
}
}
+/**
+ * Check whether the uncoalesced table still tracks the given instruction.
+ *
+ * @param instSeqNum sequence number of the instruction to look up
+ * @return false if instMap still holds an entry keyed by instSeqNum
+ *         (i.e. there are more requests to issue); true otherwise
+ */
+bool
+UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
+    // iterate the instructions held in UncoalescedTable to see whether
+    // there are more requests to issue; if yes, not yet done; otherwise,
+    // done. Each visited entry is traced before it is compared, so the
+    // debug output lists every entry up to and including a match.
+    for (auto& inst : instMap) {
+        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
+                ,inst.first, inst.second.size());
+        if (inst.first == instSeqNum) { return false; }
+    }
+
+    return true;
+}
+
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
- ss << "UncoalescedTable contains " << instMap.size()
- << " address entries." << std::endl;
+ ss << "Listing pending packets from " << instMap.size() << "
instructions";
+
for (auto& inst : instMap) {
- ss << "Addr 0x" << std::hex << inst.first << std::dec
- << " with " << inst.second.size() << " packets"
- << std::endl;
+ ss << "\tAddr: " << printAddress(inst.first) << " with "
+ << inst.second.size() << " pending packets" << std::endl;
}
}
@@ -229,7 +190,6 @@
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
- assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
@@ -245,18 +205,9 @@
if (current_time - req->getIssueTime() > m_deadlock_threshold)
{
std::stringstream ss;
printRequestTable(ss);
- ss << "Outstanding requests: " << m_outstanding_count
- << std::endl;
-
- panic("Possible Deadlock detected. Aborting!\n"
- "version: %d request.paddr: 0x%x coalescedTable: %d "
- "current time: %u issue_time: %d difference: %d\n"
- "Request Tables:\n %s", m_version,
- req->getFirstPkt()->getAddr(),
- coalescedTable.size(), cyclesToTicks(current_time),
- cyclesToTicks(req->getIssueTime()),
- cyclesToTicks(current_time - req->getIssueTime()),
- ss.str());
+ warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
+ m_version, ss.str());
+ panic("Aborting due to deadlock!\n");
}
}
}
@@ -274,21 +225,27 @@
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
- uncoalescedTable.printRequestTable(ss);
+ ss << "Printing out " << coalescedTable.size()
+ << " outstanding requests in the coalesced table\n";
- ss << "CoalescedTable contains " << coalescedTable.size()
- << " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
- ss << "Addr 0x" << std::hex << requestList.first << std::dec
- << ": type-";
for (auto& request : requestList.second) {
- ss << RubyRequestType_to_string(request->getRubyType())
- << " pkts-" << request->getPackets().size()
- << " issued-" << request->getIssueTime() << " seqNum-"
- << request->getSeqNum() << "; ";
+ ss << "\tAddr: " << printAddress(requestList.first) << "\n"
+ << "\tInstruction sequence number: "
+ << request->getSeqNum() << "\n"
+ << "\t\tType: "
+ << RubyRequestType_to_string(request->getRubyType()) << "\n"
+ << "\t\tNumber of associated packets: "
+ << request->getPackets().size() << "\n"
+ << "\t\tIssue time: "
+ << request->getIssueTime() * clockPeriod() << "\n"
+ << "\t\tDifference from current tick: "
+ << (curCycle() - request->getIssueTime()) * clockPeriod();
}
- ss << std::endl;
}
+
+ // print out packets waiting to be issued in uncoalesced table
+ uncoalescedTable.printRequestTable(ss);
}
void
@@ -378,6 +335,7 @@
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
+ // remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
@@ -390,6 +348,36 @@
}
+/**
+ * Handle one write-completion notification for a Ruby request that belongs
+ * to the write instruction identified by instSeqNum.
+ *
+ * The instruction must already be present in pendingWriteInsts. Each call
+ * consumes one pending store of that instruction; once no pending store is
+ * left and the uncoalesced table holds no more packets for the instruction,
+ * the requester is acked and the instruction's entry is erased.
+ *
+ * @param address address reported by the caller (used for tracing only)
+ * @param instSeqNum sequence number of the write instruction
+ * @param mach machine type of the responder (not used in this function)
+ */
void
+GPUCoalescer::writeCompleteCallback(Addr address,
+                                    uint64_t instSeqNum,
+                                    MachineType mach)
+{
+    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
+            " instSeqNum = %d\n", address, instSeqNum);
+
+    assert(pendingWriteInsts.count(instSeqNum) == 1);
+    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
+
+    // check the uncoalescedTable to see whether all requests for the inst
+    // have been issued or not
+    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
+    // The pending-store count is printed minus one because the ack for
+    // this callback is only consumed by receiveWriteCompleteAck() below.
+    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
+            "reqsAllIssued=%d\n", instSeqNum,
+            inst.getNumPendingStores()-1, reqsAllIssued);
+
+    // receiveWriteCompleteAck() must stay the first operand: it decrements
+    // the pending-store counter and has to run on every callback, even
+    // when reqsAllIssued is false.
+    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
+        // if the pending write instruction has received all write
+        // completion callbacks for its issued Ruby requests, we can now
+        // respond to the requesting CU in one response packet.
+        inst.ackWriteCompletion(m_usingRubyTester);
+
+        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
+                instSeqNum);
+        pendingWriteInsts.erase(instSeqNum);
+    }
+}
+
+void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
readCallback(address, MachineType_NULL, data);
@@ -468,7 +456,7 @@
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
- Addr request_line_address = makeLineAddress(request_address);
+ Addr request_line_address M5_VAR_USED =
makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
@@ -507,20 +495,6 @@
"%s\n",
RubyRequestType_to_string(type));
}
-
- // If using the RubyTester, update the RubyTester sender state's
- // subBlock with the recieved data. The tester will later access
- // this state.
- // Note: RubyPort will access it's sender state before the
- // RubyTester.
- if (m_usingRubyTester) {
- RubyPort::SenderState *requestSenderState =
- safe_cast<RubyPort::SenderState*>(pkt->senderState);
- RubyTester::SenderState* testerSenderState =
- safe_cast<RubyTester::SenderState*>
- (requestSenderState->predecessor);
- testerSenderState->subBlock.mergeFrom(data);
- }
}
@@ -557,8 +531,6 @@
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
- // Acquire and release packets will have been issued by
- // makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
@@ -570,71 +542,43 @@
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
- // Check for GPU Barrier Kernel End or Kernel Begin
- // Leave these to be handled by the child class
- // Kernel End/Barrier = isFlush + isRelease
- // Kernel Begin = isFlush + isAcquire
- if (pkt->req->isKernel()) {
- if (pkt->req->isAcquire()){
- // This is a Kernel Begin leave handling to
- // virtual xCoalescer::makeRequest
- return RequestStatus_Issued;
- }else if (pkt->req->isRelease()) {
- // This is a Kernel End leave handling to
- // virtual xCoalescer::makeRequest
- // If we are here then we didn't call
- // a virtual version of this function
- // so we will also schedule the callback
- int wf_id = 0;
- if (pkt->req->hasContextId()) {
- wf_id = pkt->req->contextId();
- }
- insertKernel(wf_id, pkt);
- newKernelEnds.push_back(wf_id);
- if (!issueEvent.scheduled()) {
- schedule(issueEvent, curTick());
- }
- return RequestStatus_Issued;
+ // all packets must have valid instruction sequence numbers
+ assert(pkt->req->hasInstSeqNum());
+
+ if (pkt->cmd == MemCmd::MemSyncReq) {
+ // issue mem_sync requests immediately to the cache system without
+ // going through uncoalescedTable like normal LD/ST/Atomic requests
+ issueMemSyncRequest(pkt);
+ } else {
+ // otherwise, this must be either read or write command
+ assert(pkt->isRead() || pkt->isWrite());
+
+ // the pkt is temporarily stored in the uncoalesced table until
+ // it's picked for coalescing process later in this cycle or in a
+ // future cycle
+ uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to
uncoalescedTable\n",
+ pkt->getAddr());
+
+ // we schedule an issue event here to process the uncoalesced table
+ // and try to issue Ruby request to cache system
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
}
}
- if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
- !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
- (pkt->req->isRelease() || pkt->req->isAcquire())) {
- if (assumingRfOCoherence) {
- // If we reached here, this request must be a memFence
- // and the protocol implements RfO, the coalescer can
- // assume sequentially consistency and schedule the callback
- // immediately.
- // Currently the code implements fence callbacks
- // by reusing the mechanism for kernel completions.
- // This should be fixed.
- int wf_id = 0;
- if (pkt->req->hasContextId()) {
- wf_id = pkt->req->contextId();
- }
- insertKernel(wf_id, pkt);
- newKernelEnds.push_back(wf_id);
- if (!issueEvent.scheduled()) {
- schedule(issueEvent, curTick());
- }
- return RequestStatus_Issued;
- } else {
- // If not RfO, return issued here and let the child coalescer
- // take care of it.
- return RequestStatus_Issued;
- }
- }
-
- uncoalescedTable.insertPacket(pkt);
- DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
-
- if (!issueEvent.scheduled())
- schedule(issueEvent, curTick());
- // TODO: issue hardware prefetches here
+ // we always return RequestStatus_Issued in this coalescer
+ // b/c the coalescer's resource was checked earlier and the coalescer is
+ // queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}
+/**
+ * TODO: Figure out what to do with this code. This code may go away
+ * and/or be merged into the VIPER coalescer once the VIPER
+ * protocol is re-integrated with GCN3 codes.
+ */
+/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -728,7 +672,7 @@
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
-}
+}*/
template <class KEY, class VALUE>
std::ostream &
@@ -758,12 +702,6 @@
{
}
-void
-GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
- DPRINTF(RubyStats, "Recorded statistic: %s\n",
- SequencerRequestType_to_string(requestType));
-}
-
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -817,6 +755,41 @@
// be counted as outstanding requests.
m_outstanding_count++;
+ // We track all issued or to-be-issued Ruby requests associated
with
+ // write instructions. An instruction may have multiple Ruby
+ // requests.
+ if (pkt->cmd == MemCmd::WriteReq) {
+ DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
+ " the pending write instruction list\n", seqNum,
+ line_addr);
+
+ RubyPort::SenderState* ss =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+ // we need to save this port because it will be used to call
+ // back the requesting CU when we receive write
+ // complete callbacks for all issued Ruby requests of this
+ // instruction.
+ RubyPort::MemSlavePort* mem_slave_port = ss->port;
+
+ GPUDynInstPtr gpuDynInst = nullptr;
+
+ if (!m_usingRubyTester) {
+ // If this coalescer is connected to a real CU, we need
+ // to save the corresponding gpu dynamic instruction.
+ // CU will use that instruction to decrement wait counters
+ // in the issuing wavefront.
+ // For Ruby tester, gpuDynInst == nullptr
+ ComputeUnit::DataPort::SenderState* cu_state =
+ safe_cast<ComputeUnit::DataPort::SenderState*>
+ (ss->predecessor);
+ gpuDynInst = cu_state->_gpuDynInst;
+ }
+
+ PendingWriteInst& inst = pendingWriteInsts[seqNum];
+ inst.addPendingReq(mem_slave_port, gpuDynInst,
m_usingRubyTester);
+ }
+
return true;
}
@@ -906,34 +879,6 @@
}
void
-GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID
senderMachID)
-{
- if (myMachID == senderMachID) {
- CP_TCPLdHits++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
- CP_TCPLdTransfers++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
- CP_TCCLdHits++;
- } else {
- CP_LdMiss++;
- }
-}
-
-void
-GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID
senderMachID)
-{
- if (myMachID == senderMachID) {
- CP_TCPStHits++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
- CP_TCPStTransfers++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
- CP_TCCStHits++;
- } else {
- CP_StMiss++;
- }
-}
-
-void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
for (auto& pkt : mylist) {
@@ -968,74 +913,6 @@
Cycles firstResponseTime,
bool success, bool isRegion)
{
- RubyRequestType type = crequest->getRubyType();
- Cycles issued_time = crequest->getIssueTime();
- Cycles completion_time = curCycle();
- assert(completion_time >= issued_time);
- Cycles total_lat = completion_time - issued_time;
-
- // cache stats (valid for RfO protocol only)
- if (mach == MachineType_TCP) {
- if (type == RubyRequestType_LD) {
- GPU_TCPLdHits++;
- } else {
- GPU_TCPStHits++;
- }
- } else if (mach == MachineType_L1Cache_wCC) {
- if (type == RubyRequestType_LD) {
- GPU_TCPLdTransfers++;
- } else {
- GPU_TCPStTransfers++;
- }
- } else if (mach == MachineType_TCC) {
- if (type == RubyRequestType_LD) {
- GPU_TCCLdHits++;
- } else {
- GPU_TCCStHits++;
- }
- } else {
- if (type == RubyRequestType_LD) {
- GPU_LdMiss++;
- } else {
- GPU_StMiss++;
- }
- }
-
- // Profile all access latency, even zero latency accesses
- m_latencyHist.sample(total_lat);
- m_typeLatencyHist[type]->sample(total_lat);
-
- // Profile the miss latency for all non-zero demand misses
- if (total_lat != Cycles(0)) {
- m_missLatencyHist.sample(total_lat);
- m_missTypeLatencyHist[type]->sample(total_lat);
-
- if (mach != MachineType_NUM) {
- m_missMachLatencyHist[mach]->sample(total_lat);
- m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
-
- if ((issued_time <= initialRequestTime) &&
- (initialRequestTime <= forwardRequestTime) &&
- (forwardRequestTime <= firstResponseTime) &&
- (firstResponseTime <= completion_time)) {
-
- m_IssueToInitialDelayHist[mach]->sample(
- initialRequestTime - issued_time);
- m_InitialToForwardDelayHist[mach]->sample(
- forwardRequestTime - initialRequestTime);
- m_ForwardToFirstResponseDelayHist[mach]->sample(
- firstResponseTime - forwardRequestTime);
- m_FirstResponseToCompletionDelayHist[mach]->sample(
- completion_time - firstResponseTime);
- }
- }
-
- }
-
- DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
- curTick(), m_version, "Coal",
- success ? "Done" : "SC_Failed", "", "",
- printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
@@ -1083,74 +960,4 @@
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
-
- // GPU cache stats
- GPU_TCPLdHits
- .name(name() + ".gpu_tcp_ld_hits")
- .desc("loads that hit in the TCP")
- ;
- GPU_TCPLdTransfers
- .name(name() + ".gpu_tcp_ld_transfers")
- .desc("TCP to TCP load transfers")
- ;
- GPU_TCCLdHits
- .name(name() + ".gpu_tcc_ld_hits")
- .desc("loads that hit in the TCC")
- ;
- GPU_LdMiss
- .name(name() + ".gpu_ld_misses")
- .desc("loads that miss in the GPU")
- ;
-
- GPU_TCPStHits
- .name(name() + ".gpu_tcp_st_hits")
- .desc("stores that hit in the TCP")
- ;
- GPU_TCPStTransfers
- .name(name() + ".gpu_tcp_st_transfers")
- .desc("TCP to TCP store transfers")
- ;
- GPU_TCCStHits
- .name(name() + ".gpu_tcc_st_hits")
- .desc("stores that hit in the TCC")
- ;
- GPU_StMiss
- .name(name() + ".gpu_st_misses")
- .desc("stores that miss in the GPU")
- ;
-
- // CP cache stats
- CP_TCPLdHits
- .name(name() + ".cp_tcp_ld_hits")
- .desc("loads that hit in the TCP")
- ;
- CP_TCPLdTransfers
- .name(name() + ".cp_tcp_ld_transfers")
- .desc("TCP to TCP load transfers")
- ;
- CP_TCCLdHits
- .name(name() + ".cp_tcc_ld_hits")
- .desc("loads that hit in the TCC")
- ;
- CP_LdMiss
- .name(name() + ".cp_ld_misses")
- .desc("loads that miss in the GPU")
- ;
-
- CP_TCPStHits
- .name(name() + ".cp_tcp_st_hits")
- .desc("stores that hit in the TCP")
- ;
- CP_TCPStTransfers
- .name(name() + ".cp_tcp_st_transfers")
- .desc("TCP to TCP store transfers")
- ;
- CP_TCCStHits
- .name(name() + ".cp_tcc_st_hits")
- .desc("stores that hit in the TCC")
- ;
- CP_StMiss
- .name(name() + ".cp_st_misses")
- .desc("stores that miss in the GPU")
- ;
}
diff --git a/src/mem/ruby/system/GPUCoalescer.hh
b/src/mem/ruby/system/GPUCoalescer.hh
index 32b3af4..31b6bfa 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -40,11 +40,11 @@
#include <unordered_map>
#include "base/statistics.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
-#include "mem/ruby/protocol/HSAScope.hh"
-#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -58,9 +58,6 @@
class RubyGPUCoalescerParams;
-HSAScope reqScopeToHSAScope(const RequestPtr &req);
-HSASegment reqSegmentToHSASegment(const RequestPtr &req);
-
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
@@ -79,6 +76,7 @@
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
+ bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -121,6 +119,85 @@
std::vector<PacketPtr> pkts;
};
+// PendingWriteInst tracks the number of outstanding Ruby requests
+// per write instruction. Once all requests associated with one instruction
+// are completely done in Ruby, we call back the requester to mark
+// that this instruction is complete.
+class PendingWriteInst
+{
+  public:
+    PendingWriteInst()
+        : numPendingStores(0),
+          originalPort(nullptr),
+          gpuDynInstPtr(nullptr)
+    {}
+
+    ~PendingWriteInst()
+    {}
+
+    // Register one more Ruby request for this write instruction.
+    //   port            - port to respond on later; must be non-null
+    //   inst            - dynamic instruction to hand back to the CU;
+    //                     ignored when usingRubyTester is true
+    //   usingRubyTester - true when the requester is the Ruby tester
+    // Both the port and (outside the tester) the instruction are
+    // overwritten on every call, so the values kept are those of the
+    // most recent call.
+    void
+    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
+                  bool usingRubyTester)
+    {
+        assert(port);
+        originalPort = port;
+
+        if (!usingRubyTester) {
+            gpuDynInstPtr = inst;
+        }
+
+        numPendingStores++;
+    }
+
+    // Consume one write-complete ack.
+    // return true if no more ack is expected
+    bool
+    receiveWriteCompleteAck()
+    {
+        assert(numPendingStores > 0);
+        numPendingStores--;
+        return numPendingStores == 0;
+    }
+
+    // ack the original requester that this write instruction is complete;
+    // may only be called once no store is pending any more
+    void
+    ackWriteCompletion(bool usingRubyTester)
+    {
+        assert(numPendingStores == 0);
+
+        // make a response packet
+        // NOTE(review): if RequestPtr is a std::shared_ptr in this tree,
+        // the raw `new Request()` would need to become
+        // std::make_shared<Request>() - confirm against mem/request.hh.
+        PacketPtr pkt = new Packet(new Request(), MemCmd::MessageResp);
+
+        if (!usingRubyTester) {
+            // a real CU expects the dynamic instruction in the sender state
+            assert(gpuDynInstPtr);
+            ComputeUnit::DataPort::SenderState* ss =
+                new ComputeUnit::DataPort::SenderState
+                    (gpuDynInstPtr, 0, nullptr);
+            pkt->senderState = ss;
+        }
+
+        // send the ack response to the requester
+        originalPort->sendTimingResp(pkt);
+    }
+
+    // number of stores still waiting for writeCompleteCallback
+    int
+    getNumPendingStores() const {
+        return numPendingStores;
+    }
+
+  private:
+    // the number of stores waiting for writeCompleteCallback
+    int numPendingStores;
+    // A port that sent one of the packets associated with this write
+    // instruction. We may have more than one packet per instruction,
+    // which implies multiple ports per instruction. However, we need
+    // only 1 of the ports to call back the CU, so a single port is kept.
+    // NOTE(review): addPendingReq() overwrites this on every call, so it
+    // holds the port of the most recently added packet, not the first.
+    RubyPort::MemSlavePort* originalPort;
+    // similar to the originalPort, this gpuDynInstPtr is overwritten by
+    // every addPendingReq() call made with usingRubyTester == false.
+    GPUDynInstPtr gpuDynInstPtr;
+};
+
class GPUCoalescer : public RubyPort
{
public:
@@ -137,6 +214,17 @@
void collateStats();
void regStats();
+ // each store request needs two callbacks:
+ // (1) writeCallback is called when the store is received and
+ // processed by TCP. This writeCallback does not guarantee the store
+ // is actually completed at its destination cache or memory.
+ // writeCallback helps release hardware resources (e.g., its entry
+ // in coalescedTable) allocated for the store so that subsequent
+ // requests will not be blocked unnecessarily due to hardware
+ // resource constraints.
+ // (2) writeCompleteCallback is called when the store is fully
+ // completed at its destination cache or memory.
+ // writeCompleteCallback guarantees that the store is fully
+ // completed. This callback will decrement hardware counters in CU
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
@@ -158,6 +246,10 @@
Cycles forwardRequestTime,
Cycles firstResponseTime);
+ void writeCompleteCallback(Addr address,
+ uint64_t instSeqNum,
+ MachineType mach);
+
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
@@ -178,18 +270,12 @@
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
- /* atomics need their own callback because the data
- might be const coming from SLICC */
+
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
- void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
- void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
-
- // Alternate implementations in VIPER Coalescer
- virtual RequestStatus makeRequest(PacketPtr pkt);
-
+ RequestStatus makeRequest(PacketPtr pkt);
int outstandingCount() const { return m_outstanding_count; }
bool
@@ -214,7 +300,6 @@
void insertKernel(int wavefront_id, PacketPtr pkt);
- void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -248,15 +333,17 @@
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
- // Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
- // Alternate implementations in VIPER Coalescer
- virtual void issueRequest(CoalescedRequest* crequest);
- void kernelCallback(int wavfront_id);
+ // since the two following issue functions are protocol-specific,
+ // they must be implemented in a derived coalescer
+ virtual void issueRequest(CoalescedRequest* crequest) = 0;
+ virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
+
+ void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
@@ -274,7 +361,6 @@
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
-
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -286,8 +372,6 @@
EventFunctionWrapper issueEvent;
-
- // Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
int m_deadlock_threshold;
@@ -311,6 +395,11 @@
// an address, the are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+ // a map between an instruction sequence number and PendingWriteInst;
+ // this is used to do a final call back for each write when it is
+ // completely done in the memory system
+ std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
+
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -327,26 +416,28 @@
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
- // m5 style stats for TCP hit/miss counts
- Stats::Scalar GPU_TCPLdHits;
- Stats::Scalar GPU_TCPLdTransfers;
- Stats::Scalar GPU_TCCLdHits;
- Stats::Scalar GPU_LdMiss;
-
- Stats::Scalar GPU_TCPStHits;
- Stats::Scalar GPU_TCPStTransfers;
- Stats::Scalar GPU_TCCStHits;
- Stats::Scalar GPU_StMiss;
-
- Stats::Scalar CP_TCPLdHits;
- Stats::Scalar CP_TCPLdTransfers;
- Stats::Scalar CP_TCCLdHits;
- Stats::Scalar CP_LdMiss;
-
- Stats::Scalar CP_TCPStHits;
- Stats::Scalar CP_TCPStTransfers;
- Stats::Scalar CP_TCCStHits;
- Stats::Scalar CP_StMiss;
+// TODO - Need to update the following stats once the VIPER protocol
+// is re-integrated.
+// // m5 style stats for TCP hit/miss counts
+// Stats::Scalar GPU_TCPLdHits;
+// Stats::Scalar GPU_TCPLdTransfers;
+// Stats::Scalar GPU_TCCLdHits;
+// Stats::Scalar GPU_LdMiss;
+//
+// Stats::Scalar GPU_TCPStHits;
+// Stats::Scalar GPU_TCPStTransfers;
+// Stats::Scalar GPU_TCCStHits;
+// Stats::Scalar GPU_StMiss;
+//
+// Stats::Scalar CP_TCPLdHits;
+// Stats::Scalar CP_TCPLdTransfers;
+// Stats::Scalar CP_TCCLdHits;
+// Stats::Scalar CP_LdMiss;
+//
+// Stats::Scalar CP_TCPStHits;
+// Stats::Scalar CP_TCPStTransfers;
+// Stats::Scalar CP_TCCStHits;
+// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -371,6 +462,21 @@
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+// TODO - Need to update the following stats once the VIPER protocol
+// is re-integrated.
+// Stats::Distribution numHopDelays;
+// Stats::Distribution tcpToTccDelay;
+// Stats::Distribution tccToSdDelay;
+// Stats::Distribution sdToSdDelay;
+// Stats::Distribution sdToTccDelay;
+// Stats::Distribution tccToTcpDelay;
+//
+// Stats::Average avgTcpToTcc;
+// Stats::Average avgTccToSd;
+// Stats::Average avgSdToSd;
+// Stats::Average avgSdToTcc;
+// Stats::Average avgTccToTcp;
+
private:
// Private copy constructor and assignment operator
GPUCoalescer(const GPUCoalescer& obj);
diff --git a/src/mem/ruby/system/GPUCoalescer.py
b/src/mem/ruby/system/GPUCoalescer.py
index a114feb..a588b48 100644
--- a/src/mem/ruby/system/GPUCoalescer.py
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -39,6 +39,7 @@
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
+ abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
@@ -47,8 +48,6 @@
"max requests (incl. prefetches)
outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
- assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
- "Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")
diff --git a/src/mem/ruby/system/VIPERCoalescer.py
b/src/mem/ruby/system/VIPERCoalescer.py
index 85370f6..c9ddd6b 100644
--- a/src/mem/ruby/system/VIPERCoalescer.py
+++ b/src/mem/ruby/system/VIPERCoalescer.py
@@ -42,4 +42,3 @@
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
- assume_rfo = False
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28411
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I803b4cbb46eeab8462d9af80dd003940a9968b60
Gerrit-Change-Number: 28411
Gerrit-PatchSet: 1
Gerrit-Owner: Pouya Fotouhi <pfoto...@ucdavis.edu>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s