Matthew Poremba has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/27429 )

Change subject: gpu-compute,mem-ruby: Refactor GPU coalescer
......................................................................

gpu-compute,mem-ruby: Refactor GPU coalescer

Remove the read/write tables and coalescing table and introduce two
levels of tables for uncoalesced and coalesced packets. Tokens are
granted to GPU instructions to place packets in the uncoalesced
table. If tokens are available, the operation always succeeds, so the
'Aliased' status is never returned. Coalesced accesses are placed in
the coalesced table while their requests are outstanding. Requests to
the same address are added as targets to the existing table entry,
similar to how MSHRs operate.
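
For illustration, here is a minimal stand-alone sketch of the intended
flow: acquire one token per instruction, coalesce that instruction's
packets per cache line in MSHR fashion, and return the token once all
of its packets have been coalesced. The names used below (Packet,
SimpleTokenPool, the std::map-based tables) are invented for this
sketch and are not the gem5 classes added by this change.

    #include <cstdint>
    #include <deque>
    #include <iostream>
    #include <map>
    #include <vector>

    // One lane's memory access: the issuing instruction and its cache line.
    struct Packet { uint64_t seqNum; uint64_t lineAddr; };

    // Tokens bound how many instructions the CU may hand to the coalescer,
    // so the coalescer never has to report an 'Aliased'/retry status.
    class SimpleTokenPool
    {
      public:
        explicit SimpleTokenPool(int n) : avail(n) {}
        bool haveTokens(int n) const { return avail >= n; }
        void acquire(int n) { avail -= n; }
        void release(int n) { avail += n; }
      private:
        int avail;
    };

    int main()
    {
        SimpleTokenPool tokens(4);

        // Level 1: uncoalesced table, one packet list per instruction seqNum.
        std::map<uint64_t, std::deque<Packet>> uncoalesced;

        // Level 2: coalesced table, a queue of requests per line address;
        // each request holds its coalesced packets ("targets"), MSHR-style.
        std::map<uint64_t, std::deque<std::vector<Packet>>> coalesced;

        // The CU only issues an instruction when a token is available.
        if (tokens.haveTokens(1)) {
            tokens.acquire(1);
            uncoalesced[1].push_back({1, 0x1000});  // lane 0
            uncoalesced[1].push_back({1, 0x1000});  // lane 1, same cache line
        }

        // Drain packets into the coalesced table (the real coalescer bounds
        // this per cycle). Packets from the same instruction to the same
        // line share one request; others queue behind as new requests.
        for (auto &inst : uncoalesced) {
            while (!inst.second.empty()) {
                Packet p = inst.second.front();
                inst.second.pop_front();
                auto &queue = coalesced[p.lineAddr];
                if (!queue.empty() && queue.back().front().seqNum == p.seqNum)
                    queue.back().push_back(p);      // coalesce as a target
                else
                    queue.push_back({p});           // new outstanding request
            }
        }

        // Return one token for each instruction whose packets are all
        // coalesced, mirroring UncoalescedTable::updateResources().
        for (auto it = uncoalesced.begin(); it != uncoalesced.end(); ) {
            if (it->second.empty()) {
                it = uncoalesced.erase(it);
                tokens.release(1);
            } else {
                ++it;
            }
        }

        std::cout << "outstanding lines: " << coalesced.size()
                  << " targets at 0x1000: " << coalesced[0x1000].front().size()
                  << " token returned: " << tokens.haveTokens(4) << "\n";
        return 0;
    }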

Change-Id: I44983610307b638a97472db3576d0a30df2de600
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27429
Reviewed-by: Bradford Beckmann <brad.beckm...@amd.com>
Reviewed-by: Jason Lowe-Power <power...@gmail.com>
Maintainer: Bradford Beckmann <brad.beckm...@amd.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/gpu-compute/GPU.py
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/global_memory_pipeline.hh
M src/gpu-compute/wavefront.cc
M src/mem/ruby/system/GPUCoalescer.cc
M src/mem/ruby/system/GPUCoalescer.hh
M src/mem/ruby/system/GPUCoalescer.py
M src/mem/ruby/system/VIPERCoalescer.cc
10 files changed, 572 insertions(+), 643 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, but someone else must approve
  Bradford Beckmann: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index fee0254..7eaf65f 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -129,6 +129,8 @@
                                       "memory pipeline's queues")
     local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                       "memory pipeline's queues")
+    max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
+                              " of instructions that can be sent to coalescer")
     ldsBus = Bridge() # the bridge between the CU and its LDS
     ldsPort = MasterPort("The port that goes to the LDS")
     localDataStore = Param.LdsState("the LDS for this CU")
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 59bc6a0..cd880d6 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -74,9 +74,9 @@
     req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
     resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
     _masterId(p->system->getMasterId(this, "ComputeUnit")),
-    lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
-    globalSeqNum(0), wavefrontSize(p->wfSize),
-    kernelLaunchInst(new KernelLaunchStaticInst())
+    lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+    _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
+    wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
 {
     /**
      * This check is necessary because std::bitset only provides conversion
@@ -139,6 +139,10 @@

     memPort.resize(wfSize());

+    // Setup tokens for slave ports. The number of tokens in memSlaveTokens
+    // is the total token count for the entire vector port (i.e., this CU).
+    memPortTokens = new TokenManager(p->max_cu_tokens);
+
     // resize the tlbPort vectorArray
     int tlbPort_width = perLaneTLB ? wfSize() : 1;
     tlbPort.resize(tlbPort_width);
@@ -612,6 +616,8 @@
     vectorAluInstAvail.resize(numSIMDs, false);
     shrMemInstAvail = 0;
     glbMemInstAvail = 0;
+
+    gmTokenPort.setTokenManager(memPortTokens);
 }

 bool
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a023cb2..49713e9 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -51,6 +51,7 @@
 #include "gpu-compute/schedule_stage.hh"
 #include "gpu-compute/scoreboard_check_stage.hh"
 #include "mem/port.hh"
+#include "mem/token_port.hh"
 #include "sim/clocked_object.hh"

 static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
@@ -415,6 +416,26 @@

     CUExitCallback *cuExitCallback;

+    class GMTokenPort : public TokenMasterPort
+    {
+      public:
+        GMTokenPort(const std::string& name, SimObject *owner,
+                    PortID id = InvalidPortID)
+            : TokenMasterPort(name, owner, id)
+        { }
+        ~GMTokenPort() { }
+
+      protected:
+        bool recvTimingResp(PacketPtr) { return false; }
+        void recvReqRetry() { }
+    };
+
+    // Manager for the number of tokens available to this compute unit to
+    // send global memory request packets to the coalescer. This is only
+    // used between the global memory pipe and the TCP coalescer.
+    TokenManager *memPortTokens;
+    GMTokenPort gmTokenPort;
+
     /** Data access Port **/
     class DataPort : public MasterPort
     {
@@ -677,6 +698,12 @@
         return ldsPort;
     }

+    TokenManager *
+    getTokenManager()
+    {
+        return memPortTokens;
+    }
+
     /** The memory port for SIMD data accesses.
      *  Can be connected to PhysMem for Ruby for timing simulations
      */
@@ -712,6 +739,8 @@
             }
             ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
             return *ldsPort;
+        } else if (if_name == "gmTokenPort") {
+            return gmTokenPort;
         } else {
             panic("incorrect port name");
         }
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index d8e6d47..64778f0 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -33,6 +33,7 @@

 #include "gpu-compute/global_memory_pipeline.hh"

+#include "debug/GPUCoalescer.hh"
 #include "debug/GPUMem.hh"
 #include "debug/GPUReg.hh"
 #include "gpu-compute/compute_unit.hh"
@@ -56,6 +57,25 @@
     _name = computeUnit->name() + ".GlobalMemPipeline";
 }

+bool
+GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
+{
+    // We require one token from the coalescer's uncoalesced table to
+    // proceed
+    int token_count = 1;
+
+    // Make sure the vector port has tokens. There is a single pool
+    // of tokens so only one port in the vector port needs to be checked.
+    // Lane 0 is chosen arbitrarily.
+    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
+    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
+        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
+        return false;
+    }
+
+    return true;
+}
+
 void
 GlobalMemPipeline::exec()
 {
@@ -124,6 +144,14 @@
             }
         }

+        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
+                mp->disassemble(), mp->seqNum());
+        // Memfences will not return tokens and must be issued, so we should
+        // not request one; otherwise the token count depletes until deadlock.
+        if (!mp->isMemFence()) {
+            assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
+            mp->computeUnit()->getTokenManager()->acquireTokens(1);
+        }
         mp->initiateAcc(mp);

         if (!outOfOrderDataDelivery && !mp->isMemFence()) {
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index 0bc8596..2f83185 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -121,6 +121,8 @@
         loadVrfBankConflictCycles += num_cycles;
     }

+    bool coalescerReady(GPUDynInstPtr mp) const;
+
   private:
     ComputeUnit *computeUnit;
     std::string _name;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index e70a874..46cce9c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -434,6 +434,11 @@
             return 0;
         }

+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
@@ -504,6 +509,12 @@
         if (!locMemIssueRdy) {
             return 0;
         }
+
+        // Does the coalescer have space for our instruction?
+        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+            return 0;
+        }
+
         if (!computeUnit->globalMemoryPipe.
             isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
             // Can we insert a new request to the Global Mem Request FIFO?
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index a7b658e..0153b4c 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -113,11 +113,95 @@
     return accessSegment;
 }

+UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
+    : coalescer(gc)
+{
+}
+
+void
+UncoalescedTable::insertPacket(PacketPtr pkt)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+
+    instMap[seqNum].push_back(pkt);
+ DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n", + pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
+}
+
+bool
+UncoalescedTable::packetAvailable()
+{
+    return !instMap.empty();
+}
+
+PerInstPackets*
+UncoalescedTable::getInstPackets(int offset)
+{
+    if (offset >= instMap.size()) {
+        return nullptr;
+    }
+
+    auto instMapIter = instMap.begin();
+    std::advance(instMapIter, offset);
+
+    return &(instMapIter->second);
+}
+
+void
+UncoalescedTable::updateResources()
+{
+    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
+        if (iter->second.empty()) {
+            instMap.erase(iter++);
+            coalescer->getGMTokenPort().sendTokens(1);
+        } else {
+            ++iter;
+        }
+    }
+}
+
+void
+UncoalescedTable::printRequestTable(std::stringstream& ss)
+{
+    ss << "UncoalescedTable contains " << instMap.size()
+       << " address entries." << std::endl;
+    for (auto& inst : instMap) {
+        ss << "Addr 0x" << std::hex << inst.first << std::dec
+           << " with " << inst.second.size() << " packets"
+           << std::endl;
+    }
+}
+
+void
+UncoalescedTable::checkDeadlock(Tick threshold)
+{
+    Tick current_time = curTick();
+
+    for (auto &it : instMap) {
+        for (auto &pkt : it.second) {
+            if (current_time - pkt->req->time() > threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+
+                panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x uncoalescedTable: %d "
+                     "current time: %u issue_time: %d difference: %d\n"
+                     "Request Tables:\n\n%s", coalescer->getId(),
+                      pkt->getAddr(), instMap.size(), current_time,
+                      pkt->req->time(), current_time - pkt->req->time(),
+                      ss.str());
+            }
+        }
+    }
+}
+
 GPUCoalescer::GPUCoalescer(const Params *p)
     : RubyPort(p),
       issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                  false, Event::Progress_Event_Pri),
-      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
+      uncoalescedTable(this),
+      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
+      gmTokenPort(name() + ".gmTokenPort", this)
 {
     m_store_waiting_on_load_cycles = 0;
     m_store_waiting_on_store_cycles = 0;
@@ -126,8 +210,9 @@

     m_outstanding_count = 0;

+    coalescingWindow = p->max_coalesces_per_cycle;
+
     m_max_outstanding_requests = 0;
-    m_deadlock_threshold = 0;
     m_instCache_ptr = nullptr;
     m_dataCache_ptr = nullptr;

@@ -149,52 +234,46 @@
 {
 }

+Port &
+GPUCoalescer::getPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "gmTokenPort") {
+        return gmTokenPort;
+    }
+
+    // delegate to RubyPort otherwise
+    return RubyPort::getPort(if_name, idx);
+}
+
 void
 GPUCoalescer::wakeup()
 {
-    // Check for deadlock of any of the requests
     Cycles current_time = curCycle();
+    for (auto& requestList : coalescedTable) {
+        for (auto& req : requestList.second) {
+            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
+                std::stringstream ss;
+                printRequestTable(ss);
+                ss << "Outstanding requests: " << m_outstanding_count
+                   << std::endl;

-    // Check across all outstanding requests
-    int total_outstanding = 0;
-
-    RequestTable::iterator read = m_readRequestTable.begin();
-    RequestTable::iterator read_end = m_readRequestTable.end();
-    for (; read != read_end; ++read) {
-        GPUCoalescerRequest* request = read->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
-             "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_readRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time)*clockPeriod());
+                panic("Possible Deadlock detected. Aborting!\n"
+                     "version: %d request.paddr: 0x%x coalescedTable: %d "
+                     "current time: %u issue_time: %d difference: %d\n"
+                     "Request Tables:\n %s", m_version,
+                      req->getFirstPkt()->getAddr(),
+                      coalescedTable.size(), cyclesToTicks(current_time),
+                      cyclesToTicks(req->getIssueTime()),
+                      cyclesToTicks(current_time - req->getIssueTime()),
+                      ss.str());
+            }
+        }
     }

-    RequestTable::iterator write = m_writeRequestTable.begin();
-    RequestTable::iterator write_end = m_writeRequestTable.end();
-    for (; write != write_end; ++write) {
-        GPUCoalescerRequest* request = write->second;
-        if (current_time - request->issue_time < m_deadlock_threshold)
-            continue;
-
-        panic("Possible Deadlock detected. Aborting!\n"
-             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
-             "current time: %u issue_time: %d difference: %d\n", m_version,
-              request->pkt->getAddr(), m_writeRequestTable.size(),
-              current_time * clockPeriod(), request->issue_time * clockPeriod(),
-              (current_time - request->issue_time) * clockPeriod());
-    }
-
-    total_outstanding += m_writeRequestTable.size();
-    total_outstanding += m_readRequestTable.size();
-
-    assert(m_outstanding_count == total_outstanding);
+    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
+    uncoalescedTable.checkDeadlock(tick_threshold);

     if (m_outstanding_count > 0) {
-        // If there are still outstanding requests, keep checking
         schedule(deadlockCheckEvent,
                  m_deadlock_threshold * clockPeriod() +
                  curTick());
@@ -202,6 +281,26 @@
 }

 void
+GPUCoalescer::printRequestTable(std::stringstream& ss)
+{
+    uncoalescedTable.printRequestTable(ss);
+
+    ss << "CoalescedTable contains " << coalescedTable.size()
+       << " address entries." << std::endl;
+    for (auto& requestList : coalescedTable) {
+        ss << "Addr 0x" << std::hex << requestList.first << std::dec
+           << ": type-";
+        for (auto& request : requestList.second) {
+            ss << RubyRequestType_to_string(request->getRubyType())
+               << " pkts-" << request->getPackets().size()
+               << " issued-" << request->getIssueTime() << " seqNum-"
+               << request->getSeqNum() << "; ";
+        }
+        ss << std::endl;
+    }
+}
+
+void
 GPUCoalescer::resetStats()
 {
     m_latencyHist.reset();
@@ -229,65 +328,6 @@
 {
 }

-RequestStatus
-GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
-{
-    Addr line_addr = makeLineAddress(pkt->getAddr());
-
-    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
-        return RequestStatus_BufferFull;
-    }
-
-    if (m_controller->isBlocked(line_addr) &&
-       request_type != RubyRequestType_Locked_RMW_Write) {
-        return RequestStatus_Aliased;
-    }
-
-    if ((request_type == RubyRequestType_ST) ||
-        (request_type == RubyRequestType_ATOMIC) ||
-        (request_type == RubyRequestType_ATOMIC_RETURN) ||
-        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-        (request_type == RubyRequestType_RMW_Read) ||
-        (request_type == RubyRequestType_RMW_Write) ||
-        (request_type == RubyRequestType_Load_Linked) ||
-        (request_type == RubyRequestType_Store_Conditional) ||
-        (request_type == RubyRequestType_Locked_RMW_Read) ||
-        (request_type == RubyRequestType_Locked_RMW_Write) ||
-        (request_type == RubyRequestType_FLUSH)) {
-
-        // Check if there is any outstanding read request for the same
-        // cache line.
-        if (m_readRequestTable.count(line_addr) > 0) {
-            m_store_waiting_on_load_cycles++;
-            return RequestStatus_Aliased;
-        }
-
-        if (m_writeRequestTable.count(line_addr) > 0) {
-          // There is an outstanding write request for the cache line
-          m_store_waiting_on_store_cycles++;
-          return RequestStatus_Aliased;
-        }
-    } else {
-        // Check if there is any outstanding write request for the same
-        // cache line.
-        if (m_writeRequestTable.count(line_addr) > 0) {
-            m_load_waiting_on_store_cycles++;
-            return RequestStatus_Aliased;
-        }
-
-        if (m_readRequestTable.count(line_addr) > 0) {
-            // There is an outstanding read request for the cache line
-            m_load_waiting_on_load_cycles++;
-            return RequestStatus_Aliased;
-        }
-    }
-
-    return RequestStatus_Ready;
-
-}
-
-
-
 // sets the kernelEndList
 void
 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
@@ -303,153 +343,6 @@
             kernelEndList.size());
 }

-
-// Insert the request on the correct request table.  Return true if
-// the entry was already present.
-bool
-GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
-{
-    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
-           pkt->req->isLockedRMW() ||
-           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
-
-    int total_outstanding M5_VAR_USED =
-        m_writeRequestTable.size() + m_readRequestTable.size();
-
-    assert(m_outstanding_count == total_outstanding);
-
-    // See if we should schedule a deadlock check
-    if (!deadlockCheckEvent.scheduled()) {
-        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
-    }
-
-    Addr line_addr = makeLineAddress(pkt->getAddr());
-    if ((request_type == RubyRequestType_ST) ||
-        (request_type == RubyRequestType_ATOMIC) ||
-        (request_type == RubyRequestType_ATOMIC_RETURN) ||
-        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-        (request_type == RubyRequestType_RMW_Read) ||
-        (request_type == RubyRequestType_RMW_Write) ||
-        (request_type == RubyRequestType_Load_Linked) ||
-        (request_type == RubyRequestType_Store_Conditional) ||
-        (request_type == RubyRequestType_Locked_RMW_Read) ||
-        (request_type == RubyRequestType_Locked_RMW_Write) ||
-        (request_type == RubyRequestType_FLUSH)) {
-
-        pair<RequestTable::iterator, bool> r =
-          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
-                                       (GPUCoalescerRequest*) NULL));
-        if (r.second) {
-            RequestTable::iterator i = r.first;
-            i->second = new GPUCoalescerRequest(pkt, request_type,
-                                                curCycle());
-            DPRINTF(GPUCoalescer,
-                    "Inserting write request for paddr %#x for type %d\n",
-                    pkt->req->getPaddr(), i->second->m_type);
-            m_outstanding_count++;
-        } else {
-            return true;
-        }
-    } else {
-        pair<RequestTable::iterator, bool> r =
-            m_readRequestTable.insert(RequestTable::value_type(line_addr,
-                                        (GPUCoalescerRequest*) NULL));
-
-        if (r.second) {
-            RequestTable::iterator i = r.first;
-            i->second = new GPUCoalescerRequest(pkt, request_type,
-                                             curCycle());
-            DPRINTF(GPUCoalescer,
-                    "Inserting read request for paddr %#x for type %d\n",
-                    pkt->req->getPaddr(), i->second->m_type);
-            m_outstanding_count++;
-        } else {
-            return true;
-        }
-    }
-
-    m_outstandReqHist.sample(m_outstanding_count);
-
- total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
-    assert(m_outstanding_count == total_outstanding);
-
-    return false;
-}
-
-void
-GPUCoalescer::markRemoved()
-{
-    m_outstanding_count--;
-    assert(m_outstanding_count ==
-           m_writeRequestTable.size() + m_readRequestTable.size());
-}
-
-void
-GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
-{
-    assert(m_outstanding_count ==
-           m_writeRequestTable.size() + m_readRequestTable.size());
-
-    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
-    if ((srequest->m_type == RubyRequestType_ST) ||
-        (srequest->m_type == RubyRequestType_RMW_Read) ||
-        (srequest->m_type == RubyRequestType_RMW_Write) ||
-        (srequest->m_type == RubyRequestType_Load_Linked) ||
-        (srequest->m_type == RubyRequestType_Store_Conditional) ||
-        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
-        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
-        m_writeRequestTable.erase(line_addr);
-    } else {
-        m_readRequestTable.erase(line_addr);
-    }
-
-    markRemoved();
-}
-
-bool
-GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
-{
-    //
-    // The success flag indicates whether the LLSC operation was successful.
-    // LL ops will always succeed, but SC may fail if the cache line is no
-    // longer locked.
-    //
-    bool success = true;
-    if (request->m_type == RubyRequestType_Store_Conditional) {
-        if (!m_dataCache_ptr->isLocked(address, m_version)) {
-            //
-            // For failed SC requests, indicate the failure to the cpu by
-            // setting the extra data to zero.
-            //
-            request->pkt->req->setExtraData(0);
-            success = false;
-        } else {
-            //
-            // For successful SC requests, indicate the success to the cpu by
-            // setting the extra data to one.
-            //
-            request->pkt->req->setExtraData(1);
-        }
-        //
-        // Independent of success, all SC operations must clear the lock
-        //
-        m_dataCache_ptr->clearLocked(address);
-    } else if (request->m_type == RubyRequestType_Load_Linked) {
-        //
-        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
-        // previously locked cache lines?
-        //
-        m_dataCache_ptr->setLocked(address, m_version);
-    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
-               (m_dataCache_ptr->isLocked(address, m_version))) {
-        //
-        // Normal writes should clear the locked address
-        //
-        m_dataCache_ptr->clearLocked(address);
-    }
-    return success;
-}
-
 void
 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
 {
@@ -487,49 +380,22 @@
                          bool isRegion)
 {
     assert(address == makeLineAddress(address));
+    assert(coalescedTable.count(address));

-    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
-    assert(m_writeRequestTable.count(makeLineAddress(address)));
+    auto crequest = coalescedTable.at(address).front();

-    RequestTable::iterator i = m_writeRequestTable.find(address);
-    assert(i != m_writeRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
+    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
+                forwardRequestTime, firstResponseTime, isRegion);

-    m_writeRequestTable.erase(i);
-    markRemoved();
+    delete crequest;
+    coalescedTable.at(address).pop_front();

-    assert((request->m_type == RubyRequestType_ST) ||
-           (request->m_type == RubyRequestType_ATOMIC) ||
-           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
-           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
-           (request->m_type == RubyRequestType_RMW_Read) ||
-           (request->m_type == RubyRequestType_RMW_Write) ||
-           (request->m_type == RubyRequestType_Load_Linked) ||
-           (request->m_type == RubyRequestType_Store_Conditional) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
-           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
-           (request->m_type == RubyRequestType_FLUSH));
-
-
-    //
-    // For Alpha, properly handle LL, SC, and write requests with respect to
-    // locked cache blocks.
-    //
-    // Not valid for Garnet_standalone protocl
-    //
-    bool success = true;
-    if (!m_runningGarnetStandalone)
-        success = handleLlsc(address, request);
-
-    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
-        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
-    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
-        m_controller->unblock(address);
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
     }
-
-    hitCallback(request, mach, data, success,
-                request->issue_time, forwardRequestTime, firstResponseTime,
-                isRegion);
 }

 void
@@ -570,26 +436,37 @@
                         bool isRegion)
 {
     assert(address == makeLineAddress(address));
-    assert(m_readRequestTable.count(makeLineAddress(address)));
+    assert(coalescedTable.count(address));

-    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
-    RequestTable::iterator i = m_readRequestTable.find(address);
-    assert(i != m_readRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
+    auto crequest = coalescedTable.at(address).front();
+    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
+             "readCallback received non-read type response\n");

-    m_readRequestTable.erase(i);
-    markRemoved();
+    // Iterate over the coalesced requests to respond to as many loads as
+    // possible until another request type is seen. Models MSHR for TCP.
+    while (crequest->getRubyType() == RubyRequestType_LD) {
+        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
+                    forwardRequestTime, firstResponseTime, isRegion);

-    assert((request->m_type == RubyRequestType_LD) ||
-           (request->m_type == RubyRequestType_IFETCH));
+        delete crequest;
+        coalescedTable.at(address).pop_front();
+        if (coalescedTable.at(address).empty()) {
+            break;
+        }

-    hitCallback(request, mach, data, true,
-                request->issue_time, forwardRequestTime, firstResponseTime,
-                isRegion);
+        crequest = coalescedTable.at(address).front();
+    }
+
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
+    }
 }

 void
-GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                        MachineType mach,
                        DataBlock& data,
                        bool success,
@@ -598,22 +475,15 @@
                        Cycles firstResponseTime,
                        bool isRegion)
 {
-    PacketPtr pkt = srequest->pkt;
+    PacketPtr pkt = crequest->getFirstPkt();
     Addr request_address = pkt->getAddr();
     Addr request_line_address = makeLineAddress(request_address);

-    RubyRequestType type = srequest->m_type;
+    RubyRequestType type = crequest->getRubyType();

-    // Set this cache entry to the most recently used
-    if (type == RubyRequestType_IFETCH) {
-        if (m_instCache_ptr->isTagPresent(request_line_address))
-            m_instCache_ptr->setMRU(request_line_address);
-    } else {
-        if (m_dataCache_ptr->isTagPresent(request_line_address))
-            m_dataCache_ptr->setMRU(request_line_address);
-    }
+ DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

-    recordMissLatency(srequest, mach,
+    recordMissLatency(crequest, mach,
                       initialRequestTime,
                       forwardRequestTime,
                       firstResponseTime,
@@ -621,13 +491,11 @@
     // update the data
     //
     // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
-    int len = reqCoalescer[request_line_address].size();
-    std::vector<PacketPtr> mylist;
-    for (int i = 0; i < len; ++i) {
-        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
-        assert(type == reqCoalescer[request_line_address][i].primaryType);
+    std::vector<PacketPtr> pktList = crequest->getPackets();
+    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
+            pktList.size(), request_line_address);
+    for (auto& pkt : pktList) {
         request_address = pkt->getAddr();
-        request_line_address = makeLineAddress(pkt->getAddr());
         if (pkt->getPtr<uint8_t>()) {
             if ((type == RubyRequestType_LD) ||
                 (type == RubyRequestType_ATOMIC) ||
@@ -658,36 +526,56 @@
             RubyPort::SenderState *requestSenderState =
                 safe_cast<RubyPort::SenderState*>(pkt->senderState);
             RubyTester::SenderState* testerSenderState =
-                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+                safe_cast<RubyTester::SenderState*>
+                    (requestSenderState->predecessor);
             testerSenderState->subBlock.mergeFrom(data);
         }
-
-        mylist.push_back(pkt);
     }
-    delete srequest;
-    reqCoalescer.erase(request_line_address);
-    assert(!reqCoalescer.count(request_line_address));



-    completeHitCallback(mylist, len);
+    m_outstanding_count--;
+    assert(m_outstanding_count >= 0);
+
+    completeHitCallback(pktList);
 }

 bool
 GPUCoalescer::empty() const
 {
-    return m_writeRequestTable.empty() && m_readRequestTable.empty();
+    return coalescedTable.empty();
 }

-// Analyzes the packet to see if this request can be coalesced.
-// If request can be coalesced, this request is added to the reqCoalescer table
-// and makeRequest returns RequestStatus_Issued;
-// If this is the first request to a cacheline, request is added to both
-// newRequests queue and to the reqCoalescer table; makeRequest
-// returns RequestStatus_Issued.
-// If there is a pending request to this cacheline and this request
-// can't be coalesced, RequestStatus_Aliased is returned and
-// the packet needs to be reissued.
+RubyRequestType
+GPUCoalescer::getRequestType(PacketPtr pkt)
+{
+    RubyRequestType req_type = RubyRequestType_NULL;
+
+    // These types are not supported or not used in GPU caches.
+    assert(!pkt->req->isLLSC());
+    assert(!pkt->req->isLockedRMW());
+    assert(!pkt->req->isInstFetch());
+    assert(!pkt->isFlush());
+
+    if (pkt->req->isAtomicReturn()) {
+        req_type = RubyRequestType_ATOMIC_RETURN;
+    } else if (pkt->req->isAtomicNoReturn()) {
+        req_type = RubyRequestType_ATOMIC_NO_RETURN;
+    } else if (pkt->isRead()) {
+        req_type = RubyRequestType_LD;
+    } else if (pkt->isWrite()) {
+        req_type = RubyRequestType_ST;
+    } else {
+        // Acquire and release packets will have been issued by
+        // makeRequest, so we do not need to check for them here.
+        panic("Unsupported ruby packet type\n");
+    }
+
+    return req_type;
+}
+
+// Places an uncoalesced packet in uncoalescedTable. If the packet is a
+// special type (MemFence, scoping, etc), it is issued immediately.
 RequestStatus
 GPUCoalescer::makeRequest(PacketPtr pkt)
 {
@@ -719,147 +607,37 @@
         }
     }

-    // If number of outstanding requests greater than the max allowed,
-    // return RequestStatus_BufferFull. This logic can be extended to
-    // support proper backpressure.
-    if (m_outstanding_count >= m_max_outstanding_requests) {
-        return RequestStatus_BufferFull;
-    }
-
-    RubyRequestType primary_type = RubyRequestType_NULL;
-    RubyRequestType secondary_type = RubyRequestType_NULL;
-
-    if (pkt->isLLSC()) {
-        //
-        // Alpha LL/SC instructions need to be handled carefully by the cache
-        // coherence protocol to ensure they follow the proper semantics. In
-        // particular, by identifying the operations as atomic, the protocol
-        // should understand that migratory sharing optimizations should not
-        // be performed (i.e. a load between the LL and SC should not steal
-        // away exclusive permission).
-        //
-        if (pkt->isWrite()) {
-            primary_type = RubyRequestType_Store_Conditional;
-        } else {
-            assert(pkt->isRead());
-            primary_type = RubyRequestType_Load_Linked;
-        }
-        secondary_type = RubyRequestType_ATOMIC;
-    } else if (pkt->req->isLockedRMW()) {
-        //
-        // x86 locked instructions are translated to store cache coherence
-        // requests because these requests should always be treated as read
-        // exclusive operations and should leverage any migratory sharing
-        // optimization built into the protocol.
-        //
-        if (pkt->isWrite()) {
-            primary_type = RubyRequestType_Locked_RMW_Write;
-        } else {
-            assert(pkt->isRead());
-            primary_type = RubyRequestType_Locked_RMW_Read;
-        }
-        secondary_type = RubyRequestType_ST;
-    } else if (pkt->isAtomicOp()) {
-        //
-        // GPU Atomic Operation
-        //
-        primary_type = RubyRequestType_ATOMIC;
-        secondary_type = RubyRequestType_ATOMIC;
-    } else {
-        if (pkt->isRead()) {
-            if (pkt->req->isInstFetch()) {
-                primary_type = secondary_type = RubyRequestType_IFETCH;
-            } else {
-#if THE_ISA == X86_ISA
-                uint32_t flags = pkt->req->getFlags();
-                bool storeCheck = flags &
-                        (TheISA::StoreCheck << TheISA::FlagShift);
-#else
-                bool storeCheck = false;
-#endif // X86_ISA
-                if (storeCheck) {
-                    primary_type = RubyRequestType_RMW_Read;
-                    secondary_type = RubyRequestType_ST;
-                } else {
-                    primary_type = secondary_type = RubyRequestType_LD;
-                }
+    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
+        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
+        (pkt->req->isRelease() || pkt->req->isAcquire())) {
+        if (assumingRfOCoherence) {
+            // If we reached here, this request must be a memFence and the
+            // protocol implements RfO, so the coalescer can assume
+            // sequential consistency and schedule the callback
+            // immediately.
+            // Currently the code implements fence callbacks
+            // by reusing the mechanism for kernel completions.
+            // This should be fixed.
+            int wf_id = 0;
+            if (pkt->req->hasContextId()) {
+                wf_id = pkt->req->contextId();
             }
-        } else if (pkt->isWrite()) {
-            //
-            // Note: M5 packets do not differentiate ST from RMW_Write
-            //
-            primary_type = secondary_type = RubyRequestType_ST;
-        } else if (pkt->isFlush()) {
-            primary_type = secondary_type = RubyRequestType_FLUSH;
-        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
-            if (assumingRfOCoherence) {
-                // If we reached here, this request must be a memFence
-                // and the protocol implements RfO, the coalescer can
-                // assume sequentially consistency and schedule the callback
-                // immediately.
-                // Currently the code implements fence callbacks
-                // by reusing the mechanism for kernel completions.
-                // This should be fixed.
-                int wf_id = 0;
-                if (pkt->req->hasContextId()) {
-                    wf_id = pkt->req->contextId();
-                }
-                insertKernel(wf_id, pkt);
-                newKernelEnds.push_back(wf_id);
-                if (!issueEvent.scheduled()) {
-                    schedule(issueEvent, curTick());
-                }
-                return RequestStatus_Issued;
-            } else {
-                // If not RfO, return issued here and let the child coalescer
-                // take care of it.
-                return RequestStatus_Issued;
+            insertKernel(wf_id, pkt);
+            newKernelEnds.push_back(wf_id);
+            if (!issueEvent.scheduled()) {
+                schedule(issueEvent, curTick());
             }
+            return RequestStatus_Issued;
         } else {
-            panic("Unsupported ruby packet type\n");
+            // If not RfO, return issued here and let the child coalescer
+            // take care of it.
+            return RequestStatus_Issued;
         }
     }

-    // Check if there is any pending request to this cache line from
-    // previous cycles.
-    // If there is a pending request, return aliased. Since coalescing
-    // across time is not permitted, aliased requests are not coalesced.
-    // If a request for this address has already been issued, we must block
-    RequestStatus status = getRequestStatus(pkt, primary_type);
-    if (status != RequestStatus_Ready)
-        return status;
+    uncoalescedTable.insertPacket(pkt);
+    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

-    Addr line_addr = makeLineAddress(pkt->getAddr());
-
-    // Check if this request can be coalesced with previous
-    // requests from this cycle.
-    if (!reqCoalescer.count(line_addr)) {
-        // This is the first access to this cache line.
-        // A new request to the memory subsystem has to be
-        // made in the next cycle for this cache line, so
-        // add this line addr to the "newRequests" queue
-        newRequests.push_back(line_addr);
-
-    // There was a request to this cache line in this cycle,
-    // let us see if we can coalesce this request with the previous
-    // requests from this cycle
-    } else if (primary_type !=
-               reqCoalescer[line_addr][0].primaryType) {
-        // can't coalesce loads, stores and atomics!
-        return RequestStatus_Aliased;
-    } else if (pkt->req->isLockedRMW() ||
-               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
-        // can't coalesce locked accesses, but can coalesce atomics!
-        return RequestStatus_Aliased;
-    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
-               pkt->req->contextId() !=
-               reqCoalescer[line_addr][0].pkt->req->contextId()) {
-        // can't coalesce releases from different wavefronts
-        return RequestStatus_Aliased;
-    }
-
-    // in addition to the packet, we need to save both request types
-    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
     if (!issueEvent.scheduled())
         schedule(issueEvent, curTick());
     // TODO: issue hardware prefetches here
@@ -867,8 +645,9 @@
 }

 void
-GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+GPUCoalescer::issueRequest(CoalescedRequest* crequest)
 {
+    PacketPtr pkt = crequest->getFirstPkt();

     int proc_id = -1;
     if (pkt != NULL && pkt->req->hasContextId()) {
@@ -901,9 +680,9 @@
     uint32_t blockSize = RubySystem::getBlockSizeBytes();
     std::vector<bool> accessMask(blockSize,false);
     std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
-    uint32_t tableSize = reqCoalescer[line_addr].size();
+    uint32_t tableSize = crequest->getPackets().size();
     for (int i = 0; i < tableSize; i++) {
-        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
+        PacketPtr tmpPkt = crequest->getPackets()[i];
         uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
         uint32_t tmpSize = tmpPkt->getSize();
         if (tmpPkt->isAtomicOp()) {
@@ -922,7 +701,7 @@
     if (pkt->isAtomicOp()) {
         msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                               pkt->getPtr<uint8_t>(),
-                              pkt->getSize(), pc, secondary_type,
+                              pkt->getSize(), pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
@@ -931,7 +710,7 @@
     } else {
         msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                               pkt->getPtr<uint8_t>(),
-                              pkt->getSize(), pc, secondary_type,
+                              pkt->getSize(), pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
@@ -941,15 +720,21 @@
     DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
              curTick(), m_version, "Coal", "Begin", "", "",
              printAddress(msg->getPhysicalAddress()),
-             RubyRequestType_to_string(secondary_type));
+             RubyRequestType_to_string(crequest->getRubyType()));

-    fatal_if(secondary_type == RubyRequestType_IFETCH,
+    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
"there should not be any I-Fetch requests in the GPU Coalescer");

     Tick latency = cyclesToTicks(
-        m_controller->mandatoryQueueLatency(secondary_type));
+        m_controller->mandatoryQueueLatency(crequest->getRubyType()));
     assert(latency > 0);

+    if (!deadlockCheckEvent.scheduled()) {
+        schedule(deadlockCheckEvent,
+                 m_deadlock_threshold * clockPeriod() +
+                 curTick());
+    }
+
     assert(m_mandatory_q_ptr);
     m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
 }
@@ -971,8 +756,6 @@
 {
     out << "[GPUCoalescer: " << m_version
         << ", outstanding requests: " << m_outstanding_count
-        << ", read request table: " << m_readRequestTable
-        << ", write request table: " << m_writeRequestTable
         << "]";
 }

@@ -983,40 +766,96 @@
             SequencerRequestType_to_string(requestType));
 }

+bool
+GPUCoalescer::coalescePacket(PacketPtr pkt)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    // If the packet has the same line address as a request already in the
+    // coalescedTable and has the same sequence number, it can be coalesced.
+    if (coalescedTable.count(line_addr)) {
+        // Search for a previous coalesced request with the same seqNum.
+        auto& creqQueue = coalescedTable.at(line_addr);
+        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
+            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
+        );
+        if (citer != creqQueue.end()) {
+            (*citer)->insertPacket(pkt);
+            return true;
+        }
+    }
+
+    if (m_outstanding_count < m_max_outstanding_requests) {
+        // This is an "aliased" or new request. Create a RubyRequest and
+        // append it to the list of "targets" in the coalescing table.
+        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
+                line_addr);
+
+        CoalescedRequest *creq = new CoalescedRequest(seqNum);
+        creq->insertPacket(pkt);
+        creq->setRubyType(getRequestType(pkt));
+        creq->setIssueTime(curCycle());
+
+        if (!coalescedTable.count(line_addr)) {
+            // If there is no outstanding request for this line address,
+            // create a new coalesced request and issue it immediately.
+            auto reqList = std::deque<CoalescedRequest*> { creq };
+            coalescedTable.insert(std::make_pair(line_addr, reqList));
+
+            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
+            issueRequest(creq);
+        } else {
+            // The request is for a line address that is already outstanding
+            // but for a different instruction. Add it as a new request to be
+            // issued when the current outstanding request is completed.
+            coalescedTable.at(line_addr).push_back(creq);
+ DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
+                    line_addr, seqNum);
+        }
+
+        // In both cases, requests are added to the coalescing table and will
+        // be counted as outstanding requests.
+        m_outstanding_count++;
+
+        return true;
+    }
+
+    // The maximum number of outstanding requests have been issued.
+    return false;
+}

 void
 GPUCoalescer::completeIssue()
 {
-    // newRequests has the cacheline addresses of all the
-    // requests which need to be issued to the memory subsystem
-    // in this cycle
-    int len = newRequests.size();
-    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
-    for (int i = 0; i < len; ++i) {
-        // Get the requests from reqCoalescer table. Get only the
-        // first request for each cacheline, the remaining requests
-        // can be coalesced with the first request. So, only
-        // one request is issued per cacheline.
-        RequestDesc info = reqCoalescer[newRequests[i]][0];
-        PacketPtr pkt = info.pkt;
-        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
-                i, pkt->req->getPaddr());
-        // Insert this request to the read/writeRequestTables. These tables
-        // are used to track aliased requests in makeRequest subroutine
-        bool found = insertRequest(pkt, info.primaryType);
+    // Iterate over the maximum number of instructions we can coalesce
+    // per cycle (coalescingWindow).
+    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
+        PerInstPackets *pktList =
+            uncoalescedTable.getInstPackets(instIdx);

-        if (found) {
- panic("GPUCoalescer::makeRequest should never be called if the "
-                  "request is already outstanding\n");
+        // getInstPackets will return nullptr if no instruction
+        // exists at the current offset.
+        if (!pktList) {
+            break;
+        } else {
+            // Since we have a pointer to the list of packets in the inst,
+            // erase them from the list if coalescing is successful and
+            // leave them in the list otherwise. This aggressively attempts
+            // to coalesce as many packets as possible from the current inst.
+            pktList->remove_if(
+                [&](PacketPtr pkt) { return coalescePacket(pkt); }
+            );
         }
-
-        // Issue request to ruby subsystem
-        issueRequest(pkt, info.secondaryType);
     }
-    newRequests.clear();
+
+    // Clean up any instructions in the uncoalesced table that have had
+    // all of their packets coalesced and return a token for each one.
+    uncoalescedTable.updateResources();

     // have Kernel End releases been issued this cycle
-    len = newKernelEnds.size();
+    int len = newKernelEnds.size();
     for (int i = 0; i < len; i++) {
         kernelCallback(newKernelEnds[i]);
     }
@@ -1045,71 +884,27 @@
                              const DataBlock& data)
 {
     assert(address == makeLineAddress(address));
+    assert(coalescedTable.count(address));

-    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
-    assert(m_writeRequestTable.count(makeLineAddress(address)));
+    auto crequest = coalescedTable.at(address).front();

-    RequestTable::iterator i = m_writeRequestTable.find(address);
-    assert(i != m_writeRequestTable.end());
-    GPUCoalescerRequest* srequest = i->second;
+    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
+              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
+              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
+             "atomicCallback saw non-atomic type response\n");

-    m_writeRequestTable.erase(i);
-    markRemoved();
+    hitCallback(crequest, mach, (DataBlock&)data, true,
+                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

-    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
-           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
-           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+    delete crequest;
+    coalescedTable.at(address).pop_front();

-
-    // Atomics don't write to cache, so there is no MRU update...
-
-    recordMissLatency(srequest, mach,
-                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
-
-    PacketPtr pkt = srequest->pkt;
-    Addr request_address = pkt->getAddr();
-    Addr request_line_address = makeLineAddress(pkt->getAddr());
-
-    int len = reqCoalescer[request_line_address].size();
-    std::vector<PacketPtr> mylist;
-    for (int i = 0; i < len; ++i) {
-        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
-        assert(srequest->m_type ==
-               reqCoalescer[request_line_address][i].primaryType);
-        request_address = (pkt->getAddr());
-        request_line_address = makeLineAddress(request_address);
-        if (pkt->getPtr<uint8_t>() &&
-            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
-            /* atomics are done in memory, and return the data *before* the atomic op... */
-            pkt->setData(
-                data.getData(getOffset(request_address), pkt->getSize()));
-        } else {
-            DPRINTF(MemoryAccess,
- "WARNING. Data not transfered from Ruby to M5 for type " \
-                    "%s\n",
-                    RubyRequestType_to_string(srequest->m_type));
-        }
-
-        // If using the RubyTester, update the RubyTester sender state's
-        // subBlock with the recieved data.  The tester will later access
-        // this state.
-        // Note: RubyPort will access it's sender state before the
-        // RubyTester.
-        if (m_usingRubyTester) {
-            RubyPort::SenderState *requestSenderState =
-                safe_cast<RubyPort::SenderState*>(pkt->senderState);
-            RubyTester::SenderState* testerSenderState =
-                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
-            testerSenderState->subBlock.mergeFrom(data);
-        }
-
-        mylist.push_back(pkt);
+    if (coalescedTable.at(address).empty()) {
+        coalescedTable.erase(address);
+    } else {
+        auto nextRequest = coalescedTable.at(address).front();
+        issueRequest(nextRequest);
     }
-    delete srequest;
-    reqCoalescer.erase(request_line_address);
-    assert(!reqCoalescer.count(request_line_address));
-
-    completeHitCallback(mylist, len);
 }

 void
@@ -1141,42 +936,42 @@
 }

 void
-GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
 {
-    for (int i = 0; i < len; ++i) {
+    for (auto& pkt : mylist) {
         RubyPort::SenderState *ss =
-            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+            safe_cast<RubyPort::SenderState *>(pkt->senderState);
         MemSlavePort *port = ss->port;
         assert(port != NULL);

-        mylist[i]->senderState = ss->predecessor;
+        pkt->senderState = ss->predecessor;
         delete ss;
-        port->hitCallback(mylist[i]);
+        port->hitCallback(pkt);
         trySendRetries();
     }

+    // We schedule an event in the same tick as hitCallback (similar to
+    // makeRequest) rather than calling completeIssue directly to reduce
+    // function calls to complete issue. This can only happen if the max
+    // outstanding requests is less than the number of slots in the
+    // uncoalesced table and makeRequest is not called again.
+    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
+        schedule(issueEvent, curTick());
+    }
+
     testDrainComplete();
 }

-PacketPtr
-GPUCoalescer::mapAddrToPkt(Addr address)
-{
-    RequestTable::iterator i = m_readRequestTable.find(address);
-    assert(i != m_readRequestTable.end());
-    GPUCoalescerRequest* request = i->second;
-    return request->pkt;
-}
-
 void
-GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                 MachineType mach,
                                 Cycles initialRequestTime,
                                 Cycles forwardRequestTime,
                                 Cycles firstResponseTime,
                                 bool success, bool isRegion)
 {
-    RubyRequestType type = srequest->m_type;
-    Cycles issued_time = srequest->issue_time;
+    RubyRequestType type = crequest->getRubyType();
+    Cycles issued_time = crequest->getIssueTime();
     Cycles completion_time = curCycle();
     assert(completion_time >= issued_time);
     Cycles total_lat = completion_time - issued_time;
@@ -1242,7 +1037,7 @@
     DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
              curTick(), m_version, "Coal",
              success ? "Done" : "SC_Failed", "", "",
-             printAddress(srequest->pkt->getAddr()), total_lat);
+             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
 }

 void
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 3230ef1..56a2079 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -48,6 +48,7 @@
 #include "mem/ruby/protocol/RubyRequestType.hh"
 #include "mem/ruby/protocol/SequencerRequestType.hh"
 #include "mem/ruby/system/Sequencer.hh"
+#include "mem/token_port.hh"

 class DataBlock;
 class CacheMsg;
@@ -59,47 +60,99 @@
 HSAScope reqScopeToHSAScope(const RequestPtr &req);
 HSASegment reqSegmentToHSASegment(const RequestPtr &req);

-struct GPUCoalescerRequest
-{
-    PacketPtr pkt;
-    RubyRequestType m_type;
-    Cycles issue_time;
+// List of packets that belong to a specific instruction.
+typedef std::list<PacketPtr> PerInstPackets;

-    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
-                        Cycles _issue_time)
-        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
-    {}
-};
-
-class RequestDesc
+class UncoalescedTable
 {
   public:
-    RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
-        : pkt(pkt), primaryType(p_type), secondaryType(s_type)
-    {
-    }
+    UncoalescedTable(GPUCoalescer *gc);
+    ~UncoalescedTable() {}

-    RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
-        secondaryType(RubyRequestType_NULL)
-    {
-    }
+    void insertPacket(PacketPtr pkt);
+    bool packetAvailable();
+    void printRequestTable(std::stringstream& ss);

-    PacketPtr pkt;
-    RubyRequestType primaryType;
-    RubyRequestType secondaryType;
+    // Returns a pointer to the list of packets corresponding to an
+    // instruction in the instruction map or nullptr if there are no
+    // instructions at the offset.
+    PerInstPackets* getInstPackets(int offset);
+    void updateResources();
+
+    // Check if any packet has been stuck in instMap for too long.
+    // Panics if a deadlock is detected; otherwise returns nothing.
+    void checkDeadlock(Tick threshold);
+
+  private:
+    GPUCoalescer *coalescer;
+
+    // Maps an instruction's unique sequence number to a queue of packets
+    // which need responses. This data structure assumes the sequence number
+    // is monotonically increasing (which is true for the CU class) in order
+    // to issue packets in age order.
+    std::map<uint64_t, PerInstPackets> instMap;
 };

-std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
+class CoalescedRequest
+{
+  public:
+    CoalescedRequest(uint64_t _seqNum)
+        : seqNum(_seqNum), issueTime(Cycles(0)),
+          rubyType(RubyRequestType_NULL)
+    {}
+    ~CoalescedRequest() {}
+
+    void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
+    void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
+    void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
+    void setRubyType(RubyRequestType type) { rubyType = type; }
+
+    uint64_t getSeqNum() const { return seqNum; }
+    PacketPtr getFirstPkt() const { return pkts[0]; }
+    Cycles getIssueTime() const { return issueTime; }
+    RubyRequestType getRubyType() const { return rubyType; }
+    std::vector<PacketPtr>& getPackets() { return pkts; }
+
+  private:
+    uint64_t seqNum;
+    Cycles issueTime;
+    RubyRequestType rubyType;
+    std::vector<PacketPtr> pkts;
+};

 class GPUCoalescer : public RubyPort
 {
   public:
+    class GMTokenPort : public TokenSlavePort
+    {
+      public:
+        GMTokenPort(const std::string& name, ClockedObject *owner,
+                    PortID id = InvalidPortID)
+            : TokenSlavePort(name, owner, id)
+        { }
+        ~GMTokenPort() { }
+
+      protected:
+        Tick recvAtomic(PacketPtr) { return Tick(0); }
+        void recvFunctional(PacketPtr) { }
+        bool recvTimingReq(PacketPtr) { return false; }
+        AddrRangeList getAddrRanges() const
+        {
+            AddrRangeList ranges;
+            return ranges;
+        }
+    };
+
     typedef RubyGPUCoalescerParams Params;
     GPUCoalescer(const Params *);
     ~GPUCoalescer();

+    Port &getPort(const std::string &if_name,
+                  PortID idx = InvalidPortID) override;
+
     // Public Methods
     void wakeup(); // Used only for deadlock detection
+    void printRequestTable(std::stringstream& ss);

     void printProgress(std::ostream& out) const;
     void resetStats() override;
@@ -177,13 +230,13 @@

     void print(std::ostream& out) const;

-    void markRemoved();
-    void removeRequest(GPUCoalescerRequest* request);
     void evictionCallback(Addr address);
     void completeIssue();

     void insertKernel(int wavefront_id, PacketPtr pkt);

+    GMTokenPort& getGMTokenPort() { return gmTokenPort; }
+
     void recordRequestType(SequencerRequestType requestType);
     Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

@@ -224,11 +277,11 @@
                         Addr pc, RubyAccessMode access_mode,
                         int size, DataBlock*& data_ptr);
     // Alternate implementations in VIPER Coalescer
-    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
+    virtual void issueRequest(CoalescedRequest* crequest);

     void kernelCallback(int wavfront_id);

-    void hitCallback(GPUCoalescerRequest* request,
+    void hitCallback(CoalescedRequest* crequest,
                      MachineType mach,
                      DataBlock& data,
                      bool success,
@@ -236,21 +289,23 @@
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);
-    void recordMissLatency(GPUCoalescerRequest* request,
+    void recordMissLatency(CoalescedRequest* crequest,
                            MachineType mach,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool success, bool isRegion);
-    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
-    PacketPtr mapAddrToPkt(Addr address);
+    void completeHitCallback(std::vector<PacketPtr> & mylist);


-    RequestStatus getRequestStatus(PacketPtr pkt,
-                                   RubyRequestType request_type);
-    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+    virtual RubyRequestType getRequestType(PacketPtr pkt);

-    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+    // Attempt to remove a packet from the uncoalescedTable and coalesce
+    // with a previous request from the same instruction. If there is no
+    // previous instruction and the max number of outstanding requests has
+    // not been reached, a new coalesced request is created and added to the
+    // "target" list of the coalescedTable.
+    bool coalescePacket(PacketPtr pkt);

     EventFunctionWrapper issueEvent;

@@ -258,22 +313,27 @@
   // Changed to protected to enable inheritance by VIPER Coalescer
   protected:
     int m_max_outstanding_requests;
-    int m_deadlock_threshold;
+    Cycles m_deadlock_threshold;

     CacheMemory* m_dataCache_ptr;
     CacheMemory* m_instCache_ptr;

-    // We need to track both the primary and secondary request types.
-    // The secondary request type comprises a subset of RubyRequestTypes that
-    // are understood by the L1 Controller. A primary request type can be any
-    // RubyRequestType.
-    typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
-    CoalescingTable reqCoalescer;
-    std::vector<Addr> newRequests;
+    // coalescingWindow is the maximum number of instructions that are
+    // allowed to be coalesced in a single cycle.
+    int coalescingWindow;

-    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
-    RequestTable m_writeRequestTable;
-    RequestTable m_readRequestTable;
+    // The uncoalescedTable contains several "columns" which hold memory
+    // request packets for an instruction. The maximum size is the number of
+    // columns * the wavefront size.
+    UncoalescedTable uncoalescedTable;
+
+    // An MSHR-like structure for holding coalesced requests. The requests in
+    // this table may or may not be outstanding in the memory hierarchy. The
+    // maximum size is equal to the maximum number of outstanding requests
+    // for a CU (typically the number of blocks in TCP). If there are
+    // duplicates of an address, they are serviced in age order.
+    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+
     // Global outstanding request count, across all request tables
     int m_outstanding_count;
     bool m_deadlock_check_scheduled;
@@ -334,7 +394,12 @@
     std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
     std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

-private:
+  private:
+    // The token port is used to send/receive tokens to/from the GPU's global
+    // memory pipeline across the port boundary. There is one token port per
+    // <wave size> data ports in the CU.
+    GMTokenPort gmTokenPort;
+
     // Private copy constructor and assignment operator
     GPUCoalescer(const GPUCoalescer& obj);
     GPUCoalescer& operator=(const GPUCoalescer& obj);
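
The header above describes the two-level organization: an UncoalescedTable
keyed by instruction sequence number, and a coalescedTable keyed by cache-line
address whose entries hold MSHR-like target lists. The toy program below
sketches that shape under stated assumptions (ToyPacket, lineAddr(), and a
64-byte line size are invented for illustration and are not gem5 APIs); it
models the idea, not the actual coalescePacket() implementation.

    // Self-contained model of the two-level tables; not gem5 code.
    #include <cstdint>
    #include <deque>
    #include <iostream>
    #include <list>
    #include <map>
    #include <vector>

    using Addr = uint64_t;

    struct ToyPacket { uint64_t seqNum; Addr addr; };

    constexpr Addr kLineSize = 64;                    // assumed line size
    Addr lineAddr(Addr a) { return a & ~(kLineSize - 1); }

    struct ToyCoalescedRequest {
        uint64_t seqNum;
        std::vector<ToyPacket> pkts;                  // coalesced "targets"
    };

    int main() {
        // Level 1: uncoalesced packets, keyed by instruction sequence number.
        // Age order falls out of the monotonically increasing seqNum.
        std::map<uint64_t, std::list<ToyPacket>> uncoalescedTable;

        // Level 2: coalesced requests, keyed by line address; later requests
        // to the same line queue behind the outstanding one, MSHR-style.
        std::map<Addr, std::deque<ToyCoalescedRequest>> coalescedTable;

        // Two lanes of instruction 7 touch the same line, one touches another.
        uncoalescedTable[7] = { {7, 0x1000}, {7, 0x1008}, {7, 0x2000} };

        for (auto &inst : uncoalescedTable) {
            for (auto &pkt : inst.second) {
                Addr line = lineAddr(pkt.addr);
                auto &targets = coalescedTable[line];
                // New request if the line is idle or its newest request came
                // from a different instruction; otherwise coalesce into it.
                if (targets.empty() || targets.back().seqNum != pkt.seqNum)
                    targets.push_back({pkt.seqNum, {}});
                targets.back().pkts.push_back(pkt);
            }
        }

        for (auto &kv : coalescedTable)
            std::cout << std::hex << "line 0x" << kv.first << std::dec
                      << ": " << kv.second.front().pkts.size()
                      << " coalesced packet(s)\n";
        return 0;
    }

Using ordered std::map containers is what makes "age order" fall out for free
here, which is one reading of why the comments above stress a monotonically
increasing sequence number.
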
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
index c02fb75..0335981 100644
--- a/src/mem/ruby/system/GPUCoalescer.py
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -42,6 +42,8 @@
    # max_outstanding_requests = (wave front slots) x (wave front size)
   max_outstanding_requests = Param.Int(40*64,
                                "max requests (incl. prefetches) outstanding")
+   max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
+                                "coalesced in a single cycle")
    assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
                            "Ownership coherence");

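The new max_coalesces_per_cycle parameter bounds how many instructions the
coalescer may pull out of the uncoalesced table in one cycle (it surfaces as
coalescingWindow on the C++ side). A rough, illustrative-only sketch of that
per-cycle bound is shown below; the names and packet counts are invented and
this is not the gem5 completeIssue() code.

    // Illustrative-only per-cycle coalescing window; not gem5 code.
    #include <cstdint>
    #include <iostream>
    #include <map>

    int main() {
        const int coalescingWindow = 1;  // mirrors max_coalesces_per_cycle = 1

        // Instruction seqNum -> number of pending packets for that instruction.
        std::map<uint64_t, int> uncoalesced = { {1, 4}, {2, 4}, {3, 4} };

        int cycle = 0;
        while (!uncoalesced.empty()) {
            int coalescedThisCycle = 0;
            while (coalescedThisCycle < coalescingWindow &&
                   !uncoalesced.empty()) {
                auto oldest = uncoalesced.begin();   // age order via seqNum
                std::cout << "cycle " << cycle << ": coalesce instruction "
                          << oldest->first << " (" << oldest->second
                          << " packets)\n";
                uncoalesced.erase(oldest);
                ++coalescedThisCycle;
            }
            ++cycle;
        }
        return 0;
    }
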
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index feb13c5..d8977ac 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -76,15 +76,8 @@
 {
 }

-// Analyzes the packet to see if this request can be coalesced.
-// If request can be coalesced, this request is added to the reqCoalescer table
-// and makeRequest returns RequestStatus_Issued;
-// If this is the first request to a cacheline, request is added to both
-// newRequests queue and to the reqCoalescer table; makeRequest
-// returns RequestStatus_Issued.
-// If there is a pending request to this cacheline and this request
-// can't be coalesced, RequestStatus_Aliased is returned and
-// the packet needs to be reissued.
+// Places an uncoalesced packet in the uncoalescedTable. If the packet is a
+// special type (MemFence, scoping, etc.), it is issued immediately.
 RequestStatus
 VIPERCoalescer::makeRequest(PacketPtr pkt)
 {
@@ -109,7 +102,6 @@

             return RequestStatus_Issued;
         }
-//        return RequestStatus_Aliased;
     } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
         // Flush Dirty Data on Kernel End
         // isKernel + isRelease
@@ -123,13 +115,10 @@
         }
         return RequestStatus_Issued;
     }
-    RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
-    if (requestStatus!=RequestStatus_Issued) {
-        // Request not isssued
-        // enqueue Retry
-        DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n");
-        return requestStatus;
-    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+
+    GPUCoalescer::makeRequest(pkt);
+
+    if (pkt->req->isKernel() && pkt->req->isAcquire()) {
         // Invalidate clean Data on Kernel Begin
         // isKernel + isAcquire
         invL1();
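
With the table refactor, VIPERCoalescer::makeRequest() above no longer
forwards an Aliased status: special packets (memory fences, kernel
release/acquire) are handled immediately, and ordinary packets always fit in
the uncoalesced table because the CU only issues an instruction when it holds
a token. A simplified control-flow sketch of that behavior follows; the
ToyRequest type, Status enum, and makeRequestSketch() are stand-ins, not gem5
code.

    // Simplified control-flow sketch of the behavior described above.
    #include <iostream>

    enum class Status { Issued };  // 'Aliased' no longer exists in this flow

    struct ToyRequest {
        bool isMemFence = false;
        bool isKernelRelease = false;
        bool isKernelAcquire = false;
    };

    Status makeRequestSketch(const ToyRequest &req) {
        if (req.isMemFence || req.isKernelRelease) {
            // Fences and kernel-end flushes are issued immediately.
            std::cout << "special request: issue immediately\n";
            return Status::Issued;
        }
        // Ordinary requests always fit: the CU only sends an instruction
        // when it holds a token, so the uncoalesced table has room.
        std::cout << "insert into uncoalesced table\n";
        if (req.isKernelAcquire)
            std::cout << "kernel begin: invalidate clean L1 data\n";
        return Status::Issued;
    }

    int main() {
        ToyRequest load;                 // ordinary load/store
        makeRequestSketch(load);

        ToyRequest fence;
        fence.isMemFence = true;         // memory fence
        makeRequestSketch(fence);
        return 0;
    }
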

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/27429
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I44983610307b638a97472db3576d0a30df2de600
Gerrit-Change-Number: 27429
Gerrit-PatchSet: 14
Gerrit-Owner: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Alexandru Duțu <alexandru.d...@amd.com>
Gerrit-Reviewer: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Bradford Beckmann <brad.beckm...@amd.com>
Gerrit-Reviewer: Gem5 Cloud Project GCB service account <345032938...@cloudbuild.gserviceaccount.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Juan Manuel Cebrián González <jm.cebriangonza...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: Pouya Fotouhi <pfoto...@ucdavis.edu>
Gerrit-Reviewer: Tony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Tuan Ta <q...@cornell.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged