Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

    https://gem5-review.googlesource.com/c/public/gem5/+/32836

to review the following change.


Change subject: gpu-compute: Create CU's ports in the standard way
......................................................................

gpu-compute: Create CU's ports in the standard way

The CU would initialize its ports in getMasterPort(), which
is not desirable as getMasterPort() may be called several
times for the same port. This can lead to a fatal error if the CU
expects to only create a single port of a given type, and may
lead to other issues where stat names are duplicated.

This change instantiates and initializes the CU's ports in the
CU constructor using the CU params.

The index field is also removed from the CU's ports because the
base class already has an ID field, which will be set to the
default value in the base class's constructor for scalar ports.

It doesn't make sense for scalar ports to take an index because
they are scalar, so we let the base class initialize the ID to
the invalid port ID.

Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/fetch_unit.cc
3 files changed, 86 insertions(+), 103 deletions(-)



diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 7e0947f..b9f7dec 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -96,6 +96,11 @@
     resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
     _masterId(p->system->getMasterId(this, "ComputeUnit")),
     lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
+    ldsPort(csprintf("%s-port", name()), this),
+    scalarDataPort(csprintf("%s-port", name()), this),
+    scalarDTLBPort(csprintf("%s-port", name()), this),
+    sqcPort(csprintf("%s-port", name()), this),
+    sqcTLBPort(csprintf("%s-port", name()), this),
     _cacheLineSize(p->system->cacheLineSize()),
     _numBarrierSlots(p->num_barrier_slots),
     globalSeqNum(0), wavefrontSize(p->wf_size),
@@ -169,16 +174,20 @@
         fatal("Invalid WF execution policy (CU)\n");
     }

-    memPort.resize(wfSize());
+    for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
+        memPort.push_back(
+            new DataPort(csprintf("%s-port%d", name(), i), this, i));
+    }
+
+    for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
+        tlbPort.push_back(
+            new DTLBPort(csprintf("%s-port%d", name(), i), this, i));
+    }

     // Setup tokens for slave ports. The number of tokens in memSlaveTokens
     // is the total token count for the entire vector port (i.e., this CU).
     memPortTokens = new TokenManager(p->max_cu_tokens);

-    // resize the tlbPort vectorArray
-    int tlbPort_width = perLaneTLB ? wfSize() : 1;
-    tlbPort.resize(tlbPort_width);
-
     registerExitCallback([this]() { exitCallback(); });

     lastExecCycle.resize(numVectorALUs, 0);
@@ -214,7 +223,14 @@
         lastVaddrSimd[j].clear();
     }
     lastVaddrCU.clear();
-    delete ldsPort;
+
+    for (auto mem_port : memPort) {
+        delete mem_port;
+    }
+
+    for (auto tlb_port : tlbPort) {
+        delete tlb_port;
+    }
 }

 int
@@ -781,7 +797,7 @@
     // appropriate cycle to process the timing memory response
     // This delay represents the pipeline delay
     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
-    int index = sender_state->port_index;
+    PortID index = sender_state->port_index;
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
     GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();

@@ -1007,7 +1023,7 @@
 }

 void
-ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
 {
     // There must be a way around this check to do the globalMemStart...
     Addr tmp_vaddr = pkt->req->getVaddr();
@@ -1039,7 +1055,7 @@
     tlbCycles -= curTick();
     ++tlbRequests;

-    int tlbPort_index = perLaneTLB ? index : 0;
+    PortID tlbPort_index = perLaneTLB ? index : 0;

     if (shader->timingSim) {
         if (debugSegFault) {
@@ -1205,12 +1221,12 @@
new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
                                              pkt->senderState);

-    if (scalarDTLBPort->isStalled()) {
-        assert(scalarDTLBPort->retries.size());
-        scalarDTLBPort->retries.push_back(pkt);
-    } else if (!scalarDTLBPort->sendTimingReq(pkt)) {
-        scalarDTLBPort->stallPort();
-        scalarDTLBPort->retries.push_back(pkt);
+    if (scalarDTLBPort.isStalled()) {
+        assert(scalarDTLBPort.retries.size());
+        scalarDTLBPort.retries.push_back(pkt);
+    } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
+        scalarDTLBPort.stallPort();
+        scalarDTLBPort.retries.push_back(pkt);
     } else {
DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
                 tlb_mode == BaseTLB::Read ? "read" : "write",
@@ -1308,7 +1324,7 @@

     DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
             compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
-            pkt->req->getPaddr(), index);
+            pkt->req->getPaddr(), id);

     Addr paddr = pkt->req->getPaddr();

@@ -1321,7 +1337,7 @@
     int index = gpuDynInst->memStatusVector[paddr].back();

     DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
-            pkt->req->getPaddr(), index);
+            pkt->req->getPaddr(), id);

     gpuDynInst->memStatusVector[paddr].pop_back();
     gpuDynInst->pAddr = pkt->req->getPaddr();
@@ -1425,7 +1441,7 @@
         safe_cast<DTLBPort::SenderState*>(pkt->senderState);

     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    int mp_index = sender_state->portIndex;
+    PortID mp_index = sender_state->portIndex;
     Addr vaddr = pkt->req->getVaddr();
     gpuDynInst->memStatusVector[line].push_back(mp_index);
     gpuDynInst->tlbHitLevel[mp_index] = hit_level;
@@ -1575,14 +1591,13 @@

         DPRINTF(GPUPort,
                 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
-                compute_unit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, index,
-                pkt->req->getPaddr());
+                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+                id, pkt->req->getPaddr());
     } else {
         DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index,
+                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                 pkt->req->getPaddr());
     }
 }
@@ -1598,22 +1613,21 @@
 {
     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
     GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
-    ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit;
+    ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;

-    if (!(scalarDataPort->sendTimingReq(pkt))) {
-        scalarDataPort->retries.push_back(pkt);
+    if (!(scalarDataPort.sendTimingReq(pkt))) {
+        scalarDataPort.retries.push_back(pkt);

         DPRINTF(GPUPort,
-                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                 compute_unit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, scalarDataPort->index,
-                pkt->req->getPaddr());
+                gpuDynInst->wfSlotId, pkt->req->getPaddr());
     } else {
         DPRINTF(GPUPort,
-                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
+                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
-                scalarDataPort->index, pkt->req->getPaddr());
+                pkt->req->getPaddr());
     }
 }

@@ -1702,8 +1716,8 @@
     req_pkt->senderState =
         new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

-    if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) {
-        computeUnit->scalarDataPort->retries.push_back(req_pkt);
+    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
+        computeUnit->scalarDataPort.retries.push_back(req_pkt);
         DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                 gpuDynInst->disassemble());
     } else {
@@ -2544,7 +2558,7 @@
     // This is the SenderState needed upon return
     newPacket->senderState = new LDSPort::SenderState(gpuDynInst);

-    return ldsPort->sendTimingReq(newPacket);
+    return ldsPort.sendTimingReq(newPacket);
 }

 /**
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index cf51a86..55adfc8 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -448,7 +448,7 @@
     void doSmReturn(GPUDynInstPtr gpuDynInst);

     virtual void init() override;
-    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
     void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
     void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                               bool kernelMemSync,
@@ -652,16 +652,15 @@
     class DataPort : public MasterPort
     {
       public:
-        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
-            : MasterPort(_name, _cu), computeUnit(_cu),
-              index(_index) { }
+        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
+            : MasterPort(_name, _cu, id), computeUnit(_cu) { }

         bool snoopRangeSent;

         struct SenderState : public Packet::SenderState
         {
             GPUDynInstPtr _gpuDynInst;
-            int port_index;
+            PortID port_index;
             Packet::SenderState *saved;

             SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
@@ -681,7 +680,6 @@

       protected:
         ComputeUnit *computeUnit;
-        int index;

         virtual bool recvTimingResp(PacketPtr pkt);
         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
@@ -702,11 +700,9 @@
     class ScalarDataPort : public MasterPort
     {
       public:
-        ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
-                       PortID _index)
-            : MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index)
+        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
+            : MasterPort(_name, _cu), computeUnit(_cu)
         {
-            (void)index;
         }

         bool recvTimingResp(PacketPtr pkt) override;
@@ -727,11 +723,11 @@
         class MemReqEvent : public Event
         {
           private:
-            ScalarDataPort *scalarDataPort;
+            ScalarDataPort &scalarDataPort;
             PacketPtr pkt;

           public:
-            MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
+            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
                 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
             {
               setFlags(Event::AutoDelete);
@@ -745,16 +741,14 @@

       private:
         ComputeUnit *computeUnit;
-        PortID index;
     };

     // Instruction cache access port
     class SQCPort : public MasterPort
     {
       public:
-        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
-            : MasterPort(_name, _cu), computeUnit(_cu),
-              index(_index) { }
+        SQCPort(const std::string &_name, ComputeUnit *_cu)
+            : MasterPort(_name, _cu), computeUnit(_cu) { }

         bool snoopRangeSent;

@@ -775,7 +769,6 @@

       protected:
         ComputeUnit *computeUnit;
-        int index;

         virtual bool recvTimingResp(PacketPtr pkt);
         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
@@ -795,9 +788,9 @@
     class DTLBPort : public MasterPort
     {
       public:
-        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
-            : MasterPort(_name, _cu), computeUnit(_cu),
-              index(_index), stalled(false)
+        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
+            : MasterPort(_name, _cu, id), computeUnit(_cu),
+              stalled(false)
         { }

         bool isStalled() { return stalled; }
@@ -820,7 +813,7 @@

             // the lane in the memInst this is associated with, so we send
             // the memory request down the right port
-            int portIndex;
+            PortID portIndex;

             // constructor used for packets involved in timing accesses
             SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
@@ -830,7 +823,6 @@

       protected:
         ComputeUnit *computeUnit;
-        int index;
         bool stalled;

         virtual bool recvTimingResp(PacketPtr pkt);
@@ -913,8 +905,8 @@
     class LDSPort : public MasterPort
     {
       public:
-        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
-        : MasterPort(_name, _cu, _id), computeUnit(_cu)
+        LDSPort(const std::string &_name, ComputeUnit *_cu)
+        : MasterPort(_name, _cu), computeUnit(_cu)
         {
         }

@@ -983,13 +975,7 @@
     /** The port to access the Local Data Store
      *  Can be connected to a LDS object
      */
-    LDSPort *ldsPort = nullptr;
-
-    LDSPort *
-    getLdsPort() const
-    {
-        return ldsPort;
-    }
+    LDSPort ldsPort;

     TokenManager *
     getTokenManager()
@@ -1004,50 +990,33 @@
     // port to the TLB hierarchy (i.e., the L1 TLB)
     std::vector<DTLBPort*> tlbPort;
     // port to the scalar data cache
-    ScalarDataPort *scalarDataPort;
+    ScalarDataPort scalarDataPort;
     // port to the scalar data TLB
-    ScalarDTLBPort *scalarDTLBPort;
+    ScalarDTLBPort scalarDTLBPort;
     // port to the SQC (i.e. the I-cache)
-    SQCPort *sqcPort;
+    SQCPort sqcPort;
     // port to the SQC TLB (there's a separate TLB for each I-cache)
-    ITLBPort *sqcTLBPort;
+    ITLBPort sqcTLBPort;

     Port &
     getPort(const std::string &if_name, PortID idx) override
     {
-        if (if_name == "memory_port") {
-            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
-                                        this, idx);
+        if (if_name == "memory_port" && idx < memPort.size()) {
             return *memPort[idx];
-        } else if (if_name == "translation_port") {
-            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
-                                        this, idx);
+        } else if (if_name == "translation_port" && idx < tlbPort.size()) {
             return *tlbPort[idx];
         } else if (if_name == "scalar_port") {
-            scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
-                                                idx), this, idx);
-            return *scalarDataPort;
+            return scalarDataPort;
         } else if (if_name == "scalar_tlb_port") {
-            scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
-                                                this);
-            return *scalarDTLBPort;
+            return scalarDTLBPort;
         } else if (if_name == "sqc_port") {
-            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
-                                  this, idx);
-            return *sqcPort;
+            return sqcPort;
         } else if (if_name == "sqc_tlb_port") {
-            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
-            return *sqcTLBPort;
+            return sqcTLBPort;
         } else if (if_name == "ldsPort") {
-            if (ldsPort) {
-                fatal("an LDS port was already allocated");
-            }
-            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
-            return *ldsPort;
-        } else if (if_name == "gmTokenPort") {
-            return gmTokenPort;
+            return ldsPort;
         } else {
-            panic("incorrect port name");
+            return ClockedObject::getPort(if_name, idx);
         }
     }

diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index ac9a5a6..3a139f5 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -174,24 +174,24 @@
                                                  computeUnit.shader->gpuTc,
                                                  false, pkt->senderState);

-        if (computeUnit.sqcTLBPort->isStalled()) {
-            assert(computeUnit.sqcTLBPort->retries.size() > 0);
+        if (computeUnit.sqcTLBPort.isStalled()) {
+            assert(computeUnit.sqcTLBPort.retries.size() > 0);

             DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                     vaddr);

-            computeUnit.sqcTLBPort->retries.push_back(pkt);
-        } else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) {
+            computeUnit.sqcTLBPort.retries.push_back(pkt);
+        } else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) {
             // Stall the data port;
             // No more packet is issued till
             // ruby indicates resources are freed by
             // a recvReqRetry() call back on this port.
-            computeUnit.sqcTLBPort->stallPort();
+            computeUnit.sqcTLBPort.stallPort();

             DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                     vaddr);

-            computeUnit.sqcTLBPort->retries.push_back(pkt);
+            computeUnit.sqcTLBPort.retries.push_back(pkt);
         } else {
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
         }
@@ -200,7 +200,7 @@
             new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
computeUnit.shader->gpuTc);

-        computeUnit.sqcTLBPort->sendFunctional(pkt);
+        computeUnit.sqcTLBPort.sendFunctional(pkt);

         TheISA::GpuTLB::TranslationState *sender_state =
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
@@ -257,8 +257,8 @@
     if (timingSim) {
         // translation is done. Send the appropriate timing memory request.

-        if (!computeUnit.sqcPort->sendTimingReq(pkt)) {
-            computeUnit.sqcPort->retries.push_back(std::make_pair(pkt,
+        if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
+            computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
wavefront));

             DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
@@ -270,7 +270,7 @@
                     pkt->req->getPaddr());
         }
     } else {
-        computeUnit.sqcPort->sendFunctional(pkt);
+        computeUnit.sqcPort.sendFunctional(pkt);
         processFetchReturn(pkt);
     }
 }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/32836
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: Id18386f5f53800a6447d968380676d8fd9bac9df
Gerrit-Change-Number: 32836
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to