Matthew Poremba has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/57711 )
Change subject: gpu-compute: Support Scalar and Vector access to system pages
......................................................................
gpu-compute: Support Scalar and Vector access to system pages
The amdgpu driver supports reading and writing scalar and vector memory
addresses that reside in system memory. This is commonly used for things
like blit kernels that perform host-to-device or device-to-host copies
using GPU load/store instructions.
This is done by utilizing the system hub device added in a prior
changeset. Memory packets translated by the Scalar or VMEM TLBs will
have the corresponding system request field set from the PTE in the TLB,
which the compute unit can use to determine whether a request targets
system memory or device memory.
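
As a rough sketch (mirroring the DataPort hunk in compute_unit.cc below, not
a new API), the vector request path consults that flag and routes
system-memory packets through the system hub instead of the Ruby port:

    // Vector request path in ComputeUnit::DataPort (sketch; see diff below).
    if (pkt->req->systemReq()) {
        // System pages bypass the Ruby network: hand the packet to the
        // system hub, which calls back into handleResponse() when the
        // access completes.
        SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
        computeUnit->shader->systemHub->sendRequest(pkt, resp_event);
    } else if (!sendTimingReq(pkt)) {
        // Device requests go out the timing port as before and are
        // retried on failure.
        retries.push_back(std::make_pair(pkt, gpuDynInst));
    }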
Another important change is returning global memory tokens for system
requests. Since these requests do not flow through the GPU coalescer, which
normally returns the token, the token is instead returned as soon as the
request is known to be a system request.
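
A minimal sketch of that token handling, following the compute_unit.cc hunk
below (the enclosing function is not shown in the diff, so its name is
omitted here): once translation marks a vector packet as a system request,
the token taken when the dispatch list was filled is handed back
immediately, once per instruction, while device requests instead get the
VRAM requestor ID:

    // After translation, before scheduling the memory request (sketch).
    if (new_pkt->req->systemReq()) {
        // System requests never reach the GPU coalescer, so return the
        // token here, but only once per gpuDynInst.
        if (!gpuDynInst->isSystemReq()) {
            computeUnit->getTokenManager()->recvTokens(1);
            gpuDynInst->setSystemReq();
        }
    } else {
        // Device requests are tagged with the VRAM requestor ID so they
        // flow through the device-memory Ruby network.
        new_pkt->req->requestorId(computeUnit->vramRequestorId());
    }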
Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
---
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/gpu_dyn_inst.hh
4 files changed, 150 insertions(+), 9 deletions(-)
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index b40b464..b9c14f9 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -112,6 +112,12 @@
scheduleToExecute(p),
stats(this, p.n_wf)
{
+ // This is not currently supported and would require adding more handling
+ // for system vs. device memory requests on the functional paths, so we
+ // fatal immediately in the constructor if this configuration is seen.
+ fatal_if(functionalTLB && FullSystem,
+ "Functional TLB not supported in full-system GPU simulation");
+
/**
* This check is necessary because std::bitset only provides conversion
* to unsigned long or unsigned long long via to_ulong() or to_ullong().
@@ -801,6 +807,12 @@
bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
+ return handleResponse(pkt);
+}
+
+bool
+ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
+{
// Ruby has completed the memory op. Schedule the mem_resp_event at the
// appropriate cycle to process the timing memory response
// This delay represents the pipeline delay
@@ -902,6 +914,12 @@
bool
ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
{
+ return handleResponse(pkt);
+}
+
+bool
+ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
+{
assert(!pkt->req->isKernel());
// retrieve sender state
@@ -1241,9 +1259,13 @@
assert(gpuDynInst->isGlobalSeg() ||
gpuDynInst->executedAs() == enums::SC_GLOBAL);
+ // Fences will never be issued to system memory, so we can mark the
+ // requestor as a device memory ID here.
if (!req) {
req = std::make_shared<Request>(
- 0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
+ 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
+ } else {
+ req->requestorId(vramRequestorId());
}
// all mem sync requests have Paddr == 0
@@ -1544,6 +1566,24 @@
new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
nullptr);
+ // Set VRAM ID for device requests
+ // For now, system vmem requests use functional reads. This is not that
+ // critical to model as the region of interest should always be accessing
+ // device memory. System vmem requests are used by blit kernels to do
+ // memcpys and load code objects into device memory.
+ if (new_pkt->req->systemReq()) {
+ // There will be multiple packets returned for the same gpuDynInst,
+ // so first check whether systemReq has already been set. If not,
+ // return the token acquired when the dispatch list was filled, as
+ // system requests do not require a GPU coalescer token.
+ if (!gpuDynInst->isSystemReq()) {
+ computeUnit->getTokenManager()->recvTokens(1);
+ gpuDynInst->setSystemReq();
+ }
+ } else {
+ new_pkt->req->requestorId(computeUnit->vramRequestorId());
+ }
+
// translation is done. Schedule the mem_req_event at the appropriate
// cycle to send the timing memory request to ruby
EventFunctionWrapper *mem_req_event =
@@ -1582,7 +1622,11 @@
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
[[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
- if (!(sendTimingReq(pkt))) {
+ if (pkt->req->systemReq()) {
+ assert(compute_unit->shader->systemHub);
+ SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
+ compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+ } else if (!(sendTimingReq(pkt))) {
retries.push_back(std::make_pair(pkt, gpuDynInst));
DPRINTF(GPUPort,
@@ -1611,7 +1655,11 @@
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
[[maybe_unused]] ComputeUnit *compute_unit =
scalarDataPort.computeUnit;
- if (!(scalarDataPort.sendTimingReq(pkt))) {
+ if (pkt->req->systemReq()) {
+ assert(compute_unit->shader->systemHub);
+ SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
+ compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+ } else if (!(scalarDataPort.sendTimingReq(pkt))) {
scalarDataPort.retries.push_back(pkt);
DPRINTF(GPUPort,
@@ -1712,15 +1760,26 @@
req_pkt->senderState =
new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
- if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
- computeUnit->scalarDataPort.retries.push_back(req_pkt);
- DPRINTF(GPUMem, "send scalar req failed for: %s\n",
- gpuDynInst->disassemble());
+ // For a system request we want to mark the GPU instruction as a system
+ // load/store so that after the request is issued to system memory we can
+ // return any token acquired for the request. Since tokens are returned
+ // by the coalescer and system requests do not take that path, this needs
+ // to be tracked.
+ //
+ // Device requests change the requestor ID to something in the device
+ // memory Ruby network.
+ if (req_pkt->req->systemReq()) {
+ gpuDynInst->setSystemReq();
} else {
- DPRINTF(GPUMem, "send scalar req for: %s\n",
- gpuDynInst->disassemble());
+ req_pkt->req->requestorId(computeUnit->vramRequestorId());
}
+ ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
+ = new ComputeUnit::ScalarDataPort::MemReqEvent
+ (computeUnit->scalarDataPort, req_pkt);
+ computeUnit->schedule(scalar_mem_req_event, curTick() +
+ computeUnit->req_tick_latency);
+
return true;
}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 1c211d9..a080e3d 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -529,6 +529,28 @@
saved(sender_state) { }
};
+ class SystemHubEvent : public Event
+ {
+ DataPort *dataPort;
+ PacketPtr reqPkt;
+
+ public:
+ SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
+ : dataPort(_dataPort), reqPkt(pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void
+ process()
+ {
+ // DMAs do not operate on packets and therefore do not
+ // convert to a response. Do that here instead.
+ reqPkt->makeResponse();
+ dataPort->handleResponse(reqPkt);
+ }
+ };
+
void processMemReqEvent(PacketPtr pkt);
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
@@ -537,6 +559,8 @@
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
+ bool handleResponse(PacketPtr pkt);
+
protected:
ComputeUnit *computeUnit;
@@ -596,6 +620,30 @@
const char *description() const;
};
+ class SystemHubEvent : public Event
+ {
+ ScalarDataPort *dataPort;
+ PacketPtr reqPkt;
+
+ public:
+ SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
+ : dataPort(_dataPort), reqPkt(pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void
+ process()
+ {
+ // DMAs do not operate on packets and therefore do not
+ // convert to a response. Do that here instead.
+ reqPkt->makeResponse();
+ dataPort->handleResponse(reqPkt);
+ }
+ };
+
+ bool handleResponse(PacketPtr pkt);
+
std::deque<PacketPtr> retries;
private:
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index f766f7e..08303a4 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -62,6 +62,10 @@
bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
+ // System requests do not need GPU coalescer tokens. Make sure nothing
+ // has bypassed the operand gather check stage.
+ assert(!mp->isSystemReq());
+
// We require one token from the coalescer's uncoalesced table to
// proceed
int token_count = 1;
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index 17c7ee6..e2884a0 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -476,11 +476,16 @@
// inst used to save/restore a wavefront context
bool isSaveRestore;
+
+ bool isSystemReq() { return systemReq; }
+ void setSystemReq() { systemReq = true; }
+
private:
GPUStaticInst *_staticInst;
const InstSeqNum _seqNum;
int maxSrcVecRegOpSize;
int maxSrcScalarRegOpSize;
+ bool systemReq = false;
// the time the request was started
Tick accessTime = -1;
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/57711
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
Gerrit-Change-Number: 57711
Gerrit-PatchSet: 1
Gerrit-Owner: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org