Bobby Bruce has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/65451?usp=email )

Change subject: dev-amdgpu: Fix SDMA ring buffer wrap around
......................................................................

dev-amdgpu: Fix SDMA ring buffer wrap around

The current SDMA wrap around handling only considers the ring buffer
location as seen by the GPU. Eventually when the end of the SDMA ring
buffer is reached, the driver waits until the rptr written back to the
host catches up to what the driver sees before wrapping around back to
the beginning of the buffer. This writeback currently does not happen at
all, causing hangs for applications with a lot of SDMA commands.

This changeset first fixes the sizes of the queues, especially RLC
queues, so that the wrap around occurs in the correct place. Second, we
now store the rptr writeback address and the absoluate (unwrapped) rptr
value in each SDMA queue. The absolulte rptr is what the driver sends to
the device and what it expects to be written back.

This was tested with an application which basically does a few hundred
thousand hipMemcpy() calls in a loop. It should also fix the issue with
pannotia BC in fullsystem mode.

Change-Id: I53ebdcc6b02fb4eb4da435c9a509544066a97069
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65351
Maintainer: Jason Lowe-Power <power...@gmail.com>
Tested-by: kokoro <noreply+kok...@google.com>
Reviewed-by: Jason Lowe-Power <power...@gmail.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Maintainer: Matt Sinclair <mattdsincl...@gmail.com>
(cherry picked from commit c8d687b05c803e3b358014e7f729a5700a003552)
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65451
Maintainer: Bobby Bruce <bbr...@ucdavis.edu>
Reviewed-by: Bobby Bruce <bbr...@ucdavis.edu>
Reviewed-by: Matthew Poremba <matthew.pore...@amd.com>
---
M src/dev/amdgpu/pm4_packet_processor.cc
M src/dev/amdgpu/sdma_engine.cc
M src/dev/amdgpu/sdma_engine.hh
3 files changed, 84 insertions(+), 15 deletions(-)

Approvals:
  Matthew Poremba: Looks good to me, approved
  Bobby Bruce: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass




diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index c8e6320..f78f833 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -441,12 +441,17 @@
PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
     SDMAQueueDesc *mqd, uint16_t vmid)
 {
+    uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1);
+    Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi;
+    rptr_wb_addr <<= 32;
+    rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo;
+
DPRINTF(PM4PacketProcessor, "SDMAMQD: rb base: %#lx rptr: %#x/%#x wptr: "
-            "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x\n", mqd->rb_base,
-            mqd->sdmax_rlcx_rb_rptr, mqd->sdmax_rlcx_rb_rptr_hi,
+            "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x rptr wb addr: %#lx\n",
+ mqd->rb_base, mqd->sdmax_rlcx_rb_rptr, mqd->sdmax_rlcx_rb_rptr_hi,
             mqd->sdmax_rlcx_rb_wptr, mqd->sdmax_rlcx_rb_wptr_hi,
             mqd->sdmax_rlcx_ib_base_lo, mqd->sdmax_rlcx_ib_base_hi,
-            mqd->sdmax_rlcx_ib_size, mqd->sdmax_rlcx_rb_cntl);
+            rlc_size, mqd->sdmax_rlcx_rb_cntl, rptr_wb_addr);

     // Engine 2 points to SDMA0 while engine 3 points to SDMA1
     assert(pkt->engineSel == 2 || pkt->engineSel == 3);
@@ -454,7 +459,8 @@

     // Register RLC queue with SDMA
     sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2,
-                               mqd->rb_base << 8);
+                               mqd->rb_base << 8, rlc_size,
+                               rptr_wb_addr);

     // Register doorbell with GPU device
     gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index e9a4c17..59c5027 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -161,7 +161,8 @@
 }

 void
-SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base)
+SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size,
+                             Addr rptr_wb_addr)
 {
     // Get first free RLC
     if (!rlc0.valid()) {
@@ -171,19 +172,19 @@
         rlc0.base(rb_base);
         rlc0.rptr(0);
         rlc0.wptr(0);
+        rlc0.rptrWbAddr(rptr_wb_addr);
         rlc0.processing(false);
-        // TODO: size - I think pull from MQD 2^rb_cntrl[6:1]-1
-        rlc0.size(1024*1024);
+        rlc0.size(size);
     } else if (!rlc1.valid()) {
         DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC1\n", doorbell);
         rlcInfo[1] = doorbell;
         rlc1.valid(true);
         rlc1.base(rb_base);
-        rlc1.rptr(1);
-        rlc1.wptr(1);
+        rlc1.rptr(0);
+        rlc1.wptr(0);
+        rlc1.rptrWbAddr(rptr_wb_addr);
         rlc1.processing(false);
-        // TODO: size - I think pull from MQD 2^rb_cntrl[6:1]-1
-        rlc1.size(1024*1024);
+        rlc1.size(size);
     } else {
         panic("No free RLCs. Check they are properly unmapped.");
     }
@@ -291,6 +292,17 @@
                 { decodeHeader(q, header); });
         dmaReadVirt(q->rptr(), sizeof(uint32_t), cb, &cb->dmaBuffer);
     } else {
+        // The driver expects the rptr to be written back to host memory
+ // periodically. In simulation, we writeback rptr after each burst of
+        // packets from a doorbell, rather than using the cycle count which
+        // is not accurate in all simulation settings (e.g., KVM).
+        DPRINTF(SDMAEngine, "Writing rptr %#lx back to host addr %#lx\n",
+                q->globalRptr(), q->rptrWbAddr());
+        if (q->rptrWbAddr()) {
+            auto cb = new DmaVirtCallback<uint64_t>(
+                [ = ](const uint64_t &) { }, q->globalRptr());
+ dmaWriteVirt(q->rptrWbAddr(), sizeof(Addr), cb, &cb->dmaBuffer);
+        }
         q->processing(false);
         if (q->parent()) {
             DPRINTF(SDMAEngine, "SDMA switching queues\n");
@@ -1158,6 +1170,7 @@
 {
     gfxRptr = insertBits(gfxRptr, 31, 0, 0);
     gfxRptr |= data;
+    gfx.rptrWbAddr(getGARTAddr(gfxRptr));
 }

 void
@@ -1165,6 +1178,7 @@
 {
     gfxRptr = insertBits(gfxRptr, 63, 32, 0);
     gfxRptr |= ((uint64_t)data) << 32;
+    gfx.rptrWbAddr(getGARTAddr(gfxRptr));
 }

 void
@@ -1236,6 +1250,7 @@
 {
     pageRptr = insertBits(pageRptr, 31, 0, 0);
     pageRptr |= data;
+    page.rptrWbAddr(getGARTAddr(pageRptr));
 }

 void
@@ -1243,6 +1258,7 @@
 {
     pageRptr = insertBits(pageRptr, 63, 32, 0);
     pageRptr |= ((uint64_t)data) << 32;
+    page.rptrWbAddr(getGARTAddr(pageRptr));
 }

 void
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index 6fe7a8e..d0afaf7 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -58,6 +58,8 @@
         Addr _rptr;
         Addr _wptr;
         Addr _size;
+        Addr _rptr_wb_addr = 0;
+        Addr _global_rptr = 0;
         bool _valid;
         bool _processing;
         SDMAQueue *_parent;
@@ -72,6 +74,8 @@
         Addr wptr() { return _base + _wptr; }
         Addr getWptr() { return _wptr; }
         Addr size() { return _size; }
+        Addr rptrWbAddr() { return _rptr_wb_addr; }
+        Addr globalRptr() { return _global_rptr; }
         bool valid() { return _valid; }
         bool processing() { return _processing; }
         SDMAQueue* parent() { return _parent; }
@@ -82,22 +86,27 @@
         void
         incRptr(uint32_t value)
         {
-            //assert((_rptr + value) <= (_size << 1));
             _rptr = (_rptr + value) % _size;
+            _global_rptr += value;
         }

-        void rptr(Addr value) { _rptr = value; }
+        void
+        rptr(Addr value)
+        {
+            _rptr = value;
+            _global_rptr = value;
+        }

         void
         setWptr(Addr value)
         {
-            //assert(value <= (_size << 1));
             _wptr = value % _size;
         }

         void wptr(Addr value) { _wptr = value; }

         void size(Addr value) { _size = value; }
+        void rptrWbAddr(Addr value) { _rptr_wb_addr = value; }
         void valid(bool v) { _valid = v; }
         void processing(bool value) { _processing = value; }
         void parent(SDMAQueue* q) { _parent = q; }
@@ -268,7 +277,8 @@
     /**
      * Methods for RLC queues
      */
-    void registerRLCQueue(Addr doorbell, Addr rb_base);
+    void registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size,
+                          Addr rptr_wb_addr);
     void unregisterRLCQueue(Addr doorbell);
     void deallocateRLCQueues();


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/65451?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: release-staging-v22-1
Gerrit-Change-Id: I53ebdcc6b02fb4eb4da435c9a509544066a97069
Gerrit-Change-Number: 65451
Gerrit-PatchSet: 2
Gerrit-Owner: Bobby Bruce <bbr...@ucdavis.edu>
Gerrit-Reviewer: Bobby Bruce <bbr...@ucdavis.edu>
Gerrit-Reviewer: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org

Reply via email to