[gem5-dev] Change in gem5/gem5[develop]: dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

Matt Sinclair (Gerrit) via gem5-dev Thu, 07 Oct 2021 23:15:05 -0700

Matt Sinclair has uploaded this change for review. (https://gem5-review.googlesource.com/c/public/gem5/+/51371 )


Change subject: dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues
......................................................................

dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

GFX7 (not supported in gem5) and GFX8 have a bug with how virtual
addresses are calculated for their HSA queues.  The ROCr component of
ROCm solves this problem by doubling the HSA queue size that is
requested, then mapping all virtual addresses in the second half of the
queue to the same virtual addresses as the first half of the queue.
This commit fixes gem5's support to mimic this behavior.

Note that this change does not affect Vega's HSA queue support, because
according to the ROCm documentation, Vega does not have the same problem
as GCN3.

Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51
---
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
M src/gpu-compute/gpu_compute_driver.cc
M src/dev/hsa/hw_scheduler.cc
M src/dev/hsa/hw_scheduler.hh
5 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/src/dev/hsa/hsa_packet_processor.ccb/src/dev/hsa/hsa_packet_processor.cc

index 0427def..22124b1 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -44,6 +44,7 @@
 #include "dev/dma_device.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hw_scheduler.hh"
+#include "enums/GfxVersion.hh"
 #include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
@@ -100,13 +101,15 @@
 HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                                        uint64_t basePointer,
                                        uint64_t queue_id,
-                                       uint32_t size, int doorbellSize)
+                                       uint32_t size, int doorbellSize,
+                                       GfxVersion gfxVersion)
 {
     DPRINTF(HSAPacketProcessor,
              "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
              (void *)basePointer, queue_id, size);
     hwSchdlr->registerNewQueue(hostReadIndexPointer,
-                               basePointer, queue_id, size, doorbellSize);
+                               basePointer, queue_id, size, doorbellSize,
+                               gfxVersion);
 }

 AddrRangeList

diff --git a/src/dev/hsa/hsa_packet_processor.hhb/src/dev/hsa/hsa_packet_processor.hh

index 9545006..aabe24e 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -39,9 +39,11 @@
 #include <vector>

 #include "base/types.hh"
+#include "debug/HSAPacketProcessor.hh"
 #include "dev/dma_virt_device.hh"
 #include "dev/hsa/hsa.h"
 #include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
 #include "params/HSAPacketProcessor.hh"
 #include "sim/eventq.hh"

@@ -84,14 +86,16 @@
         uint64_t     hostReadIndexPtr;
         bool         stalledOnDmaBufAvailability;
         bool         dmaInProgress;
+        GfxVersion   gfxVersion;

         HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
-                           uint64_t hri_ptr, uint32_t size)
+                           uint64_t hri_ptr, uint32_t size,
+                           GfxVersion gfxVersion)
           : basePointer(base_ptr), doorbellPointer(db_ptr),
             writeIndex(0), readIndex(0),
             numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
             stalledOnDmaBufAvailability(false),
-            dmaInProgress(false)
+            dmaInProgress(false), gfxVersion(gfxVersion)
         {  }

uint64_t spaceRemaining() { return numElts - (writeIndex -readIndex); }

         uint64_t spaceUsed() { return writeIndex - readIndex; }
@@ -102,15 +106,38 @@

         uint64_t ptr(uint64_t ix)
         {
-            /**
-             * Sometimes queues report that their size is 512k, which would
-             * indicate numElts of 0x2000. However, they only have 256k
-             * mapped which means any index over 0x1000 will fail an
-             * address translation.
+            /*
+             * Based on ROCm Documentation:

+ * -https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/+10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/

+                     rocr/src/core/runtime/amd_aql_queue.cpp#L99

+ * -https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/+10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/

+                     rocr/src/core/runtime/amd_aql_queue.cpp#L624
+             *

+ * GFX7 and GFX8 will allocate twice as much space for theirHSA+ * queues as they actually access (using mod operations to mapthe+ * virtual addresses from the upper half of the queue to thesame+ * virtual addresses as the lower half). Thus, we need tocheck if+ * the ISA is GFX8 and mod the address by half of the queuesize if

+             * so.
              */
-            assert(ix % numElts < 0x1000);
-            return basePointer +
-                ((ix % numElts) * objSize());
+            uint64_t retAddr = 0ll;
+            if ((gfxVersion == GfxVersion::gfx801) ||
+                (gfxVersion == GfxVersion::gfx803)) {
+              retAddr = basePointer + ((ix % (numElts/2)) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
+                      "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
+                      numElts, numElts/2, objSize(), retAddr);
+            } else {
+              retAddr = basePointer + ((ix % numElts) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "

+ "retAddr: 0x%x\n", basePointer, ix, numElts,objSize(),

+                      retAddr);
+            }
+            return retAddr;
         }
 };

@@ -325,7 +352,8 @@
     void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                             uint64_t basePointer,
                             uint64_t queue_id,
-                            uint32_t size, int doorbellSize);
+                            uint32_t size, int doorbellSize,
+                            GfxVersion gfxVersion);
     void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
     void setDevice(GPUCommandProcessor * dev);
     void updateReadIndex(int, uint32_t);
diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc
index 48f4e69..f42dede 100644
--- a/src/dev/hsa/hw_scheduler.cc
+++ b/src/dev/hsa/hw_scheduler.cc
@@ -87,7 +87,8 @@
 HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
                               uint64_t basePointer,
                               uint64_t queue_id,
-                              uint32_t size, int doorbellSize)
+                              uint32_t size, int doorbellSize,
+                              GfxVersion gfxVersion)
 {
     assert(queue_id < MAX_ACTIVE_QUEUES);
     // Map queue ID to doorbell.
@@ -108,7 +109,7 @@

     HSAQueueDescriptor* q_desc =
        new HSAQueueDescriptor(basePointer, db_offset,
-                              hostReadIndexPointer, size);
+                              hostReadIndexPointer, size, gfxVersion);
     AQLRingBuffer* aql_buf =
         new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name());
     QCntxt q_cntxt(q_desc, aql_buf);
diff --git a/src/dev/hsa/hw_scheduler.hh b/src/dev/hsa/hw_scheduler.hh
index b50273d..8c043ec 100644
--- a/src/dev/hsa/hw_scheduler.hh
+++ b/src/dev/hsa/hw_scheduler.hh
@@ -39,6 +39,7 @@

 #include "base/types.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
+#include "enums/GfxVersion.hh"
 #include "sim/eventq.hh"

 // We allocate one PIO page for doorbells and each
@@ -59,7 +60,8 @@
     void registerNewQueue(uint64_t hostReadIndexPointer,
                           uint64_t basePointer,
                           uint64_t queue_id,
-                          uint32_t size, int doorbellSize);
+                          uint32_t size, int doorbellSize,
+                          GfxVersion gfxVersion);
     void unregisterQueue(uint64_t queue_id, int doorbellSize);
     void wakeup();
     void schedWakeup();

diff --git a/src/gpu-compute/gpu_compute_driver.ccb/src/gpu-compute/gpu_compute_driver.cc

index 3adfadd..dd8e140 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -173,7 +173,7 @@
     auto &hsa_pp = device->hsaPacketProc();
     hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                               args->ring_base_address, args->queue_id,
-                              args->ring_size, doorbellSize());
+                              args->ring_size, doorbellSize(), gfxVersion);
     args.copyOut(mem_proxy);
 }


--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/51371

To unsubscribe, or for help writing mail filters, visithttps://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51
Gerrit-Change-Number: 51371
Gerrit-PatchSet: 1
Gerrit-Owner: Matt Sinclair <[email protected]>
Gerrit-MessageType: newchange

_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

Reply via email to