Matthew Poremba has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email )

Change subject: configs,dev-amdgpu: GPUFS MI200/gfx90a support
......................................................................

configs,dev-amdgpu: GPUFS MI200/gfx90a support

Add support for MI200-like device. This includes adding PCI IDs and new
MMIOs for the device, a different MAP_PROCESS packet, and a different
calculation for the number of VGPRs.

Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
---
M configs/example/gpufs/runfs.py
M configs/example/gpufs/system/amdgpu.py
M configs/example/gpufs/system/system.py
M src/dev/amdgpu/amdgpu_device.cc
M src/dev/amdgpu/amdgpu_device.hh
M src/dev/amdgpu/amdgpu_nbio.cc
M src/dev/amdgpu/amdgpu_nbio.hh
M src/dev/amdgpu/amdgpu_vm.hh
M src/dev/amdgpu/pm4_defines.hh
M src/dev/amdgpu/pm4_packet_processor.cc
M src/dev/amdgpu/pm4_packet_processor.hh
M src/gpu-compute/GPU.py
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/hsa_queue_entry.hh
14 files changed, 173 insertions(+), 27 deletions(-)



diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 4c90601..f8ef70d 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -132,8 +132,9 @@
     parser.add_argument(
         "--gpu-device",
         default="Vega10",
-        choices=["Vega10", "MI100"],
-        help="GPU model to run: Vega10 (gfx900) or MI100 (gfx908)",
+        choices=["Vega10", "MI100", "MI200"],
+        help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
+        "MI200 (gfx90a)",
     )


diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index 5f98b55..9697e50 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -177,6 +177,10 @@
         system.pc.south_bridge.gpu.DeviceID = 0x738C
         system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
         system.pc.south_bridge.gpu.SubsystemID = 0x0C34
+    elif args.gpu_device == "MI200":
+        system.pc.south_bridge.gpu.DeviceID = 0x740F
+        system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
+        system.pc.south_bridge.gpu.SubsystemID = 0x0C34
     elif args.gpu_device == "Vega10":
         system.pc.south_bridge.gpu.DeviceID = 0x6863
     else:
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 90c5c01..263ffc0 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -152,6 +152,16 @@
             0x7D000,
         ]
         sdma_sizes = [0x1000] * 8
+    elif args.gpu_device == "MI200":
+        num_sdmas = 5
+        sdma_bases = [
+            0x4980,
+            0x6180,
+            0x78000,
+            0x79000,
+            0x7A000,
+        ]
+        sdma_sizes = [0x1000] * 5
     else:
         m5.util.panic(f"Unknown GPU device {args.gpu_device}")

diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index f58d1f7..734f0d7 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -115,7 +115,7 @@
         sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
         sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
-    } else if (p.device_name == "MI100") {
+    } else if (p.device_name == "MI100" || p.device_name == "MI200") {
         sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
         sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -144,10 +144,19 @@
     if (p.device_name == "Vega10") {
         setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
+        gfx_version = GfxVersion::gfx900;
     } else if (p.device_name == "MI100") {
         setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
         setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
+        gfx_version = GfxVersion::gfx908;
+    } else if (p.device_name == "MI200") {
+        // This device can have either 64GB or 128GB of device memory.
+        // This limits to 16GB for simulation.
+        setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
+        setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
+        setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
+        gfx_version = GfxVersion::gfx90a;
     } else {
         panic("Unknown GPU device %s\n", p.device_name);
     }
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index cab7991..56ed2f4 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -42,6 +42,7 @@
 #include "dev/amdgpu/mmio_reader.hh"
 #include "dev/io_device.hh"
 #include "dev/pci/device.hh"
+#include "enums/GfxVersion.hh"
 #include "params/AMDGPUDevice.hh"

 namespace gem5
@@ -145,6 +146,9 @@
      */
     memory::PhysicalMemory deviceMem;

+    /* Device information */
+    GfxVersion gfx_version = GfxVersion::gfx900;
+
   public:
     AMDGPUDevice(const AMDGPUDeviceParams &p);

@@ -206,6 +210,9 @@
     uint16_t getVMID(Addr doorbell) { return doorbellVMIDMap[doorbell]; }
     std::unordered_map<uint16_t, std::set<int>>& getUsedVMIDs();
     void insertQId(uint16_t vmid, int id);
+
+    /* Device information */
+    GfxVersion getGfxVersion() const { return gfx_version; }
 };

 } // namespace gem5
diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc
index 8064fd2..bf437b6 100644
--- a/src/dev/amdgpu/amdgpu_nbio.cc
+++ b/src/dev/amdgpu/amdgpu_nbio.cc
@@ -75,12 +75,14 @@
       case VEGA10_INV_ENG17_ACK2:
       case MI100_INV_ENG17_ACK2:
       case MI100_INV_ENG17_ACK3:
+      case MI200_INV_ENG17_ACK2:
         pkt->setLE<uint32_t>(0x10001);
         break;
       case VEGA10_INV_ENG17_SEM1:
       case VEGA10_INV_ENG17_SEM2:
       case MI100_INV_ENG17_SEM2:
       case MI100_INV_ENG17_SEM3:
+      case MI200_INV_ENG17_SEM2:
         pkt->setLE<uint32_t>(0x1);
         break;
       // PSP responds with bit 31 set when ready
diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh
index d1e5391..dc95443 100644
--- a/src/dev/amdgpu/amdgpu_nbio.hh
+++ b/src/dev/amdgpu/amdgpu_nbio.hh
@@ -80,6 +80,11 @@
 #define MI100_INV_ENG17_SEM2                              0x6a888
 #define MI100_INV_ENG17_SEM3                              0x76888

+#define MI200_INV_ENG17_ACK1                              0x0a318
+#define MI200_INV_ENG17_ACK2                              0x6b018
+#define MI200_INV_ENG17_SEM1                              0x0a288
+#define MI200_INV_ENG17_SEM2                              0x6af88
+
 class AMDGPUNbio
 {
   public:
diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh
index ac35a11..f35a735 100644
--- a/src/dev/amdgpu/amdgpu_vm.hh
+++ b/src/dev/amdgpu/amdgpu_vm.hh
@@ -81,6 +81,10 @@
#define MI100_FB_LOCATION_BASE 0x6ac00 #define MI100_FB_LOCATION_TOP 0x6ac04

+#define MI200_MEM_SIZE_REG 0x0378c +#define MI200_FB_LOCATION_BASE 0x6b300 +#define MI200_FB_LOCATION_TOP 0x6b304
+
 // AMD GPUs support 16 different virtual address spaces
 static constexpr int AMDGPU_VM_COUNT = 16;

diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh
index 42832d5..a303f8e 100644
--- a/src/dev/amdgpu/pm4_defines.hh
+++ b/src/dev/amdgpu/pm4_defines.hh
@@ -275,6 +275,64 @@

 typedef struct GEM5_PACKED
 {
+    uint32_t pasid : 16;
+    uint32_t reserved0 : 8;
+    uint32_t diq : 1;
+    uint32_t processQuantum : 7;
+    union
+    {
+        struct
+        {
+            uint32_t ptBaseLo;
+            uint32_t ptBaseHi;
+        };
+        uint64_t ptBase;
+    };
+    uint32_t shMemBases;
+    uint32_t shMemConfig;
+    uint32_t sqShaderTbaLo;
+    uint32_t sqShaderTbaHi;
+    uint32_t sqShaderTmaLo;
+    uint32_t sqShaderTmaHi;
+    uint32_t reserved1;
+    union
+    {
+        struct
+        {
+            uint32_t gdsAddrLo;
+            uint32_t gdsAddrHi;
+        };
+        uint64_t gdsAddr;
+    };
+    union
+    {
+        struct
+        {
+            uint32_t numGws : 7;
+            uint32_t sdma_enable : 1;
+            uint32_t numOac : 4;
+            uint32_t reserved3 : 4;
+            uint32_t gdsSize : 6;
+            uint32_t numQueues : 10;
+        };
+        uint32_t ordinal14;
+    };
+    uint32_t spiGdbgPerVmidCntl;
+    uint32_t tcpWatchCntl[4];
+    union
+    {
+        struct
+        {
+            uint32_t completionSignalLo;
+            uint32_t completionSignalHi;
+        };
+        uint64_t completionSignal;
+    };
+}  PM4MapProcessMI200;
+static_assert(sizeof(PM4MapProcessMI200) == 80);
+
+typedef struct GEM5_PACKED
+{
     uint32_t function : 4;
     uint32_t memSpace : 2;
     uint32_t operation : 2;
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 3690113..e7b8465 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -271,12 +271,21 @@
                     dmaBuffer);
         } break;
       case IT_MAP_PROCESS: {
-        dmaBuffer = new PM4MapProcess();
-        cb = new DmaVirtCallback<uint64_t>(
-            [ = ] (const uint64_t &)
-                { mapProcess(q, (PM4MapProcess *)dmaBuffer); });
-        dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
-                    dmaBuffer);
+        if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
+            dmaBuffer = new PM4MapProcessMI200();
+            cb = new DmaVirtCallback<uint64_t>(
+                [ = ] (const uint64_t &)
+ { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
+            dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
+                        cb, dmaBuffer);
+        } else {
+            dmaBuffer = new PM4MapProcess();
+            cb = new DmaVirtCallback<uint64_t>(
+                [ = ] (const uint64_t &)
+                    { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
+            dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
+                        dmaBuffer);
+        }
         } break;

       case IT_UNMAP_QUEUES: {
@@ -613,27 +622,50 @@
 }

 void
-PM4PacketProcessor::mapProcess(PM4Queue *q, PM4MapProcess *pkt)
+PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
+                               uint32_t shMemBases)
 {
-    q->incRptr(sizeof(PM4MapProcess));
-    uint16_t vmid = gpuDevice->allocateVMID(pkt->pasid);
+    uint16_t vmid = gpuDevice->allocateVMID(pasid);

- DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p vmid: %d quantum: " - "%d pt: %p signal: %p\n", pkt->pasid, vmid, pkt->processQuantum,
-            pkt->ptBase, pkt->completionSignal);
-
-    gpuDevice->getVM().setPageTableBase(vmid, pkt->ptBase);
- gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, pkt->shMemBases);
+    gpuDevice->getVM().setPageTableBase(vmid, ptBase);
+    gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases);

     // Setup the apertures that gem5 uses. These values are bits [63:48].
-    Addr lds_base = (Addr)bits(pkt->shMemBases, 31, 16) << 48;
-    Addr scratch_base = (Addr)bits(pkt->shMemBases, 15, 0) << 48;
+    Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48;
+    Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48;

     // There does not seem to be any register for the limit, but the driver
     // assumes scratch and LDS have a 4GB aperture, so use that.
     gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF);
     gpuDevice->CP()->shader()->setScratchApe(scratch_base,
                                              scratch_base + 0xFFFFFFFF);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
+{
+    q->incRptr(sizeof(PM4MapProcess));
+
+    DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+            "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+            pkt->ptBase, pkt->completionSignal);
+
+    mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
+
+    delete pkt;
+    decodeNext(q);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
+{
+    q->incRptr(sizeof(PM4MapProcessMI200));
+
+    DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+            "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+            pkt->ptBase, pkt->completionSignal);
+
+    mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);

     delete pkt;
     decodeNext(q);
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 4617a21..3fb0551 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -141,7 +141,9 @@
     void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
     void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
     void doneMQDWrite(Addr mqdAddr, Addr addr);
-    void mapProcess(PM4Queue *q, PM4MapProcess *pkt);
+    void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
+    void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
+    void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
                     uint16_t vmid);
     void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 3a87186..c5449cc 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -45,7 +45,7 @@


 class GfxVersion(ScopedEnum):
-    vals = ["gfx801", "gfx803", "gfx900", "gfx902"]
+    vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"]


 class PoolManager(SimObject):
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index af59b78..9755180 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -228,7 +228,8 @@
     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
-        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
+        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
+        gpuDevice->getGfxVersion());

     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
         "grid size (%dx%dx%d) kernarg addr: %#x, completion "
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index fbe0efe..4083c1c 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -51,6 +51,7 @@
 #include "base/types.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
 #include "gpu-compute/kernel_code.hh"

 namespace gem5
@@ -61,7 +62,7 @@
   public:
     HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                   int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
-                  Addr host_pkt_addr, Addr code_addr)
+ Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
         : kernName(kernel_name),
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x, (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
@@ -92,9 +93,19 @@
         // we need to rip register usage from the resource registers.
         //
         // We can't get an exact number of registers from the resource
- // registers because they round, but we can get an upper bound on it
-        if (!numVgprs)
-            numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+ // registers because they round, but we can get an upper bound on it.
+        // We determine the number of registers by solving for "vgprs_used"
+        // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
+        //     #code-object-v3-kernel-descriptor
+        // Currently, the only supported gfx version in gem5 that computes
+        // this differently is gfx90a.
+        if (!numVgprs) {
+            if (gfx_version == GfxVersion::gfx90a) {
+                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
+            } else {
+                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+            }
+        }

         if (!numSgprs || numSgprs ==
std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-MessageType: newchange
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
Gerrit-Change-Number: 70317
Gerrit-PatchSet: 1
Gerrit-Owner: Matthew Poremba <matthew.pore...@amd.com>
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org

Reply via email to