Alex Dutu has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/53077 )

Change subject: gpu-compute,dev-hsa: Update CP and HSAPP for full-system
......................................................................

gpu-compute,dev-hsa: Update CP and HSAPP for full-system

Make the necessary changes to connect Vega pagetable walkers for
full-system mode. Previously the CP and HSA packet processor could only
read AQL packets from system/host memory using proxy port. This allows
for AQL to be read from device memory which is used for non-blit
kernels.

Change-Id: If28eb8be68173da03e15084765e77e92eda178e9
---
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/gpu_command_processor.hh
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
M src/gpu-compute/GPU.py
M src/dev/amdgpu/amdgpu_device.cc
M configs/example/gpufs/system/system.py
M src/dev/hsa/HSADevice.py
8 files changed, 75 insertions(+), 9 deletions(-)



diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 9aeaf3a..86815ff 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -97,11 +97,15 @@

     # This arbitrary address is something in the X86 I/O hole
     hsapp_gpu_map_paddr = 0xe00000000
+    pt_walker1 = VegaPagetableWalker()
     gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
-                                   numHWQueues=args.num_hw_queues)
+                                   numHWQueues=args.num_hw_queues,
+                                   walker=pt_walker1)
     dispatcher = GPUDispatcher()
+    pt_walker2 = VegaPagetableWalker()
     gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
-                                       dispatcher=dispatcher)
+                                       dispatcher=dispatcher,
+                                       walker=pt_walker2)
     shader.dispatcher = dispatcher
     shader.gpu_cmd_proc = gpu_cmd_proc

@@ -138,6 +142,8 @@
     system._dma_ports.append(device_ih)
     system._dma_ports.append(pm4_pkt_proc)
     system._dma_ports.append(gpu_mem_mgr)
+    system._dma_ports.append(pt_walker1)
+    system._dma_ports.append(pt_walker2)
     system._dma_ports.append(pt_walker3)
     system._dma_ports.append(pt_walker4)

diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 35f860d..f559827 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -90,6 +90,8 @@
     sdma1->setId(1);
     deviceIH->setGPUDevice(this);
     pm4PktProc->setGPUDevice(this);
+    cp->hsaPacketProc().setGPUDevice(this);
+    cp->setGPUDevice(this);
 }

 void
diff --git a/src/dev/hsa/HSADevice.py b/src/dev/hsa/HSADevice.py
index f67d9a8..a94458c 100644
--- a/src/dev/hsa/HSADevice.py
+++ b/src/dev/hsa/HSADevice.py
@@ -33,6 +33,7 @@
 from m5.params import *
 from m5.proxy import *
 from m5.objects.Device import DmaDevice
+from m5.objects.VegaGPUTLB import VegaPagetableWalker

 class HSAPacketProcessor(DmaDevice):
     type = 'HSAPacketProcessor'
@@ -50,3 +51,5 @@
     # See: https://github.com/RadeonOpenCompute/atmi/tree/master/examples/
     #      runtime/kps
     pktProcessDelay = Param.Tick(4400000, "Packet processing delay")
+    walker = Param.VegaPagetableWalker(VegaPagetableWalker(),
+            "Page table walker")
diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index b0421d2..43cf4e5 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -41,6 +41,7 @@
 #include "base/logging.hh"
 #include "base/trace.hh"
 #include "debug/HSAPacketProcessor.hh"
+#include "dev/amdgpu/amdgpu_device.hh"
 #include "dev/dma_device.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hw_scheduler.hh"
@@ -48,6 +49,7 @@
 #include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
+#include "sim/full_system.hh"
 #include "sim/process.hh"
 #include "sim/proxy_ptr.hh"
 #include "sim/system.hh"
@@ -73,7 +75,8 @@
 HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)

 HSAPacketProcessor::HSAPacketProcessor(const Params &p)
-    : DmaVirtDevice(p), numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
+    : DmaVirtDevice(p), walker(p.walker),
+      numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
       pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
 {
     DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
@@ -92,6 +95,15 @@
 }

 void
+HSAPacketProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
+{
+    gpuDevice = gpu_device;
+
+    assert(walker);
+    walker->setDevRequestor(gpuDevice->vramRequestorId());
+}
+
+void
HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
 {
     hwSchdlr->unregisterQueue(queue_id, doorbellSize);
@@ -166,12 +178,20 @@
 TranslationGenPtr
 HSAPacketProcessor::translate(Addr vaddr, Addr size)
 {
- // Grab the process and try to translate the virtual address with it; with - // new extensions, it will likely be wrong to just arbitrarily grab context
-    // zero.
-    auto process = sys->threads[0]->getProcessPtr();
+    if (!FullSystem) {
+ // Grab the process and try to translate the virtual address with it;
+        // with new extensions, it will likely be wrong to just arbitrarily
+        // grab context zero.
+        auto process = sys->threads[0]->getProcessPtr();

-    return process->pTable->translateRange(vaddr, size);
+        return process->pTable->translateRange(vaddr, size);
+    }
+
+    // In full system use the page tables setup by the kernel driver rather
+    // than the CPU page tables.
+    return TranslationGenPtr(
+        new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker,
+                                         1 /* vmid */, vaddr, size));
 }

 /**
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh
index a838069..a8ba297 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -57,6 +57,8 @@
 namespace gem5
 {

+class AMDGPUDevice;
+
 // Ideally, each queue should store this status and
 // the processPkt() should make decisions based on that
 // status variable.
@@ -256,6 +258,8 @@
     typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
     GPUCommandProcessor *gpu_device;
     HWScheduler *hwSchdlr;
+    AMDGPUDevice *gpuDevice;
+    VegaISA::Walker *walker;

     // Structure to store the read values of dependency signals
     // from shared memory. Also used for tracking the status of
@@ -358,6 +362,7 @@
                             Addr offset = 0, uint64_t rd_idx = 0);
     void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
     void setDevice(GPUCommandProcessor * dev);
+    void setGPUDevice(AMDGPUDevice *gpu_device);
     void updateReadIndex(int, uint32_t);
     void getCommandsFromHost(int pid, uint32_t rl_idx);
     HWScheduler *hwScheduler() { return hwSchdlr; }
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index b739e80..90120d1 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -39,6 +39,7 @@
 from m5.objects.Device import DmaDevice
 from m5.objects.LdsState import LdsState
 from m5.objects.Process import EmulatedDriver
+from m5.objects.VegaGPUTLB import VegaPagetableWalker

 class PrefetchType(Enum): vals = [
     'PF_CU',
@@ -274,6 +275,8 @@
     dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')

     hsapp = Param.HSAPacketProcessor('PP attached to this device')
+    walker = Param.VegaPagetableWalker(VegaPagetableWalker(),
+            "Page table walker")

 class StorageClassType(Enum): vals = [
     'SC_SPILL',
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index a444c9d..3d75a2c 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -35,9 +35,11 @@

 #include <cassert>

+#include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
 #include "debug/GPUKernelInfo.hh"
+#include "dev/amdgpu/amdgpu_device.hh"
 #include "gpu-compute/dispatcher.hh"
 #include "mem/se_translating_port_proxy.hh"
 #include "mem/translating_port_proxy.hh"
@@ -52,7 +54,7 @@

 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
     : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
-      hsaPP(p.hsapp)
+      walker(p.walker), hsaPP(p.hsapp)
 {
     assert(hsaPP);
     hsaPP->setDevice(this);
@@ -359,6 +361,13 @@
 }

 void
+GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
+{
+    gpuDevice = gpu_device;
+    walker->setDevRequestor(gpuDevice->vramRequestorId());
+}
+
+void
 GPUCommandProcessor::setShader(Shader *shader)
 {
     _shader = shader;
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index c9084d1..2aa4dae 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -79,6 +79,7 @@

     HSAPacketProcessor& hsaPacketProc();

+    void setGPUDevice(AMDGPUDevice *gpu_device);
     void setShader(Shader *shader);
     Shader* shader();
     GPUComputeDriver* driver();
@@ -130,6 +131,8 @@
     Shader *_shader;
     GPUDispatcher &dispatcher;
     GPUComputeDriver *_driver;
+    AMDGPUDevice *gpuDevice;
+    VegaISA::Walker *walker;

     // Typedefing dmaRead and dmaWrite function pointer
     typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/53077
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: If28eb8be68173da03e15084765e77e92eda178e9
Gerrit-Change-Number: 53077
Gerrit-PatchSet: 1
Gerrit-Owner: Alex Dutu <alexandru.d...@amd.com>
Gerrit-CC: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to