Alex Dutu has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/42214 )
Change subject: gpu-compute: Topology and driver changes for dGPU
......................................................................
gpu-compute: Topology and driver changes for dGPU
New topology ripped from Fiji to support dGPU. A dGPU flag is added to
the config, which is propagated to the driver. The emulated driver is
now able to properly deal with dGPU ioctls and mmaps. For now, dGPU
physical memory is allocated from the host, but this is easy to change
once we get a GPU memory controller up and running.
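For example (an illustrative invocation; the build target and workload
path are placeholders, not part of this change):

    build/GCN3_X86/gem5.opt configs/example/apu_se.py -n3 --dgpu \
        -c <path-to-rocm-application>

Omitting --dgpu keeps the existing APU configuration.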
Change-Id: I594418482b12ec8fb2e4018d8d0371d56f4f51c8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42214
Reviewed-by: Matt Sinclair <[email protected]>
Maintainer: Matt Sinclair <[email protected]>
Tested-by: kokoro <[email protected]>
---
M configs/example/apu_se.py
M configs/example/hsaTopology.py
M src/dev/hsa/hsa_driver.cc
M src/gpu-compute/GPU.py
M src/gpu-compute/gpu_compute_driver.cc
M src/gpu-compute/gpu_compute_driver.hh
6 files changed, 329 insertions(+), 27 deletions(-)
Approvals:
Matt Sinclair: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index feed8a7..baf9360 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -182,6 +182,13 @@
parser.add_option("--reg-alloc-policy",type="string", default="simple",
help="register allocation policy (simple/dynamic)")
+parser.add_option("--dgpu", action="store_true", default=False,
+ help="Configure the system as a dGPU instead of an APU. "
+ "The dGPU config has its own local memory pool and is
not "
+ "coherent with the host through hardware. Data is "
+ "transfered from host to device memory using runtime
calls "
+ "that copy data over a PCIe-like IO bus.")
+
Ruby.define_options(parser)
#add TLB options to the parser
@@ -417,7 +424,7 @@
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
# HSA kernel mode driver
-gpu_driver = GPUComputeDriver(filename="kfd")
+gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = options.dgpu)
# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
@@ -470,7 +477,15 @@
"/usr/lib/x86_64-linux-gnu"
]),
'HOME=%s' % os.getenv('HOME','/'),
- "HSA_ENABLE_INTERRUPT=1"]
+              # Disabling the VM fault handler signal creation for dGPUs
+              # also forces the use of DefaultSignals instead of
+              # driver-controlled InterruptSignals throughout the runtime.
+              # DefaultSignals poll on memory in the runtime, while
+              # InterruptSignals call into the driver.
+              "HSA_ENABLE_INTERRUPT=1",
+              # We don't have an SDMA hardware model, so we need to fall
+              # back to vector copy kernels for dGPU memcopies to/from
+              # host and device.
+              "HSA_ENABLE_SDMA=0"]
process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver],
                  env = env)
@@ -643,7 +658,12 @@
root = Root(system=system, full_system=False)
-hsaTopology.createHsaTopology(options)
+# Create the /sys/devices filesystem for the simulator so that the HSA
+# Runtime knows what type of GPU hardware we are simulating
+if options.dgpu:
+ hsaTopology.createFijiTopology(options)
+else:
+ hsaTopology.createCarrizoTopology(options)
m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py
index 707a83d..a5e0d44 100644
--- a/configs/example/hsaTopology.py
+++ b/configs/example/hsaTopology.py
@@ -49,7 +49,177 @@
rmtree(path)
makedirs(path)
-def createHsaTopology(options):
+# This fakes out a dGPU setup so the runtime operates correctly. The spoofed
+# system has a single dGPU and a single-socket CPU. Note that more complex
+# topologies (multi-GPU, multi-socket CPUs) need to have a different setup
+# here, or the runtime won't be able to issue memcpies from one node to
+# another.
+#
+# TODO: There is way too much hardcoded here. It doesn't affect anything in
+# our current ROCm stack (1.6), but it is highly possible that it will in
+# the future. We might need to scrub through this and extract the
+# appropriate fields from the simulator in the future.
+def createFijiTopology(options):
+ topology_dir = joinpath(m5.options.outdir, \
+ 'fs/sys/devices/virtual/kfd/kfd/topology')
+ remake_dir(topology_dir)
+
+ amdgpu_dir = joinpath(m5.options.outdir, \
+ 'fs/sys/module/amdgpu/parameters')
+ remake_dir(amdgpu_dir)
+
+ # Fiji reported VM size in GB. Used to reserve an allocation from CPU
+ # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree)
+ file_append((amdgpu_dir, 'vm_size'), 256)
+
+ # Ripped from real Fiji platform to appease KMT version checks
+ file_append((topology_dir, 'generation_id'), 2)
+
+    # Set up system properties. Register as an ast-rocm server
+ sys_prop = 'platform_oem 35498446626881\n' + \
+ 'platform_id 71791775140929\n' + \
+ 'platform_rev 2\n'
+ file_append((topology_dir, 'system_properties'), sys_prop)
+
+ # Populate the topology tree
+ # Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU
+ node_dir = joinpath(topology_dir, 'nodes/0')
+ remake_dir(node_dir)
+
+ # Register as a CPU
+ file_append((node_dir, 'gpu_id'), 0)
+ file_append((node_dir, 'name'), '')
+
+    # CPU links. Only thing that matters is we tell the runtime that the
+    # GPU is connected through PCIe to CPU socket 0.
+ io_links = 1
+ io_dir = joinpath(node_dir, 'io_links/0')
+ remake_dir(io_dir)
+ io_prop = 'type 2\n' + \
+ 'version_major 0\n' + \
+ 'version_minor 0\n' + \
+ 'node_from 0\n' + \
+ 'node_to 1\n' + \
+ 'weight 20\n' + \
+ 'min_latency 0\n' + \
+ 'max_latency 0\n' + \
+ 'min_bandwidth 0\n' + \
+ 'max_bandwidth 0\n' + \
+ 'recommended_transfer_size 0\n' + \
+ 'flags 13\n'
+ file_append((io_dir, 'properties'), io_prop)
+
+ # Populate CPU node properties
+ node_prop = 'cpu_cores_count %s\n' % options.num_cpus + \
+ 'simd_count 0\n' + \
+ 'mem_banks_count 1\n' + \
+ 'caches_count 0\n' + \
+ 'io_links_count %s\n' % io_links + \
+ 'cpu_core_id_base 0\n' + \
+ 'simd_id_base 0\n' + \
+ 'max_waves_per_simd 0\n' + \
+ 'lds_size_in_kb 0\n' + \
+ 'gds_size_in_kb 0\n' + \
+ 'wave_front_size 64\n' + \
+ 'array_count 0\n' + \
+ 'simd_arrays_per_engine 0\n' + \
+ 'cu_per_simd_array 0\n' + \
+ 'simd_per_cu 0\n' + \
+ 'max_slots_scratch_cu 0\n' + \
+ 'vendor_id 0\n' + \
+ 'device_id 0\n' + \
+ 'location_id 0\n' + \
+ 'drm_render_minor 0\n' + \
+ 'max_engine_clk_ccompute 3400\n'
+
+ file_append((node_dir, 'properties'), node_prop)
+
+ # CPU memory reporting
+ mem_dir = joinpath(node_dir, 'mem_banks/0')
+ remake_dir(mem_dir)
+ mem_prop = 'heap_type 0\n' + \
+ 'size_in_bytes 33704329216\n' + \
+ 'flags 0\n' + \
+ 'width 72\n' + \
+ 'mem_clk_max 2400\n'
+
+ file_append((mem_dir, 'properties'), mem_prop)
+
+ # Build the GPU node
+ node_dir = joinpath(topology_dir, 'nodes/1')
+ remake_dir(node_dir)
+
+ # Register as a Fiji
+ file_append((node_dir, 'gpu_id'), 50156)
+ file_append((node_dir, 'name'), 'Fiji\n')
+
+    # Real Fiji shows 96, but building that topology is complex and doesn't
+    # appear to be required for anything.
+ caches = 0
+
+    # GPU links. Only thing that matters is we tell the runtime that the
+    # GPU is connected through PCIe to CPU socket 0.
+ io_links = 1
+ io_dir = joinpath(node_dir, 'io_links/0')
+ remake_dir(io_dir)
+ io_prop = 'type 2\n' + \
+ 'version_major 0\n' + \
+ 'version_minor 0\n' + \
+ 'node_from 1\n' + \
+ 'node_to 0\n' + \
+ 'weight 20\n' + \
+ 'min_latency 0\n' + \
+ 'max_latency 0\n' + \
+ 'min_bandwidth 0\n' + \
+ 'max_bandwidth 0\n' + \
+ 'recommended_transfer_size 0\n' + \
+ 'flags 1\n'
+ file_append((io_dir, 'properties'), io_prop)
+
+ # Populate GPU node properties
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus + \
+                'simd_count %s\n' \
+                % (options.num_compute_units * options.simds_per_cu) + \
+                'mem_banks_count 1\n' + \
+                'caches_count %s\n' % caches + \
+                'io_links_count %s\n' % io_links + \
+                'cpu_core_id_base 0\n' + \
+                'simd_id_base 2147487744\n' + \
+                'max_waves_per_simd %s\n' % options.wfs_per_simd + \
+                'lds_size_in_kb %s\n' % int(options.lds_size / 1024) + \
+                'gds_size_in_kb 0\n' + \
+                'wave_front_size %s\n' % options.wf_size + \
+                'array_count 4\n' + \
+                'simd_arrays_per_engine %s\n' % options.sa_per_complex + \
+                'cu_per_simd_array %s\n' % options.cu_per_sa + \
+                'simd_per_cu %s\n' % options.simds_per_cu + \
+                'max_slots_scratch_cu 32\n' + \
+                'vendor_id 4098\n' + \
+                'device_id 29440\n' + \
+                'location_id 512\n' + \
+                'max_engine_clk_fcompute %s\n' \
+                % int(toFrequency(options.gpu_clock) / 1e6) + \
+                'local_mem_size 4294967296\n' + \
+                'fw_version 730\n' + \
+                'capability 4736\n' + \
+                'max_engine_clk_ccompute %s\n' \
+                % int(toFrequency(options.CPUClock) / 1e6)
+ file_append((node_dir, 'properties'), node_prop)
+
+ # Fiji HBM reporting
+    # TODO: Extract size, clk, and width from sim parameters
+ mem_dir = joinpath(node_dir, 'mem_banks/0')
+ remake_dir(mem_dir)
+ mem_prop = 'heap_type 1\n' + \
+ 'size_in_bytes 4294967296\n' + \
+ 'flags 0\n' + \
+ 'width 4096\n' + \
+ 'mem_clk_max 500\n'
+
+ file_append((mem_dir, 'properties'), mem_prop)
+
+
+def createCarrizoTopology(options):
topology_dir = joinpath(m5.options.outdir, \
'fs/sys/devices/virtual/kfd/kfd/topology')
remake_dir(topology_dir)
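Both topology builders lean on two small helpers, remake_dir and
file_append, whose tails are visible in the context above. For reference,
here is a sketch consistent with their call sites (signatures inferred
from usage such as file_append((node_dir, 'gpu_id'), 50156), not copied
from the file):

    from os import makedirs
    from os.path import isdir, join as joinpath
    from shutil import rmtree

    def remake_dir(path):
        # Start from a clean directory so stale topology files from a
        # prior run cannot leak into the freshly spoofed /sys tree.
        if isdir(path):
            rmtree(path)
        makedirs(path)

    def file_append(path_tuple, contents):
        # Call sites pass a (directory, filename) tuple plus the value
        # to write, e.g. file_append((node_dir, 'gpu_id'), 50156).
        with open(joinpath(*path_tuple), 'a') as f:
            f.write(str(contents))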
diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc
index db31cbc..f2db436 100644
--- a/src/dev/hsa/hsa_driver.cc
+++ b/src/dev/hsa/hsa_driver.cc
@@ -70,25 +70,25 @@
HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
int tgt_flags, int tgt_fd, off_t offset)
{
- // Is this a signal event mmap
- bool is_event_mmap = false;
- // If addr == 0, then we may need to do mmap.
- bool should_mmap = (start == 0);
- auto process = tc->getProcessPtr();
- auto mem_state = process->memState;
- // Check if mmap is for signal events first
- if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
- KFD_MMAP_TYPE_EVENTS) {
- is_event_mmap = true;
-    DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
- "offset: 0x%x, )\n", start, length, offset);
- panic_if(start != 0,
- "Start address should be provided by KFD\n");
- panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
-             "Requested length %d, expected length %d; length mismatch\n",
- length, 8 * KFD_SIGNAL_EVENT_LIMIT);
- // For signal event, do mmap only is eventPage is uninitialized
- should_mmap = (!eventPage);
+ // Is this a signal event mmap
+ bool is_event_mmap = false;
+ // If addr == 0, then we may need to do mmap.
+ bool should_mmap = (start == 0);
+ auto process = tc->getProcessPtr();
+ auto mem_state = process->memState;
+ // Check if mmap is for signal events first
+ if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
+ KFD_MMAP_TYPE_EVENTS) {
+ is_event_mmap = true;
+        DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
+ "offset: 0x%x, )\n", start, length, offset);
+ panic_if(start != 0,
+ "Start address should be provided by KFD\n");
+ panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
+                 "Requested length %d, expected length %d; length mismatch\n",
+ length, 8 * KFD_SIGNAL_EVENT_LIMIT);
+ // For signal event, do mmap only is eventPage is uninitialized
+ should_mmap = (!eventPage);
} else {
DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x,"
"offset: 0x%x)\n", start, length, offset);
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index d2959ac..e548823 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -236,6 +236,7 @@
class GPUComputeDriver(HSADriver):
type = 'GPUComputeDriver'
cxx_header = 'gpu-compute/gpu_compute_driver.hh'
+ isdGPU = Param.Bool(False, 'Driver is for a dGPU')
class GPUDispatcher(SimObject):
type = 'GPUDispatcher'
diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc
index 664afa9..6c4639a 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -40,10 +40,11 @@
#include "dev/hsa/kfd_event_defines.h"
#include "dev/hsa/kfd_ioctl.h"
#include "params/GPUComputeDriver.hh"
+#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"
GPUComputeDriver::GPUComputeDriver(const Params &p)
- : HSADriver(p)
+ : HSADriver(p), isdGPU(p.isdGPU)
{
device->attachDriver(this);
DPRINTF(GPUDriver, "Constructing KFD: device\n");
@@ -86,6 +87,19 @@
break;
case AMDKFD_IOC_SET_MEMORY_POLICY:
{
+        /**
+         * This is where the runtime requests MTYPE from an aperture.
+         * Basically, the global memory aperture is divided up into a
+         * default aperture and an alternate aperture, each of which has
+         * its own MTYPE policy. This is done to mark a small piece of the
+         * global memory as uncacheable. Host memory mappings will be
+         * carved out of this uncacheable aperture, which is how they
+         * implement 'coherent' host/device memory on dGPUs.
+         *
+         * TODO: Need to reflect per-aperture MTYPE policies based on this
+         * call.
+         */
warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
}
break;
@@ -145,7 +159,10 @@
gpuVmApeLimit(args->process_apertures[i].gpuvm_base);
// NOTE: Must match ID populated by hsaTopology.py
- args->process_apertures[i].gpu_id = 2765;
+ if (isdGPU)
+ args->process_apertures[i].gpu_id = 50156;
+ else
+ args->process_apertures[i].gpu_id = 2765;
DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
args->process_apertures[i].gpuvm_base);
@@ -351,9 +368,91 @@
warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
}
break;
+    /**
+     * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU
+     * doorbells into GPUVM space. Essentially, ROCm implements SVM by
+     * carving out a region of free VA space that both the host and GPUVM
+     * can agree upon. The entire GPU VA space is reserved on the host
+     * using a fixed mmap at a low VA range that is also directly
+     * accessible by the GPU's limited number of VA bits. When we actually
+     * call memory allocation later in the program, this IOCTL is invoked
+     * to create BOs/VMAs in the driver and bind them to physical
+     * memory/doorbells.
+     *
+     * For gem5, we don't need to carve out any GPUVM space here (we don't
+     * support GPUVM and use host page tables on the GPU directly). We can
+     * just use the existing host SVM region. We comment on each memory
+     * type separately.
+     */
case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
{
- warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args>
+                args(ioc_buf);
+ args.copyIn(virt_proxy);
+
+ assert(isdGPU);
+ assert((args->va_addr % TheISA::PageBytes) == 0);
+ Addr mmap_offset = 0;
+
+            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
+                args->mmap_offset = args->va_addr;
+                // VRAM allocations are device memory mapped into GPUVM
+                // space.
+                //
+                // We can't rely on the lazy host allocator (fixupFault) to
+                // handle this mapping since it needs to be placed in dGPU
+                // framebuffer memory. The lazy allocator will try to place
+                // this in host memory.
+                //
+                // TODO: We don't have the appropriate bifurcation of the
+                // physical address space with different memory controllers
+                // yet. This is where we will explicitly add the PT maps to
+                // dGPU memory in the future.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
+                mmap_offset = args->mmap_offset;
+                // USERPTR allocations are system memory mapped into GPUVM
+                // space. The user provides the driver with the pointer.
+                //
+                // No action needs to be taken for this memory type. We will
+                // lazily map it into host memory on first touch.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
+                args->mmap_offset = args->va_addr;
+                // GTT allocations are system memory mapped into GPUVM space.
+                // It's different than a USERPTR allocation since the driver
+                // itself allocates the physical memory on the host.
+                //
+                // No action needs to be taken for this memory type. We will
+                // lazily map it into host memory on first touch. The
+                // fixupFault will find the original SVM aperture mapped to
+                // the host.
+                //
+                // Note that for GTT the thunk layer needs to call mmap on
+                // the driver FD later if it wants the host to have access to
+                // this memory (which it probably does).
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
+                // DOORBELL allocations are the queue doorbells that are
+                // memory mapped into GPUVM space.
+                //
+                // Explicitly map this virtual address to our PIO doorbell
+                // interface in the page tables (non-cacheable).
+                tc->getProcessPtr()->pTable->map(args->va_addr,
+                        device->hsaPacketProc().pioAddr,
+                        args->size, false);
+                break;
+            }
+
+            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
+                    "size %lu, mmap_offset %p, gpu_id %d\n",
+                    args->va_addr, args->size, mmap_offset, args->gpu_id);
+
+            // TODO: Not sure where the handle is used yet. Set it to an
+            // easily trackable value.
+            args->handle = 0xdeadbeef;
+            args.copyOut(virt_proxy);
}
break;
case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
@@ -361,6 +460,13 @@
warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
}
break;
+    /**
+     * Called to map an already allocated region of memory to this GPU's
+     * GPUVM VA space. We don't need to implement this in the simulator
+     * since we only have a single VM system. If the region has already
+     * been allocated somewhere like the CPU, then it's already visible to
+     * the device.
+     */
case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
{
warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
@@ -415,7 +521,11 @@
ape_args->gpuvm_base = gpuVmApeBase(i + 1);
ape_args->gpuvm_limit =
gpuVmApeLimit(ape_args->gpuvm_base);
- ape_args->gpu_id = 2765;
+ // NOTE: Must match ID populated by hsaTopology.py
+ if (isdGPU)
+ ape_args->gpu_id = 50156;
+ else
+ ape_args->gpu_id = 2765;
assert(bits<Addr>(ape_args->scratch_base, 63, 47) !=
0x1ffff);
assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh
index d2c822d..f8c02b2 100644
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -55,6 +55,7 @@
void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
private:
+ bool isdGPU;
/**
* The aperture (APE) base/limit pairs are set
* statically at startup by the real KFD. AMD
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/42214
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I594418482b12ec8fb2e4018d8d0371d56f4f51c8
Gerrit-Change-Number: 42214
Gerrit-PatchSet: 10
Gerrit-Owner: Alex Dutu <[email protected]>
Gerrit-Reviewer: Alex Dutu <[email protected]>
Gerrit-Reviewer: Kyle Roarty <[email protected]>
Gerrit-Reviewer: Matt Sinclair <[email protected]>
Gerrit-Reviewer: Matthew Poremba <[email protected]>
Gerrit-Reviewer: kokoro <[email protected]>
Gerrit-CC: Michael LeBeane <[email protected]>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]