Hello Tony Gutierrez,
I'd like you to do a code review. Please visit
https://gem5-review.googlesource.com/c/public/gem5/+/29920
to review the following change.
Change subject: arch-gcn3: add support for unaligned accesses
......................................................................
arch-gcn3: add support for unaligned accesses
Previously, with HSAIL, the HSA specification guaranteed that the GPU
would never issue unaligned accesses. However, now that we are directly
running GCN code, this is no longer true. Accordingly, this commit adds
support for unaligned accesses.
Moreover, to reduce the replication of nearly identical code across the
different request types, I also added new helper functions that are
called by all of the memory-request-producing instruction types in
op_encodings.hh.
Adding support for unaligned accesses requires changing the
statusBitVector used to track the status of the memory requests for
each lane from a bit per lane to an int per lane. This is necessary
because an unaligned access may span multiple cache lines; in the
worst case, every lane's access may be split across cache lines. The
files that use the statusBitVector are updated accordingly.
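
To make the cache-line crossing concrete, here is a minimal standalone
sketch of the split detection the new helpers rely on. The 64-byte line
size, the example address, and the local roundDown are assumptions for
illustration only; the real code uses the compute unit's configured
cache line size and gem5's own roundDown from base/intmath.hh.

#include <cstdint>
#include <iostream>

// Illustrative stand-in for gem5's roundDown: round val down to the
// nearest multiple of align.
static uint64_t roundDown(uint64_t val, uint64_t align)
{
    return val - (val % align);
}

int main()
{
    const uint64_t block_size = 64;  // assumed cache line size
    const uint64_t vaddr = 0x7c;     // example lane address
    const uint64_t req_size = 8;     // e.g., one 8-byte element per lane

    // base address of the cache line holding the last byte of the access
    uint64_t split_addr = roundDown(vaddr + req_size - 1, block_size);

    // if the last byte falls in a later line than the first byte, the
    // access is unaligned and must be split into two requests, so the
    // lane's status count becomes 2 instead of 1
    bool misaligned_acc = split_addr > vaddr;

    std::cout << std::boolalpha << misaligned_acc << std::endl;  // true
    return 0;
}
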
Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
---
A src/arch/gcn3/gpu_mem_helpers.hh
M src/arch/gcn3/insts/op_encodings.hh
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/gpu_dyn_inst.hh
M src/mem/ruby/common/DataBlock.cc
M src/mem/ruby/system/RubyPort.cc
7 files changed, 297 insertions(+), 242 deletions(-)
diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh
new file mode 100644
index 0000000..40ca565
--- /dev/null
+++ b/src/arch/gcn3/gpu_mem_helpers.hh
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Matt Sinclair
+ */
+
+#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+
+#include "arch/gcn3/insts/gpu_static_inst.hh"
+#include "arch/gcn3/insts/op_encodings.hh"
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+/**
+ * Helper function for instructions declared in op_encodings. This function
+ * takes in all of the arguments for a given memory request we are trying to
+ * initialize, then submits the request or requests depending on whether the
+ * original request is aligned or unaligned.
+ */
+template<typename T, int N>
+inline void
+initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
+ bool is_atomic=false)
+{
+ // local variables
+ int req_size = N * sizeof(T);
+ int block_size = gpuDynInst->computeUnit()->cacheLineSize();
+ Addr vaddr = 0, split_addr = 0;
+ bool misaligned_acc = false;
+ RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
+ PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
+
+ gpuDynInst->resetEntireStatusVector();
+ for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vaddr = gpuDynInst->addr[lane];
+
+ /**
+ * the base address of the cache line where the last
+ * byte of the request will be stored.
+ */
+ split_addr = roundDown(vaddr + req_size - 1, block_size);
+
+ assert(split_addr <= vaddr || split_addr - vaddr < block_size);
+ /**
+ * if the base cache line address of the last byte is
+ * greater than the address of the first byte then we have
+ * a misaligned access.
+ */
+ misaligned_acc = split_addr > vaddr;
+
+ if (is_atomic) {
+ req = std::make_shared<Request>(vaddr, sizeof(T), 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId,
+ gpuDynInst->makeAtomicOpFunctor<T>(
+ &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
+ &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
+ } else {
+ req = std::make_shared<Request>(vaddr, req_size, 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId);
+ }
+
+ if (misaligned_acc) {
+ gpuDynInst->setStatusVector(lane, 2);
+ req->splitOnVaddr(split_addr, req1, req2);
+ gpuDynInst->setRequestFlags(req1);
+ gpuDynInst->setRequestFlags(req2);
+ pkt1 = new Packet(req1, mem_req_type);
+ pkt2 = new Packet(req2, mem_req_type);
+ pkt1->dataStatic(&(reinterpret_cast<T*>(
+ gpuDynInst->d_data))[lane * N]);
+ pkt2->dataStatic(&(reinterpret_cast<T*>(
+ gpuDynInst->d_data))[lane * N + req1->getSize()]);
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned
memory "
+ "request for %#x\n", gpuDynInst->cu_id,
+ gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
+ split_addr);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
pkt1);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
pkt2);
+ } else {
+ gpuDynInst->setStatusVector(lane, 1);
+ gpuDynInst->setRequestFlags(req);
+ pkt = new Packet(req, mem_req_type);
+ pkt->dataStatic(&(reinterpret_cast<T*>(
+ gpuDynInst->d_data))[lane * N]);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
+ }
+ } else { // if lane is not active, then no pending requests
+ gpuDynInst->setStatusVector(lane, 0);
+ }
+ }
+}
+
+/**
+ * Helper function for scalar instructions declared in op_encodings. This
+ * function takes in all of the arguments for a given memory request we are
+ * trying to initialize, then submits the request or requests depending on
+ * whether the original request is aligned or unaligned.
+ */
+template<typename T, int N>
+inline void
+initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
+{
+ int req_size = N * sizeof(T);
+ int block_size = gpuDynInst->computeUnit()->cacheLineSize();
+ Addr vaddr = gpuDynInst->scalarAddr;
+
+ /**
+ * the base address of the cache line where the last byte of
+ * the request will be stored.
+ */
+ Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
+
+ assert(split_addr <= vaddr || split_addr - vaddr < block_size);
+ /**
+ * if the base cache line address of the last byte is greater
+ * than the address of the first byte then we have a misaligned
+ * access.
+ */
+ bool misaligned_acc = split_addr > vaddr;
+
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId);
+
+ if (misaligned_acc) {
+ RequestPtr req1, req2;
+ req->splitOnVaddr(split_addr, req1, req2);
+ gpuDynInst->numScalarReqs = 2;
+ gpuDynInst->setRequestFlags(req1);
+ gpuDynInst->setRequestFlags(req2);
+ PacketPtr pkt1 = new Packet(req1, mem_req_type);
+ PacketPtr pkt2 = new Packet(req2, mem_req_type);
+ pkt1->dataStatic(gpuDynInst->scalar_data);
+ pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request
for"
+ " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, split_addr);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
+ } else {
+ gpuDynInst->numScalarReqs = 1;
+ gpuDynInst->setRequestFlags(req);
+ PacketPtr pkt = new Packet(req, mem_req_type);
+ pkt->dataStatic(gpuDynInst->scalar_data);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
+ }
+}
+
+#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__
diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 3197dc0..308560a 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -37,6 +37,7 @@
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#include "arch/gcn3/gpu_decoder.hh"
+#include "arch/gcn3/gpu_mem_helpers.hh"
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/operand.hh"
#include "debug/GPUExec.hh"
@@ -174,47 +175,8 @@
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- int block_size = gpuDynInst->computeUnit()->cacheLineSize();
- int req_size = N * sizeof(ScalarRegU32);
- Addr vaddr = gpuDynInst->scalarAddr;
-
- /**
- * the base address of the cache line where the the last byte of
- * the request will be stored.
- */
- Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
-
- assert(split_addr <= vaddr || split_addr - vaddr < block_size);
- /**
- * if the base cache line address of the last byte is greater
- * than the address of the first byte then we have a misaligned
- * access.
- */
- bool misaligned_acc = split_addr > vaddr;
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- if (misaligned_acc) {
- RequestPtr req1, req2;
- req->splitOnVaddr(split_addr, req1, req2);
- gpuDynInst->numScalarReqs = 2;
- gpuDynInst->setRequestFlags(req1);
- gpuDynInst->setRequestFlags(req2);
- PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
- PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
- pkt1->dataStatic(gpuDynInst->scalar_data);
- pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- } else {
- gpuDynInst->numScalarReqs = 1;
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(gpuDynInst->scalar_data);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
- }
+ initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
+ MemCmd::ReadReq);
}
/**
@@ -224,47 +186,8 @@
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- int block_size = gpuDynInst->computeUnit()->cacheLineSize();
- int req_size = N * sizeof(ScalarRegU32);
- Addr vaddr = gpuDynInst->scalarAddr;
-
- /**
- * the base address of the cache line where the the last byte of
- * the request will be stored.
- */
- Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
-
- assert(split_addr <= vaddr || split_addr - vaddr < block_size);
- /**
- * if the base cache line address of the last byte is greater
- * than the address of the first byte then we have a misaligned
- * access.
- */
- bool misaligned_acc = split_addr > vaddr;
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- if (misaligned_acc) {
- RequestPtr req1, req2;
- req->splitOnVaddr(split_addr, req1, req2);
- gpuDynInst->numScalarReqs = 2;
- gpuDynInst->setRequestFlags(req1);
- gpuDynInst->setRequestFlags(req2);
- PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq);
- PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq);
- pkt1->dataStatic(gpuDynInst->scalar_data);
- pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- } else {
- gpuDynInst->numScalarReqs = 1;
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(gpuDynInst->scalar_data);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
- }
+ initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
+ MemCmd::WriteReq);
}
void
@@ -566,59 +489,22 @@
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
-
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
-
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
}
void
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
{
// create request and set flags
- gpuDynInst->statusBitVector = VectorMask(1);
+ gpuDynInst->resetEntireStatusVector();
+ gpuDynInst->setStatusVector(0, 1);
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
masterId(), 0,
@@ -771,133 +657,35 @@
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()
- ->sendRequest(gpuDynInst, lane, pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- int req_size = N * sizeof(VecElemU32);
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size,
- 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * N]);
- gpuDynInst->computeUnit()
- ->sendRequest(gpuDynInst, lane, pkt);
- }
- }
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- int req_size = N * sizeof(VecElemU32);
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size,
- 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * N]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId,
- gpuDynInst->makeAtomicOpFunctor<T>(
- &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
- &(reinterpret_cast<T*>(
- gpuDynInst->x_data))[lane]));
-
- gpuDynInst->setRequestFlags(req);
-
- PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
-
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
}
void
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index feeb803..b0616d6 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -832,7 +832,7 @@
gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
gpuDynInst->disassemble(), w->outstandingReqs,
w->outstandingReqs - 1);
- if (gpuDynInst->statusBitVector.none()) {
+ if (gpuDynInst->allLanesZero()) {
// ask gm pipe to decrement request counters, instead of directly
// performing here, to avoid asynchronous counter update and
// instruction retirement (which may hurt waincnt effects)
@@ -1078,7 +1078,6 @@
gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
gpuDynInst->tlbHitLevel[index] = hit_level;
-
// translation is done. Schedule the mem_req_event at the
// appropriate cycle to send the timing memory request to ruby
EventFunctionWrapper *mem_req_event =
@@ -1116,9 +1115,9 @@
}
} else {
if (pkt->cmd == MemCmd::MemSyncReq) {
- gpuDynInst->statusBitVector = VectorMask(0);
+ gpuDynInst->resetEntireStatusVector();
} else {
- gpuDynInst->statusBitVector &= (~(1ll << index));
+ gpuDynInst->decrementStatusVector(index);
}
// New SenderState for the memory access
@@ -1289,12 +1288,10 @@
gpuDynInst->memStatusVector[paddr].pop_back();
gpuDynInst->pAddr = pkt->req->getPaddr();
- gpuDynInst->statusBitVector &= (~(1ULL << index));
+ gpuDynInst->decrementStatusVector(index);
+ DPRINTF(GPUMem, "bitvector is now %s\n",
gpuDynInst->printStatusVector());
- DPRINTF(GPUMem, "bitvector is now %#x\n",
- gpuDynInst->statusBitVector);
-
- if (gpuDynInst->statusBitVector == VectorMask(0)) {
+ if (gpuDynInst->allLanesZero()) {
auto iter = gpuDynInst->memStatusVector.begin();
auto end = gpuDynInst->memStatusVector.end();
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 74b963b..2a49522 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,9 +42,10 @@
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *static_inst, InstSeqNum instSeqNum)
: GPUExecContext(_cu, _wf), scalarAddr(0),
addr(computeUnit()->wfSize(),
- (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
+ (Addr)0), numScalarReqs(0), isSaveRestore(false),
_staticInst(static_inst), _seqNum(instSeqNum)
{
+ statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
// vector instructions can have up to 4 source/destination operands
d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index 392b57d..13e2672 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -39,6 +39,7 @@
#include "base/amo.hh"
#include "base/logging.hh"
+#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
@@ -307,13 +308,103 @@
}
}
+ // reset the number of pending memory requests for all lanes
+ void
+ resetEntireStatusVector()
+ {
+ assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ resetStatusVector(lane);
+ }
+ }
+
+ // reset the number of pending memory requests for the inputted lane
+ void
+ resetStatusVector(int lane)
+ {
+ setStatusVector(lane, 0);
+ }
+
+ // set the number of pending memory requests for the inputted lane
+ void
+ setStatusVector(int lane, int newVal)
+ {
+ // currently we can have up to 2 memory requests per lane (if the
+ // lane's request goes across multiple cache lines)
+ assert((newVal >= 0) && (newVal <= 2));
+ statusVector[lane] = newVal;
+ }
+
+ // decrement the number of pending memory requests for the inputted lane
+ // by 1
+ void
+ decrementStatusVector(int lane)
+ {
+ // this lane may have multiple requests, so only subtract one for
+ // this request
+ assert(statusVector[lane] >= 1);
+ statusVector[lane]--;
+ }
+
+ // return the current number of pending memory requests for the inputted
+ // lane
+ int
+ getLaneStatus(int lane) const
+ {
+ return statusVector[lane];
+ }
+
+ // returns true if all memory requests from all lanes have been received,
+ // else returns false
+ bool
+ allLanesZero() const
+ {
+ // local variables
+ bool allZero = true;
+
+ // iterate over all lanes, checking the number of pending memory
+ // requests they have
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ // if any lane still has pending requests, return false
+ if (statusVector[lane] > 0) {
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d
pending "
+ "request(s) for %#x\n", cu_id, simdId, wfSlotId,
lane,
+ statusVector[lane], addr[lane]);
+ allZero = false;
+ }
+ }
+
+ if (allZero) {
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
+ " requests for %#x\n", cu_id, simdId, wfSlotId,
addr[0]);
+ }
+ return allZero;
+ }
+
+ // returns a string representing the current state of the statusVector
+ std::string
+ printStatusVector() const
+ {
+ std::string statusVec_str = "[";
+
+ // iterate over all lanes, adding the current number of pending
+ // requests for this lane to the string
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ statusVec_str += std::to_string(statusVector[lane]);
+ }
+ statusVec_str += "]";
+
+ return statusVec_str;
+ }
+
// Map returned packets and the addresses they satisfy with which lane they
// were requested from
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
- // Track the status of memory requests per lane, a bit per lane
- VectorMask statusBitVector;
+ // Track the status of memory requests per lane, an int per lane to allow
+ // unaligned accesses
+ std::vector<int> statusVector;
// for ld_v# or st_v#
std::vector<int> tlbHitLevel;
diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc
index a4d7f49..359f6bb 100644
--- a/src/mem/ruby/common/DataBlock.cc
+++ b/src/mem/ruby/common/DataBlock.cc
@@ -107,7 +107,6 @@
void
DataBlock::setData(const uint8_t *data, int offset, int len)
{
- assert(offset + len <= RubySystem::getBlockSizeBytes());
memcpy(&m_data[offset], data, len);
}
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 83aaa1a..92fed81 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -267,9 +267,6 @@
curTick() + rs->clockPeriod());
return true;
}
-
- assert(getOffset(pkt->getAddr()) + pkt->getSize() <=
- RubySystem::getBlockSizeBytes());
}
// Save the port in the sender state object to be used later to
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29920
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Gerrit-Change-Number: 29920
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]