Hello Michael LeBeane, Tony Gutierrez,

I'd like you to do a code review. Please visit

    https://gem5-review.googlesource.com/c/public/gem5/+/29935

to review the following change.


Change subject: arch-gcn3, gpu-compute: Implement out-of-range accesses
......................................................................

arch-gcn3, gpu-compute: Implement out-of-range accesses

Certain out-of-range buffer memory accesses should be special
cased and not generate memory requests. This patch implements
those special cases and suppresses lanes from accessing memory
when the calculated address falls in an ISA-specified out-of-range
condition.
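
For reference, the two range-check conditions implemented in the MUBUF
address-calculation code in op_encodings.hh can be summarized with the
standalone sketch below. This is illustrative only: isOutOfRange() and
its parameters are hypothetical stand-ins for the corresponding locals
and resource-descriptor fields in the patch.

    #include <cstdint>

    // Per-lane out-of-range test for buffer accesses (sketch only).
    // bufOff, bufIdx, stride, numRecords, and sOffset mirror the
    // locals used in the patch; swizzleEn mirrors rsrc_desc.swizzleEn.
    bool
    isOutOfRange(uint64_t bufOff, uint64_t bufIdx, uint64_t stride,
                 uint64_t numRecords, uint64_t sOffset, bool swizzleEn)
    {
        if (stride == 0 || !swizzleEn) {
            // Linearly addressed buffers: check the flat byte offset
            // against the remaining extent of the buffer.
            return bufOff + stride * bufIdx >= numRecords - sOffset;
        }
        // Swizzled, strided buffers: check the record index and the
        // intra-record offset independently.
        return bufIdx >= numRecords || bufOff >= stride;
    }

Out-of-range lanes are recorded in a mask rather than simply dropped,
so that loads can still write zero into their destination registers.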

Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
M src/gpu-compute/global_memory_pipeline.cc
3 files changed, 96 insertions(+), 6 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index b923eae..2e39bf5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34453,8 +34453,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

@@ -34580,8 +34584,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

@@ -34707,8 +34715,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane];
+                if (!oobMask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 308560a..22c146a 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -40,6 +40,7 @@
 #include "arch/gcn3/gpu_mem_helpers.hh"
 #include "arch/gcn3/insts/gpu_static_inst.hh"
 #include "arch/gcn3/operand.hh"
+#include "debug/GCN3.hh"
 #include "debug/GPUExec.hh"
 #include "mem/ruby/system/RubySystem.hh"

@@ -489,14 +490,26 @@
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }

         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }

         void
@@ -566,6 +579,42 @@

                     buf_off = v_off[lane] + inst_offset;

+
+                    /**
+                     * Range check behavior causes out of range accesses
+                     * to be treated differently. Out of range accesses return
+                     * 0 for loads and are ignored for stores. For
+                     * non-formatted accesses, this is done on a per-lane
+                     * basis.
+                     */
+                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+                        if (buf_off + stride * buf_idx >=
+                            rsrc_desc.numRecords - s_offset.rawData()) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
+                                    "lane = %d, buffer_offset = %llx, "
+                                    "const_stride = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off + stride * buf_idx,
+                                    rsrc_desc.stride, rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
+                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+                        if (buf_idx >= rsrc_desc.numRecords ||
+                            buf_off >= stride) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
+                                    "lane = %d, offset = %llx, "
+                                    "index = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off, buf_idx,
+                                    rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
                     if (rsrc_desc.swizzleEn) {
                         Addr idx_stride = 8 << rsrc_desc.idxStride;
                         Addr elem_size = 2 << rsrc_desc.elemSize;
@@ -573,6 +622,12 @@
                         Addr idx_lsb = buf_idx % idx_stride;
                         Addr off_msb = buf_off / elem_size;
                         Addr off_lsb = buf_off % elem_size;
+                        DPRINTF(GCN3, "mubuf swizzled lane %d: "
+                                "idx_stride = %llx, elem_size = %llx, "
+                                "idx_msb = %llx, idx_lsb = %llx, "
+                                "off_msb = %llx, off_lsb = %llx\n",
+                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
+                                off_msb, off_lsb);

                         vaddr += ((idx_msb * stride + off_msb * elem_size)
                             * idx_stride + idx_lsb * elem_size + off_lsb);
@@ -580,6 +635,11 @@
                         vaddr += buf_off + stride * buf_idx;
                     }

+                    DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
+                            "vaddr = %llx, base_addr = %llx, "
+                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
+                            lane, vaddr, base_addr, stride,
+                            buf_idx, buf_off);
                     gpuDynInst->addr.at(lane) = vaddr;
                 }
             }
@@ -589,6 +649,10 @@
         InFmt_MUBUF instData;
         // second instruction DWORD
         InFmt_MUBUF_1 extData;
+        // Mask of lanes with out-of-bounds accesses.  Needs to be tracked
+        // separately from the exec_mask so that we remember to write zero
+        // to the registers associated with out of bounds lanes.
+        VectorMask oobMask;
     }; // Inst_MUBUF

     class Inst_MTBUF : public GCN3GPUStaticInst
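
(A note on the initMemRead()/initMemWrite() hunks above: they follow a
save/mask/restore pattern on exec_mask. Below is a minimal sketch of
the idiom, with std::bitset<64> standing in for gem5's VectorMask and
a placeholder comment in place of the initMemReqHelper() call.)

    #include <bitset>

    using LaneMask = std::bitset<64>;  // stand-in for VectorMask

    void
    issueMaskedRequests(LaneMask &execMask, const LaneMask &oobMask)
    {
        LaneMask oldExecMask = execMask;  // save the original mask
        execMask &= ~oobMask;             // drop out-of-bounds lanes
        // ... issue requests; only in-range lanes access memory ...
        execMask = oldExecMask;           // restore for writeback
    }

(Restoring the original mask is what lets the writeback loops in
instructions.cc still write zeros into the destination registers of
out-of-range lanes.)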
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index c73184a..cfd7c3d 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -206,6 +206,20 @@
                 std::make_pair(mp, false)));
         }

+        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+            /**
+             * Memory access instructions that do not generate any memory
+             * requests (such as out-of-bounds buffer accesses where all
+             * lanes are out of bounds) will not trigger a callback to
+             * complete the request, so we need to mark it as completed as
+             * soon as it is issued. Note that this will still insert an
+             * entry in the ordered return FIFO such that waitcnt is still
+             * resolved correctly.
+             */
+            handleResponse(mp);
+            computeUnit->getTokenManager()->recvTokens(1);
+        }
+
         gmIssuedRequests.pop();

         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
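
(A note on the global_memory_pipeline.cc hunk above: an instruction
whose lanes are all out of bounds generates no memory requests and
therefore never receives a completion callback. The sketch below
models only the three predicates the check relies on; Inst is a
hypothetical stand-in for gem5's GPUDynInst.)

    // Inst models just enough of GPUDynInst for the issue-time check.
    struct Inst
    {
        bool memSync;        // memory sync/fence operation
        bool endOfKernel;    // end-of-kernel marker
        bool anyActiveLane;  // at least one lane issued a request

        bool isMemSync() const { return memSync; }
        bool isEndOfKernel() const { return endOfKernel; }
        bool allLanesZero() const { return !anyActiveLane; }
    };

    // True when the instruction must be completed at issue time:
    // nothing will ever call back to complete it. It still occupies an
    // entry in the ordered return FIFO, so waitcnt resolves in order.
    bool
    completesAtIssue(const Inst &mp)
    {
        return !mp.isMemSync() && !mp.isEndOfKernel() &&
               mp.allLanesZero();
    }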

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29935
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
Gerrit-Change-Number: 29935
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez <[email protected]>
Gerrit-Reviewer: Michael LeBeane <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]