Hello Michael LeBeane, Tony Gutierrez,

I'd like you to do a code review. Please visit

    https://gem5-review.googlesource.com/c/public/gem5/+/29935

to review the following change.


Change subject: arch-gcn3, gpu-compute: Implement out-of-range accesses
......................................................................

arch-gcn3, gpu-compute: Implement out-of-range accesses

Certain out-of-range buffer memory accesses should be special
cased and not generate memory requests. This patch implements
those special cases and suppresses lanes from accessing memory
when the calculated address falls in an ISA-specified out-of-range
condition.
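
For reference, the two range-check conditions implemented in the MUBUF
address-calculation code in op_encodings.hh can be summarized with the
standalone sketch below. This is illustrative only: isOutOfRange() and
its parameters are hypothetical stand-ins for the corresponding locals
and resource-descriptor fields in the patch.

    #include <cstdint>

    // Per-lane out-of-range test for buffer accesses (sketch only).
    // bufOff, bufIdx, stride, numRecords, and sOffset mirror the
    // locals used in the patch; swizzleEn mirrors rsrc_desc.swizzleEn.
    bool
    isOutOfRange(uint64_t bufOff, uint64_t bufIdx, uint64_t stride,
                 uint64_t numRecords, uint64_t sOffset, bool swizzleEn)
    {
        if (stride == 0 || !swizzleEn) {
            // Linearly addressed buffers: check the flat byte offset
            // against the remaining extent of the buffer.
            return bufOff + stride * bufIdx >= numRecords - sOffset;
        }
        // Swizzled, strided buffers: check the record index and the
        // intra-record offset independently.
        return bufIdx >= numRecords || bufOff >= stride;
    }

Out-of-range lanes are recorded in a mask rather than simply dropped,
so that loads can still write zero into their destination registers.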

Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
M src/gpu-compute/global_memory_pipeline.cc
3 files changed, 96 insertions(+), 6 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index b923eae..2e39bf5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34453,8 +34453,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

@@ -34580,8 +34584,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

@@ -34707,8 +34715,12 @@

         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane];
+                if (!oobMask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }

diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 308560a..22c146a 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -40,6 +40,7 @@
 #include "arch/gcn3/gpu_mem_helpers.hh"
 #include "arch/gcn3/insts/gpu_static_inst.hh"
 #include "arch/gcn3/operand.hh"
+#include "debug/GCN3.hh"
 #include "debug/GPUExec.hh"
 #include "mem/ruby/system/RubySystem.hh"

@@ -489,14 +490,26 @@
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }

         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }

         void
@@ -566,6 +579,42 @@

                     buf_off = v_off[lane] + inst_offset;

+
+                    /**
+                     * Range check behavior causes out of range accesses
+                     * to be treated differently. Out of range accesses return
+                     * 0 for loads and are ignored for stores. For
+                     * non-formatted accesses, this is done on a per-lane
+                     * basis.
+                     */
+                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+                        if (buf_off + stride * buf_idx >=
+                            rsrc_desc.numRecords - s_offset.rawData()) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
+                                    "lane = %d, buffer_offset = %llx, "
+                                    "const_stride = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off + stride * buf_idx,
+                                    rsrc_desc.stride, rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
+                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+                        if (buf_idx >= rsrc_desc.numRecords ||
+                            buf_off >= stride) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
+                                    "lane = %d, offset = %llx, "
+                                    "index = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off, buf_idx,
+                                    rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
                     if (rsrc_desc.swizzleEn) {
                         Addr idx_stride = 8 << rsrc_desc.idxStride;
                         Addr elem_size = 2 << rsrc_desc.elemSize;
@@ -573,6 +622,12 @@
                         Addr idx_lsb = buf_idx % idx_stride;
                         Addr off_msb = buf_off / elem_size;
                         Addr off_lsb = buf_off % elem_size;
+                        DPRINTF(GCN3, "mubuf swizzled lane %d: "
+                                "idx_stride = %llx, elem_size = %llx, "
+                                "idx_msb = %llx, idx_lsb = %llx, "
+                                "off_msb = %llx, off_lsb = %llx\n",
+                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
+                                off_msb, off_lsb);

                         vaddr += ((idx_msb * stride + off_msb * elem_size)
                             * idx_stride + idx_lsb * elem_size + off_lsb);
@@ -580,6 +635,11 @@
                         vaddr += buf_off + stride * buf_idx;
                     }

+                    DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
+                            "vaddr = %llx, base_addr = %llx, "
+                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
+                            lane, vaddr, base_addr, stride,
+                            buf_idx, buf_off);
                     gpuDynInst->addr.at(lane) = vaddr;
                 }
             }
@@ -589,6 +649,10 @@
         InFmt_MUBUF instData;
         // second instruction DWORD
         InFmt_MUBUF_1 extData;
+        // Mask of lanes with out-of-bounds accesses.  Needs to be tracked
+        // separately from the exec_mask so that we remember to write zero
+        // to the registers associated with out of bounds lanes.
+        VectorMask oobMask;
     }; // Inst_MUBUF

     class Inst_MTBUF : public GCN3GPUStaticInst
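
(A note on the initMemRead()/initMemWrite() hunks above: they follow a
save/mask/restore pattern on exec_mask. Below is a minimal sketch of
the idiom, with std::bitset<64> standing in for gem5's VectorMask and
a placeholder comment in place of the initMemReqHelper() call.)

    #include <bitset>

    using LaneMask = std::bitset<64>;  // stand-in for VectorMask

    void
    issueMaskedRequests(LaneMask &execMask, const LaneMask &oobMask)
    {
        LaneMask oldExecMask = execMask;  // save the original mask
        execMask &= ~oobMask;             // drop out-of-bounds lanes
        // ... issue requests; only in-range lanes access memory ...
        execMask = oldExecMask;           // restore for writeback
    }

(Restoring the original mask is what lets the writeback loops in
instructions.cc still write zeros into the destination registers of
out-of-range lanes.)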
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index c73184a..cfd7c3d 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -206,6 +206,20 @@
                 std::make_pair(mp, false)));
         }

+        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+            /**
+             * Memory access instructions that do not generate any memory
+             * requests (such as out-of-bounds buffer accesses where all
+             * lanes are out of bounds) will not trigger a callback to
+             * complete the request, so we need to mark it as completed as
+             * soon as it is issued. Note that this will still insert an
+             * entry in the ordered return FIFO such that waitcnt is still
+             * resolved correctly.
+             */
+            handleResponse(mp);
+            computeUnit->getTokenManager()->recvTokens(1);
+        }
+
         gmIssuedRequests.pop();

         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
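
(A note on the global_memory_pipeline.cc hunk above: an instruction
whose lanes are all out of bounds generates no memory requests and
therefore never receives a completion callback. The sketch below
models only the three predicates the check relies on; Inst is a
hypothetical stand-in for gem5's GPUDynInst.)

    // Inst models just enough of GPUDynInst for the issue-time check.
    struct Inst
    {
        bool memSync;        // memory sync/fence operation
        bool endOfKernel;    // end-of-kernel marker
        bool anyActiveLane;  // at least one lane issued a request

        bool isMemSync() const { return memSync; }
        bool isEndOfKernel() const { return endOfKernel; }
        bool allLanesZero() const { return !anyActiveLane; }
    };

    // True when the instruction must be completed at issue time:
    // nothing will ever call back to complete it. It still occupies an
    // entry in the ordered return FIFO, so waitcnt resolves in order.
    bool
    completesAtIssue(const Inst &mp)
    {
        return !mp.isMemSync() && !mp.isEndOfKernel() &&
               mp.allLanesZero();
    }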

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29935
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
Gerrit-Change-Number: 29935
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez <[email protected]>
Gerrit-Reviewer: Michael LeBeane <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]