Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29922 )

Change subject: arch-gcn3: fix bug with SDWA support
......................................................................

arch-gcn3: fix bug with SDWA support

Instructions that use the SDWA field need to use the extra SRC0
register associated with the SDWA instruction instead of the
"default" SRC0 register, since the default SRC0 register contains
the SDWA information when SDWA is being used.  This commit fixes
15de044c to take this into account.  Additionally, this commit
removes reads of the registers from the SDWA helper functions,
since they overwrite any changes made to the destination register.
Finally, this change modifies the instructions that use SDWA to
simplify the flow through the execute() functions.

Change-Id: I3bad83133808dfffc6a4c40bbd49c3d76599e669
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29922
Reviewed-by: Anthony Gutierrez <anthony.gutier...@amd.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutier...@amd.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/arch/gcn3/insts/inst_util.hh
M src/arch/gcn3/insts/instructions.cc
2 files changed, 133 insertions(+), 110 deletions(-)

Approvals:
  Anthony Gutierrez: Looks good to me, approved; Looks good to me, approved
  Matt Sinclair: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh
index a3b2f4a..292e3ba 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -547,8 +547,8 @@
      * operations are done on it.
      */
     template<typename T>
-    T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
-                             bool signExt)
+    T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
+                             const SDWASelVals sel, const bool signExt)
     {
         // local variables
         int first_bit = 0, last_bit = 0;
@@ -635,16 +635,14 @@
      *   2.  if sign extend is set, then sign extend the value
      */
     template<typename T>
-    void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
-                         bool signExt)
+    void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
+                         const SDWASelVals sel, const bool signExt)
     {
         // iterate over all lanes, setting appropriate, selected value
-        currOper.read();
-        origCurrOper.read();
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
-                                                   origCurrOper[lane], sel,
-                                                   signExt);
+ origCurrOper[lane], sel,
+                                                    signExt);
         }
     }

@@ -656,8 +654,9 @@
      * operations are done on it.
      */
     template<typename T>
-    T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
- SDWASelVals sel, SDWADstVals unusedBits_format)
+    T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
+                             const bool clamp, const SDWASelVals sel,
+                             const SDWADstVals unusedBits_format)
     {
         // local variables
         int first_bit = 0, last_bit = 0;
@@ -756,12 +755,11 @@
      *       2 (SDWA_UNUSED_PRESERVE): select data[31:0]
      */
     template<typename T>
-    void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
-                         SDWASelVals sel, SDWADstVals unusedBits_format)
+    void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
+                         const SDWASelVals sel,
+                         const SDWADstVals unusedBits_format)
     {
         // iterate over all lanes, setting appropriate, selected value
-        dstOper.read();
-        origDstOper.read();
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
origDstOper[lane], clamp,
@@ -779,8 +777,9 @@
      */
     template<typename T>
     void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
-                                SDWASelVals src_sel, bool src_signExt,
-                                bool src_abs, bool src_neg)
+                                const SDWASelVals src_sel,
+                                const bool src_signExt, const bool src_abs,
+                                const bool src_neg)
     {
         /**
          * STEP 1: check if the absolute value (ABS) or negation (NEG) tags
@@ -812,14 +811,13 @@
      * processSDWA_src is called before the math.
      */
     template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
     {
         // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;

// NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1 // operand. So ensure that SRC1 fields are not set, then call helper
@@ -841,18 +839,18 @@
      * called before the math.
      */
     template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0, T & src1, T & origSrc1)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
+                         T & src1, T & origSrc1)
     {
         // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
-        SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
-        bool src1_signExt = sdwaInst.SRC1_SEXT;
-        bool src1_neg = sdwaInst.SRC1_NEG;
-        bool src1_abs = sdwaInst.SRC1_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
+        const bool src1_signExt = sdwaInst.SRC1_SEXT;
+        const bool src1_neg = sdwaInst.SRC1_NEG;
+        const bool src1_abs = sdwaInst.SRC1_ABS;

         processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
                                src0_abs, src0_neg);
@@ -869,13 +867,13 @@
      * processSDWA_src is called before the math.
      */
     template<typename T>
-    void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & dst, T & origDst)
+    void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
     {
         // local variables
- SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED;
-        SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
-        bool clamp = sdwaInst.CLAMP;
+        const SDWADstVals dst_unusedBits_format =
+            (SDWADstVals)sdwaInst.DST_UNUSED;
+        const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
+        const bool clamp = sdwaInst.CLAMP;

         /**
* STEP 1: select the appropriate bits for dst and pad/sign-extend as diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index bd6e4f4..2789f3e 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -5543,12 +5543,20 @@
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);

+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();

DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register " "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5566,27 +5574,27 @@
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);

-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);

-        src0.readSrc();
-        src1.read();
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
+            }

-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
- vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                }
             }
         }

-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
-        }

         vdst.write();
     }
@@ -5895,12 +5903,20 @@
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);

+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and vdst during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();

DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register " "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5918,26 +5934,23 @@
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);

-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);

-        src0.readSrc();
-        src1.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
             }
-        }

-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+                }
+            }
         }

         vdst.write();
@@ -5995,12 +6008,20 @@
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);

+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();

DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], " "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6018,26 +6039,23 @@
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);

-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);

-        src0.readSrc();
-        src1.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] | src1[lane];
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] | src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
             }
-        }

-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] | src1[lane];
+                }
+            }
         }

         vdst.write();
@@ -6222,12 +6240,20 @@
         VecOperandU32 vdst(gpuDynInst, instData.VDST);
         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();

DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6245,28 +6271,27 @@
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);

-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);

-        src0.readSrc();
-        src1.read();
-
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] + src1[lane];
-                vcc.setBit(lane, ((VecElemU64)src0[lane]
-                    + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] + src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
+ + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
             }
-        }

-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] + src1[lane];
+                    vcc.setBit(lane, ((VecElemU64)src0[lane]
+ + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
+            }
         }

         vcc.write();

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29922
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I3bad83133808dfffc6a4c40bbd49c3d76599e669
Gerrit-Change-Number: 29922
Gerrit-PatchSet: 6
Gerrit-Owner: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Tony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Tuan Ta <q...@cornell.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to