[gem5-dev] [M] Change in gem5/gem5[develop]: arch-vega: Helper methods for SDWA/DPP for VOP2

2023-06-15 Thread Matthew Poremba (Gerrit) via gem5-dev
Matthew Poremba has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/70738?usp=email )


Change subject: arch-vega: Helper methods for SDWA/DPP for VOP2
..

arch-vega: Helper methods for SDWA/DPP for VOP2

Many of the outstanding issues with the GPU model are related to
instructions not having SDWA/DPP implementations and executing by
ignoring the special registers leading to incorrect executiong.
Adding SDWA/DPP is current very cumbersome as there is a lot of
boilerplate code.

This changeset adds helper methods for VOP2 with one instruction
changed as an example. This review is intended to get feedback
before applying this change to all VOP2 instructions that support
SDWA/DPP.

Change-Id: I1edbc3f3bb166d34f151545aa9f47a94150e1406
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70738
Reviewed-by: Matt Sinclair 
Maintainer: Matt Sinclair 
Tested-by: kokoro 
---
M src/arch/amdgpu/vega/insts/instructions.cc
M src/arch/amdgpu/vega/insts/op_encodings.hh
2 files changed, 97 insertions(+), 52 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass




diff --git a/src/arch/amdgpu/vega/insts/instructions.cc  
b/src/arch/amdgpu/vega/insts/instructions.cc

index 6c014bc..0d3f2dc 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -6384,65 +6384,17 @@
 void
 Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
 {
-Wavefront *wf = gpuDynInst->wavefront();
-ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
-VecOperandU32 src1(gpuDynInst, instData.VSRC1);
-VecOperandU32 vdst(gpuDynInst, instData.VDST);
-
-src0.readSrc();
-src1.read();
-
-if (isSDWAInst()) {
-VecOperandU32 src0_sdwa(gpuDynInst,  
extData.iFmt_VOP_SDWA.SRC0);

-// use copies of original src0, src1, and dest during selecting
-VecOperandU32 origSrc0_sdwa(gpuDynInst,
-extData.iFmt_VOP_SDWA.SRC0);
-VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
-VecOperandU32 origVdst(gpuDynInst, instData.VDST);
-
-src0_sdwa.read();
-origSrc0_sdwa.read();
-origSrc1.read();
-
-DPRINTF(VEGA, "Handling V_MUL_U32_U24 SRC SDWA. SRC0:  
register "

-"v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
-"%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d,  
SRC1_SEL: "

-"%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
-extData.iFmt_VOP_SDWA.SRC0,  
extData.iFmt_VOP_SDWA.DST_SEL,

-extData.iFmt_VOP_SDWA.DST_U,
-extData.iFmt_VOP_SDWA.CLMP,
-extData.iFmt_VOP_SDWA.SRC0_SEL,
-extData.iFmt_VOP_SDWA.SRC0_SEXT,
-extData.iFmt_VOP_SDWA.SRC0_NEG,
-extData.iFmt_VOP_SDWA.SRC0_ABS,
-extData.iFmt_VOP_SDWA.SRC1_SEL,
-extData.iFmt_VOP_SDWA.SRC1_SEXT,
-extData.iFmt_VOP_SDWA.SRC1_NEG,
-extData.iFmt_VOP_SDWA.SRC1_ABS);
-
-processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa,  
origSrc0_sdwa,

-src1, origSrc1);
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (wf->execMask(lane)) {
-vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
- bits(src1[lane], 23, 0);
-origVdst[lane] = vdst[lane]; // keep copy consistent
-}
-}
-
-processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
-} else {
+auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
+ VecOperandU32& vdst, Wavefront* wf) {
 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
 vdst[lane] = bits(src0[lane], 23, 0) *
  bits(src1[lane], 23, 0);
 }
 }
-}
+};

-
-vdst.write();
+vop2Helper(gpuDynInst, opImpl);
 } // execute
 // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh  
b/src/arch/amdgpu/vega/insts/op_encodings.hh

index 1071ead..f195472 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -272,6 +272,99 @@
 InstFormat extData;
 uint32_t varSize;

+template
+T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
+{
+T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
+// use copies of original src0, src1, and dest during selecting
+ 

[gem5-dev] [M] Change in gem5/gem5[develop]: arch-vega: Helper methods for SDWA/DPP for VOP2

2023-05-17 Thread Matthew Poremba (Gerrit) via gem5-dev
Matthew Poremba has uploaded this change for review. (  
https://gem5-review.googlesource.com/c/public/gem5/+/70738?usp=email )



Change subject: arch-vega: Helper methods for SDWA/DPP for VOP2
..

arch-vega: Helper methods for SDWA/DPP for VOP2

Many of the outstanding issues with the GPU model are related to
instructions not having SDWA/DPP implementations and executing by
ignoring the special registers leading to incorrect executiong.
Adding SDWA/DPP is current very cumbersome as there is a lot of
boilerplate code.

This changeset adds helper methods for VOP2 with one instruction
changed as an example. This review is intended to get feedback
before applying this change to all VOP2 instructions that support
SDWA/DPP.

Change-Id: I1edbc3f3bb166d34f151545aa9f47a94150e1406
---
M src/arch/amdgpu/vega/insts/instructions.cc
M src/arch/amdgpu/vega/insts/op_encodings.hh
2 files changed, 97 insertions(+), 52 deletions(-)



diff --git a/src/arch/amdgpu/vega/insts/instructions.cc  
b/src/arch/amdgpu/vega/insts/instructions.cc

index 6c014bc..0d3f2dc 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -6384,65 +6384,17 @@
 void
 Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
 {
-Wavefront *wf = gpuDynInst->wavefront();
-ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
-VecOperandU32 src1(gpuDynInst, instData.VSRC1);
-VecOperandU32 vdst(gpuDynInst, instData.VDST);
-
-src0.readSrc();
-src1.read();
-
-if (isSDWAInst()) {
-VecOperandU32 src0_sdwa(gpuDynInst,  
extData.iFmt_VOP_SDWA.SRC0);

-// use copies of original src0, src1, and dest during selecting
-VecOperandU32 origSrc0_sdwa(gpuDynInst,
-extData.iFmt_VOP_SDWA.SRC0);
-VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
-VecOperandU32 origVdst(gpuDynInst, instData.VDST);
-
-src0_sdwa.read();
-origSrc0_sdwa.read();
-origSrc1.read();
-
-DPRINTF(VEGA, "Handling V_MUL_U32_U24 SRC SDWA. SRC0:  
register "

-"v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
-"%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d,  
SRC1_SEL: "

-"%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
-extData.iFmt_VOP_SDWA.SRC0,  
extData.iFmt_VOP_SDWA.DST_SEL,

-extData.iFmt_VOP_SDWA.DST_U,
-extData.iFmt_VOP_SDWA.CLMP,
-extData.iFmt_VOP_SDWA.SRC0_SEL,
-extData.iFmt_VOP_SDWA.SRC0_SEXT,
-extData.iFmt_VOP_SDWA.SRC0_NEG,
-extData.iFmt_VOP_SDWA.SRC0_ABS,
-extData.iFmt_VOP_SDWA.SRC1_SEL,
-extData.iFmt_VOP_SDWA.SRC1_SEXT,
-extData.iFmt_VOP_SDWA.SRC1_NEG,
-extData.iFmt_VOP_SDWA.SRC1_ABS);
-
-processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa,  
origSrc0_sdwa,

-src1, origSrc1);
-
-for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-if (wf->execMask(lane)) {
-vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
- bits(src1[lane], 23, 0);
-origVdst[lane] = vdst[lane]; // keep copy consistent
-}
-}
-
-processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
-} else {
+auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
+ VecOperandU32& vdst, Wavefront* wf) {
 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 if (wf->execMask(lane)) {
 vdst[lane] = bits(src0[lane], 23, 0) *
  bits(src1[lane], 23, 0);
 }
 }
-}
+};

-
-vdst.write();
+vop2Helper(gpuDynInst, opImpl);
 } // execute
 // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh  
b/src/arch/amdgpu/vega/insts/op_encodings.hh

index 1071ead..f195472 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -272,6 +272,99 @@
 InstFormat extData;
 uint32_t varSize;

+template
+T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
+{
+T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
+// use copies of original src0, src1, and dest during selecting
+T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
+T origSrc1(gpuDynInst, instData.VSRC1);
+
+src0_sdwa.read();
+origSrc0_sdwa.read();
+origSrc1.read();
+
+DPRINTF(VEGA,