[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle

2020-07-17 Thread Anthony Gutierrez (Gerrit) via gem5-dev
Anthony Gutierrez has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/29958 )


Change subject: arch-gcn3: Implement ds_swizzle
..

arch-gcn3: Implement ds_swizzle

Change-Id: I7d188388afa16932217ae207368666a724207c52
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29958
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
Reviewed-by: Matt Sinclair 
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 102 insertions(+), 2 deletions(-)

Approvals:
  Matt Sinclair: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 71efd8f..002c4d5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32266,6 +32266,7 @@
 Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_swizzle_b32")
 {
+ setFlag(Load);
 } // Inst_DS__DS_SWIZZLE_B32

 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
@@ -32277,8 +32278,107 @@
 void
 Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+
+if (gpuDynInst->exec_mask.none()) {
+return;
+}
+
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+/**
+ * The "DS pattern" is comprised of both offset fields. That is, the
+ * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
+ * which swizzle mode to use. There are two different swizzle
+ * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
+ * QDMode else use Bit-masks mode. The remaining bits dictate how to
+ * swizzle the lanes.
+ *
+ * QDMode:  Chunks the lanes into 4s and swizzles among them.
+ *  Bits 7:6 dictate where lane 3 (of the current chunk)
+ *  gets its data, 5:4 lane 2, etc.
+ *
+ * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks.
+ *  14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
+ *  is the and_mask. Each lane is swizzled by performing
+ *  the appropriate operation using these masks.
+ */
+VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) |  
instData.OFFSET0);

+
+data.read();
+
+if (bits(ds_pattern, 15)) {
+// QDMode
+for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
+/**
+ * This operation allows data sharing between groups
+ * of four consecutive threads. Note the increment by
+ * 4 in the for loop.
+ */
+if (gpuDynInst->exec_mask[lane]) {
+int index0 = lane + bits(ds_pattern, 1, 0);
+panic_if(index0 >= NumVecElemPerVecReg, "%s: index0  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index0);
+vdst[lane]
+= gpuDynInst->exec_mask[index0] ? data[index0]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 1]) {
+int index1 = lane + bits(ds_pattern, 3, 2);
+panic_if(index1 >= NumVecElemPerVecReg, "%s: index1  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index1);
+vdst[lane + 1]
+= gpuDynInst->exec_mask[index1] ? data[index1]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 2]) {
+int index2 = lane + bits(ds_pattern, 5, 4);
+panic_if(index2 >= NumVecElemPerVecReg, "%s: index2  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index2);
+vdst[lane + 2]
+= gpuDynInst->exec_mask[index2] ? data[index2]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 3]) {
+int index3 = lane + bits(ds_pattern, 7, 6);
+panic_if(index3 >= NumVecElemPerVecReg, "%s: index3  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index3);
+   

[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle

2020-06-03 Thread Anthony Gutierrez (Gerrit) via gem5-dev

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit

https://gem5-review.googlesource.com/c/public/gem5/+/29958

to review the following change.


Change subject: arch-gcn3: Implement ds_swizzle
..

arch-gcn3: Implement ds_swizzle

Change-Id: I7d188388afa16932217ae207368666a724207c52
---
M src/arch/gcn3/insts/instructions.cc
1 file changed, 102 insertions(+), 2 deletions(-)



diff --git a/src/arch/gcn3/insts/instructions.cc  
b/src/arch/gcn3/insts/instructions.cc

index 71efd8f..002c4d5 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32266,6 +32266,7 @@
 Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
 : Inst_DS(iFmt, "ds_swizzle_b32")
 {
+ setFlag(Load);
 } // Inst_DS__DS_SWIZZLE_B32

 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
@@ -32277,8 +32278,107 @@
 void
 Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
 {
-panicUnimplemented();
-}
+Wavefront *wf = gpuDynInst->wavefront();
+wf->rdLmReqsInPipe--;
+wf->validateRequestCounters();
+
+if (gpuDynInst->exec_mask.none()) {
+return;
+}
+
+gpuDynInst->execUnitId = wf->execUnitId;
+gpuDynInst->latency.init(gpuDynInst->computeUnit());
+gpuDynInst->latency.set(gpuDynInst->computeUnit()
+->cyclesToTicks(Cycles(24)));
+
+ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
+VecOperandU32 vdst(gpuDynInst, extData.VDST);
+/**
+ * The "DS pattern" is comprised of both offset fields. That is, the
+ * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
+ * which swizzle mode to use. There are two different swizzle
+ * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
+ * QDMode else use Bit-masks mode. The remaining bits dictate how to
+ * swizzle the lanes.
+ *
+ * QDMode:  Chunks the lanes into 4s and swizzles among them.
+ *  Bits 7:6 dictate where lane 3 (of the current chunk)
+ *  gets its data, 5:4 lane 2, etc.
+ *
+ * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks.
+ *  14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
+ *  is the and_mask. Each lane is swizzled by performing
+ *  the appropriate operation using these masks.
+ */
+VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) |  
instData.OFFSET0);

+
+data.read();
+
+if (bits(ds_pattern, 15)) {
+// QDMode
+for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
+/**
+ * This operation allows data sharing between groups
+ * of four consecutive threads. Note the increment by
+ * 4 in the for loop.
+ */
+if (gpuDynInst->exec_mask[lane]) {
+int index0 = lane + bits(ds_pattern, 1, 0);
+panic_if(index0 >= NumVecElemPerVecReg, "%s: index0  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index0);
+vdst[lane]
+= gpuDynInst->exec_mask[index0] ? data[index0]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 1]) {
+int index1 = lane + bits(ds_pattern, 3, 2);
+panic_if(index1 >= NumVecElemPerVecReg, "%s: index1  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index1);
+vdst[lane + 1]
+= gpuDynInst->exec_mask[index1] ? data[index1]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 2]) {
+int index2 = lane + bits(ds_pattern, 5, 4);
+panic_if(index2 >= NumVecElemPerVecReg, "%s: index2  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index2);
+vdst[lane + 2]
+= gpuDynInst->exec_mask[index2] ? data[index2]: 0;
+}
+if (gpuDynInst->exec_mask[lane + 3]) {
+int index3 = lane + bits(ds_pattern, 7, 6);
+panic_if(index3 >= NumVecElemPerVecReg, "%s: index3  
(%d) "
+ "is out of bounds.\n",  
gpuDynInst->disassemble(),

+ index3);
+vdst[lane + 3]
+= gpuDynInst->exec_mask[index3] ? data[index3]: 0;
+}
+}
+} else {
+// Bit Mode
+int and_mask =