[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle
Anthony Gutierrez has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/29958 ) Change subject: arch-gcn3: Implement ds_swizzle .. arch-gcn3: Implement ds_swizzle Change-Id: I7d188388afa16932217ae207368666a724207c52 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29958 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 102 insertions(+), 2 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved Anthony Gutierrez: Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 71efd8f..002c4d5 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32266,6 +32266,7 @@ Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_swizzle_b32") { + setFlag(Load); } // Inst_DS__DS_SWIZZLE_B32 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() @@ -32277,8 +32278,107 @@ void Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); + +if (gpuDynInst->exec_mask.none()) { +return; +} + +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); + +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); +/** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. 
+ * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its data, 5:4 lane 2, etc. + * + * Bit-mask:This mode breaks bits 14:0 into 3 equal-sized chunks. + * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ +VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + +data.read(); + +if (bits(ds_pattern, 15)) { +// QDMode +for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { +/** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. + */ +if (gpuDynInst->exec_mask[lane]) { +int index0 = lane + bits(ds_pattern, 1, 0); +panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); +vdst[lane] += gpuDynInst->exec_mask[index0] ? data[index0]: 0; +} +if (gpuDynInst->exec_mask[lane + 1]) { +int index1 = lane + bits(ds_pattern, 3, 2); +panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); +vdst[lane + 1] += gpuDynInst->exec_mask[index1] ? data[index1]: 0; +} +if (gpuDynInst->exec_mask[lane + 2]) { +int index2 = lane + bits(ds_pattern, 5, 4); +panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); +vdst[lane + 2] += gpuDynInst->exec_mask[index2] ? data[index2]: 0; +} +if (gpuDynInst->exec_mask[lane + 3]) { +int index3 = lane + bits(ds_pattern, 7, 6); +panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); +
[gem5-dev] Change in gem5/gem5[develop]: arch-gcn3: Implement ds_swizzle
Hello Tony Gutierrez, I'd like you to do a code review. Please visit https://gem5-review.googlesource.com/c/public/gem5/+/29958 to review the following change. Change subject: arch-gcn3: Implement ds_swizzle .. arch-gcn3: Implement ds_swizzle Change-Id: I7d188388afa16932217ae207368666a724207c52 --- M src/arch/gcn3/insts/instructions.cc 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 71efd8f..002c4d5 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32266,6 +32266,7 @@ Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_swizzle_b32") { + setFlag(Load); } // Inst_DS__DS_SWIZZLE_B32 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() @@ -32277,8 +32278,107 @@ void Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) { -panicUnimplemented(); -} +Wavefront *wf = gpuDynInst->wavefront(); +wf->rdLmReqsInPipe--; +wf->validateRequestCounters(); + +if (gpuDynInst->exec_mask.none()) { +return; +} + +gpuDynInst->execUnitId = wf->execUnitId; +gpuDynInst->latency.init(gpuDynInst->computeUnit()); +gpuDynInst->latency.set(gpuDynInst->computeUnit() +->cyclesToTicks(Cycles(24))); + +ConstVecOperandU32 data(gpuDynInst, extData.DATA0); +VecOperandU32 vdst(gpuDynInst, extData.VDST); +/** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. + * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its data, 5:4 lane 2, etc. + * + * Bit-mask:This mode breaks bits 14:0 into 3 equal-sized chunks. 
+ * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ +VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + +data.read(); + +if (bits(ds_pattern, 15)) { +// QDMode +for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { +/** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. + */ +if (gpuDynInst->exec_mask[lane]) { +int index0 = lane + bits(ds_pattern, 1, 0); +panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); +vdst[lane] += gpuDynInst->exec_mask[index0] ? data[index0]: 0; +} +if (gpuDynInst->exec_mask[lane + 1]) { +int index1 = lane + bits(ds_pattern, 3, 2); +panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); +vdst[lane + 1] += gpuDynInst->exec_mask[index1] ? data[index1]: 0; +} +if (gpuDynInst->exec_mask[lane + 2]) { +int index2 = lane + bits(ds_pattern, 5, 4); +panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); +vdst[lane + 2] += gpuDynInst->exec_mask[index2] ? data[index2]: 0; +} +if (gpuDynInst->exec_mask[lane + 3]) { +int index3 = lane + bits(ds_pattern, 7, 6); +panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); +vdst[lane + 3] += gpuDynInst->exec_mask[index3] ? data[index3]: 0; +} +} +} else { +// Bit Mode +int and_mask =