[gem5-dev] [M] Change in gem5/gem5[develop]: arch-vega: Fix several issues with DPP
Matthew Poremba has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/66752?usp=email ) ( 2 is the latest approved patch-set. No files were changed between the latest approved patch-set and the submitted one. )Change subject: arch-vega: Fix several issues with DPP .. arch-vega: Fix several issues with DPP DPP processing has several issues which are fixed in this changeset: 1) Incorrect comment is updated 2) newLane calculation for shift/rotate instructions is corrected 3) A copy of original data is made so that a copy of a copy is not made 4) Reset all booleans (OOB, zeroSrc, laneDisabled) after each lane iteration The shift, rotate, and broadcast variants were tested by implementing them in assembly and running on silicon. Change-Id: If86fbb26c87eaca4ef0587fd846978115858b168 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66752 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair --- M src/arch/amdgpu/vega/insts/inst_util.hh 1 file changed, 58 insertions(+), 23 deletions(-) Approvals: Matt Sinclair: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 01925f9..7ec2e2d 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -303,9 +303,9 @@ * Currently the values are: * 0x0 - 0xFF: full permute of four threads * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads + * 0x101 - 0x10F: row shift left by 1-15 threads * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads + * 0x121 - 0x12F: row rotate right by 1-15 threads * 0x130: wavefront left shift by 1 thread * 0x134: wavefront left rotate by 1 thread * 0x138: wavefront right shift by 1 thread @@ -322,7 +322,8 @@ // newLane will be the same as the input lane unless swizzling happens int newLane = currLane; // for shift/rotate permutations; positive values are LEFT rotates -int count = 1; +// shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0) +int count = 0; int localRowOffset = rowOffset; int localRowNum = rowNum; @@ -335,51 +336,47 @@ panic("ERROR: instruction using reserved DPP_CTRL value\n"); } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} -count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); +count = (dppCtrl - SQ_DPP_ROW_SL1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} -count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); +count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_RR1) && (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} -count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); +count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1); localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 -count = 1; if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { -newLane += count; +newLane += 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 -count = 1; -newLane = (currLane + count + NumVecElemPerVecReg) % +newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 -count = -1; -int currVal = (currLane + count); +int currVal = (currLane - 1); if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { -newLane += count; +newLane -= 1; }
[gem5-dev] [M] Change in gem5/gem5[develop]: arch-vega: Fix several issues with DPP
Matthew Poremba has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/66752?usp=email ) Change subject: arch-vega: Fix several issues with DPP .. arch-vega: Fix several issues with DPP DPP processing has several issues which are fixed in this changeset: 1) Incorrect comment is updated 2) newLane calculation for shift/rotate instructions is corrected 3) A copy of original data is made so that a copy of a copy is not made 4) Reset all booleans (OOB, zeroSrc, laneDisabled) after each lane iteration The shift, rotate, and broadcast variants were tested by implementing them in assembly and running on silicon. Change-Id: If86fbb26c87eaca4ef0587fd846978115858b168 --- M src/arch/amdgpu/vega/insts/inst_util.hh 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 01925f9..10bb629 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -303,9 +303,9 @@ * Currently the values are: * 0x0 - 0xFF: full permute of four threads * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads + * 0x101 - 0x10F: row shift left by 1-15 threads * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads + * 0x121 - 0x12F: row rotate right by 1-15 threads * 0x130: wavefront left shift by 1 thread * 0x134: wavefront left rotate by 1 thread * 0x138: wavefront right shift by 1 thread @@ -322,7 +322,8 @@ // newLane will be the same as the input lane unless swizzling happens int newLane = currLane; // for shift/rotate permutations; positive values are LEFT rotates -int count = 1; +// shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0) +int count = 0; int localRowOffset = rowOffset; int localRowNum = rowNum; @@ -335,51 +336,47 @@ panic("ERROR: instruction using reserved DPP_CTRL value\n"); } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} -count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); +count = (dppCtrl - SQ_DPP_ROW_SL1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} -count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); +count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_RR1) && (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} -count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); +count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1); localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; -newLane = (rowNum | localRowOffset); +newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 -count = 1; if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { -newLane += count; +newLane += 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 -count = 1; -newLane = (currLane + count + NumVecElemPerVecReg) % +newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 -count = -1; -int currVal = (currLane + count); +int currVal = (currLane - 1); if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { -newLane += count; +newLane -= 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1 -count = -1; -newLane = (currLane + count + NumVecElemPerVecReg) % +newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { //