Anthony Gutierrez has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/29946 )
Change subject: arch-gcn3: implement multi-dword buffer loads and stores
......................................................................
arch-gcn3: implement multi-dword buffer loads and stores
Add support for all multi-dword buffer loads and stores:
buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4
Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29946
Maintainer: Anthony Gutierrez <anthony.gutier...@amd.com>
Tested-by: kokoro <noreply+kok...@google.com>
Reviewed-by: Matt Sinclair <mattdsincl...@gmail.com>
---
M src/arch/gcn3/insts/instructions.cc
M src/arch/gcn3/insts/op_encodings.hh
2 files changed, 504 insertions(+), 18 deletions(-)
Approvals:
Matt Sinclair: Looks good to me, approved
Anthony Gutierrez: Looks good to me, approved
kokoro: Regressions pass
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 817b339..b852281 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34777,7 +34777,11 @@
{
setFlag(MemoryRef);
setFlag(Load);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_LOAD_DWORDX2
Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
@@ -34788,17 +34792,88 @@
void
Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr on the operands, then queue the load.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+ rsrcDesc.read();
+ offset.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdLmReqsInPipe--;
+ wf->outstandingReqsRdLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemRead<2>(gpuDynInst); // ReadReq, N = 2; oob lanes masked off
} // initiateAcc
void
Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
{
+ VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+ VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+ // Active lanes copy 2 dwords from d_data, or 0 if flagged in oobMask.
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ if (!oobMask[lane]) {
+ vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 2];
+ vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 2 + 1];
+ } else {
+ vdst0[lane] = 0;
+ vdst1[lane] = 0;
+ }
+ }
+ }
+
+ vdst0.write();
+ vdst1.write();
} // completeAcc
Inst_MUBUF__BUFFER_LOAD_DWORDX3
@@ -34807,7 +34882,11 @@
{
setFlag(MemoryRef);
setFlag(Load);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_LOAD_DWORDX3
Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
@@ -34818,17 +34897,93 @@
void
Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr on the operands, then queue the load.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+ rsrcDesc.read();
+ offset.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdLmReqsInPipe--;
+ wf->outstandingReqsRdLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemRead<3>(gpuDynInst); // ReadReq, N = 3; oob lanes masked off
} // initiateAcc
void
Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
{
+ VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+ VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+ VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
+ // Active lanes copy 3 dwords from d_data, or 0 if flagged in oobMask.
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ if (!oobMask[lane]) {
+ vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 3];
+ vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 3 + 1];
+ vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 3 + 2];
+ } else {
+ vdst0[lane] = 0;
+ vdst1[lane] = 0;
+ vdst2[lane] = 0;
+ }
+ }
+ }
+
+ vdst0.write();
+ vdst1.write();
+ vdst2.write();
} // completeAcc
Inst_MUBUF__BUFFER_LOAD_DWORDX4
@@ -34837,7 +34992,11 @@
{
setFlag(MemoryRef);
setFlag(Load);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_LOAD_DWORDX4
Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4()
@@ -34848,17 +35007,98 @@
void
Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr on the operands, then queue the load.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+ rsrcDesc.read();
+ offset.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdLmReqsInPipe--;
+ wf->outstandingReqsRdLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->rdGmReqsInPipe--;
+ wf->outstandingReqsRdGm++;
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemRead<4>(gpuDynInst); // ReadReq, N = 4; oob lanes masked off
} // initiateAcc
void
Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
{
+ VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+ VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+ VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
+ VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3);
+ // Active lanes copy 4 dwords from d_data, or 0 if flagged in oobMask.
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ if (!oobMask[lane]) {
+ vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 4];
+ vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 4 + 1];
+ vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 4 + 2];
+ vdst3[lane] = (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * 4 + 3];
+ } else {
+ vdst0[lane] = 0;
+ vdst1[lane] = 0;
+ vdst2[lane] = 0;
+ vdst3[lane] = 0;
+ }
+ }
+ }
+
+ vdst0.write();
+ vdst1.write();
+ vdst2.write();
+ vdst3.write();
} // completeAcc
Inst_MUBUF__BUFFER_STORE_BYTE
@@ -35155,7 +35395,11 @@
{
setFlag(MemoryRef);
setFlag(Store);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_STORE_DWORDX2
Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2()
@@ -35166,12 +35410,77 @@
void
Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr, queue the store, then stage its data.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+ ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+ ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+
+ rsrcDesc.read();
+ offset.read();
+ data0.read();
+ data1.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrLmReqsInPipe--;
+ wf->outstandingReqsWrLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ }
+ // Pack 2 dwords per lane (stride 2, as LOAD_DWORDX2 unpacks them).
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
+ = data0[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*2 + 1]
+ = data1[lane];
+ }
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemWrite<2>(gpuDynInst); // WriteReq, N = 2; oob lanes masked off
} // initiateAcc
void
@@ -35185,7 +35494,11 @@
{
setFlag(MemoryRef);
setFlag(Store);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_STORE_DWORDX3
Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3()
@@ -35196,12 +35509,81 @@
void
Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr, queue the store, then stage its data.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+ ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+ ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+ ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
+
+ rsrcDesc.read();
+ offset.read();
+ data0.read();
+ data1.read();
+ data2.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrLmReqsInPipe--;
+ wf->outstandingReqsWrLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ }
+ // Pack 3 dwords per lane (stride 3, as LOAD_DWORDX3 unpacks them).
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 3]
+ = data0[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*3 + 1]
+ = data1[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*3 + 2]
+ = data2[lane];
+ }
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemWrite<3>(gpuDynInst); // WriteReq, N = 3; oob lanes masked off
} // initiateAcc
void
@@ -35215,7 +35597,11 @@
{
setFlag(MemoryRef);
setFlag(Store);
- setFlag(GlobalSegment);
+ if (instData.LDS) {
+ setFlag(GroupSegment);
+ } else {
+ setFlag(GlobalSegment);
+ }
} // Inst_MUBUF__BUFFER_STORE_DWORDX4
Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4()
@@ -35226,12 +35612,85 @@
void
Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
{
- panicUnimplemented();
- }
+ Wavefront *wf = gpuDynInst->wavefront();
+ gpuDynInst->execUnitId = wf->execUnitId;
+ gpuDynInst->exec_mask = wf->execMask();
+ gpuDynInst->latency.init(gpuDynInst->computeUnit());
+ gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+ // Issue stage: run calcAddr, queue the store, then stage its data.
+ ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+ ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+ ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+ ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+ ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+ ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+ ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
+ ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3);
+
+ rsrcDesc.read();
+ offset.read();
+ data0.read();
+ data1.read();
+ data2.read();
+ data3.read();
+
+ int inst_offset = instData.OFFSET;
+ // IDXEN/OFFEN pick the VGPRs read; IDXEN swaps calcAddr's operands.
+ if (!instData.IDXEN && !instData.OFFEN) {
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (!instData.IDXEN && instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr0, addr1, rsrcDesc, offset, inst_offset);
+ } else if (instData.IDXEN && !instData.OFFEN) {
+ addr0.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ } else {
+ addr0.read();
+ addr1.read();
+ calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+ ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+ addr1, addr0, rsrcDesc, offset, inst_offset);
+ }
+ // Local or global pipe; in-pipe count drops, outstanding rises.
+ if (isLocalMem()) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrLmReqsInPipe--;
+ wf->outstandingReqsWrLm++;
+ } else {
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .issueRequest(gpuDynInst);
+ wf->wrGmReqsInPipe--;
+ wf->outstandingReqsWrGm++;
+ }
+ // Pack 4 dwords per lane (stride 4, as LOAD_DWORDX4 unpacks them).
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4]
+ = data0[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1]
+ = data1[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2]
+ = data2[lane];
+ (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 3]
+ = data3[lane];
+ }
+ }
+
+ wf->outstandingReqs++;
+ wf->validateRequestCounters();
+ } // execute
void
Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
{
+ initMemWrite<4>(gpuDynInst); // WriteReq, N = 4; oob lanes masked off
} // initiateAcc
void
diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 4f151b9..4056f0a 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -505,6 +505,20 @@
gpuDynInst->exec_mask = old_exec_mask;
}
+
+ template<int N>
+ void
+ initMemRead(GPUDynInstPtr gpuDynInst)
+ {
+ // Issue ReadReqs via initMemReqHelper<VecElemU32, N>. exec_mask is
+ // temporarily narrowed to suppress accesses to oob regions: only
+ // lanes with exec_mask set and oobMask clear issue requests.
+ VectorMask old_exec_mask = gpuDynInst->exec_mask;
+ gpuDynInst->exec_mask &= ~oobMask;
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+ gpuDynInst->exec_mask = old_exec_mask;
+ }
+
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
@@ -518,6 +532,19 @@
gpuDynInst->exec_mask = old_exec_mask;
}
+ template<int N>
+ void
+ initMemWrite(GPUDynInstPtr gpuDynInst)
+ {
+ // Issue WriteReqs via initMemReqHelper<VecElemU32, N>. exec_mask is
+ // temporarily narrowed to suppress accesses to oob regions: only
+ // lanes with exec_mask set and oobMask clear issue requests.
+ VectorMask old_exec_mask = gpuDynInst->exec_mask;
+ gpuDynInst->exec_mask &= ~oobMask;
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+ gpuDynInst->exec_mask = old_exec_mask;
+ }
+
void
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
{
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29946
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
Gerrit-Change-Number: 29946
Gerrit-PatchSet: 7
Gerrit-Owner: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Tony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Tuan Ta <q...@cornell.edu>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s