Kyle Roarty has uploaded this change for review. (
https://gem5-review.googlesource.com/c/public/gem5/+/48343 )
Change subject: arch-gcn3: Implement LDS accesses in Flat instructions
......................................................................
arch-gcn3: Implement LDS accesses in Flat instructions
Add support for LDS accesses by allowing Flat instructions to dispatch
into the local memory pipeline if the requested address is in the group
aperture.
This requires implementing LDS accesses in the Flat initMemRead/Write
functions, in a similar fashion to the DS functions of the same name.
Because we can now potentially dispatch to the local memory pipeline,
this change also adds a check to return any tokens that were requested
when the instruction was issued as a flat (global) memory access.
Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab
---
M src/arch/amdgpu/gcn3/insts/instructions.cc
M src/arch/amdgpu/gcn3/insts/op_encodings.hh
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/local_memory_pipeline.cc
4 files changed, 156 insertions(+), 6 deletions(-)
diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc
b/src/arch/amdgpu/gcn3/insts/instructions.cc
index 79af7ac..95af790 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -39384,6 +39384,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39448,6 +39451,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39511,6 +39517,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39603,6 +39612,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39667,6 +39679,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39731,6 +39746,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39804,6 +39822,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39889,6 +39910,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -39952,6 +39976,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40015,6 +40042,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40079,6 +40109,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40151,6 +40184,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40227,6 +40263,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40294,6 +40333,9 @@
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40408,6 +40450,9 @@
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40492,6 +40537,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40576,6 +40624,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40834,6 +40885,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -40918,6 +40972,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -41044,6 +41101,9 @@
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -41129,6 +41189,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -41215,6 +41278,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -41483,6 +41549,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
@@ -41570,6 +41639,9 @@
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
}
diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh
b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
index a061285..27b9b99 100644
--- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh
+++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
@@ -799,35 +799,107 @@
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+ if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ Wavefront *wf = gpuDynInst->wavefront();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ Addr vaddr = gpuDynInst->addr[lane];
+ (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
+ = wf->ldsChunk->read<T>(vaddr);
+ }
+ }
+ }
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+ if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+ initMemReqHelper<VecElemU32, N>(gpuDynInst,
MemCmd::ReadReq);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ Wavefront *wf = gpuDynInst->wavefront();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ Addr vaddr = gpuDynInst->addr[lane];
+ for (int i = 0; i < N; ++i) {
+ (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * N + i]
+ = wf->ldsChunk->read<VecElemU32>(
+ vaddr + i*sizeof(VecElemU32));
+ }
+ }
+ }
+ }
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+ if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ Wavefront *wf = gpuDynInst->wavefront();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ Addr vaddr = gpuDynInst->addr[lane];
+ wf->ldsChunk->write<T>(vaddr,
+
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
+ }
+ }
+ }
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+ if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+ initMemReqHelper<VecElemU32, N>(gpuDynInst,
MemCmd::WriteReq);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ Wavefront *wf = gpuDynInst->wavefront();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ Addr vaddr = gpuDynInst->addr[lane];
+ for (int i = 0; i < N; ++i) {
+ wf->ldsChunk->write<VecElemU32>(
+ vaddr + i*sizeof(VecElemU32),
+ (reinterpret_cast<VecElemU32*>(
+ gpuDynInst->d_data))[lane * N + i]);
+ }
+ }
+ }
+ }
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
- initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+ if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+ } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+ Wavefront *wf = gpuDynInst->wavefront();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ Addr vaddr = gpuDynInst->addr[lane];
+ AtomicOpFunctor* amo_op =
+ gpuDynInst->makeAtomicOpFunctor<T>(
+ &(reinterpret_cast<T*>(
+ gpuDynInst->a_data))[lane],
+ &(reinterpret_cast<T*>(
+ gpuDynInst->x_data))[lane]).get();
+
+ T tmp = wf->ldsChunk->read<T>(vaddr);
+ (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
+ wf->ldsChunk->write<T>(vaddr, tmp);
+ (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] =
tmp;
+ }
+ }
+ }
}
void
diff --git a/src/gpu-compute/gpu_dyn_inst.cc
b/src/gpu-compute/gpu_dyn_inst.cc
index fb9bf07..d98dce0 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -834,7 +834,8 @@
if (mask[lane]) {
// flat address calculation goes here.
// addr[lane] = segmented address
- panic("Flat group memory operation is unimplemented!\n");
+ addr[lane] = addr[lane] -
+ wavefront()->computeUnit->shader->ldsApe().base;
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
diff --git a/src/gpu-compute/local_memory_pipeline.cc
b/src/gpu-compute/local_memory_pipeline.cc
index 995ea75..c99be00 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -76,6 +76,11 @@
lmReturnedRequests.pop();
w = m->wavefront();
+ if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel()
+ && m->allLanesZero()) {
+ computeUnit.getTokenManager()->recvTokens(1);
+ }
+
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem
instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/48343
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: release-staging-v21-1
Gerrit-Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab
Gerrit-Change-Number: 48343
Gerrit-PatchSet: 1
Gerrit-Owner: Kyle Roarty <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s