Hello Tony Gutierrez,
I'd like you to do a code review. Please visit
https://gem5-review.googlesource.com/c/public/gem5/+/29973
to review the following change.
Change subject: gpu-compute, arch-gcn3: Change how waitcnts are implemented
......................................................................
gpu-compute, arch-gcn3: Change how waitcnts are implemented
Use a single counter per memory operation type and increment
it when the instruction issues rather than when it executes.
Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3
---
M src/arch/gcn3/insts/instructions.cc
M src/gpu-compute/global_memory_pipeline.cc
M src/gpu-compute/gpu_dyn_inst.cc
M src/gpu-compute/local_memory_pipeline.cc
M src/gpu-compute/scalar_memory_pipeline.cc
M src/gpu-compute/schedule_stage.cc
M src/gpu-compute/wavefront.cc
M src/gpu-compute/wavefront.hh
8 files changed, 106 insertions(+), 18 deletions(-)
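The counting scheme in one place: each memory instruction increments a
per-category issue counter (vmemInstsIssued, lgkmInstsIssued; an exp
counter is added alongside) when it is issued in the schedule stage, and
decrements it when the access completes in the corresponding memory
pipeline; an s_waitcnt is satisfied once every requested counter is at or
below its threshold. The C++ sketch below is only a minimal illustration
of that idea; MiniWavefront and its members are hypothetical names, not
the gem5 Wavefront API shown in the diff.

// Minimal, self-contained sketch of issue-count based waitcnts.
// MiniWavefront is a hypothetical stand-in for the real Wavefront class.
#include <cassert>

class MiniWavefront
{
  public:
    // Bumped when an instruction is issued (schedule stage)...
    void incVMemInstsIssued() { ++vmemInstsIssued; }
    void incExpInstsIssued() { ++expInstsIssued; }
    void incLGKMInstsIssued() { ++lgkmInstsIssued; }

    // ...and dropped when its memory access completes (memory pipelines).
    void decVMemInstsIssued() { assert(vmemInstsIssued > 0); --vmemInstsIssued; }
    void decExpInstsIssued() { assert(expInstsIssued > 0); --expInstsIssued; }
    void decLGKMInstsIssued() { assert(lgkmInstsIssued > 0); --lgkmInstsIssued; }

    // s_waitcnt semantics: a threshold of -1 means "not requested";
    // otherwise the wave may proceed only when the number of still
    // outstanding instructions in that category is <= the threshold.
    bool
    waitCntsSatisfied(int vmWaitCnt, int expWaitCnt, int lgkmWaitCnt) const
    {
        if (vmWaitCnt != -1 && vmemInstsIssued > vmWaitCnt)
            return false;
        if (expWaitCnt != -1 && expInstsIssued > expWaitCnt)
            return false;
        if (lgkmWaitCnt != -1 && lgkmInstsIssued > lgkmWaitCnt)
            return false;
        return true;
    }

  private:
    int vmemInstsIssued = 0;
    int expInstsIssued = 0;
    int lgkmInstsIssued = 0;
};

Note how this maps onto the diff: flat instructions, whose segment is not
known at issue, increment both the vmem and lgkm counters in
schedule_stage.cc, and once the segment is resolved in gpu_dyn_inst.cc the
counter for the path not taken is decremented again.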
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 9987fad..7c2cf0e 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32565,6 +32565,7 @@
vdst.write();
+ wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
} // execute
@@ -32635,6 +32636,7 @@
vdst.write();
+ wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
} // execute
@@ -39400,6 +39402,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
return;
@@ -39496,6 +39500,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
return;
@@ -39592,6 +39598,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
return;
@@ -39660,6 +39668,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
return;
@@ -39728,6 +39738,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
return;
@@ -39805,6 +39817,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->rdGmReqsInPipe--;
wf->rdLmReqsInPipe--;
}
@@ -39884,6 +39898,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -39952,6 +39968,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -40021,6 +40039,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -40090,6 +40110,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -40159,6 +40181,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -40237,6 +40261,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->wrLmReqsInPipe--;
return;
@@ -40325,6 +40351,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->rdGmReqsInPipe--;
return;
@@ -40425,6 +40453,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->rdGmReqsInPipe--;
return;
@@ -40526,6 +40556,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->rdGmReqsInPipe--;
return;
@@ -40893,6 +40925,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->rdGmReqsInPipe--;
return;
@@ -40995,6 +41029,8 @@
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
+ wf->decVMemInstsIssued();
+ wf->decLGKMInstsIssued();
wf->wrGmReqsInPipe--;
wf->rdGmReqsInPipe--;
return;
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index dcc80f0..9fc515a 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -130,6 +130,7 @@
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
+ w->decVMemInstsIssued();
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 2a49522..03ed689 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -819,6 +819,7 @@
if (executedAs() == Enums::SC_GLOBAL) {
// no transformation for global segment
wavefront()->execUnitId = wavefront()->flatGmUnitId;
+ wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
@@ -838,6 +839,7 @@
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
+ wavefront()->decVMemInstsIssued();
if (isLoad()) {
wavefront()->rdGmReqsInPipe--;
} else if (isStore()) {
@@ -897,6 +899,7 @@
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
+ wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdGmReqsInPipe--;
} else if (isStore()) {
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index df57690..ca090e9 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -76,6 +76,7 @@
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
+ w->decLGKMInstsIssued();
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc
index 35b4ca5..5e4496d 100644
--- a/src/gpu-compute/scalar_memory_pipeline.cc
+++ b/src/gpu-compute/scalar_memory_pipeline.cc
@@ -85,6 +85,7 @@
}
m->completeAcc(m);
+ w->decLGKMInstsIssued();
if (m->isLoad() || m->isAtomic()) {
returnedLoads.pop();
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index fb52b6d..005e6f6 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -135,6 +135,15 @@
// this wave spends in SCH stage.
wf->schCycles++;
addToSchListStalls[j]++;
+ } else {
+ if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
+ wf->incLGKMInstsIssued();
+ } else {
+ wf->incVMemInstsIssued();
+ if (gpu_dyn_inst->isFlat()) {
+ wf->incLGKMInstsIssued();
+ }
+ }
}
}
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index f72cd50..0e737db 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -53,6 +53,7 @@
: SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
maxIbSize(p->max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
+ vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
barId(WFBarrier::InvalidID)
{
lastTrace = 0;
@@ -1253,37 +1254,27 @@
return false;
}
- // If we reach here, that means waitCnt instruction is executed and
- // the waitcnts are set by the execute method. Check if waitcnts are
- // satisfied.
-
- // current number of vector memory ops in flight
- int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm;
-
- // current number of export insts or vector memory writes in flight
- int exp_cnt = outstandingReqsWrGm;
-
- // current number of scalar/LDS memory ops in flight
- // we do not consider GDS/message ops
- int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm +
- scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm;
-
+ /**
+ * If we reach here, that means an s_waitcnt instruction was executed
+ * and the waitcnts are set by the execute method. Check if waitcnts
+ * are satisfied.
+ */
if (vmWaitCnt != -1) {
- if (vm_cnt > vmWaitCnt) {
+ if (vmemInstsIssued > vmWaitCnt) {
// vmWaitCnt not satisfied
return false;
}
}
if (expWaitCnt != -1) {
- if (exp_cnt > expWaitCnt) {
+ if (expInstsIssued > expWaitCnt) {
// expWaitCnt not satisfied
return false;
}
}
if (lgkmWaitCnt != -1) {
- if (lgkm_cnt > lgkmWaitCnt) {
+ if (lgkmInstsIssued > lgkmWaitCnt) {
// lgkmWaitCnt not satisfied
return false;
}
@@ -1355,6 +1346,42 @@
status = S_RUNNING;
}
+void
+Wavefront::incVMemInstsIssued()
+{
+ ++vmemInstsIssued;
+}
+
+void
+Wavefront::incExpInstsIssued()
+{
+ ++expInstsIssued;
+}
+
+void
+Wavefront::incLGKMInstsIssued()
+{
+ ++lgkmInstsIssued;
+}
+
+void
+Wavefront::decVMemInstsIssued()
+{
+ --vmemInstsIssued;
+}
+
+void
+Wavefront::decExpInstsIssued()
+{
+ --expInstsIssued;
+}
+
+void
+Wavefront::decLGKMInstsIssued()
+{
+ --lgkmInstsIssued;
+}
+
Addr
Wavefront::pc() const
{
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index e07af0e..34e45fa 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -304,6 +304,13 @@
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
+ void incVMemInstsIssued();
+ void incExpInstsIssued();
+ void incLGKMInstsIssued();
+ void decVMemInstsIssued();
+ void decExpInstsIssued();
+ void decLGKMInstsIssued();
+
/** Freeing VRF space */
void freeRegisterFile();
@@ -343,6 +350,9 @@
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
+ int vmemInstsIssued;
+ int expInstsIssued;
+ int lgkmInstsIssued;
status_e status;
Addr _pc;
VectorMask _execMask;
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/29973
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3
Gerrit-Change-Number: 29973
Gerrit-PatchSet: 1
Gerrit-Owner: Anthony Gutierrez <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]