[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fixing HSA's barrier bit implementation

2020-08-12 Thread Matt Sinclair (Gerrit) via gem5-dev
Matt Sinclair has submitted this change. (  
https://gem5-review.googlesource.com/c/public/gem5/+/30354 )


Change subject: gpu-compute: Fixing HSA's barrier bit implementation
..

gpu-compute: Fixing HSA's barrier bit implementation

This changeset fixes several bugs in the HSA barrier bit implementation.

1. Forces AQL packet launch to wait for completion of all previous packets
2. Enforces barrier bit blocking only if there are packets pending  
completion

3. Barrier bit unblocking is correclty done by the last pending packet
4. Implementing barrier bit for all packets to conform to HSA spec

Change-Id: I62ce589dff57dcde4d64054a1b6ffd962acd5eb8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30354
Reviewed-by: Sooraj Puthoor 
Maintainer: Anthony Gutierrez 
Tested-by: kokoro 
---
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
2 files changed, 100 insertions(+), 34 deletions(-)

Approvals:
  Sooraj Puthoor: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/dev/hsa/hsa_packet_processor.cc  
b/src/dev/hsa/hsa_packet_processor.cc

index 4143019..68cdcf4 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -282,11 +282,11 @@
 }

 void
-HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
 {
 RQLEntry *queue = regdQList[rl_idx];
-if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
-Tick processingTick = curTick() + pktProcessDelay;
+if (!queue->aqlProcessEvent.scheduled()) {
+Tick processingTick = curTick() + delay;
 schedule(queue->aqlProcessEvent, processingTick);
 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at  
tick: %d\n",

 processingTick);
@@ -295,42 +295,48 @@
 }
 }

-bool
+void
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+{
+schedAQLProcessing(rl_idx, pktProcessDelay);
+}
+
+Q_STATE
 HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr  
host_pkt_addr)

 {
-bool is_submitted = false;
+Q_STATE is_submitted = BLOCKED_BPKT;
 SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
 // Dependency signals are not read yet. And this can only be a retry.
 // The retry logic will schedule the packet processor wakeup
 if (dep_sgnl_rd_st->pendingReads != 0) {
-return false;
+return BLOCKED_BPKT;
 }
 // `pkt` can be typecasted to any type of AQL packet since they all
 // have header information at offset zero
 auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
 hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
+if (IS_BARRIER(disp_pkt) &&
+regdQList[rl_idx]->compltnPending() > 0) {
+// If this packet is using the "barrier bit" to enforce ordering  
with

+// previous packets, and if there are outstanding packets, set the
+// barrier bit for this queue and block the queue.
+DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+" list ID = %d\n", __FUNCTION__, rl_idx);
+regdQList[rl_idx]->setBarrierBit(true);
+return BLOCKED_BBIT;
+}
 if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
 DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
 // Submit packet to HSA device (dispatcher)
 hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx,  
host_pkt_addr);

-is_submitted = true;
+is_submitted = UNBLOCKED;
 } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
 DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
 // Submit packet to HSA device (dispatcher)
 hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx,  
host_pkt_addr);

-is_submitted = true;
-/*
-  If this packet is using the "barrier bit" to enforce ordering  
with

-  subsequent kernels, set the bit for this queue now, after
-  dispatching.
-*/
-if (IS_BARRIER(disp_pkt)) {
-DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for  
active" \

-" list ID = %d\n", __FUNCTION__, rl_idx);
-regdQList[rl_idx]->setBarrierBit(true);
-}
+is_submitted = UNBLOCKED;
 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -387,7 +393,7 @@
 // TODO: Completion signal of barrier packet to be
 // atomically decremented here
 finishPkt((void*)bar_and_pkt, rl_idx);
-

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Fixing HSA's barrier bit implementation

2020-06-15 Thread Matt Sinclair (Gerrit) via gem5-dev
Matt Sinclair has uploaded this change for review. (  
https://gem5-review.googlesource.com/c/public/gem5/+/30354 )



Change subject: gpu-compute: Fixing HSA's barrier bit implementation
..

gpu-compute: Fixing HSA's barrier bit implementation

This changeset fixes several bugs in the HSA barrier bit implementation.

1. Forces AQL packet launch to wait for completion of all previous packets
2. Enforces barrier bit blocking only if there are packets pending  
completion

3. Barrier bit unblocking is correclty done by the last pending packet
4. Implementing barrier bit for all packets to conform to HSA spec

Change-Id: I62ce589dff57dcde4d64054a1b6ffd962acd5eb8
---
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
2 files changed, 84 insertions(+), 14 deletions(-)



diff --git a/src/dev/hsa/hsa_packet_processor.cc  
b/src/dev/hsa/hsa_packet_processor.cc

index f9880e4..4183d6d 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -277,11 +277,11 @@
 }

 void
-HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
 {
 RQLEntry *queue = regdQList[rl_idx];
 if (!queue->aqlProcessEvent.scheduled()) {
-Tick processingTick = curTick() + pktProcessDelay;
+Tick processingTick = curTick() + delay;
 schedule(queue->aqlProcessEvent, processingTick);
 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at  
tick: %d\n",

 processingTick);
@@ -290,32 +290,48 @@
 }
 }

-bool
-HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr  
host_pkt_addr)

+void
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
 {
-bool is_submitted = false;
+schedAQLProcessing(rl_idx, pktProcessDelay);
+}
+
+Q_STATE HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx,
+   Addr host_pkt_addr)
+{
+Q_STATE is_submitted = BLOCKED_BPKT;
 SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
 // Dependency signals are not read yet. And this can only be a retry.
 // The retry logic will schedule the packet processor wakeup
 if (dep_sgnl_rd_st->pendingReads != 0) {
-return false;
+return BLOCKED_BPKT;
 }
 // `pkt` can be typecasted to any type of AQL packet since they all
 // have header information at offset zero
 auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
 hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
+if (IS_BARRIER(disp_pkt) &&
+regdQList[rl_idx]->compltnPending() > 0) {
+// If this packet is using the "barrier bit" to enforce ordering  
with

+// previous packets, and if there are outstanding packets, set the
+// barrier bit for this queue and block the queue.
+DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+" list ID = %d\n", __FUNCTION__, rl_idx);
+regdQList[rl_idx]->setBarrierBit(true);
+return BLOCKED_BBIT;
+}
 if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
 DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
 // Submit packet to HSA device (dispatcher)
 hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx,  
host_pkt_addr);

-is_submitted = true;
+is_submitted = UNBLOCKED;
 } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
 DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
 // Submit packet to HSA device (dispatcher)
 hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx,  
host_pkt_addr);

-is_submitted = true;
+is_submitted = UNBLOCKED;
 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -372,7 +388,7 @@
 // TODO: Completion signal of barrier packet to be
 // atomically decremented here
 finishPkt((void*)bar_and_pkt, rl_idx);
-is_submitted = true;
+is_submitted = UNBLOCKED;
 // Reset signal values
 dep_sgnl_rd_st->resetSigVals();
 // The completion signal is connected
@@ -432,6 +448,13 @@
 " dispIdx %d, active list ID = %d\n",
 __FUNCTION__, aqlRingBuffer->rdIdx(),
 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
+// If barrier bit is set, then this wakeup is a dummy wakeup
+// just to model the processing time. Do nothing.
+if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
+DPRINTF(HSAPacketProcessor,
+"Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
+