Matt Sinclair has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/30354 )

Change subject: gpu-compute: Fixing HSA's barrier bit implementation
......................................................................

gpu-compute: Fixing HSA's barrier bit implementation

This changeset fixes several bugs in the HSA barrier bit implementation.

1. Forces AQL packet launch to wait for completion of all previous packets
2. Enforces barrier bit blocking only if there are packets pending completion
3. Barrier bit unblocking is correclty done by the last pending packet
4. Implementing barrier bit for all packets to conform to HSA spec

Change-Id: I62ce589dff57dcde4d64054a1b6ffd962acd5eb8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30354
Reviewed-by: Sooraj Puthoor <puthoorsoo...@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutier...@amd.com>
Tested-by: kokoro <noreply+kok...@google.com>
---
M src/dev/hsa/hsa_packet_processor.cc
M src/dev/hsa/hsa_packet_processor.hh
2 files changed, 100 insertions(+), 34 deletions(-)

Approvals:
  Sooraj Puthoor: Looks good to me, approved
  Anthony Gutierrez: Looks good to me, approved
  kokoro: Regressions pass



diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index 4143019..68cdcf4 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -282,11 +282,11 @@
 }

 void
-HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
 {
     RQLEntry *queue = regdQList[rl_idx];
-    if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
-        Tick processingTick = curTick() + pktProcessDelay;
+    if (!queue->aqlProcessEvent.scheduled()) {
+        Tick processingTick = curTick() + delay;
         schedule(queue->aqlProcessEvent, processingTick);
DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
                 processingTick);
@@ -295,42 +295,48 @@
     }
 }

-bool
+void
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+{
+    schedAQLProcessing(rl_idx, pktProcessDelay);
+}
+
+Q_STATE
HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
 {
-    bool is_submitted = false;
+    Q_STATE is_submitted = BLOCKED_BPKT;
     SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
     // Dependency signals are not read yet. And this can only be a retry.
     // The retry logic will schedule the packet processor wakeup
     if (dep_sgnl_rd_st->pendingReads != 0) {
-        return false;
+        return BLOCKED_BPKT;
     }
     // `pkt` can be typecasted to any type of AQL packet since they all
     // have header information at offset zero
     auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
     hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
+    if (IS_BARRIER(disp_pkt) &&
+        regdQList[rl_idx]->compltnPending() > 0) {
+ // If this packet is using the "barrier bit" to enforce ordering with
+        // previous packets, and if there are outstanding packets, set the
+        // barrier bit for this queue and block the queue.
+        DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+                " list ID = %d\n", __FUNCTION__, rl_idx);
+        regdQList[rl_idx]->setBarrierBit(true);
+        return BLOCKED_BBIT;
+    }
     if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
         DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
         // Submit packet to HSA device (dispatcher)
hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
-        is_submitted = true;
+        is_submitted = UNBLOCKED;
     } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
         DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
         // Submit packet to HSA device (dispatcher)
hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
-        is_submitted = true;
-        /*
- If this packet is using the "barrier bit" to enforce ordering with
-          subsequent kernels, set the bit for this queue now, after
-          dispatching.
-        */
-        if (IS_BARRIER(disp_pkt)) {
- DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
-                    " list ID = %d\n", __FUNCTION__, rl_idx);
-            regdQList[rl_idx]->setBarrierBit(true);
-        }
+        is_submitted = UNBLOCKED;
     } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
         DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -387,7 +393,7 @@
             // TODO: Completion signal of barrier packet to be
             // atomically decremented here
             finishPkt((void*)bar_and_pkt, rl_idx);
-            is_submitted = true;
+            is_submitted = UNBLOCKED;
             // Reset signal values
             dep_sgnl_rd_st->resetSigVals();
             // The completion signal is connected
@@ -447,6 +453,13 @@
             " dispIdx %d, active list ID = %d\n",
             __FUNCTION__, aqlRingBuffer->rdIdx(),
             aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
+    // If barrier bit is set, then this wakeup is a dummy wakeup
+    // just to model the processing time. Do nothing.
+    if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
+        DPRINTF(HSAPacketProcessor,
+            "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
+        return;
+    }
     // In the future, we may support batch processing of packets.
     // Then, we can just remove the break statements and the code
     // will support batch processing. That is why we are using a
@@ -456,7 +469,8 @@
DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
                 __FUNCTION__, aqlRingBuffer->dispIdx());
         Addr host_addr = aqlRingBuffer->hostDispAddr();
-        if (hsaPP->processPkt(pkt, rqIdx, host_addr)) {
+        Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
+        if (q_state == UNBLOCKED) {
              aqlRingBuffer->incDispIdx(1);
              DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
                      __FUNCTION__, aqlRingBuffer->dispIdx());
@@ -464,10 +478,20 @@
                  hsaPP->schedAQLProcessing(rqIdx);
              }
              break;
-        } else {
-            // This queue is blocked, scheduled a processing event
+        } else if (q_state == BLOCKED_BPKT) {
+            // This queue is blocked by barrier packet,
+            // schedule a processing event
             hsaPP->schedAQLProcessing(rqIdx);
             break;
+        } else if (q_state == BLOCKED_BBIT) {
+            // This queue is blocked by barrier bit, and processing event
+            // should be scheduled from finishPkt(). However, to elapse
+            // "pktProcessDelay" processing time, let us schedule a dummy
+            // wakeup once which will just wakeup and will do nothing.
+            hsaPP->schedAQLProcessing(rqIdx);
+            break;
+        } else {
+            panic("Unknown queue state\n");
         }
     }
 }
@@ -647,19 +671,23 @@
 {
     HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;

-    // if barrier bit was set, unset it here -- we assume that finishPkt is
-    // only called after the completion of a kernel
-    if (regdQList[rl_idx]->getBarrierBit()) {
+    // if barrier bit was set and this is the last
+    // outstanding packet from that queue,
+    // unset it here
+    if (regdQList[rl_idx]->getBarrierBit() &&
+        regdQList[rl_idx]->isLastOutstandingPkt()) {
         DPRINTF(HSAPacketProcessor,
                 "Unset barrier bit for active list ID %d\n", rl_idx);
         regdQList[rl_idx]->setBarrierBit(false);
-        // if pending kernels in the queue after this kernel, reschedule
-        if (regdQList[rl_idx]->dispPending()) {
-            DPRINTF(HSAPacketProcessor,
- "Rescheduling active list ID %d after unsetting barrier "
-                    "bit\n", rl_idx);
-            schedAQLProcessing(rl_idx);
-        }
+        panic_if(!regdQList[rl_idx]->dispPending(),
+                 "There should be pending kernels in this queue\n");
+        DPRINTF(HSAPacketProcessor,
+                "Rescheduling active list ID %d after unsetting barrier "
+                "bit\n", rl_idx);
+        // Try to schedule wakeup in the next cycle.  There is a minimum
+        // pktProcessDelay for queue wake up. If that processing delay is
+        // elapsed, schedAQLProcessing will wakeup next tick.
+        schedAQLProcessing(rl_idx, 1);
     }

     // If set, then blocked schedule, so need to reschedule
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh
index 3ff7ad2..551d09d 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -52,6 +52,19 @@
 // HSA runtime supports only 5 signals per barrier packet
 #define NumSignalsPerBarrier 5

+// Ideally, each queue should store this status and
+// the processPkt() should make decisions based on that
+// status variable.
+typedef enum {
+    UNBLOCKED = 0, // Unblocked queue, can submit packets.
+    BLOCKED_BBIT,  // Queue blocked by barrier bit.
+                   // Can submit packet packets after
+                   // previous packet completes.
+    BLOCKED_BPKT,  // Queue blocked by barrier packet.
+                   // Can submit packet packets after
+                   // barrier packet completes.
+} Q_STATE;
+
 class HSADevice;
 class HWScheduler;

@@ -151,6 +164,25 @@
return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
      }

+     /**
+      * Packets aren't guaranteed to be completed in-order, and we need
+      * to know when the last packet is finished in order to un-set
+      * the barrier bit. In order to confirm if the packet at _rdIdx
+      * is the last packet, we check if the packets ahead of _rdIdx
+      * are finished. If they are, _rdIdx is the last packet. If not,
+      * there are other outstanding packets.
+      */
+     bool
+     isLastOutstandingPkt() const
+     {
+       for (int i = _rdIdx + 1; i < _dispIdx; i++) {
+         if (!_aqlComplete[i % _aqlBuf.size()]) {
+           return false;
+         }
+       }
+       return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
+     }
+
      uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
      uint32_t numObjs() const { return _aqlBuf.size(); };
@@ -162,7 +194,7 @@
      void incRdIdx(uint64_t value) { _rdIdx += value; }
      void incWrIdx(uint64_t value) { _wrIdx += value; }
      void incDispIdx(uint64_t value) { _dispIdx += value; }
-
+     uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
 };

 typedef struct QueueContext {
@@ -233,10 +265,15 @@
             : aqlProcessEvent(hsaPP, rqIdx) {}
         QCntxt qCntxt;
         bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
+ uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
         SignalState depSignalRdState;
         QueueProcessEvent aqlProcessEvent;
         void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
         bool getBarrierBit() const { return qCntxt.barrierBit; }
+        bool isLastOutstandingPkt() const
+        {
+          return qCntxt.aqlBuf->isLastOutstandingPkt();
+        }
     };
     // Keeps track of queueDescriptors of registered queues
     std::vector<class RQLEntry *> regdQList;
@@ -250,7 +287,7 @@

     void dmaWriteVirt(Addr host_addr, unsigned size, Event *event,
                       void *data, Tick delay = 0);
-    bool processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
+    Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
     void displayQueueDescriptor(int pid, uint32_t rl_idx);

   public:
@@ -290,6 +327,7 @@
     void finishPkt(void *pkt, uint32_t rl_idx);
     void finishPkt(void *pkt) { finishPkt(pkt, 0); }
     void schedAQLProcessing(uint32_t rl_idx);
+    void schedAQLProcessing(uint32_t rl_idx, Tick delay);

     class DepSignalsReadDmaEvent : public Event
     {

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30354
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I62ce589dff57dcde4d64054a1b6ffd962acd5eb8
Gerrit-Change-Number: 30354
Gerrit-PatchSet: 8
Gerrit-Owner: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Alexandru Duțu <alexandru.d...@amd.com>
Gerrit-Reviewer: Anthony Gutierrez <anthony.gutier...@amd.com>
Gerrit-Reviewer: Bradford Beckmann <brad.beckm...@amd.com>
Gerrit-Reviewer: Jason Lowe-Power <power...@gmail.com>
Gerrit-Reviewer: Kyle Roarty <kyleroarty1...@gmail.com>
Gerrit-Reviewer: Matt Sinclair <mattdsincl...@gmail.com>
Gerrit-Reviewer: Sooraj Puthoor <puthoorsoo...@gmail.com>
Gerrit-Reviewer: kokoro <noreply+kok...@google.com>
Gerrit-CC: Matthew Poremba <matthew.pore...@amd.com>
Gerrit-MessageType: merged
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

Reply via email to