changeset 5fad5a37d6fc in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=5fad5a37d6fc
description:
        gpu-compute: Added method to compute the actual workgroup size
        This patch adds a method to the Wavefront class to compute the actual
        workgroup size. This can be different from the maximum workgroup size
        specified when launching the kernel through the NDRange object. The
        current solution is still not optimal, as we are computing these sizes
        for each wavefront, and the dispatcher also needs this information but
        can't actually call Wavefront::computeActualWgSz before the wavefronts
        are created. A long-term solution would be to have a Workgroup class
        that deals with all these details.

diffstat:

 src/gpu-compute/compute_unit.cc |  48 +++++++++++++++-------------------------
 src/gpu-compute/compute_unit.hh |   6 ++--
 src/gpu-compute/wavefront.cc    |  11 +++++++++
 src/gpu-compute/wavefront.hh    |   6 +++++
 4 files changed, 38 insertions(+), 33 deletions(-)

diffs (183 lines):

diff -r 02a0c6b9c057 -r 5fad5a37d6fc src/gpu-compute/compute_unit.cc
--- a/src/gpu-compute/compute_unit.cc   Tue Oct 04 15:44:52 2016 +0100
+++ b/src/gpu-compute/compute_unit.cc   Tue Oct 04 13:03:52 2016 -0400
@@ -174,7 +174,7 @@
 }
 
 void
-ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
+ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
 {
     w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
 
@@ -190,6 +190,7 @@
     w->spillSizePerItem = ndr->q.spillMemPerItem;
     w->roBase = ndr->q.roMemStart;
     w->roSize = ndr->q.roMemTotal;
+    w->computeActualWgSz(ndr);
 }
 
 void
@@ -220,19 +221,16 @@
 
 
 void
-ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
-                     int waveId, LdsChunk *ldsChunk, NDRange *ndr)
+ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
+                            NDRange *ndr)
 {
     static int _n_wave = 0;
 
-    // Fill in Kernel state
-    FillKernelState(w, ndr);
-
     VectorMask init_mask;
     init_mask.reset();
 
     for (int k = 0; k < wfSize(); ++k) {
-        if (k + waveId * wfSize() < trueWgSizeTotal)
+        if (k + waveId * wfSize() < w->actualWgSzTotal)
             init_mask[k] = 1;
     }
 
@@ -241,18 +239,18 @@
     w->initMask = init_mask.to_ullong();
 
     for (int k = 0; k < wfSize(); ++k) {
-        w->workItemId[0][k] = (k + waveId * wfSize()) % trueWgSize[0];
-        w->workItemId[1][k] =
-            ((k + waveId * wfSize()) / trueWgSize[0]) % trueWgSize[1];
-        w->workItemId[2][k] =
-            (k + waveId * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
+        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
+        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
+                             w->actualWgSz[1];
+        w->workItemId[2][k] = (k + waveId * wfSize()) /
+                              (w->actualWgSz[0] * w->actualWgSz[1]);
 
-        w->workItemFlatId[k] = w->workItemId[2][k] * trueWgSize[0] *
-            trueWgSize[1] + w->workItemId[1][k] * trueWgSize[0] +
+        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
+            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
             w->workItemId[0][k];
     }
 
-    w->barrierSlots = divCeil(trueWgSizeTotal, wfSize());
+    w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
 
     w->barCnt.resize(wfSize(), 0);
 
@@ -294,8 +292,8 @@
     // is this the last wavefront in the workgroup
     // if set the spillWidth to be the remaining work-items
     // so that the vector access is correct
-    if ((waveId + 1) * wfSize() >= trueWgSizeTotal) {
-        w->spillWidth = trueWgSizeTotal - (waveId * wfSize());
+    if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
+        w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
     } else {
         w->spillWidth = wfSize();
     }
@@ -328,17 +326,6 @@
         injectGlobalMemFence(gpuDynInst, true);
     }
 
-    // Get true size of workgroup (after clamping to grid size)
-    int trueWgSize[3];
-    int trueWgSizeTotal = 1;
-
-    for (int d = 0; d < 3; ++d) {
-        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
-                                 ndr->wgId[d] * ndr->q.wgSize[d]);
-
-        trueWgSizeTotal *= trueWgSize[d];
-    }
-
     // calculate the number of 32-bit vector registers required by wavefront
     int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
     int wave_id = 0;
@@ -350,9 +337,10 @@
         // It must be stopped and not waiting
         // for a release to complete S_RETURNING
         if (w->status == Wavefront::S_STOPPED) {
+            fillKernelState(w, ndr);
             // if we have scheduled all work items then stop
             // scheduling wavefronts
-            if (wave_id * wfSize() >= trueWgSizeTotal)
+            if (wave_id * wfSize() >= w->actualWgSzTotal)
                 break;
 
             // reserve vector registers for the scheduled wavefront
@@ -365,7 +353,7 @@
             w->reservedVectorRegs = normSize;
             vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
 
-            StartWF(w, trueWgSize, trueWgSizeTotal, wave_id, ldsChunk, ndr);
+            startWavefront(w, wave_id, ldsChunk, ndr);
             ++wave_id;
         }
     }
diff -r 02a0c6b9c057 -r 5fad5a37d6fc src/gpu-compute/compute_unit.hh
--- a/src/gpu-compute/compute_unit.hh   Tue Oct 04 15:44:52 2016 +0100
+++ b/src/gpu-compute/compute_unit.hh   Tue Oct 04 13:03:52 2016 -0400
@@ -254,10 +254,10 @@
     void exec();
     void initiateFetch(Wavefront *wavefront);
     void fetch(PacketPtr pkt, Wavefront *wavefront);
-    void FillKernelState(Wavefront *w, NDRange *ndr);
+    void fillKernelState(Wavefront *w, NDRange *ndr);
 
-    void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
-                     int cnt, LdsChunk *ldsChunk, NDRange *ndr);
+    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
+                        NDRange *ndr);
 
     void StartWorkgroup(NDRange *ndr);
     int ReadyWorkgroup(NDRange *ndr);
diff -r 02a0c6b9c057 -r 5fad5a37d6fc src/gpu-compute/wavefront.cc
--- a/src/gpu-compute/wavefront.cc      Tue Oct 04 15:44:52 2016 +0100
+++ b/src/gpu-compute/wavefront.cc      Tue Oct 04 13:03:52 2016 -0400
@@ -1066,3 +1066,14 @@
             ldsChunk->write<char>(i, val);
         }
 }
+
+void
+Wavefront::computeActualWgSz(NDRange *ndr)
+{
+    actualWgSzTotal = 1;
+    for (int d = 0; d < 3; ++d) {
+        actualWgSz[d] = std::min(workGroupSz[d],
+                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
+        actualWgSzTotal *= actualWgSz[d];
+    }
+}
diff -r 02a0c6b9c057 -r 5fad5a37d6fc src/gpu-compute/wavefront.hh
--- a/src/gpu-compute/wavefront.hh      Tue Oct 04 15:44:52 2016 +0100
+++ b/src/gpu-compute/wavefront.hh      Tue Oct 04 13:03:52 2016 -0400
@@ -47,6 +47,7 @@
 #include "gpu-compute/condition_register_state.hh"
 #include "gpu-compute/lds_state.hh"
 #include "gpu-compute/misc.hh"
+#include "gpu-compute/ndrange.hh"
 #include "params/Wavefront.hh"
 #include "sim/sim_object.hh"
 
@@ -189,11 +190,16 @@
     std::vector<Addr> lastAddr;
     std::vector<uint32_t> workItemId[3];
     std::vector<uint32_t> workItemFlatId;
+    /* kernel launch parameters */
     uint32_t workGroupId[3];
     uint32_t workGroupSz[3];
     uint32_t gridSz[3];
     uint32_t wgId;
     uint32_t wgSz;
+    /* the actual WG size can differ from the maximum size */
+    uint32_t actualWgSz[3];
+    uint32_t actualWgSzTotal;
+    void computeActualWgSz(NDRange *ndr);
     // wavefront id within a workgroup
     uint32_t wfId;
     uint32_t maxDynWaveId;
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to