changeset 7106f550afad in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=7106f550afad
description:
        gpu-compute: parametrize Wavefront size

        Eliminate the VSZ constant that defined the Wavefront size (in number
        of work items) and replace it with a parameter in the GPU.py
        configuration script. Change all data structures dependent on the
        Wavefront size to be dynamically sized. Legal values of Wavefront size
        are 16, 32, and 64 for now, and are checked at initialization time.

diffstat:

 configs/example/apu_se.py                 |    3 +-
 src/arch/hsail/gen.py                     |   14 +-
 src/arch/hsail/insts/branch.hh            |    2 +-
 src/arch/hsail/insts/main.cc              |    2 +-
 src/arch/hsail/insts/mem.hh               |   16 ++-
 src/arch/hsail/insts/mem_impl.hh          |   46 ++++++-----
 src/arch/hsail/insts/pseudo_inst.cc       |   57 +++++++-------
 src/arch/hsail/operand.hh                 |   44 +++++++---
 src/gpu-compute/GPU.py                    |    2 +
 src/gpu-compute/cl_driver.cc              |    2 +-
 src/gpu-compute/compute_unit.cc           |  117 +++++++++++++++++------------
 src/gpu-compute/compute_unit.hh           |   18 +----
 src/gpu-compute/dispatcher.cc             |    6 +
 src/gpu-compute/dispatcher.hh             |    1 +
 src/gpu-compute/global_memory_pipeline.cc |    4 +-
 src/gpu-compute/gpu_dyn_inst.cc           |   22 +++++-
 src/gpu-compute/gpu_dyn_inst.hh           |   10 +-
 src/gpu-compute/local_memory_pipeline.cc  |    4 +-
 src/gpu-compute/misc.hh                   |   18 +----
 src/gpu-compute/qstruct.hh                |    2 +-
 src/gpu-compute/vector_register_file.cc   |    2 +-
 src/gpu-compute/vector_register_state.cc  |   15 +++-
 src/gpu-compute/vector_register_state.hh  |    6 +-
 src/gpu-compute/wavefront.cc              |   10 ++-
 src/gpu-compute/wavefront.hh              |   26 +++---
 25 files changed, 256 insertions(+), 193 deletions(-)

diffs (truncated from 1463 to 300 lines):

diff -r 2aa4d7bd47ec -r 7106f550afad configs/example/apu_se.py
--- a/configs/example/apu_se.py Wed Jun 08 09:12:41 2016 -0500
+++ b/configs/example/apu_se.py Thu Jun 09 11:24:55 2016 -0400
@@ -250,7 +250,8 @@
     vrfs = []
     for j in xrange(options.simds_per_cu):
         for k in xrange(shader.n_wf):
-            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
+                                        wfSize = options.wf_size))
         vrfs.append(VectorRegisterFile(simd_id=j,
                               num_regs_per_simd=options.vreg_file_size))
     compute_units[-1].wavefronts = wavefronts
diff -r 2aa4d7bd47ec -r 7106f550afad src/arch/hsail/gen.py
--- a/src/arch/hsail/gen.py     Wed Jun 08 09:12:41 2016 -0500
+++ b/src/arch/hsail/gen.py     Thu Jun 09 11:24:55 2016 -0400
@@ -235,7 +235,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestCType dest_val = $expr;
             this->dest.set(w, lane, dest_val);
@@ -256,7 +256,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
             DestCType dest_val = $expr;
@@ -277,7 +277,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
             if ($dest_is_src_flag) {
@@ -312,7 +312,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
 
@@ -346,7 +346,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestT dest_val;
             if ($dest_is_src_flag) {
@@ -372,7 +372,7 @@
     Wavefront *w = gpuDynInst->wavefront();
 
     const VectorMask &mask = w->get_pred();
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             CType dest_val;
 
@@ -401,7 +401,7 @@
 
     const VectorMask &mask = w->get_pred();
 
-    for (int lane = 0; lane < VSZ; ++lane) {
+    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
         if (mask[lane]) {
             DestCType dest_val;
             SrcCType src_val[$num_srcs];
diff -r 2aa4d7bd47ec -r 7106f550afad src/arch/hsail/insts/branch.hh
--- a/src/arch/hsail/insts/branch.hh    Wed Jun 08 09:12:41 2016 -0500
+++ b/src/arch/hsail/insts/branch.hh    Thu Jun 09 11:24:55 2016 -0400
@@ -279,7 +279,7 @@
         // taken branch
         const uint32_t true_pc = getTargetPc();
         VectorMask true_mask;
-        for (unsigned int lane = 0; lane < VSZ; ++lane) {
+        for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
         }
 
diff -r 2aa4d7bd47ec -r 7106f550afad src/arch/hsail/insts/main.cc
--- a/src/arch/hsail/insts/main.cc      Wed Jun 08 09:12:41 2016 -0500
+++ b/src/arch/hsail/insts/main.cc      Thu Jun 09 11:24:55 2016 -0400
@@ -134,7 +134,7 @@
         const VectorMask &mask = w->get_pred();
 
         // mask off completed work-items
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 w->init_mask[lane] = 0;
             }
diff -r 2aa4d7bd47ec -r 7106f550afad src/arch/hsail/insts/mem.hh
--- a/src/arch/hsail/insts/mem.hh       Wed Jun 08 09:12:41 2016 -0500
+++ b/src/arch/hsail/insts/mem.hh       Thu Jun 09 11:24:55 2016 -0400
@@ -457,7 +457,7 @@
             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
 
             if (num_dest_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_dest_operands);
                     else
@@ -466,9 +466,10 @@
 
             for (int k = 0; k < num_dest_operands; ++k) {
 
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
 
@@ -1004,7 +1005,7 @@
             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
 
             if (num_src_operands > 1) {
-                for (int i = 0; i < VSZ; ++i)
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_src_operands);
                     else
@@ -1012,9 +1013,10 @@
             }
 
             for (int k = 0; k < num_src_operands; ++k) {
-                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+                c0 *d = &((c0*)gpuDynInst->d_data)
+                    [k * gpuDynInst->computeUnit()->wfSize()];
 
-                for (int i = 0; i < VSZ; ++i) {
+                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
 
@@ -1402,7 +1404,7 @@
             c0 *e = &((c0*) gpuDynInst->a_data)[0];
             c0 *f = &((c0*) gpuDynInst->x_data)[0];
 
-            for (int i = 0; i < VSZ; ++i) {
+            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                 if (gpuDynInst->exec_mask[i]) {
                     Addr vaddr = gpuDynInst->addr[i];
 
diff -r 2aa4d7bd47ec -r 7106f550afad src/arch/hsail/insts/mem_impl.hh
--- a/src/arch/hsail/insts/mem_impl.hh  Wed Jun 08 09:12:41 2016 -0500
+++ b/src/arch/hsail/insts/mem_impl.hh  Thu Jun 09 11:24:55 2016 -0400
@@ -60,14 +60,16 @@
 
         typedef typename DestDataType::CType CType M5_VAR_USED;
         const VectorMask &mask = w->get_pred();
-        uint64_t addr_vec[VSZ];
+        std::vector<Addr> addr_vec;
+        addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
         this->addr.calcVector(w, addr_vec);
 
-        for (int lane = 0; lane < VSZ; ++lane) {
+        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             if (mask[lane]) {
                 this->dest.set(w, lane, addr_vec[lane]);
             }
         }
+        addr_vec.clear();
     }
 
     template<typename MemDataType, typename DestDataType,
@@ -121,8 +123,8 @@
             i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
         assert(se);
 
-        return w->wfSlotId * w->privSizePerItem * VSZ +
-            se->offset * VSZ +
+        return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
+            se->offset * w->computeUnit->wfSize() +
             lane * se->size;
         */
 
@@ -139,9 +141,11 @@
         Addr addr_div8 = addr / 8;
         Addr addr_mod8 = addr % 8;
 
-        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
+        Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
+            addr_mod8 + w->privBase;
 
-        assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+        assert(ret < w->privBase +
+               (w->privSizePerItem * w->computeUnit->wfSize()));
 
         return ret;
     }
@@ -175,7 +179,7 @@
 
             DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     this->dest.set(w, lane, val);
                 }
@@ -184,7 +188,7 @@
             return;
         } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
             uint64_t address = this->addr.calcUniform();
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     MemCType val = w->readCallArgMem<MemCType>(lane, address);
 
@@ -239,7 +243,7 @@
             // this is a complete hack to get around a compiler bug
             // (the compiler currently generates global access for private
             //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (m->addr[lane] < w->privSizePerItem) {
                     if (mask[lane]) {
                         // what is the size of the object we are accessing?
@@ -267,7 +271,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     //  note: this calculation will NOT WORK if the compiler
                     //  ever generates loads/stores to the same address with
                     //  different widths (e.g., a ld_u32 addr and a ld_u16 
addr)
@@ -301,7 +305,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                     m->addr[lane] += w->roBase;
@@ -318,7 +322,7 @@
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
             {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
                         assert(m->addr[lane] < w->privSizePerItem);
 
@@ -360,7 +364,7 @@
         if (this->segment == Brig::BRIG_SEGMENT_ARG) {
             uint64_t address = this->addr.calcUniform();
 
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     CType data = this->src.template get<CType>(w, lane);
                     DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
@@ -378,7 +382,7 @@
         this->addr.calcVector(w, m->addr);
 
         if (num_src_operands == 1) {
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     ((CType*)m->d_data)[lane] =
                         this->src.template get<CType>(w, lane);
@@ -386,9 +390,9 @@
             }
         } else {
             for (int k= 0; k < num_src_operands; ++k) {
-                for (int lane = 0; lane < VSZ; ++lane) {
+                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                     if (mask[lane]) {
-                        ((CType*)m->d_data)[k * VSZ + lane] =
+                        ((CType*)m->d_data)[k * w->computeUnit->wfSize() + 
lane] =
                             this->src_vect[k].template get<CType>(w, lane);
                     }
                 }
@@ -428,7 +432,7 @@
             // this is a complete hack to get around a compiler bug
             // (the compiler currently generates global access for private
             //  addresses (starting from 0). We need to add the private offset)
-            for (int lane = 0; lane < VSZ; ++lane) {
+            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                 if (mask[lane]) {
                     if (m->addr[lane] < w->privSizePerItem) {
 
@@ -454,7 +458,7 @@
_______________________________________________
gem5-dev mailing list
gem5-dev@gem5.org
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to