Module: Mesa
Branch: main
Commit: 37b3601052c35ebce78a14a34d0ae0095890bce3
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=37b3601052c35ebce78a14a34d0ae0095890bce3

Author: Lionel Landwerlin <[email protected]>
Date:   Mon Jul 18 12:27:53 2022 +0300

intel/fs: switch register allocation spilling to use LSC on Gfx12.5+

v2: drop the hardcoded inst->mlen=1 (Rohan)

v3: Move back to LOAD/STORE messages (limited to SIMD16 for LSC)

v4: Also use 4 GRFs transpose loads for fills (Curro)

v5: Reduce amount of needed register to build per lane offsets (Curro)
    Drop some now useless SIMD32 code
    Unify unspill code

Signed-off-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Francisco Jerez <[email protected]>
Reviewed-by: Rohan Garg <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17555>

---

 src/intel/compiler/brw_eu.h                |   1 +
 src/intel/compiler/brw_eu_emit.c           |  13 +-
 src/intel/compiler/brw_fs_generator.cpp    |   5 +-
 src/intel/compiler/brw_fs_reg_allocate.cpp | 199 ++++++++++++++++++++++-------
 src/intel/compiler/brw_ir.h                |   4 +
 5 files changed, 176 insertions(+), 46 deletions(-)

diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 18683e15631..f0785046bb6 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1587,6 +1587,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot);
 
 void brw_ff_sync(struct brw_codegen *p,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index e647319613b..281094ed406 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -2746,6 +2746,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
                                 unsigned desc_imm,
                                 struct brw_reg ex_desc,
                                 unsigned ex_desc_imm,
+                                bool ex_desc_scratch,
                                 bool eot)
 {
    const struct intel_device_info *devinfo = p->devinfo;
@@ -2781,6 +2782,7 @@ brw_send_indirect_split_message(struct brw_codegen *p,
    }
 
    if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
+       !ex_desc_scratch &&
        (devinfo->ver >= 12 ||
         ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
       ex_desc.ud |= ex_desc_imm;
@@ -2807,7 +2809,16 @@ brw_send_indirect_split_message(struct brw_codegen *p,
        */
       unsigned imm_part = ex_desc_imm | sfid | eot << 5;
 
-      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
+      if (ex_desc_scratch) {
+         /* Or the scratch surface offset together with the immediate part of
+          * the extended descriptor.
+          */
+         assert(devinfo->verx10 >= 125);
+         brw_AND(p, addr,
+                 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(INTEL_MASK(31, 10)));
+         brw_OR(p, addr, addr, brw_imm_ud(imm_part));
+      } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
           * to Gfx12, so we may have fallen back to an indirect extended
           * descriptor.
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index e8c45b4e59c..62f5cb51fe1 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -335,13 +335,14 @@ fs_generator::generate_send(fs_inst *inst,
    uint32_t ex_desc_imm = inst->ex_desc |
       brw_message_ex_desc(devinfo, inst->ex_mlen);
 
-   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
+   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm ||
+       inst->send_ex_desc_scratch) {
       /* If we have any sort of extended descriptor, then we need SENDS.  This
        * also covers the dual-payload case because ex_mlen goes in ex_desc.
        */
       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
                                       desc, desc_imm, ex_desc, ex_desc_imm,
-                                      inst->eot);
+                                      inst->send_ex_desc_scratch, inst->eot);
       if (inst->check_tdr)
          brw_inst_set_opcode(p->isa, brw_last_inst,
                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : 
BRW_OPCODE_SENDSC);
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp 
b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 4f63f8ad50c..1b6082e5801 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -348,10 +348,15 @@ private:
    void build_interference_graph(bool allow_spilling);
    void discard_interference_graph();
 
+   fs_reg build_lane_offsets(const fs_builder &bld,
+                             uint32_t spill_offset, int ip);
+   fs_reg build_single_offset(const fs_builder &bld,
+                              uint32_t spill_offset, int ip);
+
    void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
-                     fs_reg dst, uint32_t spill_offset, unsigned count);
+                     fs_reg dst, uint32_t spill_offset, unsigned count, int 
ip);
    void emit_spill(const fs_builder &bld, struct shader_stats *stats,
-                   fs_reg src, uint32_t spill_offset, unsigned count);
+                   fs_reg src, uint32_t spill_offset, unsigned count, int ip);
 
    void set_spill_costs();
    int choose_spill_reg();
@@ -448,6 +453,10 @@ namespace {
    unsigned
    spill_max_size(const backend_shader *s)
    {
+      /* LSC is limited to SIMD16 sends */
+      if (s->devinfo->has_lsc)
+         return 2;
+
       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
        *            altogether by spilling directly from the temporary GRF
        *            allocated to hold the result of the instruction (and the
@@ -661,7 +670,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling)
    first_vgrf_node = node_count;
    node_count += fs->alloc.count;
    last_vgrf_node = node_count - 1;
-   if (devinfo->ver >= 9 && allow_spilling) {
+   if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
       scratch_header_node = node_count++;
    } else {
       scratch_header_node = -1;
@@ -742,11 +751,59 @@ fs_reg_alloc::discard_interference_graph()
    have_spill_costs = false;
 }
 
+fs_reg
+fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t 
spill_offset, int ip)
+{
+   fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_REGISTER_TYPE_UD);
+   fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+   return offset;
+}
+
+fs_reg
+fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, 
int ip)
+{
+   /* LSC messages are limited to SIMD16 */
+   assert(bld.dispatch_width() <= 16);
+
+   const fs_builder ubld = bld.exec_all();
+   const unsigned reg_count = ubld.dispatch_width() / 8;
+
+   fs_reg offset = retype(alloc_spill_reg(reg_count, ip), 
BRW_REGISTER_TYPE_UD);
+   fs_inst *inst;
+
+   /* Build an offset per lane in SIMD8 */
+   inst = ubld.group(8, 0).MOV(retype(offset, BRW_REGISTER_TYPE_UW),
+                               brw_imm_uv(0x76543210));
+   _mesa_set_add(spill_insts, inst);
+   inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_REGISTER_TYPE_UW));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Build offsets in the upper 8 lanes of SIMD16 */
+   if (ubld.dispatch_width() > 8) {
+      inst = ubld.group(8, 0).ADD(
+         byte_offset(offset, REG_SIZE),
+         byte_offset(offset, 0),
+         brw_imm_ud(8));
+      _mesa_set_add(spill_insts, inst);
+   }
+
+   /* Make the offset a dword */
+   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
+   _mesa_set_add(spill_insts, inst);
+
+   /* Add the base offset */
+   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
+   _mesa_set_add(spill_insts, inst);
+
+   return offset;
+}
+
 void
 fs_reg_alloc::emit_unspill(const fs_builder &bld,
                            struct shader_stats *stats,
                            fs_reg dst,
-                           uint32_t spill_offset, unsigned count)
+                           uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
@@ -757,7 +814,53 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
       ++stats->fill_count;
 
       fs_inst *unspill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* LSC is limited to SIMD16 load/store but we can load more using
+          * transpose messages.
+          */
+         const bool use_transpose = bld.dispatch_width() > 16;
+         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : 
bld;
+         fs_reg offset;
+         if (use_transpose) {
+            offset = build_single_offset(ubld, spill_offset, ip);
+         } else {
+            offset = build_lane_offsets(ubld, spill_offset, ip);
+         }
+         /* We leave the extended descriptor empty and flag the instruction to
+          * ask the generator to insert the extended descriptor in the address
+          * register. That way we don't need to burn an additional register
+          * for register allocation spill/fill.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0), /* desc */
+            brw_imm_ud(0), /* ex_desc */
+            offset,        /* payload */
+            fs_reg(),      /* payload2 */
+         };
+
+         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
+                                  srcs, ARRAY_SIZE(srcs));
+         unspill_inst->sfid = GFX12_SFID_UGM;
+         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                           unspill_inst->exec_size,
+                                           LSC_ADDR_SURFTYPE_BSS,
+                                           LSC_ADDR_SIZE_A32,
+                                           1 /* num_coordinates */,
+                                           LSC_DATA_SIZE_D32,
+                                           use_transpose ? reg_size * 8 : 1 /* 
num_channels */,
+                                           use_transpose,
+                                           LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                           true /* has_dest */);
+         unspill_inst->header_size = 0;
+         unspill_inst->mlen =
+            lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
+         unspill_inst->ex_mlen = 0;
+         unspill_inst->size_written =
+            lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
+         unspill_inst->send_has_side_effects = false;
+         unspill_inst->send_is_volatile = true;
+         unspill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
          fs_builder ubld = bld.exec_all().group(1, 0);
          assert(spill_offset % 16 == 0);
@@ -765,15 +868,8 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
                                  brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, unspill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
          unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
@@ -815,7 +911,7 @@ void
 fs_reg_alloc::emit_spill(const fs_builder &bld,
                          struct shader_stats *stats,
                          fs_reg src,
-                         uint32_t spill_offset, unsigned count)
+                         uint32_t spill_offset, unsigned count, int ip)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
    const unsigned reg_size = src.component_size(bld.dispatch_width()) /
@@ -826,7 +922,40 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
       ++stats->spill_count;
 
       fs_inst *spill_inst;
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
+         /* We leave the extended descriptor empty and flag the instruction to
+          * relocate the extended descriptor. That way the surface offset is
+          * directly put into the instruction and we don't need to use a
+          * register to hold it.
+          */
+         fs_reg srcs[] = {
+            brw_imm_ud(0),        /* desc */
+            brw_imm_ud(0),        /* ex_desc */
+            offset,               /* payload */
+            src,                  /* payload2 */
+         };
+         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
+                               srcs, ARRAY_SIZE(srcs));
+         spill_inst->sfid = GFX12_SFID_UGM;
+         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
+                                         bld.dispatch_width(),
+                                         LSC_ADDR_SURFTYPE_BSS,
+                                         LSC_ADDR_SIZE_A32,
+                                         1 /* num_coordinates */,
+                                         LSC_DATA_SIZE_D32,
+                                         1 /* num_channels */,
+                                         false /* transpose */,
+                                         LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                         false /* has_dest */);
+         spill_inst->header_size = 0;
+         spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
+         spill_inst->ex_mlen = reg_size;
+         spill_inst->size_written = 0;
+         spill_inst->send_has_side_effects = true;
+         spill_inst->send_is_volatile = false;
+         spill_inst->send_ex_desc_scratch = true;
+      } else if (devinfo->ver >= 9) {
          fs_reg header = this->scratch_header;
          fs_builder ubld = bld.exec_all().group(1, 0);
          assert(spill_offset % 16 == 0);
@@ -834,15 +963,8 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
                                brw_imm_ud(spill_offset / 16));
          _mesa_set_add(spill_insts, spill_inst);
 
-         unsigned bti;
-         fs_reg ex_desc;
-         if (devinfo->verx10 >= 125) {
-            bti = GFX9_BTI_BINDLESS;
-            ex_desc = component(this->scratch_header, 0);
-         } else {
-            bti = GFX8_BTI_STATELESS_NON_COHERENT;
-            ex_desc = brw_imm_ud(0);
-         }
+         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
+         const fs_reg ex_desc = brw_imm_ud(0);
 
          fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
          spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
@@ -1033,25 +1155,16 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
     * SIMD16 mode, because we'd stomp the FB writes.
     */
    if (!fs->spilled_any_registers) {
-      if (devinfo->ver >= 9) {
+      if (devinfo->verx10 >= 125) {
+         /* We will allocate a register on the fly */
+      } else if (devinfo->ver >= 9) {
          this->scratch_header = alloc_scratch_header();
          fs_builder ubld = fs->bld.exec_all().group(8, 0).at(
             fs->cfg->first_block(), fs->cfg->first_block()->start());
 
-         fs_inst *inst;
-         if (devinfo->verx10 >= 125) {
-            inst = ubld.MOV(this->scratch_header, brw_imm_ud(0));
-            _mesa_set_add(spill_insts, inst);
-            inst = ubld.group(1, 0).AND(component(this->scratch_header, 0),
-                                        retype(brw_vec1_grf(0, 5),
-                                               BRW_REGISTER_TYPE_UD),
-                                        brw_imm_ud(INTEL_MASK(31, 10)));
-            _mesa_set_add(spill_insts, inst);
-         } else {
-            inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
-                             this->scratch_header);
-            _mesa_set_add(spill_insts, inst);
-         }
+         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
+                                   this->scratch_header);
+         _mesa_set_add(spill_insts, inst);
       } else {
          bool mrf_used[BRW_MAX_MRF(devinfo->ver)];
          get_used_mrfs(fs, mrf_used);
@@ -1112,7 +1225,7 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
              * unspill destination is a block-local temporary.
              */
             emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
-                         unspill_dst, subset_spill_offset, count);
+                         unspill_dst, subset_spill_offset, count, ip);
         }
       }
 
@@ -1167,10 +1280,10 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
          if (inst->is_partial_write() ||
              (!inst->force_writemask_all && !per_channel))
             emit_unspill(ubld, &fs->shader_stats, spill_src,
-                         subset_spill_offset, regs_written(inst));
+                         subset_spill_offset, regs_written(inst), ip);
 
          emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
-                    subset_spill_offset, regs_written(inst));
+                    subset_spill_offset, regs_written(inst), ip);
       }
 
       for (fs_inst *inst = (fs_inst *)before->next;
diff --git a/src/intel/compiler/brw_ir.h b/src/intel/compiler/brw_ir.h
index 8db0c988ceb..33011f7299d 100644
--- a/src/intel/compiler/brw_ir.h
+++ b/src/intel/compiler/brw_ir.h
@@ -174,6 +174,10 @@ struct backend_instruction {
    bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
    bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
    bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
+   bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
+                                 *   the scratch surface offset to build
+                                 *   extended descriptor
+                                 */
    bool eot:1;
 
    /* Chooses which flag subregister (f0.0 to f1.1) is used for conditional

Reply via email to