Module: Mesa
Branch: main
Commit: aa65f83203f698188adc3646e4fc53fa2ad88f15
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=aa65f83203f698188adc3646e4fc53fa2ad88f15

Author: Lionel Landwerlin <[email protected]>
Date:   Fri Jul 15 13:08:23 2022 +0300

intel/fs: switch compute push constant loads to LSC

We're now able to load up to 8 GRFs in one send.

v2: Switch to use transpose + vector of up to 64 (Thanks Curro!)

v3: Increase parallelism by not reusing the same register for push
    constant offset (Curro)

v4: Drop dead ADD() instruction (Curro)

Signed-off-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Francisco Jerez <[email protected]>
Reviewed-by: Rohan Garg <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17555>

---

 src/intel/compiler/brw_fs.cpp | 69 ++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 40 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index fd2dec26348..700e8f4bbff 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1586,67 +1586,56 @@ fs_visitor::assign_curb_setup()
       assert(devinfo->verx10 >= 125);
       assert(uniform_push_length <= 1);
    } else if (is_compute && devinfo->verx10 >= 125) {
-      fs_builder ubld = bld.exec_all().group(8, 0).at(
+      assert(devinfo->has_lsc);
+      fs_builder ubld = bld.exec_all().group(1, 0).at(
          cfg->first_block(), cfg->first_block()->start());
 
-      /* The base address for our push data is passed in as R0.0[31:6].  We
-       * have to mask off the bottom 6 bits.
+      /* The base offset for our push data is passed in as R0.0[31:6]. We have
+       * to mask off the bottom 6 bits.
        */
       fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld.group(1, 0).AND(base_addr,
-                           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
-                           brw_imm_ud(INTEL_MASK(31, 6)));
-
-      fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld.MOV(header0, brw_imm_ud(0));
-      ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));
+      ubld.AND(base_addr,
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(INTEL_MASK(31, 6)));
 
       /* On Gfx12-HP we load constants at the start of the program using A32
        * stateless messages.
        */
       for (unsigned i = 0; i < uniform_push_length;) {
-         /* Limit ourselves to HW limit of 8 Owords (8 * 16bytes = 128 bytes
-          * or 4 registers).
-          */
-         unsigned num_regs = MIN2(uniform_push_length - i, 4);
+         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
+         unsigned num_regs = MIN2(uniform_push_length - i, 8);
          assert(num_regs > 0);
          num_regs = 1 << util_logbase2(num_regs);
 
-         fs_reg header;
-         if (i == 0) {
-            header = header0;
-         } else {
-            header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-            ubld.MOV(header, brw_imm_ud(0));
-            ubld.group(1, 0).ADD(component(header, 2),
-                                 component(header0, 2),
-                                 brw_imm_ud(i * 2));
-         }
+         fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE));
 
          fs_reg srcs[4] = {
             brw_imm_ud(0), /* desc */
             brw_imm_ud(0), /* ex_desc */
-            header, /* payload */
-            fs_reg(), /* payload2 */
+            addr,          /* payload */
+            fs_reg(),      /* payload2 */
          };
 
          fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
                               BRW_REGISTER_TYPE_UD);
+         fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
 
-         /* This instruction has to be run SIMD16 if we're filling more than a
-          * single register.
-          */
-         unsigned send_width = MIN2(16, num_regs * 8);
-
-         fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
-                                                        dest, srcs, 4);
-         send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-         send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
-                                  GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
-                                  BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
-         send->header_size = 1;
-         send->mlen = 1;
-         send->size_written = num_regs * REG_SIZE;
+         send->sfid = GFX12_SFID_UGM;
+         send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
+                                   1 /* exec_size */,
+                                   LSC_ADDR_SURFTYPE_FLAT,
+                                   LSC_ADDR_SIZE_A32,
+                                   1 /* num_coordinates */,
+                                   LSC_DATA_SIZE_D32,
+                                   num_regs * 8 /* num_channels */,
+                                   true /* transpose */,
+                                   LSC_CACHE_LOAD_L1STATE_L3MOCS,
+                                   true /* has_dest */);
+         send->header_size = 0;
+         send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
+         send->size_written =
+            lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
          send->send_is_volatile = true;
 
          i += num_regs;

Reply via email to