Module: Mesa
Branch: main
Commit: 6031ad4bf690fe250d90063dec7e0269da5b3016
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6031ad4bf690fe250d90063dec7e0269da5b3016

Author: Sagar Ghuge <[email protected]>
Date:   Wed Oct 27 14:11:27 2021 -0700

intel/fs: Add Wa_22013689345

v2: Use a simpler framework (Lionel)

v3: Rebase, add task/mesh (Lionel)

v4: Fixup fence exec size (SIMDX -> SIMD1)

v5: Fix invalidate_analysis, add finishme comment (Curro)

Signed-off-by: Sagar Ghuge <[email protected]>
Signed-off-by: Lionel Landwerlin <[email protected]>
Cc: 22.0 <mesa-stable>
Reviewed-by: Tapani Pälli <[email protected]>
Reviewed-by: Francisco Jerez <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14947>

---

 src/intel/compiler/brw_disasm.c           |  1 +
 src/intel/compiler/brw_eu.h               |  1 +
 src/intel/compiler/brw_eu_defines.h       |  5 ++
 src/intel/compiler/brw_eu_emit.c          | 10 ++--
 src/intel/compiler/brw_fs.cpp             | 78 +++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.h               |  1 +
 src/intel/compiler/brw_fs_generator.cpp   |  1 +
 src/intel/compiler/brw_vec4_generator.cpp |  1 +
 8 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
index 4980ccbbde3..5e0ad96adf8 100644
--- a/src/intel/compiler/brw_disasm.c
+++ b/src/intel/compiler/brw_disasm.c
@@ -679,6 +679,7 @@ static const char* const lsc_flush_type[] = {
    [LSC_FLUSH_TYPE_DISCARD]    = "discard",
    [LSC_FLUSH_TYPE_CLEAN]      = "clean",
    [LSC_FLUSH_TYPE_L3ONLY]     = "l3only",
+   [LSC_FLUSH_TYPE_NONE_6]     = "none_6",
 };
 
 static const char* const lsc_addr_size[] = {
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 1c1301e9403..17fde1e2d1e 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1781,6 +1781,7 @@ brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg src,
                  enum opcode send_op,
                  enum brw_message_target sfid,
+                 uint32_t desc,
                  bool commit_enable,
                  unsigned bti);
 
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index addf63e870d..4e3b126dc47 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -1991,6 +1991,11 @@ enum PACKED lsc_flush_type {
     * Flush "RW" section of the L3 cache, but leave L1 and L2 caches untouched.
     */
    LSC_FLUSH_TYPE_L3ONLY = 5,
+   /*
+    * HW maps this flush type internally to NONE.
+    */
+   LSC_FLUSH_TYPE_NONE_6 = 6,
+
 };
 
 enum PACKED lsc_backup_fence_routing {
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 32cfae00093..09207d9f806 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3256,7 +3256,8 @@ brw_set_memory_fence_message(struct brw_codegen *p,
 static void
 gfx12_set_memory_fence_message(struct brw_codegen *p,
                                struct brw_inst *insn,
-                               enum brw_message_target sfid)
+                               enum brw_message_target sfid,
+                               uint32_t desc)
 {
    const unsigned mlen = 1; /* g0 header */
     /* Completion signaled by write to register. No data returned. */
@@ -3268,8 +3269,8 @@ gfx12_set_memory_fence_message(struct brw_codegen *p,
       brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
                             brw_message_desc(p->devinfo, mlen, rlen, false));
    } else {
-      enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
-      enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
+      enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
+      enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc);
 
       if (sfid == GFX12_SFID_TGM) {
          scope = LSC_FENCE_TILE;
@@ -3288,6 +3289,7 @@ brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg src,
                  enum opcode send_op,
                  enum brw_message_target sfid,
+                 uint32_t desc,
                  bool commit_enable,
                  unsigned bti)
 {
@@ -3307,7 +3309,7 @@ brw_memory_fence(struct brw_codegen *p,
 
    /* All DG2 hardware requires LSC for fence messages, even A-step */
    if (devinfo->has_lsc)
-      gfx12_set_memory_fence_message(p, insn, sfid);
+      gfx12_set_memory_fence_message(p, insn, sfid, desc);
    else
       brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
 }
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0cbc6b6016c..225df2d04e6 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -8618,6 +8618,75 @@ fs_visitor::fixup_3src_null_dest()
                           DEPENDENCY_VARIABLES);
 }
 
+static bool
+needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
+{
+   /* This workaround is about making sure that any instruction writing
+    * through UGM has completed before we hit EOT.
+    *
+    * The workaround talks about UGM writes or atomic message but what is
+    * important is anything that hasn't completed. Usually any SEND
+    * instruction that has a destination register will be read by something
+    * else so we don't need to care about those as they will be synchronized
+    * by other parts of the shader or optimized away. What is left are
+    * instructions that don't have a destination register.
+    */
+   if (inst->sfid != GFX12_SFID_UGM)
+      return false;
+
+   return inst->dst.file == BAD_FILE;
+}
+
+/* Wa_22013689345
+ *
+ * We need to emit UGM fence message before EOT, if shader has any UGM write
+ * or atomic message.
+ *
+ * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
+ *                We probably need a better criteria in needs_dummy_fence().
+ */
+void
+fs_visitor::emit_dummy_memory_fence_before_eot()
+{
+   bool progress = false;
+   bool has_ugm_write_or_atomic = false;
+
+   if (!intel_device_info_is_dg2(devinfo))
+      return;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (!inst->eot) {
+         if (needs_dummy_fence(devinfo, inst))
+            has_ugm_write_or_atomic = true;
+         continue;
+      }
+
+      if (!has_ugm_write_or_atomic)
+         break;
+
+      const fs_builder ibld(this, block, inst);
+      const fs_builder ubld = ibld.exec_all().group(1, 0);
+
+      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
+                                       dst, brw_vec8_grf(0, 0),
+                                       /* commit enable */ brw_imm_ud(1),
+                                       /* bti */ brw_imm_ud(0));
+      dummy_fence->sfid = GFX12_SFID_UGM;
+      dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
+                                             LSC_FLUSH_TYPE_NONE_6, false);
+      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
+      progress = true;
+      /* TODO: remove this break if we ever have shader with multiple EOT. */
+      break;
+   }
+
+   if (progress) {
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
+                          DEPENDENCY_VARIABLES);
+   }
+}
+
 /**
  * Find the first instruction in the program that might start a region of
  * divergent control flow due to a HALT jump.  There is no
@@ -8927,6 +8996,7 @@ fs_visitor::run_vs()
    assign_vs_urb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -9049,6 +9119,7 @@ fs_visitor::run_tcs()
    assign_tcs_urb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -9077,6 +9148,7 @@ fs_visitor::run_tes()
    assign_tes_urb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -9120,6 +9192,7 @@ fs_visitor::run_gs()
    assign_gs_urb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -9220,6 +9293,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
       assign_urb_setup();
 
       fixup_3src_null_dest();
+      emit_dummy_memory_fence_before_eot();
 
       allocate_registers(allow_spilling);
    }
@@ -9255,6 +9329,7 @@ fs_visitor::run_cs(bool allow_spilling)
    assign_curb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(allow_spilling);
 
    return !failed;
@@ -9283,6 +9358,7 @@ fs_visitor::run_bs(bool allow_spilling)
    assign_curb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(allow_spilling);
 
    return !failed;
@@ -9327,6 +9403,7 @@ fs_visitor::run_task(bool allow_spilling)
    assign_curb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(allow_spilling);
 
    return !failed;
@@ -9371,6 +9448,7 @@ fs_visitor::run_mesh(bool allow_spilling)
    assign_curb_setup();
 
    fixup_3src_null_dest();
+   emit_dummy_memory_fence_before_eot();
    allocate_registers(allow_spilling);
 
    return !failed;
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index f4107e18321..3f6489a88cd 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -136,6 +136,7 @@ public:
    void setup_cs_payload();
    bool fixup_sends_duplicate_payload();
    void fixup_3src_null_dest();
+   void emit_dummy_memory_fence_before_eot();
    bool fixup_nomask_control_flow();
    void assign_curb_setup();
    void assign_urb_setup();
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 0af40c739e5..c7a4aaf2150 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2382,6 +2382,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
 
          brw_memory_fence(p, dst, src[0], send_op,
                           brw_message_target(inst->sfid),
+                          inst->desc,
                           /* commit_enable */ src[1].ud,
                           /* bti */ src[2].ud);
          send_count++;
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index bf548265099..d51a8ba8e1c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1926,6 +1926,7 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_MEMORY_FENCE:
          brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
                           brw_message_target(inst->sfid),
+                          inst->desc,
                           /* commit_enable */ false,
                           /* bti */ 0);
          send_count++;

Reply via email to