[Mesa-dev] [PATCH] i965/skl: Add the header for constant loads outside of the generator

Neil Roberts Tue, 14 Apr 2015 11:14:41 -0700

Commit 5a06ee738 added a step to the generator to set up the message
header when generating the VS_OPCODE_PULL_CONSTANT_LOAD_GEN7
instruction. That pseudo opcode is implemented in terms of multiple
actual opcodes, one of which writes to one of the source registers in
order to set up the message header. This causes problems because the
scheduler isn't aware that the source register is written to and it
can end up reorganising the instructions incorrectly such that the
write to the source register overwrites a needed value from a previous
instruction. This problem was presenting itself as a rendering error
in the weapon in Enemy Territory: Quake Wars.


Since commit 588859e1 there is an additional problem that the double
register allocated to include the message header would end up being
split into two. This wasn't happening previously because the code to
split registers was explicitly avoided for instructions that are
sending from the GRF.

This patch fixes both problems by splitting the code to set up the
message header into a new pseudo opcode so that it will be done
outside of the generator. This new opcode has the header register as a
destination so the scheduler can recognise that the register is
written to. There is now a helper function to emit the pseudo opcode
correctly so that it can include this second opcode too. This has the
additional benefit that the scheduler can optimise the message header
slightly better by moving the mov instructions further away from the
send instructions.

On Skylake it appears to fix the following three Piglit tests without
causing any regressions:

 gs-float-array-variable-index
 gs-mat3x4-row-major
 gs-mat4x3-row-major

I think we actually may need to do something similar for the fs
backend and possibly for message headers from regular texture sampling
but I'm not entirely sure.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89058
---
 src/mesa/drivers/dri/i965/brw_defines.h            |   1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp           |   4 +
 src/mesa/drivers/dri/i965/brw_vec4.h               |   7 ++
 .../dri/i965/brw_vec4_dead_code_eliminate.cpp      |   1 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp   |  53 ++++----
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     | 135 ++++++++++++---------
 src/mesa/drivers/dri/i965/brw_vec4_vp.cpp          |  27 +----
 7 files changed, 125 insertions(+), 103 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index da6ed5b..a97a944 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -948,6 +948,7 @@ enum opcode {
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_PULL_CONSTANT_LOAD,
    VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+   VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
    VS_OPCODE_UNPACK_FLAGS_SIMD4X2,
 
    /**
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 335a800..0d6ac0c 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op)
       return "pull_constant_load";
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
       return "pull_constant_load_gen7";
+
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+      return "set_simd4x2_header_gen9";
+
    case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
       return "unpack_flags_simd4x2";
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h
index 700ca69..a0ee2cc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -364,6 +364,11 @@ public:
                                dst_reg dst,
                                src_reg orig_src,
                                int base_offset);
+   void emit_pull_constant_load_reg(dst_reg dst,
+                                    src_reg surf_index,
+                                    src_reg offset,
+                                    bblock_t *before_block,
+                                    vec4_instruction *before_inst);
    src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                 vec4_instruction *inst, src_reg src);
 
@@ -495,6 +500,8 @@ private:
                                          struct brw_reg dst,
                                          struct brw_reg surf_index,
                                          struct brw_reg offset);
+   void generate_set_simd4x2_header_gen9(vec4_instruction *inst,
+                                         struct brw_reg dst);
    void generate_unpack_flags(struct brw_reg dst);
 
    void generate_untyped_atomic(vec4_instruction *inst,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 980e266..70d2af5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -44,6 +44,7 @@ can_do_writemask(const struct brw_context *brw,
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
       return false;
    default:
       /* The MATH instruction on Gen6 only executes in align1 mode, which does
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index e4addf7..7d6421b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1039,38 +1039,18 @@ 
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
 {
    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
-   struct brw_reg src = offset;
-   bool header_present = false;
-   int mlen = 1;
-
-   if (brw->gen >= 9) {
-      /* Skylake requires a message header in order to use SIMD4x2 mode. */
-      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
-      mlen = 2;
-      header_present = true;
-
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      brw_set_default_access_mode(p, BRW_ALIGN_1);
-
-      brw_MOV(p, get_element_ud(src, 2),
-              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
-      brw_pop_insn_state(p);
-   }
-
    if (surf_index.file == BRW_IMMEDIATE_VALUE) {
 
       brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_set_dest(p, insn, dst);
-      brw_set_src0(p, insn, src);
+      brw_set_src0(p, insn, offset);
       brw_set_sampler_message(p, insn,
                               surf_index.dw1.ud,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
-                              header_present,
+                              inst->mlen,
+                              inst->header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
 
@@ -1095,14 +1075,14 @@ 
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
 
       /* dst = send(offset, a0.0 | <descriptor>) */
       brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
+         p, BRW_SFID_SAMPLER, dst, offset, addr);
       brw_set_sampler_message(p, insn,
                               0 /* surface */,
                               0 /* sampler */,
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1 /* rlen */,
-                              mlen /* mlen */,
-                              header_present /* header */,
+                              inst->mlen,
+                              inst->header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
 
@@ -1113,6 +1093,23 @@ 
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
 }
 
 void
+vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
+                                                 struct brw_reg dst)
+{
+   /* Skylake requires a message header in order to use SIMD4x2 mode. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   brw_MOV(p, dst, retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 2),
+           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
+
+   brw_pop_insn_state(p);
+}
+
+void
 vec4_generator::generate_untyped_atomic(vec4_instruction *inst,
                                         struct brw_reg dst,
                                         struct brw_reg atomic_op,
@@ -1435,6 +1432,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
          generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
          break;
 
+      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+         generate_set_simd4x2_header_gen9(inst, dst);
+         break;
+
       case GS_OPCODE_URB_WRITE:
          generate_gs_urb_write(inst);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index ffbe04d..9714973 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1296,6 +1296,78 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
    }
 }
 
+/**
+ * Emits the instructions needed to perform a pull constant load. before_block
+ * and before_inst can be NULL in which case the instruction will be appended
+ * to the end of the instruction list.
+ */
+void
+vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
+                                          src_reg surf_index,
+                                          src_reg offset_reg,
+                                          bblock_t *before_block,
+                                          vec4_instruction *before_inst)
+{
+   assert((before_inst == NULL && before_block == NULL) ||
+          (before_inst && before_block));
+
+   vec4_instruction *pull;
+
+   if (brw->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      src_reg header(this, glsl_type::uvec4_type, 2);
+
+      pull = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(header));
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      dst_reg index_reg = retype(offset(dst_reg(header), 1),
+                                 offset_reg.type);
+      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+                                           dst,
+                                           surf_index,
+                                           src_reg(header));
+      pull->mlen = 2;
+      pull->header_present = true;
+   } else if (brw->gen >= 7) {
+      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
+
+      grf_offset.type = offset_reg.type;
+
+      emit(MOV(grf_offset, offset_reg));
+
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+                                           dst,
+                                           surf_index,
+                                           src_reg(grf_offset));
+      pull->mlen = 1;
+   } else {
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
+                                           dst,
+                                           surf_index,
+                                           offset_reg);
+      pull->base_mrf = 14;
+      pull->mlen = 1;
+   }
+
+   if (before_inst)
+      emit_before(before_block, before_inst, pull);
+   else
+      emit(pull);
+}
+
 void
 vec4_visitor::visit(ir_expression *ir)
 {
@@ -1774,36 +1846,10 @@ vec4_visitor::visit(ir_expression *ir)
          emit(SHR(dst_reg(offset), op[1], src_reg(4)));
       }
 
-      if (brw->gen >= 7) {
-         dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
-
-         /* We have to use a message header on Skylake to get SIMD4x2 mode.
-          * Reserve space for the register.
-          */
-         if (brw->gen >= 9) {
-            grf_offset.reg_offset++;
-            alloc.sizes[grf_offset.reg] = 2;
-         }
-
-         grf_offset.type = offset.type;
-
-         emit(MOV(grf_offset, offset));
-
-         vec4_instruction *pull =
-            emit(new(mem_ctx) 
vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
-                                               dst_reg(packed_consts),
-                                               surf_index,
-                                               src_reg(grf_offset)));
-         pull->mlen = 1;
-      } else {
-         vec4_instruction *pull =
-            emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
-                                               dst_reg(packed_consts),
-                                               surf_index,
-                                               offset));
-         pull->base_mrf = 14;
-         pull->mlen = 1;
-      }
+      emit_pull_constant_load_reg(dst_reg(packed_consts),
+                                  surf_index,
+                                  offset,
+                                  NULL, NULL /* before_block/inst */);
 
       packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
@@ -3475,32 +3521,11 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, 
vec4_instruction *inst,
    src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
                                              reg_offset);
-   vec4_instruction *load;
-
-   if (brw->gen >= 7) {
-      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
 
-      /* We have to use a message header on Skylake to get SIMD4x2 mode.
-       * Reserve space for the register.
-       */
-      if (brw->gen >= 9) {
-         grf_offset.reg_offset++;
-         alloc.sizes[grf_offset.reg] = 2;
-      }
-
-      grf_offset.type = offset.type;
-      emit_before(block, inst, MOV(grf_offset, offset));
-
-      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
-                                           temp, index, src_reg(grf_offset));
-      load->mlen = 1;
-   } else {
-      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
-                                           temp, index, offset);
-      load->base_mrf = 14;
-      load->mlen = 1;
-   }
-   emit_before(block, inst, load);
+   emit_pull_constant_load_reg(temp,
+                               index,
+                               offset,
+                               block, inst);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
index c3b0233..8756bef 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
@@ -528,14 +528,6 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register 
&src)
          /* Add the small constant index to the address register */
          src_reg reladdr = src_reg(this, glsl_type::int_type);
 
-         /* We have to use a message header on Skylake to get SIMD4x2 mode.
-          * Reserve space for the register.
-          */
-         if (brw->gen >= 9) {
-            reladdr.reg_offset++;
-            alloc.sizes[reladdr.reg] = 2;
-         }
-
          dst_reg dst_reladdr = dst_reg(reladdr);
          dst_reladdr.writemask = WRITEMASK_X;
          emit(ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index)));
@@ -553,20 +545,11 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register 
&src)
 
          result = src_reg(this, glsl_type::vec4_type);
          src_reg surf_index = 
src_reg(unsigned(prog_data->base.binding_table.pull_constants_start));
-         vec4_instruction *load;
-         if (brw->gen >= 7) {
-            load = new(mem_ctx)
-               vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
-                                dst_reg(result), surf_index, reladdr);
-            load->mlen = 1;
-         } else {
-            load = new(mem_ctx)
-               vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
-                                dst_reg(result), surf_index, reladdr);
-            load->base_mrf = 14;
-            load->mlen = 1;
-         }
-         emit(load);
+
+         emit_pull_constant_load_reg(dst_reg(result),
+                                     surf_index,
+                                     reladdr,
+                                     NULL, NULL /* before_block/inst */);
          break;
       }
 
-- 
1.9.3

_______________________________________________
mesa-dev mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965/skl: Add the header for constant loads outside of the generator

Reply via email to