vs: Add support for pull constant loads for uniform arrays.

Kenneth Graunke Wed, 24 Aug 2011 11:24:51 -0700

On 08/23/2011 06:25 PM, Eric Anholt wrote:

---
  src/mesa/drivers/dri/i965/brw_defines.h        |    1 +
  src/mesa/drivers/dri/i965/brw_vec4.h           |   11 ++
  src/mesa/drivers/dri/i965/brw_vec4_emit.cpp    |   51 +++++++++-
  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp |  133 +++++++++++++++++++++++-
  4 files changed, 192 insertions(+), 4 deletions(-)


Some suggestions for tidying up the patch below.  Otherwise looks good.

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index d1799c0..5f34939 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -643,6 +643,7 @@ enum opcode {
     VS_OPCODE_URB_WRITE,
     VS_OPCODE_SCRATCH_READ,
     VS_OPCODE_SCRATCH_WRITE,
+   VS_OPCODE_PULL_CONSTANT_LOAD,
  };

  #define BRW_PREDICATE_NONE             0
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h
index 1db910e..5a4ba0f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -364,6 +364,7 @@ public:
      */
     dst_reg output_reg[VERT_RESULT_MAX];
     int uniform_size[MAX_UNIFORMS];
+   int uniform_vector_size[MAX_UNIFORMS];
     int uniforms;

     struct hash_table *variable_ht;
@@ -380,6 +381,7 @@ public:
     void reg_allocate_trivial();
     void reg_allocate();
     void move_grf_array_access_to_scratch();
+   void move_uniform_array_access_to_pull_constants();
     void calculate_live_intervals();
     bool dead_code_eliminate();
     bool virtual_grf_interferes(int a, int b);
@@ -439,6 +441,8 @@ public:

     src_reg get_scratch_offset(vec4_instruction *inst,
                              src_reg *reladdr, int reg_offset);
+   src_reg get_pull_constant_offset(vec4_instruction *inst,
+                                   src_reg *reladdr, int reg_offset);
     void emit_scratch_read(vec4_instruction *inst,
                          dst_reg dst,
                          src_reg orig_src,
@@ -447,6 +451,10 @@ public:
                           src_reg temp,
                           dst_reg orig_dst,
                           int base_offset);
+   void emit_pull_constant_load(vec4_instruction *inst,
+                               dst_reg dst,
+                               src_reg orig_src,
+                               int base_offset);

     GLboolean try_emit_sat(ir_expression *ir);

@@ -482,6 +490,9 @@ public:
     void generate_scratch_read(vec4_instruction *inst,
                              struct brw_reg dst,
                              struct brw_reg index);
+   void generate_pull_constant_load(vec4_instruction *inst,
+                                   struct brw_reg dst,
+                                   struct brw_reg index);
  };

  } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 794f499..d77b2c6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -114,7 +114,7 @@ vec4_visitor::setup_uniforms(int reg)
      * matter what, or the GPU would hang.
      */
     if (intel->gen<  6&&  this->uniforms == 0) {
-      this->uniform_size[this->uniforms] = 1;
+      this->uniform_vector_size[this->uniforms] = 1;

        for (unsigned int i = 0; i<  4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
@@ -229,6 +229,9 @@ vec4_instruction::get_src(int i)
         brw_reg = brw_abs(brw_reg);
        if (src[i].negate)
         brw_reg = negate(brw_reg);
+
+      /* This should have been moved to pull constants. */
+      assert(!src[i].reladdr);
        break;

     case HW_REG:
@@ -488,6 +491,47 @@ vec4_visitor::generate_scratch_write(vec4_instruction 
*inst,
  }

  void
+vec4_visitor::generate_pull_constant_load(vec4_instruction *inst,
+                                         struct brw_reg dst,
+                                         struct brw_reg index)
+{
+   if (intel->gen>= 6) {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p,
+             retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_D),
+             retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D));
+      brw_pop_insn_state(p);
+   }


I think you mean:

   gen6_resolve_implied_move(p, brw_vec8_grf(0, 0), inst->base_mrf);

+   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
+          index);
+
+   uint32_t msg_type;
+
+   if (intel->gen>= 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, brw_message_reg(inst->base_mrf));
+   brw_set_dp_read_message(p, send,
+                          SURF_INDEX_VERT_CONST_BUFFER,
+                          BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                          msg_type,
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                          2, /* mlen */
+                          1 /* rlen */);
+}
+
+void
  vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
                                      struct brw_reg dst,
                                      struct brw_reg *src)
@@ -529,6 +573,10 @@ vec4_visitor::generate_vs_instruction(vec4_instruction 
*instruction,
        generate_scratch_write(inst, dst, src[0], src[1]);
        break;

+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+      generate_pull_constant_load(inst, dst, src[0]);
+      break;
+
     default:
        if (inst->opcode<  (int)ARRAY_SIZE(brw_opcodes)) {
         fail("unsupported opcode in `%s' in VS\n",
@@ -556,6 +604,7 @@ vec4_visitor::run()
      * often do repeated subexpressions for those.
      */
     move_grf_array_access_to_scratch();
+   move_uniform_array_access_to_pull_constants();

     bool progress;
     do {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 46f826c..b1266c1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -372,7 +372,10 @@ vec4_visitor::setup_uniform_values(int loc, const 
glsl_type *type)
         c->prog_data.param[this->uniforms * 4 + i] =&zero;
        }

-      this->uniform_size[this->uniforms] = type->vector_elements;
+      /* Track the size of this uniform vector, for future packing of
+       * uniforms.
+       */
+      this->uniform_vector_size[this->uniforms] = type->vector_elements;
        this->uniforms++;

        return 1;
@@ -420,7 +423,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
                                            (gl_state_index *)slots[i].tokens);
        float *values =&this->vp->Base.Parameters->ParameterValues[index][0].f;

-      this->uniform_size[this->uniforms] = 0;
+      this->uniform_vector_size[this->uniforms] = 0;
        /* Add each of the unique swizzled channels of the element.
         * This will end up matching the size of the glsl_type of this field.
         */
@@ -431,7 +434,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)

         c->prog_data.param[this->uniforms * 4 + j] =&values[swiz];
         if (swiz<= last_swiz)
-           this->uniform_size[this->uniforms]++;
+           this->uniform_vector_size[this->uniforms]++;
        }
        this->uniforms++;
     }
@@ -668,6 +671,11 @@ vec4_visitor::visit(ir_variable *ir)
     case ir_var_uniform:
        reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

+      /* Track how big the whole uniform variable is, in case we need to put a
+       * copy of its data into pull constants for array access.
+       */
+      this->uniform_size[this->uniforms] = type_size(ir->type);
+
        if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
        } else {
@@ -1938,6 +1946,38 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst,
     }
  }

+src_reg
+vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
+                                      src_reg *reladdr, int reg_offset)
+{
+   if (reladdr) {
+      src_reg index = src_reg(this, glsl_type::int_type);
+
+      vec4_instruction *add = emit(BRW_OPCODE_ADD,
+                                  dst_reg(index),
+                                  *reladdr,
+                                  src_reg(reg_offset));
+      /* Move our new instruction from the tail to its correct place. */
+      add->remove();
+      inst->insert_before(add);


This is ugly.  Can we please make a vec4_instruction constructor:

vec4_instruction(enum opcode opcode, dst_reg dst,
                 src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
}

update vec4_visitor::emit to use it, and then make this:

vec4_instruction *add =
   new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst_reg(index),
                                 *reladdr, src_reg(reg_offset));
add->ir = this->base_ir;
inst->insert_before(add);

(maybe base_ir should be passed to the constructor too; not sure)

+      /* Pre-gen6, the message header uses byte offsets instead of vec4
+       * (16-byte) offset units.
+       */
+      if (intel->gen<  6) {
+        vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
+                                     index, src_reg(16));
+        mul->remove();
+        inst->insert_before(mul);
+      }

Ditto. Alternatively, maybe make an emit_before() helper that doesinsert_before() instead of push_tail().

+      return index;
+   } else {
+      int message_header_scale = intel->gen<  6 ? 16 : 1;
+      return src_reg(reg_offset * message_header_scale);
+   }
+}
+
  /**
   * Emits an instruction before @inst to load the value named by @orig_src
   * from scratch space at @base_offset to @temp.
@@ -2063,6 +2103,93 @@ vec4_visitor::move_grf_array_access_to_scratch()
     }
  }

+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from the pull constant buffer (surface) at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
+                                     dst_reg temp, src_reg orig_src,
+                                     int base_offset)
+{
+   int reg_offset = base_offset + orig_src.reg_offset;
+   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, 
reg_offset);
+
+   vec4_instruction *const_load_inst = emit(VS_OPCODE_PULL_CONSTANT_LOAD,
+                                           temp, index);
+
+   const_load_inst->base_mrf = 14;
+   const_load_inst->mlen = 1;
+   /* Move our instruction from the tail to its correct place. */
+   const_load_inst->remove();
+   inst->insert_before(const_load_inst);


Ditto.  emit_before()...

+}
+
+/**
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants.  In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];
+
+   for (int i = 0; i<  this->uniforms; i++) {
+      pull_constant_loc[i] = -1;
+   }
+
+   /* Walk through and find array access of uniforms.  Put a copy of that
+    * uniform in the pull constant buffer.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_list_safe(node,&this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      for (int i = 0 ; i<  3; i++) {
+        if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+           continue;
+
+        int uniform = inst->src[i].reg;
+
+        /* If this array isn't already present in the pull constant buffer,
+         * add it.
+         */
+        if (pull_constant_loc[uniform] == -1) {
+           const float **values =&prog_data->param[uniform * 4];
+
+           pull_constant_loc[uniform] = prog_data->nr_pull_params;
+
+           for (int j = 0; j<  uniform_size[uniform] * 4; j++) {
+              prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
+           }
+        }
+
+        /* Set up the annotation tracking for new generated instructions. */
+        base_ir = inst->ir;
+        current_annotation = inst->annotation;
+
+        dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+        emit_pull_constant_load(inst, temp, inst->src[i],
+                                pull_constant_loc[uniform]);
+
+        inst->src[i].file = temp.file;
+        inst->src[i].reg = temp.reg;
+        inst->src[i].reg_offset = temp.reg_offset;
+        inst->src[i].reladdr = NULL;
+      }
+   }
+}

  vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
                           struct gl_shader_program *prog,


_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 06/10] i965/vs: Add support for pull constant loads for uniform arrays.

Reply via email to