On Mon, Sep 29, 2014 at 9:50 AM, Matt Turner <matts...@gmail.com> wrote:
> On Fri, Sep 26, 2014 at 12:24 PM, Jason Ekstrand <ja...@jlekstrand.net> > wrote: > > This commit reworks copy propagation a bit to support propagating the > > copying of partial registers. This comes up every time we have pull > > constants because we do a pull constant read immediately followed by a > move > > to splat the one component of the out to 8 or 16-wide. This allows us to > > eliminate the copy and simply use the one component of the register. > > > > Shader DB results: > > > > total instructions in shared programs: 5044937 -> 5044428 (-0.01%) > > instructions in affected programs: 66112 -> 65603 (-0.77%) > > GAINED: 0 > > LOST: 0 > > > > Signed-off-by: Jason Ekstrand <jason.ekstr...@intel.com> > > --- > > src/mesa/drivers/dri/i965/brw_fs.h | 1 + > > .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 83 > ++++++++++++++++------ > > 2 files changed, 64 insertions(+), 20 deletions(-) > > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > > index 50b5fc1..9b63114 100644 > > --- a/src/mesa/drivers/dri/i965/brw_fs.h > > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > > @@ -337,6 +337,7 @@ public: > > bool opt_cse_local(bblock_t *block); > > bool opt_copy_propagate(); > > bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); > > + bool try_constant_propagate(fs_inst *inst, acp_entry *entry); > > bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block, > > exec_list *acp); > > void opt_drop_redundant_mov_to_flags(); > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > > index e5816df..a97dc04 100644 > > --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > > @@ -277,24 +277,30 @@ is_logic_op(enum opcode opcode) > > bool > > fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) > > { > > + if (inst->src[arg].file != GRF) > > + return false; > > + > > if (entry->src.file == IMM) > > return false; > > + assert(entry->src.file == GRF || entry->src.file == UNIFORM); > > > > if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && > > inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) > > return false; > > > > - /* Bail if inst is reading more than entry is writing. */ > > - if ((inst->regs_read(this, arg) * inst->src[arg].stride * > > - type_sz(inst->src[arg].type)) > type_sz(entry->dst.type)) > > + assert(entry->dst.file == GRF); > > + if (inst->src[arg].reg != entry->dst.reg) > > return false; > > > > - if (inst->src[arg].file != entry->dst.file || > > - inst->src[arg].reg != entry->dst.reg || > > - inst->src[arg].reg_offset != entry->dst.reg_offset || > > - inst->src[arg].subreg_offset != entry->dst.subreg_offset) { > > + /* Bail if inst is reading a range that isn't contained in the range > > + * that entry is writing. > > + */ > > + int reg_size = dispatch_width * sizeof(float); > > + if (inst->src[arg].reg_offset < entry->dst.reg_offset || > > + (inst->src[arg].reg_offset * reg_size + > inst->src[arg].subreg_offset + > > + inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) > > > + (entry->dst.reg_offset + 1) * reg_size) > > return false; > > - } > > > > /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */ > > if (inst->conditional_mod && > > @@ -361,11 +367,39 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int > arg, acp_entry *entry) > > > > inst->src[arg].file = entry->src.file; > > inst->src[arg].reg = entry->src.reg; > > - inst->src[arg].reg_offset = entry->src.reg_offset; > > - inst->src[arg].subreg_offset = entry->src.subreg_offset; > > inst->src[arg].stride *= entry->src.stride; > > inst->saturate = inst->saturate || entry->saturate; > > > > + switch (entry->src.file) { > > + case BAD_FILE: > > + case HW_REG: > > + case UNIFORM: > > + inst->src[arg].reg_offset = entry->src.reg_offset; > > + inst->src[arg].subreg_offset = entry->src.subreg_offset; > > + break; > > + case GRF: > > + { > > + /* In this case, we have to deal with mapping parts of vgrfs to > > + * other parts of vgrfs so we have to do some reg_offset magic. > > + */ > > + > > + /* Compute the offset of inst->src[arg] relative to inst->dst > */ > > + assert(entry->dst.subreg_offset == 0); > > + int rel_offset = inst->src[arg].reg_offset - > entry->dst.reg_offset; > > + int rel_suboffset = inst->src[arg].subreg_offset; > > + > > + /* Compute the final register offset (in bytes) */ > > + int offset = entry->src.reg_offset * reg_size + > entry->src.subreg_offset; > > + offset += rel_offset * reg_size + rel_suboffset; > > + inst->src[arg].reg_offset = offset / reg_size; > > + inst->src[arg].subreg_offset = offset % reg_size; > > + } > > + break; > > + default: > > + unreachable("Invalid register file"); > > + break; > > + } > > + > > if (!inst->src[arg].abs) { > > inst->src[arg].abs = entry->src.abs; > > inst->src[arg].negate ^= entry->src.negate; > > @@ -375,9 +409,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int > arg, acp_entry *entry) > > } > > > > > > -static bool > > -try_constant_propagate(struct brw_context *brw, fs_inst *inst, > > - acp_entry *entry) > > +bool > > +fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) > > { > > bool progress = false; > > > > @@ -385,12 +418,22 @@ try_constant_propagate(struct brw_context *brw, > fs_inst *inst, > > return false; > > > > for (int i = inst->sources - 1; i >= 0; i--) { > > - if (inst->src[i].file != entry->dst.file || > > - inst->src[i].reg != entry->dst.reg || > > - inst->src[i].reg_offset != entry->dst.reg_offset || > > - inst->src[i].subreg_offset != entry->dst.subreg_offset || > > - inst->src[i].type != entry->dst.type || > > - inst->src[i].stride > 1) > > + if (inst->src[i].file != GRF) > > + continue; > > + > > + assert(entry->dst.file == GRF); > > + if (inst->src[i].reg != entry->dst.reg || > > + inst->src[i].type != entry->dst.type) > > + continue; > > + > > + /* Bail if inst is reading a range that isn't contained in the > range > > + * that entry is writing. > > + */ > > + int reg_size = dispatch_width * sizeof(float); > > + if (inst->src[i].reg_offset < entry->dst.reg_offset || > > + (inst->src[i].reg_offset * reg_size + > inst->src[i].subreg_offset + > > + inst->regs_read(this, i) * inst->src[i].stride * reg_size) > > > + (entry->dst.reg_offset + 1) * reg_size) > > continue; > > Does this bit, the constant propagation code, actually benefit from > this? I guess it'd let an immediate loaded with a mov(16) be constant > propagated into two SIMD8 instructions? That seems plausible, I was > just a little surprised at first. > Yes, that's exactly what it's for
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev