On Mon, 2016-12-19 at 13:58 -0800, Francisco Jerez wrote: > Iago Toral Quiroga <ito...@igalia.com> writes: > > > From: "Juan A. Suarez Romero" <jasua...@igalia.com> > > > > Our current data flow analysis does not take into account that > > channels > > on 64-bit operands are 64-bit. This is a problem when the same > > register > > is accessed using both 64-bit and 32-bit channels. This is very > > common > > in operations where we need to access 64-bit data in 32-bit chunks, > > such as the double packing and packing operations. > > > > This patch changes the analysis by checking the bits that each > > source > > or destination datatype needs. Actually, rather than bits, we use > > blocks of 32bits, which is the minimum channel size. > > > > Because a vgrf can contain a dvec4 (256 bits), we reserve 8 > > 32-bit blocks to map the channels. > > > > v2 (Curro): > > - Simplify code by making the var_from_reg helpers take an extra > > argument with the register component we want. > > - Fix a couple of cases where we had to update the code to the > > new > > way of representing live variables. > > --- > > src/mesa/drivers/dri/i965/brw_vec4.cpp | 2 +- > > src/mesa/drivers/dri/i965/brw_vec4_cse.cpp | 2 +- > > .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 25 +++++++++- > > ------- > > .../drivers/dri/i965/brw_vec4_live_variables.cpp | 32 > > +++++++++++----------- > > .../drivers/dri/i965/brw_vec4_live_variables.h | 15 ++++++---- > > 5 files changed, 42 insertions(+), 34 deletions(-) > > > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > > index 3191eab..34cab04 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > > @@ -1140,7 +1140,7 @@ vec4_visitor::opt_register_coalesce() > > /* Can't coalesce this GRF if someone else was going to > > * read it later. > > */ > > - if (var_range_end(var_from_reg(alloc, dst_reg(inst- > > >src[0])), 4) > ip) > > + if (var_range_end(var_from_reg(alloc, dst_reg(inst- > > >src[0])), 8) > ip) > > continue; > > > > /* We need to check interference with the final destination > > between this > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > > index 1b91db9..bef897a 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > > @@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block) > > * more -- a sure sign they'll fail operands_match(). > > */ > > if (src->file == VGRF) { > > - if (var_range_end(var_from_reg(alloc, > > dst_reg(*src)), 4) < ip) { > > + if (var_range_end(var_from_reg(alloc, > > dst_reg(*src)), 8) < ip) { > > entry->remove(); > > ralloc_free(entry); > > break; > > diff --git > > a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > index 950c6c8..6a80810 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > @@ -57,12 +57,13 @@ vec4_visitor::dead_code_eliminate() > > if ((inst->dst.file == VGRF && !inst->has_side_effects()) > > || > > (inst->dst.is_null() && inst->writes_flag())){ > > bool result_live[4] = { false }; > > - > > if (inst->dst.file == VGRF) { > > - for (unsigned i = 0; i < regs_written(inst); i++) { > > - for (int c = 0; c < 4; c++) > > - result_live[c] |= BITSET_TEST( > > - live, var_from_reg(alloc, offset(inst- > > >dst, i), c)); > > + for (unsigned i = 0; i < 2 * regs_written(inst); > > i++) { > > One of the issues we discussed in the past about this approach is > that > it would overestimate the number of register OWORDs accessed by > instructions with size_written < REG_SIZE (or size_read(i) < > REG_SIZE), > which will be emitted by the SIMD lowering pass. Now that the amount > of > data read and written by instructions is represented in byte units > you > can avoid this problem by using DIV_ROUND_UP(inst->size_written, 16) > instead of 2 * regs_written(inst) above. > > > + for (int c = 0; c < 4; c++) { > > + const unsigned v = > > + var_from_reg(alloc, inst->dst, c, i); > > + result_live[c] |= BITSET_TEST(live, v); > > + } > > } > > } else { > > for (unsigned c = 0; c < 4; c++) > > @@ -111,11 +112,12 @@ vec4_visitor::dead_code_eliminate() > > > > if (inst->dst.file == VGRF && !inst->predicate && > > !inst->is_align1_partial_write()) { > > - for (unsigned i = 0; i < regs_written(inst); i++) { > > + for (unsigned i = 0; i < 2 * regs_written(inst); i++) > > { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > - BITSET_CLEAR(live, var_from_reg(alloc, > > - offset(inst- > > >dst, i), c)); > > + const unsigned v = > > + var_from_reg(alloc, inst->dst, c, i); > > + BITSET_CLEAR(live, v); > > } > > } > > } > > @@ -133,10 +135,11 @@ vec4_visitor::dead_code_eliminate() > > > > for (int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > - for (unsigned j = 0; j < regs_read(inst, i); j++) { > > + for (unsigned j = 0; j < 2 * regs_read(inst, i); > > j++) { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > - BITSET_SET(live, var_from_reg(alloc, > > - offset(inst- > > >src[i], j), c)); > > + const unsigned v = > > + var_from_reg(alloc, inst->src[i], c, j); > > + BITSET_SET(live, v); > > } > > } > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > index 20344ed..4827798 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > @@ -76,10 +76,10 @@ vec4_live_variables::setup_def_use() > > /* Set use[] for this instruction */ > > for (unsigned int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > - for (unsigned j = 0; j < regs_read(inst, i); j++) { > > + for (unsigned j = 0; j < 2 * regs_read(inst, i); > > j++) { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > const unsigned v = > > - var_from_reg(alloc, offset(inst->src[i], > > j), c); > > + var_from_reg(alloc, inst->src[i], c, j); > > if (!BITSET_TEST(bd->def, v)) > > BITSET_SET(bd->use, v); > > } > > @@ -99,11 +99,11 @@ vec4_live_variables::setup_def_use() > > */ > > if (inst->dst.file == VGRF && > > (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) > > { > > - for (unsigned i = 0; i < regs_written(inst); i++) { > > + for (unsigned i = 0; i < 2 * regs_written(inst); i++) > > { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > const unsigned v = > > - var_from_reg(alloc, offset(inst->dst, i), > > c); > > + var_from_reg(alloc, inst->dst, c, i); > > if (!BITSET_TEST(bd->use, v)) > > BITSET_SET(bd->def, v); > > } > > @@ -188,7 +188,7 @@ vec4_live_variables::vec4_live_variables(const > > simple_allocator &alloc, > > { > > mem_ctx = ralloc_context(NULL); > > > > - num_vars = alloc.total_size * 4; > > + num_vars = alloc.total_size * 8; > > block_data = rzalloc_array(mem_ctx, struct block_data, cfg- > > >num_blocks); > > > > bitset_words = BITSET_WORDS(num_vars); > > @@ -238,14 +238,14 @@ vec4_visitor::calculate_live_intervals() > > if (this->live_intervals) > > return; > > > > - int *start = ralloc_array(mem_ctx, int, this->alloc.total_size > > * 4); > > - int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * > > 4); > > + int *start = ralloc_array(mem_ctx, int, this->alloc.total_size > > * 8); > > + int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * > > 8); > > ralloc_free(this->virtual_grf_start); > > ralloc_free(this->virtual_grf_end); > > this->virtual_grf_start = start; > > this->virtual_grf_end = end; > > > > - for (unsigned i = 0; i < this->alloc.total_size * 4; i++) { > > + for (unsigned i = 0; i < this->alloc.total_size * 8; i++) { > > start[i] = MAX_INSTRUCTION; > > end[i] = -1; > > } > > @@ -257,10 +257,10 @@ vec4_visitor::calculate_live_intervals() > > foreach_block_and_inst(block, vec4_instruction, inst, cfg) { > > for (unsigned int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > - for (unsigned j = 0; j < regs_read(inst, i); j++) { > > + for (unsigned j = 0; j < 2 * regs_read(inst, i); j++) > > { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > const unsigned v = > > - var_from_reg(alloc, offset(inst->src[i], j), > > c); > > + var_from_reg(alloc, inst->src[i], c, j); > > start[v] = MIN2(start[v], ip); > > end[v] = ip; > > } > > @@ -269,11 +269,11 @@ vec4_visitor::calculate_live_intervals() > > } > > > > if (inst->dst.file == VGRF) { > > - for (unsigned i = 0; i < regs_written(inst); i++) { > > + for (unsigned i = 0; i < 2 * regs_written(inst); i++) { > > Same rounding issue as above. > > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > const unsigned v = > > - var_from_reg(alloc, offset(inst->dst, i), c); > > + var_from_reg(alloc, inst->dst, c, i); > > start[v] = MIN2(start[v], ip); > > end[v] = ip; > > } > > @@ -340,8 +340,8 @@ vec4_visitor::var_range_end(unsigned v, > > unsigned n) const > > bool > > vec4_visitor::virtual_grf_interferes(int a, int b) > > { > > - return !((var_range_end(4 * alloc.offsets[a], 4 * > > alloc.sizes[a]) <= > > - var_range_start(4 * alloc.offsets[b], 4 * > > alloc.sizes[b])) || > > - (var_range_end(4 * alloc.offsets[b], 4 * > > alloc.sizes[b]) <= > > - var_range_start(4 * alloc.offsets[a], 4 * > > alloc.sizes[a]))); > > + return !((var_range_end(8 * alloc.offsets[a], 8 * > > alloc.sizes[a]) <= > > + var_range_start(8 * alloc.offsets[b], 8 * > > alloc.sizes[b])) || > > + (var_range_end(8 * alloc.offsets[b], 8 * > > alloc.sizes[b]) <= > > + var_range_start(8 * alloc.offsets[a], 8 * > > alloc.sizes[a]))); > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > index 2fbcaa1..26a074f 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > @@ -78,23 +78,28 @@ protected: > > void *mem_ctx; > > }; > > > > +/* Returns the variable index for the k-th dword of the c-th > > component of > > + * register reg */ > > Non-standard comment formatting. > > > inline unsigned > > var_from_reg(const simple_allocator &alloc, const src_reg ®, > > - unsigned c = 0) > > + unsigned c = 0, unsigned k = 0) > > Odd indentation. > > > { > > assert(reg.file == VGRF && reg.nr < alloc.count && > > reg.offset / REG_SIZE < alloc.sizes[reg.nr] && c < 4); > > You could simplify this to "assert(reg.file == VGRF && reg.nr < > alloc.count && c < 4)", and move the rest of the assertion after the > calculation so you can make sure that the result (including terms > dependent on k) is less than "8 * alloc.sizes[reg.nr]". > > > - return (4 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + > > - BRW_GET_SWZ(reg.swizzle, c)); > > + const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); > > + return 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + > > + (BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + > > k % csize; > > } > > > > inline unsigned > > var_from_reg(const simple_allocator &alloc, const dst_reg ®, > > - unsigned c = 0) > > + unsigned c = 0, unsigned k = 0) > > Odd indentation. > > > { > > assert(reg.file == VGRF && reg.nr < alloc.count && > > reg.offset / REG_SIZE < alloc.sizes[reg.nr] && c < 4); > > Same suggestion as for your last assert. > > > - return 4 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + c; > > + const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); > > + return 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE) + > > + (c + k / csize * 4) * csize + k % csize; > > } > > > > } /* namespace brw */ > > -- > > 2.7.4 >
Thanks for the suggestions. We are going to apply them. As these are minor changes, can we consider this commit as R-b with them? BR, J.A. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev