Re: [Mesa-dev] [PATCH v2 043/103] i965/vec4: handle 32 and 64 bit channels in liveness analysis

2016-12-20 Thread Francisco Jerez
"Juan A. Suarez Romero"  writes:

> On Mon, 2016-12-19 at 13:58 -0800, Francisco Jerez wrote:
>> Iago Toral Quiroga  writes:
>> 
>> > From: "Juan A. Suarez Romero" 
>> > 
>> > Our current data flow analysis does not take into account that
>> > channels
>> > on 64-bit operands are 64-bit. This is a problem when the same
>> > register
>> > is accessed using both 64-bit and 32-bit channels. This is very
>> > common
>> > in operations where we need to access 64-bit data in 32-bit chunks,
>> > such as the double packing and packing operations.
>> > 
>> > This patch changes the analysis by checking the bits that each
>> > source
>> > or destination datatype needs. Actually, rather than bits, we use
>> > blocks of 32bits, which is the minimum channel size.
>> > 
>> > Because a vgrf can contain a dvec4 (256 bits), we reserve 8
>> > 32-bit blocks to map the channels.
>> > 
>> > v2 (Curro):
>> >   - Simplify code by making the var_from_reg helpers take an extra
>> > argument with the register component we want.
>> >   - Fix a couple of cases where we had to update the code to the
>> > new
>> > way of representing live variables.
>> > ---
>> >  src/mesa/drivers/dri/i965/brw_vec4.cpp |  2 +-
>> >  src/mesa/drivers/dri/i965/brw_vec4_cse.cpp |  2 +-
>> >  .../dri/i965/brw_vec4_dead_code_eliminate.cpp  | 25 +-
>> > ---
>> >  .../drivers/dri/i965/brw_vec4_live_variables.cpp   | 32
>> > +++---
>> >  .../drivers/dri/i965/brw_vec4_live_variables.h | 15 ++
>> >  5 files changed, 42 insertions(+), 34 deletions(-)
>> > 
>> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> > b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> > index 3191eab..34cab04 100644
>> > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> > @@ -1140,7 +1140,7 @@ vec4_visitor::opt_register_coalesce()
>> >    /* Can't coalesce this GRF if someone else was going to
>> > * read it later.
>> > */
>> > -  if (var_range_end(var_from_reg(alloc, dst_reg(inst-
>> > >src[0])), 4) > ip)
>> > +  if (var_range_end(var_from_reg(alloc, dst_reg(inst-
>> > >src[0])), 8) > ip)
>> >     continue;
>> >  
>> >    /* We need to check interference with the final destination
>> > between this
>> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
>> > b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
>> > index 1b91db9..bef897a 100644
>> > --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
>> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
>> > @@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
>> >   * more -- a sure sign they'll fail operands_match().
>> >   */
>> >  if (src->file == VGRF) {
>> > -   if (var_range_end(var_from_reg(alloc,
>> > dst_reg(*src)), 4) < ip) {
>> > +   if (var_range_end(var_from_reg(alloc,
>> > dst_reg(*src)), 8) < ip) {
>> >    entry->remove();
>> >    ralloc_free(entry);
>> >    break;
>> > diff --git
>> > a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
>> > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
>> > index 950c6c8..6a80810 100644
>> > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
>> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
>> > @@ -57,12 +57,13 @@ vec4_visitor::dead_code_eliminate()
>> >   if ((inst->dst.file == VGRF && !inst->has_side_effects()) 
>> > ||
>> >   (inst->dst.is_null() && inst->writes_flag())){
>> >  bool result_live[4] = { false };
>> > -
>> >  if (inst->dst.file == VGRF) {
>> > -   for (unsigned i = 0; i < regs_written(inst); i++) {
>> > -  for (int c = 0; c < 4; c++)
>> > - result_live[c] |= BITSET_TEST(
>> > -live, var_from_reg(alloc, offset(inst-
>> > >dst, i), c));
>> > +   for (unsigned i = 0; i < 2 * regs_written(inst);
>> > i++) {
>> 
>> One of the issues we discussed in the past about this approach is
>> that
>> it would overestimate the number of register OWORDs accessed by
>> instructions with size_written < REG_SIZE (or size_read(i) <
>> REG_SIZE),
>> which will be emitted by the SIMD lowering pass.  Now that the amount
>> of
>> data read and written by instructions is represented in byte units
>> you
>> can avoid this problem by using DIV_ROUND_UP(inst->size_written, 16)
>> instead of 2 * regs_written(inst) above.
>> 
>> > +  for (int c = 0; c < 4; c++) {
>> > + const unsigned v =
>> > +var_from_reg(alloc, inst->dst, c, i);
>> > + result_live[c] |= BITSET_TEST(live, v);
>> > +  }
>> > }
>> >  } else {
>> > for (unsigned c = 0; c < 4; c++)
>> > @@ 

Re: [Mesa-dev] [PATCH v2 043/103] i965/vec4: handle 32 and 64 bit channels in liveness analysis

2016-12-20 Thread Juan A. Suarez Romero
On Mon, 2016-12-19 at 13:58 -0800, Francisco Jerez wrote:
> Iago Toral Quiroga  writes:
> 
> > From: "Juan A. Suarez Romero" 
> > 
> > Our current data flow analysis does not take into account that
> > channels
> > on 64-bit operands are 64-bit. This is a problem when the same
> > register
> > is accessed using both 64-bit and 32-bit channels. This is very
> > common
> > in operations where we need to access 64-bit data in 32-bit chunks,
> > such as the double packing and packing operations.
> > 
> > This patch changes the analysis by checking the bits that each
> > source
> > or destination datatype needs. Actually, rather than bits, we use
> > blocks of 32bits, which is the minimum channel size.
> > 
> > Because a vgrf can contain a dvec4 (256 bits), we reserve 8
> > 32-bit blocks to map the channels.
> > 
> > v2 (Curro):
> >   - Simplify code by making the var_from_reg helpers take an extra
> > argument with the register component we want.
> >   - Fix a couple of cases where we had to update the code to the
> > new
> > way of representing live variables.
> > ---
> >  src/mesa/drivers/dri/i965/brw_vec4.cpp |  2 +-
> >  src/mesa/drivers/dri/i965/brw_vec4_cse.cpp |  2 +-
> >  .../dri/i965/brw_vec4_dead_code_eliminate.cpp  | 25 +-
> > ---
> >  .../drivers/dri/i965/brw_vec4_live_variables.cpp   | 32
> > +++---
> >  .../drivers/dri/i965/brw_vec4_live_variables.h | 15 ++
> >  5 files changed, 42 insertions(+), 34 deletions(-)
> > 
> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> > b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> > index 3191eab..34cab04 100644
> > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> > @@ -1140,7 +1140,7 @@ vec4_visitor::opt_register_coalesce()
> >    /* Can't coalesce this GRF if someone else was going to
> > * read it later.
> > */
> > -  if (var_range_end(var_from_reg(alloc, dst_reg(inst-
> > >src[0])), 4) > ip)
> > +  if (var_range_end(var_from_reg(alloc, dst_reg(inst-
> > >src[0])), 8) > ip)
> >      continue;
> >  
> >    /* We need to check interference with the final destination
> > between this
> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> > b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> > index 1b91db9..bef897a 100644
> > --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> > @@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
> >   * more -- a sure sign they'll fail operands_match().
> >   */
> >  if (src->file == VGRF) {
> > -   if (var_range_end(var_from_reg(alloc,
> > dst_reg(*src)), 4) < ip) {
> > +   if (var_range_end(var_from_reg(alloc,
> > dst_reg(*src)), 8) < ip) {
> >    entry->remove();
> >    ralloc_free(entry);
> >    break;
> > diff --git
> > a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> > index 950c6c8..6a80810 100644
> > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> > @@ -57,12 +57,13 @@ vec4_visitor::dead_code_eliminate()
> >   if ((inst->dst.file == VGRF && !inst->has_side_effects()) 
> > ||
> >   (inst->dst.is_null() && inst->writes_flag())){
> >  bool result_live[4] = { false };
> > -
> >  if (inst->dst.file == VGRF) {
> > -   for (unsigned i = 0; i < regs_written(inst); i++) {
> > -  for (int c = 0; c < 4; c++)
> > - result_live[c] |= BITSET_TEST(
> > -live, var_from_reg(alloc, offset(inst-
> > >dst, i), c));
> > +   for (unsigned i = 0; i < 2 * regs_written(inst);
> > i++) {
> 
> One of the issues we discussed in the past about this approach is
> that
> it would overestimate the number of register OWORDs accessed by
> instructions with size_written < REG_SIZE (or size_read(i) <
> REG_SIZE),
> which will be emitted by the SIMD lowering pass.  Now that the amount
> of
> data read and written by instructions is represented in byte units
> you
> can avoid this problem by using DIV_ROUND_UP(inst->size_written, 16)
> instead of 2 * regs_written(inst) above.
> 
> > +  for (int c = 0; c < 4; c++) {
> > + const unsigned v =
> > +var_from_reg(alloc, inst->dst, c, i);
> > + result_live[c] |= BITSET_TEST(live, v);
> > +  }
> > }
> >  } else {
> > for (unsigned c = 0; c < 4; c++)
> > @@ -111,11 +112,12 @@ vec4_visitor::dead_code_eliminate()
> >  
> >   if (inst->dst.file == VGRF && !inst->predicate &&
> >   

Re: [Mesa-dev] [PATCH v2 043/103] i965/vec4: handle 32 and 64 bit channels in liveness analysis

2016-12-19 Thread Francisco Jerez
Iago Toral Quiroga  writes:

> From: "Juan A. Suarez Romero" 
>
> Our current data flow analysis does not take into account that channels
> on 64-bit operands are 64-bit. This is a problem when the same register
> is accessed using both 64-bit and 32-bit channels. This is very common
> in operations where we need to access 64-bit data in 32-bit chunks,
> such as the double packing and packing operations.
>
> This patch changes the analysis by checking the bits that each source
> or destination datatype needs. Actually, rather than bits, we use
> blocks of 32bits, which is the minimum channel size.
>
> Because a vgrf can contain a dvec4 (256 bits), we reserve 8
> 32-bit blocks to map the channels.
>
> v2 (Curro):
>   - Simplify code by making the var_from_reg helpers take an extra
> argument with the register component we want.
>   - Fix a couple of cases where we had to update the code to the new
> way of representing live variables.
> ---
>  src/mesa/drivers/dri/i965/brw_vec4.cpp |  2 +-
>  src/mesa/drivers/dri/i965/brw_vec4_cse.cpp |  2 +-
>  .../dri/i965/brw_vec4_dead_code_eliminate.cpp  | 25 +
>  .../drivers/dri/i965/brw_vec4_live_variables.cpp   | 32 
> +++---
>  .../drivers/dri/i965/brw_vec4_live_variables.h | 15 ++
>  5 files changed, 42 insertions(+), 34 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index 3191eab..34cab04 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -1140,7 +1140,7 @@ vec4_visitor::opt_register_coalesce()
>/* Can't coalesce this GRF if someone else was going to
> * read it later.
> */
> -  if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
> +  if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
>continue;
>  
>/* We need to check interference with the final destination between 
> this
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> index 1b91db9..bef897a 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> @@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
>   * more -- a sure sign they'll fail operands_match().
>   */
>  if (src->file == VGRF) {
> -   if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 4) < 
> ip) {
> +   if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < 
> ip) {
>entry->remove();
>ralloc_free(entry);
>break;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> index 950c6c8..6a80810 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> @@ -57,12 +57,13 @@ vec4_visitor::dead_code_eliminate()
>   if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
>   (inst->dst.is_null() && inst->writes_flag())){
>  bool result_live[4] = { false };
> -
>  if (inst->dst.file == VGRF) {
> -   for (unsigned i = 0; i < regs_written(inst); i++) {
> -  for (int c = 0; c < 4; c++)
> - result_live[c] |= BITSET_TEST(
> -live, var_from_reg(alloc, offset(inst->dst, i), c));
> +   for (unsigned i = 0; i < 2 * regs_written(inst); i++) {

One of the issues we discussed in the past about this approach is that
it would overestimate the number of register OWORDs accessed by
instructions with size_written < REG_SIZE (or size_read(i) < REG_SIZE),
which will be emitted by the SIMD lowering pass.  Now that the amount of
data read and written by instructions is represented in byte units you
can avoid this problem by using DIV_ROUND_UP(inst->size_written, 16)
instead of 2 * regs_written(inst) above.

> +  for (int c = 0; c < 4; c++) {
> + const unsigned v =
> +var_from_reg(alloc, inst->dst, c, i);
> + result_live[c] |= BITSET_TEST(live, v);
> +  }
> }
>  } else {
> for (unsigned c = 0; c < 4; c++)
> @@ -111,11 +112,12 @@ vec4_visitor::dead_code_eliminate()
>  
>   if (inst->dst.file == VGRF && !inst->predicate &&
>   !inst->is_align1_partial_write()) {
> -for (unsigned i = 0; i < regs_written(inst); i++) {
> +for (unsigned i = 0; i < 2 * regs_written(inst); i++) {

Same rounding issue as above.

> for (int c = 0; c < 4; c++) {
>if (inst->dst.writemask & (1 << c)) {
> - 

[Mesa-dev] [PATCH v2 043/103] i965/vec4: handle 32 and 64 bit channels in liveness analysis

2016-10-11 Thread Iago Toral Quiroga
From: "Juan A. Suarez Romero" 

Our current data flow analysis does not take into account that channels
on 64-bit operands are 64-bit. This is a problem when the same register
is accessed using both 64-bit and 32-bit channels. This is very common
in operations where we need to access 64-bit data in 32-bit chunks,
such as the double packing and packing operations.

This patch changes the analysis by checking the bits that each source
or destination datatype needs. Actually, rather than bits, we use
blocks of 32bits, which is the minimum channel size.

Because a vgrf can contain a dvec4 (256 bits), we reserve 8
32-bit blocks to map the channels.

v2 (Curro):
  - Simplify code by making the var_from_reg helpers take an extra
argument with the register component we want.
  - Fix a couple of cases where we had to update the code to the new
way of representing live variables.
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4_cse.cpp |  2 +-
 .../dri/i965/brw_vec4_dead_code_eliminate.cpp  | 25 +
 .../drivers/dri/i965/brw_vec4_live_variables.cpp   | 32 +++---
 .../drivers/dri/i965/brw_vec4_live_variables.h | 15 ++
 5 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 3191eab..34cab04 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1140,7 +1140,7 @@ vec4_visitor::opt_register_coalesce()
   /* Can't coalesce this GRF if someone else was going to
* read it later.
*/
-  if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
+  if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
 continue;
 
   /* We need to check interference with the final destination between this
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 1b91db9..bef897a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
  * more -- a sure sign they'll fail operands_match().
  */
 if (src->file == VGRF) {
-   if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 4) < ip) {
+   if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
   entry->remove();
   ralloc_free(entry);
   break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 950c6c8..6a80810 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -57,12 +57,13 @@ vec4_visitor::dead_code_eliminate()
  if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
  (inst->dst.is_null() && inst->writes_flag())){
 bool result_live[4] = { false };
-
 if (inst->dst.file == VGRF) {
-   for (unsigned i = 0; i < regs_written(inst); i++) {
-  for (int c = 0; c < 4; c++)
- result_live[c] |= BITSET_TEST(
-live, var_from_reg(alloc, offset(inst->dst, i), c));
+   for (unsigned i = 0; i < 2 * regs_written(inst); i++) {
+  for (int c = 0; c < 4; c++) {
+ const unsigned v =
+var_from_reg(alloc, inst->dst, c, i);
+ result_live[c] |= BITSET_TEST(live, v);
+  }
}
 } else {
for (unsigned c = 0; c < 4; c++)
@@ -111,11 +112,12 @@ vec4_visitor::dead_code_eliminate()
 
  if (inst->dst.file == VGRF && !inst->predicate &&
  !inst->is_align1_partial_write()) {
-for (unsigned i = 0; i < regs_written(inst); i++) {
+for (unsigned i = 0; i < 2 * regs_written(inst); i++) {
for (int c = 0; c < 4; c++) {
   if (inst->dst.writemask & (1 << c)) {
- BITSET_CLEAR(live, var_from_reg(alloc,
- offset(inst->dst, i), c));
+ const unsigned v =
+var_from_reg(alloc, inst->dst, c, i);
+ BITSET_CLEAR(live, v);
   }
}
 }
@@ -133,10 +135,11 @@ vec4_visitor::dead_code_eliminate()
 
  for (int i = 0; i < 3; i++) {
 if (inst->src[i].file == VGRF) {
-   for (unsigned j = 0; j < regs_read(inst, i); j++) {
+   for (unsigned j = 0; j < 2 * regs_read(inst, i); j++) {
   for (int c = 0; c < 4; c++) {
- BITSET_SET(live, var_from_reg(alloc,
-