Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-12 Thread Iago Toral
On Wed, 2016-05-11 at 12:05 -0700, Francisco Jerez wrote:
> Iago Toral  writes:
> 
> > On Wed, 2016-05-11 at 12:49 +0200, Iago Toral wrote:
> >> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
> >> > Samuel Iglesias Gonsálvez  writes:
> >> > 
> >> > > From: Iago Toral Quiroga 
> >> > >
> >> > > There are a few places where we need to shuffle the result of a 32-bit 
> >> > > load
> >> > > into valid 64-bit data, so extract this logic into a separate helper 
> >> > > that we
> >> > > can reuse.
> >> > >
> >> > > Also, the shuffling needs to operate with WE_all set, which we were 
> >> > > missing
> >> > > before, because we are changing the layout of the data across the 
> >> > > various
> >> > > channels. Otherwise we will run into problems in non-uniform 
> >> > > control-flow
> >> > > scenarios.
> >> > > ---
> >> > >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> >> > > +---
> >> > >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
> >> > >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
> >> > >  3 files changed, 73 insertions(+), 73 deletions(-)
> >> > >
> >> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> >> > > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> > > index dff13ea..709e4b8 100644
> >> > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> > > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> >> > > fs_builder ,
> >> > >  
> >> > > vec4_result.type = dst.type;
> >> > >  
> >> > > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit 
> >> > > elements. If we
> >> > > -* are reading doubles this means that we get this:
> >> > > -*
> >> > > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> >> > > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> >> > > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> >> > > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> >> > > -*
> >> > > -* Fix this up so we return valid double elements:
> >> > > -*
> >> > > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> >> > > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> >> > > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> >> > > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> >> > > -*/
> >> > > -   if (type_sz(dst.type) == 8) {
> >> > > -  int multiplier = bld.dispatch_width() / 8;
> >> > > -  fs_reg fixed_res =
> >> > > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
> >> > > BRW_REGISTER_TYPE_F);
> >> > > -  /* We only have 2 doubles in a 32-bit vec4 */
> >> > > -  for (int i = 0; i < 2; i++) {
> >> > > - fs_reg vec4_float =
> >> > > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> >> > > - multiplier * 16 * i);
> >> > > -
> >> > > - bld.MOV(stride(fixed_res, 2), vec4_float);
> >> > > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> >> > > - horiz_offset(vec4_float, 8 * multiplier));
> >> > > -
> >> > > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> >> > > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> >> > > -  }
> >> > > -   }
> >> > > +   if (type_sz(dst.type) == 8)
> >> > > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
> >> > > vec4_result, 2);
> >> > >  
> >> > > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> >> > > bld.MOV(dst, offset(vec4_result, bld,
> >> > > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> >> > > fs_builder ,
> >> > >  }
> >> > >  
> >> > >  /**
> >> > > + * This helper takes the result of a load operation that reads 32-bit 
> >> > > elements
> >> > > + * in this format:
> >> > > + *
> >> > > + * x x x x x x x x
> >> > > + * y y y y y y y y
> >> > > + * z z z z z z z z
> >> > > + * w w w w w w w w
> >> > > + *
> >> > > + * and shuffles the data to get this:
> >> > > + *
> >> > > + * x y x y x y x y
> >> > > + * x y x y x y x y
> >> > > + * z w z w z w z w
> >> > > + * z w z w z w z w
> >> > > + *
> >> > > + * Which is exactly what we want if the load is reading 64-bit 
> >> > > components
> >> > > + * like doubles, where x represents the low 32-bit of the x double 
> >> > > component
> >> > > + * and y represents the high 32-bit of the x double component 
> >> > > (likewise with
> >> > > + * z and w for double component y). The parameter @components 
> >> > > represents
> >> > > + * the number of 64-bit components present in @src. This would 
> >> > > typically be
> >> > > + * 2 at most, since we can only fit 2 double elements in the result 
> >> > > of a
> >> > > + * vec4 load.
> >> > > + *
> >> > > + * Notice that @dst and @src can be the same register.
> >> > > + */
> >> > > +void
> >> > > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder 
> >> > > ,
> >> > 
> >> > I don't see any reason to make this an fs_visitor method.  Declare this
> >> > as a static function local to brw_fs_nir.cpp what should improve
> >> > 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Francisco Jerez
Francisco Jerez  writes:

> Iago Toral  writes:
>
>> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
>>> Samuel Iglesias Gonsálvez  writes:
>>> 
>>> > From: Iago Toral Quiroga 
>>> >
>>> > There are a few places where we need to shuffle the result of a 32-bit 
>>> > load
>>> > into valid 64-bit data, so extract this logic into a separate helper that 
>>> > we
>>> > can reuse.
>>> >
>>> > Also, the shuffling needs to operate with WE_all set, which we were 
>>> > missing
>>> > before, because we are changing the layout of the data across the various
>>> > channels. Otherwise we will run into problems in non-uniform control-flow
>>> > scenarios.
>>> > ---
>>> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
>>> > +---
>>> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>>> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>>> >  3 files changed, 73 insertions(+), 73 deletions(-)
>>> >
>>> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>>> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> > index dff13ea..709e4b8 100644
>>> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>>> > fs_builder ,
>>> >  
>>> > vec4_result.type = dst.type;
>>> >  
>>> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. 
>>> > If we
>>> > -* are reading doubles this means that we get this:
>>> > -*
>>> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
>>> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
>>> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
>>> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
>>> > -*
>>> > -* Fix this up so we return valid double elements:
>>> > -*
>>> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
>>> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
>>> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
>>> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
>>> > -*/
>>> > -   if (type_sz(dst.type) == 8) {
>>> > -  int multiplier = bld.dispatch_width() / 8;
>>> > -  fs_reg fixed_res =
>>> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
>>> > BRW_REGISTER_TYPE_F);
>>> > -  /* We only have 2 doubles in a 32-bit vec4 */
>>> > -  for (int i = 0; i < 2; i++) {
>>> > - fs_reg vec4_float =
>>> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
>>> > - multiplier * 16 * i);
>>> > -
>>> > - bld.MOV(stride(fixed_res, 2), vec4_float);
>>> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
>>> > - horiz_offset(vec4_float, 8 * multiplier));
>>> > -
>>> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
>>> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
>>> > -  }
>>> > -   }
>>> > +   if (type_sz(dst.type) == 8)
>>> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
>>> > vec4_result, 2);
>>> >  
>>> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>>> > bld.MOV(dst, offset(vec4_result, bld,
>>> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>>> > fs_builder ,
>>> >  }
>>> >  
>>> >  /**
>>> > + * This helper takes the result of a load operation that reads 32-bit 
>>> > elements
>>> > + * in this format:
>>> > + *
>>> > + * x x x x x x x x
>>> > + * y y y y y y y y
>>> > + * z z z z z z z z
>>> > + * w w w w w w w w
>>> > + *
>>> > + * and shuffles the data to get this:
>>> > + *
>>> > + * x y x y x y x y
>>> > + * x y x y x y x y
>>> > + * z w z w z w z w
>>> > + * z w z w z w z w
>>> > + *
>>> > + * Which is exactly what we want if the load is reading 64-bit components
>>> > + * like doubles, where x represents the low 32-bit of the x double 
>>> > component
>>> > + * and y represents the high 32-bit of the x double component (likewise 
>>> > with
>>> > + * z and w for double component y). The parameter @components represents
>>> > + * the number of 64-bit components present in @src. This would typically 
>>> > be
>>> > + * 2 at most, since we can only fit 2 double elements in the result of a
>>> > + * vec4 load.
>>> > + *
>>> > + * Notice that @dst and @src can be the same register.
>>> > + */
>>> > +void
>>> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder 
>>> > ,
>>> 
>>> I don't see any reason to make this an fs_visitor method.  Declare this
>>> as a static function local to brw_fs_nir.cpp what should improve
>>> encapsulation and reduce the amount of boilerplate.  Also please don't
>>> write it in capitals unless you want people to shout the name of your
>>> function while discussing out loud about it. ;)
>>> 
>>> > +const fs_reg dst,
>>> > +const fs_reg src,
>>> > +uint32_t components)
>>> > +{
>>> > +   int multiplier = 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Francisco Jerez
Iago Toral  writes:

> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
>> Samuel Iglesias Gonsálvez  writes:
>> 
>> > From: Iago Toral Quiroga 
>> >
>> > There are a few places where we need to shuffle the result of a 32-bit load
>> > into valid 64-bit data, so extract this logic into a separate helper that 
>> > we
>> > can reuse.
>> >
>> > Also, the shuffling needs to operate with WE_all set, which we were missing
>> > before, because we are changing the layout of the data across the various
>> > channels. Otherwise we will run into problems in non-uniform control-flow
>> > scenarios.
>> > ---
>> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
>> > +---
>> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>> >  3 files changed, 73 insertions(+), 73 deletions(-)
>> >
>> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > index dff13ea..709e4b8 100644
>> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > fs_builder ,
>> >  
>> > vec4_result.type = dst.type;
>> >  
>> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. 
>> > If we
>> > -* are reading doubles this means that we get this:
>> > -*
>> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
>> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
>> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
>> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
>> > -*
>> > -* Fix this up so we return valid double elements:
>> > -*
>> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
>> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
>> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
>> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
>> > -*/
>> > -   if (type_sz(dst.type) == 8) {
>> > -  int multiplier = bld.dispatch_width() / 8;
>> > -  fs_reg fixed_res =
>> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
>> > BRW_REGISTER_TYPE_F);
>> > -  /* We only have 2 doubles in a 32-bit vec4 */
>> > -  for (int i = 0; i < 2; i++) {
>> > - fs_reg vec4_float =
>> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
>> > - multiplier * 16 * i);
>> > -
>> > - bld.MOV(stride(fixed_res, 2), vec4_float);
>> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
>> > - horiz_offset(vec4_float, 8 * multiplier));
>> > -
>> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
>> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
>> > -  }
>> > -   }
>> > +   if (type_sz(dst.type) == 8)
>> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
>> > vec4_result, 2);
>> >  
>> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>> > bld.MOV(dst, offset(vec4_result, bld,
>> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > fs_builder ,
>> >  }
>> >  
>> >  /**
>> > + * This helper takes the result of a load operation that reads 32-bit 
>> > elements
>> > + * in this format:
>> > + *
>> > + * x x x x x x x x
>> > + * y y y y y y y y
>> > + * z z z z z z z z
>> > + * w w w w w w w w
>> > + *
>> > + * and shuffles the data to get this:
>> > + *
>> > + * x y x y x y x y
>> > + * x y x y x y x y
>> > + * z w z w z w z w
>> > + * z w z w z w z w
>> > + *
>> > + * Which is exactly what we want if the load is reading 64-bit components
>> > + * like doubles, where x represents the low 32-bit of the x double 
>> > component
>> > + * and y represents the high 32-bit of the x double component (likewise 
>> > with
>> > + * z and w for double component y). The parameter @components represents
>> > + * the number of 64-bit components present in @src. This would typically 
>> > be
>> > + * 2 at most, since we can only fit 2 double elements in the result of a
>> > + * vec4 load.
>> > + *
>> > + * Notice that @dst and @src can be the same register.
>> > + */
>> > +void
>> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
>> 
>> I don't see any reason to make this an fs_visitor method.  Declare this
>> as a static function local to brw_fs_nir.cpp what should improve
>> encapsulation and reduce the amount of boilerplate.  Also please don't
>> write it in capitals unless you want people to shout the name of your
>> function while discussing out loud about it. ;)
>> 
>> > +const fs_reg dst,
>> > +const fs_reg src,
>> > +uint32_t components)
>> > +{
>> > +   int multiplier = bld.dispatch_width() / 8;
>> 
>> This definition is redundant with the changes below taken into account.
>> 
>> > +
>> > +   /* A temporary that we will use to shuffle the 32-bit data of each
>> 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Francisco Jerez
Iago Toral  writes:

> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
>> Samuel Iglesias Gonsálvez  writes:
>> 
>> > From: Iago Toral Quiroga 
>> >
>> > There are a few places where we need to shuffle the result of a 32-bit load
>> > into valid 64-bit data, so extract this logic into a separate helper that 
>> > we
>> > can reuse.
>> >
>> > Also, the shuffling needs to operate with WE_all set, which we were missing
>> > before, because we are changing the layout of the data across the various
>> > channels. Otherwise we will run into problems in non-uniform control-flow
>> > scenarios.
>> > ---
>> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
>> > +---
>> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>> >  3 files changed, 73 insertions(+), 73 deletions(-)
>> >
>> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > index dff13ea..709e4b8 100644
>> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > fs_builder ,
>> >  
>> > vec4_result.type = dst.type;
>> >  
>> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. 
>> > If we
>> > -* are reading doubles this means that we get this:
>> > -*
>> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
>> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
>> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
>> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
>> > -*
>> > -* Fix this up so we return valid double elements:
>> > -*
>> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
>> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
>> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
>> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
>> > -*/
>> > -   if (type_sz(dst.type) == 8) {
>> > -  int multiplier = bld.dispatch_width() / 8;
>> > -  fs_reg fixed_res =
>> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
>> > BRW_REGISTER_TYPE_F);
>> > -  /* We only have 2 doubles in a 32-bit vec4 */
>> > -  for (int i = 0; i < 2; i++) {
>> > - fs_reg vec4_float =
>> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
>> > - multiplier * 16 * i);
>> > -
>> > - bld.MOV(stride(fixed_res, 2), vec4_float);
>> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
>> > - horiz_offset(vec4_float, 8 * multiplier));
>> > -
>> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
>> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
>> > -  }
>> > -   }
>> > +   if (type_sz(dst.type) == 8)
>> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
>> > vec4_result, 2);
>> >  
>> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>> > bld.MOV(dst, offset(vec4_result, bld,
>> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > fs_builder ,
>> >  }
>> >  
>> >  /**
>> > + * This helper takes the result of a load operation that reads 32-bit 
>> > elements
>> > + * in this format:
>> > + *
>> > + * x x x x x x x x
>> > + * y y y y y y y y
>> > + * z z z z z z z z
>> > + * w w w w w w w w
>> > + *
>> > + * and shuffles the data to get this:
>> > + *
>> > + * x y x y x y x y
>> > + * x y x y x y x y
>> > + * z w z w z w z w
>> > + * z w z w z w z w
>> > + *
>> > + * Which is exactly what we want if the load is reading 64-bit components
>> > + * like doubles, where x represents the low 32-bit of the x double 
>> > component
>> > + * and y represents the high 32-bit of the x double component (likewise 
>> > with
>> > + * z and w for double component y). The parameter @components represents
>> > + * the number of 64-bit components present in @src. This would typically 
>> > be
>> > + * 2 at most, since we can only fit 2 double elements in the result of a
>> > + * vec4 load.
>> > + *
>> > + * Notice that @dst and @src can be the same register.
>> > + */
>> > +void
>> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
>> 
>> I don't see any reason to make this an fs_visitor method.  Declare this
>> as a static function local to brw_fs_nir.cpp what should improve
>> encapsulation and reduce the amount of boilerplate.  Also please don't
>> write it in capitals unless you want people to shout the name of your
>> function while discussing out loud about it. ;)
>> 
>> > +const fs_reg dst,
>> > +const fs_reg src,
>> > +uint32_t components)
>> > +{
>> > +   int multiplier = bld.dispatch_width() / 8;
>> 
>> This definition is redundant with the changes below taken into account.
>> 
>> > +
>> > +   /* A temporary that we will use to shuffle the 32-bit data of each
>> 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Francisco Jerez
Iago Toral  writes:

> On Wed, 2016-05-11 at 12:49 +0200, Iago Toral wrote:
>> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
>> > Samuel Iglesias Gonsálvez  writes:
>> > 
>> > > From: Iago Toral Quiroga 
>> > >
>> > > There are a few places where we need to shuffle the result of a 32-bit 
>> > > load
>> > > into valid 64-bit data, so extract this logic into a separate helper 
>> > > that we
>> > > can reuse.
>> > >
>> > > Also, the shuffling needs to operate with WE_all set, which we were 
>> > > missing
>> > > before, because we are changing the layout of the data across the various
>> > > channels. Otherwise we will run into problems in non-uniform control-flow
>> > > scenarios.
>> > > ---
>> > >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
>> > > +---
>> > >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>> > >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>> > >  3 files changed, 73 insertions(+), 73 deletions(-)
>> > >
>> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> > > b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > > index dff13ea..709e4b8 100644
>> > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> > > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > > fs_builder ,
>> > >  
>> > > vec4_result.type = dst.type;
>> > >  
>> > > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. 
>> > > If we
>> > > -* are reading doubles this means that we get this:
>> > > -*
>> > > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
>> > > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
>> > > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
>> > > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
>> > > -*
>> > > -* Fix this up so we return valid double elements:
>> > > -*
>> > > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
>> > > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
>> > > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
>> > > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
>> > > -*/
>> > > -   if (type_sz(dst.type) == 8) {
>> > > -  int multiplier = bld.dispatch_width() / 8;
>> > > -  fs_reg fixed_res =
>> > > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
>> > > BRW_REGISTER_TYPE_F);
>> > > -  /* We only have 2 doubles in a 32-bit vec4 */
>> > > -  for (int i = 0; i < 2; i++) {
>> > > - fs_reg vec4_float =
>> > > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
>> > > - multiplier * 16 * i);
>> > > -
>> > > - bld.MOV(stride(fixed_res, 2), vec4_float);
>> > > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
>> > > - horiz_offset(vec4_float, 8 * multiplier));
>> > > -
>> > > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
>> > > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
>> > > -  }
>> > > -   }
>> > > +   if (type_sz(dst.type) == 8)
>> > > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
>> > > vec4_result, 2);
>> > >  
>> > > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>> > > bld.MOV(dst, offset(vec4_result, bld,
>> > > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
>> > > fs_builder ,
>> > >  }
>> > >  
>> > >  /**
>> > > + * This helper takes the result of a load operation that reads 32-bit 
>> > > elements
>> > > + * in this format:
>> > > + *
>> > > + * x x x x x x x x
>> > > + * y y y y y y y y
>> > > + * z z z z z z z z
>> > > + * w w w w w w w w
>> > > + *
>> > > + * and shuffles the data to get this:
>> > > + *
>> > > + * x y x y x y x y
>> > > + * x y x y x y x y
>> > > + * z w z w z w z w
>> > > + * z w z w z w z w
>> > > + *
>> > > + * Which is exactly what we want if the load is reading 64-bit 
>> > > components
>> > > + * like doubles, where x represents the low 32-bit of the x double 
>> > > component
>> > > + * and y represents the high 32-bit of the x double component (likewise 
>> > > with
>> > > + * z and w for double component y). The parameter @components represents
>> > > + * the number of 64-bit components present in @src. This would 
>> > > typically be
>> > > + * 2 at most, since we can only fit 2 double elements in the result of a
>> > > + * vec4 load.
>> > > + *
>> > > + * Notice that @dst and @src can be the same register.
>> > > + */
>> > > +void
>> > > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder 
>> > > ,
>> > 
>> > I don't see any reason to make this an fs_visitor method.  Declare this
>> > as a static function local to brw_fs_nir.cpp what should improve
>> > encapsulation and reduce the amount of boilerplate.  Also please don't
>> > write it in capitals unless you want people to shout the name of your
>> > function while discussing out loud about it. ;)
>> 
>> I know, I saw that we also had VARYING_PULL_CONSTANT_LOAD and figured
>> that maybe that was a style thing for certain helpers in the 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Iago Toral
On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
> Samuel Iglesias Gonsálvez  writes:
> 
> > From: Iago Toral Quiroga 
> >
> > There are a few places where we need to shuffle the result of a 32-bit load
> > into valid 64-bit data, so extract this logic into a separate helper that we
> > can reuse.
> >
> > Also, the shuffling needs to operate with WE_all set, which we were missing
> > before, because we are changing the layout of the data across the various
> > channels. Otherwise we will run into problems in non-uniform control-flow
> > scenarios.
> > ---
> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> > +---
> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
> >  3 files changed, 73 insertions(+), 73 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > index dff13ea..709e4b8 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  
> > vec4_result.type = dst.type;
> >  
> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
> > we
> > -* are reading doubles this means that we get this:
> > -*
> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> > -*
> > -* Fix this up so we return valid double elements:
> > -*
> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> > -*/
> > -   if (type_sz(dst.type) == 8) {
> > -  int multiplier = bld.dispatch_width() / 8;
> > -  fs_reg fixed_res =
> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> > -  /* We only have 2 doubles in a 32-bit vec4 */
> > -  for (int i = 0; i < 2; i++) {
> > - fs_reg vec4_float =
> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> > - multiplier * 16 * i);
> > -
> > - bld.MOV(stride(fixed_res, 2), vec4_float);
> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> > - horiz_offset(vec4_float, 8 * multiplier));
> > -
> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> > -  }
> > -   }
> > +   if (type_sz(dst.type) == 8)
> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
> > vec4_result, 2);
> >  
> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> > bld.MOV(dst, offset(vec4_result, bld,
> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  }
> >  
> >  /**
> > + * This helper takes the result of a load operation that reads 32-bit 
> > elements
> > + * in this format:
> > + *
> > + * x x x x x x x x
> > + * y y y y y y y y
> > + * z z z z z z z z
> > + * w w w w w w w w
> > + *
> > + * and shuffles the data to get this:
> > + *
> > + * x y x y x y x y
> > + * x y x y x y x y
> > + * z w z w z w z w
> > + * z w z w z w z w
> > + *
> > + * Which is exactly what we want if the load is reading 64-bit components
> > + * like doubles, where x represents the low 32-bit of the x double 
> > component
> > + * and y represents the high 32-bit of the x double component (likewise 
> > with
> > + * z and w for double component y). The parameter @components represents
> > + * the number of 64-bit components present in @src. This would typically be
> > + * 2 at most, since we can only fit 2 double elements in the result of a
> > + * vec4 load.
> > + *
> > + * Notice that @dst and @src can be the same register.
> > + */
> > +void
> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
> 
> I don't see any reason to make this an fs_visitor method.  Declare this
> as a static function local to brw_fs_nir.cpp what should improve
> encapsulation and reduce the amount of boilerplate.  Also please don't
> write it in capitals unless you want people to shout the name of your
> function while discussing out loud about it. ;)
> 
> > +const fs_reg dst,
> > +const fs_reg src,
> > +uint32_t components)
> > +{
> > +   int multiplier = bld.dispatch_width() / 8;
> 
> This definition is redundant with the changes below taken into account.
> 
> > +
> > +   /* A temporary that we will use to shuffle the 32-bit data of each
> > +* component in the vector into valid 64-bit data
> > +*/
> > +   fs_reg tmp =
> > +  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> 
> I don't 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Iago Toral
On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
> Samuel Iglesias Gonsálvez  writes:
> 
> > From: Iago Toral Quiroga 
> >
> > There are a few places where we need to shuffle the result of a 32-bit load
> > into valid 64-bit data, so extract this logic into a separate helper that we
> > can reuse.
> >
> > Also, the shuffling needs to operate with WE_all set, which we were missing
> > before, because we are changing the layout of the data across the various
> > channels. Otherwise we will run into problems in non-uniform control-flow
> > scenarios.
> > ---
> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> > +---
> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
> >  3 files changed, 73 insertions(+), 73 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > index dff13ea..709e4b8 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  
> > vec4_result.type = dst.type;
> >  
> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
> > we
> > -* are reading doubles this means that we get this:
> > -*
> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> > -*
> > -* Fix this up so we return valid double elements:
> > -*
> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> > -*/
> > -   if (type_sz(dst.type) == 8) {
> > -  int multiplier = bld.dispatch_width() / 8;
> > -  fs_reg fixed_res =
> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> > -  /* We only have 2 doubles in a 32-bit vec4 */
> > -  for (int i = 0; i < 2; i++) {
> > - fs_reg vec4_float =
> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> > - multiplier * 16 * i);
> > -
> > - bld.MOV(stride(fixed_res, 2), vec4_float);
> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> > - horiz_offset(vec4_float, 8 * multiplier));
> > -
> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> > -  }
> > -   }
> > +   if (type_sz(dst.type) == 8)
> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
> > vec4_result, 2);
> >  
> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> > bld.MOV(dst, offset(vec4_result, bld,
> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  }
> >  
> >  /**
> > + * This helper takes the result of a load operation that reads 32-bit 
> > elements
> > + * in this format:
> > + *
> > + * x x x x x x x x
> > + * y y y y y y y y
> > + * z z z z z z z z
> > + * w w w w w w w w
> > + *
> > + * and shuffles the data to get this:
> > + *
> > + * x y x y x y x y
> > + * x y x y x y x y
> > + * z w z w z w z w
> > + * z w z w z w z w
> > + *
> > + * Which is exactly what we want if the load is reading 64-bit components
> > + * like doubles, where x represents the low 32-bit of the x double 
> > component
> > + * and y represents the high 32-bit of the x double component (likewise 
> > with
> > + * z and w for double component y). The parameter @components represents
> > + * the number of 64-bit components present in @src. This would typically be
> > + * 2 at most, since we can only fit 2 double elements in the result of a
> > + * vec4 load.
> > + *
> > + * Notice that @dst and @src can be the same register.
> > + */
> > +void
> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
> 
> I don't see any reason to make this an fs_visitor method.  Declare this
> as a static function local to brw_fs_nir.cpp what should improve
> encapsulation and reduce the amount of boilerplate.  Also please don't
> write it in capitals unless you want people to shout the name of your
> function while discussing out loud about it. ;)
> 
> > +const fs_reg dst,
> > +const fs_reg src,
> > +uint32_t components)
> > +{
> > +   int multiplier = bld.dispatch_width() / 8;
> 
> This definition is redundant with the changes below taken into account.
> 
> > +
> > +   /* A temporary that we will use to shuffle the 32-bit data of each
> > +* component in the vector into valid 64-bit data
> > +*/
> > +   fs_reg tmp =
> > +  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> 
> I don't 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Iago Toral
On Wed, 2016-05-11 at 12:49 +0200, Iago Toral wrote:
> On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
> > Samuel Iglesias Gonsálvez  writes:
> > 
> > > From: Iago Toral Quiroga 
> > >
> > > There are a few places where we need to shuffle the result of a 32-bit 
> > > load
> > > into valid 64-bit data, so extract this logic into a separate helper that 
> > > we
> > > can reuse.
> > >
> > > Also, the shuffling needs to operate with WE_all set, which we were 
> > > missing
> > > before, because we are changing the layout of the data across the various
> > > channels. Otherwise we will run into problems in non-uniform control-flow
> > > scenarios.
> > > ---
> > >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> > > +---
> > >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
> > >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
> > >  3 files changed, 73 insertions(+), 73 deletions(-)
> > >
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> > > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > index dff13ea..709e4b8 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > > fs_builder ,
> > >  
> > > vec4_result.type = dst.type;
> > >  
> > > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. 
> > > If we
> > > -* are reading doubles this means that we get this:
> > > -*
> > > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> > > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> > > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> > > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> > > -*
> > > -* Fix this up so we return valid double elements:
> > > -*
> > > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> > > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> > > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> > > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> > > -*/
> > > -   if (type_sz(dst.type) == 8) {
> > > -  int multiplier = bld.dispatch_width() / 8;
> > > -  fs_reg fixed_res =
> > > - fs_reg(VGRF, alloc.allocate(2 * multiplier), 
> > > BRW_REGISTER_TYPE_F);
> > > -  /* We only have 2 doubles in a 32-bit vec4 */
> > > -  for (int i = 0; i < 2; i++) {
> > > - fs_reg vec4_float =
> > > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> > > - multiplier * 16 * i);
> > > -
> > > - bld.MOV(stride(fixed_res, 2), vec4_float);
> > > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> > > - horiz_offset(vec4_float, 8 * multiplier));
> > > -
> > > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> > > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> > > -  }
> > > -   }
> > > +   if (type_sz(dst.type) == 8)
> > > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
> > > vec4_result, 2);
> > >  
> > > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> > > bld.MOV(dst, offset(vec4_result, bld,
> > > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > > fs_builder ,
> > >  }
> > >  
> > >  /**
> > > + * This helper takes the result of a load operation that reads 32-bit 
> > > elements
> > > + * in this format:
> > > + *
> > > + * x x x x x x x x
> > > + * y y y y y y y y
> > > + * z z z z z z z z
> > > + * w w w w w w w w
> > > + *
> > > + * and shuffles the data to get this:
> > > + *
> > > + * x y x y x y x y
> > > + * x y x y x y x y
> > > + * z w z w z w z w
> > > + * z w z w z w z w
> > > + *
> > > + * Which is exactly what we want if the load is reading 64-bit components
> > > + * like doubles, where x represents the low 32-bit of the x double 
> > > component
> > > + * and y represents the high 32-bit of the x double component (likewise 
> > > with
> > > + * z and w for double component y). The parameter @components represents
> > > + * the number of 64-bit components present in @src. This would typically 
> > > be
> > > + * 2 at most, since we can only fit 2 double elements in the result of a
> > > + * vec4 load.
> > > + *
> > > + * Notice that @dst and @src can be the same register.
> > > + */
> > > +void
> > > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder 
> > > ,
> > 
> > I don't see any reason to make this an fs_visitor method.  Declare this
> > as a static function local to brw_fs_nir.cpp what should improve
> > encapsulation and reduce the amount of boilerplate.  Also please don't
> > write it in capitals unless you want people to shout the name of your
> > function while discussing out loud about it. ;)
> 
> I know, I saw that we also had VARYING_PULL_CONSTANT_LOAD and figured
> that maybe that was a style thing for certain helpers in the fs_visitor.
> I'll make it lower case.
> 
> Also, I think I made it an fs_visitor method so I could use alloc to
> create the temporary vgrf, and I did that because I wanted to 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-11 Thread Iago Toral
On Tue, 2016-05-10 at 19:10 -0700, Francisco Jerez wrote:
> Samuel Iglesias Gonsálvez  writes:
> 
> > From: Iago Toral Quiroga 
> >
> > There are a few places where we need to shuffle the result of a 32-bit load
> > into valid 64-bit data, so extract this logic into a separate helper that we
> > can reuse.
> >
> > Also, the shuffling needs to operate with WE_all set, which we were missing
> > before, because we are changing the layout of the data across the various
> > channels. Otherwise we will run into problems in non-uniform control-flow
> > scenarios.
> > ---
> >  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> > +---
> >  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
> >  3 files changed, 73 insertions(+), 73 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > index dff13ea..709e4b8 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  
> > vec4_result.type = dst.type;
> >  
> > -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
> > we
> > -* are reading doubles this means that we get this:
> > -*
> > -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> > -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> > -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> > -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> > -*
> > -* Fix this up so we return valid double elements:
> > -*
> > -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> > -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> > -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> > -*/
> > -   if (type_sz(dst.type) == 8) {
> > -  int multiplier = bld.dispatch_width() / 8;
> > -  fs_reg fixed_res =
> > - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> > -  /* We only have 2 doubles in a 32-bit vec4 */
> > -  for (int i = 0; i < 2; i++) {
> > - fs_reg vec4_float =
> > -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> > - multiplier * 16 * i);
> > -
> > - bld.MOV(stride(fixed_res, 2), vec4_float);
> > - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> > - horiz_offset(vec4_float, 8 * multiplier));
> > -
> > - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> > - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> > -  }
> > -   }
> > +   if (type_sz(dst.type) == 8)
> > +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
> > vec4_result, 2);
> >  
> > int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> > bld.MOV(dst, offset(vec4_result, bld,
> > @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const 
> > fs_builder ,
> >  }
> >  
> >  /**
> > + * This helper takes the result of a load operation that reads 32-bit 
> > elements
> > + * in this format:
> > + *
> > + * x x x x x x x x
> > + * y y y y y y y y
> > + * z z z z z z z z
> > + * w w w w w w w w
> > + *
> > + * and shuffles the data to get this:
> > + *
> > + * x y x y x y x y
> > + * x y x y x y x y
> > + * z w z w z w z w
> > + * z w z w z w z w
> > + *
> > + * Which is exactly what we want if the load is reading 64-bit components
> > + * like doubles, where x represents the low 32-bit of the x double 
> > component
> > + * and y represents the high 32-bit of the x double component (likewise 
> > with
> > + * z and w for double component y). The parameter @components represents
> > + * the number of 64-bit components present in @src. This would typically be
> > + * 2 at most, since we can only fit 2 double elements in the result of a
> > + * vec4 load.
> > + *
> > + * Notice that @dst and @src can be the same register.
> > + */
> > +void
> > +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
> 
> I don't see any reason to make this an fs_visitor method.  Declare this
> as a static function local to brw_fs_nir.cpp what should improve
> encapsulation and reduce the amount of boilerplate.  Also please don't
> write it in capitals unless you want people to shout the name of your
> function while discussing out loud about it. ;)

I know, I saw that we also had VARYING_PULL_CONSTANT_LOAD and figured
that maybe that was a style thing for certain helpers in the fs_visitor.
I'll make it lower case.

Also, I think I made it an fs_visitor method so I could use alloc to
create the temporary vgrf, and I did that because I wanted to make sure
that we always did the WE_all writes to a temporary to avoid problems
with non-uniform control-flow. However, now that we use subscript I
suppose we can skip all that. I'll do the changes and check again that
everything keeps working fine.

> > +const fs_reg dst,
> > +  

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-10 Thread Francisco Jerez
Francisco Jerez  writes:

> Samuel Iglesias Gonsálvez  writes:
>
>> From: Iago Toral Quiroga 
>>
>> There are a few places where we need to shuffle the result of a 32-bit load
>> into valid 64-bit data, so extract this logic into a separate helper that we
>> can reuse.
>>
>> Also, the shuffling needs to operate with WE_all set, which we were missing
>> before, because we are changing the layout of the data across the various
>> channels. Otherwise we will run into problems in non-uniform control-flow
>> scenarios.
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
>> +---
>>  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>>  3 files changed, 73 insertions(+), 73 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index dff13ea..709e4b8 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
>> ,
>>  
>> vec4_result.type = dst.type;
>>  
>> -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
>> we
>> -* are reading doubles this means that we get this:
>> -*
>> -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
>> -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
>> -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
>> -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
>> -*
>> -* Fix this up so we return valid double elements:
>> -*
>> -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
>> -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
>> -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
>> -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
>> -*/
>> -   if (type_sz(dst.type) == 8) {
>> -  int multiplier = bld.dispatch_width() / 8;
>> -  fs_reg fixed_res =
>> - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
>> -  /* We only have 2 doubles in a 32-bit vec4 */
>> -  for (int i = 0; i < 2; i++) {
>> - fs_reg vec4_float =
>> -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
>> - multiplier * 16 * i);
>> -
>> - bld.MOV(stride(fixed_res, 2), vec4_float);
>> - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
>> - horiz_offset(vec4_float, 8 * multiplier));
>> -
>> - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
>> - retype(fixed_res, BRW_REGISTER_TYPE_DF));
>> -  }
>> -   }
>> +   if (type_sz(dst.type) == 8)
>> +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
>> vec4_result, 2);
>>  
>> int type_slots = MAX2(type_sz(dst.type) / 4, 1);
>> bld.MOV(dst, offset(vec4_result, bld,
>> @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
>> ,
>>  }
>>  
>>  /**
>> + * This helper takes the result of a load operation that reads 32-bit 
>> elements
>> + * in this format:
>> + *
>> + * x x x x x x x x
>> + * y y y y y y y y
>> + * z z z z z z z z
>> + * w w w w w w w w
>> + *
>> + * and shuffles the data to get this:
>> + *
>> + * x y x y x y x y
>> + * x y x y x y x y
>> + * z w z w z w z w
>> + * z w z w z w z w
>> + *
>> + * Which is exactly what we want if the load is reading 64-bit components
>> + * like doubles, where x represents the low 32-bit of the x double component
>> + * and y represents the high 32-bit of the x double component (likewise with
>> + * z and w for double component y). The parameter @components represents
>> + * the number of 64-bit components present in @src. This would typically be
>> + * 2 at most, since we can only fit 2 double elements in the result of a
>> + * vec4 load.
>> + *
>> + * Notice that @dst and @src can be the same register.
>> + */
>> +void
>> +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
>
> I don't see any reason to make this an fs_visitor method.  Declare this
> as a static function local to brw_fs_nir.cpp what should improve
> encapsulation and reduce the amount of boilerplate.  Also please don't
> write it in capitals unless you want people to shout the name of your
> function while discussing out loud about it. ;)
>
>> +const fs_reg dst,
>> +const fs_reg src,
>> +uint32_t components)
>> +{
>> +   int multiplier = bld.dispatch_width() / 8;
>
> This definition is redundant with the changes below taken into account.
>
>> +
>> +   /* A temporary that we will use to shuffle the 32-bit data of each
>> +* component in the vector into valid 64-bit data
>> +*/
>> +   fs_reg tmp =
>> +  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
>
> I don't think there is any reason to do this inside a temporary instead
> of writing into the destination register directly.
>
>> +
>> +   /* We 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-10 Thread Francisco Jerez
Samuel Iglesias Gonsálvez  writes:

> From: Iago Toral Quiroga 
>
> There are a few places where we need to shuffle the result of a 32-bit load
> into valid 64-bit data, so extract this logic into a separate helper that we
> can reuse.
>
> Also, the shuffling needs to operate with WE_all set, which we were missing
> before, because we are changing the layout of the data across the various
> channels. Otherwise we will run into problems in non-uniform control-flow
> scenarios.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
> +---
>  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>  3 files changed, 73 insertions(+), 73 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index dff13ea..709e4b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
> ,
>  
> vec4_result.type = dst.type;
>  
> -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If we
> -* are reading doubles this means that we get this:
> -*
> -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> -*
> -* Fix this up so we return valid double elements:
> -*
> -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> -*/
> -   if (type_sz(dst.type) == 8) {
> -  int multiplier = bld.dispatch_width() / 8;
> -  fs_reg fixed_res =
> - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> -  /* We only have 2 doubles in a 32-bit vec4 */
> -  for (int i = 0; i < 2; i++) {
> - fs_reg vec4_float =
> -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> - multiplier * 16 * i);
> -
> - bld.MOV(stride(fixed_res, 2), vec4_float);
> - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> - horiz_offset(vec4_float, 8 * multiplier));
> -
> - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> -  }
> -   }
> +   if (type_sz(dst.type) == 8)
> +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, vec4_result, 
> 2);
>  
> int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> bld.MOV(dst, offset(vec4_result, bld,
> @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
> ,
>  }
>  
>  /**
> + * This helper takes the result of a load operation that reads 32-bit 
> elements
> + * in this format:
> + *
> + * x x x x x x x x
> + * y y y y y y y y
> + * z z z z z z z z
> + * w w w w w w w w
> + *
> + * and shuffles the data to get this:
> + *
> + * x y x y x y x y
> + * x y x y x y x y
> + * z w z w z w z w
> + * z w z w z w z w
> + *
> + * Which is exactly what we want if the load is reading 64-bit components
> + * like doubles, where x represents the low 32-bit of the x double component
> + * and y represents the high 32-bit of the x double component (likewise with
> + * z and w for double component y). The parameter @components represents
> + * the number of 64-bit components present in @src. This would typically be
> + * 2 at most, since we can only fit 2 double elements in the result of a
> + * vec4 load.
> + *
> + * Notice that @dst and @src can be the same register.
> + */
> +void
> +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,

I don't see any reason to make this an fs_visitor method.  Declare this
as a static function local to brw_fs_nir.cpp what should improve
encapsulation and reduce the amount of boilerplate.  Also please don't
write it in capitals unless you want people to shout the name of your
function while discussing out loud about it. ;)

> +const fs_reg dst,
> +const fs_reg src,
> +uint32_t components)
> +{
> +   int multiplier = bld.dispatch_width() / 8;

This definition is redundant with the changes below taken into account.

> +
> +   /* A temporary that we will use to shuffle the 32-bit data of each
> +* component in the vector into valid 64-bit data
> +*/
> +   fs_reg tmp =
> +  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);

I don't think there is any reason to do this inside a temporary instead
of writing into the destination register directly.

> +
> +   /* We are going to manipulate the data in elements of 32-bit */
> +   fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F);
> +
> +   /* We are going to manipulate the dst in elements of 64-bit */
> 

Re: [Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-10 Thread Kenneth Graunke
On Tuesday, May 3, 2016 2:22:04 PM PDT Samuel Iglesias Gonsálvez wrote:
> From: Iago Toral Quiroga 
> 
> There are a few places where we need to shuffle the result of a 32-bit load
> into valid 64-bit data, so extract this logic into a separate helper that we
> can reuse.
> 
> Also, the shuffling needs to operate with WE_all set, which we were missing
> before, because we are changing the layout of the data across the various
> channels. Otherwise we will run into problems in non-uniform control-flow
> scenarios.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 95 
+---
>  src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
>  3 files changed, 73 insertions(+), 73 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/
i965/brw_fs.cpp
> index dff13ea..709e4b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
>  
> vec4_result.type = dst.type;
>  
> -   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If 
we
> -* are reading doubles this means that we get this:
> -*
> -*  r0: x0 x0 x0 x0 x0 x0 x0 x0
> -*  r1: x1 x1 x1 x1 x1 x1 x1 x1
> -*  r2: y0 y0 y0 y0 y0 y0 y0 y0
> -*  r3: y1 y1 y1 y1 y1 y1 y1 y1
> -*
> -* Fix this up so we return valid double elements:
> -*
> -*  r0: x0 x1 x0 x1 x0 x1 x0 x1
> -*  r1: x0 x1 x0 x1 x0 x1 x0 x1
> -*  r2: y0 y1 y0 y1 y0 y1 y0 y1
> -*  r3: y0 y1 y0 y1 y0 y1 y0 y1
> -*/
> -   if (type_sz(dst.type) == 8) {
> -  int multiplier = bld.dispatch_width() / 8;
> -  fs_reg fixed_res =
> - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> -  /* We only have 2 doubles in a 32-bit vec4 */
> -  for (int i = 0; i < 2; i++) {
> - fs_reg vec4_float =
> -horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
> - multiplier * 16 * i);
> -
> - bld.MOV(stride(fixed_res, 2), vec4_float);
> - bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
> - horiz_offset(vec4_float, 8 * multiplier));
> -
> - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
> - retype(fixed_res, BRW_REGISTER_TYPE_DF));
> -  }
> -   }
> +   if (type_sz(dst.type) == 8)
> +  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, 
vec4_result, 2);
>  
> int type_slots = MAX2(type_sz(dst.type) / 4, 1);
> bld.MOV(dst, offset(vec4_result, bld,
> @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
>  }
>  
>  /**
> + * This helper takes the result of a load operation that reads 32-bit 
elements
> + * in this format:
> + *
> + * x x x x x x x x
> + * y y y y y y y y
> + * z z z z z z z z
> + * w w w w w w w w
> + *
> + * and shuffles the data to get this:
> + *
> + * x y x y x y x y
> + * x y x y x y x y
> + * z w z w z w z w
> + * z w z w z w z w
> + *
> + * Which is exactly what we want if the load is reading 64-bit components
> + * like doubles, where x represents the low 32-bit of the x double 
component
> + * and y represents the high 32-bit of the x double component (likewise 
with
> + * z and w for double component y). The parameter @components represents
> + * the number of 64-bit components present in @src. This would typically be
> + * 2 at most, since we can only fit 2 double elements in the result of a
> + * vec4 load.
> + *
> + * Notice that @dst and @src can be the same register.
> + */
> +void
> +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
> +const fs_reg dst,
> +const fs_reg src,
> +uint32_t components)
> +{
> +   int multiplier = bld.dispatch_width() / 8;
> +
> +   /* A temporary that we will use to shuffle the 32-bit data of each
> +* component in the vector into valid 64-bit data
> +*/
> +   fs_reg tmp =
> +  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
> +
> +   /* We are going to manipulate the data in elements of 32-bit */
> +   fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F);
> +
> +   /* We are going to manipulate the dst in elements of 64-bit */
> +   fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF);

How about:

const fs_builder nomask_bld = bld.exec_all();

...

nomask_bld.MOV(...);

instead of bld.MOV(...)->force_writemask_all = true?


> +
> +   /* Shuffle the data */
> +   for (unsigned i = 0; i < components; i++) {
> +  fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i);
> +
> +  bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true;
> +  bld.MOV(stride(horiz_offset(tmp, 1), 2),
> +  

[Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper

2016-05-03 Thread Samuel Iglesias Gonsálvez
From: Iago Toral Quiroga 

There are a few places where we need to shuffle the result of a 32-bit load
into valid 64-bit data, so extract this logic into a separate helper that we
can reuse.

Also, the shuffling needs to operate with WE_all set, which we were missing
before, because we are changing the layout of the data across the various
channels. Otherwise we will run into problems in non-uniform control-flow
scenarios.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 95 +---
 src/mesa/drivers/dri/i965/brw_fs.h   |  5 ++
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--
 3 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index dff13ea..709e4b8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 
vec4_result.type = dst.type;
 
-   /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If we
-* are reading doubles this means that we get this:
-*
-*  r0: x0 x0 x0 x0 x0 x0 x0 x0
-*  r1: x1 x1 x1 x1 x1 x1 x1 x1
-*  r2: y0 y0 y0 y0 y0 y0 y0 y0
-*  r3: y1 y1 y1 y1 y1 y1 y1 y1
-*
-* Fix this up so we return valid double elements:
-*
-*  r0: x0 x1 x0 x1 x0 x1 x0 x1
-*  r1: x0 x1 x0 x1 x0 x1 x0 x1
-*  r2: y0 y1 y0 y1 y0 y1 y0 y1
-*  r3: y0 y1 y0 y1 y0 y1 y0 y1
-*/
-   if (type_sz(dst.type) == 8) {
-  int multiplier = bld.dispatch_width() / 8;
-  fs_reg fixed_res =
- fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
-  /* We only have 2 doubles in a 32-bit vec4 */
-  for (int i = 0; i < 2; i++) {
- fs_reg vec4_float =
-horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
- multiplier * 16 * i);
-
- bld.MOV(stride(fixed_res, 2), vec4_float);
- bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
- horiz_offset(vec4_float, 8 * multiplier));
-
- bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
- retype(fixed_res, BRW_REGISTER_TYPE_DF));
-  }
-   }
+   if (type_sz(dst.type) == 8)
+  SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, vec4_result, 
2);
 
int type_slots = MAX2(type_sz(dst.type) / 4, 1);
bld.MOV(dst, offset(vec4_result, bld,
@@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 }
 
 /**
+ * This helper takes the result of a load operation that reads 32-bit elements
+ * in this format:
+ *
+ * x x x x x x x x
+ * y y y y y y y y
+ * z z z z z z z z
+ * w w w w w w w w
+ *
+ * and shuffles the data to get this:
+ *
+ * x y x y x y x y
+ * x y x y x y x y
+ * z w z w z w z w
+ * z w z w z w z w
+ *
+ * Which is exactly what we want if the load is reading 64-bit components
+ * like doubles, where x represents the low 32-bit of the x double component
+ * and y represents the high 32-bit of the x double component (likewise with
+ * z and w for double component y). The parameter @components represents
+ * the number of 64-bit components present in @src. This would typically be
+ * 2 at most, since we can only fit 2 double elements in the result of a
+ * vec4 load.
+ *
+ * Notice that @dst and @src can be the same register.
+ */
+void
+fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder ,
+const fs_reg dst,
+const fs_reg src,
+uint32_t components)
+{
+   int multiplier = bld.dispatch_width() / 8;
+
+   /* A temporary that we will use to shuffle the 32-bit data of each
+* component in the vector into valid 64-bit data
+*/
+   fs_reg tmp =
+  fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
+
+   /* We are going to manipulate the data in elements of 32-bit */
+   fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F);
+
+   /* We are going to manipulate the dst in elements of 64-bit */
+   fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF);
+
+   /* Shuffle the data */
+   for (unsigned i = 0; i < components; i++) {
+  fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i);
+
+  bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true;
+  bld.MOV(stride(horiz_offset(tmp, 1), 2),
+  horiz_offset(component_i, 8 * multiplier))
+ ->force_writemask_all = true;
+
+  bld.MOV(horiz_offset(dst_data, multiplier * 8 * i),
+  retype(tmp, BRW_REGISTER_TYPE_DF))->force_writemask_all = true;
+   }
+}
+
+/**
  * A helper for MOV generation for fixing up broken hardware SEND dependency
  * handling.
  */
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 1f18112..c95b30a 100644
---