Re: [Mesa-dev] [PATCH v2 009/103] i965/vec4: add double/float conversion pseudo-opcodes

2016-10-18 Thread Ian Romanick
Based on my (fairly weak) understanding of vstrides, this patch is

Reviewed-by: Ian Romanick 

On 10/11/2016 02:01 AM, Iago Toral Quiroga wrote:
> These need to be emitted as align1 MOV's, since they need to have a
> stride of 2 on the float register (whether src or dest) so that data
> from another thread doesn't cross the middle of a SIMD8 register.
> 
> v2 (Iago):
> - The float-to-double needs to align 32-bit data to 64-bit before doing the
> conversion. This was doable in align16 when we tried to use an execsize
> of 4, but with an execsize of 8 we would need another align1 opcode to do
> that (since we need data to cross the middle of a SIMD register). Just
> making the opcode handle this internally seems more practical that adding
> another opcode just for this purpose and having the caller know about this
> before converting.
> - The double-to-float conversion produces 32-bit elements aligned to 64-bit
> so we make the opcode re-pack the result to 32-bit and fit in one register,
> as expected by SIMD4x2 operation. This still requires that callers reserve
> two registers for the float data destination because we need to produce
> 64-bit aligned data first, and repack it later on the same destination
> register, but it saves the need for a re-pack opcode only to achieve this
> making the operation complete in a single opcode. Hopefully that is worth
> the weirdness of the double register allocation...
> 
> Signed-off-by: Connor Abbott 
> Signed-off-by: Iago Toral Quiroga 
> ---
>  src/mesa/drivers/dri/i965/brw_defines.h  |  2 ++
>  src/mesa/drivers/dri/i965/brw_shader.cpp |  4 +++
>  src/mesa/drivers/dri/i965/brw_vec4.cpp   |  8 +
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 44 
> 
>  4 files changed, 58 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
> b/src/mesa/drivers/dri/i965/brw_defines.h
> index c4e0f27..79b96a4 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1098,6 +1098,8 @@ enum opcode {
> VEC4_OPCODE_MOV_BYTES,
> VEC4_OPCODE_PACK_BYTES,
> VEC4_OPCODE_UNPACK_UNIFORM,
> +   VEC4_OPCODE_DOUBLE_TO_FLOAT,
> +   VEC4_OPCODE_FLOAT_TO_DOUBLE,
>  
> FS_OPCODE_DDX_COARSE,
> FS_OPCODE_DDX_FINE,
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
> b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index ed81563..b063f77 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -317,6 +317,10 @@ brw_instruction_name(const struct gen_device_info 
> *devinfo, enum opcode op)
>return "pack_bytes";
> case VEC4_OPCODE_UNPACK_UNIFORM:
>return "unpack_uniform";
> +   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
> +  return "double_to_float";
> +   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
> +  return "float_to_double";
>  
> case FS_OPCODE_DDX_COARSE:
>return "ddx_coarse";
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index c29cfb5..32c04b2 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -253,6 +253,8 @@ vec4_instruction::can_do_writemask(const struct 
> gen_device_info *devinfo)
>  {
> switch (opcode) {
> case SHADER_OPCODE_GEN4_SCRATCH_READ:
> +   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
> +   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
> case VS_OPCODE_PULL_CONSTANT_LOAD:
> case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
> case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
> @@ -505,6 +507,12 @@ vec4_visitor::opt_reduce_swizzle()
>case BRW_OPCODE_DP2:
>   swizzle = brw_swizzle_for_size(2);
>   break;
> +
> +  case VEC4_OPCODE_FLOAT_TO_DOUBLE:
> +  case VEC4_OPCODE_DOUBLE_TO_FLOAT:
> + swizzle = brw_swizzle_for_size(4);
> + break;
> +
>default:
>   swizzle = brw_swizzle_for_mask(inst->dst.writemask);
>   break;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index 163cf9d..6f4c438 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -1896,6 +1896,50 @@ generate_code(struct brw_codegen *p,
>   break;
>}
>  
> +  case VEC4_OPCODE_DOUBLE_TO_FLOAT: {
> + assert(src[0].type == BRW_REGISTER_TYPE_DF);
> + assert(dst.type == BRW_REGISTER_TYPE_F);
> +
> + brw_set_default_access_mode(p, BRW_ALIGN_1);
> +
> + dst.hstride = BRW_HORIZONTAL_STRIDE_2;
> + dst.width = BRW_WIDTH_4;
> + src[0].vstride = BRW_VERTICAL_STRIDE_4;
> + src[0].width = BRW_WIDTH_4;
> + brw_MOV(p, dst, src[0]);
> +
> + struct brw_reg dst_as_src = dst;
> + dst.hstride = BRW_HORIZONTAL_STRIDE_1;
> + dst.width = BRW_WIDTH_8;
> +  

[Mesa-dev] [PATCH v2 009/103] i965/vec4: add double/float conversion pseudo-opcodes

2016-10-11 Thread Iago Toral Quiroga
These need to be emitted as align1 MOV's, since they need to have a
stride of 2 on the float register (whether src or dest) so that data
from another thread doesn't cross the middle of a SIMD8 register.

v2 (Iago):
- The float-to-double needs to align 32-bit data to 64-bit before doing the
conversion. This was doable in align16 when we tried to use an execsize
of 4, but with an execsize of 8 we would need another align1 opcode to do
that (since we need data to cross the middle of a SIMD register). Just
making the opcode handle this internally seems more practical that adding
another opcode just for this purpose and having the caller know about this
before converting.
- The double-to-float conversion produces 32-bit elements aligned to 64-bit
so we make the opcode re-pack the result to 32-bit and fit in one register,
as expected by SIMD4x2 operation. This still requires that callers reserve
two registers for the float data destination because we need to produce
64-bit aligned data first, and repack it later on the same destination
register, but it saves the need for a re-pack opcode only to achieve this
making the operation complete in a single opcode. Hopefully that is worth
the weirdness of the double register allocation...

Signed-off-by: Connor Abbott 
Signed-off-by: Iago Toral Quiroga 
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  2 ++
 src/mesa/drivers/dri/i965/brw_shader.cpp |  4 +++
 src/mesa/drivers/dri/i965/brw_vec4.cpp   |  8 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 44 
 4 files changed, 58 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index c4e0f27..79b96a4 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1098,6 +1098,8 @@ enum opcode {
VEC4_OPCODE_MOV_BYTES,
VEC4_OPCODE_PACK_BYTES,
VEC4_OPCODE_UNPACK_UNIFORM,
+   VEC4_OPCODE_DOUBLE_TO_FLOAT,
+   VEC4_OPCODE_FLOAT_TO_DOUBLE,
 
FS_OPCODE_DDX_COARSE,
FS_OPCODE_DDX_FINE,
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index ed81563..b063f77 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -317,6 +317,10 @@ brw_instruction_name(const struct gen_device_info 
*devinfo, enum opcode op)
   return "pack_bytes";
case VEC4_OPCODE_UNPACK_UNIFORM:
   return "unpack_uniform";
+   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+  return "double_to_float";
+   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+  return "float_to_double";
 
case FS_OPCODE_DDX_COARSE:
   return "ddx_coarse";
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index c29cfb5..32c04b2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -253,6 +253,8 @@ vec4_instruction::can_do_writemask(const struct 
gen_device_info *devinfo)
 {
switch (opcode) {
case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -505,6 +507,12 @@ vec4_visitor::opt_reduce_swizzle()
   case BRW_OPCODE_DP2:
  swizzle = brw_swizzle_for_size(2);
  break;
+
+  case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+  case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+ swizzle = brw_swizzle_for_size(4);
+ break;
+
   default:
  swizzle = brw_swizzle_for_mask(inst->dst.writemask);
  break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 163cf9d..6f4c438 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1896,6 +1896,50 @@ generate_code(struct brw_codegen *p,
  break;
   }
 
+  case VEC4_OPCODE_DOUBLE_TO_FLOAT: {
+ assert(src[0].type == BRW_REGISTER_TYPE_DF);
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+ dst.width = BRW_WIDTH_4;
+ src[0].vstride = BRW_VERTICAL_STRIDE_4;
+ src[0].width = BRW_WIDTH_4;
+ brw_MOV(p, dst, src[0]);
+
+ struct brw_reg dst_as_src = dst;
+ dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+ dst.width = BRW_WIDTH_8;
+ brw_MOV(p, dst, dst_as_src);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+  }
+
+  case VEC4_OPCODE_FLOAT_TO_DOUBLE: {
+ assert(src[0].type == BRW_REGISTER_TYPE_F);
+ assert(dst.type == BRW_REGISTER_TYPE_DF);
+
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ struct brw_reg tmp = retype(dst, src[0].type);
+