Re: [Mesa-dev] [PATCH 02/10] intel/fs: Implement quad swizzles on ICL+.

2019-01-07 Thread Iago Toral
On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote:
> Align16 is no longer a thing, so a new implementation is provided
> using Align1 instead.  Not all possible swizzles can be represented
> as
> a single Align1 region, but some fast paths are provided for
> frequently used swizzles that can be represented efficiently in
> Align1
> mode.
> 
> Fixes ~90 subgroup quad swap Vulkan CTS tests.
> 
> Cc: mesa-sta...@lists.freedesktop.org
> ---
>  src/intel/compiler/brw_fs.cpp   | 25 +++-
>  src/intel/compiler/brw_fs.h |  4 ++
>  src/intel/compiler/brw_fs_generator.cpp | 82 ---
> --
>  3 files changed, 93 insertions(+), 18 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> index 2f0f0151219..97544fdf465 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard()
> const
> * may stomp all over it.
> */
>return true;
> +   case SHADER_OPCODE_QUAD_SWIZZLE:
> +  switch (src[1].ud) {

Maybe it is worth adding a small comment here indicating that these are
the cases where we implement the opcode as a single instruction and
refer to the generator for details?

> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_XXZZ:
> +  case BRW_SWIZZLE_YYWW:
> +  case BRW_SWIZZLE_XYXY:
> +  case BRW_SWIZZLE_ZWZW:
> + return false;
> +  default:
> + return !is_uniform(src[0]);

Shouldn't this be:

return !is_uniform(src[0]) ||
   (devinfo->gen < 11 && type_sz(src.type) == 4);

Since in that case we also implement the opcode with a single ALIGN16
instruction.

> +  }
> default:
>/* The SIMD16 compressed instruction
> *
> @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct
> gen_device_info *devinfo,
> case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
>return MIN2(8, inst->exec_size);
>  
> -   case SHADER_OPCODE_QUAD_SWIZZLE:
> -  return 8;
> -
> +   case SHADER_OPCODE_QUAD_SWIZZLE: {
> +  const unsigned swiz = inst->src[1].ud;
> +  return (is_uniform(inst->src[0]) ?
> + get_fpu_lowered_simd_width(devinfo, inst) :
> +  devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ?
> 8 :
> +  swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ?
> 4 :
> +  get_fpu_lowered_simd_width(devinfo, inst));
> +   }
> case SHADER_OPCODE_MOV_INDIRECT: {
>/* From IVB and HSW PRMs:
> *
> diff --git a/src/intel/compiler/brw_fs.h
> b/src/intel/compiler/brw_fs.h
> index 53d9b6ce7bf..dc36ecc21ac 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -480,6 +480,10 @@ private:
>   struct brw_reg src,
>   struct brw_reg idx);
>  
> +   void generate_quad_swizzle(const fs_inst *inst,
> +  struct brw_reg dst, struct brw_reg
> src,
> +  unsigned swiz);
> +
> bool patch_discard_jumps_to_fb_writes();
>  
> const struct brw_compiler *compiler;
> diff --git a/src/intel/compiler/brw_fs_generator.cpp
> b/src/intel/compiler/brw_fs_generator.cpp
> index 08dd83dded7..84627e83132 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
> }
>  }
>  
> +void
> +fs_generator::generate_quad_swizzle(const fs_inst *inst,
> +struct brw_reg dst, struct
> brw_reg src,
> +unsigned swiz)
> +{
> +   /* Requires a quad. */
> +   assert(inst->exec_size >= 4);
> +
> +   if (src.file == BRW_IMMEDIATE_VALUE ||
> +   has_scalar_region(src)) {
> +  /* The value is uniform across all channels */
> +  brw_MOV(p, dst, src);
> +
> +   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
> +  /* This only works on 8-wide 32-bit values */
> +  assert(inst->exec_size == 8);
> +  assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> +  assert(src.vstride == src.width + 1);
> +  brw_set_default_access_mode(p, BRW_ALIGN_16);
> +  struct brw_reg swiz_src = stride(src, 4, 4, 1);
> +  swiz_src.swizzle = swiz;
> +  brw_MOV(p, dst, swiz_src);
> +

Extra blank line.

> +   } else {
> +  assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> +  assert(src.vstride == src.width + 1);
> +  const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz,
> 0));
> +
> +  switch (swiz) {
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> +  case BRW_SWIZZLE_:
> + brw_MOV(p, dst, stride(src_0, 4, 4, 0));
> + break;
> +
> +  case BRW_SWIZZLE_XXZZ:
> +  case BRW_SWIZZLE_YYWW:
> + 

[Mesa-dev] [PATCH 02/10] intel/fs: Implement quad swizzles on ICL+.

2018-12-29 Thread Francisco Jerez
Align16 is no longer a thing, so a new implementation is provided
using Align1 instead.  Not all possible swizzles can be represented as
a single Align1 region, but some fast paths are provided for
frequently used swizzles that can be represented efficiently in Align1
mode.

Fixes ~90 subgroup quad swap Vulkan CTS tests.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp   | 25 +++-
 src/intel/compiler/brw_fs.h |  4 ++
 src/intel/compiler/brw_fs_generator.cpp | 82 -
 3 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2f0f0151219..97544fdf465 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard() const
* may stomp all over it.
*/
   return true;
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+  switch (src[1].ud) {
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_XXZZ:
+  case BRW_SWIZZLE_YYWW:
+  case BRW_SWIZZLE_XYXY:
+  case BRW_SWIZZLE_ZWZW:
+ return false;
+  default:
+ return !is_uniform(src[0]);
+  }
default:
   /* The SIMD16 compressed instruction
*
@@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct gen_device_info 
*devinfo,
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   return MIN2(8, inst->exec_size);
 
-   case SHADER_OPCODE_QUAD_SWIZZLE:
-  return 8;
-
+   case SHADER_OPCODE_QUAD_SWIZZLE: {
+  const unsigned swiz = inst->src[1].ud;
+  return (is_uniform(inst->src[0]) ?
+ get_fpu_lowered_simd_width(devinfo, inst) :
+  devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
+  swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
+  get_fpu_lowered_simd_width(devinfo, inst));
+   }
case SHADER_OPCODE_MOV_INDIRECT: {
   /* From IVB and HSW PRMs:
*
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 53d9b6ce7bf..dc36ecc21ac 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -480,6 +480,10 @@ private:
  struct brw_reg src,
  struct brw_reg idx);
 
+   void generate_quad_swizzle(const fs_inst *inst,
+  struct brw_reg dst, struct brw_reg src,
+  unsigned swiz);
+
bool patch_discard_jumps_to_fb_writes();
 
const struct brw_compiler *compiler;
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 08dd83dded7..84627e83132 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
}
 }
 
+void
+fs_generator::generate_quad_swizzle(const fs_inst *inst,
+struct brw_reg dst, struct brw_reg src,
+unsigned swiz)
+{
+   /* Requires a quad. */
+   assert(inst->exec_size >= 4);
+
+   if (src.file == BRW_IMMEDIATE_VALUE ||
+   has_scalar_region(src)) {
+  /* The value is uniform across all channels */
+  brw_MOV(p, dst, src);
+
+   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
+  /* This only works on 8-wide 32-bit values */
+  assert(inst->exec_size == 8);
+  assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+  assert(src.vstride == src.width + 1);
+  brw_set_default_access_mode(p, BRW_ALIGN_16);
+  struct brw_reg swiz_src = stride(src, 4, 4, 1);
+  swiz_src.swizzle = swiz;
+  brw_MOV(p, dst, swiz_src);
+
+   } else {
+  assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+  assert(src.vstride == src.width + 1);
+  const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
+
+  switch (swiz) {
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+  case BRW_SWIZZLE_:
+ brw_MOV(p, dst, stride(src_0, 4, 4, 0));
+ break;
+
+  case BRW_SWIZZLE_XXZZ:
+  case BRW_SWIZZLE_YYWW:
+ brw_MOV(p, dst, stride(src_0, 2, 2, 0));
+ break;
+
+  case BRW_SWIZZLE_XYXY:
+  case BRW_SWIZZLE_ZWZW:
+ assert(inst->exec_size == 4);
+ brw_MOV(p, dst, stride(src_0, 0, 2, 1));
+ break;
+
+  default:
+ assert(inst->force_writemask_all);
+ brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
+
+ for (unsigned c = 0; c < 4; c++) {
+brw_inst *insn = brw_MOV(
+   p, stride(suboffset(dst, c),
+ 4 * inst->dst.stride, 1, 4 * inst->dst.stride),
+   stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
+
+brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
+