Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-20 Thread Ilya Tocar
  
  The patch is OK with the above improvement.
  
 
 
 Will commit version below, if no objections in 24 hours.
 

Sorry,
I've missed palignr, which should also have v64qi version,
and lost return in expand_vec_perm_palignr case
(this caused avx512f-vec-unpack test failures).
Patch below fixes it. Ok for trunk?

2014-10-20  Ilya Tocar  ilya.to...@intel.com

* config/i386/i386.c (expand_vec_perm_1): Fix
expand_vec_perm_palignr case.
* config/i386/sse.md (ssse3_avx2_palignrmode_mask): Use
VI1_AVX512.

---
 gcc/config/i386/i386.c |  1 +
 gcc/config/i386/sse.md | 12 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 33b21f4..34273ca 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43552,6 +43552,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 
   /* Try the AVX2 vpalignr instruction.  */
   if (expand_vec_perm_palignr (d, true))
+return true;
 
   /* Try the AVX512F vpermi2 instructions.  */
   if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8157045..a3f336f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13716,14 +13716,14 @@
(set_attr mode DI)])
 
 (define_insn ssse3_avx2_palignrmode_mask
-  [(set (match_operand:VI1_AVX2 0 register_operand =v)
-(vec_merge:VI1_AVX2
- (unspec:VI1_AVX2
-   [(match_operand:VI1_AVX2 1 register_operand v)
-(match_operand:VI1_AVX2 2 nonimmediate_operand vm)
+  [(set (match_operand:VI1_AVX512 0 register_operand =v)
+(vec_merge:VI1_AVX512
+ (unspec:VI1_AVX512
+   [(match_operand:VI1_AVX512 1 register_operand v)
+(match_operand:VI1_AVX512 2 nonimmediate_operand vm)
 (match_operand:SI 3 const_0_to_255_mul_8_operand n)]
UNSPEC_PALIGNR)
-   (match_operand:VI1_AVX2 4 vector_move_operand 0C)
+   (match_operand:VI1_AVX512 4 vector_move_operand 0C)
(match_operand:avx512fmaskmode 5 register_operand Yk)))]
   TARGET_AVX512BW  (MODE_SIZE == 64 || TARGET_AVX512VL)
 {
-- 
1.8.3.1



Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-20 Thread Uros Bizjak
On Mon, Oct 20, 2014 at 5:19 PM, Ilya Tocar tocarip.in...@gmail.com wrote:
 
  The patch is OK with the above improvement.
 
 

 Will commit version below, if no objections in 24 hours.


 Sorry,
 I've missed palignr, which should also have v64qi version,
 and lost return in expand_vec_perm_palignr case
 (this caused avx512f-vec-unpack test failures).
 Patch below fixes it. Ok for trunk?

 2014-10-20  Ilya Tocar  ilya.to...@intel.com

 * config/i386/i386.c (expand_vec_perm_1): Fix
 expand_vec_perm_palignr case.
 * config/i386/sse.md (ssse3_avx2_palignrmode_mask): Use
 VI1_AVX512.

OK.

Thanks,
Uros.


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-16 Thread Ilya Tocar
On 10 Oct 18:37, Uros Bizjak wrote:
 On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar tocarip.in...@gmail.com wrote:
 
 
 Please recode that horrible first switch statement to:
 
 --cut here--
   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
 
   switch (mode)
 {
 case V8HImode:
   if (TARGET_AVX512VL  TARGET_AVX152BW)
 gen = gen_avx512vl_vpermi2varv8hi3;
   break;
 
 ...
 
 case V2DFmode:
   if (TARGET_AVX512VL)
 {
   gen = gen_avx512vl_vpermi2varv2df3;
   maskmode = V2DImode;
 
 The patch is OK with the above improvement.
 
 Thanks,
 Uros.


Will commit version below, if no objections in 24 hours.

---
 gcc/config/i386/i386.c | 292 ++---
 gcc/config/i386/sse.md |  45 
 2 files changed, 255 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index aedac19..e1228e3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21411,35 +21411,132 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+/* AVX512F does support 64-byte integer vector operations,
+   thus the longest vector we are faced with is V64QImode.  */
+#define MAX_VECT_LEN   64
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_operand_p;
+  bool testing_p;
+};
+
 static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
+  enum machine_mode maskmode = mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+
   switch (mode)
 {
+case V8HImode:
+  if (TARGET_AVX512VL  TARGET_AVX512BW)
+   gen = gen_avx512vl_vpermi2varv8hi3;
+  break;
+case V16HImode:
+  if (TARGET_AVX512VL  TARGET_AVX512BW)
+   gen = gen_avx512vl_vpermi2varv16hi3;
+  break;
+case V32HImode:
+  if (TARGET_AVX512BW)
+   gen = gen_avx512bw_vpermi2varv32hi3;
+  break;
+case V4SImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv4si3;
+  break;
+case V8SImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv8si3;
+  break;
 case V16SImode:
-  emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
-  force_reg (V16SImode, mask),
-  op1));
-  return true;
+  if (TARGET_AVX512F)
+   gen = gen_avx512f_vpermi2varv16si3;
+  break;
+case V4SFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv4sf3;
+ maskmode = V4SImode;
+   }
+  break;
+case V8SFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv8sf3;
+ maskmode = V8SImode;
+   }
+  break;
 case V16SFmode:
-  emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
-  force_reg (V16SImode, mask),
-  op1));
-  return true;
+  if (TARGET_AVX512F)
+   {
+ gen = gen_avx512f_vpermi2varv16sf3;
+ maskmode = V16SImode;
+   }
+  break;
+case V2DImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv2di3;
+  break;
+case V4DImode:
+  if (TARGET_AVX512VL)
+   gen = gen_avx512vl_vpermi2varv4di3;
+  break;
 case V8DImode:
-  emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
-  return true;
+  if (TARGET_AVX512F)
+   gen = gen_avx512f_vpermi2varv8di3;
+  break;
+case V2DFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv2df3;
+ maskmode = V2DImode;
+   }
+  break;
+case V4DFmode:
+  if (TARGET_AVX512VL)
+   {
+ gen = gen_avx512vl_vpermi2varv4df3;
+ maskmode = V4DImode;
+   }
+  break;
 case V8DFmode:
-  emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
-  return true;
+  if (TARGET_AVX512F)
+   {
+ gen = gen_avx512f_vpermi2varv8df3;
+ maskmode = V8DImode;
+   }
+  break;
 default:
-  return false;
+  break;
 }
+
+  if (gen == NULL)
+return false;
+
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  if (d)
+{
+  

Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-16 Thread Jakub Jelinek
On Thu, Oct 16, 2014 at 02:23:16PM +0400, Ilya Tocar wrote:
 On 10 Oct 18:37, Uros Bizjak wrote:
  On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar tocarip.in...@gmail.com wrote:
  
  
  Please recode that horrible first switch statement to:
  
  --cut here--
rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  
switch (mode)
  {
  case V8HImode:
if (TARGET_AVX512VL  TARGET_AVX152BW)
  gen = gen_avx512vl_vpermi2varv8hi3;
break;
  
  ...
  
  case V2DFmode:
if (TARGET_AVX512VL)
  {
gen = gen_avx512vl_vpermi2varv2df3;
maskmode = V2DImode;
  
  The patch is OK with the above improvement.
  
  Thanks,
  Uros.
 
 
 Will commit version below, if no objections in 24 hours.

No need to wait, it is ok now (with proper ChangeLog of course).

Jakub


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Ilya Tocar
On 09 Oct 20:51, Jakub Jelinek wrote:
 On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
  --- a/gcc/config/i386/i386.c
  +++ b/gcc/config/i386/i386.c
  @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
 return true;
   }
   
  -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
  +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, 
  struct expand_vec_perm_d *d)
 
 Too long line, please wrap it.

Fixed.

   {
  -  enum machine_mode mode = GET_MODE (op0);
  +  enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
  +
 switch (mode)
   {
  +case V8HImode:
  +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
  +   return false;
 
 My strong preference would be:
   enum machine_mode maskmode = mode;
   rtx (*gen) (rtx, rtx, rtx, rtx);
 right below the enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
 line and then inside of the first switch just do:
 ...
 case V16SImode:
   if (!TARGET_AVX512F)
   return false;
   gen = gen_avx512f_vpermi2varv16si3;
   break;
 case V4SFmode:
   if (!TARGET_AVX512VL)
   return false;
   gen = gen_avx512vl_vpermi2varv4sf3;
   maskmode = V4SImode;
   break;
 ...
 etc., then in the mask = line use:
 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d-nelt, vec));
 and finally instead of the second switch do:
   emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
   return true;

Updated patch below.

---
 gcc/config/i386/i386.c | 281 +++--
 gcc/config/i386/sse.md |  45 
 2 files changed, 253 insertions(+), 73 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..2247da8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21358,33 +21358,132 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+/* AVX512F does support 64-byte integer vector operations,
+   thus the longest vector we are faced with is V64QImode.  */
+#define MAX_VECT_LEN   64
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_operand_p;
+  bool testing_p;
+};
+
 static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc.  */
+  enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
+  enum machine_mode maskmode = mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+
   switch (mode)
 {
+case V8HImode:
+  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8hi3; 
+  break;
+case V16HImode:
+  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512vl_vpermi2varv16hi3;
+  break;
+case V32HImode:
+  if (!TARGET_AVX512BW)
+   return false;
+  gen = gen_avx512bw_vpermi2varv32hi3; 
+  break;
+case V4SImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4si3;
+  break;
+case V8SImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8si3;
+  break;
 case V16SImode:
-  emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
-  return true;
+  if (!TARGET_AVX512F)
+   return false;
+  gen = gen_avx512f_vpermi2varv16si3;
+  break;
+case V4SFmode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4sf3;
+  maskmode = V4SImode;
+  break;
+case V8SFmode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv8sf3;
+  maskmode = V8SImode;
+  break;
 case V16SFmode:
-  emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
-  return true;
+  if (!TARGET_AVX512F)
+   return false;
+  gen = gen_avx512f_vpermi2varv16sf3;
+  maskmode = V16SImode;
+  break;
+case V2DImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv2di3;
+  break;
+case V4DImode:
+  if (!TARGET_AVX512VL)
+   return false;
+  gen = gen_avx512vl_vpermi2varv4di3;
+  break;
 case V8DImode:
-  emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
-force_reg (V8DImode, mask), op1));
-  return true;
+  if (!TARGET_AVX512F)
+   return false;
+  

Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Jakub Jelinek
On Fri, Oct 10, 2014 at 07:47:19PM +0400, Ilya Tocar wrote:
 Updated patch below.

You haven't posted ChangeLog entry this time, so using the last one:

* config/i386/i386.c
   
(MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2.
   
...
* config/i386/sse.md
   
(define_mode_iterator VI1_AVX512): New. 
   
I'd think you should avoid the line break after filename in these
cases, so
* config/i386/i386.c (MAX_VECT_LEN): Move above
ix86_expand_vec_perm_vpermi2.
...
* config/i386/sse.md (define_mode_iterator VI1_AVX512): New.

Other than that nit it looks good to me.

Jakub


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-10 Thread Uros Bizjak
On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar tocarip.in...@gmail.com wrote:

 My strong preference would be:
   enum machine_mode maskmode = mode;
   rtx (*gen) (rtx, rtx, rtx, rtx);
 right below the enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
 line and then inside of the first switch just do:
 ...
 case V16SImode:
   if (!TARGET_AVX512F)
   return false;
   gen = gen_avx512f_vpermi2varv16si3;
   break;
 case V4SFmode:
   if (!TARGET_AVX512VL)
   return false;
   gen = gen_avx512vl_vpermi2varv4sf3;
   maskmode = V4SImode;
   break;
 ...
 etc., then in the mask = line use:
 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d-nelt, vec));
 and finally instead of the second switch do:
   emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
   return true;

 Updated patch below.

Please recode that horrible first switch statement to:

--cut here--
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  switch (mode)
{
case V8HImode:
  if (TARGET_AVX512VL  TARGET_AVX152BW)
gen = gen_avx512vl_vpermi2varv8hi3;
  break;

...

case V2DFmode:
  if (TARGET_AVX512VL)
{
  gen = gen_avx512vl_vpermi2varv2df3;
  maskmode = V2DImode;
}
  break;

default:
  break;
}

  if (gen == NULL)
return false;
--cut here--

The patch is OK with the above improvement.

(Please also note that the patch has a bunch of i386.md changes that
will clash with followup patch series).

Thanks,
Uros.


Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-09 Thread Ilya Tocar
Hi,

I think this patch should be split in 2 parts:
V64QI related and non-V64QI related.
This part contains non-V64QI related changes.
Also I've noticed, that not all patterns using VI1_AVX2,
actually have AVX512 versions, so fixed bogus patterns.

On 06 Oct 16:10, Jakub Jelinek wrote:
 On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote:
  --- a/gcc/config/i386/i386.c
  +++ b/gcc/config/i386/i386.c
  @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx 
  op0, rtx mask, rtx op1)
 enum machine_mode mode = GET_MODE (op0);
 switch (mode)
   {
  +  /* There is no byte version of vpermi2.  So we use vpermi2w.  */
  +case V64QImode:
...
 
 I believe this case doesn't belong to this function, other than this
 case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
 so it should always do that, and there should be a separate function
 that expands the worst case of V64QImode full 2 operand permutation.
 See my previous mail, IMHO it is doable with 5 instructions rather than 7.
 And IMHO we should have a separate function which emits that, supposedly
 one for the constant permutations, one for the variable case (perhaps
 then your 7 insn sequence is best?).
This will be done in following patch.
 
 Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
 supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
 right now plus D, either D would be NULL (then it would behave as now), or
 SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
 I.e. the function would start with a switch that would just contain the
 if (...)
 return false; 
 hunks plus break; for the success case, then code to generate CONST_VECTOR
 if sel is NULL_RTX from d, and finally another switch with just the emit
 cases.
Done.

 
  +case V8HImode:
  +  if (!TARGET_AVX512VL)
  +   return false;
  +  emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
  +  force_reg (V8HImode, mask), 
  op1));
  +  return true;
  +case V16HImode:
  +  if (!TARGET_AVX512VL)
  +   return false;
  +  emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
  +force_reg (V16HImode, mask), op1));
  +  return true;
 
 Aren't these two insns there only if both TARGET_AVX512VL  TARGET_AVX512BW?
 I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
 think neither of these depends on the other one in GCC.  That's unlike insns
 where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
 AVX512VL depends on AVX512F.

Good catch!

  @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   
 if (d-one_operand_p)
   return false;
  -  if (TARGET_AVX2  GET_MODE_SIZE (vmode) == 32)
  +  if (TARGET_AVX512F  GET_MODE_SIZE (vmode) == 64 
  +  GET_MODE_SIZE (GET_MODE_INNER (vmode)) = 4)
 
 Formatting,  belongs on the second line.
 
Fixed.

  +;
  +  else if (TARGET_AVX512VL)
 
 I'd add  GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here.
 AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
 and the == 32 and == 16 cases are handled because AVX512VL implies
 TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
 
As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and
works only on 32/16-byte mode this case is redundant, so I've removed
it.

  @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d 
  *d)
return false;
  }
  }
  +  else if (GET_MODE_SIZE (d-vmode) == 64)
  +   {
  + if (!TARGET_AVX512BW)
  +   return false;
  + if (vmode == V64QImode)
  +   {
  + for (i = 0; i  nelt; ++i)
  +   if ((d-perm[i] ^ i)  (nelt / 4))
  + return false;
 
 Missing comment, I'd duplicate the
   /* vpshufb only works intra lanes, it is not
  possible to shuffle bytes in between the lanes.  */
 comment there.
 
Done.

  @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d-vmode)
  {
  +   case V64QImode:
  + if (TARGET_AVX512VL)
 
 VL?  Isn't that BW?
 
  +   gen = gen_avx512bw_vec_dupv64qi;
  + break;
  case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
  +   case V32HImode:
  + if (TARGET_AVX512VL)
 
 Ditto.

Fixed.

  @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   mode = V8DImode;
 else if (mode == V16SFmode)
   mode = V16SImode;
  +  else if (mode == V4DFmode)
  +mode = V4DImode;
  +  else if (mode == V2DFmode)
  +mode = V2DImode;
  +  else if (mode == V8SFmode)
  +mode = V8SImode;
  +  else if (mode == V4SFmode)
  +mode = V4SImode;
 for (i = 0; i  nelt; ++i)
   vec[i] = GEN_INT (d-perm[i]);
 rtx mask = gen_rtx_CONST_VECTOR 

Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).

2014-10-09 Thread Jakub Jelinek
On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
 --- a/gcc/config/i386/i386.c
 +++ b/gcc/config/i386/i386.c
 @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
return true;
  }
  
 +/* AVX512F does support 64-byte integer vector operations,
 +   thus the longest vector we are faced with is V64QImode.  */
 +#define MAX_VECT_LEN 64
 +
 +struct expand_vec_perm_d
 +{
 +  rtx target, op0, op1;
 +  unsigned char perm[MAX_VECT_LEN];
 +  enum machine_mode vmode;
 +  unsigned char nelt;
 +  bool one_operand_p;
 +  bool testing_p;
 +};
 +
  static bool
 -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
 +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct 
 expand_vec_perm_d *d)

Too long line, please wrap it.

  {
 -  enum machine_mode mode = GET_MODE (op0);
 +  enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
 +
switch (mode)
  {
 +case V8HImode:
 +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
 + return false;
 +  break;
 +case V16HImode:
 +  if (!TARGET_AVX512VL || !TARGET_AVX512BW)
 + return false;
 +case V32HImode:
 +  if (!TARGET_AVX512BW)
 + return false;
 +  break;
 +case V4SImode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V8SImode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V16SImode:
 +  if (!TARGET_AVX512F)
 + return false;
 +  break;
 +case V4SFmode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V8SFmode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V16SFmode:
 +  if (!TARGET_AVX512F)
 + return false;
 +  break;
 +case V2DImode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V4DImode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V8DImode:
 +  if (!TARGET_AVX512F)
 + return false;
 +  break;
 +case V2DFmode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V4DFmode:
 +  if (!TARGET_AVX512VL)
 + return false;
 +  break;
 +case V8DFmode:
 +  if (!TARGET_AVX512F)
 + return false;
 +  break;
 +default:
 +  return false;
 +}
 +
 +  /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const 
 expander,
 + so args are either in d, or in op0, op1 etc.  */
 +  if (d)
 +{
 +  rtx vec[64];
 +  target = d-target;
 +  op0 = d-op0;
 +  op1 = d-op1;
 +  for (int i = 0; i  d-nelt; ++i)
 + vec[i] = GEN_INT (d-perm[i]);
 +  mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d-nelt, vec));

Shouldn't the mask use integral vector mode rather than floating?

My strong preference would be:
  enum machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx);
right below the enum machine_mode mode = GET_MODE (d ? d-op0 : op0);
line and then inside of the first switch just do:
...
case V16SImode:
  if (!TARGET_AVX512F)
return false;
  gen = gen_avx512f_vpermi2varv16si3;
  break;
case V4SFmode:
  if (!TARGET_AVX512VL)
return false;
  gen = gen_avx512vl_vpermi2varv4sf3;
  maskmode = V4SImode;
  break;
...
etc., then in the mask = line use:
mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d-nelt, vec));
and finally instead of the second switch do:
  emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
  return true;

Otherwise, the patch LGTM, but will leave the final approval to Uros.

Jakub