Hi!

In lots of patterns we assume that we never see xmm16+ hard registers
with 128-bit and 256-bit vector modes when -mavx512vl is not enabled,
because HARD_REGNO_MODE_OK refuses those.
Unfortunately, as this testcase and patch show, the vec_extract_lo*
splitters work as a loophole around this: we happily create instructions
like (set (reg:V32QI xmm5) (reg:V32QI xmm16)), and then hard register
propagation can propagate the V32QI xmm16 into other insns like vpand.

The following patch fixes it by making sure we never create such registers.
Instead we emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)), which by copying
all 512 bits also copies the low bits.  As the destination is originally
V32QI, which is not HARD_REGNO_MODE_OK in xmm16+ (i.e. the destination is
never an xmm16+ register itself), this should be fine.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2018-04-11  Jakub Jelinek  <ja...@redhat.com>

        PR target/85328
        * config/i386/sse.md
        (<mask_codefor>avx512dq_vextract<shuffletype>64x2_1<mask_name> split,
        <mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name> split,
        vec_extract_lo_<mode><mask_name> split, vec_extract_lo_v32hi,
        vec_extract_lo_v64qi): For non-AVX512VL if input is xmm16+ reg
        and output is a reg, avoid creating invalid lowpart subreg, but
        instead split into a 512-bit move.

        * gcc.target/i386/pr85328.c: New test.

--- gcc/config/i386/sse.md.jj   2018-04-10 14:37:02.092801344 +0200
+++ gcc/config/i386/sse.md      2018-04-11 12:00:44.296840287 +0200
@@ -7362,7 +7362,15 @@ (define_split
          (parallel [(const_int 0) (const_int 1)])))]
   "TARGET_AVX512DQ && reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssequartermode>mode);
+  else
+    operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);
+})
 
 (define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>"
   [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" "=<store_mask_constraint>")
@@ -7395,7 +7403,15 @@ (define_split
                     (const_int 2) (const_int 3)])))]
   "TARGET_AVX512F && reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssequartermode>mode);
+  else
+    operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);
+})
 
 (define_mode_attr extract_type_2
   [(V16SF "avx512dq") (V16SI "avx512dq") (V8DF "avx512f") (V8DI "avx512f")])
@@ -7655,7 +7671,15 @@ (define_split
   "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))
    && reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssehalfvecmode>mode);
+  else
+    operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+})
 
 (define_insn "vec_extract_lo_<mode><mask_name>"
   [(set (match_operand:<ssehalfvecmode> 0 "<store_mask_predicate>" "=v,m")
@@ -7830,7 +7854,14 @@ (define_insn_and_split "vec_extract_lo_v
   "#"
   "&& reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (V16HImode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode);
+  else
+    operands[1] = gen_lowpart (V16HImode, operands[1]);
+})
 
 (define_insn "vec_extract_hi_v32hi"
   [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,m")
@@ -7915,7 +7946,14 @@ (define_insn_and_split "vec_extract_lo_v
   "#"
   "&& reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (V32QImode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0] = lowpart_subreg (V64QImode, operands[0], V32QImode);
+  else
+    operands[1] = gen_lowpart (V32QImode, operands[1]);
+})
 
 (define_insn "vec_extract_hi_v64qi"
   [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
--- gcc/testsuite/gcc.target/i386/pr85328.c.jj  2018-04-11 12:07:15.044933408 +0200
+++ gcc/testsuite/gcc.target/i386/pr85328.c     2018-04-11 10:45:17.269733600 +0200
@@ -0,0 +1,18 @@
+/* PR target/85328 */
+/* { dg-do assemble { target avx512f } } */
+/* { dg-options "-O3 -fno-caller-saves -mavx512f" } */
+
+typedef char U __attribute__((vector_size (64)));
+typedef int V __attribute__((vector_size (64)));
+U a, b;
+
+extern void bar (void);
+
+V
+foo (V f)
+{
+  b <<= (U){(V){}[63]} & 7;
+  bar ();
+  a = (U)f & 7;
+  return (V)b;
+}

        Jakub

Reply via email to