Hi! In lots of patterns we assume that we never see xmm16+ hard registers with 128-bit and 256-bit vector modes when -mavx512vl is not enabled, because HARD_REGNO_MODE_OK refuses those. Unfortunately, as this testcase and patch show, the vec_extract_lo* splitters work as a loophole around this: we happily create instructions like (set (reg:V32QI xmm5) (reg:V32QI xmm16)), and then hard register propagation can propagate the V32QI xmm16 into other insns like vpand.
The following patch fixes it by making sure we never create such registers, just emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)) instead, which by copying all the 512 bits also copies the low bits, and as the destination is originally V32QI which is not HARD_REGNO_MODE_OK in xmm16+, this should be fine. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2018-04-11 Jakub Jelinek <ja...@redhat.com> PR target/85328 * config/i386/sse.md (<mask_codefor>avx512dq_vextract<shuffletype>64x2_1<mask_name> split, <mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name> split, vec_extract_lo_<mode><mask_name> split, vec_extract_lo_v32hi, vec_extract_lo_v64qi): For non-AVX512VL if input is xmm16+ reg and output is a reg, avoid creating invalid lowpart subreg, but instead split into a 512-bit move. * gcc.target/i386/pr85328.c: New test. --- gcc/config/i386/sse.md.jj 2018-04-10 14:37:02.092801344 +0200 +++ gcc/config/i386/sse.md 2018-04-11 12:00:44.296840287 +0200 @@ -7362,7 +7362,15 @@ (define_split (parallel [(const_int 0) (const_int 1)])))] "TARGET_AVX512DQ && reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);") +{ + if (!TARGET_AVX512VL + && REG_P (operands[0]) + && EXT_REX_SSE_REG_P (operands[1])) + operands[0] + = lowpart_subreg (<MODE>mode, operands[0], <ssequartermode>mode); + else + operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]); +}) (define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>" [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" "=<store_mask_constraint>") @@ -7395,7 +7403,15 @@ (define_split (const_int 2) (const_int 3)])))] "TARGET_AVX512F && reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);") +{ + if (!TARGET_AVX512VL + && REG_P (operands[0]) + && EXT_REX_SSE_REG_P (operands[1])) + operands[0] + = lowpart_subreg (<MODE>mode, operands[0], 
<ssequartermode>mode); + else + operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]); +}) (define_mode_attr extract_type_2 [(V16SF "avx512dq") (V16SI "avx512dq") (V8DF "avx512f") (V8DI "avx512f")]) @@ -7655,7 +7671,15 @@ (define_split "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1])) && reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);") +{ + if (!TARGET_AVX512VL + && REG_P (operands[0]) + && EXT_REX_SSE_REG_P (operands[1])) + operands[0] + = lowpart_subreg (<MODE>mode, operands[0], <ssehalfvecmode>mode); + else + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); +}) (define_insn "vec_extract_lo_<mode><mask_name>" [(set (match_operand:<ssehalfvecmode> 0 "<store_mask_predicate>" "=v,m") @@ -7830,7 +7854,14 @@ (define_insn_and_split "vec_extract_lo_v "#" "&& reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (V16HImode, operands[1]);") +{ + if (!TARGET_AVX512VL + && REG_P (operands[0]) + && EXT_REX_SSE_REG_P (operands[1])) + operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode); + else + operands[1] = gen_lowpart (V16HImode, operands[1]); +}) (define_insn "vec_extract_hi_v32hi" [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,m") @@ -7915,7 +7946,14 @@ (define_insn_and_split "vec_extract_lo_v "#" "&& reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[1] = gen_lowpart (V32QImode, operands[1]);") +{ + if (!TARGET_AVX512VL + && REG_P (operands[0]) + && EXT_REX_SSE_REG_P (operands[1])) + operands[0] = lowpart_subreg (V64QImode, operands[0], V32QImode); + else + operands[1] = gen_lowpart (V32QImode, operands[1]); +}) (define_insn "vec_extract_hi_v64qi" [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m") --- gcc/testsuite/gcc.target/i386/pr85328.c.jj 2018-04-11 12:07:15.044933408 +0200 +++ gcc/testsuite/gcc.target/i386/pr85328.c 2018-04-11 10:45:17.269733600 +0200 @@ -0,0 +1,18 @@ +/* PR 
target/85328 */ +/* { dg-do assemble { target avx512f } } */ +/* { dg-options "-O3 -fno-caller-saves -mavx512f" } */ + +typedef char U __attribute__((vector_size (64))); +typedef int V __attribute__((vector_size (64))); +U a, b; + +extern void bar (void); + +V +foo (V f) +{ + b <<= (U){(V){}[63]} & 7; + bar (); + a = (U)f & 7; + return (V)b; +} Jakub