Hello! The attached patch introduces a peephole2 pattern to avoid an intermediate DImode register in the interunit zero-extend sequence.
However, it looks like there is still a slight problem with RA. Without -mtune=intel, we have direct GR->XMM interunit moves disabled, but the pr80425-2.c testcase compiles to:

    movl a(%rip), %eax
    movq %rax, -56(%rbp)
    vmovq -56(%rbp), %xmm1

The compiler could emit a direct mem->XMM zero-extending move, without the intermediate stack slot.

2017-05-15 Uros Bizjak <ubiz...@gmail.com>

    * config/i386/i386.md (*zero_extendsidi2): Do not penalize non-interunit SSE move alternatives with '?'.
    (zero-extendsidi peephole2): New peephole to skip intermediate general register in SSE zero-extend sequence.

testsuite/ChangeLog:

2017-05-15 Uros Bizjak <ubiz...@gmail.com>

    * gcc.target/i386/pr80425-1.c: New test.
    * gcc.target/i386/pr80425-2.c: Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Committed to mainline SVN.

Uros.
Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 248065) +++ config/i386/i386.md (working copy) @@ -3762,10 +3762,10 @@ (define_insn "*zero_extendsidi2" [(set (match_operand:DI 0 "nonimmediate_operand" - "=r,?r,?o,r ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,?*x,?*v,*r") + "=r,?r,?o,r ,o,?*Ym,?!*y,?r ,?r,?*Yi,*x,*x,*v,*r") (zero_extend:DI (match_operand:SI 1 "x86_64_zext_operand" - "0 ,rm,r ,rmWz,0,r ,m ,*Yj,*x,r ,m , *x, *v,*k")))] + "0 ,rm,r ,rmWz,0,r ,m ,*Yj,*x,r ,m ,*x,*v,*k")))] "" { switch (get_attr_type (insn)) @@ -3885,6 +3885,15 @@ (set (match_dup 4) (const_int 0))] "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);") +(define_peephole2 + [(set (match_operand:DI 0 "general_reg_operand") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_gr_operand"))) + (set (match_operand:DI 2 "sse_reg_operand") (match_dup 0))] + "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC + && peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 2) + (zero_extend:DI (match_dup 1)))]) + (define_mode_attr kmov_isa [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw")]) Index: testsuite/gcc.target/i386/pr80425-1.c =================================================================== --- testsuite/gcc.target/i386/pr80425-1.c (nonexistent) +++ testsuite/gcc.target/i386/pr80425-1.c (working copy) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mtune=intel" } */ + +#include <x86intrin.h> + +__m512i +f1 (__m512i x, int a) +{ + return _mm512_srai_epi32 (x, a); +} + +/* { dg-final { scan-assembler-times "movd\[ \\t\]+\[^\n\]*%xmm" 1 } } */ Index: testsuite/gcc.target/i386/pr80425-2.c =================================================================== --- testsuite/gcc.target/i386/pr80425-2.c (nonexistent) +++ testsuite/gcc.target/i386/pr80425-2.c (working copy) @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mtune=intel" } */ + 
+#include <x86intrin.h> + +extern int a; + +__m512i +f1 (__m512i x) +{ + return _mm512_srai_epi32 (x, a); +} + +/* { dg-final { scan-assembler-times "movd\[ \\t\]+\[^\n\]*%xmm" 1 } } */