On Wed, 20 Nov 2013, Jakub Jelinek wrote:

> Hi!
> 
> I've noticed we generate terrible code for the testcase below.
> E.g. with -mavx2 it is:
>         leal    6(%rdi), %edx
>         leal    12(%rdi), %ecx
>         leal    18(%rdi), %esi
>         leal    3(%rdi), %eax
>         movl    %edx, -20(%rsp)
>         movl    %ecx, -24(%rsp)
>         leal    9(%rdi), %edx
>         movl    %esi, -28(%rsp)
>         vmovd   -20(%rsp), %xmm6
>         leal    15(%rdi), %ecx
>         movl    %edi, -20(%rsp)
>         leal    21(%rdi), %esi
>         vmovd   -28(%rsp), %xmm4
>         vmovd   -24(%rsp), %xmm5
>         vpinsrd $1, %edx, %xmm6, %xmm3
>         vmovd   -20(%rsp), %xmm7
>         vpinsrd $1, %esi, %xmm4, %xmm2
>         vpinsrd $1, %ecx, %xmm5, %xmm1
>         vpinsrd $1, %eax, %xmm7, %xmm0
>         vpunpcklqdq     %xmm2, %xmm1, %xmm1
>         vpunpcklqdq     %xmm3, %xmm0, %xmm0
>         vinserti128     $0x1, %xmm1, %ymm0, %ymm0
> to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 }
> CONSTRUCTOR into a vector register.  This patch expands it as:
>         movl    %edi, -20(%rsp)
>         vbroadcastss    -20(%rsp), %ymm0
>         vpaddd  .LC0(%rip), %ymm0, %ymm0
> instead.  Similarly for SSE2:
>         leal    3(%rdi), %eax
>         movl    %eax, -20(%rsp)
>         leal    6(%rdi), %eax
>         movd    -20(%rsp), %xmm4
>         movl    %eax, -16(%rsp)
>         leal    9(%rdi), %eax
>         movd    -16(%rsp), %xmm1
>         movl    %edi, -16(%rsp)
>         movl    %eax, -12(%rsp)
>         movd    -16(%rsp), %xmm0
>         movd    -12(%rsp), %xmm3
>         punpckldq       %xmm4, %xmm0
>         punpckldq       %xmm3, %xmm1
>         punpcklqdq      %xmm1, %xmm0
> instead of:
>         movl    %edi, -12(%rsp)
>         movd    -12(%rsp), %xmm3
>         pshufd  $0, %xmm3, %xmm0
>         paddd   .LC0(%rip), %xmm0
> 
> The patch leaves that decision to the target (in it's
> *_expand_vector_init).
> 
> Ok?

Aww ;)  Nice improvement.  Generally when I see this I always wonder
whether we want to do this kind of stuff pre RTL expansion.
1st to not rely on being able to TER, 2nd to finally eventually
get rid of TER.

These patches are unfortunately a step backward for #2.

As of the patch, do we have a way to query whether the target
can efficiently broadcast?  If so this IMHO belongs in generic
code, not in ix86_expand_vector_init - passing down plus_constant
results may pessimize other targets that don't understand this
trick, no?  (OTOH, which vector ISA does _not_ know how to
broadcast ...?)

Thanks,
Richard.

> 2013-11-20  Jakub Jelinek  <ja...@redhat.com>
> 
>       * expr.c (store_constructor): If all CONSTRUCTOR values are
>       the same SSA_NAME plus optional constant, pass plus_constant results
>       down to vec_init.
>       * config/i386/i386.c (ix86_expand_vector_init): Optimize it using
>       broadcast followed by addition of a constant vector.
> 
>       * gcc.dg/vect/vect-124.c: New test.
> 
> --- gcc/expr.c.jj     2013-11-19 21:56:27.000000000 +0100
> +++ gcc/expr.c        2013-11-20 09:24:14.435860130 +0100
> @@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target,
>       rtvec vector = NULL;
>       unsigned n_elts;
>       alias_set_type alias;
> +     tree base = NULL_TREE;
> +     rtx base_rtx = NULL_RTX;
>  
>       gcc_assert (eltmode != BLKmode);
>  
> @@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target,
>                                   TYPE_SIZE (TREE_TYPE (value)),
>                                   TYPE_SIZE (elttype)));
>  
> +             /* Try to detect fairly common pattern of
> +                { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }.  */
> +             if (vector
> +                 && (count == 0 || base)
> +                 && TREE_CODE (value) == SSA_NAME)
> +               {
> +                 gimple g = get_gimple_for_ssa_name (value);
> +                 tree this_base = value;
> +                 if (g && is_gimple_assign (g))
> +                   switch (gimple_assign_rhs_code (g))
> +                     {
> +                     case PLUS_EXPR:
> +                     case POINTER_PLUS_EXPR:
> +                       if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
> +                           && tree_fits_shwi_p (gimple_assign_rhs2 (g)))
> +                         this_base = gimple_assign_rhs1 (g);
> +                       break;
> +                     default:
> +                       break;
> +                     }
> +                 if (count == 0)
> +                   base = this_base;
> +                 else if (base != this_base)
> +                   base = NULL_TREE;
> +               }
>               count += n_elts_here;
>               if (mostly_zeros_p (value))
>                 zero_count += n_elts_here;
> @@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target,
>           /* Clear the entire vector first if there are any missing elements,
>              or if the incidence of zero elements is >= 75%.  */
>           need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count);
> +
> +         if (count < n_elts)
> +           base = NULL_TREE;
>         }
>  
>       if (need_to_clear && size > 0 && !vector)
> @@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target,
>       else
>         alias = get_alias_set (elttype);
>  
> +     if (base)
> +       base_rtx = expand_normal (base);
> +
>          /* Store each element of the constructor into the corresponding
>          element of TARGET, determined by counting the elements.  */
>       for (idx = 0, i = 0;
> @@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target,
>               /* Vector CONSTRUCTORs should only be built from smaller
>                  vectors in the case of BLKmode vectors.  */
>               gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
> -             RTVEC_ELT (vector, eltpos)
> -               = expand_normal (value);
> +             if (base)
> +               {
> +                 if (value == base)
> +                   RTVEC_ELT (vector, eltpos) = base_rtx;
> +                 else
> +                   {
> +                     gimple g = get_gimple_for_ssa_name (value);
> +                     rtx cst = expand_normal (gimple_assign_rhs2 (g));
> +                     RTVEC_ELT (vector, eltpos)
> +                       = plus_constant (TYPE_MODE (TREE_TYPE (value)),
> +                                        base_rtx, INTVAL (cst));
> +                   }
> +               }
> +             else
> +               RTVEC_ELT (vector, eltpos) = expand_normal (value);
>             }
>           else
>             {
> --- gcc/config/i386/i386.c.jj 2013-11-19 21:56:41.000000000 +0100
> +++ gcc/config/i386/i386.c    2013-11-20 09:20:16.837649403 +0100
> @@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt
>    enum machine_mode inner_mode = GET_MODE_INNER (mode);
>    int n_elts = GET_MODE_NUNITS (mode);
>    int n_var = 0, one_var = -1;
> -  bool all_same = true, all_const_zero = true;
> +  bool all_same = true, all_const_zero = true, need_force_reg = false;
>    int i;
>    rtx x;
> +  rtx base = NULL_RTX;
>  
>    for (i = 0; i < n_elts; ++i)
>      {
> @@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt
>        if (!(CONST_INT_P (x)
>           || GET_CODE (x) == CONST_DOUBLE
>           || GET_CODE (x) == CONST_FIXED))
> -     n_var++, one_var = i;
> +     {
> +       n_var++;
> +       one_var = i;
> +       if (!REG_P (x) && !MEM_P (x))
> +         need_force_reg = true;
> +     }
>        else if (x != CONST0_RTX (inner_mode))
>       all_const_zero = false;
>        if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
>       all_same = false;
> +      if (i == 0 || base)
> +     {
> +       if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
> +         x = XEXP (x, 0);
> +       if (i == 0)
> +         base = x;
> +       else if (!rtx_equal_p (x, base))
> +         base = NULL_RTX;
> +     }
>      }
>  
>    /* Constants are best loaded from the constant pool.  */
> @@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt
>        return;
>      }
>  
> +  /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast
> +     followed by addition of constant.  */
> +  if (!all_same
> +      && base
> +      && n_elts >= 4
> +      && (REG_P (base) || MEM_P (base)))
> +    {
> +      rtx subtarget = gen_reg_rtx (mode);
> +      if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base))
> +     {
> +       rtvec cstvals = rtvec_alloc (n_elts);
> +       rtx cstsubtarget = gen_reg_rtx (mode);
> +       for (i = 0; i < n_elts; i++)
> +         {
> +           x = XVECEXP (vals, 0, i);
> +           if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
> +             RTVEC_ELT (cstvals, i) = XEXP (x, 1);
> +           else
> +             RTVEC_ELT (cstvals, i) = const0_rtx;
> +         }
> +       ix86_expand_vector_init (mmx_ok, cstsubtarget,
> +                                gen_rtx_PARALLEL (mode, cstvals));
> +       subtarget = expand_simple_binop (mode, PLUS, subtarget,
> +                                        cstsubtarget, target, 1,
> +                                        OPTAB_DIRECT);
> +       if (subtarget != target)
> +         emit_move_insn (target, subtarget);
> +       return;
> +     }
> +    }
> +
> +  if (need_force_reg)
> +    for (i = 0; i < n_elts; ++i)
> +      {
> +     x = XVECEXP (vals, 0, i);
> +     if (!(CONST_INT_P (x)
> +           || GET_CODE (x) == CONST_DOUBLE
> +           || GET_CODE (x) == CONST_FIXED
> +           || REG_P (x)
> +           || MEM_P (x)))
> +       XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x);
> +      }
> +
>    /* If all values are identical, broadcast the value.  */
>    if (all_same
>        && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
> --- gcc/testsuite/gcc.dg/vect/vect-124.c.jj   2013-11-20 09:27:35.760798649 
> +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-124.c      2013-11-20 09:29:32.219199794 
> +0100
> @@ -0,0 +1,28 @@
> +#include "tree-vect.h"
> +
> +#ifndef N
> +#define N 64
> +#endif
> +
> +int a[N];
> +
> +__attribute__((noinline, noclone)) void
> +foo (int x)
> +{
> +  int i;
> +  for (i = 0; i < N; i++, x += 3)
> +    a[i] = x;
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  
> +  check_vect ();
> +  foo (6);
> +  for (i = 0; i < N; i++)
> +    if (a[i] != i * 3 + 6)
> +      abort ();
> +  return 0;
> +}
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

Reply via email to