On 12/14/18, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> In the previous patch I've unfortunately left out one important case from
> the testcase, and apparently it wasn't covered by anything else in the
> testsuite.
> The 3 functions covered float and double gathers with indexes of the same
> bitsize and a WIDENING gather (double gather with int index), but didn't
> cover
> the NARROWING case (float gather with long index with -m64).  That was the only
> case that tried to permute the mask, unfortunately that isn't really
> supported and ICEs.  What works is VEC_UNPACK_{LO,HI}_EXPR on the
> VECTOR_BOOLEAN_TYPE_P, that is what other spots in the vectorizer emit for
> those.
>
> I had to also fix up the x86 backend, which had in expansion of these
> NARROWING gather builtins code cut&pasted from the 256-bit builtin,
> unfortunately it wasn't adjusted for the fact that the 512-bit builtin uses
> integral mask argument while the 256-bit one doesn't.  And even in the
> 256-bit one there was a bug, it relied on the mask and src arguments to be
> always in the same register (which is actually what the vectorizer
> generates
> for those right now, but it could do something else).
>
> This patch fixes that and also enables masked x86 AVX512F 512-bit
> scatter support.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> What is still unhandled (doesn't vectorize) is 128-bit or 256-bit scatters;
> I bet the mask operand is vectorized using normal non-bool vectors, but the
> instructions with AVX512VL actually need a mask register.  There are
> instructions that can handle that, but let's defer that for later.
>
> 2018-12-14  Jakub Jelinek  <ja...@redhat.com>
>
>       PR tree-optimization/88464
>       * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING
>       and mask with integral masktype, don't try to permute mask vectors,
>       instead emit VEC_UNPACK_{LO,HI}_EXPR.  Fix up NOP_EXPR operand.
>       (vectorizable_store): Handle masked scatters with decl and integral
>       mask type.
>       (permute_vec_elements): Allow scalar_dest to be NULL.
>       * config/i386/i386.c (ix86_expand_builtin)
>       <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks.
>       <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have
>       to be the same.
>
>       * gcc.target/i386/avx512f-pr88462-1.c: Rename to ...
>       * gcc.target/i386/avx512f-pr88464-1.c: ... this.  Fix up PR number.
>       Expect 4 vectorized loops instead of 3.
>       (f4): New function.
>       * gcc.target/i386/avx512f-pr88462-2.c: Rename to ...
>       * gcc.target/i386/avx512f-pr88464-2.c: ... this.  Fix up PR number
>       and #include.
>       (avx512f_test): Prepare arguments for f4 and check the results.
>       * gcc.target/i386/avx512f-pr88464-3.c: New test.
>       * gcc.target/i386/avx512f-pr88464-4.c: New test.

LGTM for the x86 part.

Thanks,
Uros.

> --- gcc/tree-vect-stmts.c.jj  2018-12-13 18:01:13.000000000 +0100
> +++ gcc/tree-vect-stmts.c     2018-12-14 17:10:42.079054458 +0100
> @@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i
>    if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
>      masktype = build_same_sized_truth_vector_type (srctype);
>
> +  tree mask_halftype = masktype;
>    tree perm_mask = NULL_TREE;
>    tree mask_perm_mask = NULL_TREE;
>    if (known_eq (nunits, gather_off_nunits))
> @@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i
>
>        ncopies *= 2;
>
> -      if (mask)
> +      if (mask && masktype == real_masktype)
>       {
>         for (int i = 0; i < count; ++i)
>           sel[i] = i | (count / 2);
>         indices.new_vector (sel, 2, count);
>         mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices);
>       }
> +      else if (mask)
> +     mask_halftype
> +       = build_same_sized_truth_vector_type (gs_info->offset_vectype);
>      }
>    else
>      gcc_unreachable ();
> @@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i
>           {
>             if (j == 0)
>               vec_mask = vect_get_vec_def_for_operand (mask, stmt_info);
> -           else
> +           else if (modifier != NARROW || (j & 1) == 0)
>               vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo,
>                                                          vec_mask);
>
> @@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i
>                 mask_op = var;
>               }
>           }
> +       if (modifier == NARROW && masktype != real_masktype)
> +         {
> +           var = vect_get_new_ssa_name (mask_halftype, vect_simple_var);
> +           gassign *new_stmt
> +             = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
> +                                                 : VEC_UNPACK_LO_EXPR,
> +                                    mask_op);
> +           vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> +           mask_op = var;
> +         }
>         src_op = mask_op;
>       }
>
>        tree mask_arg = mask_op;
>        if (masktype != real_masktype)
>       {
> -       tree utype;
> -       if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
> +       tree utype, optype = TREE_TYPE (mask_op);
> +       if (TYPE_MODE (real_masktype) == TYPE_MODE (optype))
>           utype = real_masktype;
>         else
> -         utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
> +         utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
>         var = vect_get_new_ssa_name (utype, vect_scalar_var);
>         mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
>         gassign *new_stmt
> @@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i
>             gcc_assert (TYPE_PRECISION (utype)
>                         <= TYPE_PRECISION (real_masktype));
>             var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
> -           new_stmt = gimple_build_assign (var, NOP_EXPR, utype);
> +           new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
>             vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>             mask_arg = var;
>           }
> @@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i
>           return false;
>       }
>        else if (memory_access_type != VMAT_LOAD_STORE_LANES
> -            && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl))
> +            && (memory_access_type != VMAT_GATHER_SCATTER
> +                || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
>       {
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i
>        tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
>        tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
> -      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
> +      tree ptr, var, scale, vec_mask;
> +      tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask =
> NULL_TREE;
> +      tree mask_halfvectype = mask_vectype;
>        edge pe = loop_preheader_edge (loop);
>        gimple_seq seq;
>        basic_block new_bb;
> @@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_i
>         perm_mask = vect_gen_perm_mask_checked (vectype, indices);
>         gcc_assert (perm_mask != NULL_TREE);
>         ncopies *= 2;
> +
> +       if (mask)
> +         mask_halfvectype
> +           = build_same_sized_truth_vector_type (gs_info.offset_vectype);
>       }
>        else
>       gcc_unreachable ();
> @@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i
>         gcc_assert (!new_bb);
>       }
>
> -      /* Currently we support only unconditional scatter stores,
> -      so mask should be all ones.  */
> -      mask = build_int_cst (masktype, -1);
> -      mask = vect_init_vector (stmt_info, mask, masktype, NULL);
> +      if (mask == NULL_TREE)
> +     {
> +       mask_arg = build_int_cst (masktype, -1);
> +       mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL);
> +     }
>
>        scale = build_int_cst (scaletype, gs_info.scale);
>
> @@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i
>       {
>         if (j == 0)
>           {
> -           src = vec_oprnd1
> -             = vect_get_vec_def_for_operand (op, stmt_info);
> -           op = vec_oprnd0
> -             = vect_get_vec_def_for_operand (gs_info.offset, stmt_info);
> +           src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info);
> +           op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset,
> +                                                           stmt_info);
> +           if (mask)
> +             mask_op = vec_mask = vect_get_vec_def_for_operand (mask,
> +                                                                stmt_info);
>           }
>         else if (modifier != NONE && (j & 1))
>           {
>             if (modifier == WIDEN)
>               {
> -               src = vec_oprnd1
> -                 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
> +               src
> +                 = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                                vec_oprnd1);
>                 op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
>                                            stmt_info, gsi);
> +               if (mask)
> +                 mask_op
> +                   = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                                vec_mask);
>               }
>             else if (modifier == NARROW)
>               {
>                 src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
>                                             stmt_info, gsi);
> -               op = vec_oprnd0
> -                 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
> +               op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                                 vec_oprnd0);
>               }
>             else
>               gcc_unreachable ();
>           }
>         else
>           {
> -           src = vec_oprnd1
> -             = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
> -           op = vec_oprnd0
> -             = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
> +           src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                              vec_oprnd1);
> +           op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                             vec_oprnd0);
> +           if (mask)
> +             mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
> +                                                                  vec_mask);
>           }
>
>         if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
> @@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i
>             op = var;
>           }
>
> +       if (mask)
> +         {
> +           tree utype;
> +           mask_arg = mask_op;
> +           if (modifier == NARROW)
> +             {
> +               var = vect_get_new_ssa_name (mask_halfvectype,
> +                                            vect_simple_var);
> +               gassign *new_stmt
> +                 = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
> +                                                     : VEC_UNPACK_LO_EXPR,
> +                                        mask_op);
> +               vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> +               mask_arg = var;
> +             }
> +           tree optype = TREE_TYPE (mask_arg);
> +           if (TYPE_MODE (masktype) == TYPE_MODE (optype))
> +             utype = masktype;
> +           else
> +             utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
> +           var = vect_get_new_ssa_name (utype, vect_scalar_var);
> +           mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
> +           gassign *new_stmt
> +             = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
> +           vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> +           mask_arg = var;
> +           if (!useless_type_conversion_p (masktype, utype))
> +             {
> +               gcc_assert (TYPE_PRECISION (utype)
> +                           <= TYPE_PRECISION (masktype));
> +               var = vect_get_new_ssa_name (masktype, vect_scalar_var);
> +               new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
> +               vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> +               mask_arg = var;
> +             }
> +         }
> +
>         gcall *new_stmt
> -         = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale);
> +         = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, 
> scale);
>         stmt_vec_info new_stmt_info
>           = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>
> @@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr
>    gimple *perm_stmt;
>
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> -  if (TREE_CODE (scalar_dest) == SSA_NAME)
> +  if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
>      perm_dest = vect_create_destination_var (scalar_dest, vectype);
>    else
>      perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
> --- gcc/config/i386/i386.c.jj 2018-12-13 13:45:11.000000000 +0100
> +++ gcc/config/i386/i386.c    2018-12-14 17:34:11.131135056 +0100
> @@ -37605,13 +37605,7 @@ rdseed_step:
>           op0 = copy_to_mode_reg (GET_MODE (op0), op0);
>         emit_insn (gen (half, op0));
>         op0 = half;
> -       if (GET_MODE (op3) != VOIDmode)
> -         {
> -           if (!nonimmediate_operand (op3, GET_MODE (op3)))
> -             op3 = copy_to_mode_reg (GET_MODE (op3), op3);
> -           emit_insn (gen (half, op3));
> -           op3 = half;
> -         }
> +       op3 = lowpart_subreg (QImode, op3, HImode);
>         break;
>       case IX86_BUILTIN_GATHER3ALTDIV8SF:
>       case IX86_BUILTIN_GATHER3ALTDIV8SI:
> @@ -37628,6 +37622,7 @@ rdseed_step:
>         op0 = half;
>         if (GET_MODE (op3) != VOIDmode)
>           {
> +           half = gen_reg_rtx (mode0);
>             if (!nonimmediate_operand (op3, GET_MODE (op3)))
>               op3 = copy_to_mode_reg (GET_MODE (op3), op3);
>             emit_insn (gen (half, op3));
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj      2018-12-14
> 16:34:55.361955571 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c 2018-12-14
> 18:07:25.694686784 +0100
> @@ -0,0 +1,45 @@
> +/* PR tree-optimization/88464 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte
> vectors" 4 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4
> "vect" } } */
> +
> +__attribute__((noipa)) void
> +f1 (double * __restrict__ a, const double * __restrict__ b, const int *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (a[i] > 10.0)
> +      a[i] = b[c[i]];
> +}
> +
> +__attribute__((noipa)) void
> +f2 (double * __restrict__ a, const double * __restrict__ b, const long *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (a[i] > 10.0)
> +      a[i] = b[c[i]];
> +}
> +
> +__attribute__((noipa)) void
> +f3 (float * __restrict__ a, const float * __restrict__ b, const int *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (a[i] > 10.0f)
> +      a[i] = b[c[i]];
> +}
> +
> +__attribute__((noipa)) void
> +f4 (float * __restrict__ a, const float * __restrict__ b, const long *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (a[i] > 10.0f)
> +      a[i] = b[c[i]];
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj      2018-12-14
> 16:35:00.681869029 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 2018-12-14
> 17:43:40.294876267 +0100
> @@ -0,0 +1,61 @@
> +/* PR tree-optimization/88464 */
> +/* { dg-do run { target { avx512f } } } */
> +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512" } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-pr88464-1.c"
> +
> +static void
> +avx512f_test (void)
> +{
> +  double a[1024], b[1024];
> +  float c[1024], f[1024];
> +  int d[1024];
> +  long e[1024];
> +  int i;
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      a[i] = (i % 3) != 0 ? 15.0 : -5.0;
> +      b[i] = 2 * i;
> +      d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
> +    }
> +  f1 (a, b, d, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
> +     abort ();
> +      a[i] = (i % 3) != 1 ? 15.0 : -5.0;
> +      b[i] = 3 * i;
> +      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
> +    }
> +  f2 (a, b, e, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
> +     abort ();
> +      c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
> +      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
> +      f[i] = 4 * i;
> +    }
> +  f3 (c, f, d, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
> +     abort ();
> +      c[i] = (i % 3) != 0 ? 15.0f : -5.0f;
> +      e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
> +      f[i] = 5 * i;
> +    }
> +  f4 (c, f, e, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
> +     abort ();
> +    }
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj      2018-12-14
> 18:01:19.297647800 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 2018-12-14
> 18:07:14.906862302 +0100
> @@ -0,0 +1,45 @@
> +/* PR tree-optimization/88464 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte
> vectors" 4 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4
> "vect" } } */
> +
> +__attribute__((noipa)) void
> +f1 (double * __restrict__ a, const double * __restrict__ b, const int *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (b[i] > -2.0)
> +      a[c[i]] = b[i];
> +}
> +
> +__attribute__((noipa)) void
> +f2 (double * __restrict__ a, const double * __restrict__ b, const long *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (b[i] > -2.0)
> +      a[c[i]] = b[i];
> +}
> +
> +__attribute__((noipa)) void
> +f3 (float * __restrict__ a, const float * __restrict__ b, const int *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (b[i] > -2.0f)
> +      a[c[i]] = b[i];
> +}
> +
> +__attribute__((noipa)) void
> +f4 (float * __restrict__ a, const float * __restrict__ b, const long *
> __restrict__ c, int n)
> +{
> +  int i;
> +#pragma GCC ivdep
> +  for (i = 0; i < n; ++i)
> +    if (b[i] > -2.0f)
> +      a[c[i]] = b[i];
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj      2018-12-14
> 18:03:03.100958998 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 2018-12-14
> 18:12:32.209699741 +0100
> @@ -0,0 +1,61 @@
> +/* PR tree-optimization/88464 */
> +/* { dg-do run { target { avx512f } } } */
> +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512" } */
> +
> +#include "avx512f-check.h"
> +
> +#include "avx512f-pr88464-3.c"
> +
> +static void
> +avx512f_test (void)
> +{
> +  double a[1024], b[1024];
> +  float c[1024], f[1024];
> +  int d[1024];
> +  long e[1024];
> +  int i;
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      a[i] = -5.0;
> +      b[i] = (i % 3) != 0 ? 2.0 * i : -5.0;
> +      d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
> +    }
> +  f1 (a, b, d, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
> +     abort ();
> +      a[i] = -5.0;
> +      b[i] = (i % 3) != 1 ? 3.0 * i : -5.0;
> +      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
> +    }
> +  f2 (a, b, e, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0))
> +     abort ();
> +      c[i] = -5.0f;
> +      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
> +      f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f;
> +    }
> +  f3 (c, f, d, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f))
> +     abort ();
> +      c[i] = -5.0f;
> +      e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
> +      f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f;
> +    }
> +  f4 (c, f, e, 1024);
> +  for (i = 0; i < 1024; i++)
> +    {
> +      asm volatile ("" : "+g" (i));
> +      if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
> +     abort ();
> +    }
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj      2018-12-13
> 18:01:13.913271190 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-11-06
> 14:56:08.851174491 +0100
> @@ -1,35 +0,0 @@
> -/* PR tree-optimization/88462 */
> -/* { dg-do compile } */
> -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512 -fdump-tree-vect-details" } */
> -/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte
> vectors" 3 "vect" } } */
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3
> "vect" } } */
> -
> -__attribute__((noipa)) void
> -f1 (double * __restrict__ a, const double * __restrict__ b, const int *
> __restrict__ c, int n)
> -{
> -  int i;
> -#pragma GCC ivdep
> -  for (i = 0; i < n; ++i)
> -    if (a[i] > 10.0)
> -      a[i] = b[c[i]];
> -}
> -
> -__attribute__((noipa)) void
> -f2 (double * __restrict__ a, const double * __restrict__ b, const long *
> __restrict__ c, int n)
> -{
> -  int i;
> -#pragma GCC ivdep
> -  for (i = 0; i < n; ++i)
> -    if (a[i] > 10.0)
> -      a[i] = b[c[i]];
> -}
> -
> -__attribute__((noipa)) void
> -f3 (float * __restrict__ a, const float * __restrict__ b, const int *
> __restrict__ c, int n)
> -{
> -  int i;
> -#pragma GCC ivdep
> -  for (i = 0; i < n; ++i)
> -    if (a[i] > 10.0f)
> -      a[i] = b[c[i]];
> -}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj      2018-12-13
> 18:01:13.914271174 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-11-06
> 14:56:08.851174491 +0100
> @@ -1,51 +0,0 @@
> -/* PR tree-optimization/88462 */
> -/* { dg-do run { target { avx512f } } } */
> -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512
> -mtune=skylake-avx512" } */
> -
> -#include "avx512f-check.h"
> -
> -#include "avx512f-pr88462-1.c"
> -
> -static void
> -avx512f_test (void)
> -{
> -  double a[1024], b[1024];
> -  float c[1024], f[1024];
> -  int d[1024];
> -  long e[1024];
> -  int i;
> -  for (i = 0; i < 1024; i++)
> -    {
> -      asm volatile ("" : "+g" (i));
> -      a[i] = (i % 3) != 0 ? 15.0 : -5.0;
> -      b[i] = 2 * i;
> -      d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
> -    }
> -  f1 (a, b, d, 1024);
> -  for (i = 0; i < 1024; i++)
> -    {
> -      asm volatile ("" : "+g" (i));
> -      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
> -     abort ();
> -      a[i] = (i % 3) != 1 ? 15.0 : -5.0;
> -      b[i] = 3 * i;
> -      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
> -    }
> -  f2 (a, b, e, 1024);
> -  for (i = 0; i < 1024; i++)
> -    {
> -      asm volatile ("" : "+g" (i));
> -      if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
> -     abort ();
> -      c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
> -      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
> -      f[i] = 4 * i;
> -    }
> -  f3 (c, f, d, 1024);
> -  for (i = 0; i < 1024; i++)
> -    {
> -      asm volatile ("" : "+g" (i));
> -      if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
> -     abort ();
> -    }
> -}
>
>       Jakub
>

Reply via email to