On Wed, Jun 15, 2016 at 10:48 AM, Richard Sandiford
<richard.sandif...@arm.com> wrote:
> I recently relaxed the peeling-for-gaps conditions for LD3 but
> kept them as-is for load-and-permute.  I don't think the conditions
> are needed for load-and-permute either though.  No current load-and-
> permute should load outside the group, so if there is no gap at the end,
> the final vector element loaded will correspond to an element loaded
> by the original scalar loop.
>
> The patch for PR68559 (a missed optimisation PR) increased the peeled
> cases from "exact_log2 (group_size) == -1" to "vf % group_size != 0", so
> before that fix, we didn't peel for gaps if there was no gap at the end
> of the group and if the group size was a power of 2.
>
> The only current non-power-of-2 load-and-permute size is 3, which
> doesn't require loading more than 3 vectors.
>
> The testcase is based on gcc.dg/vect/pr49038.c.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Ok.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>         * tree-vect-stmts.c (vectorizable_load): Remove unnecessary
>         peeling-for-gaps condition.
>
> gcc/testsuite/
>         * gcc.dg/vect/group-no-gaps-1.c: New test.
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -6356,13 +6356,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>           gcc_assert (GROUP_GAP (stmt_info));
>         }
>
> -      /* If there is a gap in the end of the group or the group size cannot
> -         be made a multiple of the vector element count then we access excess
> +      /* If there is a gap in the end of the group then we access excess
>          elements in the last iteration and thus need to peel that off.  */
>        if (loop_vinfo
>           && ! STMT_VINFO_STRIDED_P (stmt_info)
> -         && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
> -             || (!slp && !load_lanes_p && vf % group_size != 0)))
> +         && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> Index: gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> ===================================================================
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> @@ -0,0 +1,108 @@
> +/* { dg-require-effective-target mmap } */
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define COUNT 320
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS1 0x1122000000
> +#define ADDRESS2 (ADDRESS1 + MMAP_SIZE * 16)
> +#define TYPE unsigned int
> +
> +#ifndef MAP_ANONYMOUS
> +#define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +#define RHS0(B) b[B]
> +#define RHS1(B) RHS0(B) + b[(B) + 1]
> +#define RHS2(B) RHS1(B) + b[(B) + 2]
> +#define RHS3(B) RHS2(B) + b[(B) + 3]
> +#define RHS4(B) RHS3(B) + b[(B) + 4]
> +#define RHS5(B) RHS4(B) + b[(B) + 5]
> +#define RHS6(B) RHS5(B) + b[(B) + 6]
> +#define RHS7(B) RHS6(B) + b[(B) + 7]
> +
> +#define LHS0(B) a[B]
> +#define LHS1(B) LHS0(B) = a[(B) + 1]
> +#define LHS2(B) LHS1(B) = a[(B) + 2]
> +#define LHS3(B) LHS2(B) = a[(B) + 3]
> +#define LHS4(B) LHS3(B) = a[(B) + 4]
> +#define LHS5(B) LHS4(B) = a[(B) + 5]
> +#define LHS6(B) LHS5(B) = a[(B) + 6]
> +#define LHS7(B) LHS6(B) = a[(B) + 7]
> +
> +#define DEF_GROUP_SIZE(MULT, GAP, NO_GAP)                      \
> +  void __attribute__((noinline, noclone))                      \
> +  gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)     \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      a[i] = RHS##GAP (i * MULT);                              \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  no_gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)  \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      a[i] = RHS##NO_GAP (i * MULT);                           \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b)    \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      LHS##GAP (i * MULT) = b[i];                              \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  no_gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      LHS##NO_GAP (i * MULT) = b[i];                           \
> +  }
> +
> +#define USE_GROUP_SIZE(MULT)                                   \
> +  gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT + 1);   \
> +  no_gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT);    \
> +  gap_store_##MULT (end_x - COUNT * MULT + 1, end_y - COUNT);  \
> +  no_gap_store_##MULT (end_x - COUNT * MULT, end_y - COUNT)
> +
> +DEF_GROUP_SIZE (2, 0, 1)
> +DEF_GROUP_SIZE (3, 1, 2)
> +DEF_GROUP_SIZE (4, 2, 3)
> +DEF_GROUP_SIZE (5, 3, 4)
> +DEF_GROUP_SIZE (6, 4, 5)
> +DEF_GROUP_SIZE (7, 5, 6)
> +DEF_GROUP_SIZE (8, 6, 7)
> +
> +int
> +main (void)
> +{
> +  void *x, *y;
> +  TYPE *end_x, *end_y;
> +
> +  x = mmap ((void *) ADDRESS1, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (x == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  y = mmap ((void *) ADDRESS2, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (y == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  end_x = (TYPE *) ((char *) x + MMAP_SIZE);
> +  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
> +
> +  USE_GROUP_SIZE (2);
> +  USE_GROUP_SIZE (3);
> +  USE_GROUP_SIZE (4);
> +  USE_GROUP_SIZE (5);
> +  USE_GROUP_SIZE (6);
> +  USE_GROUP_SIZE (7);
> +  USE_GROUP_SIZE (8);
> +
> +  return 0;
> +}

Reply via email to