On Wed, Jun 15, 2016 at 10:48 AM, Richard Sandiford <richard.sandif...@arm.com> wrote: > I recently relaxed the peeling-for-gaps conditions for LD3 but > kept them as-is for load-and-permute. I don't think the conditions > are needed for load-and-permute either though. No current load-and- > permute should load outside the group, so if there is no gap at the end, > the final vector element loaded will correspond to an element loaded > by the original scalar loop. > > The patch for PR68559 (a missed optimisation PR) increased the peeled > cases from "exact_log2 (groupsize) == -1" to "vf % group_size == 0", so > before that fix, we didn't peel for gaps if there was no gap at the end > of the group and if the group size was a power of 2. > > The only current non-power-of-2 load-and-permute size is 3, which > doesn't require loading more than 3 vectors. > > The testcase is based on gcc.dg/vect/pr49038.c. > > Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Ok. Thanks, Richard. > Thanks, > Richard > > > gcc/ > * tree-vect-stmts.c (vectorizable_load): Remove unnecessary > peeling-for-gaps condition. > > gcc/testsuite/ > * gcc.dg/vect/group-no-gaps-1.c: New test. > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c > +++ gcc/tree-vect-stmts.c > @@ -6356,13 +6356,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > gcc_assert (GROUP_GAP (stmt_info)); > } > > - /* If there is a gap in the end of the group or the group size cannot > - be made a multiple of the vector element count then we access excess > + /* If there is a gap in the end of the group then we access excess > elements in the last iteration and thus need to peel that off. */ > if (loop_vinfo > && ! STMT_VINFO_STRIDED_P (stmt_info) > - && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0 > - || (!slp && !load_lanes_p && vf % group_size != 0))) > + && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > Index: gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c > =================================================================== > --- /dev/null > +++ gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c > @@ -0,0 +1,108 @@ > +/* { dg-require-effective-target mmap } */ > + > +#include <sys/mman.h> > +#include <stdio.h> > + > +#define COUNT 320 > +#define MMAP_SIZE 0x20000 > +#define ADDRESS1 0x1122000000 > +#define ADDRESS2 (ADDRESS1 + MMAP_SIZE * 16) > +#define TYPE unsigned int > + > +#ifndef MAP_ANONYMOUS > +#define MAP_ANONYMOUS MAP_ANON > +#endif > + > +#define RHS0(B) b[B] > +#define RHS1(B) RHS0(B) + b[(B) + 1] > +#define RHS2(B) RHS1(B) + b[(B) + 2] > +#define RHS3(B) RHS2(B) + b[(B) + 3] > +#define RHS4(B) RHS3(B) + b[(B) + 4] > +#define RHS5(B) RHS4(B) + b[(B) + 5] > +#define RHS6(B) RHS5(B) + b[(B) + 6] > +#define RHS7(B) RHS6(B) + b[(B) + 7] > + > +#define LHS0(B) a[B] > 
+#define LHS1(B) LHS0(B) = a[(B) + 1] > +#define LHS2(B) LHS1(B) = a[(B) + 2] > +#define LHS3(B) LHS2(B) = a[(B) + 3] > +#define LHS4(B) LHS3(B) = a[(B) + 4] > +#define LHS5(B) LHS4(B) = a[(B) + 5] > +#define LHS6(B) LHS5(B) = a[(B) + 6] > +#define LHS7(B) LHS6(B) = a[(B) + 7] > + > +#define DEF_GROUP_SIZE(MULT, GAP, NO_GAP) \ > + void __attribute__((noinline, noclone)) \ > + gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b) \ > + { \ > + for (int i = 0; i < COUNT; i++) \ > + a[i] = RHS##GAP (i * MULT); \ > + } \ > + void __attribute__((noinline, noclone)) \ > + no_gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b) \ > + { \ > + for (int i = 0; i < COUNT; i++) \ > + a[i] = RHS##NO_GAP (i * MULT); \ > + } \ > + void __attribute__((noinline, noclone)) \ > + gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \ > + { \ > + for (int i = 0; i < COUNT; i++) \ > + LHS##GAP (i * MULT) = b[i]; \ > + } \ > + void __attribute__((noinline, noclone)) \ > + no_gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \ > + { \ > + for (int i = 0; i < COUNT; i++) \ > + LHS##NO_GAP (i * MULT) = b[i]; \ > + } > + > +#define USE_GROUP_SIZE(MULT) \ > + gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT + 1); \ > + no_gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT); \ > + gap_store_##MULT (end_x - COUNT * MULT + 1, end_y - COUNT); \ > + no_gap_store_##MULT (end_x - COUNT * MULT, end_y - COUNT) > + > +DEF_GROUP_SIZE (2, 0, 1) > +DEF_GROUP_SIZE (3, 1, 2) > +DEF_GROUP_SIZE (4, 2, 3) > +DEF_GROUP_SIZE (5, 3, 4) > +DEF_GROUP_SIZE (6, 4, 5) > +DEF_GROUP_SIZE (7, 5, 6) > +DEF_GROUP_SIZE (8, 6, 7) > + > +int > +main (void) > +{ > + void *x, *y; > + TYPE *end_x, *end_y; > + > + x = mmap ((void *) ADDRESS1, MMAP_SIZE, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (x == MAP_FAILED) > + { > + perror ("mmap"); > + return 1; > + } > + > + y = mmap ((void *) ADDRESS2, MMAP_SIZE, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + 
if (y == MAP_FAILED) > + { > + perror ("mmap"); > + return 1; > + } > + > + end_x = (TYPE *) ((char *) x + MMAP_SIZE); > + end_y = (TYPE *) ((char *) y + MMAP_SIZE); > + > + USE_GROUP_SIZE (2); > + USE_GROUP_SIZE (3); > + USE_GROUP_SIZE (4); > + USE_GROUP_SIZE (5); > + USE_GROUP_SIZE (6); > + USE_GROUP_SIZE (7); > + USE_GROUP_SIZE (8); > + > + return 0; > +}