On Wed, May 9, 2018 at 12:34 PM, Richard Sandiford <richard.sandif...@linaro.org> wrote: > The SLP unrolling factor is calculated by finding the smallest > scalar type for each SLP statement and taking the number of required > lanes from the vector versions of those scalar types. E.g. for an > int32->int64 conversion, it's the vector of int32s rather than the > vector of int64s that determines the unroll factor. > > We rely on tree-vect-patterns.c to replace boolean operations like: > > bool a, b, c; > a = b & c; > > with integer operations of whatever the best size is in context. > E.g. if b and c are fed by comparisons of ints, a, b and c will become > the appropriate size for an int comparison. For most targets this means > that a, b and c will end up as int-sized themselves, but on targets like > SVE and AVX512 with packed vector booleans, they'll instead become a > small bitfield like :1, padded to a byte for memory purposes. > The SLP code would then take these scalar types and try to calculate > the vector type for them, causing the unroll factor to be much higher > than necessary. > > This patch makes SLP use the cached vector boolean type if that's > appropriate. Tested on aarch64-linux-gnu (with and without SVE), > aarch64_be-none-elf and x86_64-linux-gnu. OK to install? > > Richard > > > 2018-05-09 Richard Sandiford <richard.sandif...@linaro.org> > > gcc/ > * tree-vect-slp.c (get_vectype_for_smallest_scalar_type): New > function. > (vect_build_slp_tree_1): Use it when calculating the unroll factor. > > gcc/testsuite/ > * gcc.target/aarch64/sve/vcond_10.c: New test. > * gcc.target/aarch64/sve/vcond_10_run.c: Likewise. > * gcc.target/aarch64/sve/vcond_11.c: Likewise. > * gcc.target/aarch64/sve/vcond_11_run.c: Likewise. > > Index: gcc/tree-vect-slp.c > =================================================================== > --- gcc/tree-vect-slp.c 2018-05-08 09:42:03.526648115 +0100 > +++ gcc/tree-vect-slp.c 2018-05-09 11:30:41.061096063 +0100 > @@ -608,6 +608,41 @@ vect_record_max_nunits (vec_info *vinfo, > return true; > } > > +/* Return the vector type associated with the smallest scalar type in STMT. > */ > + > +static tree > +get_vectype_for_smallest_scalar_type (gimple *stmt) > +{ > + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + tree vectype = STMT_VINFO_VECTYPE (stmt_info); > + if (vectype != NULL_TREE > + && VECTOR_BOOLEAN_TYPE_P (vectype))
Hum. At this point you can't really rely on vector types being set... > + { > + /* The result of a vector boolean operation has the smallest scalar > + type unless the statement is extending an even narrower boolean. */ > + if (!gimple_assign_cast_p (stmt)) > + return vectype; > + > + tree src = gimple_assign_rhs1 (stmt); > + gimple *def_stmt; > + enum vect_def_type dt; > + tree src_vectype = NULL_TREE; > + if (vect_is_simple_use (src, stmt_info->vinfo, &def_stmt, &dt, > + &src_vectype) > + && src_vectype > + && VECTOR_BOOLEAN_TYPE_P (src_vectype)) > + { > + if (TYPE_PRECISION (TREE_TYPE (src_vectype)) > + < TYPE_PRECISION (TREE_TYPE (vectype))) > + return src_vectype; > + return vectype; > + } > + } > + HOST_WIDE_INT dummy; > + tree scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy); > + return get_vectype_for_scalar_type (scalar_type); > +} > + > /* Verify if the scalar stmts STMTS are isomorphic, require data > permutation or are of unsupported types of operation. Return > true if they are, otherwise return false and indicate in *MATCHES > @@ -636,12 +671,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, > enum tree_code first_cond_code = ERROR_MARK; > tree lhs; > bool need_same_oprnds = false; > - tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE; > + tree vectype = NULL_TREE, first_op1 = NULL_TREE; > optab optab; > int icode; > machine_mode optab_op2_mode; > machine_mode vec_mode; > - HOST_WIDE_INT dummy; > gimple *first_load = NULL, *prev_first_load = NULL; > > /* For every stmt in NODE find its def stmt/s. */ > @@ -685,15 +719,14 @@ vect_build_slp_tree_1 (vec_info *vinfo, > return false; > } > > - scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy); ... so I wonder how this goes wrong here. I suppose we want to ignore vector booleans for the purpose of max_nunits computation. So isn't a better fix to simply "ignore" those in vect_get_smallest_scalar_type instead? I see that for intermediate full-boolean operations like a = x[i] < 0; b = y[i] > 0; tem = a & b; we want to ignore 'tem = a & b' fully here for the purpose of vect_record_max_nunits. So if scalar_type is a bitfield type then skip it? Richard. > - vectype = get_vectype_for_scalar_type (scalar_type); > + vectype = get_vectype_for_smallest_scalar_type (stmt); > if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype, > max_nunits)) > { > /* Fatal mismatch. */ > matches[0] = false; > - return false; > - } > + return false; > + } > > if (gcall *call_stmt = dyn_cast <gcall *> (stmt)) > { > Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_10.c > =================================================================== > --- /dev/null 2018-04-20 16:19:46.369131350 +0100 > +++ gcc/testsuite/gcc.target/aarch64/sve/vcond_10.c 2018-05-09 > 11:30:41.057096221 +0100 > @@ -0,0 +1,36 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */ > + > +#include <stdint.h> > + > +#define DEF_LOOP(TYPE) \ > + void __attribute__ ((noinline, noclone)) \ > + test_##TYPE (TYPE *a, TYPE a1, TYPE a2, TYPE a3, TYPE a4, int n) \ > + { \ > + for (int i = 0; i < n; i += 2) \ > + { > \ > + a[i] = a[i] >= 1 && a[i] != 3 ? a1 : a2; \ > + a[i + 1] = a[i + 1] >= 1 && a[i + 1] != 3 ? a3 : a4; \ > + } > \ > + } > + > +#define FOR_EACH_TYPE(T) \ > + T (int8_t) \ > + T (uint8_t) \ > + T (int16_t) \ > + T (uint16_t) \ > + T (int32_t) \ > + T (uint32_t) \ > + T (int64_t) \ > + T (uint64_t) \ > + T (_Float16) \ > + T (float) \ > + T (double) > + > +FOR_EACH_TYPE (DEF_LOOP) > + > +/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ > +/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ > +/* { dg-final { scan-assembler-times {\tld1d\t} 3 } } */ > +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]} 11 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_10_run.c > =================================================================== > --- /dev/null 2018-04-20 16:19:46.369131350 +0100 > +++ gcc/testsuite/gcc.target/aarch64/sve/vcond_10_run.c 2018-05-09 > 11:30:41.057096221 +0100 > @@ -0,0 +1,24 @@ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */ > + > +#include "vcond_10.c" > + > +#define N 133 > + > +#define TEST_LOOP(TYPE) > \ > + { \ > + TYPE a[N]; \ > + for (int i = 0; i < N; ++i) > \ > + a[i] = i % 7; \ > + test_##TYPE (a, 10, 11, 12, 13, N); > \ > + for (int i = 0; i < N; ++i) > \ > + if (a[i] != 10 + (i & 1) * 2 + (i % 7 == 0 || i % 7 == 3)) \ > + __builtin_abort (); \ > + } > + > +int > +main (void) > +{ > + FOR_EACH_TYPE (TEST_LOOP); > + return 0; > +} > Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c > =================================================================== > --- /dev/null 2018-04-20 16:19:46.369131350 +0100 > +++ gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c 2018-05-09 > 11:30:41.057096221 +0100 > @@ -0,0 +1,36 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */ > + > +#include <stdint.h> > + > +#define DEF_LOOP(TYPE) \ > + void __attribute__ ((noinline, noclone)) \ > + test_##TYPE (int *restrict a, TYPE *restrict b, int a1, int a2, \ > + int a3, int a4, int n) \ > + { \ > + for (int i = 0; i < n; i += 2) \ > + { > \ > + a[i] = a[i] >= 1 & b[i] != 3 ? a1 : a2; \ > + a[i + 1] = a[i + 1] >= 1 & b[i + 1] != 3 ? a3 : a4; \ > + } > \ > + } > + > +#define FOR_EACH_TYPE(T) \ > + T (int8_t) \ > + T (uint8_t) \ > + T (int16_t) \ > + T (uint16_t) \ > + T (int64_t) \ > + T (uint64_t) \ > + T (double) > + > +FOR_EACH_TYPE (DEF_LOOP) > + > +/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */ > +/* 4 for each 8-bit function, 2 for each 16-bit function, 1 for > + each 64-bit function. */ > +/* { dg-final { scan-assembler-times {\tld1w\t} 15 } } */ > +/* 3 64-bit functions * 2 64-bit vectors per 32-bit vector. */ > +/* { dg-final { scan-assembler-times {\tld1d\t} 6 } } */ > +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]} 15 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c > =================================================================== > --- /dev/null 2018-04-20 16:19:46.369131350 +0100 > +++ gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c 2018-05-09 > 11:30:41.059096142 +0100 > @@ -0,0 +1,28 @@ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */ > + > +#include "vcond_11.c" > + > +#define N 133 > + > +#define TEST_LOOP(TYPE) > \ > + { \ > + int a[N]; \ > + TYPE b[N]; \ > + for (int i = 0; i < N; ++i) > \ > + { > \ > + a[i] = i % 5; \ > + b[i] = i % 7; \ > + } > \ > + test_##TYPE (a, b, 10, 11, 12, 13, N); \ > + for (int i = 0; i < N; ++i) > \ > + if (a[i] != 10 + (i & 1) * 2 + (i % 5 == 0 || i % 7 == 3)) \ > + __builtin_abort (); \ > + } > + > +int > +main (void) > +{ > + FOR_EACH_TYPE (TEST_LOOP); > + return 0; > +}