This patch adds a hook to control whether we try to avoid executing masked (predicated) stores when the mask is all false. For SVE we don't want to branch around such stores by default, since the patch assumes empty masked operations are not expensive there.
Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu and powerpc64le-linux-gnu. OK to install? Richard 2017-11-17 Richard Sandiford <richard.sandif...@linaro.org> Alan Hayward <alan.hayw...@arm.com> David Sherwood <david.sherw...@arm.com> gcc/ * target.def (empty_mask_is_expensive): New hook. * doc/tm.texi.in (TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE): New hook. * doc/tm.texi: Regenerate. * targhooks.h (default_empty_mask_is_expensive): Declare. * targhooks.c (default_empty_mask_is_expensive): New function. * tree-vectorizer.c (vectorize_loops): Only call optimize_mask_stores if the target says that empty masks are expensive. * config/aarch64/aarch64.c (aarch64_empty_mask_is_expensive): New function. (TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE): Redefine. Index: gcc/target.def =================================================================== --- gcc/target.def 2017-11-17 15:07:44.219630250 +0000 +++ gcc/target.def 2017-11-17 15:14:34.529678781 +0000 @@ -1907,6 +1907,17 @@ if such a mode exists.", (poly_uint64 nunits, poly_uint64 length), default_get_mask_mode) +/* Function to say whether a masked operation is expensive when the + mask is all zeros. */ +DEFHOOK +(empty_mask_is_expensive, + "This hook returns true if masked internal function @var{ifn} (really of\n\ +type @code{internal_fn}) should be considered expensive when the mask is\n\ +all zeros. GCC can then try to branch around the instruction instead.", + bool, + (unsigned ifn), + default_empty_mask_is_expensive) + /* Target builtin that implements vector gather operation. 
*/ DEFHOOK (builtin_gather, Index: gcc/doc/tm.texi.in =================================================================== --- gcc/doc/tm.texi.in 2017-11-17 15:07:44.217630250 +0000 +++ gcc/doc/tm.texi.in 2017-11-17 15:14:34.529678781 +0000 @@ -4095,6 +4095,8 @@ address; but often a machine-dependent @hook TARGET_VECTORIZE_GET_MASK_MODE +@hook TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE + @hook TARGET_VECTORIZE_INIT_COST @hook TARGET_VECTORIZE_ADD_STMT_COST Index: gcc/doc/tm.texi =================================================================== --- gcc/doc/tm.texi 2017-11-17 15:07:44.216630250 +0000 +++ gcc/doc/tm.texi 2017-11-17 15:14:34.528767752 +0000 @@ -5884,6 +5884,12 @@ is @var{length} bytes long and that cont if such a mode exists. @end deftypefn +@deftypefn {Target Hook} bool TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE (unsigned @var{ifn}) +This hook returns true if masked internal function @var{ifn} (really of +type @code{internal_fn}) should be considered expensive when the mask is +all zeros. GCC can then try to branch around the instruction instead. +@end deftypefn + @deftypefn {Target Hook} {void *} TARGET_VECTORIZE_INIT_COST (struct loop *@var{loop_info}) This hook should initialize target-specific data structures in preparation for modeling the costs of vectorizing a loop or basic block. The default allocates three unsigned integers for accumulating costs for the prologue, body, and epilogue of the loop or basic block. If @var{loop_info} is non-NULL, it identifies the loop being vectorized; otherwise a single block is being vectorized. 
@end deftypefn Index: gcc/targhooks.h =================================================================== --- gcc/targhooks.h 2017-11-17 15:07:43.533630267 +0000 +++ gcc/targhooks.h 2017-11-17 15:14:34.530589811 +0000 @@ -110,6 +110,7 @@ default_builtin_support_vector_misalignm extern machine_mode default_preferred_simd_mode (scalar_mode mode); extern void default_autovectorize_vector_sizes (vector_sizes *); extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64); +extern bool default_empty_mask_is_expensive (unsigned); extern void *default_init_cost (struct loop *); extern unsigned default_add_stmt_cost (void *, int, enum vect_cost_for_stmt, struct _stmt_vec_info *, int, Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c 2017-11-17 15:07:43.987630256 +0000 +++ gcc/targhooks.c 2017-11-17 15:14:34.530589811 +0000 @@ -1305,6 +1305,14 @@ default_get_mask_mode (poly_uint64 nunit return opt_machine_mode (); } +/* By default consider masked stores to be expensive. */ + +bool +default_empty_mask_is_expensive (unsigned ifn) +{ + return ifn == IFN_MASK_STORE; +} + /* By default, the cost model accumulates three separate costs (prologue, loop body, and epilogue) for a vectorized loop or block. So allocate an array of three unsigned ints, set it to zero, and return its address. 
*/ Index: gcc/tree-vectorizer.c =================================================================== --- gcc/tree-vectorizer.c 2017-11-17 15:07:38.288630400 +0000 +++ gcc/tree-vectorizer.c 2017-11-17 15:14:34.530589811 +0000 @@ -847,7 +847,8 @@ vectorize_loops (void) if (loop_vinfo) has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo); delete loop_vinfo; - if (has_mask_store) + if (has_mask_store + && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE)) optimize_mask_stores (loop); loop->aux = NULL; } Index: gcc/config/aarch64/aarch64.c =================================================================== --- gcc/config/aarch64/aarch64.c 2017-11-17 15:07:44.230630249 +0000 +++ gcc/config/aarch64/aarch64.c 2017-11-17 15:14:34.526945692 +0000 @@ -17016,6 +17016,16 @@ aarch64_gen_adjusted_ldpstp (rtx *operan return true; } +/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that + it isn't worth branching around empty masked ops (including masked + stores). */ + +static bool +aarch64_empty_mask_is_expensive (unsigned) +{ + return false; +} + /* Return 1 if pseudo register should be created and used to hold GOT address for PIC code. */ @@ -17660,6 +17670,9 @@ #define TARGET_VECTORIZE_VEC_PERM_CONST_ #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode +#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE +#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \ + aarch64_empty_mask_is_expensive #undef TARGET_INIT_LIBFUNCS #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs