https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111156
--- Comment #14 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #13)
> I didn't add STMT_VINFO_SLP_VECT_ONLY, I'm quite sure we can now do both SLP
> of masked loads and stores, so yes, STMT_VINFO_SLP_VECT_ONLY (when we formed
> a DR group of stmts we cannot combine without SLP as the masks are not equal)
> should be set for both loads and stores.
>
> The can_group_stmts_p checks as present seem correct here (but the dump
> should not say "Load" but maybe "Access")

I guess I'm wondering because of this usage:

          /* Check that the data-refs have same first location (except init)
             and they are both either store or load (not load and store,
             not masked loads or stores). */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
              || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
            break;

We don't exit there now for non-SLP.

>
> So what's the testcase comment#9 talks about?

You should be able to reproduce it with:

---

typedef __SIZE_TYPE__ size_t;
typedef signed char int8_t;
typedef unsigned short uint16_t ;

void __attribute__((noinline, noclone))
test_i8_i8_i16_2(int8_t *__restrict dest, int8_t *__restrict src,
                 uint16_t *__restrict cond, size_t n)
{
  for (size_t i = 0; i < n; ++i)
    {
      if (cond[i] < 8)
        dest[i * 2] = src[i];
      if (cond[i] > 2)
        dest[i * 2 + 1] = src[i];
    }
}

void __attribute__((noinline, noclone))
test_i8_i8_i16_2_1(volatile int8_t * dest, volatile int8_t * src,
                   volatile uint16_t * cond, size_t n)
{
#pragma GCC novector
  for (size_t i = 0; i < n; ++i)
    {
      if (cond[i] < 8)
        dest[i * 2] = src[i];
      if (cond[i] > 2)
        dest[i * 2 + 1] = src[i];
    }
}

#define size 16
int8_t srcarray[size];
uint16_t maskarray[size];
int8_t destarray[size*2];
int8_t destarray1[size*2];

int main()
{
#pragma GCC novector
  for(int i = 0; i < size; i++)
    {
      maskarray[i] = i == 10 ? 0 : (i == 5 ? 9 : (21111*i) & 0xff);
      srcarray[i] = i;
    }
#pragma GCC novector
  for(int i = 0; i < size*2; i++)
    {
      destarray[i] = i;
      destarray1[i] = i;
    }

  test_i8_i8_i16_2(destarray, srcarray, maskarray, size);
  test_i8_i8_i16_2_1(destarray1, srcarray, maskarray, size);

#pragma GCC novector
  for(int i = 0; i < size*2; i++)
    {
      if (destarray[i] != destarray1[i])
        __builtin_abort();
    }
}

---

since really only one of the functions needs to vectorize.