https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111156

--- Comment #14 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #13)
> I didn't add STMT_VINFO_SLP_VECT_ONLY, I'm quite sure we can now do both SLP
> of masked loads and stores, so yes, STMT_VINFO_SLP_VECT_ONLY (when we formed
> a DR group of stmts we cannot combine without SLP as the masks are not equal)
> should be set for both loads and stores.
> 
> The can_group_stmts_p checks as present seem correct here (but the dump
> should not say "Load" but maybe "Access")

I guess I'm wondering because of this usage:

          /* Check that the data-refs have same first location (except init)
             and they are both either store or load (not load and store,
             not masked loads or stores).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
              || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
            break;

We don't exit there now for non-SLP.

> 
> So what's the testcase comment#9 talks about?

You should be able to reproduce it with:

---
/* Local definitions of the integer types used below; the testcase as
   posted contains no #include lines.  __SIZE_TYPE__ is a compiler
   predefined macro.  */
typedef __SIZE_TYPE__ size_t;
typedef signed char int8_t;
typedef unsigned short uint16_t ;

/* For each i in [0, n): store src[i] to dest[i * 2] when cond[i] < 8, and
   store src[i] to dest[i * 2 + 1] when cond[i] > 2.  The two stores hit
   adjacent elements of dest but are guarded by different conditions.

   All pointers are __restrict-qualified and the loop carries no novector
   pragma.  The function is marked noinline/noclone.  */
void __attribute__((noinline, noclone))
test_i8_i8_i16_2(int8_t *__restrict dest, int8_t *__restrict src,
                 uint16_t *__restrict cond, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        /* Even slot: written only when cond[i] < 8.  */
        if (cond[i] < 8)
            dest[i * 2] = src[i];
        /* Odd slot: written only when cond[i] > 2.  */
        if (cond[i] > 2)
            dest[i * 2 + 1] = src[i];
    }
}
/* Same loop body as test_i8_i8_i16_2, but every pointer is
   volatile-qualified (and not __restrict) and the loop is marked with
   '#pragma GCC novector'.  main () compares this function's output
   against that of test_i8_i8_i16_2.  */
void __attribute__((noinline, noclone))
test_i8_i8_i16_2_1(volatile int8_t * dest, volatile int8_t * src,
                   volatile uint16_t * cond, size_t n) {
#pragma GCC novector
    for (size_t i = 0; i < n; ++i) {
        /* Even slot: written only when cond[i] < 8.  */
        if (cond[i] < 8)
            dest[i * 2] = src[i];
        /* Odd slot: written only when cond[i] > 2.  */
        if (cond[i] > 2)
            dest[i * 2 + 1] = src[i];
    }
}

/* Number of source/condition elements processed by each test function.  */
#define size 16

/* Inputs shared by both test functions.  */
int8_t srcarray[size];
uint16_t maskarray[size];
/* Outputs, one per test function.  Each is twice as long as the inputs
   because iteration i writes elements i * 2 and i * 2 + 1.  */
int8_t destarray[size*2];
int8_t destarray1[size*2];

/* Driver: fill the inputs, run both test functions on identical data and
   abort if their outputs differ in any element.  Every loop here is
   marked '#pragma GCC novector'.  */
int main()
{
#pragma GCC novector
  for(int i = 0; i < size; i++)
  {
    /* Condition value is 0 at i == 10 (satisfies only "< 8"), 9 at i == 5
       (satisfies only "> 2"), and the low 8 bits of 21111*i elsewhere.  */
    maskarray[i] = i == 10 ? 0 : (i == 5 ? 9 : (21111*i) & 0xff);
    srcarray[i] = i;
  }
#pragma GCC novector
  for(int i = 0; i < size*2; i++)
  {
    /* Give both destinations the same initial contents, so elements that
       neither function stores to still compare equal.  */
    destarray[i] = i;
    destarray1[i] = i;
  }
  test_i8_i8_i16_2(destarray, srcarray, maskarray, size);
  test_i8_i8_i16_2_1(destarray1, srcarray, maskarray, size);

#pragma GCC novector
  for(int i = 0; i < size*2; i++)
  {
    /* Any mismatch between the two outputs fails the test.  */
    if (destarray[i] != destarray1[i])
      __builtin_abort();
  }
}

---

Really only one of the two functions needs to be vectorized.

Reply via email to