https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87319

            Bug ID: 87319
           Summary: When vector is wrapped, expression is not optimized.
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

I was playing with vector extensions and intrinsics, checking whether gcc would
be able to optimize the vector expression a*c+b*c to (a+b)*c. It turned out that
this works for intrinsics (both wrapped in a class and non-wrapped) and for
vector extensions (non-wrapped only). When the built-in operators for vector
extensions were used inside a wrapper class, the code was not optimized (test3).
The code was compiled with "-O3 -mavx2 -std=c++11".

[code]
#include <stdint.h>
#include <immintrin.h>

// GCC vector-extension type: 32 bytes wide, i.e. eight int32_t lanes.
typedef int32_t VInt __attribute((vector_size(32)));

// Thin wrapper around a VInt (GCC vector-extension type) whose operators
// forward to the built-in element-wise vector operators. Used by test3.
class V1
{
    VInt v;  // the wrapped vector value
public:
    // Copy constructor; the parameter shadows the member, so v(v.v) copies
    // the argument's vector into this object's member.
    constexpr V1(const V1& v) : v(v.v) {}
    // Wraps a raw vector (non-explicit, so VInt converts to V1 implicitly).
    constexpr V1(const VInt& v) : v(v) {}

    // Element-wise addition via the built-in vector operator+.
    constexpr V1 operator+(const V1& v2) const
    { return V1(v + v2.v); }

    // Element-wise multiplication via the built-in vector operator*.
    constexpr V1 operator*(const V1& v2) const
    { return V1(v * v2.v); }

    // Implicit conversion back to the raw vector type.
    constexpr operator VInt() const
    { return v; }
};

// Thin wrapper around an __m256i whose operators forward to AVX2 intrinsics
// operating on packed 32-bit integers. Used by test4.
class V2
{
    __m256i v;  // the wrapped 256-bit integer vector
public:
    // Copy constructor; the parameter shadows the member, so v(v.v) copies
    // the argument's vector into this object's member.
    constexpr V2(const V2& v) : v(v.v) {}
    // Wraps a raw __m256i (non-explicit, so __m256i converts to V2 implicitly).
    constexpr V2(const __m256i& v) : v(v) {}

    // Lane-wise 32-bit addition through the _mm256_add_epi32 intrinsic.
    V2 operator+(const V2& v2) const
    { return V2(_mm256_add_epi32(v, v2.v)); }

    // Lane-wise 32-bit multiplication (low 32 bits of each product) through
    // the _mm256_mullo_epi32 intrinsic.
    V2 operator*(const V2& v2) const
    { return V2(_mm256_mullo_epi32(v, v2.v)); }

    // Implicit conversion back to the raw __m256i.
    constexpr operator __m256i() const
    { return v; }
};

// Case 1: a*c + b*c written with the built-in vector operators directly on
// the raw VInt type (no wrapper class). Loads one VInt from each of a, b, c
// and stores the result through d.
// NOTE(review): the pointers are reinterpreted as VInt*, so they presumably
// must be suitably aligned for a 32-byte vector - confirm against callers.
void test1(const int* a, const int* b, const int* c, int* d)
{
    const VInt va = *(VInt*)a;
    const VInt vb = *(VInt*)b;
    const VInt vc = *(VInt*)c;
    *(VInt*)d = va * vc + vb * vc;
}

// Case 2: the same a*c + b*c expression written with AVX2 intrinsics directly
// on raw __m256i values (no wrapper class). Loads one __m256i from each of
// a, b, c and stores the result through d.
// NOTE(review): the pointers are reinterpreted as __m256i*, so they presumably
// must be suitably aligned for a 32-byte vector - confirm against callers.
void test2(const int* a, const int* b, const int* c, int* d)
{
    const __m256i va = *(__m256i*)a;
    const __m256i vb = *(__m256i*)b;
    const __m256i vc = *(__m256i*)c;
    // add( mullo(va, vc), mullo(vb, vc) )  ==  va*vc + vb*vc per 32-bit lane
    const __m256i vd =_mm256_add_epi32(
        _mm256_mullo_epi32(va, vc),
        _mm256_mullo_epi32(vb, vc)
    );
    *(__m256i*)d = vd;
}

// Case 3 (the reported problem case): a*c + b*c on VInt values wrapped in the
// V1 class, so the arithmetic goes through V1::operator* and V1::operator+.
// Loads one VInt from each of a, b, c and stores the result through d.
// NOTE(review): the pointers are reinterpreted as VInt*, so they presumably
// must be suitably aligned for a 32-byte vector - confirm against callers.
void test3(const int* a, const int* b, const int* c, int* d)
{
    const V1 va = V1(*(VInt*)a);
    const V1 vb = V1(*(VInt*)b);
    const V1 vc = V1(*(VInt*)c);
    // The V1 result is converted back to VInt by V1::operator VInt() for the store.
    *(VInt*)d = va * vc + vb * vc;
}

// Case 4: a*c + b*c on __m256i values wrapped in the V2 class, so the
// arithmetic goes through V2::operator* and V2::operator+ (intrinsics).
// Loads one __m256i from each of a, b, c and stores the result through d.
// NOTE(review): the pointers are reinterpreted as __m256i*, so they presumably
// must be suitably aligned for a 32-byte vector - confirm against callers.
void test4(const int* a, const int* b, const int* c, int* d)
{
    const V2 va(*(__m256i*)a);
    const V2 vb(*(__m256i*)b);
    const V2 vc(*(__m256i*)c);
    // The V2 result is converted back to __m256i by V2::operator __m256i() for the store.
    *(__m256i*)d = va * vc + vb * vc;
}
[/code]

[out]
test1(int const*, int const*, int const*, int*):
  vmovdqa ymm0, YMMWORD PTR [rdi]
  vpaddd ymm0, ymm0, YMMWORD PTR [rsi]
  vpmulld ymm0, ymm0, YMMWORD PTR [rdx]
  vmovdqa YMMWORD PTR [rcx], ymm0
  vzeroupper
  ret
test2(int const*, int const*, int const*, int*):
  vmovdqa ymm0, YMMWORD PTR [rdi]
  vpaddd ymm0, ymm0, YMMWORD PTR [rsi]
  vpmulld ymm0, ymm0, YMMWORD PTR [rdx]
  vmovdqa YMMWORD PTR [rcx], ymm0
  vzeroupper
  ret
test3(int const*, int const*, int const*, int*):
  vmovdqa ymm0, YMMWORD PTR [rdx]
  vpmulld ymm1, ymm0, YMMWORD PTR [rdi]
  vpmulld ymm0, ymm0, YMMWORD PTR [rsi]
  vpaddd ymm0, ymm1, ymm0
  vmovdqa YMMWORD PTR [rcx], ymm0
  vzeroupper
  ret
test4(int const*, int const*, int const*, int*):
  vmovdqa ymm0, YMMWORD PTR [rdi]
  vpaddd ymm0, ymm0, YMMWORD PTR [rsi]
  vpmulld ymm0, ymm0, YMMWORD PTR [rdx]
  vmovdqa YMMWORD PTR [rcx], ymm0
  vzeroupper
  ret
[/out]

Reply via email to