https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87319
Bug ID: 87319 Summary: When vector is wrapped, expression is not optimized. Product: gcc Version: 8.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: bugzi...@poradnik-webmastera.com Target Milestone: --- I was playing with vector extensions and intrinsics, checking whether gcc would be able to optimize the vector expression a*c+b*c to (a+b)*c. It turned out that this works for intrinsics (both wrapped in a class and non-wrapped) and for vector extensions (non-wrapped only). When the built-in operators for vector extensions were used inside a wrapper class, the code was not optimized (test3). The code was compiled with "-O3 -mavx2 -std=c++11". [code] #include <stdint.h> #include <immintrin.h> typedef int32_t VInt __attribute((vector_size(32))); class V1 { VInt v; public: constexpr V1(const V1& v) : v(v.v) {} constexpr V1(const VInt& v) : v(v) {} constexpr V1 operator+(const V1& v2) const { return V1(v + v2.v); } constexpr V1 operator*(const V1& v2) const { return V1(v * v2.v); } constexpr operator VInt() const { return v; } }; class V2 { __m256i v; public: constexpr V2(const V2& v) : v(v.v) {} constexpr V2(const __m256i& v) : v(v) {} V2 operator+(const V2& v2) const { return V2(_mm256_add_epi32(v, v2.v)); } V2 operator*(const V2& v2) const { return V2(_mm256_mullo_epi32(v, v2.v)); } constexpr operator __m256i() const { return v; } }; void test1(const int* a, const int* b, const int* c, int* d) { const VInt va = *(VInt*)a; const VInt vb = *(VInt*)b; const VInt vc = *(VInt*)c; *(VInt*)d = va * vc + vb * vc; } void test2(const int* a, const int* b, const int* c, int* d) { const __m256i va = *(__m256i*)a; const __m256i vb = *(__m256i*)b; const __m256i vc = *(__m256i*)c; const __m256i vd =_mm256_add_epi32( _mm256_mullo_epi32(va, vc), _mm256_mullo_epi32(vb, vc) ); *(__m256i*)d = vd; } void test3(const int* a, const int* b, const int* c, int* d) { const V1 va = V1(*(VInt*)a); const V1 vb = V1(*(VInt*)b); const V1 vc = V1(*(VInt*)c); *(VInt*)d = va * vc +
vb * vc; } void test4(const int* a, const int* b, const int* c, int* d) { const V2 va(*(__m256i*)a); const V2 vb(*(__m256i*)b); const V2 vc(*(__m256i*)c); *(__m256i*)d = va * vc + vb * vc; } [/code] [out] test1(int const*, int const*, int const*, int*): vmovdqa ymm0, YMMWORD PTR [rdi] vpaddd ymm0, ymm0, YMMWORD PTR [rsi] vpmulld ymm0, ymm0, YMMWORD PTR [rdx] vmovdqa YMMWORD PTR [rcx], ymm0 vzeroupper ret test2(int const*, int const*, int const*, int*): vmovdqa ymm0, YMMWORD PTR [rdi] vpaddd ymm0, ymm0, YMMWORD PTR [rsi] vpmulld ymm0, ymm0, YMMWORD PTR [rdx] vmovdqa YMMWORD PTR [rcx], ymm0 vzeroupper ret test3(int const*, int const*, int const*, int*): vmovdqa ymm0, YMMWORD PTR [rdx] vpmulld ymm1, ymm0, YMMWORD PTR [rdi] vpmulld ymm0, ymm0, YMMWORD PTR [rsi] vpaddd ymm0, ymm1, ymm0 vmovdqa YMMWORD PTR [rcx], ymm0 vzeroupper ret test4(int const*, int const*, int const*, int*): vmovdqa ymm0, YMMWORD PTR [rdi] vpaddd ymm0, ymm0, YMMWORD PTR [rsi] vpmulld ymm0, ymm0, YMMWORD PTR [rdx] vmovdqa YMMWORD PTR [rcx], ymm0 vzeroupper ret [/out]