https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

            Bug ID: 108320
           Summary: Missing vector/array arithmetic optimization compared
                    to valarray
           Product: gcc
           Version: 12.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (compiled with -O3 -mavx2 -mfma):

#include <valarray>
#include <vector>
#include <array>

using namespace std;

// Element-wise a * b + c on std::valarray operands, using valarray's own
// operator* and operator+.
valarray<float> fma1(const valarray<float> &a, const valarray<float> &b, const
valarray<float> &c) {
    return a * b + c;
}

// Thin wrapper deriving from std::vector<T>, so that the element-wise
// operators defined in this test case have a distinct type to overload on.
template<class T>
struct vec : vector<T> {
    // Forwards to vector<T>(count): the vec starts with `count` elements.
    constexpr vec(size_t count) : vector<T>(count) {}
};

// Element-wise product of two vec<T>.
// The result is sized from a.size(); b is indexed over that same range with
// no size check.
template<class T>
constexpr vec<T> operator*(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
    return c;
}

// Element-wise sum of two vec<T>.
// The result is sized from a.size(); b is indexed over that same range with
// no size check.
template<class T>
constexpr vec<T> operator+(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
    return c;
}

// Same expression as fma1, but on vec<float>: a * b yields a temporary vec
// (one loop), which is then added to c by a second loop.
vec<float> fma2(const vec<float> &a, const vec<float> &b, const vec<float> &c)
{
    return a * b + c;
}

// Empty wrapper deriving from std::array<T, N>, so that the element-wise
// operators defined in this test case have a distinct type to overload on.
template<class T, size_t N>
struct arr : array<T, N> {
};

// Element-wise product of two arr<T, N>.
// c is default-initialized; the loop then assigns each of its N elements.
template<class T, size_t N>
constexpr arr<T, N> operator*(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
    return c;
}

// Element-wise sum of two arr<T, N>.
// c is default-initialized; the loop then assigns each of its N elements.
template<class T, size_t N>
constexpr arr<T, N> operator+(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
    return c;
}

// Array length used by fma3.
constexpr size_t N = 1024;

// Same expression as fma1, but on fixed-size arr<float, N>: a * b yields a
// temporary arr (one loop), which is then added to c by a second loop.
arr<float, N> fma3(const arr<float, N> &a, const arr<float, N> &b, const
arr<float, N> &c) {
    return a * b + c;
}

GCC optimizes only the valarray version of the function (fma1), which uses
vfmadd132ps:

...

.L4:
        vmovups ymm0, YMMWORD PTR [rdi+rax]
        vmovups ymm1, YMMWORD PTR [rcx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovups YMMWORD PTR [rdx+rax], ymm0
        add     rax, 32
        cmp     rax, r8
        jne     .L4
        mov     rax, r10
        and     rax, -8
        lea     r9, [0+rax*4]
        lea     r11, [rdx+r9]
        test    r10b, 7
        je      .L22
        vzeroupper
.L3:
        mov     r8, r10
        sub     r8, rax
        lea     r12, [r8-1]
        cmp     r12, 2
        jbe     .L6
        vmovups xmm0, XMMWORD PTR [rdi+rax*4]
        vmovups xmm2, XMMWORD PTR [rcx+rax*4]
        vfmadd132ps     xmm0, xmm2, XMMWORD PTR [rsi+rax*4]
        vmovups XMMWORD PTR [rdx+r9], xmm0
        test    r8b, 3
        je      .L1
        and     r8, -4
        add     rax, r8
        lea     r11, [r11+r8*4]
        lea     r9, [0+rax*4]

...

However, it does not optimize the vector or array versions of the function
(fma2 and fma3).

Note: for smaller values of N, fma3 is optimized, but for larger values such as
the 1024 used in the example it is not.

Compiler Explorer code: https://godbolt.org/z/v8dnx5aMo

Reply via email to