https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
Bug ID: 112824 Summary: Stack spills and vector splitting with vector builtins Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: elrodc at gmail dot com Target Milestone: --- I am not sure which component to place this under, but I selected tree-optimization because I suspect this is some sort of alias analysis failure preventing the removal of stack allocations. Godbolt link, reproduces on GCC trunk and 13.2: https://godbolt.org/z/4TPx17Mbn Clang has similar problems in my actual test case, but they don't show up in this minimal example I made. That said, Clang isn't perfect here either: it fails to fuse fmadd + masked vmovapd, while GCC does succeed in fusing them. For reference, code behind the godbolt link is: #include <bit> #include <concepts> #include <cstddef> #include <cstdint> template <ptrdiff_t W, typename T> using Vec [[gnu::vector_size(W * sizeof(T))]] = T; // Omitted: 16 without AVX, 32 without AVX512F, // or for forward compatibility some AVX10 may also mean 32-only static constexpr ptrdiff_t VectorBytes = 64; template<typename T> static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T); template <typename T, ptrdiff_t N> struct Vector{ static constexpr ptrdiff_t L = N; T data[L]; static constexpr auto size()->ptrdiff_t{return N;} }; template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{ static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> : ptrdiff_t(std::bit_ceil(size_t(N))); static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0); using V = Vec<W,T>; V data[L]; static constexpr auto size()->ptrdiff_t{return N;} }; /// should be trivially copyable /// codegen is worse when passing by value, even though it seems like it should make /// aliasing simpler to analyze? 
template<typename T, ptrdiff_t N> [[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { Vector<T,N> z; for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n]; return z; } template<typename T, ptrdiff_t N> [[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> { Vector<T,N> z; for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n]; return z; } template<typename T, ptrdiff_t N> [[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> { Vector<T,N> z; for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n]; return z; } template<typename T, ptrdiff_t N> [[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> { Vector<T,N> z; for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n]; return z; } template <typename T, ptrdiff_t N> struct Dual { T value; Vector<T, N> partials; }; // Here we have a specialization for non-power-of-2 `N` template <typename T, ptrdiff_t N> requires(std::floating_point<T> && (std::popcount(size_t(N))>1)) struct Dual<T,N> { Vector<T, N+1> data; }; template<ptrdiff_t W, typename T> consteval auto firstoff(){ static_assert(std::same_as<T,double>, "type not implemented"); if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0; else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0; else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0; else static_assert(false, "vector width not implemented"); } template <typename T, ptrdiff_t N> [[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a, Dual<T, N> b) -> Dual<T, N> { if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ Dual<T,N> c; for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l) c.data.data[l] = a.data.data[l] + b.data.data[l]; return c; } else return {a.value + b.value, a.partials + b.partials}; } template <typename T, ptrdiff_t N> 
[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a, Dual<T, N> b) -> Dual<T, N> { if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){ using V = typename Vector<T,N>::V; V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0]; V x = va * b.data.data[0]; Dual<T,N> c; c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x; for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l) c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l]; return c; } else return {a.value * b.value, a.value * b.partials + b.value * a.partials}; } void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const Dual<Dual<double,7>,2>&b){ c = a*b; } void prod(Dual<Dual<double,8>,2> &c, const Dual<Dual<double,8>,2> &a, const Dual<Dual<double,8>,2>&b){ c = a*b; } GCC 13.2 asm, when compiling with -std=gnu++23 -march=skylake-avx512 -mprefer-vector-width=512 -O3 prod(Dual<Dual<double, 7l>, 2l>&, Dual<Dual<double, 7l>, 2l> const&, Dual<Dual<double, 7l>, 2l> const&): push rbp mov eax, -2 kmovb k1, eax mov rbp, rsp and rsp, -64 sub rsp, 264 vmovdqa ymm4, YMMWORD PTR [rsi+128] vmovapd zmm8, ZMMWORD PTR [rsi] vmovapd zmm9, ZMMWORD PTR [rdx] vmovdqa ymm6, YMMWORD PTR [rsi+64] vmovdqa YMMWORD PTR [rsp+8], ymm4 vmovdqa ymm4, YMMWORD PTR [rdx+96] vbroadcastsd zmm0, xmm8 vmovdqa ymm7, YMMWORD PTR [rsi+96] vbroadcastsd zmm1, xmm9 vmovdqa YMMWORD PTR [rsp-56], ymm6 vmovdqa ymm5, YMMWORD PTR [rdx+128] vmovdqa ymm6, YMMWORD PTR [rsi+160] vmovdqa YMMWORD PTR [rsp+168], ymm4 vxorpd xmm4, xmm4, xmm4 vaddpd zmm0, zmm0, zmm4 vaddpd zmm1, zmm1, zmm4 vmovdqa YMMWORD PTR [rsp-24], ymm7 vmovdqa ymm7, YMMWORD PTR [rdx+64] vmovapd zmm3, ZMMWORD PTR [rsp-56] vmovdqa YMMWORD PTR [rsp+40], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+160] vmovdqa YMMWORD PTR [rsp+200], ymm5 vmulpd zmm2, zmm0, zmm9 vmovdqa YMMWORD PTR [rsp+136], ymm7 vmulpd zmm5, zmm1, zmm3 vbroadcastsd zmm3, xmm3 vmovdqa YMMWORD PTR [rsp+232], ymm6 vaddpd zmm3, zmm3, zmm4 vmovapd zmm7, zmm2 vmovapd zmm2, ZMMWORD PTR 
[rsp+8] vfmadd231pd zmm7{k1}, zmm8, zmm1 vmovapd zmm6, zmm5 vmovapd zmm5, ZMMWORD PTR [rsp+136] vmulpd zmm1, zmm1, zmm2 vfmadd231pd zmm6{k1}, zmm9, zmm3 vbroadcastsd zmm2, xmm2 vmovapd zmm3, ZMMWORD PTR [rsp+200] vaddpd zmm2, zmm2, zmm4 vmovapd ZMMWORD PTR [rdi], zmm7 vfmadd231pd zmm1{k1}, zmm9, zmm2 vmulpd zmm2, zmm0, zmm5 vbroadcastsd zmm5, xmm5 vmulpd zmm0, zmm0, zmm3 vbroadcastsd zmm3, xmm3 vaddpd zmm5, zmm5, zmm4 vaddpd zmm3, zmm3, zmm4 vfmadd231pd zmm2{k1}, zmm8, zmm5 vfmadd231pd zmm0{k1}, zmm8, zmm3 vaddpd zmm2, zmm2, zmm6 vaddpd zmm0, zmm0, zmm1 vmovapd ZMMWORD PTR [rdi+64], zmm2 vmovapd ZMMWORD PTR [rdi+128], zmm0 vzeroupper leave ret prod(Dual<Dual<double, 8l>, 2l>&, Dual<Dual<double, 8l>, 2l> const&, Dual<Dual<double, 8l>, 2l> const&): push rbp mov rbp, rsp and rsp, -64 sub rsp, 648 vmovdqa ymm5, YMMWORD PTR [rsi+224] vmovdqa ymm3, YMMWORD PTR [rsi+352] vmovapd zmm0, ZMMWORD PTR [rdx+64] vmovdqa ymm2, YMMWORD PTR [rsi+320] vmovdqa YMMWORD PTR [rsp+104], ymm5 vmovdqa ymm5, YMMWORD PTR [rdx+224] vmovdqa ymm7, YMMWORD PTR [rsi+128] vmovdqa YMMWORD PTR [rsp+232], ymm3 vmovsd xmm3, QWORD PTR [rsi] vmovdqa ymm6, YMMWORD PTR [rsi+192] vmovdqa YMMWORD PTR [rsp+488], ymm5 vmovdqa ymm4, YMMWORD PTR [rdx+192] vmovapd zmm1, ZMMWORD PTR [rsi+64] vbroadcastsd zmm5, xmm3 vmovdqa YMMWORD PTR [rsp+200], ymm2 vmovdqa ymm2, YMMWORD PTR [rdx+320] vmulpd zmm8, zmm5, zmm0 vmovdqa YMMWORD PTR [rsp+8], ymm7 vmovdqa ymm7, YMMWORD PTR [rsi+256] vmovdqa YMMWORD PTR [rsp+72], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+128] vmovdqa YMMWORD PTR [rsp+584], ymm2 vmovsd xmm2, QWORD PTR [rdx] vmovdqa YMMWORD PTR [rsp+136], ymm7 vmovdqa ymm7, YMMWORD PTR [rdx+256] vmovdqa YMMWORD PTR [rsp+392], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+352] vmulsd xmm10, xmm3, xmm2 vmovdqa YMMWORD PTR [rsp+456], ymm4 vbroadcastsd zmm4, xmm2 vfmadd231pd zmm8, zmm4, zmm1 vmovdqa YMMWORD PTR [rsp+520], ymm7 vmovdqa YMMWORD PTR [rsp+616], ymm6 vmulpd zmm9, zmm4, ZMMWORD PTR [rsp+72] vmovsd xmm6, QWORD PTR [rsp+520] vmulpd 
zmm4, zmm4, ZMMWORD PTR [rsp+200] vmulpd zmm11, zmm5, ZMMWORD PTR [rsp+456] vmovsd QWORD PTR [rdi], xmm10 vmulpd zmm5, zmm5, ZMMWORD PTR [rsp+584] vmovapd ZMMWORD PTR [rdi+64], zmm8 vfmadd231pd zmm9, zmm0, QWORD PTR [rsp+8]{1to8} vfmadd231pd zmm4, zmm0, QWORD PTR [rsp+136]{1to8} vmovsd xmm0, QWORD PTR [rsp+392] vmulsd xmm7, xmm3, xmm0 vbroadcastsd zmm0, xmm0 vmulsd xmm3, xmm3, xmm6 vfmadd132pd zmm0, zmm11, zmm1 vbroadcastsd zmm6, xmm6 vfmadd132pd zmm1, zmm5, zmm6 vfmadd231sd xmm7, xmm2, QWORD PTR [rsp+8] vfmadd132sd xmm2, xmm3, QWORD PTR [rsp+136] vaddpd zmm0, zmm0, zmm9 vaddpd zmm1, zmm1, zmm4 vmovapd ZMMWORD PTR [rdi+192], zmm0 vmovsd QWORD PTR [rdi+128], xmm7 vmovsd QWORD PTR [rdi+256], xmm2 vmovapd ZMMWORD PTR [rdi+320], zmm1 vzeroupper leave ret Note all the stores to/loads from rsp, and the use of ymm registers.