https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114576
Bug ID: 114576 Summary: [13 regression][config/i386] GCC 14/trunk emits VEX-prefixed AES instruction without AVX enabled Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: thiago at kde dot org Target Milestone: --- Re: https://bugreports.qt.io/browse/QTBUG-123965 Re: https://bugzilla.redhat.com/show_bug.cgi?id=2262640, https://bugzilla.redhat.com/show_bug.cgi?id=2272758 Godbolt link: https://gcc.godbolt.org/z/6P9fMvoxW Found while compiling Qt 6.6 or 6.7 with GCC 14 (current trunk). This is a regression from GCC 13. This function from qhash.cpp <https://github.com/qt/qtbase/blob/v6.7.0/src/corelib/tools/qhash.cpp#L581-L588>: Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const { { // unlike the Go code, we don't have more per-process seed __m128i state1 = _mm_aesenc_si128(state0, mseed2); return state1; } } is apparently getting assembled to: .L2: leaq (%rdi,%rsi), %rdx vaesenc %xmm1, %xmm0, %xmm1 even though no AVX is enabled in this code (the original version in Qt has some AVX/VAES and AVX512 code, but the reduced example does not).
This function: // hash twice 16 bytes, running 2 scramble rounds of AES on itself static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const __m128i *src1) { __m128i data0 = _mm_loadu_si128(src0); __m128i data1 = _mm_loadu_si128(src1); state0 = _mm_xor_si128(data0, state0); state1 = _mm_xor_si128(data1, state1); state0 = _mm_aesenc_si128(state0, state0); state1 = _mm_aesenc_si128(state1, state1); state0 = _mm_aesenc_si128(state0, state0); state1 = _mm_aesenc_si128(state1, state1); } is even emitting: .L20: movdqu (%rax), %xmm2 pxor %xmm0, %xmm2 movdqu -16(%rdx), %xmm0 pxor %xmm0, %xmm1 vaesenc %xmm2, %xmm2, %xmm0 aesenc %xmm1, %xmm1 aesenc %xmm0, %xmm0 aesenc %xmm1, %xmm1 and it makes no sense to use AVX for only one of the four AES instructions, all of which come from the same source function. For reference, GCC 13 generates, respectively: .L2: movdqa %xmm0, %xmm1 leaq (%rdi,%rsi), %rdx aesenc %xmm2, %xmm1 and .L20: movdqu (%rax), %xmm2 pxor %xmm0, %xmm2 movdqu -16(%rdx), %xmm0 aesenc %xmm2, %xmm2 pxor %xmm0, %xmm1 movdqa %xmm2, %xmm0 aesenc %xmm1, %xmm1 aesenc %xmm2, %xmm0 aesenc %xmm1, %xmm1 You can tell that they are the same source block because the labels are the same.
Sources:
// Reduced test case. The code below is intentionally kept token-for-token as
// reported; only line breaks, indentation and explanatory comments were added.
#include <immintrin.h>

// Compiler-specific attribute macros. On non-MSVC compilers,
// QT_FUNCTION_TARGET(AES) expands to __attribute__((target("sse4.2,aes"))),
// so the annotated functions are compiled with SSE4.2 and AES enabled.
#ifdef _MSC_VER
# define Q_ALWAYS_INLINE __forceinline
# define QT_VECTORCALL __vectorcall
# define QT_FUNCTION_TARGET(x)
#else
# define Q_ALWAYS_INLINE inline __attribute__((always_inline))
# define QT_VECTORCALL
# define QT_FUNCTION_TARGET(x) __attribute__((target(QT_FUNCTION_TARGET_##x)))
# define QT_FUNCTION_TARGET_AES "sse4.2,aes"
//# define qCpuHasFeature(x) __builtin_cpu_supports(QT_FUNCTION_TARGET_ ## x)
#endif
#define QT_COMPILER_SUPPORTS_HERE(x) true

// Aliases for the size_t-sized intrinsics; here they are hard-wired to the
// 64-bit variants.
# define mm_set1_epz _mm_set1_epi64x
# define mm_cvtsz_si128 _mm_cvtsi64_si128
# define mm_cvtsi128_sz _mm_cvtsi128_si64
# define mm256_set1_epz _mm256_set1_epi64x

// Declared and wrapped in a stringifying macro, but not called anywhere in
// this excerpt.
extern bool qCpuHasFeature(const char *) noexcept;
#define qCpuHasFeature(x) qCpuHasFeature(#x)

using uchar = unsigned char;
using quintptr = unsigned long long;
using qint8 = signed char;

// hash 16 bytes, running 3 scramble rounds of AES on itself (like label "final1")
// state0 is updated in place: it is XORed with data and then passed through
// three _mm_aesenc_si128 rounds that use the state as its own round key.
static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL
hash16bytes(__m128i &state0, __m128i data)
{
    state0 = _mm_xor_si128(state0, data);
    state0 = _mm_aesenc_si128(state0, state0);
    state0 = _mm_aesenc_si128(state0, state0);
    state0 = _mm_aesenc_si128(state0, state0);
}

// hash twice 16 bytes, running 2 scramble rounds of AES on itself
// Loads one unaligned 16-byte block from each of src0 and src1, XORs them into
// state0 and state1 respectively, then runs two AES rounds on each state.
// Both states are updated in place.
static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const __m128i *src1)
{
    __m128i data0 = _mm_loadu_si128(src0);
    __m128i data1 = _mm_loadu_si128(src1);
    state0 = _mm_xor_si128(data0, state0);
    state1 = _mm_xor_si128(data1, state1);
    state0 = _mm_aesenc_si128(state0, state0);
    state1 = _mm_aesenc_si128(state1, state1);
    state0 = _mm_aesenc_si128(state0, state0);
    state1 = _mm_aesenc_si128(state1, state1);
}

// Holds the two 128-bit values derived from (seed, seed2): state0 is the
// scrambled key computed by the constructor, mseed2 is seed2 broadcast to both
// 64-bit lanes. state1() derives the second hashing state from them.
struct AESHashSeed
{
    __m128i state0;
    __m128i mseed2;
    AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES);
    __m128i state1() const QT_FUNCTION_TARGET(AES);
};

// Builds state0 from the two seeds.
// NOTE(review): the lane-layout comments below mention "len", but this reduced
// example inserts short(seed) into lane 4 -- the comments appear to be carried
// over from the unreduced code; confirm against the original source.
Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, size_t seed2)
{
    __m128i mseed = mm_cvtsz_si128(seed);
    mseed2 = mm_set1_epz(seed2);

    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0, 0, 0 ]
    mseed = _mm_insert_epi16(mseed, short(seed), 4);
    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, len, len, len ]
    mseed = _mm_shufflehi_epi16(mseed, 0);

    // merge with the process-global seed
    __m128i key = _mm_xor_si128(mseed, mseed2);

    // scramble the key
    __m128i state0 = _mm_aesenc_si128(key, key);
    this->state0 = state0;
}

// Returns the second state: one AES round over state0 keyed with mseed2.
// This is the function whose single aesenc the report shows being emitted
// with a VEX prefix.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
    {
        // unlike the Go code, we don't have more per-process seed
        __m128i state1 = _mm_aesenc_si128(state0, mseed2);
        return state1;
    }
}

// Handles the tail of the input delimited by [src, srcend) and produces the
// final hash value: the low 64 bits of state0 XOR state1.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend)
{
    {
        if (src + 1 < srcend) {
            // epilogue: between 16 and 31 bytes
            hash2x16bytes(state0, state1, src, srcend - 1);
        } else if (src != srcend) {
            // epilogue: between 1 and 16 bytes, overlap with the end
            __m128i data = _mm_loadu_si128(srcend - 1);
            hash16bytes(state0, data);
        }

        // combine results:
        state0 = _mm_xor_si128(state0, state1);
    }

    return mm_cvtsi128_sz(state0);
}

// Hashes an input shorter than 16 bytes (the only caller in this file passes
// len < sizeof(__m128i)). For len == 0 no data is read and the low 64 bits of
// the unmodified state0 are returned.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_lt16(__m128i state0, const uchar *p, size_t len)
{
    if (len) {
        // We're going to load 16 bytes and mask zero the part we don't care
        // (the hash of a short string is different from the hash of a longer
        // including NULLs at the end because the length is in the key)
        // WARNING: this may produce valgrind warnings, but it's safe

        constexpr quintptr PageSize = 4096;
        __m128i data;

        if ((quintptr(p) & (PageSize / 2)) == 0) {
            // lower half of the page:
            // load all 16 bytes and mask off the bytes past the end of the source
            // (15 bytes of -1 followed by 15 bytes of 0; reading 16 bytes at
            // offset 15 - len yields exactly len leading all-ones bytes)
            static const qint8 maskarray[] = {
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            };
            __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(maskarray + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
            data = _mm_and_si128(data, mask);
        } else {
            // upper half of the page:
            // load 16 bytes ending at the data end, then shuffle them to the beginning
            // (a -1 control byte makes _mm_shuffle_epi8 write zero to that lane)
            static const qint8 shufflecontrol[] = {
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            };
            __m128i control = _mm_loadu_si128(reinterpret_cast<const __m128i *>(shufflecontrol + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + len) - 1);
            data = _mm_shuffle_epi8(data, control);
        }

        hash16bytes(state0, data);
    }
    return mm_cvtsi128_sz(state0);
}

// Consumes the input two 16-byte blocks per iteration while more than two
// blocks remain, then hands the remainder to aeshash128_16to32.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend)
{
    // main loop: scramble two 16-byte blocks
    for ( ; src + 2 < srcend; src += 2)
        hash2x16bytes(state0, state1, src, src + 1);

    return aeshash128_16to32(state0, state1, src, srcend);
}

// Entry point of the 128-bit AES hash: builds the seed state and dispatches on
// len (< sizeof(__m128i), <= sizeof(__m256i), or longer).
static size_t QT_FUNCTION_TARGET(AES)
aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    AESHashSeed state(seed, seed2);
    auto src = reinterpret_cast<const __m128i *>(p);
    const auto srcend = reinterpret_cast<const __m128i *>(p + len);

    if (len < sizeof(__m128i))
        return aeshash128_lt16(state.state0, p, len);

    if (len <= sizeof(__m256i))
        return aeshash128_16to32(state.state0, state.state1(), src, srcend);

    return aeshash128_ge32(state.state0, state.state1(), src, srcend);
}

// Thin forwarder to aeshash128. Note that it carries no target attribute
// itself, unlike the functions it calls.
static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    return aeshash128(p, len, seed, seed2);
}

// Defined elsewhere; not visible in this test case.
extern size_t qt_qhash_seed;

// Externally visible entry point: hashes `size` bytes at `p`. seed2 is `size`
// when seed is zero and qt_qhash_seed otherwise.
size_t qHashBits(const void *p, size_t size, size_t seed) noexcept
{
    size_t seed2 = size;
    if (seed)
        seed2 = qt_qhash_seed;

    return aeshash(reinterpret_cast<const uchar *>(p), size, seed, seed2);
}