https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114576

            Bug ID: 114576
           Summary: [13 regression][config/i386] GCC 14/trunk emits
                    VEX-prefixed AES instruction without AVX enabled
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: thiago at kde dot org
  Target Milestone: ---

Re: https://bugreports.qt.io/browse/QTBUG-123965
Re: https://bugzilla.redhat.com/show_bug.cgi?id=2262640,
https://bugzilla.redhat.com/show_bug.cgi?id=2272758
Godbolt link: https://gcc.godbolt.org/z/6P9fMvoxW

Found while compiling Qt 6.6 or 6.7 with GCC 14 (current trunk). This is a
regression from GCC 13.

This function from qhash.cpp
<https://github.com/qt/qtbase/blob/v6.7.0/src/corelib/tools/qhash.cpp#L581-L588>:

// Quoted from qhash.cpp: derives the secondary hash state as one AES
// encryption round of state0 keyed with mseed2.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
    {
        // unlike the Go code, we don't have more per-process seed
        __m128i state1 = _mm_aesenc_si128(state0, mseed2);
        return state1;
    }
}

is apparently being compiled to:
.L2:
        leaq    (%rdi,%rsi), %rdx
        vaesenc %xmm1, %xmm0, %xmm1

Though there's no AVX enabled in this code (the original version in Qt has some
AVX/VAES and AVX512 code but the reduced example does not).

This function:
    // hash twice 16 bytes, running 2 scramble rounds of AES on itself
    static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const
__m128i *src1)
    {
        // unaligned loads of the two 16-byte input blocks
        __m128i data0 = _mm_loadu_si128(src0);
        __m128i data1 = _mm_loadu_si128(src1);
        // fold the input into each running state
        state0 = _mm_xor_si128(data0, state0);
        state1 = _mm_xor_si128(data1, state1);
        // two AES scramble rounds of each state on itself
        state0 = _mm_aesenc_si128(state0, state0);
        state1 = _mm_aesenc_si128(state1, state1);
        state0 = _mm_aesenc_si128(state0, state0);
        state1 = _mm_aesenc_si128(state1, state1);
    }

Is even emitting:
.L20:
        movdqu  (%rax), %xmm2
        pxor    %xmm0, %xmm2
        movdqu  -16(%rdx), %xmm0
        pxor    %xmm0, %xmm1
        vaesenc %xmm2, %xmm2, %xmm0
        aesenc  %xmm1, %xmm1
        aesenc  %xmm0, %xmm0
        aesenc  %xmm1, %xmm1

and it makes no sense to use AVX encoding for only one of four identical
instructions, all emitted from the same source function.

For reference, GCC 13 generates respectively:

.L2:
        movdqa  %xmm0, %xmm1
        leaq    (%rdi,%rsi), %rdx
        aesenc  %xmm2, %xmm1
and
.L20:
        movdqu  (%rax), %xmm2
        pxor    %xmm0, %xmm2
        movdqu  -16(%rdx), %xmm0
        aesenc  %xmm2, %xmm2
        pxor    %xmm0, %xmm1
        movdqa  %xmm2, %xmm0
        aesenc  %xmm1, %xmm1
        aesenc  %xmm2, %xmm0
        aesenc  %xmm1, %xmm1

You can tell that they are the same source block because the labels are the
same.

Sources:

// Self-contained reduction of Qt's qhash.cpp AES hashing code.
#include <immintrin.h>
#ifdef _MSC_VER
// MSVC spellings of the portability macros
#  define Q_ALWAYS_INLINE __forceinline
#  define QT_VECTORCALL __vectorcall
#  define QT_FUNCTION_TARGET(x)
#else
// GCC/Clang spellings: per-function ISA selection via __attribute__((target))
#  define Q_ALWAYS_INLINE inline __attribute__((always_inline))
#  define QT_VECTORCALL
#  define QT_FUNCTION_TARGET(x) __attribute__((target(QT_FUNCTION_TARGET_##x)))
#  define QT_FUNCTION_TARGET_AES        "sse4.2,aes"
//#  define qCpuHasFeature(x) __builtin_cpu_supports(QT_FUNCTION_TARGET_ ## x)
#endif
#define QT_COMPILER_SUPPORTS_HERE(x)    true
// 64-bit spellings of the size_t-sized intrinsics
#    define mm_set1_epz     _mm_set1_epi64x
#    define mm_cvtsz_si128  _mm_cvtsi64_si128
#    define mm_cvtsi128_sz  _mm_cvtsi128_si64
#    define mm256_set1_epz  _mm256_set1_epi64x
extern bool qCpuHasFeature(const char *) noexcept;
// the macro deliberately stringizes its argument and forwards to the
// extern function declared just above
#define qCpuHasFeature(x)     qCpuHasFeature(#x)

// Qt-style integer aliases used by the reproducer
using uchar = unsigned char;
using quintptr = unsigned long long;
using qint8 = signed char;

    // hash 16 bytes, running 3 scramble rounds of AES on itself (like label
"final1")
    static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash16bytes(__m128i &state0, __m128i data)
    {
        state0 = _mm_xor_si128(state0, data);
        state0 = _mm_aesenc_si128(state0, state0);
        state0 = _mm_aesenc_si128(state0, state0);
        state0 = _mm_aesenc_si128(state0, state0);
    }

    // Hash two 16-byte blocks at once: XOR each block into its running
    // state, then run two AES scramble rounds of each state on itself.
    static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0,
                  const __m128i *src1)
    {
        // unaligned loads of both input blocks
        const __m128i block0 = _mm_loadu_si128(src0);
        const __m128i block1 = _mm_loadu_si128(src1);

        // mix the input into the states
        state0 = _mm_xor_si128(block0, state0);
        state1 = _mm_xor_si128(block1, state1);

        // two self-scramble rounds per lane, interleaved
        state0 = _mm_aesenc_si128(state0, state0);
        state1 = _mm_aesenc_si128(state1, state1);
        state0 = _mm_aesenc_si128(state0, state0);
        state1 = _mm_aesenc_si128(state1, state1);
    }

    // Seed state for the AES hash; filled in by the constructor below.
    struct AESHashSeed
    {
        __m128i state0;   // scrambled per-call key, the initial hashing state
        __m128i mseed2;   // secondary seed vector, used to derive state1()
        AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES);
        __m128i state1() const QT_FUNCTION_TARGET(AES);
    };

// Build the initial hashing state from the two seeds.
// NOTE(review): the email line-wrap had spilled the tails of the two
// epi16-layout comments (`]` and `len, len ]`) onto uncommented lines,
// which does not compile; the comments are rejoined here.
Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, size_t seed2)
{
    __m128i mseed = mm_cvtsz_si128(seed);
    mseed2 = mm_set1_epz(seed2);

    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0, 0, 0 ]
    mseed = _mm_insert_epi16(mseed, short(seed), 4);
    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, len, len, len ]
    mseed = _mm_shufflehi_epi16(mseed, 0);

    // merge with the process-global seed
    __m128i key = _mm_xor_si128(mseed, mseed2);

    // scramble the key with one AES round on itself
    __m128i state0 = _mm_aesenc_si128(key, key);
    this->state0 = state0;
}

// Derive the secondary hashing state: one AES encryption round of
// state0 keyed with mseed2 (unlike the Go code, we don't have more
// per-process seed).
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
    return _mm_aesenc_si128(state0, mseed2);
}

// Epilogue for inputs of 16..32 bytes; also finishes off longer inputs.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src,
                  const __m128i *srcend)
{
    if (src + 1 < srcend) {
        // between 16 and 31 bytes remain: two (possibly overlapping) blocks
        hash2x16bytes(state0, state1, src, srcend - 1);
    } else if (src != srcend) {
        // between 1 and 16 bytes remain: one block overlapping the end
        hash16bytes(state0, _mm_loadu_si128(srcend - 1));
    }

    // combine both running states into the final result
    return mm_cvtsi128_sz(_mm_xor_si128(state0, state1));
}

// Hash fewer than 16 bytes without reading across a page boundary.
// NOTE(review): the email line-wrap had left the comment continuations
// `source` and `beginning` on uncommented lines, which does not compile;
// the comments (and the wrapped casts) are rejoined here.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_lt16(__m128i state0, const uchar *p, size_t len)
{
    if (len) {
        // We're going to load 16 bytes and mask zero the part we don't care
        // (the hash of a short string is different from the hash of a longer
        // including NULLs at the end because the length is in the key)
        // WARNING: this may produce valgrind warnings, but it's safe

        constexpr quintptr PageSize = 4096;
        __m128i data;

        if ((quintptr(p) & (PageSize / 2)) == 0) {
            // lower half of the page:
            // load all 16 bytes and mask off the bytes past the end of the source
            static const qint8 maskarray[] = {
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            };
            __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(maskarray + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
            data = _mm_and_si128(data, mask);
        } else {
            // upper half of the page:
            // load 16 bytes ending at the data end, then shuffle them to the beginning
            static const qint8 shufflecontrol[] = {
                1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            };
            __m128i control = _mm_loadu_si128(reinterpret_cast<const __m128i *>(shufflecontrol + 15 - len));
            data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + len) - 1);
            data = _mm_shuffle_epi8(data, control);
        }

        hash16bytes(state0, data);
    }
    return mm_cvtsi128_sz(state0);
}

// Inputs of 32 bytes or more: consume pairs of 16-byte blocks, then hand
// the remaining tail to the 16..32-byte epilogue.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src,
                const __m128i *srcend)
{
    // main loop: scramble two 16-byte blocks per iteration
    while (src + 2 < srcend) {
        hash2x16bytes(state0, state1, src, src + 1);
        src += 2;
    }

    return aeshash128_16to32(state0, state1, src, srcend);
}


// Top-level 128-bit AES hash: build the seed state, then dispatch on the
// input length to the appropriate specialized routine.
static size_t QT_FUNCTION_TARGET(AES)
aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    AESHashSeed state(seed, seed2);
    const auto src = reinterpret_cast<const __m128i *>(p);
    const auto srcend = reinterpret_cast<const __m128i *>(p + len);

    if (len < sizeof(__m128i))          // 0..15 bytes
        return aeshash128_lt16(state.state0, p, len);
    if (len <= sizeof(__m256i))         // 16..32 bytes
        return aeshash128_16to32(state.state0, state.state1(), src, srcend);
    return aeshash128_ge32(state.state0, state.state1(), src, srcend);
}

// Dispatcher for the hashing implementations; this reduced test case only
// has the 128-bit AES variant.
static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    return aeshash128(p, len, seed, seed2);
}

extern size_t qt_qhash_seed;

// Public entry point: hash `size` bytes at `p`. A nonzero seed mixes in
// the process-global seed as the secondary seed; otherwise the size
// itself is used.
size_t qHashBits(const void *p, size_t size, size_t seed) noexcept
{
    const size_t seed2 = seed ? qt_qhash_seed : size;
    return aeshash(reinterpret_cast<const uchar *>(p), size, seed, seed2);
}

Reply via email to