https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123631

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Gemini comes up with

#include <stdio.h>
#include <stdint.h>

#define ITERATIONS 100000000

/*
 * Execute the x86 `rdtsc` instruction and return its result as one 64-bit
 * value.  The instruction's EAX output supplies the low 32 bits and its EDX
 * output the high 32 bits.  No serializing instruction is issued around the
 * read.
 */
static inline uint64_t rdtsc() {
    uint32_t eax_part, edx_part;

    __asm__ __volatile__ ("rdtsc" : "=a"(eax_part), "=d"(edx_part));

    /* Widen before shifting so the high half is not shifted out of 32 bits. */
    uint64_t tsc = edx_part;
    tsc <<= 32;
    tsc |= eax_part;
    return tsc;
}

/*
 * Micro-benchmark comparing two ways of getting the 32-bit constant 0xA
 * broadcast into xmm0:
 *
 *   1. GPR path:    mov imm -> eax, vmovd eax -> xmm0, vpbroadcastd xmm0.
 *   2. Memory path: vbroadcastss directly from a memory operand.
 *
 * Each variant runs ITERATIONS times between two rdtsc() reads, and the
 * difference of the two readings is printed as a total and as a
 * per-iteration average.
 *
 * In both loops the broadcast result is added into xmm1 so that xmm0 has a
 * consumer.  xmm1 is never initialized by this code; only the timing is
 * reported, not its value.
 *
 * Returns 0.
 */
int main(void) {
    // Constant for the memory broadcast
    static const uint32_t CST = 0x0000000A;

    uint64_t start, end;

    printf("Running benchmarks (%d iterations)...\n", ITERATIONS);

    // --- Test 1: GPR to XMM Path ---
    start = rdtsc();
    for (int i = 0; i < ITERATIONS; i++) {
        __asm__ __volatile__ (
            "mov $0xa, %%eax\n\t"
            "vmovd %%eax, %%xmm0\n\t"
            "vpbroadcastd %%xmm0, %%xmm0\n\t"
            "vpaddd %%xmm0, %%xmm1, %%xmm1\n\t" // Link xmm0 to the accumulator
            : : : "eax", "xmm0", "xmm1"
        );
    }
    end = rdtsc();
    // uint64_t is not guaranteed to be unsigned long, so print it through
    // unsigned long long with a matching %llu conversion.
    printf("GPR Path:    %llu cycles total (~%.2f cycles/iter)\n",
            (unsigned long long)(end - start),
            (double)(end - start) / ITERATIONS);

    // --- Test 2: Memory Path ---
    start = rdtsc();
    for (int i = 0; i < ITERATIONS; i++) {
        __asm__ __volatile__ (
            "vbroadcastss %0, %%xmm0\n\t"
            "vpaddd %%xmm0, %%xmm1, %%xmm1\n\t" // Link xmm0 to the accumulator
            : : "m"(CST) : "xmm0", "xmm1"
        );
    }
    end = rdtsc();
    printf("Memory Path: %llu cycles total (~%.2f cycles/iter)\n",
            (unsigned long long)(end - start),
            (double)(end - start) / ITERATIONS);

    return 0;
}

which on Zen2 shows

GPR Path:    126567740 cycles total (~1.27 cycles/iter)
Memory Path: 83913348 cycles total (~0.84 cycles/iter)

This obviously assumes the constant is already in the L1 cache.  I'm unsure
whether LRA would ever rematerialize vector constants rather than
spilling/reloading them, but I expect us to hoist any such initialization
out of loops, and the non-memory variant consumes an extra GPR.

Reply via email to