https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125937

            Bug ID: 125937
           Summary: [17 Regression] A TSVC testcase slower by ~40% since
                    r17-223-ga22b31304e0a1a
           Product: gcc
           Version: 17.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: pheeck at gcc dot gnu.org
                CC: rguenth at gcc dot gnu.org
  Target Milestone: ---
              Host: x86_64-pc-linux-gnu
            Target: x86_64-pc-linux-gnu

Created attachment 64826
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=64826&action=edit
s4115 testcase with all the other necessary TSVC files

The TSVC benchmark suite testcases s4115 and s4116 slowed down.

Compile flags: *.c -Ofast -march=native -lm

CPU: Zen3, though I've seen this on other machines

I've bisected this to r17-223-ga22b31304e0a1a

commit a22b31304e0a1ad21751f882c02c32c167c78793
Author:     Richard Biener <[email protected]>
AuthorDate: Fri Apr 24 14:35:49 2026 +0200
Commit:     Richard Biener <[email protected]>
CommitDate: Thu Apr 30 08:13:03 2026 +0200

    flip --param ix86-vect-compare-costs default

Here are the testcases:

----
//int s4115(int* __restrict__ ip)
// Timed TSVC kernel s4115: a dot product of a[] with elements of b[] that are
// selected through the index array ip[] (taken from func_args->arg_info).
// The timestamps taken before and after the loops are stored in
// func_args->t1 and func_args->t2.
// Returns the sum computed by the last repetition of the outer loop.
real_t s4115(struct args_t * func_args)
{

//    indirect addressing
//    sparse dot product
//    gather is required

    // Index array used for the indirect loads from b[].
    int * __restrict__ ip = func_args->arg_info;

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    // NOTE(review): sum is only assigned inside the outer loop, so it would be
    // returned uninitialized if iterations <= 0 -- presumably never the case.
    real_t sum;
    for (int nl = 0; nl < iterations; nl++) {
        // Restart the reduction on every repetition.
        sum = 0.;
        for (int i = 0; i < LEN_1D; i++) {
            sum += a[i] * b[ip[i]];
        }
        // NOTE(review): dummy() is defined elsewhere; presumably an opaque call
        // that keeps the compiler from collapsing the repetitions -- confirm.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    gettimeofday(&func_args->t2, NULL);
    return sum;
}

// %4.11

//int s4116(int* __restrict__ ip, int j, int inc)
// Timed TSVC kernel s4116: a dot product of a[inc + i] with elements of row
// j-1 of aa[][] that are selected through the index array ip[].
// func_args->arg_info points to a struct carrying ip, j and inc.
// The timestamps taken before and after the loops are stored in
// func_args->t1 and func_args->t2.
// Returns the sum computed by the last repetition of the outer loop.
real_t s4116(struct args_t * func_args)
{

//    indirect addressing
//    more complicated sparse sdot
//    gather is required

    // Unpack the three kernel arguments from the anonymous argument struct.
    struct{int * __restrict__ a;int b;int c;} * x = func_args->arg_info;
    int * __restrict__ ip = x->a;
    int j = x->b;
    int inc = x->c;

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    // NOTE(review): sum is only assigned inside the outer loop, so it would be
    // returned uninitialized if iterations <= 0 -- presumably never the case.
    real_t sum;
    int off;
    for (int nl = 0; nl < 100*iterations; nl++) {
        // Restart the reduction on every repetition.
        sum = 0.;
        for (int i = 0; i < LEN_2D-1; i++) {
            // a[] is read contiguously starting at index inc; aa[j-1][] is
            // read indirectly through ip[i].
            off = inc + i;
            sum += a[off] * aa[j-1][ip[i]];
        }
        // NOTE(review): dummy() is defined elsewhere; presumably an opaque call
        // that keeps the compiler from collapsing the repetitions -- confirm.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    gettimeofday(&func_args->t2, NULL);
    return sum;
}
----

I've packaged s4115 together with all the other necessary TSVC sources into an
archive and attach it to this report.

Here are the differences I see in the disassembly of tsvc.c from the
attachment. The '-' lines correspond to r17-222, the '+' lines to r17-223:

--- before-culprit/tsvc.s       2026-06-22 15:03:03.674209255 +0200
+++ culprit/tsvc.s      2026-06-22 15:05:14.190621248 +0200
@@ -27,26 +27,24 @@
        call    gettimeofday
 .L2:
        xorl    %eax, %eax
-       vxorps  %xmm2, %xmm2, %xmm2
+       vxorps  %xmm0, %xmm0, %xmm0
+       .p2align 6
        .p2align 4
        .p2align 3
 .L3:
        leaq    (%r12,%rax), %rdx
-       addq    $16, %rax
-       movslq  (%rdx), %rsi
-       movslq  8(%rdx), %rdi
-       movslq  4(%rdx), %rcx
-       movslq  12(%rdx), %rdx
-       vmovss  b(,%rdi,4), %xmm1
-       vmovss  b(,%rsi,4), %xmm0
+       vmovq   a(%rax), %xmm2
+       addq    $8, %rax
+       movslq  (%rdx), %rcx
+       movslq  4(%rdx), %rdx
+       vmovss  b(,%rcx,4), %xmm1
        vinsertps       $0x10, b(,%rdx,4), %xmm1, %xmm1
-       vinsertps       $0x10, b(,%rcx,4), %xmm0, %xmm0
-       vmovlhps        %xmm1, %xmm0, %xmm0
-       vmulps  a-16(%rax), %xmm0, %xmm0
-       vaddps  %xmm0, %xmm2, %xmm2
+       vmulps  %xmm2, %xmm1, %xmm1
+       vaddps  %xmm0, %xmm1, %xmm1
+       vmovaps %xmm1, %xmm0
        cmpq    $128000, %rax
        jne     .L3
-       vmovaps %xmm2, (%rsp)
+       vmovaps %xmm1, (%rsp)
        movl    $c, %edx
        pushq   $cc
        .cfi_def_cfa_offset 56
@@ -68,7 +66,7 @@
        leaq    16(%rbp), %rdi
        xorl    %esi, %esi
        call    gettimeofday
-       vmovaps (%rsp), %xmm2
+       vmovaps (%rsp), %xmm1
        addq    $16, %rsp
        .cfi_def_cfa_offset 32
        popq    %rbx
@@ -77,10 +75,9 @@
        .cfi_def_cfa_offset 16
        popq    %r12
        .cfi_def_cfa_offset 8
-       vmovhlps        %xmm2, %xmm2, %xmm1
-       vaddps  %xmm2, %xmm1, %xmm1
-       vshufps $85, %xmm1, %xmm1, %xmm0
-       vaddps  %xmm1, %xmm0, %xmm0
+       vpsrlq  $32, %xmm1, %xmm0
+       vaddps  %xmm0, %xmm1, %xmm1
+       vmovaps %xmm1, %xmm0
        ret
        .cfi_endproc
 .LFE11:

Reply via email to