https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123367

            Bug ID: 123367
           Summary: [16 Regression] Regression of 523.xalancbmk_r after
                    vectorizing uncounted loops on Zen{4,5}
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vekumar at gcc dot gnu.org
  Target Milestone: ---

The test case below is reduced from the 523.xalancbmk_r benchmark (SPEC CPU 2017).

typedef unsigned short XMLCh;
unsigned int stringLen(const XMLCh* const src)
{
    const XMLCh* pszTmp = src + 1;

    // Uncounted loop: the trip count is unknown at loop entry.
    while (*pszTmp)
        ++pszTmp;

    return (unsigned int)(pszTmp - src);
}
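
For completeness, a minimal driver (mine, not from the benchmark) that can be
compiled together with the function above:

#include <cstdio>

int main()
{
    const XMLCh s[] = { 'a', 'b', 'c', 'd', 0 };
    // pszTmp starts scanning at src + 1, so the zero at index 4 gives 4.
    std::printf("%u\n", stringLen(s));  // prints 4
    return 0;
}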

Compiling with -O3 -march=znver5, GCC 15 generates a simple scalar loop:
--snip--

        cmpw    $0, 2(%rdi)     #, MEM[(const XMLCh *)src_5(D) + 2B]
        je      .L4 #,
        leaq    2(%rdi), %rax   #, pszTmp
.L3:
        addq    $2, %rax        #, pszTmp
        cmpw    $0, (%rax)      #, MEM[(const XMLCh *)pszTmp_9]
        jne     .L3       #
--snip--


GCC 16 vectorizes the (uncounted) loop, but in the process peels scalar
iterations for alignment (a prolog loop).

The benchmark now spends its time in this prolog loop (.L5 below), where every
iteration carries extra bookkeeping: an increment of the peel counter and a
compare against the peeled iteration count.
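
Conceptually, the transformed function has the shape below (a rough scalar
sketch with hypothetical names, not GCC's actual output; the real vector body
is the AVX-512 code in the assembly that follows):

#include <cstdint>

typedef unsigned short XMLCh;

unsigned int stringLen_peeled(const XMLCh* const src)
{
    const XMLCh* p = src + 1;
    if (*p == 0)                        // cmpw $0, 2(%rdi) / je .L8
        return 1;
    ++p;

    // Prolog (.L5): peel scalar iterations until p is 64-byte aligned.
    // Each iteration pays for the zero test *and* the peel-counter
    // bookkeeping (incq/cmpq/jne).
    std::uintptr_t niters = (0 - ((std::uintptr_t)p >> 1)) & 31;  // shrq/negq/andl $31
    for (std::uintptr_t i = 0; i < niters; ++i, ++p)
        if (*p == 0)
            return (unsigned int)(p - src);

    // Vector body (.L6): test 32 aligned elements at a time for a zero.
    for (;;) {
        for (int j = 0; j < 32; ++j)    // stands in for vpcmpeqw/kortestd
            if (p[j] == 0)
                return (unsigned int)(p + j - src);
        p += 32;
    }
}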

--snip--

        cmpw    $0, 2(%rdi)     #, MEM[(const XMLCh *)src_5(D) + 2B]
        je      .L8 #,
        leaq    4(%rdi), %rcx   #, vectp_src.4_18
# /app/example.cpp:4:          const XMLCh* pszTmp = src + 1;
        leaq    2(%rdi), %rax   #, tmp.6
        shrq    %rcx    # _15
        negq    %rcx    # _14
        andl    $31, %ecx       #, prolog_loop_niters.5
        je      .L3 #,
        xorl    %edx, %edx      # ivtmp.28
.L5:
        addq    $2, %rax        #, tmp.6
        cmpw    $0, (%rax)      #, MEM[(const XMLCh *)pszTmp_4]
        je      .L4 #,
        incq    %rdx    # ivtmp.28
        cmpq    %rdx, %rcx      # ivtmp.28, prolog_loop_niters.5
        jne     .L5       #,
.L3:
        leaq    4(%rdi,%rcx,2), %rsi    #, vectp_src.10
# /app/example.cpp:4:          const XMLCh* pszTmp = src + 1;
        xorl    %edx, %edx      # ivtmp.22
.L6:
        vpxor   %xmm0, %xmm0, %xmm0   # tmp134
        movq    %rdx, %rcx      # ivtmp.22, ivtmp.22
        vpcmpeqw        (%rsi,%rdx,2), %zmm0, %k0       # MEM <const vector(32) short unsigned int> [(const XMLCh *)vectp_src.10_57 + ivtmp.22_35 * 2], tmp134, vexit_inv_64
        addq    $32, %rdx       #, ivtmp.22
        kortestd        %k0, %k0        # vexit_inv_64
        je      .L6 #,
        leaq    (%rax,%rcx,2), %rax     #, tmp.6
--snip--
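
For reference, the vector body corresponds roughly to this intrinsics sketch
(my code, assuming AVX-512BW as enabled by -march=znver5; find_zero_from and
its contract are hypothetical, not GCC's):

#include <immintrin.h>

typedef unsigned short XMLCh;

// Roughly what loop .L6 does per iteration.  Requires p to be 64-byte
// aligned (guaranteed by the prolog): an aligned 64-byte load cannot
// cross a page boundary, so reading past the terminator cannot fault.
static unsigned find_zero_from(const XMLCh* p)
{
    const __m512i zero = _mm512_setzero_si512();         // vpxor
    unsigned off = 0;
    for (;;) {
        __m512i v = _mm512_load_si512((const void*)(p + off));
        __mmask32 k = _mm512_cmpeq_epi16_mask(v, zero);  // vpcmpeqw
        if (k != 0)                                      // kortestd / je
            return off + (unsigned)_tzcnt_u32((unsigned)k);
        off += 32;                                       // addq $32, ivtmp
    }
}

The alignment prolog is what makes these full-width loads safe to issue
speculatively, but it can run up to 31 scalar iterations with the extra
bookkeeping, so short strings may never reach the vector body at all; that
matches the time observed in .L5.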

I observed a slowdown of about 1.5% on a Zen5 machine. Vectorization does not
appear to be profitable here.
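
As a possible workaround (a sketch, not verified against trunk): the loop can
be kept scalar with the loop-level #pragma GCC novector (available since
GCC 14), or the whole file with -fno-tree-vectorize:

typedef unsigned short XMLCh;
unsigned int stringLen(const XMLCh* const src)
{
    const XMLCh* pszTmp = src + 1;

    #pragma GCC novector    // keep this uncounted loop scalar
    while (*pszTmp)
        ++pszTmp;

    return (unsigned int)(pszTmp - src);
}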
