https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123367

            Bug ID: 123367
           Summary: [16 Regression] Regression of 523.xalancbmk_r after
                    vectorizing uncounted loops on Zen{4,5}
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vekumar at gcc dot gnu.org
  Target Milestone: ---

The test case below is reduced from the 523.xalancbmk_r benchmark (SPEC CPU 2017).

typedef unsigned short XMLCh;
unsigned int stringLen(const XMLCh* const src)
{
    const XMLCh* pszTmp = src + 1;

    // Uncounted loop: the trip count is unknown at loop entry.
    while (*pszTmp)
        ++pszTmp;

    return (unsigned int)(pszTmp - src);
}
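
For completeness, a minimal driver (mine, not from the benchmark) that can be
compiled together with the function above:

#include <cstdio>

int main()
{
    const XMLCh s[] = { 'a', 'b', 'c', 'd', 0 };
    // pszTmp starts scanning at src + 1, so the zero at index 4 gives 4.
    std::printf("%u\n", stringLen(s));  // prints 4
    return 0;
}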

Compiling with -O3 -march=znver5, GCC 15 generates a simple scalar loop:
--snip--

        cmpw    $0, 2(%rdi)     #, MEM[(const XMLCh *)src_5(D) + 2B]
        je      .L4 #,
        leaq    2(%rdi), %rax   #, pszTmp
.L3:
        addq    $2, %rax        #, pszTmp
        cmpw    $0, (%rax)      #, MEM[(const XMLCh *)pszTmp_9]
        jne     .L3       #
--snip--


GCC 16 vectorizes the (uncounted) loop, but in the process peels scalar
iterations for alignment (a prolog loop).

The benchmark now spends its time in this prolog loop (.L5 below), where every
iteration carries extra bookkeeping: an increment of the peel counter and a
compare against the peeled iteration count.
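
Conceptually, the transformed function has the shape below (a rough scalar
sketch with hypothetical names, not GCC's actual output; the real vector body
is the AVX-512 code in the assembly that follows):

#include <cstdint>

typedef unsigned short XMLCh;

unsigned int stringLen_peeled(const XMLCh* const src)
{
    const XMLCh* p = src + 1;
    if (*p == 0)                        // cmpw $0, 2(%rdi) / je .L8
        return 1;
    ++p;

    // Prolog (.L5): peel scalar iterations until p is 64-byte aligned.
    // Each iteration pays for the zero test *and* the peel-counter
    // bookkeeping (incq/cmpq/jne).
    std::uintptr_t niters = (0 - ((std::uintptr_t)p >> 1)) & 31;  // shrq/negq/andl $31
    for (std::uintptr_t i = 0; i < niters; ++i, ++p)
        if (*p == 0)
            return (unsigned int)(p - src);

    // Vector body (.L6): test 32 aligned elements at a time for a zero.
    for (;;) {
        for (int j = 0; j < 32; ++j)    // stands in for vpcmpeqw/kortestd
            if (p[j] == 0)
                return (unsigned int)(p + j - src);
        p += 32;
    }
}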

--snip--

        cmpw    $0, 2(%rdi)     #, MEM[(const XMLCh *)src_5(D) + 2B]
        je      .L8 #,
        leaq    4(%rdi), %rcx   #, vectp_src.4_18
# /app/example.cpp:4:          const XMLCh* pszTmp = src + 1;
        leaq    2(%rdi), %rax   #, tmp.6
        shrq    %rcx    # _15
        negq    %rcx    # _14
        andl    $31, %ecx       #, prolog_loop_niters.5
        je      .L3 #,
        xorl    %edx, %edx      # ivtmp.28
.L5:
        addq    $2, %rax        #, tmp.6
        cmpw    $0, (%rax)      #, MEM[(const XMLCh *)pszTmp_4]
        je      .L4 #,
        incq    %rdx    # ivtmp.28
        cmpq    %rdx, %rcx      # ivtmp.28, prolog_loop_niters.5
        jne     .L5       #,
.L3:
        leaq    4(%rdi,%rcx,2), %rsi    #, vectp_src.10
# /app/example.cpp:4:          const XMLCh* pszTmp = src + 1;
        xorl    %edx, %edx      # ivtmp.22
.L6:
        vpxor   %xmm0, %xmm0, %xmm0   # tmp134
        movq    %rdx, %rcx      # ivtmp.22, ivtmp.22
        vpcmpeqw        (%rsi,%rdx,2), %zmm0, %k0       # MEM <const vector(32) short unsigned int> [(const XMLCh *)vectp_src.10_57 + ivtmp.22_35 * 2], tmp134, vexit_inv_64
        addq    $32, %rdx       #, ivtmp.22
        kortestd        %k0, %k0        # vexit_inv_64
        je      .L6 #,
        leaq    (%rax,%rcx,2), %rax     #, tmp.6
--snip--
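
For reference, the vector body corresponds roughly to this intrinsics sketch
(my code, assuming AVX-512BW as enabled by -march=znver5; find_zero_from and
its contract are hypothetical, not GCC's):

#include <immintrin.h>

typedef unsigned short XMLCh;

// Roughly what loop .L6 does per iteration.  Requires p to be 64-byte
// aligned (guaranteed by the prolog): an aligned 64-byte load cannot
// cross a page boundary, so reading past the terminator cannot fault.
static unsigned find_zero_from(const XMLCh* p)
{
    const __m512i zero = _mm512_setzero_si512();         // vpxor
    unsigned off = 0;
    for (;;) {
        __m512i v = _mm512_load_si512((const void*)(p + off));
        __mmask32 k = _mm512_cmpeq_epi16_mask(v, zero);  // vpcmpeqw
        if (k != 0)                                      // kortestd / je
            return off + (unsigned)_tzcnt_u32((unsigned)k);
        off += 32;                                       // addq $32, ivtmp
    }
}

The alignment prolog is what makes these full-width loads safe to issue
speculatively, but it can run up to 31 scalar iterations with the extra
bookkeeping, so short strings may never reach the vector body at all; that
matches the time observed in .L5.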

I observed a slowdown of about 1.5% on a Zen5 machine. Vectorization does not
appear to be profitable here.
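
As a possible workaround (a sketch, not verified against trunk): the loop can
be kept scalar with the loop-level #pragma GCC novector (available since
GCC 14), or the whole file with -fno-tree-vectorize:

typedef unsigned short XMLCh;
unsigned int stringLen(const XMLCh* const src)
{
    const XMLCh* pszTmp = src + 1;

    #pragma GCC novector    // keep this uncounted loop scalar
    while (*pszTmp)
        ++pszTmp;

    return (unsigned int)(pszTmp - src);
}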
