https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94651

            Bug ID: 94651
           Summary: Missed peephole optimization: m >= POWER_OF_TWO || n
                    >= POWER_OF_TWO
           Product: gcc
           Version: 9.3.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: pascal_cuoq at hotmail dot com
  Target Milestone: ---

Consider the functions:

(Compiler Explorer link: https://gcc.godbolt.org/z/Uzd6nd )

#define POWER_OF_TWO (1UL << 20)

/*
 * Returns 1 if either m or n is at least POWER_OF_TWO, and 0 otherwise.
 * Both comparisons are unsigned. The condition is deliberately written as
 * two separate comparisons joined by ||, since that source form is what
 * the compilers are being asked to optimize.
 */
int check(unsigned long m, unsigned long n)
{
    return m >= POWER_OF_TWO || n >= POWER_OF_TWO;
}

void g(unsigned long, unsigned long);

/*
 * Calls g(m, 0) when either m or n is at least POWER_OF_TWO; otherwise
 * does nothing. n is used only in the condition: the second argument
 * passed to g is always 0.
 */
void test1(unsigned long m, unsigned long n)
{
    if (m >= POWER_OF_TWO || n >= POWER_OF_TWO) g(m, 0);
}

/*
 * Calls g(m, n) when either m or n is at least POWER_OF_TWO; otherwise
 * does nothing. Unlike test1, both m and n are forwarded to g unchanged,
 * so both values are still needed after the condition is evaluated.
 */
void test2(unsigned long m, unsigned long n)
{
    if (m >= POWER_OF_TWO || n >= POWER_OF_TWO) g(m, n);
}

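Because POWER_OF_TWO is a power of two and both operands are unsigned, the
condition m >= POWER_OF_TWO || n >= POWER_OF_TWO is equivalent to
(m|n) >= POWER_OF_TWO: each form is true exactly when some bit at position 20
or above is set in m or in n.

At least for the test1 and test2 functions, it seems that code implementing
(m|n) >= POWER_OF_TWO will be faster on average, for more input distributions,
than code with two comparisons, on pretty much every modern architecture. This
is what Clang 10 generates: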

check:                                  # @check
        orq     %rsi, %rdi
        xorl    %eax, %eax
        cmpq    $1048575, %rdi          # imm = 0xFFFFF
        seta    %al
        retq
test1:                                  # @test1
        orq     %rdi, %rsi
        cmpq    $1048576, %rsi          # imm = 0x100000
        jb      .LBB1_1
        xorl    %esi, %esi
        jmp     g                       # TAILCALL
.LBB1_1:
        retq
test2:                                  # @test2
        movq    %rsi, %rax
        orq     %rdi, %rax
        cmpq    $1048576, %rax          # imm = 0x100000
        jb      .LBB2_1
        jmp     g                       # TAILCALL
.LBB2_1:
        retq


GCC 9.3 performs the two comparisons one after the other. For the function
check on x86, this requires extra instructions afterwards to combine the two
results, although in the function test2 it saves one register-register move:

check:
        cmpq    $1048575, %rdi
        seta    %al
        cmpq    $1048575, %rsi
        seta    %dl
        orl     %edx, %eax
        movzbl  %al, %eax
        ret
test1:
        cmpq    $1048575, %rdi
        ja      .L6
        cmpq    $1048575, %rsi
        ja      .L6
        ret
.L6:
        xorl    %esi, %esi
        jmp     g
test2:
        cmpq    $1048575, %rdi
        ja      .L10
        cmpq    $1048575, %rsi
        ja      .L10
        ret
.L10:
        jmp     g

Reply via email to