Issue 181483
Summary LLVM doesn't generate branchless code when using Bit OR or Bit AND on bools
Labels new issue
Assignees
Reporter AngelicosPhosphoros
    Examples on Godbolt (using Clang 21.1.0 and Rust 1.93.0):

- Clang example: https://godbolt.org/z/h75j8aThK
- Rust example: https://godbolt.org/z/crqnsWc3n

Compilation options for Clang: `-g -o output.s -mllvm --x86-asm-syntax=intel -fno-verbose-asm -S --gcc-toolchain=/opt/compiler-explorer/gcc-15.2.0 -fcolor-diagnostics -fno-crash-diagnostics -O3 example.cpp`

 ### The problem

When I try to optimize a hot loop with several simple boolean expressions (e.g., comparing a character to several options), I can optimize it by switching to bit operations on bools to avoid short-circuiting of `&&` and `||` operators. This should generate code with fewer branches.

However, LLVM replaces non-short-circuiting bit operations on booleans by short-circuiting logical ones, generating a lot of conditional jumps (which we were trying to avoid by using bit operations in the first place).

I think that it is the result of either `SimplifyCfg` or `X86-isel` passes.

### Example

```cpp
size_t res = 0;
    while (true) {
        const char current = *s;
        const auto found = 
            (current == '\0') |
            (current == '\n') |
            (current == '\r') |
            (current == needle)
 ;
        if (found)
        {
            return res;
        }
 ++res;
        ++s;
    }
```

Desired code:

```asm
$LN18:
 mov     QWORD PTR [rsp+8], rbx
        mov     QWORD PTR [rsp+16], rdi
        movzx   r8d, BYTE PTR [rcx]
        xor     r9d, r9d
 xor     r10d, r10d
        mov     edi, 1
        cmp     r8b, dl
 movzx   ebx, dl
        mov     r11, rcx
        sete    r9b
        xor eax, eax
        cmp     r8b, 13
        sete    al
        or      r9d, eax
        cmp     r8b, 10
        cmove   r9d, edi
        xor     eax, eax
        test    r8b, r8b
        sete    al
        or      r9d, eax
 jne     SHORT $LN14@find_offse ; A single conditional jump per loop iteration.
```

Actually generated code:

```asm
        xor     ecx, ecx
        mov     edx, 9217
.LBB0_1:
        movzx   r9d, byte ptr [rdi]
        cmp     r9b, sil
        je      .LBB0_2      ; Conditional short-circuiting branch
        movzx   r8d, r9b
        cmp     r9b, 13
 ja      .LBB0_6      ; Conditional short-circuiting branch
        bt edx, r8d
        jae     .LBB0_6    ; Conditional short-circuiting branch
        mov     rax, rcx
        cmp     r9b, 13
        jbe .LBB0_8   ; Conditional short-circuiting branch
        jmp .LBB0_1
.LBB0_6:
        inc     rcx
        inc     rdi
        cmp r9b, 13
        ja      .LBB0_1    ; Conditional short-circuiting branch
.LBB0_8:
        bt      edx, r8d
        jae     .LBB0_1   ; Conditional short-circuiting branch
        ret
.LBB0_2:
        mov rax, rcx
        ret
```

## Workaround

I have discovered that replacing bit OR by an integer addition works. Bit AND can be replaced by comparing sum of comparisons to the number of comparisons. Examples (same as at godbolt):

With OR:

```cpp
const auto found = 
        #if !USE_ADD
            // This generates extra branches as || operator.
 (current == '\0') |
            (current == '\n') |
 (current == '\r') |
            (current == needle)
        #else
 // This don't generate extra branches.        
            (current == '\0') +
            (current == '\n') +
            (current == '\r') +
            (current == needle)
        #endif
        ;
        if (found)
        {
            return res;
        }
```

With AND:

```cpp
        #if !USE_ADD
            // Branchy version
 (aa == bb) &
            (aa == cc) &
            (aa == dd) &
 (aa == ee)
        #else
            // Branchless version
 (aa == bb) +
            (aa == cc) +
            (aa == dd) +
 (aa == ee)
                == 4
        #endif
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to