https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122611

            Bug ID: 122611
           Summary: Miscompilation: vectorization breaks `if` semantics
           Product: gcc
           Version: 15.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: nicula at nicula dot xyz
  Target Milestone: ---

Code:

    #include <cstdint>
    #include <cstdio>
    #include <new>

    using u8 = uint8_t;

    __attribute__((noinline)) bool find_nonzero(u8 *data, size_t len) {
        for (size_t i = 0; i + 3 < len; i += 4) {
            // Go DWORD by DWORD.
            u8 res = data[i] | data[i + 1] | data[i + 2] | data[i + 3];

            // When we find a non-zero DWORD, return immediately.
            if (res != 0) {
                return true;
            }
        }
        return false;
    }

    int main() {
        // Allocate DWORD-sized buffer with alignment of 16 bytes
        // needed for the XMMWORD load check (`test dil, 15`).
        auto buf = (u8 *)operator new(4, std::align_val_t{16});

        // Initialize with 1 so the first `find_nonzero()` iteration
        // returns `true`.
        for (size_t i = 0; i < 4; i++) {
            buf[i] = 1;
        }

        // Intentionally pass a length bigger than the buffer's size
        // to illustrate the erroneous vectorization.
        puts(find_nonzero(buf, 100) ? "found" : "not found");

        operator delete(buf, std::align_val_t{16});
    }

Assembly, compiled with `-O3 -march=znver1` (only `find_nonzero()` is
relevant):

    find_nonzero(unsigned char*, unsigned long):
            cmp     rsi, 3
            jbe     .L10
            lea     rax, [rsi-4]
            cmp     rax, 59
            jbe     .L3
            test    dil, 15
            jne     .L3
            shr     rax, 2
            vpcmpeqd        xmm4, xmm4, xmm4
            mov     r9d, 64
            vmovdqa xmm5, XMMWORD PTR .LC0[rip]
            lea     rcx, [rax+1]
            vmovq   xmm6, r9
            vmovdqa xmm3, XMMWORD PTR .LC1[rip]
            mov     rax, rdi
            mov     r8, rcx
            xor     edx, edx
            vpxor   xmm7, xmm7, xmm7
            vpunpcklqdq     xmm6, xmm6, xmm6
            shr     r8, 4
            vpsrlw  xmm4, xmm4, 8
            jmp     .L6
    .L4:
            inc     rdx
            vpaddq  xmm3, xmm3, xmm6
            vpaddq  xmm5, xmm5, xmm6
            add     rax, 64
            cmp     r8, rdx
            je      .L18
    .L6:
            vmovdqa xmm1, XMMWORD PTR [rax]
            vmovdqa xmm10, XMMWORD PTR [rax+16]
            vmovdqa xmm8, XMMWORD PTR [rax+32]
            vmovdqa xmm0, XMMWORD PTR [rax+48]
            vpsrlw  xmm9, xmm10, 8
            vpsrlw  xmm2, xmm1, 8
            vpand   xmm10, xmm4, xmm10
            vpackuswb       xmm2, xmm2, xmm9
            vpsrlw  xmm11, xmm0, 8
            vpsrlw  xmm9, xmm8, 8
            vpand   xmm0, xmm4, xmm0
            vpackuswb       xmm9, xmm9, xmm11
            vpand   xmm1, xmm4, xmm1
            vpand   xmm8, xmm4, xmm8
            vpackuswb       xmm1, xmm1, xmm10
            vpackuswb       xmm8, xmm8, xmm0
            vpand   xmm10, xmm4, xmm9
            vpand   xmm0, xmm4, xmm2
            vpsrlw  xmm9, xmm9, 8
            vpackuswb       xmm0, xmm0, xmm10
            vpsrlw  xmm2, xmm2, 8
            vpackuswb       xmm2, xmm2, xmm9
            vpor    xmm0, xmm0, xmm2
            vpand   xmm9, xmm4, xmm8
            vpand   xmm2, xmm4, xmm1
            vpsrlw  xmm8, xmm8, 8
            vpackuswb       xmm2, xmm2, xmm9
            vpsrlw  xmm1, xmm1, 8
            vpackuswb       xmm1, xmm1, xmm8
            vpor    xmm0, xmm0, xmm2
            vpor    xmm0, xmm0, xmm1
            vpcmpeqb        xmm0, xmm0, xmm7
            vpcmpeqb        xmm0, xmm0, xmm7
            vptest  xmm0, xmm0
            je      .L4
            vmovq   rdx, xmm5
            vmovq   rax, xmm3
    .L5:
            add     rax, 7
            jmp     .L7
    .L19:
            mov     rdx, rax
            cmp     rax, rsi
            jnb     .L10
            add     rax, 4
    .L7:
            movzx   edx, BYTE PTR [rdi+rdx]
            movzx   ecx, BYTE PTR [rdi-7+rax]
            or      dl, BYTE PTR [rdi-5+rax]
            or      cl, BYTE PTR [rdi-6+rax]
            or      dl, cl
            je      .L19
    .L13:
            mov     eax, 1
            ret
    .L10:
            xor     eax, eax
            ret
    .L18:
            test    cl, 15
            je      .L10
            and     rcx, -16
            lea     rax, [0+rcx*4]
            lea     rdx, [rax+3]
            jmp     .L5
    .L3:
            and     rax, -4
            lea     rcx, [rdi+4+rax]
            jmp     .L8
    .L20:
            add     rdi, 4
            cmp     rdi, rcx
            je      .L10
    .L8:
            movzx   eax, BYTE PTR [rdi+1]
            movzx   edx, BYTE PTR [rdi+3]
            or      al, BYTE PTR [rdi]
            or      dl, BYTE PTR [rdi+2]
            or      al, dl
            je      .L20
            jmp     .L13

Problem: under C/C++ semantics, it's not legal to 'delay' the evaluation of the
`if` in order to read more than one DWORD at a time, right? GCC is doing that
here. It wants to read XMMWORD-sized chunks although the `if` statement -- as
far as I understand -- imposes that no more than 4 bytes are read at a time,
*regardless* of the value of `len`. The `len` argument does not represent the
actual length of the buffer; it could be that when a non-zero DWORD is found,
indexing `data` past that index causes a segfault (even if that index is less
than `len`). So this vectorization breaks the language semantics.

Valgrind error when running the binary compiled with `-O3 -march=znver1 -ggdb`:

    ==485803== Invalid read of size 16
    ==485803==    at 0x40124C: find_nonzero(unsigned char*, unsigned long)
(f.cpp:10)
    ==485803==    by 0x4010A5: main (f.cpp:33)
    ==485803==  Address 0x4e15090 is 12 bytes after a block of size 4 alloc'd
    ==485803==    at 0x4848711: operator new(unsigned long, std::align_val_t)
(in
/nix/store/bx80d84y5vw7s2ba3n12d186k6hddyv0-valgrind-3.24.0/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
    ==485803==    by 0x40108F: main (f.cpp:23)

Godbolt: https://godbolt.org/z/9n51jWds4

Note: Clang does not read more than 4 bytes even with optimizations enabled.

Reply via email to