https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122611
Bug ID: 122611
Summary: Miscompilation: vectorization breaks `if` semantics
Product: gcc
Version: 15.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: nicula at nicula dot xyz
Target Milestone: ---
Code:
#include <cstdint>
#include <cstdio>
#include <new>
using u8 = uint8_t;
__attribute__((noinline)) bool find_nonzero(u8 *data, size_t len) {
for (size_t i = 0; i + 3 < len; i += 4) {
// Go DWORD by DWORD.
u8 res = data[i] | data[i + 1] | data[i + 2] | data[i + 3];
// When we find a non-zero DWORD, return immediately.
if (res != 0) {
return true;
}
}
return false;
}
int main() {
// Allocate DWORD-sized buffer with alignment of 16 bytes
// needed for the XMMWORD load check (`test dil, 15`).
auto buf = (u8 *)operator new(4, std::align_val_t{16});
// Initialize with 1 so the first `find_nonzero()` iteration
// returns `true`.
for (size_t i = 0; i < 4; i++) {
buf[i] = 1;
}
// Intentionally pass a length bigger than the buffer's size
// to illustrate the erroneous vectorization.
puts(find_nonzero(buf, 100) ? "found" : "not found");
operator delete(buf, std::align_val_t{16});
}
Assembly, compiled with `-O3 -march=znver1` (only `find_nonzero()` is
relevant):
find_nonzero(unsigned char*, unsigned long):
cmp rsi, 3
jbe .L10
lea rax, [rsi-4]
cmp rax, 59
jbe .L3
test dil, 15
jne .L3
shr rax, 2
vpcmpeqd xmm4, xmm4, xmm4
mov r9d, 64
vmovdqa xmm5, XMMWORD PTR .LC0[rip]
lea rcx, [rax+1]
vmovq xmm6, r9
vmovdqa xmm3, XMMWORD PTR .LC1[rip]
mov rax, rdi
mov r8, rcx
xor edx, edx
vpxor xmm7, xmm7, xmm7
vpunpcklqdq xmm6, xmm6, xmm6
shr r8, 4
vpsrlw xmm4, xmm4, 8
jmp .L6
.L4:
inc rdx
vpaddq xmm3, xmm3, xmm6
vpaddq xmm5, xmm5, xmm6
add rax, 64
cmp r8, rdx
je .L18
.L6:
vmovdqa xmm1, XMMWORD PTR [rax]
vmovdqa xmm10, XMMWORD PTR [rax+16]
vmovdqa xmm8, XMMWORD PTR [rax+32]
vmovdqa xmm0, XMMWORD PTR [rax+48]
vpsrlw xmm9, xmm10, 8
vpsrlw xmm2, xmm1, 8
vpand xmm10, xmm4, xmm10
vpackuswb xmm2, xmm2, xmm9
vpsrlw xmm11, xmm0, 8
vpsrlw xmm9, xmm8, 8
vpand xmm0, xmm4, xmm0
vpackuswb xmm9, xmm9, xmm11
vpand xmm1, xmm4, xmm1
vpand xmm8, xmm4, xmm8
vpackuswb xmm1, xmm1, xmm10
vpackuswb xmm8, xmm8, xmm0
vpand xmm10, xmm4, xmm9
vpand xmm0, xmm4, xmm2
vpsrlw xmm9, xmm9, 8
vpackuswb xmm0, xmm0, xmm10
vpsrlw xmm2, xmm2, 8
vpackuswb xmm2, xmm2, xmm9
vpor xmm0, xmm0, xmm2
vpand xmm9, xmm4, xmm8
vpand xmm2, xmm4, xmm1
vpsrlw xmm8, xmm8, 8
vpackuswb xmm2, xmm2, xmm9
vpsrlw xmm1, xmm1, 8
vpackuswb xmm1, xmm1, xmm8
vpor xmm0, xmm0, xmm2
vpor xmm0, xmm0, xmm1
vpcmpeqb xmm0, xmm0, xmm7
vpcmpeqb xmm0, xmm0, xmm7
vptest xmm0, xmm0
je .L4
vmovq rdx, xmm5
vmovq rax, xmm3
.L5:
add rax, 7
jmp .L7
.L19:
mov rdx, rax
cmp rax, rsi
jnb .L10
add rax, 4
.L7:
movzx edx, BYTE PTR [rdi+rdx]
movzx ecx, BYTE PTR [rdi-7+rax]
or dl, BYTE PTR [rdi-5+rax]
or cl, BYTE PTR [rdi-6+rax]
or dl, cl
je .L19
.L13:
mov eax, 1
ret
.L10:
xor eax, eax
ret
.L18:
test cl, 15
je .L10
and rcx, -16
lea rax, [0+rcx*4]
lea rdx, [rax+3]
jmp .L5
.L3:
and rax, -4
lea rcx, [rdi+4+rax]
jmp .L8
.L20:
add rdi, 4
cmp rdi, rcx
je .L10
.L8:
movzx eax, BYTE PTR [rdi+1]
movzx edx, BYTE PTR [rdi+3]
or al, BYTE PTR [rdi]
or dl, BYTE PTR [rdi+2]
or al, dl
je .L20
jmp .L13
Problem: under C/C++ semantics, it's not legal to 'delay' the evaluation of the
`if` in order to read more than one DWORD at a time, right? GCC is doing that
here. It wants to read XMMWORD-sized chunks although the `if` statement -- as
far as I understand -- imposes that no more than 4 bytes are read at a time,
*regardless* of the value of `len`. The `len` argument does not represent the
actual length of the buffer; it could be that when a non-zero DWORD is found,
indexing `data` past that index causes a segfault (even if that index is less
than `len`). So this vectorization breaks the language semantics.
Valgrind error when running the binary compiled with `-O3 -march=znver1 -ggdb`:
==485803== Invalid read of size 16
==485803== at 0x40124C: find_nonzero(unsigned char*, unsigned long)
(f.cpp:10)
==485803== by 0x4010A5: main (f.cpp:33)
==485803== Address 0x4e15090 is 12 bytes after a block of size 4 alloc'd
==485803== at 0x4848711: operator new(unsigned long, std::align_val_t)
(in
/nix/store/bx80d84y5vw7s2ba3n12d186k6hddyv0-valgrind-3.24.0/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==485803== by 0x40108F: main (f.cpp:23)
Godbolt: https://godbolt.org/z/9n51jWds4
Note: Clang does not read more than 4 bytes even with optimizations enabled.