https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125392
Bug ID: 125392
Summary: inefficient SVE vectorization when loop contains hazards
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Keywords: aarch64-sve
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tnfchris at gcc dot gnu.org
Blocks: 53947
Target Milestone: ---
Target: aarch64*
Consider the following loop:
#include <stdint.h>
void f (uint8_t* restrict a, uint8_t *b, uint8_t* c, int n) {
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}
When compiled with -O3 -march=armv9-a, GCC generates a runtime hazard check
between b and c (only a is restrict-qualified, so b and c may overlap):
f:
cmp w3, 0
ble .L1
mov x4, 0
whilewr p15.b, x1, x2
b.nlast .L3
whilelo p7.b, wzr, w3
.L4:
ld1b z31.b, p7/z, [x0, x4]
ld1b z30.b, p7/z, [x1, x4]
add z30.b, z30.b, z31.b
st1b z30.b, p7, [x2, x4]
incb x4
whilelo p7.b, w4, w3
b.any .L4
.L1:
ret
.L3:
sxtw x3, w3
.L5:
ldrb w5, [x1, x4]
ldrb w6, [x0, x4]
add w5, w5, w6
strb w5, [x2, x4]
add x4, x4, 1
cmp x3, x4
bne .L5
ret
Notice that this uses the SVE `whilewr` instruction to check for
write-after-read hazards. Note that there is a corresponding instruction
(`whilerw`) for read-after-write hazards as well.
The current codegen jumps to the scalar loop if there is any hazard at all.
While valid, this is inefficient: these instructions return a predicate that
says how many elements can safely be processed in each iteration of the loop
body.
That is, the current code:
f:
cmp w3, 0
ble .L1
mov x4, 0
whilewr p15.b, x1, x2
b.nlast .L3
whilelo p7.b, wzr, w3
.L4:
ld1b z31.b, p7/z, [x0, x4]
ld1b z30.b, p7/z, [x1, x4]
add z30.b, z30.b, z31.b
st1b z30.b, p7, [x2, x4]
incb x4
whilelo p7.b, w4, w3
b.any .L4
.L1:
ret
.L3:
sxtw x3, w3
.L5:
ldrb w5, [x1, x4]
ldrb w6, [x0, x4]
add w5, w5, w6
strb w5, [x2, x4]
add x4, x4, 1
cmp x3, x4
bne .L5
ret
can instead be vectorized as:
f:
cmp w3, 0
ble .L1
mov x4, 0
whilewr p15.b, x1, x2
b.none .L3
whilelo p7.b, wzr, w3
.L4:
and p7.b, p7/z, p7.b, p15.b
ld1b z31.b, p7/z, [x0, x4]
ld1b z30.b, p7/z, [x1, x4]
add z30.b, z30.b, z31.b
st1b z30.b, p7, [x2, x4]
incp x4, p7
whilelo p7.b, w4, w3
b.any .L4
.L1:
ret
.L3:
sxtw x3, w3
.L5:
ldrb w5, [x1, x4]
ldrb w6, [x0, x4]
add w5, w5, w6
strb w5, [x2, x4]
add x4, x4, 1
cmp x3, x4
bne .L5
ret
Note that the scalar loop is still required: the hazard mask can be empty, in
which case the vector loop would advance by zero elements and never make
progress.
This does add a new predicate instruction (the `and`) on the critical path of
the loop, so we may want to consider versioning the loop instead:
f:
cmp w3, 0
ble .L1
mov x4, 0
whilewr p15.b, x1, x2
b.none .L7
b.nlast .L3
whilelo p7.b, wzr, w3
.L4:
ld1b z31.b, p7/z, [x0, x4]
ld1b z30.b, p7/z, [x1, x4]
add z30.b, z30.b, z31.b
st1b z30.b, p7, [x2, x4]
incb x4
whilelo p7.b, w4, w3
b.any .L4
.L1:
ret
.L3:
whilelo p7.b, wzr, w3
.L5:
and p7.b, p7/z, p7.b, p15.b
ld1b z31.b, p7/z, [x0, x4]
ld1b z30.b, p7/z, [x1, x4]
add z30.b, z30.b, z31.b
st1b z30.b, p7, [x2, x4]
incp x4, p7
whilelo p7.b, w4, w3
b.any .L5
.L6:
ret
.L7:
sxtw x3, w3
.L8:
ldrb w5, [x1, x4]
ldrb w6, [x0, x4]
add w5, w5, w6
strb w5, [x2, x4]
add x4, x4, 1
cmp x3, x4
bne .L8
ret
so that if there is no hazard at all, we don't take the extra dependency.
This is probably only useful at -O3.
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations