https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123954
Bug ID: 123954
Summary: [riscv64, vector] missed optimization, extra move
instruction
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jpegqs at gmail dot com
Target Milestone: ---
Why does GCC insert a move instruction for vwaddu when vd = vs2? It does this for
both RVV 1.0 and xtheadvector.
Here's an example:
#include <riscv_vector.h>
/* Reproducer: sums 8 groups of 8 bytes into a widened u16 accumulator.
 * The key pattern is the widening add below, where the destination and
 * the wide source operand are the SAME variable (vd == vs2) — the bug
 * report is that GCC emits an extra vector move for this case while
 * Clang keeps everything in one register. All intrinsics use vl = 8. */
void test(uint8_t *src, uint16_t *dst) {
/* vsum = {0, ...} : u16 accumulator, LMUL=2, vl=8 */
vuint16m2_t vsum = __riscv_vmv_v_x_u16m2(0, 8);
for (int i = 0; i < 8; i++) {
/* load 8 bytes from src[i*8 .. i*8+7] */
vuint8m1_t h0 = __riscv_vle8_v_u8m1(src + i * 8, 8);
/* widening add: vsum (u16) += h0 (u8), accumulating in place (vd == vs2) */
vsum = __riscv_vwaddu_wv_u16m2(vsum, h0, 8);
}
/* store 8 u16 results to dst */
__riscv_vse16_v_u16m2(dst, vsum, 8);
}
Clang can use the same register for vd and vs2. Which compiler's assembly code is correct?
// GCC 16.0.0 -march=rv64gc_xtheadvector -Os
test:
li a5,8
th.vsetvli zero,a5,e16,m2
th.vmv.v.i v2,0
addi a4,a0,64
th.vsetvli zero,a5,e8,m1
.L2:
th.vle.v v1,0(a0)
th.vsetvli zero,zero,e16,m2
th.vmv.v.v v4,v2
addi a0,a0,8
th.vsetvli zero,a5,e8,m1
th.vwaddu.wv v2,v4,v1
bne a0,a4,.L2
th.vsetvli zero,a5,e16,m2
th.vse.v v2,0(a1)
ret
// GCC 16.0.0 -march=rv64gcv -Os
test:
vsetivli zero,8,e16,m2,ta,ma
vmv.v.i v2,0
addi a5,a0,64
vsetvli zero,zero,e8,m1,ta,ma
.L2:
vle8.v v1,0(a0)
vmv2r.v v4,v2
addi a0,a0,8
vwaddu.wv v2,v4,v1
bne a0,a5,.L2
vse16.v v2,0(a1)
ret
// GCC 16.0.0 -march=rv64gcv -O3
test:
vsetivli zero,8,e16,m2,ta,ma
vle8.v v6,0(a0)
addi a5,a0,8
vle8.v v1,0(a5)
vmv.v.i v2,0
addi a5,a0,16
vle8.v v7,0(a5)
vsetvli zero,zero,e8,m1,ta,ma
vwaddu.wv v4,v2,v6
addi a5,a0,24
vle8.v v6,0(a5)
vwaddu.wv v2,v4,v1
addi a5,a0,32
vle8.v v1,0(a5)
vwaddu.wv v4,v2,v7
addi a5,a0,40
vle8.v v7,0(a5)
vwaddu.wv v2,v4,v6
addi a5,a0,48
vle8.v v6,0(a5)
vwaddu.wv v4,v2,v1
addi a0,a0,56
vle8.v v1,0(a0)
vwaddu.wv v2,v4,v7
vwaddu.wv v4,v2,v6
vwaddu.wv v2,v4,v1
vse16.v v2,0(a1)
ret
// Clang 21.1.0 -march=rv64gcv -Os
// Here Clang doesn't hoist the vsetvli out of the loop, but it does use the same
// register for vd and vs2.
test:
li a2, 8
vsetivli zero, 8, e16, m2, ta, ma
vmv.v.i v8, 0
.LBB0_1:
vsetvli zero, zero, e8, m1, ta, ma
vle8.v v10, (a0)
addi a2, a2, -1
vwaddu.wv v8, v8, v10
addi a0, a0, 8
bnez a2, .LBB0_1
vse16.v v8, (a1)
ret
// Clang 21.1.0 -march=rv64gcv -O3
test:
vsetivli zero, 8, e16, m2, ta, ma
vmv.v.i v8, 0
vle8.v v10, (a0)
addi a2, a0, 8
vle8.v v11, (a2)
addi a2, a0, 16
vle8.v v12, (a2)
addi a2, a0, 24
vle8.v v13, (a2)
addi a2, a0, 32
vsetvli zero, zero, e8, m1, ta, ma
vwaddu.wv v8, v8, v10
vle8.v v10, (a2)
addi a2, a0, 40
vwaddu.wv v8, v8, v11
vle8.v v11, (a2)
addi a2, a0, 48
addi a0, a0, 56
vwaddu.wv v8, v8, v12
vle8.v v12, (a2)
vwaddu.wv v8, v8, v13
vle8.v v13, (a0)
vwaddu.wv v8, v8, v10
vwaddu.wv v8, v8, v11
vwaddu.wv v8, v8, v12
vwaddu.wv v8, v8, v13
vse16.v v8, (a1)
ret