https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123954

            Bug ID: 123954
           Summary: [riscv64, vector] missed optimization, extra move
                    instruction
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jpegqs at gmail dot com
  Target Milestone: ---

Why does GCC insert a move instruction for vwaddu when vd = vs2? It does this for
both RVV 1.0 and xtheadvector.

Here's an example:

#include <riscv_vector.h>

void test(uint8_t *src, uint16_t *dst) {
        vuint16m2_t vsum = __riscv_vmv_v_x_u16m2(0, 8);
        for (int i = 0; i < 8; i++) {
                vuint8m1_t h0 = __riscv_vle8_v_u8m1(src + i * 8, 8);
                /* widening accumulate: vsum (u16) += h0 (u8), so vd == vs2 */
                vsum = __riscv_vwaddu_wv_u16m2(vsum, h0, 8);
        }
        __riscv_vse16_v_u16m2(dst, vsum, 8);
}
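
For clarity, the intrinsic loop is just an element-wise widening accumulation of
eight 8-byte chunks into eight u16 sums; a plain-C equivalent would be (sketch
only, the name test_scalar is arbitrary):

#include <stdint.h>

void test_scalar(const uint8_t *src, uint16_t *dst) {
        uint16_t sum[8] = {0};
        for (int i = 0; i < 8; i++)          /* 8 chunks */
                for (int j = 0; j < 8; j++)  /* 8 lanes */
                        sum[j] += src[i * 8 + j];
        for (int j = 0; j < 8; j++)
                dst[j] = sum[j];
}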

Clang reuses the same register (vd == vs2). Which compiler's output is correct here?
For reference, the loop I would expect from GCC is sketched after the listings below.

// GCC 16.0.0 -march=rv64gc_xtheadvector -Os

test:
        li      a5,8
        th.vsetvli      zero,a5,e16,m2
        th.vmv.v.i      v2,0
        addi    a4,a0,64
        th.vsetvli      zero,a5,e8,m1
.L2:
        th.vle.v        v1,0(a0)
        th.vsetvli      zero,zero,e16,m2
        th.vmv.v.v      v4,v2
        addi    a0,a0,8
        th.vsetvli      zero,a5,e8,m1
        th.vwaddu.wv    v2,v4,v1
        bne     a0,a4,.L2
        th.vsetvli      zero,a5,e16,m2
        th.vse.v        v2,0(a1)
        ret

// GCC 16.0.0 -march=rv64gcv -Os

test:
        vsetivli        zero,8,e16,m2,ta,ma
        vmv.v.i v2,0
        addi    a5,a0,64
        vsetvli zero,zero,e8,m1,ta,ma
.L2:
        vle8.v  v1,0(a0)
        vmv2r.v v4,v2
        addi    a0,a0,8
        vwaddu.wv       v2,v4,v1
        bne     a0,a5,.L2
        vse16.v v2,0(a1)
        ret

// GCC 16.0.0 -march=rv64gcv -O3

test:
        vsetivli        zero,8,e16,m2,ta,ma
        vle8.v  v6,0(a0)
        addi    a5,a0,8
        vle8.v  v1,0(a5)
        vmv.v.i v2,0
        addi    a5,a0,16
        vle8.v  v7,0(a5)
        vsetvli zero,zero,e8,m1,ta,ma
        vwaddu.wv       v4,v2,v6
        addi    a5,a0,24
        vle8.v  v6,0(a5)
        vwaddu.wv       v2,v4,v1
        addi    a5,a0,32
        vle8.v  v1,0(a5)
        vwaddu.wv       v4,v2,v7
        addi    a5,a0,40
        vle8.v  v7,0(a5)
        vwaddu.wv       v2,v4,v6
        addi    a5,a0,48
        vle8.v  v6,0(a5)
        vwaddu.wv       v4,v2,v1
        addi    a0,a0,56
        vle8.v  v1,0(a0)
        vwaddu.wv       v2,v4,v7
        vwaddu.wv       v4,v2,v6
        vwaddu.wv       v2,v4,v1
        vse16.v v2,0(a1)
        ret

// Clang 21.1.0 -march=rv64gcv -Os

// Here Clang doesn't hoist the vsetvli out of the loop, but it does reuse the same
// register (vd == vs2), so no extra move is needed.

test:
        li      a2, 8
        vsetivli        zero, 8, e16, m2, ta, ma
        vmv.v.i v8, 0
.LBB0_1:
        vsetvli zero, zero, e8, m1, ta, ma
        vle8.v  v10, (a0)
        addi    a2, a2, -1
        vwaddu.wv       v8, v8, v10
        addi    a0, a0, 8
        bnez    a2, .LBB0_1
        vse16.v v8, (a1)
        ret

// Clang 21.1.0 -march=rv64gcv -O3

test:
        vsetivli        zero, 8, e16, m2, ta, ma
        vmv.v.i v8, 0
        vle8.v  v10, (a0)
        addi    a2, a0, 8
        vle8.v  v11, (a2)
        addi    a2, a0, 16
        vle8.v  v12, (a2)
        addi    a2, a0, 24
        vle8.v  v13, (a2)
        addi    a2, a0, 32
        vsetvli zero, zero, e8, m1, ta, ma
        vwaddu.wv       v8, v8, v10
        vle8.v  v10, (a2)
        addi    a2, a0, 40
        vwaddu.wv       v8, v8, v11
        vle8.v  v11, (a2)
        addi    a2, a0, 48
        addi    a0, a0, 56
        vwaddu.wv       v8, v8, v12
        vle8.v  v12, (a2)
        vwaddu.wv       v8, v8, v13
        vle8.v  v13, (a0)
        vwaddu.wv       v8, v8, v10
        vwaddu.wv       v8, v8, v11
        vwaddu.wv       v8, v8, v12
        vwaddu.wv       v8, v8, v13
        vse16.v v8, (a1)
        ret
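
// For comparison, a hand-edited sketch of the rv64gcv -Os loop I would expect,
// assuming the full vd/vs2 overlap is legal for the .wv form (Clang's output above
// suggests it is). It is simply GCC's -Os output with the vmv2r.v dropped and v2
// reused as vs2.

test:
        vsetivli        zero,8,e16,m2,ta,ma
        vmv.v.i v2,0
        addi    a5,a0,64
        vsetvli zero,zero,e8,m1,ta,ma
.L2:
        vle8.v  v1,0(a0)
        addi    a0,a0,8
        vwaddu.wv       v2,v2,v1
        bne     a0,a5,.L2
        vse16.v v2,0(a1)
        ret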
