https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271

            Bug ID: 108271
           Summary: Missed RVV cost model
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

#include "riscv_vector.h"

/* Reproducer for the report: a sequence of RVV load/store intrinsic pairs
   with differing element widths, LMUL settings, vl arguments and
   masked/policy variants, once before the loop and then on every iteration.
   All address offsets are applied to the int pointers IN and OUT before the
   cast, so they are in units of int.  */
void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
{
  /* Loop-invariant part: one f32mf2 copy at element offset 10000.  */
  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
  /* The mask is read once, through a plain pointer dereference of MASK_IN,
     and reused by every masked intrinsic in the loop.  */
  vbool64_t mask = *(vbool64_t*)mask_in;
  for (int i = 0; i < n; i++)
    {
      /* Unmasked i16mf2 copy.  */
      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);

      /* Unmasked i32mf2 copy; v2 is also passed to the _tumu load below.  */
      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);

      /* Masked _tumu load taking v2 as an operand; the result is stored with
	 a plain pointer dereference rather than a store intrinsic.  */
      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in +
i + 200), 13);
      *(vint32mf2_t*)(out + i + 200) = v3;

      /* Masked (_m) f64m1 load followed by an unmasked store.  */
      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i +
300), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);

      /* Masked _tum load taking v4 as an operand, unmasked store.  */
      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i
+ 500), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);

      /* Masked _mu load taking v5 as an operand, followed by a masked (_m)
	 store.  */
      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i +
600), 11);
      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);

      /* Four u8mf4 load/store pairs.  Only v7 is ever stored: v8, v9 and v10
	 are loaded but not used, and each of the stores below writes v7.
	 NOTE(review): this may be a copy-paste slip, but the code is left as
	 reported since it is the test case the bug was filed against.  */
      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);

      vuint8mf4_t v8 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 800), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 800), v7, 11);

      vuint8mf4_t v9 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 900), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 900), v7, 11);

      vuint8mf4_t v10 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 1000), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 1000), v7, 11);
    }
}

-O3 -S ASM:
# Output with instruction scheduling enabled: a 64-byte stack frame is
# created and s0-s6 are stored to it and later reloaded.
f3:
        li      a5,40960
        addi    a5,a5,-960
        # Allocate the 64-byte stack frame.
        addi    sp,sp,-64
        # s4 is stored before the early exit; it is the destination register
        # of the two "vsetvli s4,zero,..." instructions below.
        sd      s4,24(sp)
        add     a4,a0,a5
        add     a5,a1,a5
        vsetivli        zero,19,e32,mf2,ta,ma
        vle32.v v24,0(a4)
        vse32.v v24,0(a5)
        vsetvli s4,zero,e8,mf8,ta,ma
        vlm.v   v0,0(a2)
        # Branch past the loop (and past the s0-s3/s5/s6 stores) when a3 <= 0.
        ble     a3,zero,.L1
        addi    a3,a3,1
        sd      s3,32(sp)
        slli    a3,a3,2
        li      s3,4096
        sd      s2,40(sp)
        sd      s5,16(sp)
        sd      s6,8(sp)
        addi    t6,s3,-1700
        addi    t5,s3,-1300
        addi    s6,s3,-900
        addi    s5,s3,-500
        sd      s0,56(sp)
        sd      s1,48(sp)
        addi    a0,a0,4
        addi    a4,a1,4
        add     s2,a1,a3
        addi    s3,s3,-100
# Loop body: the access addresses are computed ahead of their uses into many
# distinct registers (s0, s1, a1, a2, a6, a7, t0-t4).
.L3:
        vsetivli        zero,19,e16,mf2,ta,ma
        mv      a5,a4
        vle16.v v24,0(a0)
        mv      a3,a0
        vse16.v v24,0(a4)
        addi    a0,a0,4
        vsetivli        zero,19,e32,mf2,ta,ma
        addi    a4,a4,4
        vle32.v v24,0(a0)
        addi    s1,a3,796
        vse32.v v24,0(a4)
        vsetivli        zero,13,e32,mf2,tu,mu
        addi    s0,a5,796
        vle32.v v24,0(s1),v0.t
        addi    a1,a3,1196
        addi    t4,a5,1196
        addi    t2,a3,1996
        addi    t3,a5,1996
        add     t0,a3,t6
        vsetvli s4,zero,e32,mf2,ta,ma
        add     t1,a5,t6
        vse32.v v24,0(s0)
        add     a7,a5,t5
        vsetivli        zero,11,e64,m1,tu,mu
        add     a6,a5,s6
        vle64.v v24,0(a1),v0.t
        add     a2,a5,s5
        vse64.v v24,0(t4)
        add     a3,a3,t5
        vle64.v v24,0(t2),v0.t
        add     a5,a5,s3
        vse64.v v24,0(t3)
        vle64.v v24,0(t0),v0.t
        vse64.v v24,0(t1),v0.t
        vsetivli        zero,11,e8,mf4,ta,ma
        # A single vle8.v feeds all four vse8.v stores.
        vle8.v  v24,0(a3)
        vse8.v  v24,0(a7)
        vse8.v  v24,0(a6)
        vse8.v  v24,0(a2)
        vse8.v  v24,0(a5)
        bne     s2,a4,.L3
        # Reload the registers stored on the loop path.
        ld      s0,56(sp)
        ld      s1,48(sp)
        ld      s2,40(sp)
        ld      s3,32(sp)
        ld      s5,16(sp)
        ld      s6,8(sp)
.L1:
        # Common exit: reload s4 and release the stack frame.
        ld      s4,24(sp)
        addi    sp,sp,64
        jr      ra

GCC allocates a redundant stack frame and generates many redundant ld and sd
instructions.

However, with -O3 -fno-schedule-insns the ASM is:
# Output with -fno-schedule-insns: sp is never adjusted and there are no
# sd/ld instructions; only a- and t-registers are used.
f3:
        li      a5,40960
        addi    a5,a5,-960
        add     a4,a0,a5
        add     a5,a1,a5
        vsetivli        zero,19,e32,mf2,ta,ma
        vle32.v v24,0(a4)
        vse32.v v24,0(a5)
        vsetvli t3,zero,e8,mf8,ta,ma
        vlm.v   v0,0(a2)
        # Branch past the loop when a3 <= 0.
        ble     a3,zero,.L1
        addi    a3,a3,1
        li      t1,4096
        slli    a3,a3,2
        addi    a4,a1,4
        addi    a7,t1,-1700
        addi    a6,t1,-1300
        addi    t5,t1,-900
        addi    t4,t1,-500
        addi    a2,a0,4
        add     a1,a1,a3
        addi    t1,t1,-100
# Loop body: each access address is computed immediately before its use,
# reusing a0 and a3 as address temporaries.
.L3:
        mv      a3,a2
        vsetivli        zero,19,e16,mf2,ta,ma
        mv      a5,a4
        vle16.v v24,0(a2)
        addi    a0,a3,796
        vse16.v v24,0(a4)
        addi    a2,a2,4
        vsetivli        zero,19,e32,mf2,ta,ma
        addi    a4,a4,4
        vle32.v v24,0(a2)
        vse32.v v24,0(a4)
        vsetivli        zero,13,e32,mf2,tu,mu
        vle32.v v24,0(a0),v0.t
        addi    a0,a5,796
        vsetvli t3,zero,e32,mf2,ta,ma
        vse32.v v24,0(a0)
        addi    a0,a3,1196
        vsetivli        zero,11,e64,m1,tu,mu
        vle64.v v24,0(a0),v0.t
        addi    a0,a5,1196
        vse64.v v24,0(a0)
        addi    a0,a3,1996
        vle64.v v24,0(a0),v0.t
        addi    a0,a5,1996
        vse64.v v24,0(a0)
        add     a0,a3,a7
        vle64.v v24,0(a0),v0.t
        add     a3,a3,a6
        add     a0,a5,a7
        vse64.v v24,0(a0),v0.t
        vsetivli        zero,11,e8,mf4,ta,ma
        # A single vle8.v feeds all four vse8.v stores.
        vle8.v  v24,0(a3)
        add     a3,a5,a6
        vse8.v  v24,0(a3)
        add     a3,a5,t5
        vse8.v  v24,0(a3)
        add     a3,a5,t4
        add     a5,a5,t1
        vse8.v  v24,0(a3)
        vse8.v  v24,0(a5)
        bne     a1,a4,.L3
.L1:
        # Nothing to reload; return directly.
        ret

With -fno-schedule-insns the issue is gone. We should adjust the RVV
instruction cost model so that the codegen is the same with and without
-fno-schedule-insns.

Reply via email to