https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108271
Bug ID: 108271 Summary: Missed RVV cost model Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- #include "riscv_vector.h" void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n) { vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19); __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19); vbool64_t mask = *(vbool64_t*)mask_in; for (int i = 0; i < n; i++) { vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19); __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19); vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19); __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19); vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13); *(vint32mf2_t*)(out + i + 200) = v3; vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11); vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11); vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11); __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11); vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11); vuint8mf4_t v8 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 800), 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 800), v7, 11); vuint8mf4_t v9 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 900), 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 900), v7, 11); vuint8mf4_t v10 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 1000), 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 1000), v7, 11); } } -O3 -S ASM: f3: li a5,40960 addi a5,a5,-960 addi sp,sp,-64 sd s4,24(sp) add a4,a0,a5 add a5,a1,a5 vsetivli 
zero,19,e32,mf2,ta,ma vle32.v v24,0(a4) vse32.v v24,0(a5) vsetvli s4,zero,e8,mf8,ta,ma vlm.v v0,0(a2) ble a3,zero,.L1 addi a3,a3,1 sd s3,32(sp) slli a3,a3,2 li s3,4096 sd s2,40(sp) sd s5,16(sp) sd s6,8(sp) addi t6,s3,-1700 addi t5,s3,-1300 addi s6,s3,-900 addi s5,s3,-500 sd s0,56(sp) sd s1,48(sp) addi a0,a0,4 addi a4,a1,4 add s2,a1,a3 addi s3,s3,-100 .L3: vsetivli zero,19,e16,mf2,ta,ma mv a5,a4 vle16.v v24,0(a0) mv a3,a0 vse16.v v24,0(a4) addi a0,a0,4 vsetivli zero,19,e32,mf2,ta,ma addi a4,a4,4 vle32.v v24,0(a0) addi s1,a3,796 vse32.v v24,0(a4) vsetivli zero,13,e32,mf2,tu,mu addi s0,a5,796 vle32.v v24,0(s1),v0.t addi a1,a3,1196 addi t4,a5,1196 addi t2,a3,1996 addi t3,a5,1996 add t0,a3,t6 vsetvli s4,zero,e32,mf2,ta,ma add t1,a5,t6 vse32.v v24,0(s0) add a7,a5,t5 vsetivli zero,11,e64,m1,tu,mu add a6,a5,s6 vle64.v v24,0(a1),v0.t add a2,a5,s5 vse64.v v24,0(t4) add a3,a3,t5 vle64.v v24,0(t2),v0.t add a5,a5,s3 vse64.v v24,0(t3) vle64.v v24,0(t0),v0.t vse64.v v24,0(t1),v0.t vsetivli zero,11,e8,mf4,ta,ma vle8.v v24,0(a3) vse8.v v24,0(a7) vse8.v v24,0(a6) vse8.v v24,0(a2) vse8.v v24,0(a5) bne s2,a4,.L3 ld s0,56(sp) ld s1,48(sp) ld s2,40(sp) ld s3,32(sp) ld s5,16(sp) ld s6,8(sp) .L1: ld s4,24(sp) addi sp,sp,64 jr ra GCC allocates a redundant stack frame and generates a lot of redundant ld and sd instructions (callee-saved register spills and reloads).
However, if we use -O3 -fno-schedule-insns ASM: f3: li a5,40960 addi a5,a5,-960 add a4,a0,a5 add a5,a1,a5 vsetivli zero,19,e32,mf2,ta,ma vle32.v v24,0(a4) vse32.v v24,0(a5) vsetvli t3,zero,e8,mf8,ta,ma vlm.v v0,0(a2) ble a3,zero,.L1 addi a3,a3,1 li t1,4096 slli a3,a3,2 addi a4,a1,4 addi a7,t1,-1700 addi a6,t1,-1300 addi t5,t1,-900 addi t4,t1,-500 addi a2,a0,4 add a1,a1,a3 addi t1,t1,-100 .L3: mv a3,a2 vsetivli zero,19,e16,mf2,ta,ma mv a5,a4 vle16.v v24,0(a2) addi a0,a3,796 vse16.v v24,0(a4) addi a2,a2,4 vsetivli zero,19,e32,mf2,ta,ma addi a4,a4,4 vle32.v v24,0(a2) vse32.v v24,0(a4) vsetivli zero,13,e32,mf2,tu,mu vle32.v v24,0(a0),v0.t addi a0,a5,796 vsetvli t3,zero,e32,mf2,ta,ma vse32.v v24,0(a0) addi a0,a3,1196 vsetivli zero,11,e64,m1,tu,mu vle64.v v24,0(a0),v0.t addi a0,a5,1196 vse64.v v24,0(a0) addi a0,a3,1996 vle64.v v24,0(a0),v0.t addi a0,a5,1996 vse64.v v24,0(a0) add a0,a3,a7 vle64.v v24,0(a0),v0.t add a3,a3,a6 add a0,a5,a7 vse64.v v24,0(a0),v0.t vsetivli zero,11,e8,mf4,ta,ma vle8.v v24,0(a3) add a3,a5,a6 vse8.v v24,0(a3) add a3,a5,t5 vse8.v v24,0(a3) add a3,a5,t4 add a5,a5,t1 vse8.v v24,0(a3) vse8.v v24,0(a5) bne a1,a4,.L3 .L1: ret With -fno-schedule-insns the issue is gone. We should adjust the RVV instruction cost model so that the codegen is the same with and without -fno-schedule-insns.