From: Zhanheng Yang <[email protected]>

Bench on A210 C908 core (VLEN 128):

put_hevc_epel_hv4_8_c:                   390.0 ( 1.00x)
put_hevc_epel_hv4_8_rvv_i32:             213.0 ( 1.83x)
put_hevc_epel_hv6_8_c:                   749.8 ( 1.00x)
put_hevc_epel_hv6_8_rvv_i32:             290.8 ( 2.58x)
put_hevc_epel_hv8_8_c:                  1215.5 ( 1.00x)
put_hevc_epel_hv8_8_rvv_i32:             360.7 ( 3.37x)
put_hevc_epel_hv12_8_c:                 2602.5 ( 1.00x)
put_hevc_epel_hv12_8_rvv_i32:            515.4 ( 5.05x)
put_hevc_epel_hv16_8_c:                 4417.0 ( 1.00x)
put_hevc_epel_hv16_8_rvv_i32:            661.8 ( 6.67x)
put_hevc_epel_hv24_8_c:                 9524.8 ( 1.00x)
put_hevc_epel_hv24_8_rvv_i32:           1909.2 ( 4.99x)
put_hevc_epel_hv32_8_c:                16589.1 ( 1.00x)
put_hevc_epel_hv32_8_rvv_i32:           2508.0 ( 6.61x)
put_hevc_epel_hv48_8_c:                37145.4 ( 1.00x)
put_hevc_epel_hv48_8_rvv_i32:           5526.8 ( 6.72x)
put_hevc_epel_hv64_8_c:                65015.9 ( 1.00x)
put_hevc_epel_hv64_8_rvv_i32:           9751.9 ( 6.67x)
put_hevc_epel_uni_hv4_8_c:               434.8 ( 1.00x)
put_hevc_epel_uni_hv4_8_rvv_i32:         238.8 ( 1.82x)
put_hevc_epel_uni_hv6_8_c:               856.8 ( 1.00x)
put_hevc_epel_uni_hv6_8_rvv_i32:         329.6 ( 2.60x)
put_hevc_epel_uni_hv8_8_c:              1474.2 ( 1.00x)
put_hevc_epel_uni_hv8_8_rvv_i32:         412.9 ( 3.57x)
put_hevc_epel_uni_hv12_8_c:             2995.9 ( 1.00x)
put_hevc_epel_uni_hv12_8_rvv_i32:        593.9 ( 5.04x)
put_hevc_epel_uni_hv16_8_c:             5128.2 ( 1.00x)
put_hevc_epel_uni_hv16_8_rvv_i32:        770.6 ( 6.66x)
put_hevc_epel_uni_hv24_8_c:            11159.5 ( 1.00x)
put_hevc_epel_uni_hv24_8_rvv_i32:       2223.1 ( 5.02x)
put_hevc_epel_uni_hv32_8_c:            19462.3 ( 1.00x)
put_hevc_epel_uni_hv32_8_rvv_i32:       2925.1 ( 6.65x)
put_hevc_epel_uni_hv48_8_c:            43480.5 ( 1.00x)
put_hevc_epel_uni_hv48_8_rvv_i32:       6476.7 ( 6.71x)
put_hevc_epel_uni_hv64_8_c:            76411.2 ( 1.00x)
put_hevc_epel_uni_hv64_8_rvv_i32:      11456.7 ( 6.67x)
put_hevc_epel_uni_w_hv4_8_c:             557.8 ( 1.00x)
put_hevc_epel_uni_w_hv4_8_rvv_i32:       287.9 ( 1.94x)
put_hevc_epel_uni_w_hv6_8_c:            1068.0 ( 1.00x)
put_hevc_epel_uni_w_hv6_8_rvv_i32:       399.4 ( 2.67x)
put_hevc_epel_uni_w_hv8_8_c:            1835.2 ( 1.00x)
put_hevc_epel_uni_w_hv8_8_rvv_i32:       507.3 ( 3.62x)
put_hevc_epel_uni_w_hv12_8_c:           3758.9 ( 1.00x)
put_hevc_epel_uni_w_hv12_8_rvv_i32:      729.2 ( 5.15x)
put_hevc_epel_uni_w_hv16_8_c:           6524.5 ( 1.00x)
put_hevc_epel_uni_w_hv16_8_rvv_i32:      954.7 ( 6.83x)
put_hevc_epel_uni_w_hv24_8_c:          14094.2 ( 1.00x)
put_hevc_epel_uni_w_hv24_8_rvv_i32:     2764.9 ( 5.10x)
put_hevc_epel_uni_w_hv32_8_c:          24887.0 ( 1.00x)
put_hevc_epel_uni_w_hv32_8_rvv_i32:     3640.5 ( 6.84x)
put_hevc_epel_uni_w_hv48_8_c:          55341.0 ( 1.00x)
put_hevc_epel_uni_w_hv48_8_rvv_i32:     8083.8 ( 6.85x)
put_hevc_epel_uni_w_hv64_8_c:          97377.8 ( 1.00x)
put_hevc_epel_uni_w_hv64_8_rvv_i32:    14322.9 ( 6.80x)
put_hevc_epel_bi_hv4_8_c:                472.2 ( 1.00x)
put_hevc_epel_bi_hv4_8_rvv_i32:          250.0 ( 1.89x)
put_hevc_epel_bi_hv6_8_c:                903.1 ( 1.00x)
put_hevc_epel_bi_hv6_8_rvv_i32:          341.3 ( 2.65x)
put_hevc_epel_bi_hv8_8_c:               1583.5 ( 1.00x)
put_hevc_epel_bi_hv8_8_rvv_i32:          433.1 ( 3.66x)
put_hevc_epel_bi_hv12_8_c:              3205.8 ( 1.00x)
put_hevc_epel_bi_hv12_8_rvv_i32:         615.0 ( 5.21x)
put_hevc_epel_bi_hv16_8_c:              5504.1 ( 1.00x)
put_hevc_epel_bi_hv16_8_rvv_i32:         800.3 ( 6.88x)
put_hevc_epel_bi_hv24_8_c:             11897.2 ( 1.00x)
put_hevc_epel_bi_hv24_8_rvv_i32:        2309.9 ( 5.15x)
put_hevc_epel_bi_hv32_8_c:             20823.8 ( 1.00x)
put_hevc_epel_bi_hv32_8_rvv_i32:        3031.2 ( 6.87x)
put_hevc_epel_bi_hv48_8_c:             46854.5 ( 1.00x)
put_hevc_epel_bi_hv48_8_rvv_i32:        6713.2 ( 6.98x)
put_hevc_epel_bi_hv64_8_c:             82399.2 ( 1.00x)
put_hevc_epel_bi_hv64_8_rvv_i32:       11901.4 ( 6.92x)
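For reference, the HV variants apply HEVC's 4-tap chroma (EPEL) filter in
both directions: a horizontal pass over height + 3 rows produces 16-bit
intermediates, and a vertical pass over those intermediates produces the
output (shifted right by 6 in the 8-bit put path). The scalar sketch below
only illustrates that arithmetic; the function name and the abbreviated
filter table here are illustrative, not code from this patch:

#include <stddef.h>
#include <stdint.h>

#define MAX_PB_SIZE       64
#define EPEL_EXTRA_BEFORE 1
#define EPEL_EXTRA        3  /* a 4-tap filter needs 3 extra source rows */

/* HEVC 4-tap chroma filter, indexed by fractional position 0..7 here for
 * simplicity (row 0 is the unused full-pel identity; FFmpeg's
 * ff_hevc_epel_filters[] is indexed by mx - 1 instead). */
static const int8_t epel_filters[8][4] = {
    {  0, 64,  0,  0 },
    { -2, 58, 10, -2 }, { -4, 54, 16, -2 }, { -6, 46, 28, -4 },
    { -4, 36, 36, -4 }, { -4, 28, 46, -6 }, { -2, 16, 54, -4 },
    { -2, 10, 58, -2 },
};

/* dst is the MAX_PB_SIZE-strided int16 intermediate buffer, as in the put
 * path; src must have valid samples one column/row before and two after. */
static void epel_hv_ref(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,
                        int height, int mx, int my, int width)
{
    const int8_t *fh = epel_filters[mx];
    const int8_t *fv = epel_filters[my];
    int16_t tmp[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE], *t = tmp;

    /* Horizontal pass over height + 3 rows, starting one row above dst;
     * no shift for 8-bit input. */
    src -= EPEL_EXTRA_BEFORE * srcstride;
    for (int y = 0; y < height + EPEL_EXTRA; y++) {
        for (int x = 0; x < width; x++)
            t[x] = fh[0] * src[x - 1] + fh[1] * src[x] +
                   fh[2] * src[x + 1] + fh[3] * src[x + 2];
        src += srcstride;
        t   += MAX_PB_SIZE;
    }

    /* Vertical pass over the 16-bit intermediates, >> 6 as in the 8-bit
     * C reference. */
    t = tmp + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = (fv[0] * t[x - MAX_PB_SIZE]     + fv[1] * t[x] +
                      fv[2] * t[x + MAX_PB_SIZE]     +
                      fv[3] * t[x + 2 * MAX_PB_SIZE]) >> 6;
        t   += MAX_PB_SIZE;
        dst += MAX_PB_SIZE;
    }
}

The RVV kernels avoid the intermediate buffer entirely: they prime
v4/v8/v12 with three horizontally filtered rows, filter one new row per
output line, and the filter_v_s macro both accumulates the vertical taps
and rotates the three live rows down.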
Signed-off-by: Zhanheng Yang <[email protected]>
---
 libavcodec/riscv/h26x/h2656dsp.h     |  11 +
 libavcodec/riscv/h26x/hevcepel_rvv.S | 325 +++++++++++++++++++++++++--
 libavcodec/riscv/hevcdsp_init.c      |   4 +
 3 files changed, 325 insertions(+), 15 deletions(-)

diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 7e320bd795..b8a116bdf7 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -81,4 +81,15 @@ void ff_hevc_put_epel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
 void ff_hevc_put_epel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                     ptrdiff_t _srcstride, const int16_t *src2, int height,
                                     intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_hv_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                  intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                      ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_w_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
+                                        const uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_bi_hv_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                     ptrdiff_t _srcstride, const int16_t *src2, int height,
+                                     intptr_t mx, intptr_t my, int width);
 #endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S
index caca0b88ab..7a4a3f3318 100644
--- a/libavcodec/riscv/h26x/hevcepel_rvv.S
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -285,8 +285,8 @@ func ff_hevc_put_epel_v_8_\lmul\()_rvv, zve32x
         sx s4, 24(sp)
         load_filter a5
         sub a1, a1, a2 # src - src_stride
-        li t1, 0 # offset
-        mv t4, a3
+        li t1, 0 # offset
+        mv t4, a3
 
 1:
         add t2, a1, t1
@@ -310,7 +310,7 @@
         vse16.v v0, (t3)
         add t3, t3, 2*HEVC_MAX_PB_SIZE
         addi a3, a3, -1
-        bgt a3, zero, 2b
+        bgt a3, zero, 2b
         add t1, t1, t5
         sub a6, a6, t5
         mv a3, t4
@@ -325,7 +325,7 @@ func ff_hevc_put_epel_v_8_\lmul\()_rvv, zve32x
 endfunc
 
 func ff_hevc_put_epel_uni_v_8_\lmul\()_rvv, zve32x
-        csrwi vxrm, 0
+        csrwi vxrm, 0
         addi sp, sp, -32
         sx s1, 0(sp)
         sx s2, 8(sp)
@@ -333,8 +333,8 @@ func ff_hevc_put_epel_uni_v_8_\lmul\()_rvv, zve32x
         sx s4, 24(sp)
         load_filter a6
         sub a2, a2, a3 # src - src_stride
-        li t1, 0 # offset
-        mv t4, a4
+        li t1, 0 # offset
+        mv t4, a4
 
 1:
         add t2, a2, t1
@@ -360,7 +360,7 @@
         vse8.v v0, (t3)
         add t3, t3, a1
         addi a4, a4, -1
-        bgt a4, zero, 2b
+        bgt a4, zero, 2b
         add t1, t1, t5
         sub a7, a7, t5
         mv a4, t4
@@ -375,7 +375,7 @@
 endfunc
 
 func ff_hevc_put_epel_uni_w_v_8_\lmul\()_rvv, zve32x
-        csrwi vxrm, 0
+        csrwi vxrm, 0
 #if (__riscv_xlen == 32)
         lw t1, 4(sp) # my
         lw t6, 8(sp) # width
@@ -391,8 +391,8 @@ func ff_hevc_put_epel_uni_w_v_8_\lmul\()_rvv, zve32x
         load_filter t1
         addi a5, a5, 6 # shift
         sub a2, a2, a3 # src - src_stride
-        li t1, 0 # offset
-        mv t4, a4
+        li t1, 0 # offset
+        mv t4, a4
 
 1:
         add t2, a2, t1
@@ -424,7 +424,7 @@
         vse8.v v0, (t3)
         add t3, t3, a1
         addi a4, a4, -1
-        bgt a4, zero, 2b
+        bgt a4, zero, 2b
         add t1, t1, t5
         sub t6, t6, t5
         mv a4, t4
@@ -439,7 +439,7 @@
 endfunc
 
 func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x
-        csrwi vxrm, 0
+        csrwi vxrm, 0
         lw t6, 0(sp) # width
         addi sp, sp, -32
         sx s1, 0(sp)
@@ -448,8 +448,8 @@ func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x
         sx s4, 24(sp)
         load_filter a7
         sub a2, a2, a3 # src - src_stride
-        li t1, 0 # offset
-        mv t4, a5
+        li t1, 0 # offset
+        mv t4, a5
 
 1:
         add t2, a2, t1
@@ -495,4 +495,299 @@ func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x
 endfunc
 .endm
 
-hevc_epel_v m1, m2, m4
\ No newline at end of file
+hevc_epel_v m1, m2, m4
+
+.macro filter_v_s vdst, vsrc0, vsrc1, vsrc2, vsrc3
+        vwmul.vx \vdst, \vsrc0, s5
+        vwmacc.vx \vdst, s6, \vsrc1
+        vmv.v.v \vsrc0, \vsrc1
+        vwmacc.vx \vdst, s7, \vsrc2
+        vmv.v.v \vsrc1, \vsrc2
+        vwmacc.vx \vdst, s8, \vsrc3
+        vmv.v.v \vsrc2, \vsrc3
+.endm
+
+/* clobbers t0, t1 */
+.macro load_filter2 m
+        la t0, qpel_filters
+        slli t1, \m, 2
+        add t0, t0, t1
+        lb s5, 0(t0)
+        lb s6, 1(t0)
+        lb s7, 2(t0)
+        lb s8, 3(t0)
+.endm
+
+.macro hevc_epel_hv lmul, lmul2, lmul4
+func ff_hevc_put_epel_hv_8_\lmul\()_rvv, zve32x
+        csrwi vxrm, 2
+        addi sp, sp, -64
+        sx s1, 0(sp)
+        sx s2, 8(sp)
+        sx s3, 16(sp)
+        sx s4, 24(sp)
+        sx s5, 32(sp)
+        sx s6, 40(sp)
+        sx s7, 48(sp)
+        sx s8, 56(sp)
+        load_filter a4
+        load_filter2 a5
+        sub a1, a1, a2 # src - src_stride
+        mv t0, a3
+        li t1, 0 # offset
+
+1:
+        add t2, a1, t1
+        slli t3, t1, 1
+        add t3, a0, t3
+
+        vsetvli t6, a6, e8, \lmul, ta, ma
+        filter_h v4, v24, v26, v28, v30, t2
+        add t2, t2, a2
+        filter_h v8, v24, v26, v28, v30, t2
+        add t2, t2, a2
+        filter_h v12, v24, v26, v28, v30, t2
+        add t2, t2, a2
+
+2:
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        filter_h v16, v24, v26, v28, v30, t2
+        add t2, t2, a2
+
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        filter_v_s v0, v4, v8, v12, v16
+        vnclip.wi v0, v0, 6
+        vse16.v v0, (t3)
+        addi a3, a3, -1
+        addi t3, t3, 2*HEVC_MAX_PB_SIZE
+        bgt a3, zero, 2b
+        mv a3, t0
+        add t1, t1, t6
+        sub a6, a6, t6
+        bgt a6, zero, 1b
+
+        lx s1, 0(sp)
+        lx s2, 8(sp)
+        lx s3, 16(sp)
+        lx s4, 24(sp)
+        lx s5, 32(sp)
+        lx s6, 40(sp)
+        lx s7, 48(sp)
+        lx s8, 56(sp)
+        addi sp, sp, 64
+        ret
+endfunc
+
+func ff_hevc_put_epel_uni_hv_8_\lmul\()_rvv, zve32x
+        csrwi vxrm, 0
+        addi sp, sp, -64
+        sx s1, 0(sp)
+        sx s2, 8(sp)
+        sx s3, 16(sp)
+        sx s4, 24(sp)
+        sx s5, 32(sp)
+        sx s6, 40(sp)
+        sx s7, 48(sp)
+        sx s8, 56(sp)
+        load_filter a5
+        load_filter2 a6
+        sub a2, a2, a3 # src - src_stride
+        mv t0, a4
+        li t1, 0 # offset
+
+1:
+        add t2, a2, t1
+        add t3, a0, t1
+
+        vsetvli t6, a7, e8, \lmul, ta, ma
+        filter_h v4, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v8, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v12, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+2:
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        filter_h v16, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        filter_v_s v0, v4, v8, v12, v16
+        vsetvli zero, zero, e32, \lmul4, ta, ma
+        vsra.vi v0, v0, 6
+        vmax.vx v0, v0, zero
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vnclipu.wi v0, v0, 6
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 0
+        vse8.v v0, (t3)
+        addi a4, a4, -1
+        add t3, t3, a1
+        bgt a4, zero, 2b
+        mv a4, t0
+        add t1, t1, t6
+        sub a7, a7, t6
+        bgt a7, zero, 1b
+
+        lx s1, 0(sp)
+        lx s2, 8(sp)
+        lx s3, 16(sp)
+        lx s4, 24(sp)
+        lx s5, 32(sp)
+        lx s6, 40(sp)
+        lx s7, 48(sp)
+        lx s8, 56(sp)
+        addi sp, sp, 64
+        ret
+endfunc
+
+func ff_hevc_put_epel_uni_w_hv_8_\lmul\()_rvv, zve32x
+        csrwi vxrm, 0
+        lx t2, 0(sp) # mx
+#if (__riscv_xlen == 32)
+        lw t4, 4(sp) # my
+        lw t5, 8(sp) # width
+#elif (__riscv_xlen == 64)
+        ld t4, 8(sp)
+        lw t5, 16(sp)
+#endif
+        addi a5, a5, 6 # shift
+        addi sp, sp, -64
+        sx s1, 0(sp)
+        sx s2, 8(sp)
+        sx s3, 16(sp)
+        sx s4, 24(sp)
+        sx s5, 32(sp)
+        sx s6, 40(sp)
+        sx s7, 48(sp)
+        sx s8, 56(sp)
+        load_filter t2
+        load_filter2 t4
+        sub a2, a2, a3 # src - src_stride
+        mv t0, a4
+        li t1, 0 # offset
+
+1:
+        add t2, a2, t1
+        add t3, a0, t1
+
+        vsetvli t6, t5, e8, \lmul, ta, ma
+        filter_h v4, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v8, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v12, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+2:
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        filter_h v16, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        filter_v_s v0, v4, v8, v12, v16
+        vsetvli zero, zero, e32, \lmul4, ta, ma
+        vsra.vi v0, v0, 6
+        vmul.vx v0, v0, a6
+        vssra.vx v0, v0, a5
+        vsadd.vx v0, v0, a7
+        vmax.vx v0, v0, zero
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vnclip.wi v0, v0, 0
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 0
+        vse8.v v0, (t3)
+        addi a4, a4, -1
+        add t3, t3, a1
+        bgt a4, zero, 2b
+        mv a4, t0
+        add t1, t1, t6
+        sub t5, t5, t6
+        bgt t5, zero, 1b
+
+        lx s1, 0(sp)
+        lx s2, 8(sp)
+        lx s3, 16(sp)
+        lx s4, 24(sp)
+        lx s5, 32(sp)
+        lx s6, 40(sp)
+        lx s7, 48(sp)
+        lx s8, 56(sp)
+        addi sp, sp, 64
+        ret
+endfunc
+
+func ff_hevc_put_epel_bi_hv_8_\lmul\()_rvv, zve32x
+        csrwi vxrm, 0
+        lw t3, 0(sp) # width
+        addi sp, sp, -64
+        sx s1, 0(sp)
+        sx s2, 8(sp)
+        sx s3, 16(sp)
+        sx s4, 24(sp)
+        sx s5, 32(sp)
+        sx s6, 40(sp)
+        sx s7, 48(sp)
+        sx s8, 56(sp)
+        load_filter a6
+        load_filter2 a7
+        mv a6, t3
+        sub a2, a2, a3 # src - src_stride
+        mv t0, a5
+        li t1, 0 # offset
+
+1:
+        add t2, a2, t1
+        add t3, a0, t1
+        slli t5, t1, 1
+        add t5, a4, t5
+
+        vsetvli t6, a6, e8, \lmul, ta, ma
+        filter_h v4, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v8, v24, v26, v28, v30, t2
+        add t2, t2, a3
+        filter_h v12, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+2:
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        filter_h v16, v24, v26, v28, v30, t2
+        add t2, t2, a3
+
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vle16.v v24, (t5)
+        addi t5, t5, 2*HEVC_MAX_PB_SIZE
+        filter_v_s v0, v4, v8, v12, v16
+        vsetvli zero, zero, e32, \lmul4, ta, ma
+        vsra.vi v0, v0, 6
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vwadd.wv v0, v0, v24
+        vnclip.wi v0, v0, 7
+        vmax.vx v0, v0, zero
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 0
+        vse8.v v0, (t3)
+        addi a5, a5, -1
+        add t3, t3, a1
+        bgt a5, zero, 2b
+        mv a5, t0
+        add t1, t1, t6
+        sub a6, a6, t6
+        bgt a6, zero, 1b
+
+        lx s1, 0(sp)
+        lx s2, 8(sp)
+        lx s3, 16(sp)
+        lx s4, 24(sp)
+        lx s5, 32(sp)
+        lx s6, 40(sp)
+        lx s7, 48(sp)
+        lx s8, 56(sp)
+        addi sp, sp, 64
+        ret
+endfunc
+.endm
+
+hevc_epel_hv m1, m2, m4
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 53c800626f..1df7eb654a 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -102,6 +102,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
             RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 1, 0, ff_hevc_put_epel_uni_v_8_m1_rvv);
             RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 1, 0, ff_hevc_put_epel_uni_w_v_8_m1_rvv);
             RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 1, 0, ff_hevc_put_epel_bi_v_8_m1_rvv);
+            RVV_FNASSIGN_PEL(c->put_hevc_epel, 1, 1, ff_hevc_put_epel_hv_8_m1_rvv);
+            RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 1, 1, ff_hevc_put_epel_uni_hv_8_m1_rvv);
+            RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 1, 1, ff_hevc_put_epel_uni_w_hv_8_m1_rvv);
+            RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 1, 1, ff_hevc_put_epel_bi_hv_8_m1_rvv);
             break;
         default:
             break;
-- 
2.25.1
