From: Zhanheng Yang <[email protected]>

Bench on A210 C908 core (VLEN 128):
put_hevc_epel_v4_8_c: 157.8 ( 1.00x)
put_hevc_epel_v4_8_rvv_i32: 73.2 ( 2.16x)
put_hevc_epel_v6_8_c: 314.6 ( 1.00x)
put_hevc_epel_v6_8_rvv_i32: 101.2 ( 3.11x)
put_hevc_epel_v8_8_c: 545.5 ( 1.00x)
put_hevc_epel_v8_8_rvv_i32: 124.4 ( 4.39x)
put_hevc_epel_v12_8_c: 1240.8 ( 1.00x)
put_hevc_epel_v12_8_rvv_i32: 183.6 ( 6.76x)
put_hevc_epel_v16_8_c: 2170.7 ( 1.00x)
put_hevc_epel_v16_8_rvv_i32: 235.1 ( 9.23x)
put_hevc_epel_v24_8_c: 4743.5 ( 1.00x)
put_hevc_epel_v24_8_rvv_i32: 677.5 ( 7.00x)
put_hevc_epel_v32_8_c: 8353.4 ( 1.00x)
put_hevc_epel_v32_8_rvv_i32: 892.1 ( 9.36x)
put_hevc_epel_v48_8_c: 18608.1 ( 1.00x)
put_hevc_epel_v48_8_rvv_i32: 1956.1 ( 9.51x)
put_hevc_epel_v64_8_c: 32934.3 ( 1.00x)
put_hevc_epel_v64_8_rvv_i32: 3454.1 ( 9.53x)
put_hevc_epel_uni_v4_8_c: 237.5 ( 1.00x)
put_hevc_epel_uni_v4_8_rvv_i32: 87.5 ( 2.72x)
put_hevc_epel_uni_v6_8_c: 509.5 ( 1.00x)
put_hevc_epel_uni_v6_8_rvv_i32: 119.6 ( 4.26x)
put_hevc_epel_uni_v8_8_c: 982.8 ( 1.00x)
put_hevc_epel_uni_v8_8_rvv_i32: 147.1 ( 6.68x)
put_hevc_epel_uni_v12_8_c: 2027.7 ( 1.00x)
put_hevc_epel_uni_v12_8_rvv_i32: 211.0 ( 9.61x)
put_hevc_epel_uni_v16_8_c: 3525.4 ( 1.00x)
put_hevc_epel_uni_v16_8_rvv_i32: 278.8 (12.64x)
put_hevc_epel_uni_v24_8_c: 7804.3 ( 1.00x)
put_hevc_epel_uni_v24_8_rvv_i32: 778.9 (10.02x)
put_hevc_epel_uni_v32_8_c: 13807.3 ( 1.00x)
put_hevc_epel_uni_v32_8_rvv_i32: 1028.7 (13.42x)
put_hevc_epel_uni_v48_8_c: 30934.9 ( 1.00x)
put_hevc_epel_uni_v48_8_rvv_i32: 2265.1 (13.66x)
put_hevc_epel_uni_v64_8_c: 54705.5 ( 1.00x)
put_hevc_epel_uni_v64_8_rvv_i32: 4003.7 (13.66x)
put_hevc_epel_uni_w_v4_8_c: 313.8 ( 1.00x)
put_hevc_epel_uni_w_v4_8_rvv_i32: 156.6 ( 2.00x)
put_hevc_epel_uni_w_v6_8_c: 674.3 ( 1.00x)
put_hevc_epel_uni_w_v6_8_rvv_i32: 222.8 ( 3.03x)
put_hevc_epel_uni_w_v8_8_c: 1253.3 ( 1.00x)
put_hevc_epel_uni_w_v8_8_rvv_i32: 279.4 ( 4.49x)
put_hevc_epel_uni_w_v12_8_c: 2619.4 ( 1.00x)
put_hevc_epel_uni_w_v12_8_rvv_i32: 410.2 ( 6.39x)
put_hevc_epel_uni_w_v16_8_c: 4614.2 ( 1.00x)
put_hevc_epel_uni_w_v16_8_rvv_i32: 535.8 ( 8.61x)
put_hevc_epel_uni_w_v24_8_c: 10290.6 ( 1.00x)
put_hevc_epel_uni_w_v24_8_rvv_i32: 1550.6 ( 6.64x)
put_hevc_epel_uni_w_v32_8_c: 18169.4 ( 1.00x)
put_hevc_epel_uni_w_v32_8_rvv_i32: 2047.2 ( 8.88x)
put_hevc_epel_uni_w_v48_8_c: 40704.3 ( 1.00x)
put_hevc_epel_uni_w_v48_8_rvv_i32: 4552.4 ( 8.94x)
put_hevc_epel_uni_w_v64_8_c: 72197.1 ( 1.00x)
put_hevc_epel_uni_w_v64_8_rvv_i32: 8069.4 ( 8.95x)
put_hevc_epel_bi_v4_8_c: 262.7 ( 1.00x)
put_hevc_epel_bi_v4_8_rvv_i32: 105.9 ( 2.48x)
put_hevc_epel_bi_v6_8_c: 553.0 ( 1.00x)
put_hevc_epel_bi_v6_8_rvv_i32: 145.4 ( 3.80x)
put_hevc_epel_bi_v8_8_c: 1045.5 ( 1.00x)
put_hevc_epel_bi_v8_8_rvv_i32: 180.3 ( 5.80x)
put_hevc_epel_bi_v12_8_c: 2172.7 ( 1.00x)
put_hevc_epel_bi_v12_8_rvv_i32: 264.2 ( 8.22x)
put_hevc_epel_bi_v16_8_c: 3791.6 ( 1.00x)
put_hevc_epel_bi_v16_8_rvv_i32: 336.5 (11.27x)
put_hevc_epel_bi_v24_8_c: 8424.1 ( 1.00x)
put_hevc_epel_bi_v24_8_rvv_i32: 967.2 ( 8.71x)
put_hevc_epel_bi_v32_8_c: 14910.8 ( 1.00x)
put_hevc_epel_bi_v32_8_rvv_i32: 1270.7 (11.73x)
put_hevc_epel_bi_v48_8_c: 33326.5 ( 1.00x)
put_hevc_epel_bi_v48_8_rvv_i32: 2804.7 (11.88x)
put_hevc_epel_bi_v64_8_c: 59177.9 ( 1.00x)
put_hevc_epel_bi_v64_8_rvv_i32: 5022.3 (11.78x)
Signed-off-by: Zhanheng Yang <[email protected]> --- libavcodec/riscv/h26x/h2656dsp.h | 11 ++ libavcodec/riscv/h26x/hevcepel_rvv.S | 235 ++++++++++++++++++++++++++- libavcodec/riscv/hevcdsp_init.c | 4 + 3 files changed, 249 insertions(+), 1 deletion(-) diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h index fa2f5a88e3..085ed4cf14 100644 --- a/libavcodec/riscv/h26x/h2656dsp.h +++ b/libavcodec/riscv/h26x/h2656dsp.h @@ -59,4 +59,15 @@ void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_epel_v_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_epel_uni_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); +void ff_hevc_put_epel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_epel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, + ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t + mx, intptr_t my, int width); #endif diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S index 81044846f7..caca0b88ab 100644 --- a/libavcodec/riscv/h26x/hevcepel_rvv.S +++ b/libavcodec/riscv/h26x/hevcepel_rvv.S @@ -262,4 +262,237 @@ func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x endfunc .endm -hevc_epel_h m1, m2, m4 \ No newline at end of file +hevc_epel_h m1, m2, m4 + +/* 4-tap vertical filter: vdst = s1*vsrc0 + s2*vsrc1 + s3*vsrc2 + s4*vsrc3, computed as a widening MAC of signed taps (s1..s4) with unsigned 8-bit rows; output is unclipped; clobbers v4; on exit vsrc0..vsrc2 hold the old vsrc1..vsrc3, so the caller only has to load one new row into vsrc3 per output row */ +.macro filter_v vdst, vsrc0, vsrc1, vsrc2, vsrc3 + vmv.v.x v4, s1 + vwmulsu.vv \vdst, v4, \vsrc0 + vwmaccsu.vx \vdst, s2, \vsrc1 + vmv.v.v 
\vsrc0, \vsrc1 + vwmaccsu.vx \vdst, s3, \vsrc2 + vmv.v.v \vsrc1, \vsrc2 + vwmaccsu.vx \vdst, s4, \vsrc3 + vmv.v.v \vsrc2, \vsrc3 +.endm + +.macro hevc_epel_v lmul, lmul2, lmul4 +func ff_hevc_put_epel_v_8_\lmul\()_rvv, zve32x + addi sp, sp, -32 + sx s1, 0(sp) + sx s2, 8(sp) + sx s3, 16(sp) + sx s4, 24(sp) + load_filter a5 /* NOTE(review): load_filter is defined outside this hunk; presumably it loads the four taps selected by my into s1-s4, which filter_v reads */ + sub a1, a1, a2 # src - src_stride + li t1, 0 # offset + mv t4, a3 /* keep height so a3 can be reloaded for every column strip */ + +1: + add t2, a1, t1 + slli t3, t1, 1 + add t3, a0, t3 + + vsetvli t5, a6, e8, \lmul, ta, ma /* t5 = number of columns handled in this strip */ + vle8.V v16, (t2) + add t2, t2, a2 + vle8.V v18, (t2) + add t2, t2, a2 + vle8.V v20, (t2) + add t2, t2, a2 + +2: + vsetvli zero, zero, e8, \lmul, ta, ma + vle8.V v22, (t2) + add t2, t2, a2 + filter_v v0, v16, v18, v20, v22 + vsetvli zero, zero, e16, \lmul2, ta, ma + vse16.v v0, (t3) /* store the unclipped 16-bit intermediate */ + add t3, t3, 2*HEVC_MAX_PB_SIZE + addi a3, a3, -1 + bgt a3, zero, 2b + add t1, t1, t5 + sub a6, a6, t5 + mv a3, t4 + bgt a6, zero, 1b + + lx s1, 0(sp) + lx s2, 8(sp) + lx s3, 16(sp) + lx s4, 24(sp) + addi sp, sp, 32 + ret +endfunc + +func ff_hevc_put_epel_uni_v_8_\lmul\()_rvv, zve32x + csrwi vxrm, 0 /* fixed-point rounding mode 0 (round-to-nearest-up in the RVV spec), used by vnclipu below */ + addi sp, sp, -32 + sx s1, 0(sp) + sx s2, 8(sp) + sx s3, 16(sp) + sx s4, 24(sp) + load_filter a6 + sub a2, a2, a3 # src - src_stride + li t1, 0 # offset + mv t4, a4 + +1: + add t2, a2, t1 + add t3, a0, t1 + + vsetvli t5, a7, e8, \lmul, ta, ma + vle8.V v16, (t2) + add t2, t2, a3 + vle8.V v18, (t2) + add t2, t2, a3 + vle8.V v20, (t2) + add t2, t2, a3 + +2: + vsetvli zero, zero, e8, \lmul, ta, ma + vle8.V v22, (t2) + add t2, t2, a3 + filter_v v0, v16, v18, v20, v22 + vsetvli zero, zero, e16, \lmul2, ta, ma + vmax.vx v0, v0, zero /* clamp negatives to 0 before the unsigned narrowing */ + vsetvli zero, zero, e8, \lmul, ta, ma + vnclipu.wi v0, v0, 6 /* rounding shift right by 6, saturating to 8 bits */ + vse8.v v0, (t3) + add t3, t3, a1 + addi a4, a4, -1 + bgt a4, zero, 2b + add t1, t1, t5 + sub a7, a7, t5 + mv a4, t4 + bgt a7, zero, 1b + + lx s1, 0(sp) + lx s2, 8(sp) + lx s3, 16(sp) + lx s4, 24(sp) + addi sp, sp, 32 + ret +endfunc + +func ff_hevc_put_epel_uni_w_v_8_\lmul\()_rvv, zve32x + csrwi vxrm, 0 +#if (__riscv_xlen == 
32) + lw t1, 4(sp) # my + lw t6, 8(sp) # width +#elif (__riscv_xlen == 64) + ld t1, 8(sp) + lw t6, 16(sp) +#endif + addi sp, sp, -32 + sx s1, 0(sp) + sx s2, 8(sp) + sx s3, 16(sp) + sx s4, 24(sp) + load_filter t1 + addi a5, a5, 6 # shift + sub a2, a2, a3 # src - src_stride + li t1, 0 # offset + mv t4, a4 + +1: + add t2, a2, t1 + add t3, a0, t1 + + vsetvli t5, t6, e8, \lmul, ta, ma + vle8.V v16, (t2) + add t2, t2, a3 + vle8.V v18, (t2) + add t2, t2, a3 + vle8.V v20, (t2) + add t2, t2, a3 + +2: + vsetvli zero, zero, e8, \lmul, ta, ma + vle8.V v22, (t2) + add t2, t2, a3 + filter_v v0, v16, v18, v20, v22 + vsetvli zero, zero, e16, \lmul2, ta, ma + vwmul.vx v8, v0, a6 /* widen to 32 bits: filtered value times a6 (wx in the prototype) */ + vsetvli zero, zero, e32, \lmul4, ta, ma + vssra.vx v0, v8, a5 /* rounding arithmetic shift by denom + 6 */ + vsadd.vx v0, v0, a7 /* saturating add of a7 (ox in the prototype) */ + vmax.vx v0, v0, zero + vsetvli zero, zero, e16, \lmul2, ta, ma + vnclip.wi v0, v0, 0 /* saturating narrow 32 -> 16 bits, no shift */ + vsetvli zero, zero, e8, \lmul, ta, ma + vnclipu.wi v0, v0, 0 /* saturating narrow 16 -> 8 bits, no shift */ + vse8.v v0, (t3) + add t3, t3, a1 + addi a4, a4, -1 + bgt a4, zero, 2b + add t1, t1, t5 + sub t6, t6, t5 + mv a4, t4 + bgt t6, zero, 1b + + lx s1, 0(sp) + lx s2, 8(sp) + lx s3, 16(sp) + lx s4, 24(sp) + addi sp, sp, 32 + ret +endfunc + +func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x + csrwi vxrm, 0 + lw t6, 0(sp) # width + addi sp, sp, -32 + sx s1, 0(sp) + sx s2, 8(sp) + sx s3, 16(sp) + sx s4, 24(sp) + load_filter a7 + sub a2, a2, a3 # src - src_stride + li t1, 0 # offset + mv t4, a5 + +1: + add t2, a2, t1 + add t3, a0, t1 + slli t0, t1, 1 + add t0, a4, t0 + + vsetvli t5, t6, e8, \lmul, ta, ma + vle8.V v16, (t2) + add t2, t2, a3 + vle8.V v18, (t2) + add t2, t2, a3 + vle8.V v20, (t2) + add t2, t2, a3 + +2: + vsetvli zero, zero, e8, \lmul, ta, ma + vle8.V v22, (t2) + add t2, t2, a3 + filter_v v0, v16, v18, v20, v22 + vsetvli zero, zero, e16, \lmul2, ta, ma + vle16.v v8, (t0) /* 16-bit row read from a4 (src2 in the prototype) */ + addi t0, t0, 2*HEVC_MAX_PB_SIZE + vsadd.vv v0, v0, v8 + vmax.vx v0, v0, zero + vsetvli zero, zero, e8, \lmul, ta, ma + vnclipu.wi v0, v0, 7 /* rounding shift right by 7, saturating to 8 bits */ + vse8.v v0, (t3) + add t3, t3, a1 + addi a5, a5, -1 + bgt 
a5, zero, 2b + add t1, t1, t5 + sub t6, t6, t5 + mv a5, t4 + bgt t6, zero, 1b + + lx s1, 0(sp) + lx s2, 8(sp) + lx s3, 16(sp) + lx s4, 24(sp) + addi sp, sp, 32 + ret +endfunc +.endm + +hevc_epel_v m1, m2, m4 \ No newline at end of file diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c index 8608fdbd19..c7874996a8 100644 --- a/libavcodec/riscv/hevcdsp_init.c +++ b/libavcodec/riscv/hevcdsp_init.c @@ -94,6 +94,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth) RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv); RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, ff_hevc_put_epel_uni_w_h_8_m1_rvv); RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, ff_hevc_put_epel_bi_h_8_m1_rvv); + RVV_FNASSIGN_PEL(c->put_hevc_epel, 1, 0, ff_hevc_put_epel_v_8_m1_rvv); + RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 1, 0, ff_hevc_put_epel_uni_v_8_m1_rvv); + RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 1, 0, ff_hevc_put_epel_uni_w_v_8_m1_rvv); + RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 1, 0, ff_hevc_put_epel_bi_v_8_m1_rvv); break; default: break; -- 2.25.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
