From: Zhanheng Yang <[email protected]>

Bench on A210 C908 core (VLEN 128).

put_hevc_epel_h4_8_c: 146.2 ( 1.00x)
put_hevc_epel_h4_8_rvv_i32: 81.8 ( 1.79x)
put_hevc_epel_h6_8_c: 305.4 ( 1.00x)
put_hevc_epel_h6_8_rvv_i32: 115.5 ( 2.65x)
put_hevc_epel_h8_8_c: 532.7 ( 1.00x)
put_hevc_epel_h8_8_rvv_i32: 156.7 ( 3.40x)
put_hevc_epel_h12_8_c: 1233.8 ( 1.00x)
put_hevc_epel_h12_8_rvv_i32: 225.7 ( 5.47x)
put_hevc_epel_h16_8_c: 2223.8 ( 1.00x)
put_hevc_epel_h16_8_rvv_i32: 296.2 ( 7.51x)
put_hevc_epel_h24_8_c: 4739.4 ( 1.00x)
put_hevc_epel_h24_8_rvv_i32: 800.7 ( 5.92x)
put_hevc_epel_h32_8_c: 8344.4 ( 1.00x)
put_hevc_epel_h32_8_rvv_i32: 1066.0 ( 7.83x)
put_hevc_epel_h48_8_c: 18595.3 ( 1.00x)
put_hevc_epel_h48_8_rvv_i32: 2324.3 ( 8.00x)
put_hevc_epel_h64_8_c: 32911.2 ( 1.00x)
put_hevc_epel_h64_8_rvv_i32: 4079.8 ( 8.07x)
put_hevc_epel_uni_h4_8_c: 225.1 ( 1.00x)
put_hevc_epel_uni_h4_8_rvv_i32: 99.0 ( 2.27x)
put_hevc_epel_uni_h6_8_c: 500.0 ( 1.00x)
put_hevc_epel_uni_h6_8_rvv_i32: 138.1 ( 3.62x)
put_hevc_epel_uni_h8_8_c: 895.6 ( 1.00x)
put_hevc_epel_uni_h8_8_rvv_i32: 186.3 ( 4.81x)
put_hevc_epel_uni_h12_8_c: 1925.0 ( 1.00x)
put_hevc_epel_uni_h12_8_rvv_i32: 264.4 ( 7.28x)
put_hevc_epel_uni_h16_8_c: 3372.3 ( 1.00x)
put_hevc_epel_uni_h16_8_rvv_i32: 342.7 ( 9.84x)
put_hevc_epel_uni_h24_8_c: 7501.4 ( 1.00x)
put_hevc_epel_uni_h24_8_rvv_i32: 935.6 ( 8.02x)
put_hevc_epel_uni_h32_8_c: 13232.0 ( 1.00x)
put_hevc_epel_uni_h32_8_rvv_i32: 1240.0 (10.67x)
put_hevc_epel_uni_h48_8_c: 29608.1 ( 1.00x)
put_hevc_epel_uni_h48_8_rvv_i32: 2710.5 (10.92x)
put_hevc_epel_uni_h64_8_c: 52452.8 ( 1.00x)
put_hevc_epel_uni_h64_8_rvv_i32: 4775.5 (10.98x)
put_hevc_epel_uni_w_h4_8_c: 298.5 ( 1.00x)
put_hevc_epel_uni_w_h4_8_rvv_i32: 176.6 ( 1.69x)
put_hevc_epel_uni_w_h6_8_c: 645.3 ( 1.00x)
put_hevc_epel_uni_w_h6_8_rvv_i32: 254.9 ( 2.53x)
put_hevc_epel_uni_w_h8_8_c: 1187.0 ( 1.00x)
put_hevc_epel_uni_w_h8_8_rvv_i32: 335.3 ( 3.54x)
put_hevc_epel_uni_w_h12_8_c: 2535.6 ( 1.00x)
put_hevc_epel_uni_w_h12_8_rvv_i32: 487.8 ( 5.20x)
put_hevc_epel_uni_w_h16_8_c: 4491.0 ( 1.00x)
put_hevc_epel_uni_w_h16_8_rvv_i32: 641.8 ( 7.00x)
put_hevc_epel_uni_w_h24_8_c: 9974.7 ( 1.00x)
put_hevc_epel_uni_w_h24_8_rvv_i32: 1791.4 ( 5.57x)
put_hevc_epel_uni_w_h32_8_c: 17646.1 ( 1.00x)
put_hevc_epel_uni_w_h32_8_rvv_i32: 2379.0 ( 7.42x)
put_hevc_epel_uni_w_h48_8_c: 39569.2 ( 1.00x)
put_hevc_epel_uni_w_h48_8_rvv_i32: 5226.0 ( 7.57x)
put_hevc_epel_uni_w_h64_8_c: 70274.5 ( 1.00x)
put_hevc_epel_uni_w_h64_8_rvv_i32: 9214.3 ( 7.63x)
put_hevc_epel_bi_h4_8_c: 234.5 ( 1.00x)
put_hevc_epel_bi_h4_8_rvv_i32: 128.3 ( 1.83x)
put_hevc_epel_bi_h6_8_c: 505.0 ( 1.00x)
put_hevc_epel_bi_h6_8_rvv_i32: 177.1 ( 2.85x)
put_hevc_epel_bi_h8_8_c: 958.2 ( 1.00x)
put_hevc_epel_bi_h8_8_rvv_i32: 235.2 ( 4.07x)
put_hevc_epel_bi_h12_8_c: 2001.0 ( 1.00x)
put_hevc_epel_bi_h12_8_rvv_i32: 338.5 ( 5.91x)
put_hevc_epel_bi_h16_8_c: 3510.2 ( 1.00x)
put_hevc_epel_bi_h16_8_rvv_i32: 446.5 ( 7.86x)
put_hevc_epel_bi_h24_8_c: 7803.2 ( 1.00x)
put_hevc_epel_bi_h24_8_rvv_i32: 1189.6 ( 6.56x)
put_hevc_epel_bi_h32_8_c: 13764.5 ( 1.00x)
put_hevc_epel_bi_h32_8_rvv_i32: 1579.3 ( 8.72x)
put_hevc_epel_bi_h48_8_c: 30827.4 ( 1.00x)
put_hevc_epel_bi_h48_8_rvv_i32: 3422.3 ( 9.01x)
put_hevc_epel_bi_h64_8_c: 54715.6 ( 1.00x)
put_hevc_epel_bi_h64_8_rvv_i32: 6059.8 ( 9.03x)
Signed-off-by: Zhanheng Yang <[email protected]>
---
 libavcodec/riscv/Makefile            |   3 +-
 libavcodec/riscv/h26x/h2656dsp.h     |  12 ++
 libavcodec/riscv/h26x/hevcepel_rvv.S | 279 +++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c      |   4 +
 4 files changed, 297 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/h26x/hevcepel_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 414790ae0c..bf65e827e7 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -37,7 +37,8 @@ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o \
-                                   riscv/h26x/hevcqpel_rvv.o
+                                   riscv/h26x/hevcqpel_rvv.o \
+                                   riscv/h26x/hevcepel_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
 OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 2dabc16aee..fa2f5a88e3 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -47,4 +47,16 @@ void ff_hevc_put_qpel_uni_w_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
 void ff_hevc_put_qpel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
         mx, intptr_t my, int width);
+
+void ff_hevc_put_epel_h_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+        mx, intptr_t my, int width);
 #endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S
new file mode 100644
index 0000000000..81044846f7
--- /dev/null
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) 2026 Alibaba Group Holding Limited.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* 4-tap filter coefficients, one 4-byte row per filter index (see load_filter) */
+.section .rodata
+.align 2
+epel_filters:
+        .byte   0,  0,  0,  0
+        .byte  -2, 58, 10, -2
+        .byte  -4, 54, 16, -2
+        .byte  -6, 46, 28, -4
+        .byte  -4, 36, 36, -4
+        .byte  -4, 28, 46, -6
+        .byte  -2, 16, 54, -4
+        .byte  -2, 10, 58, -2
+
+.text
+#include "libavutil/riscv/asm.S"
+#define HEVC_MAX_PB_SIZE 64
+
+/* load one XLEN-wide register */
+.macro  lx rd, addr
+#if (__riscv_xlen == 32)
+        lw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        ld      \rd, \addr
+#else
+        lq      \rd, \addr
+#endif
+.endm
+
+/* store one XLEN-wide register */
+.macro  sx rd, addr
+#if (__riscv_xlen == 32)
+        sw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        sd      \rd, \addr
+#else
+        sq      \rd, \addr
+#endif
+.endm
+
+/* load the four signed taps of filter row m into s1-s4; clobbers t0, t1 */
+.macro  load_filter m
+        la      t0, epel_filters
+        slli    t1, \m, 2               /* 4 bytes per row */
+        add     t0, t0, t1
+        lb      s1, 0(t0)
+        lb      s2, 1(t0)
+        lb      s3, 2(t0)
+        lb      s4, 3(t0)
+.endm
+
+/*
+ * vdst = s1 * src[-1] + s2 * src[0] + s3 * src[1] + s4 * src[2]
+ * (signed taps times unsigned bytes, widened to 2 * SEW).
+ * output is unclipped; clobbers t4
+ */
+.macro  filter_h vdst, vsrc0, vsrc1, vsrc2, vsrc3, src
+        addi    t4, \src, -1
+        vle8.v  \vsrc0, (t4)
+        vmv.v.x \vsrc3, s1              /* vsrc3 is reused as a tap splat first */
+        vwmulsu.vv \vdst, \vsrc3, \vsrc0
+        vle8.v  \vsrc1, (\src)
+        addi    t4, \src, 1
+        vle8.v  \vsrc2, (t4)
+        addi    t4, \src, 2
+        vle8.v  \vsrc3, (t4)
+
+        vwmaccsu.vx \vdst, s2, \vsrc1
+        vwmaccsu.vx \vdst, s3, \vsrc2
+        vwmaccsu.vx \vdst, s4, \vsrc3
+.endm
+
+.macro  hevc_epel_h lmul, lmul2, lmul4
+/* a0: dst (int16_t), a1: src, a2: srcstride, a3: height, a4: mx, a6: width */
+func ff_hevc_put_epel_h_8_\lmul\()_rvv, zve32x
+        addi    sp, sp, -32
+        sx      s1, 0(sp)               /* s1-s4 hold the taps and are callee-saved */
+        sx      s2, 8(sp)
+        sx      s3, 16(sp)
+        sx      s4, 24(sp)
+        load_filter a4
+        mv      t3, a6                  /* t3: columns left in this row */
+        li      t1, 0       # offset
+
+1:
+        vsetvli t6, t3, e8, \lmul, ta, ma
+        add     t2, a1, t1
+        filter_h v0, v16, v18, v20, v22, t2
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        slli    t2, t1, 1               /* dst elements are 2 bytes wide */
+        add     t2, a0, t2
+        vse16.v v0, (t2)
+        sub     t3, t3, t6
+        add     t1, t1, t6
+        bgt     t3, zero, 1b
+        addi    a3, a3, -1              /* next row */
+        mv      t3, a6
+        add     a1, a1, a2
+        addi    a0, a0, 2*HEVC_MAX_PB_SIZE
+        li      t1, 0
+        bgt     a3, zero, 1b
+
+        lx      s1, 0(sp)
+        lx      s2, 8(sp)
+        lx      s3, 16(sp)
+        lx      s4, 24(sp)
+        addi    sp, sp, 32
+        ret
+endfunc
+
+/* a0: dst, a1: dststride, a2: src, a3: srcstride, a4: height, a5: mx, a7: width */
+func ff_hevc_put_epel_uni_h_8_\lmul\()_rvv, zve32x
+        csrwi   vxrm, 0                 /* rnu: round-to-nearest-up */
+        addi    sp, sp, -32
+        sx      s1, 0(sp)
+        sx      s2, 8(sp)
+        sx      s3, 16(sp)
+        sx      s4, 24(sp)
+        load_filter a5
+        mv      t3, a7                  /* t3: columns left in this row */
+        li      t1, 0       # offset
+
+1:
+        vsetvli t6, t3, e8, \lmul, ta, ma
+        add     t2, a2, t1
+        filter_h v0, v16, v18, v20, v22, t2
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vmax.vx v0, v0, zero            /* clamp negatives before the unsigned clip */
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 6            /* rounding shift by 6, saturate to 8 bits */
+        add     t2, a0, t1
+        vse8.v  v0, (t2)
+        sub     t3, t3, t6
+        add     t1, t1, t6
+        bgt     t3, zero, 1b
+        addi    a4, a4, -1              /* next row */
+        mv      t3, a7
+        add     a2, a2, a3
+        add     a0, a0, a1
+        li      t1, 0
+        bgt     a4, zero, 1b
+
+        lx      s1, 0(sp)
+        lx      s2, 8(sp)
+        lx      s3, 16(sp)
+        lx      s4, 24(sp)
+        addi    sp, sp, 32
+        ret
+endfunc
+
+/*
+ * a0: dst, a1: dststride, a2: src, a3: srcstride, a4: height, a5: denom,
+ * a6: wx, a7: ox; mx and width are passed on the stack
+ */
+func ff_hevc_put_epel_uni_w_h_8_\lmul\()_rvv, zve32x
+        csrwi   vxrm, 0                 /* rnu: round-to-nearest-up */
+        lx      t2, 0(sp)   # mx
+        addi    a5, a5, 6   # shift
+#if (__riscv_xlen == 32)
+        lw      t3, 8(sp)   # width
+#elif (__riscv_xlen == 64)
+        lw      t3, 16(sp)
+#endif
+        addi    sp, sp, -32
+        sx      s1, 0(sp)
+        sx      s2, 8(sp)
+        sx      s3, 16(sp)
+        sx      s4, 24(sp)
+        load_filter t2
+        li      t2, 0       # offset
+
+1:
+        vsetvli t6, t3, e8, \lmul, ta, ma
+        add     t1, a2, t2
+        filter_h v8, v16, v18, v20, v22, t1
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vwmul.vx v0, v8, a6             /* 32-bit product with wx */
+        vsetvli zero, zero, e32, \lmul4, ta, ma
+        vssra.vx v0, v0, a5             /* rounding shift by denom + 6 */
+        vsadd.vx v0, v0, a7             /* add ox */
+        vmax.vx v0, v0, zero
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vnclip.wi v0, v0, 0             /* saturating narrow 32 -> 16 */
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 0            /* saturating narrow 16 -> 8 */
+        add     t1, a0, t2
+        vse8.v  v0, (t1)
+        sub     t3, t3, t6
+        add     t2, t2, t6
+        bgt     t3, zero, 1b
+        addi    a4, a4, -1              /* next row */
+#if (__riscv_xlen == 32)
+        lw      t3, 40(sp)              /* reload width: entry offset + 32 */
+#elif (__riscv_xlen == 64)
+        lw      t3, 48(sp)              /* width is an int: lw, as on entry */
+#endif
+        add     a2, a2, a3
+        add     a0, a0, a1
+        li      t2, 0
+        bgt     a4, zero, 1b
+
+        lx      s1, 0(sp)
+        lx      s2, 8(sp)
+        lx      s3, 16(sp)
+        lx      s4, 24(sp)
+        addi    sp, sp, 32
+        ret
+endfunc
+
+/*
+ * a0: dst, a1: dststride, a2: src, a3: srcstride, a4: src2, a5: height,
+ * a6: mx; width is passed on the stack
+ */
+func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x
+        csrwi   vxrm, 0                 /* rnu: round-to-nearest-up */
+        lw      t3, 0(sp)   # width
+        addi    sp, sp, -32
+        sx      s1, 0(sp)
+        sx      s2, 8(sp)
+        sx      s3, 16(sp)
+        sx      s4, 24(sp)
+        load_filter a6
+        li      t1, 0       # offset
+
+1:
+        vsetvli t6, t3, e16, \lmul2, ta, ma
+        slli    t2, t1, 1               /* src2 elements are 2 bytes wide */
+        add     t2, a4, t2
+        vle16.v v12, (t2)
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        add     t2, a2, t1
+        filter_h v0, v16, v18, v20, v22, t2
+        vsetvli zero, zero, e16, \lmul2, ta, ma
+        vsadd.vv v0, v0, v12            /* saturating add of src2 */
+        vmax.vx v0, v0, zero
+        vsetvli zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi v0, v0, 7            /* rounding shift by 7, saturate to 8 bits */
+        add     t2, a0, t1
+        vse8.v  v0, (t2)
+        sub     t3, t3, t6
+        add     t1, t1, t6
+        bgt     t3, zero, 1b
+        addi    a5, a5, -1              /* next row */
+        lw      t3, 32(sp)              /* reload width: entry offset + 32 */
+        add     a2, a2, a3
+        add     a0, a0, a1
+        addi    a4, a4, 2*HEVC_MAX_PB_SIZE
+        li      t1, 0
+        bgt     a5, zero, 1b
+
+        lx      s1, 0(sp)
+        lx      s2, 8(sp)
+        lx      s3, 16(sp)
+        lx      s4, 24(sp)
+        addi    sp, sp, 32
+        ret
+endfunc
+.endm
+
+hevc_epel_h m1, m2, m4
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 480cfd2968..8608fdbd19 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -90,6 +90,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
                 RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 0, ff_hevc_put_qpel_uni_w_v_8_m1_rvv);
                 RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 0, ff_hevc_put_qpel_bi_v_8_m1_rvv);
 
+                RVV_FNASSIGN_PEL(c->put_hevc_epel, 0, 1, ff_hevc_put_epel_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, ff_hevc_put_epel_uni_w_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, ff_hevc_put_epel_bi_h_8_m1_rvv);
                 break;
             default:
                 break;
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
