From: Zhanheng Yang <[email protected]>

Benchmarks on A210 (C908 core, VLEN 128).
put_hevc_epel_h4_8_c:                                  146.2 ( 1.00x)
put_hevc_epel_h4_8_rvv_i32:                             81.8 ( 1.79x)
put_hevc_epel_h6_8_c:                                  305.4 ( 1.00x)
put_hevc_epel_h6_8_rvv_i32:                            115.5 ( 2.65x)
put_hevc_epel_h8_8_c:                                  532.7 ( 1.00x)
put_hevc_epel_h8_8_rvv_i32:                            156.7 ( 3.40x)
put_hevc_epel_h12_8_c:                                1233.8 ( 1.00x)
put_hevc_epel_h12_8_rvv_i32:                           225.7 ( 5.47x)
put_hevc_epel_h16_8_c:                                2223.8 ( 1.00x)
put_hevc_epel_h16_8_rvv_i32:                           296.2 ( 7.51x)
put_hevc_epel_h24_8_c:                                4739.4 ( 1.00x)
put_hevc_epel_h24_8_rvv_i32:                           800.7 ( 5.92x)
put_hevc_epel_h32_8_c:                                8344.4 ( 1.00x)
put_hevc_epel_h32_8_rvv_i32:                          1066.0 ( 7.83x)
put_hevc_epel_h48_8_c:                               18595.3 ( 1.00x)
put_hevc_epel_h48_8_rvv_i32:                          2324.3 ( 8.00x)
put_hevc_epel_h64_8_c:                               32911.2 ( 1.00x)
put_hevc_epel_h64_8_rvv_i32:                          4079.8 ( 8.07x)
put_hevc_epel_uni_h4_8_c:                              225.1 ( 1.00x)
put_hevc_epel_uni_h4_8_rvv_i32:                         99.0 ( 2.27x)
put_hevc_epel_uni_h6_8_c:                              500.0 ( 1.00x)
put_hevc_epel_uni_h6_8_rvv_i32:                        138.1 ( 3.62x)
put_hevc_epel_uni_h8_8_c:                              895.6 ( 1.00x)
put_hevc_epel_uni_h8_8_rvv_i32:                        186.3 ( 4.81x)
put_hevc_epel_uni_h12_8_c:                            1925.0 ( 1.00x)
put_hevc_epel_uni_h12_8_rvv_i32:                       264.4 ( 7.28x)
put_hevc_epel_uni_h16_8_c:                            3372.3 ( 1.00x)
put_hevc_epel_uni_h16_8_rvv_i32:                       342.7 ( 9.84x)
put_hevc_epel_uni_h24_8_c:                            7501.4 ( 1.00x)
put_hevc_epel_uni_h24_8_rvv_i32:                       935.6 ( 8.02x)
put_hevc_epel_uni_h32_8_c:                           13232.0 ( 1.00x)
put_hevc_epel_uni_h32_8_rvv_i32:                      1240.0 (10.67x)
put_hevc_epel_uni_h48_8_c:                           29608.1 ( 1.00x)
put_hevc_epel_uni_h48_8_rvv_i32:                      2710.5 (10.92x)
put_hevc_epel_uni_h64_8_c:                           52452.8 ( 1.00x)
put_hevc_epel_uni_h64_8_rvv_i32:                      4775.5 (10.98x)
put_hevc_epel_uni_w_h4_8_c:                            298.5 ( 1.00x)
put_hevc_epel_uni_w_h4_8_rvv_i32:                      176.6 ( 1.69x)
put_hevc_epel_uni_w_h6_8_c:                            645.3 ( 1.00x)
put_hevc_epel_uni_w_h6_8_rvv_i32:                      254.9 ( 2.53x)
put_hevc_epel_uni_w_h8_8_c:                           1187.0 ( 1.00x)
put_hevc_epel_uni_w_h8_8_rvv_i32:                      335.3 ( 3.54x)
put_hevc_epel_uni_w_h12_8_c:                          2535.6 ( 1.00x)
put_hevc_epel_uni_w_h12_8_rvv_i32:                     487.8 ( 5.20x)
put_hevc_epel_uni_w_h16_8_c:                          4491.0 ( 1.00x)
put_hevc_epel_uni_w_h16_8_rvv_i32:                     641.8 ( 7.00x)
put_hevc_epel_uni_w_h24_8_c:                          9974.7 ( 1.00x)
put_hevc_epel_uni_w_h24_8_rvv_i32:                    1791.4 ( 5.57x)
put_hevc_epel_uni_w_h32_8_c:                         17646.1 ( 1.00x)
put_hevc_epel_uni_w_h32_8_rvv_i32:                    2379.0 ( 7.42x)
put_hevc_epel_uni_w_h48_8_c:                         39569.2 ( 1.00x)
put_hevc_epel_uni_w_h48_8_rvv_i32:                    5226.0 ( 7.57x)
put_hevc_epel_uni_w_h64_8_c:                         70274.5 ( 1.00x)
put_hevc_epel_uni_w_h64_8_rvv_i32:                    9214.3 ( 7.63x)
put_hevc_epel_bi_h4_8_c:                               234.5 ( 1.00x)
put_hevc_epel_bi_h4_8_rvv_i32:                         128.3 ( 1.83x)
put_hevc_epel_bi_h6_8_c:                               505.0 ( 1.00x)
put_hevc_epel_bi_h6_8_rvv_i32:                         177.1 ( 2.85x)
put_hevc_epel_bi_h8_8_c:                               958.2 ( 1.00x)
put_hevc_epel_bi_h8_8_rvv_i32:                         235.2 ( 4.07x)
put_hevc_epel_bi_h12_8_c:                             2001.0 ( 1.00x)
put_hevc_epel_bi_h12_8_rvv_i32:                        338.5 ( 5.91x)
put_hevc_epel_bi_h16_8_c:                             3510.2 ( 1.00x)
put_hevc_epel_bi_h16_8_rvv_i32:                        446.5 ( 7.86x)
put_hevc_epel_bi_h24_8_c:                             7803.2 ( 1.00x)
put_hevc_epel_bi_h24_8_rvv_i32:                       1189.6 ( 6.56x)
put_hevc_epel_bi_h32_8_c:                            13764.5 ( 1.00x)
put_hevc_epel_bi_h32_8_rvv_i32:                       1579.3 ( 8.72x)
put_hevc_epel_bi_h48_8_c:                            30827.4 ( 1.00x)
put_hevc_epel_bi_h48_8_rvv_i32:                       3422.3 ( 9.01x)
put_hevc_epel_bi_h64_8_c:                            54715.6 ( 1.00x)
put_hevc_epel_bi_h64_8_rvv_i32:                       6059.8 ( 9.03x)

Signed-off-by: Zhanheng Yang <[email protected]>
---
 libavcodec/riscv/Makefile            |   3 +-
 libavcodec/riscv/h26x/h2656dsp.h     |  12 ++
 libavcodec/riscv/h26x/hevcepel_rvv.S | 265 +++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c      |   4 +
 4 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/h26x/hevcepel_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 414790ae0c..bf65e827e7 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -37,7 +37,8 @@ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o \
-                                    riscv/h26x/hevcqpel_rvv.o
+                                    riscv/h26x/hevcqpel_rvv.o \
+                                    riscv/h26x/hevcepel_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
 OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 2dabc16aee..fa2f5a88e3 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -47,4 +47,16 @@ void ff_hevc_put_qpel_uni_w_v_8_m1_rvv(uint8_t *_dst,  
ptrdiff_t _dststride,
 void ff_hevc_put_qpel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const 
uint8_t *_src,
         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
         mx, intptr_t my, int width);
+
+void ff_hevc_put_epel_h_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t 
_srcstride, int height,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src,
+        ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const 
uint8_t *_src,
+        ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+        mx, intptr_t my, int width);
 #endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S 
b/libavcodec/riscv/h26x/hevcepel_rvv.S
new file mode 100644
index 0000000000..81044846f7
--- /dev/null
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2026 Alibaba Group Holding Limited.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+.section .rodata                    # read-only: the filter table is never written
+.align 2
+epel_filters:                       # HEVC 4-tap chroma (EPEL) filters, one row per phase 0..7
+    .byte  0,  0,  0,  0
+    .byte -2, 58, 10, -2
+    .byte -4, 54, 16, -2
+    .byte -6, 46, 28, -4
+    .byte -4, 36, 36, -4
+    .byte -4, 28, 46, -6
+    .byte -2, 16, 54, -4
+    .byte -2, 10, 58, -2
+
+.text
+#include "libavutil/riscv/asm.S"
+#define HEVC_MAX_PB_SIZE 64
+
+.macro  lx rd, addr                 # XLEN-sized load (lw/ld/lq)
+#if (__riscv_xlen == 32)
+        lw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        ld      \rd, \addr
+#else
+        lq      \rd, \addr
+#endif
+.endm
+
+.macro  sx rd, addr                 # XLEN-sized store (sw/sd/sq)
+#if (__riscv_xlen == 32)
+        sw      \rd, \addr
+#elif (__riscv_xlen == 64)
+        sd      \rd, \addr
+#else
+        sq      \rd, \addr
+#endif
+.endm
+
+/* load the 4 taps of EPEL phase \m into s1..s4; clobbers t0, t1 */
+.macro load_filter m
+        la          t0, epel_filters
+        slli        t1, \m, 2       # 4 bytes per filter row
+        add         t0, t0, t1
+        lb          s1, 0(t0)
+        lb          s2, 1(t0)
+        lb          s3, 2(t0)
+        lb          s4, 3(t0)
+.endm
+
+/* 4-tap horizontal filter; result in \vdst is widened e16 and unclipped; clobbers t4 */
+.macro filter_h         vdst, vsrc0, vsrc1, vsrc2, vsrc3, src
+        addi             t4, \src, -1
+        vle8.v           \vsrc0, (t4)       # pixels at src - 1 (tap s1)
+        vmv.v.x          \vsrc3, s1
+        vwmulsu.vv       \vdst, \vsrc3, \vsrc0  # signed tap * unsigned pixel -> e16
+        vle8.v           \vsrc1, (\src)     # pixels at src     (tap s2)
+        addi             t4, \src, 1
+        vle8.v           \vsrc2, (t4)       # pixels at src + 1 (tap s3)
+        addi             t4, \src, 2
+        vle8.v           \vsrc3, (t4)       # pixels at src + 2 (tap s4)
+
+        vwmaccsu.vx      \vdst, s2, \vsrc1
+        vwmaccsu.vx      \vdst, s3, \vsrc2
+        vwmaccsu.vx      \vdst, s4, \vsrc3
+.endm
+
+.macro vreg             # FIXME(review): empty and unused leftover -- drop before merging
+
+.endm
+
+.macro hevc_epel_h       lmul, lmul2, lmul4
+func ff_hevc_put_epel_h_8_\lmul\()_rvv, zve32x
+    addi        sp, sp, -32         # spill s1-s4 (callee-saved, hold filter taps)
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a4                  # a4 = mx (horizontal filter phase)
+    mv          t3, a6              # t3 = pixels left in row (a6 = width)
+    li          t1, 0       # offset
+
+1:
+    vsetvli     t6, t3, e8, \lmul, ta, ma
+    add         t2, a1, t1          # t2 = src + offset
+    filter_h    v0, v16, v18, v20, v22, t2
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    slli        t2, t1, 1           # dst is int16_t: byte offset = 2 * offset
+    add         t2, a0, t2
+    vse16.v     v0, (t2)            # store unclipped 16-bit intermediates
+    sub         t3, t3, t6
+    add         t1, t1, t6
+    bgt         t3, zero, 1b        # more pixels left in this row
+    addi        a3, a3, -1          # a3 = height
+    mv          t3, a6
+    add         a1, a1, a2          # src += srcstride
+    addi        a0, a0, 2*HEVC_MAX_PB_SIZE  # dst rows are MAX_PB_SIZE int16_t apart
+    li          t1, 0
+    bgt         a3, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_uni_h_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0             # RNU rounding for vnclipu below
+    addi        sp, sp, -32         # spill s1-s4 (callee-saved, hold filter taps)
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a5                  # a5 = mx (horizontal filter phase)
+    mv          t3, a7              # t3 = pixels left in row (a7 = width)
+    li          t1, 0       # offset
+
+1:
+    vsetvli     t6, t3, e8, \lmul, ta, ma
+    add         t2, a2, t1          # t2 = src + offset
+    filter_h    v0, v16, v18, v20, v22, t2
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vmax.vx     v0, v0, zero        # clamp negatives before unsigned narrowing
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 6           # (v + 32) >> 6, clipped to u8 (vxrm = RNU)
+    add         t2, a0, t1
+    vse8.v      v0, (t2)
+    sub         t3, t3, t6
+    add         t1, t1, t6
+    bgt         t3, zero, 1b        # more pixels left in this row
+    addi        a4, a4, -1          # a4 = height
+    mv          t3, a7
+    add         a2, a2, a3          # src += srcstride
+    add         a0, a0, a1          # dst += dststride
+    li          t1, 0
+    bgt         a4, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_uni_w_h_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0             # RNU rounding for vssra/vnclip below
+    lx          t2, 0(sp)       # mx
+    addi        a5, a5, 6       # shift = denom + 14 - 8
+#if (__riscv_xlen == 32)
+    lw          t3, 8(sp)       # width
+#elif (__riscv_xlen == 64)
+    lw          t3, 16(sp)
+#endif
+    addi        sp, sp, -32         # spill s1-s4; caller stack args move up by 32
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter t2
+    li          t2, 0           # offset
+
+1:
+    vsetvli     t6, t3, e8, \lmul, ta, ma
+    add         t1, a2, t2          # t1 = src + offset
+    filter_h    v8, v16, v18, v20, v22, t1
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vwmul.vx    v0, v8, a6          # * wx, widened to e32
+    vsetvli     zero, zero, e32, \lmul4, ta, ma
+    vssra.vx    v0, v0, a5          # rounding shift right by (denom + 6)
+    vsadd.vx    v0, v0, a7          # + ox, saturating
+    vmax.vx     v0, v0, zero        # clamp negatives before unsigned narrowing
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vnclip.wi   v0, v0, 0           # e32 -> e16 with saturation
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 0           # e16 -> u8 with clip
+    add         t1, a0, t2
+    vse8.v      v0, (t1)
+    sub         t3, t3, t6
+    add         t2, t2, t6
+    bgt         t3, zero, 1b
+    addi        a4, a4, -1          # a4 = height
+#if (__riscv_xlen == 32)
+    lw          t3, 40(sp)
+#elif (__riscv_xlen == 64)
+    lw          t3, 48(sp)          # width is a 32-bit int: use lw like the load above, not ld
+#endif
+    add         a2, a2, a3          # src += srcstride
+    add         a0, a0, a1          # dst += dststride
+    li          t2, 0
+    bgt         a4, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0            # RNU rounding for vnclipu below
+    lw          t3, 0(sp)      # width
+    addi        sp, sp, -32        # spill s1-s4; caller stack args move up by 32
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a6                 # a6 = mx (horizontal filter phase)
+    li          t1, 0          # offset
+
+1:
+    vsetvli     t6, t3, e16, \lmul2, ta, ma
+    slli        t2, t1, 1          # src2 is int16_t: byte offset = 2 * offset
+    add         t2, a4, t2
+    vle16.v     v12, (t2)          # v12 = second (intermediate) prediction
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    add         t2, a2, t1         # t2 = src + offset
+    filter_h    v0, v16, v18, v20, v22, t2
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vsadd.vv    v0, v0, v12        # add src2, saturating
+    vmax.vx     v0, v0, zero       # clamp negatives before unsigned narrowing
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 7          # (v + 64) >> 7, clipped to u8 (vxrm = RNU)
+    add         t2, a0, t1
+    vse8.v      v0, (t2)
+    sub         t3, t3, t6
+    add         t1, t1, t6
+    bgt         t3, zero, 1b       # more pixels left in this row
+    addi        a5, a5, -1         # a5 = height
+    lw          t3, 32(sp)         # reload width (shifted by the 32-byte frame)
+    add         a2, a2, a3         # src += srcstride
+    add         a0, a0, a1         # dst += dststride
+    addi        a4, a4, 2*HEVC_MAX_PB_SIZE  # src2 rows are MAX_PB_SIZE int16_t apart
+    li          t1, 0
+    bgt         a5, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+.endm
+
+hevc_epel_h m1, m2, m4  # instantiate the LMUL=1 variants (file must end with a newline)
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 480cfd2968..8608fdbd19 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -90,6 +90,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int 
bit_depth)
                 RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 0, 
ff_hevc_put_qpel_uni_w_v_8_m1_rvv);
                 RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 0, 
ff_hevc_put_qpel_bi_v_8_m1_rvv);
 
+                RVV_FNASSIGN_PEL(c->put_hevc_epel, 0, 1, 
ff_hevc_put_epel_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, 
ff_hevc_put_epel_uni_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, 
ff_hevc_put_epel_uni_w_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, 
ff_hevc_put_epel_bi_h_8_m1_rvv);
                 break;
             default:
                 break;
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to