From: daichengrong <daichengr...@iscas.ac.cn>
riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8
On Banana PI F3:
hevc_idct_32x32_8_c: 118945.0 ( 1.00x)
hevc_idct_32x32_8_rvv_i64: 28503.7 ( 4.17x)
Signed-off-by: daichengrong <daichengr...@iscas.ac.cn>
---
libavcodec/riscv/Makefile | 1 +
libavcodec/riscv/hevcdsp_idct_rvv.S | 973 ++++++++++++++++++++++++++++
libavcodec/riscv/hevcdsp_init.c | 52 +-
3 files changed, 1006 insertions(+), 20 deletions(-)
create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o
riscv/h264dsp_rvv.o \
OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..561b8ada47
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
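+/* HEVC inverse-transform coefficients: the 4-point factors first, then the
+ * odd factors of the 8-, 16- and 32-point transforms. */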
+const trans, align=4
+ .2byte 64, 83, 64, 36
+ .2byte 89, 75, 50, 18
+ .2byte 90, 87, 80, 70
+ .2byte 57, 43, 25, 9
+ .2byte 90, 90, 88, 85
+ .2byte 82, 78, 73, 67
+ .2byte 61, 54, 46, 38
+ .2byte 31, 22, 13, 4
+endconst
+
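+/* Widening multiply-accumulate: \out (e32) += \c * \in (e16); a '-' op
+ * negates the scalar first, and \p == 2 uses the upper four 16-bit lanes
+ * of \in instead of the lower ones. */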
+.macro sum_sub out, in, c, op, p
+ mv t0, \c
+ .ifc \op, -
+ neg t0, t0
+ .endif
+ vsetivli zero, 4, e16, mf2, tu, ma
+ .ifc \p, 2
+ vslidedown.vi v8, \in, 4
+ vwmacc.vx \out, t0, v8
+ .else
+ vwmacc.vx \out, t0, \in
+ .endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+ sum_sub v24, \in, \t0, \op0, \p
+ sum_sub v25, \in, \t1, \op1, \p
+ sum_sub v26, \in, \t2, \op2, \p
+ sum_sub v27, \in, \t3, \op3, \p
+.endm
+
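+/* Sum and difference of the even and odd parts into \tmp_p and \tmp_m (e32). */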
+.macro butterfly e, o, tmp_p, tmp_m
+ vsetivli zero, 4, e32, m1, tu, ma
+ vadd.vv \tmp_p, \e, \o
+ vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+ vsetivli zero, 4, e32, m1, tu, ma
+ vadd.vv v20, \in0, \in1
+ vsub.vv \in0, \in0, \in1
+ vadd.vv \in1, \in2, \in3
+ vsub.vv \in2, \in2, \in3
+ vadd.vv \in3, \in4, \in5
+ vsub.vv \in4, \in4, \in5
+ vadd.vv \in5, \in6, \in7
+ vsub.vv \in6, \in6, \in7
+.endm
+
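+/* Spill \in to the scratch buffer at (s0), reload its first four 16-bit lanes
+ * as scalars (vwmul.vx only uses the low SEW bits of the source register) and
+ * start the four widening products with v4. */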
+.macro multiply in
+ vsetivli zero, 4, e16, m1, tu, ma
+ vse16.v \in, (s0)
+ ld s2, 0*2(s0)
+ ld s3, 1*2(s0)
+ ld s4, 2*2(s0)
+ ld s5, 3*2(s0)
+
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vwmul.vx v24, v4, s2
+ vwmul.vx v25, v4, s3
+ vwmul.vx v26, v4, s4
+ vwmul.vx v27, v4, s5
+.endm
+
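+/* tr_block1..tr_block4 accumulate the products of the odd-position input rows
+ * (v4-v7, v16-v19) with the 32-point coefficients held in v0-v3, four output
+ * columns at a time.  x10-x25 are saved around their use as coefficient
+ * scalars, which are reloaded from v0-v3 through the scratch buffer at (s0). */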
+func tr_block1, zve64x
+ multiply v0
+
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, tu, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x11, x14, x17, x20, +, +, +, +, 2
+ add_member32 v5, x12, x17, x22, x24, +, +, +, -
+ add_member32 v5, x13, x20, x24, x17, +, +, -, -, 2
+ add_member32 v6, x14, x23, x19, x10, +, +, -, -
+ add_member32 v6, x15, x25, x14, x16, +, -, -, -, 2
+ add_member32 v7, x16, x22, x10, x23, +, -, -, -
+ add_member32 v7, x17, x19, x15, x21, +, -, -, +, 2
+ add_member32 v16, x18, x16, x20, x14, +, -, -, +
+ add_member32 v16, x19, x13, x25, x12, +, -, -, +, 2
+ add_member32 v17, x20, x11, x21, x19, +, -, +, +
+ add_member32 v17, x21, x12, x16, x25, +, -, +, -, 2
+ add_member32 v18, x22, x15, x11, x18, +, -, +, -
+ add_member32 v18, x23, x18, x13, x11, +, -, +, -, 2
+ add_member32 v19, x24, x21, x18, x15, +, -, +, -
+ add_member32 v19, x25, x24, x23, x22, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block2, zve64x
+ multiply v1
+
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, tu, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x23, x25, x22, x19, +, -, -, -, 2
+ add_member32 v5, x19, x14, x10, x15, -, -, -, -
+ add_member32 v5, x10, x16, x23, x21, -, -, -, +, 2
+ add_member32 v6, x18, x24, x15, x13, -, +, +, +
+ add_member32 v6, x24, x13, x17, x23, +, +, +, -, 2
+ add_member32 v7, x15, x17, x21, x10, +, +, -, -
+ add_member32 v7, x13, x23, x11, x25, +, -, -, +, 2
+ add_member32 v16, x22, x12, x24, x11, +, -, -, +
+ add_member32 v16, x20, x18, x14, x24, -, -, +, +, 2
+ add_member32 v17, x11, x22, x18, x12, -, +, +, -
+ add_member32 v17, x17, x11, x20, x22, -, +, -, -, 2
+ add_member32 v18, x25, x19, x12, x14, +, +, -, +
+ add_member32 v18, x16, x21, x25, x20, +, -, -, +, 2
+ add_member32 v19, x12, x11, x13, x16, +, -, +, -
+ add_member32 v19, x21, x20, x19, x18, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block3, zve64x
+ multiply v2
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, tu, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x16, x13, x10, x12, -, -, -, -, 2
+ add_member32 v5, x20, x25, x21, x16, -, -, +, +
+ add_member32 v5, x14, x12, x19, x25, +, +, +, -, 2
+ add_member32 v6, x22, x20, x11, x17, +, -, -, -
+ add_member32 v6, x12, x18, x22, x10, -, -, +, +, 2
+ add_member32 v7, x24, x14, x18, x20, -, +, +, -
+ add_member32 v7, x10, x24, x12, x22, +, +, -, -, 2
+ add_member32 v16, x25, x11, x23, x13, -, -, +, +
+ add_member32 v16, x11, x21, x17, x15, -, +, +, -, 2
+ add_member32 v17, x23, x17, x13, x24, +, +, -, +
+ add_member32 v17, x13, x15, x24, x18, +, -, +, +, 2
+ add_member32 v18, x21, x23, x16, x11, -, -, +, -
+ add_member32 v18, x15, x10, x14, x19, -, +, -, +, 2
+ add_member32 v19, x19, x22, x25, x23, +, -, +, +
+ add_member32 v19, x17, x16, x15, x14, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+func tr_block4, zve64x
+ multiply v3
+ addi sp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ sd x\i,8*(\i - 10)(sp)
+.endr
+ vsetivli zero, 4, e16, m1, tu, ma
+ vse16.v v0, (s0)
+ ld x10, 0*2(s0)
+ ld x11, 1*2(s0)
+ ld x12, 2*2(s0)
+ ld x13, 3*2(s0)
+ vse16.v v1, (s0)
+ ld x14, 0*2(s0)
+ ld x15, 1*2(s0)
+ ld x16, 2*2(s0)
+ ld x17, 3*2(s0)
+ vse16.v v2, (s0)
+ ld x18, 0*2(s0)
+ ld x19, 1*2(s0)
+ ld x20, 2*2(s0)
+ ld x21, 3*2(s0)
+ vse16.v v3, (s0)
+ ld x22, 0*2(s0)
+ ld x23, 1*2(s0)
+ ld x24, 2*2(s0)
+ ld x25, 3*2(s0)
+
+ add_member32 v4, x15, x18, x21, x24, -, -, -, -, 2
+ add_member32 v5, x10, x13, x18, x23, +, +, +, +
+ add_member32 v5, x18, x10, x15, x22, -, -, -, -, 2
+ add_member32 v6, x25, x16, x12, x21, +, +, +, +
+ add_member32 v6, x19, x21, x10, x20, +, -, -, -, 2
+ add_member32 v7, x12, x25, x13, x19, -, -, +, +
+ add_member32 v7, x14, x20, x16, x18, +, +, -, -, 2
+ add_member32 v16, x21, x15, x19, x17, -, -, +, +
+ add_member32 v16, x23, x11, x22, x16, -, +, -, -, 2
+ add_member32 v17, x16, x14, x25, x15, +, -, +, +
+ add_member32 v17, x11, x19, x23, x14, -, +, +, -, 2
+ add_member32 v18, x17, x24, x20, x13, +, -, -, +
+ add_member32 v18, x24, x22, x17, x12, -, -, +, -, 2
+ add_member32 v19, x20, x17, x14, x11, -, +, -, +
+ add_member32 v19, x13, x12, x11, x10, +, -, +, -, 2
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+ ld x\i, 8*(\i - 10)(sp)
+.endr
+ addi sp, sp, 8*16
+
+ ret
+endfunc
+
+.macro butterfly32 in0, in1, in2, in3, out
+ vsetivli zero, 4, e32, m1, tu, ma
+ vadd.vv \out, \in0, \in1
+ vsub.vv \in0, \in0, \in1
+ vadd.vv \in1, \in2, \in3
+ vsub.vv \in2, \in2, \in3
+.endm
+
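+/* load16/store16 move four rows of four int16 each: the low 64 bits of every
+ * vector come from (a1) and the high 64 bits from (a3), with a1/a3 advancing
+ * by the a2 stride (store16 steps a3 by \rx instead). */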
+.macro load16 in0, in1, in2, in3
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \in0, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v \in0, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \in1, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v \in1, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \in2, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v \in2, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \in3, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v \in3, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+.endm
+
+.macro store16 in0, in1, in2, in3, rx
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v \in0, (a1)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1down.vx v8, \in0, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v v8, (a3)
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v \in1, (a1)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1down.vx v8, \in1, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v v8, (a3)
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v \in2, (a1)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1down.vx v8, \in2, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v v8, (a3)
+ add a1, a1, a2
+ add a3, a3, \rx
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v \in3, (a1)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1down.vx v8, \in3, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vse64.v v8, (a3)
+ add a1, a1, a2
+ add a3, a3, \rx
+.endm
+
+.macro load32
+ addi a1, a5, 64
+ addi a3, a1, 128
+ li a2, 256
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v4, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v4, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v5, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v5, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v6, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v6, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v7, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v7, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v16, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v16, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v17, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v17, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v18, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v18, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v8, (a3)
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx v19, v8, zero
+ vsetivli zero, 1, e64, m1, tu, ma
+ vle64.v v19, (a1)
+ add a1, a1, a2
+ add a3, a3, a2
+
+.endm
+
+.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p
+ sum_sub v21, \in, \tt0, \op0, \p
+ sum_sub v22, \in, \tt1, \op1, \p
+ sum_sub v23, \in, \tt2, \op2, \p
+ sum_sub v24, \in, \tt3, \op3, \p
+ sum_sub v25, \in, \tt4, \op4, \p
+ sum_sub v26, \in, \tt5, \op5, \p
+ sum_sub v27, \in, \tt6, \op6, \p
+ sum_sub v28, \in, \tt7, \op7, \p
+.endm
+
+.macro scale_store shift
+ vsetivli zero, 8, e16, m1, tu, ma
+ vle16.v v28, (a4)
+ addi a4, a4, 2*8
+ vle16.v v29, (a4)
+ addi a4, a4, 2*8
+ vle16.v v30, (a4)
+ addi a4, a4, 2*8
+ vle16.v v31, (a4)
+ addi a4, a4, 2*8
+
+ butterfly32 v28, v24, v29, v25, v2
+ butterfly32 v30, v26, v31, v27, v3
+
+ scale v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift
+
+ transpose16_4x4_2 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+
+ store16 v20, v21, v22, v23, t1
+
+ vsetivli zero, 4, e16, m1, tu, ma
+ vle16.v v2, (t2)
+ addi t2, t2, 8
+ vle16.v v3, (t2)
+ addi t2, t2, -8
+.endm
+
+.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
+ li a7, \off1
+ add a1, sp, a7
+ li a7, \off2
+ add a3, sp, a7
+ li a2, -16
+ li a4, 16
+
+ vsetivli zero, 4, e32, m1, tu, ma
+ vse32.v \in0, (a1)
+ add a1, a1, a4
+ vse32.v \in1, (a3)
+ add a3, a3, a2
+ vse32.v \in2, (a1)
+ add a1, a1, a4
+ vse32.v \in3, (a3)
+ add a3, a3, a2
+ vse32.v \in4, (a1)
+ add a1, a1, a4
+ vse32.v \in5, (a3)
+ add a3, a3, a2
+ vse32.v \in6, (a1)
+ vse32.v \in7, (a3)
+.endm
+
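+/* Transpose two 4x4 int16 blocks held in the low and high halves of
+ * v\r0..v\r3.  v0 is spilled to (s0) so it can serve as the odd-lane merge
+ * mask, and is restored at the end. */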
+.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
+ vsetivli zero, 8, e16, m1, tu, ma
+ vse16.v v0, (s0)
+
+ vsetivli zero, 8, e16, m1, tu, ma
+ vid.v v0
+ vand.vi v8, v0, 1
+ vmsne.vi v0, v8, 0
+
+ vslideup.vi v8, v\r1, 1
+ vsetivli zero, 4, e16, m1, ta, ma
+ vmerge.vvm v\tmp0\(), v\r0\(), v8, v0
+
+ vslidedown.vi v8, v\r0, 1
+ vmerge.vvm v\tmp1\(), v8, v\r1\(), v0
+
+ vslideup.vi v8, v\r3, 1
+ vmerge.vvm v\tmp2\(), v\r2\(), v8, v0
+
+ vslidedown.vi v8, v\r2\(), 1
+ vmerge.vvm v\tmp3\(), v8, v\r3\(), v0
+
+ vsetivli zero, 2, e32, m1, tu, ma
+
+ vslideup.vi v8, v\tmp2\(), 1
+ vmerge.vvm v\tmp4\(), v\tmp0\(), v8, v0
+
+ vslidedown.vi v8, v\tmp0\(), 1
+ vmerge.vvm v\tmp5\(), v8, v\tmp2\(), v0
+
+ vslideup.vi v8, v\tmp3\(), 1
+ vmerge.vvm v\tmp0\(), v\tmp1\(), v8, v0
+
+ vslidedown.vi v8, v\tmp1\(), 1
+ vmerge.vvm v\tmp2\(), v8, v\tmp3\(), v0
+
+ vsetivli zero, 1, e64, m1, tu, ma
+ vmv.v.v v\r0\(), v\tmp4\()
+ vmv.v.v v\r2\(), v\tmp5\()
+ vmv.v.v v\r1\(), v\tmp0\()
+ vmv.v.v v\r3\(), v\tmp2\()
+
+ vsetivli zero, 8, e16, m1, tu, ma
+
+ vslideup.vi v8, v\r2\(), 1
+ vmerge.vvm v\tmp0\(), v\r3\(), v8, v0
+
+ vslidedown.vi v8, v\r3\(), 1
+ vmerge.vvm v\tmp1\(), v8, v\r2\(), v0
+
+ vslideup.vi v8, v\r0\(), 1
+ vmerge.vvm v\tmp2\(), v\r1\(), v8, v0
+
+ vslidedown.vi v8, v\r1\(), 1
+ vmerge.vvm v\tmp3\(), v8, v\r0\(), v0
+
+ vsetivli zero, 4, e32, m1, tu, ma
+
+ vslideup.vi v8, v\tmp2\(), 1
+ vmerge.vvm v\tmp4\(), v\tmp0\(), v8, v0
+
+ vslidedown.vi v8, v\tmp0\(), 1
+ vmerge.vvm v\tmp5\(), v8, v\tmp2\(), v0
+
+ vslideup.vi v8, v\tmp3\(), 1
+ vmerge.vvm v\tmp0\(), v\tmp1\(), v8, v0
+
+ vslidedown.vi v8, v\tmp1\(), 1
+ vmerge.vvm v\tmp2\(), v8, v\tmp3\(), v0
+
+ vsetivli zero, 2, e64, m1, tu, ma
+
+ vmerge.vvm v\r3\(), v\r3\(), v\tmp4\(), v0
+ vmerge.vvm v\r1\(), v\r1\(), v\tmp5\(), v0
+ vmerge.vvm v\r2\(), v\r2\(), v\tmp0\(), v0
+ vmerge.vvm v\r0\(), v\r0\(), v\tmp2\(), v0
+
+ vsetivli zero, 8, e16, m1, tu, ma
+ vle16.v v0, (s0)
+.endm
+
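+/* One 8x4 slice of the 16-point transform: tr_4x4_8 yields the even part,
+ * the odd part is accumulated with the factors loaded into s6-s9 from the
+ * upper half of v0, and the butterflied 32-bit results are parked on the
+ * stack at \offset. */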
+.macro tr16_8x4 in0, in1, in2, in3, offset
+ tr_4x4_8 \in0, \in1, \in2, \in3, v24, v25, v26, v27
+
+ vsetivli zero, 8, e16, m1, tu, ma
+ vse16.v v0, (s0)
+ lh s6, 4*2(s0)
+ lh s7, 5*2(s0)
+ lh s8, 6*2(s0)
+ lh s9, 7*2(s0)
+
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vslidedown.vi v8, \in0, 4
+ vwmul.vx v28, v8, s6
+ vslidedown.vi v8, \in0, 4
+ vwmul.vx v29, v8, s7
+ vslidedown.vi v8, \in0, 4
+ vwmul.vx v30, v8, s8
+ vslidedown.vi v8, \in0, 4
+ vwmul.vx v31, v8, s9
+
+ sum_sub v28, \in1, s7, +, 2
+ sum_sub v29, \in1, s9, -, 2
+ sum_sub v30, \in1, s6, -, 2
+ sum_sub v31, \in1, s8, -, 2
+ sum_sub v28, \in2, s8, +, 2
+ sum_sub v29, \in2, s6, -, 2
+ sum_sub v30, \in2, s9, +, 2
+ sum_sub v31, \in2, s7, +, 2
+ sum_sub v28, \in3, s9, +, 2
+ sum_sub v29, \in3, s8, -, 2
+ sum_sub v30, \in3, s7, +, 2
+ sum_sub v31, \in3, s6, -, 2
+
+ butterfly v24, v28, v16, v23
+ butterfly v25, v29, v17, v22
+ butterfly v26, v30, v18, v21
+ butterfly v27, v31, v19, v20
+
+ li a7, \offset
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, tu, ma
+ vse32.v v16, (a4)
+ add a4, a4, 16
+ vse32.v v17, (a4)
+ add a4, a4, 16
+ vse32.v v18, (a4)
+ add a4, a4, 16
+ vse32.v v19, (a4)
+ add a4, a4, 16
+
+ vse32.v v20, (a4)
+ add a4, a4, 16
+ vse32.v v21, (a4)
+ add a4, a4, 16
+ vse32.v v22, (a4)
+ add a4, a4, 16
+ vse32.v v23, (a4)
+ add a4, a4, 16
+
+ add a4, a4, -64
+.endm
+
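+/* Narrow eight e32 accumulators back to e16 with a rounding right shift
+ * (vnclip.wi; the rounding mode is the vxrm value set by the caller) and
+ * pack them pairwise into \out0..\out3. */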
+.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi \out0\(), \in0\(), \shift
+ vsetivli zero, 1, e64, m1, tu, ma
+ vmv.x.s a7, \out0\()
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi v8, \in1\(), \shift
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \out0\(), v8, a7
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi \out1\(), \in2\(), \shift
+ vsetivli zero, 1, e64, m1, tu, ma
+ vmv.x.s a7, \out1\()
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi v8, \in3\(), \shift
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \out1\(), v8, a7
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi \out2\(), \in4\(), \shift
+ vsetivli zero, 1, e64, m1, tu, ma
+ vmv.x.s a7, \out2\()
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi v8, \in5\(), \shift
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \out2\(), v8, a7
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi \out3\(), \in6\(), \shift
+ vsetivli zero, 1, e64, m1, tu, ma
+ vmv.x.s a7, \out3\()
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vnclip.wi v8, \in7\(), \shift
+ vsetivli zero, 2, e64, m1, tu, ma
+ vslide1up.vx \out3\(), v8, a7
+.endm
+
+.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
+ vsetivli zero, 4, e16, m1, tu, ma
+ vwcvt.x.x.v v8, \in0
+ vsetivli zero, 4, e32, m1, tu, ma
+ vsll.vi v28, v8, 6
+
+ vsetivli zero, 16, e8, m1, tu, ma
+ vmv.v.v v29, v28
+
+ vsetivli zero, 1, e16, m1, tu, ma
+ vmv.x.s s2, v0
+ vslidedown.vi v12, v0, 1
+ vmv.x.s s3, v12
+ vslidedown.vi v12, v0, 3
+ vmv.x.s s5, v12
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vwmul.vx v30, \in1, s3
+ vwmul.vx v31, \in1, s5
+ vwmacc.vx v28, s2, \in2
+ neg s2, s2
+ vwmacc.vx v29, s2, \in2
+ neg s2, s2
+ vwmacc.vx v30, s5, \in3
+ neg s3, s3
+ vwmacc.vx v31, s3, \in3
+ neg s3, s3
+
+ vsetivli zero, 4, e32, m1, tu, ma
+ vadd.vv \out0, v28, v30
+ vadd.vv \out1, v29, v31
+ vsub.vv \out2, v29, v31
+ vsub.vv \out3, v28, v30
+.endm
+
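+/* 16x4 column transform: the even half comes from tr16_8x4 (kept on the
+ * stack at \offset), the odd half is accumulated with add_member, and the
+ * result is either scaled and stored (\shift > 0) or left on the stack for
+ * the enclosing 32-point pass. */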
+.macro tr_16x4 name, shift, offset, step
+func func_tr_16x4_\name, zve64x
+ mv a1, a5
+ addi a3, a5, \step * 64
+ li a2, \step * 128
+ load16 v16, v17, v18, v19
+
+ lla a1, trans
+
+ vsetivli zero, 8, e16, m1, tu, ma
+ vle16.v v0, (a1)
+
+ tr16_8x4 v16, v17, v18, v19, \offset
+
+ addi a1, a5, \step * 32
+ addi a3, a5, \step * 3 *32
+ li a2, \step * 128
+ load16 v20, v17, v18, v19
+
+ lla a1, trans
+ addi a1, a1, 16
+
+ vsetivli zero, 8, e16, m1, tu, ma
+ vle16.v v1, (a1)
+
+ lh s2, 0*2(a1)
+ lh s3, 1*2(a1)
+ lh s4, 2*2(a1)
+ lh s5, 3*2(a1)
+ lh s6, 4*2(a1)
+ lh s7, 5*2(a1)
+ lh s8, 6*2(a1)
+ lh s9, 7*2(a1)
+
+ vsetivli zero, 4, e16, mf2, tu, ma
+ vwmul.vx v21, v20, s2
+ vwmul.vx v22, v20, s3
+ vwmul.vx v23, v20, s4
+ vwmul.vx v24, v20, s5
+ vwmul.vx v25, v20, s6
+ vwmul.vx v26, v20, s7
+ vwmul.vx v27, v20, s8
+ vwmul.vx v28, v20, s9
+
+ add_member v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2
+ add_member v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, +
+ add_member v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2
+ add_member v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, +
+ add_member v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2
+ add_member v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, +
+ add_member v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2
+
+ li a7, \offset
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, tu, ma
+ vle32.v v16, (a4)
+ addi a4, a4, 16
+ vle32.v v17, (a4)
+ addi a4, a4, 16
+ vle32.v v18, (a4)
+ addi a4, a4, 16
+ vle32.v v19, (a4)
+ addi a4, a4, 16
+
+ butterfly16 v16, v21, v17, v22, v18, v23, v19, v24
+ .if \shift > 0
+ scale v29, v30, v31, v24, v20, v16, v21, v17, v22, v18, v23, v19, \shift
+
+ transpose16_4x4_2 29, 30, 31, 24, 2, 3, 4, 5, 6, 7
+
+ mv a1, a6
+ addi a3, a6, 24 + 3 * 32
+ li a2, 32
+ li a4, -32
+
+ store16 v29, v30, v31, v24, a4
+ .else
+ store_to_stack \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16
+ .endif
+
+ li a7, \offset+64
+ add a4, sp, a7
+
+ vsetivli zero, 4, e32, m1, tu, ma
+ vle32.v v16, (a4)
+ addi a4, a4, 16
+ vle32.v v17, (a4)
+ addi a4, a4, 16
+ vle32.v v18, (a4)
+ addi a4, a4, 16
+ vle32.v v19, (a4)
+ addi a4, a4, 16
+
+ butterfly16 v16, v25, v17, v26, v18, v27, v19, v28
+ .if \shift > 0
+ scale v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift
+
+ transpose16_4x4_2 29, 30, 31, 20, 2, 3, 4, 5, 6, 7
+
+ add a1, a6, 8
+ add a3, a6, (16 + 3 * 32)
+ li a2, 32
+ li a4, -32
+ store16 v29, v30, v31, v20, a4
+ .else
+ store_to_stack (\offset + 64), (\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16
+ .endif
+ ret
+endfunc
+.endm
+
+tr_16x4 noscale, 0, 2048, 4
+
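+/* 32x4 column transform: reuse the 16-point pass for the even half, then add
+ * the 16 odd inputs via tr_block1..4 and write four scaled 8x4 tiles with
+ * scale_store.  t3 preserves the caller's return address across the nested
+ * jal calls. */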
+.macro tr_32x4 name, shift
+func func_tr_32x4_\name, zve64x
+ mv t3, ra
+ jal func_tr_16x4_noscale
+
+ load32
+
+ lla t2, trans
+ addi t2, t2, 32
+
+ vsetivli zero, 4, e16, m1, tu, ma
+ vle16.v v0, (t2)
+ addi t2, t2, 2*4
+ vle16.v v1, (t2)
+ addi t2, t2, 2*4
+ vle16.v v2, (t2)
+ addi t2, t2, 2*4
+ vle16.v v3, (t2)
+ addi t2, t2, -2*4
+
+ li a7, 2048
+ add a4, sp, a7
+
+ li a2, 64
+ li t1, -64
+
+ jal tr_block1
+ mv a1, t4
+ addi a3, t4, (56 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block2
+ addi a1, t4, 8
+ addi a3, t4, (48 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block3
+ addi a1, t4, 16
+ addi a3, t4, (40 + 3 * 64)
+ scale_store \shift
+
+ jal tr_block4
+ addi a1, t4, 24
+ addi a3, t4, (32 + 3 * 64)
+ scale_store \shift
+
+ jr t3
+endfunc
+.endm
+
+tr_32x4 firstpass, 7
+tr_32x4 secondpass_8, 20 - 8
+
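+/* Full 32x32 IDCT: eight 32x4 strips are transformed in a first pass
+ * (shift 7) into a temporary buffer on the stack, then eight strips of the
+ * second pass (shift 20 - bitdepth) write back to the coefficient array. */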
+.macro idct_32x32 bitdepth
+func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x
+ mv t6, ra
+ addi sp, sp, -8*13
+ sd ra, 8*12(sp)
+ sd s0, 8*11(sp)
+ sd s1, 8*10(sp)
+ sd s2, 8*9(sp)
+ sd s3, 8*8(sp)
+ sd s4, 8*7(sp)
+ sd s5, 8*6(sp)
+ sd s6, 8*5(sp)
+ sd s7, 8*4(sp)
+ sd s8, 8*3(sp)
+ sd s9, 8*2(sp)
+ sd s10, 8*1(sp)
+ sd s11, 8*0(sp)
+
+ add sp, sp, -16
+ mv s0, sp
+
+ csrwi vxrm, 1
+ li a7, 2432
+ sub sp, sp, a7
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ li a7, 8 * \i
+ add a5, a0, a7
+
+ li a7, 8 * \i * 32
+ add t4, sp, a7
+ jal func_tr_32x4_firstpass
+.endr
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ addi a5, sp, 8 * \i
+ addi t4, a0, 8 * \i * 32
+ jal func_tr_32x4_secondpass_\bitdepth
+.endr
+
+ li a7, 2432
+ add sp, sp, a7
+
+ add sp, sp, 16
+
+ ld ra, 8*12(sp)
+ ld s0, 8*11(sp)
+ ld s1, 8*10(sp)
+ ld s2, 8*9(sp)
+ ld s3, 8*8(sp)
+ ld s4, 8*7(sp)
+ ld s5, 8*6(sp)
+ ld s6, 8*5(sp)
+ ld s7, 8*4(sp)
+ ld s8, 8*3(sp)
+ ld s9, 8*2(sp)
+ ld s10, 8*1(sp)
+ ld s11, 8*0(sp)
+ addi sp, sp, 8*13
+
+ jr t6
+endfunc
+.endm
+
+idct_32x32 8
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 1d8326a573..6dfb889eec 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -27,6 +27,8 @@
#include "libavcodec/hevc/dsp.h"
#include "libavcodec/riscv/h26x/h2656dsp.h"
+void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit);
+
#define RVV_FNASSIGN(member, v, h, fn, ext) \
member[1][v][h] = ff_h2656_put_pixels_##8_##ext; \
member[3][v][h] = ff_h2656_put_pixels_##8_##ext; \
@@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
const int flags = av_get_cpu_flags();
int vlenb;
- if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB))
- return;
-
vlenb = ff_get_rv_vlenb();
- if (vlenb >= 32) {
- switch (bit_depth) {
- case 8:
- RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
- RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
- break;
- default:
- break;
- }
- } else if (vlenb >= 16) {
- switch (bit_depth) {
- case 8:
- RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
- RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
- break;
- default:
- break;
+
+ if (flags & AV_CPU_FLAG_RVV_I64)
+ if (vlenb >= 16)
+ switch (bit_depth) {
+ case 8:
+ c->idct[3] = ff_hevc_idct_32x32_8_rvv;
+ break;
+ default:
+ break;
+ }
+
+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) {
+ if (vlenb >= 32) {
+ switch (bit_depth) {
+ case 8:
+ RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256);
+ RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256);
+ break;
+ default:
+ break;
+ }
+ } else if (vlenb >= 16) {
+ switch (bit_depth) {
+ case 8:
+ RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128);
+ RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+ break;
+ default:
+ break;
+ }
}
}
#endif