PR #21048 opened by Rémi Denis-Courmont (Courmisch)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048.patch


From d81b88782e181bdee9599e0fac1ca62915dfb723 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <[email protected]>
Date: Sat, 29 Nov 2025 17:46:55 +0200
Subject: [PATCH 1/2] lavc/h264idct: R-V V 8-bit h264_luma_dc_dequant_idct

This does not improve performance with current hardware due to the poor
performance of segmented accesses. Performance should be slightly better
with expensive or near-future hardware that I don't have. However, it is
still limited by two other factors:
- There are only 4 elements.
- The final stores are necessarily indexed and hit multiple cache lines,
  thus as slow as scalar.
---
 libavcodec/riscv/Makefile               |  2 +-
 libavcodec/riscv/h264dsp_init.c         |  7 +-
 libavcodec/riscv/h264idct_dequant_rvv.S | 86 +++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/riscv/h264idct_dequant_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..3d2a2b4b6f 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -32,7 +32,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
 RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
 RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
-                              riscv/h264idct_rvv.o
+                              riscv/h264idct_rvv.o riscv/h264idct_dequant_rvv.o
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index f214486bbe..7ab8d38698 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -80,7 +80,8 @@ void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \
                                       const uint8_t nnzc[5 * 8]); \
 void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \
                                           int16_t *s, int stride, \
-                                          const uint8_t nnzc[5 * 8]);
+                                          const uint8_t nnzc[5 * 8]); \
+void ff_h264_luma_dc_dequant_idct_##depth##_rvv(int16_t *d, int16_t *s, int q);
 
 IDCT_DEPTH(8)
 IDCT_DEPTH(9)
@@ -174,6 +175,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
                     dsp->h264_idct_add8   = ff_h264_idct4_add8_422_8_rvv;
 #  endif
             }
+
+            dsp->h264_luma_dc_dequant_idct =
+                ff_h264_luma_dc_dequant_idct_8_rvv;
+
             if (flags & AV_CPU_FLAG_RVV_I64) {
                 dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
                 if (flags & AV_CPU_FLAG_RVB)
diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S
new file mode 100644
index 0000000000..73a68a28ab
--- /dev/null
+++ b/libavcodec/riscv/h264idct_dequant_rvv.S
@@ -0,0 +1,86 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright © 2025 Rémi Denis-Courmont.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const offsets_8, 1 /* index table for the 8-bit function; NOTE(review): the 1 is presumably the alignment argument of the const macro in asm.S - confirm */
+        .short 0, 64, 256, 320 # byte offsets, loaded into v24 and used as vsuxei16.v indices; in 2-byte elements: 0, 32, 128, 160
+endconst
+
+func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x /* 4x4 two-pass butterfly on 16-bit input, scaled by q, rounded >> 8 and scattered to d; a0 = d (output), a1 = s (input), a2 = q */
+        lpad   0                    # landing pad, label 0
+        csrwi  vxrm, 0              # fixed-point rounding mode 0 (round-to-nearest-up), used by vnclip below
+        vsetivli    zero, 4, e16, mf2, ta, ma   # vl = 4, 16-bit elements
+        vlseg4e16.v v8, (a1)        # de-interleaving load: v8[i] = s[4*i+0], v9[i] = s[4*i+1], v10[i] = s[4*i+2], v11[i] = s[4*i+3]
+        vwadd.vv    v16, v8, v9     # z0 = s[4*i+0] + s[4*i+1], widened to 32 bits
+        addi    t1, sp, 4 * 4 * -3  # t1 = scratch + 16 bytes (scratch = sp - 64, reserved below)
+        vwadd.vv    v19, v10, v11   # z3 = s[4*i+2] + s[4*i+3], widened to 32 bits
+        addi    t2, sp, 4 * 4 * -2  # t2 = scratch + 32 bytes
+        vwsub.vv    v17, v8, v9     # z1 = s[4*i+0] - s[4*i+1], widened to 32 bits
+        addi    t3, sp, 4 * 4 * -1  # t3 = scratch + 48 bytes
+        vwsub.vv    v18, v10, v11   # z2 = s[4*i+2] - s[4*i+3], widened to 32 bits
+        vsetvli zero, zero, e32, m1, ta, ma     # keep vl = 4, continue on the 32-bit intermediates
+        vadd.vv v8, v16, v19        # temp[4*i+0] = z0 + z3
+        addi    sp, sp, 4 * 4 * -4  # reserve 64 bytes of stack as scratch for the transpose
+        vsub.vv v9, v16, v19        # temp[4*i+1] = z0 - z3
+        vsub.vv v10, v17, v18       # temp[4*i+2] = z1 - z2
+        vadd.vv v11, v17, v18       # temp[4*i+3] = z1 + z2
+        vsseg4e32.v v8, (sp)        # interleaving store: scratch word 4*i+f = element i of v(8+f)
+        vle32.v v8, (sp)            # reload scratch words 0..3; store plus unit loads act as a 4x4 transpose
+        vle32.v v9, (t1)            # scratch words 4..7
+        vle32.v v10, (t2)           # scratch words 8..11
+        vle32.v v11, (t3)           # scratch words 12..15
+        vadd.vv v16, v8, v10    # z0 = temp[0+j] + temp[8+j]
+        addi    sp, sp, 4 * 4 * 4   # release the scratch area; all loads from it precede this point
+        vadd.vv v19, v9, v11    # z3 = temp[4+j] + temp[12+j]
+        lla     t0, offsets_8       # t0 = address of the byte-offset table
+        vsub.vv v17, v8, v10    # z1 = temp[0+j] - temp[8+j]
+        vsub.vv v18, v9, v11    # z2 = temp[4+j] - temp[12+j]
+        vadd.vv v8, v16, v19        # z0 + z3, stored relative to d + 0
+        vadd.vv v9, v17, v18        # z1 + z2, stored relative to d + 16 elements
+        vsub.vv v10, v17, v18       # z1 - z2, stored relative to d + 64 elements
+        vsub.vv v11, v16, v19       # z0 - z3, stored relative to d + 80 elements
+        vle16.v v24, (t0)           # v24 = four 16-bit byte offsets (EEW is fixed at 16 by the opcode)
+        vmul.vx v8, v8, a2          # scale by q, keeping the low 32 bits of each product
+        vmul.vx v9, v9, a2
+        vmul.vx v10, v10, a2
+        vmul.vx v11, v11, a2
+        vsetvli zero, zero, e16, mf2, ta, ma    # keep vl = 4, 16-bit destination elements for narrowing
+        vnclip.wi   v16, v8, 8      # rounding (per vxrm) shift right by 8, narrowed with signed saturation to 16 bits
+        addi    t1, a0, 2 * 16 * 1  # t1 = d + 16 elements (2 bytes each)
+        vnclip.wi   v17, v9, 8
+        addi    t2, a0, 2 * 16 * 4  # t2 = d + 64 elements
+        vnclip.wi   v18, v10, 8
+        addi    t3, a0, 2 * 16 * 5  # t3 = d + 80 elements
+        vnclip.wi   v19, v11, 8
+        vsuxei16.v  v16, (a0), v24  # unordered indexed store: element j goes to base + offsets_8[j] bytes
+        vsuxei16.v  v17, (t1), v24
+        vsuxei16.v  v18, (t2), v24
+        vsuxei16.v  v19, (t3), v24
+        ret
+endfunc
-- 
2.49.1


From 9e42a42b229365250316774269948835e736020d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <[email protected]>
Date: Sat, 29 Nov 2025 22:51:01 +0200
Subject: [PATCH 2/2] lavc/h264idct: R-V V 9-bit h264_luma_dc_dequant_idct

Note that, like the C reference, the same function can be used for
larger bit depths.
---
 libavcodec/riscv/h264dsp_init.c         |  5 ++-
 libavcodec/riscv/h264idct_dequant_rvv.S | 55 +++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 7ab8d38698..06cb3c59de 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -189,8 +189,11 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 
 #define IDCT_DEPTH(depth) \
         if (bit_depth == depth) { \
-            if (zvl128b) \
+            if (zvl128b) { \
                 dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
+                dsp->h264_luma_dc_dequant_idct = \
+                    ff_h264_luma_dc_dequant_idct_9_rvv; \
+            } \
             if (flags & AV_CPU_FLAG_RVB) \
                 dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
             if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S
index 73a68a28ab..bc49ca6ad4 100644
--- a/libavcodec/riscv/h264idct_dequant_rvv.S
+++ b/libavcodec/riscv/h264idct_dequant_rvv.S
@@ -84,3 +84,58 @@ func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x
         vsuxei16.v  v19, (t3), v24
         ret
 endfunc
+
+const offsets_9, 1 /* index table for the 32-bit-coefficient function; NOTE(review): the 1 is presumably the alignment argument of the const macro in asm.S - confirm */
+        .short 0, 128, 512, 640 # byte offsets, loaded into v24 and used as vsuxei16.v indices; in 4-byte elements: 0, 32, 128, 160
+endconst
+
+func ff_h264_luma_dc_dequant_idct_9_rvv, zve32x /* 4x4 two-pass butterfly, scaled by q, rounded >> 8 and scattered to d; a0 = d (output), a1 = s (input), a2 = q; both buffers are accessed as 32-bit elements here */
+        lpad   0                    # landing pad, label 0
+        csrwi  vxrm, 0              # fixed-point rounding mode 0 (round-to-nearest-up), used by vssra below
+        vsetivli    zero, 4, e32, m1, ta, ma    # vl = 4, 32-bit elements for the whole function
+        vlseg4e32.v v8, (a1)        # de-interleaving load: v8[i] = s[4*i+0], v9[i] = s[4*i+1], v10[i] = s[4*i+2], v11[i] = s[4*i+3]
+        vadd.vv v16, v8, v9     # z0 = s[4*i+0] + s[4*i+1]
+        addi    t1, sp, 4 * 4 * -3  # t1 = scratch + 16 bytes (scratch = sp - 64, reserved below)
+        vadd.vv v19, v10, v11   # z3 = s[4*i+2] + s[4*i+3]
+        addi    t2, sp, 4 * 4 * -2  # t2 = scratch + 32 bytes
+        vsub.vv v17, v8, v9     # z1 = s[4*i+0] - s[4*i+1]
+        addi    t3, sp, 4 * 4 * -1  # t3 = scratch + 48 bytes
+        vsub.vv v18, v10, v11   # z2 = s[4*i+2] - s[4*i+3]
+        vadd.vv v8, v16, v19        # temp[4*i+0] = z0 + z3
+        addi    sp, sp, 4 * 4 * -4  # reserve 64 bytes of stack as scratch for the transpose
+        vsub.vv v9, v16, v19        # temp[4*i+1] = z0 - z3
+        vsub.vv v10, v17, v18       # temp[4*i+2] = z1 - z2
+        vadd.vv v11, v17, v18       # temp[4*i+3] = z1 + z2
+        vsseg4e32.v v8, (sp)        # interleaving store: scratch word 4*i+f = element i of v(8+f)
+        vle32.v v8, (sp)            # reload scratch words 0..3; store plus unit loads act as a 4x4 transpose
+        vle32.v v9, (t1)            # scratch words 4..7
+        vle32.v v10, (t2)           # scratch words 8..11
+        vle32.v v11, (t3)           # scratch words 12..15
+        vadd.vv v16, v8, v10    # z0 = temp[0+j] + temp[8+j]
+        addi    sp, sp, 4 * 4 * 4   # release the scratch area; all loads from it precede this point
+        vadd.vv v19, v9, v11    # z3 = temp[4+j] + temp[12+j]
+        lla     t0, offsets_9       # t0 = address of the byte-offset table
+        vsub.vv v17, v8, v10    # z1 = temp[0+j] - temp[8+j]
+        vsub.vv v18, v9, v11    # z2 = temp[4+j] - temp[12+j]
+        vadd.vv v8, v16, v19        # z0 + z3, stored relative to d + 0
+        vadd.vv v9, v17, v18        # z1 + z2, stored relative to d + 16 elements
+        vsub.vv v10, v17, v18       # z1 - z2, stored relative to d + 64 elements
+        vsub.vv v11, v16, v19       # z0 - z3, stored relative to d + 80 elements
+        vle16.v v24, (t0)           # v24 = four 16-bit byte offsets (EEW is fixed at 16 by the opcode)
+        vmul.vx v8, v8, a2          # scale by q, keeping the low 32 bits of each product
+        vmul.vx v9, v9, a2
+        vmul.vx v10, v10, a2
+        vmul.vx v11, v11, a2
+        vssra.vi    v16, v8, 8      # arithmetic shift right by 8 with rounding per vxrm; result stays 32-bit
+        addi    t1, a0, 4 * 16 * 1  # t1 = d + 16 elements (4 bytes each)
+        vssra.vi    v17, v9, 8
+        addi    t2, a0, 4 * 16 * 4  # t2 = d + 64 elements
+        vssra.vi    v18, v10, 8
+        addi    t3, a0, 4 * 16 * 5  # t3 = d + 80 elements
+        vssra.vi    v19, v11, 8
+        vsuxei16.v  v16, (a0), v24  # unordered indexed store: element j goes to base + offsets_9[j] bytes
+        vsuxei16.v  v17, (t1), v24
+        vsuxei16.v  v18, (t2), v24
+        vsuxei16.v  v19, (t3), v24
+        ret
+endfunc
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to