Le maanantaina 10. kesäkuuta 2024, 9.22.14 EEST J. Dekker a écrit : > checkasm: bench runs 131072 (1 << 17) > h264_idct4_add_dc_8bpp_c: 1.5 > h264_idct4_add_dc_8bpp_rvv_i64: 0.7
RDCYCLE would exhibit larger and more meaningful numbers here. > h264_idct4_add_dc_9bpp_c: 1.5 > h264_idct4_add_dc_9bpp_rvv_i64: 0.7 > h264_idct4_add_dc_10bpp_c: 1.5 > h264_idct4_add_dc_10bpp_rvv_i64: 0.7 > h264_idct4_add_dc_12bpp_c: 1.2 > h264_idct4_add_dc_12bpp_rvv_i64: 0.7 > h264_idct4_add_dc_14bpp_c: 1.2 > h264_idct4_add_dc_14bpp_rvv_i64: 0.7 > h264_idct8_add_dc_8bpp_c: 5.2 > h264_idct8_add_dc_8bpp_rvv_i64: 1.5 > h264_idct8_add_dc_9bpp_c: 5.5 > h264_idct8_add_dc_9bpp_rvv_i64: 1.2 > h264_idct8_add_dc_10bpp_c: 5.5 > h264_idct8_add_dc_10bpp_rvv_i64: 1.2 > h264_idct8_add_dc_12bpp_c: 4.2 > h264_idct8_add_dc_12bpp_rvv_i64: 1.2 > h264_idct8_add_dc_14bpp_c: 4.2 > h264_idct8_add_dc_14bpp_rvv_i64: 1.2 > > Signed-off-by: J. Dekker <j...@itanimul.li> > --- > libavcodec/riscv/Makefile | 1 + > libavcodec/riscv/h264dsp_init.c | 40 +++++++++- > libavcodec/riscv/h264dsp_rvv.S | 129 ++++++++++++++++++++++++++++++++ > 3 files changed, 167 insertions(+), 3 deletions(-) > create mode 100644 libavcodec/riscv/h264dsp_rvv.S > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > index 590655f829..cb63631de4 100644 > --- a/libavcodec/riscv/Makefile > +++ b/libavcodec/riscv/Makefile > @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o > OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o > RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o > OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o > +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o > OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o > RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o > OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o > diff --git a/libavcodec/riscv/h264dsp_init.c > b/libavcodec/riscv/h264dsp_init.c index dbbf3db400..fe3caf686d 100644 > --- a/libavcodec/riscv/h264dsp_init.c > +++ b/libavcodec/riscv/h264dsp_init.c > @@ -1,4 +1,5 @@ > /* > + * Copyright (c) 2024 J. Dekker <j...@itanimul.li> > * Copyright © 2024 Rémi Denis-Courmont. > * > * This file is part of FFmpeg. > @@ -24,22 +25,55 @@ > > #include "libavutil/attributes.h" > #include "libavutil/cpu.h" > +#include "libavutil/riscv/cpu.h" > #include "libavcodec/h264dsp.h" > > extern int ff_startcode_find_candidate_rvb(const uint8_t *, int); > extern int ff_startcode_find_candidate_rvv(const uint8_t *, int); > +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride); > +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride); > +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride); > +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride); > +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride); > +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int > stride); +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block, > int stride); +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t > *block, int stride); +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst, > int16_t *block, int stride); +void ff_h264_idct8_dc_add_14_rvv(uint8_t > *dst, int16_t *block, int stride); > > -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int > bit_depth, +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int > bit_depth, const int chroma_format_idc) > { > #if HAVE_RV > int flags = av_get_cpu_flags(); > > if (flags & AV_CPU_FLAG_RVB_BASIC) > - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb; > + c->startcode_find_candidate = ff_startcode_find_candidate_rvb; > # if HAVE_RVV > if (flags & AV_CPU_FLAG_RVV_I32) > - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv; > + c->startcode_find_candidate = ff_startcode_find_candidate_rvv; > # endif > + if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) { ff_rv_vlen_least() > + if (bit_depth == 8) { switch() > + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv; > + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv; > + } > + if (bit_depth == 9) { > + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv; > + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv; > + } > + if (bit_depth == 10) { > + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv; > + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv; > + } > + if (bit_depth == 12) { > + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv; > + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv; > + } > + if (bit_depth == 14) { > + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv; > + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv; > + } > + } > #endif > } > diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S > new file mode 100644 > index 0000000000..c392403add > --- /dev/null > +++ b/libavcodec/riscv/h264dsp_rvv.S > @@ -0,0 +1,129 @@ > +/* > + * SPDX-License-Identifier: BSD-2-Clause > + * > + * Copyright (c) 2024 J. Dekker <j...@itanimul.li> > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS > IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED > TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A > PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY > THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT > (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE > OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#include "libavutil/riscv/asm.S" > + > +.macro clip min, max, regs:vararg > +.irp x, \regs > + vmax.vx \x, \x, \min > +.endr > +.irp x, \regs > + vmin.vx \x, \x, \max > +.endr > +.endm In my experience, single use macros juse make the code harder to follow and hide possible optimisations (indeed, this is the case here). > + > +.macro idct_dc_add depth > +func ff_h264_idct_dc_add_\depth\()_rvv, zve64x, zbb In this particular case, it seems to me that sharing the code between 8-bit and 16-bit is counter-productive. > +.if \depth == 8 > + lh a3, 0(a1) > +.else > + lw a3, 0(a1) > +.endif > + addi a3, a3, 32 > + mv t1, a4 // lines = cols > + srai a3, a3, 6 > +.if \depth == 8 > + sh zero, 0(a1) > +.else > + sw zero, 0(a1) > +.endif > +1: add t4, a0, a2 // &src[line * 1] > + sh1add t5, a2, a0 // &src[line * 2] > + sh1add t6, a2, t4 // &src[line * 3] > +.if \depth == 8 > + vsetvli zero, a4, e8, mf2, ta, ma > + vle8.v v0, (a0) > + vle8.v v1, (t4) > + vle8.v v2, (t5) > + vle8.v v3, (t6) I doubt that you'll gain anything from unrolling 4x here. Maybe 2x will help amortise consecutive latencies, but that's pretty much it. > + vwcvtu.x.x.v v8, v0 > + vwcvtu.x.x.v v9, v1 > + vwcvtu.x.x.v v10, v2 > + vwcvtu.x.x.v v11, v3 As a general note, when up-converting integers, you have to choose between VWCVT{,U} and V{S,Z}EXT. Here, VZEXT fits better because you could stick with an element size of 16 bits. > + vsetvli zero, a4, e16, m1, ta, ma > +.else > + vsetvli zero, a4, e16, m1, ta, ma This resets the same type and length every time. Just set them once in the calling code then leave VL and VTYPE alone. > + vle16.v v8, (a0) > + vle16.v v9, (t4) > + vle16.v v10, (t5) > + vle16.v v11, (t6) > +.endif > + vadd.vx v8, v8, a3 > + vadd.vx v9, v9, a3 > + vadd.vx v10, v10, a3 > + vadd.vx v11, v11, a3 In the 8-bit case, you could add -128 to a3. Effectively this converts the samples to signed 8-bit. > + clip zero, a5, v8, v9, v10, v11 > + addi t1, t1, -4 > +.if \depth == 8 > + vsetvli zero, a4, e8, mf2, ta, ma It is idiomatic to use zero as source here. > + vncvt.x.x.w v0, v8 > + vncvt.x.x.w v1, v9 > + vncvt.x.x.w v2, v10 > + vncvt.x.x.w v3, v11 If you added -128 above, then you can use VNCLIP.WI to clip *and* narrow, eliminating VMAX, VMIN and VNCVT. However you need to add a VXOR.VX to flip the sign bit back. Alternatively, replace VMIN and VNCT with VNCLIPU.WI. Either way, there won't much code left to share between 8-bit and HDR... > + vse8.v v0, (a0) > + vse8.v v1, (t4) > + vse8.v v2, (t5) > + vse8.v v3, (t6) > +.else > + vse16.v v8, (a0) > + vse16.v v9, (t4) > + vse16.v v10, (t5) > + vse16.v v11, (t6) > +.endif > + sh2add a0, a2, a0 // src += stride * 4 > + bnez t1, 1b > + ret > +endfunc > +.endm > + > +idct_dc_add 8 > +idct_dc_add 16 > + > +func ff_h264_idct4_dc_add_8_rvv, zve64x > + li a4, 4 > + li a5, 255 Using VNCLIP{,U} should make this redundant. > + j ff_h264_idct_dc_add_8_rvv > +endfunc > +func ff_h264_idct8_dc_add_8_rvv, zve64x > + li a4, 8 > + li a5, 255 > + j ff_h264_idct_dc_add_8_rvv > +endfunc > + > +.irp depth,9,10,12,14 > +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x VSETIVLI here. And while at it, pick 1/2 group multiplier which should be enough for 4 elements per vector. That trick won't work for 8-bit, and that's another reason not to share the code between bit-widths. > + li a4, 4 > + li a5, (1 << \depth) - 1 > + j ff_h264_idct_dc_add_16_rvv > +endfunc > + > +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x > + li a4, 8 > + li a5, (1 << \depth) - 1 > + j ff_h264_idct_dc_add_16_rvv > +endfunc > +.endr Last but not least, in the case of 32-bit or 64-bit rows, and if they are suitably aligned, it is probably faster to pack all values contiguously in vector groups using strided loads and stores. Thus all looping and unrolling would be eliminated. That trick does not work for 128-bit rows unfortunately, and it also will not work with more complex functions involving calculations between rows. -- Rémi Denis-Courmont http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".