ping On Sun, Feb 4, 2024 at 3:42 PM Ramiro Polla <ramiro.po...@gmail.com> wrote: > > The code is imported from libjpeg-turbo-3.0.1. The neon registers used > have been changed to avoid modifying v8-v15. > --- > libavcodec/aarch64/Makefile | 2 + > libavcodec/aarch64/fdct.h | 26 ++ > libavcodec/aarch64/fdctdsp_init_aarch64.c | 39 +++ > libavcodec/aarch64/fdctdsp_neon.S | 369 ++++++++++++++++++++++ > libavcodec/avcodec.h | 1 + > libavcodec/fdctdsp.c | 4 +- > libavcodec/fdctdsp.h | 2 + > libavcodec/options_table.h | 1 + > libavcodec/tests/aarch64/dct.c | 2 + > 9 files changed, 445 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/fdct.h > create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c > create mode 100644 libavcodec/aarch64/fdctdsp_neon.S > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > index beb6a02f5f..eebccbe4a5 100644 > --- a/libavcodec/aarch64/Makefile > +++ b/libavcodec/aarch64/Makefile > @@ -1,4 +1,5 @@ > # subsystems > +OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_init_aarch64.o > OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o > OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o > OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o > @@ -35,6 +36,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += > aarch64/videodsp.o > > # subsystems > NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o > +NEON-OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_neon.o > NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o > NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o > NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o > \ > diff --git a/libavcodec/aarch64/fdct.h b/libavcodec/aarch64/fdct.h > new file mode 100644 > index 0000000000..0901b53a83 > --- /dev/null > +++ b/libavcodec/aarch64/fdct.h > @@ -0,0 +1,26 @@ > +/* > + * This file is part of FFmpeg. 
> + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVCODEC_AARCH64_FDCT_H > +#define AVCODEC_AARCH64_FDCT_H > + > +#include <stdint.h> > + > +void ff_fdct_neon(int16_t *block); > + > +#endif /* AVCODEC_AARCH64_FDCT_H */ > diff --git a/libavcodec/aarch64/fdctdsp_init_aarch64.c > b/libavcodec/aarch64/fdctdsp_init_aarch64.c > new file mode 100644 > index 0000000000..59d91bc8fc > --- /dev/null > +++ b/libavcodec/aarch64/fdctdsp_init_aarch64.c > @@ -0,0 +1,39 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/aarch64/cpu.h" > +#include "libavcodec/avcodec.h" > +#include "libavcodec/fdctdsp.h" > +#include "fdct.h" > + > +av_cold void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, AVCodecContext > *avctx, > + unsigned high_bit_depth) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (have_neon(cpu_flags)) { > + if (!high_bit_depth) { > + if (avctx->dct_algo == FF_DCT_AUTO || > + avctx->dct_algo == FF_DCT_NEON) { > + c->fdct = ff_fdct_neon; > + } > + } > + } > +} > diff --git a/libavcodec/aarch64/fdctdsp_neon.S > b/libavcodec/aarch64/fdctdsp_neon.S > new file mode 100644 > index 0000000000..978c8d3002 > --- /dev/null > +++ b/libavcodec/aarch64/fdctdsp_neon.S > @@ -0,0 +1,369 @@ > +/* > + * Armv8 Neon optimizations for libjpeg-turbo > + * > + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). > + * All Rights Reserved. > + * Author: Siarhei Siamashka <siarhei.siamas...@nokia.com> > + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. > + * Author: Ragesh Radhakrishnan <rages...@linaro.org> > + * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved. > + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. > + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. > + * > + * This software is provided 'as-is', without any express or implied > + * warranty. In no event will the authors be held liable for any damages > + * arising from the use of this software. 
> + * > + * Permission is granted to anyone to use this software for any purpose, > + * including commercial applications, and to alter it and redistribute it > + * freely, subject to the following restrictions: > + * > + * 1. The origin of this software must not be misrepresented; you must not > + * claim that you wrote the original software. If you use this software > + * in a product, an acknowledgment in the product documentation would be > + * appreciated but is not required. > + * 2. Altered source versions must be plainly marked as such, and must not be > + * misrepresented as being the original software. > + * 3. This notice may not be removed or altered from any source distribution. > + */ > + > +#include "libavutil/aarch64/asm.S" > +#include "neon.S" > + > +// #define EIGHT_BIT_SAMPLES > + > +/* Constants for jsimd_fdct_islow_neon() */ > + > +#define F_0_298 2446 /* FIX(0.298631336) */ > +#define F_0_390 3196 /* FIX(0.390180644) */ > +#define F_0_541 4433 /* FIX(0.541196100) */ > +#define F_0_765 6270 /* FIX(0.765366865) */ > +#define F_0_899 7373 /* FIX(0.899976223) */ > +#define F_1_175 9633 /* FIX(1.175875602) */ > +#define F_1_501 12299 /* FIX(1.501321110) */ > +#define F_1_847 15137 /* FIX(1.847759065) */ > +#define F_1_961 16069 /* FIX(1.961570560) */ > +#define F_2_053 16819 /* FIX(2.053119869) */ > +#define F_2_562 20995 /* FIX(2.562915447) */ > +#define F_3_072 25172 /* FIX(3.072711026) */ > + > +const jsimd_fdct_islow_neon_consts, align=4 > + .short F_0_298 > + .short -F_0_390 > + .short F_0_541 > + .short F_0_765 > + .short - F_0_899 > + .short F_1_175 > + .short F_1_501 > + .short - F_1_847 > + .short - F_1_961 > + .short F_2_053 > + .short - F_2_562 > + .short F_3_072 > + .short 0 /* padding */ > + .short 0 > + .short 0 > + .short 0 > +endconst > + > +#undef F_0_298 > +#undef F_0_390 > +#undef F_0_541 > +#undef F_0_765 > +#undef F_0_899 > +#undef F_1_175 > +#undef F_1_501 > +#undef F_1_847 > +#undef F_1_961 > +#undef F_2_053 > +#undef F_2_562 
> +#undef F_3_072 > + > +/*****************************************************************************/ > + > +/* > + * jsimd_fdct_islow_neon > + * > + * This file contains a slower but more accurate integer implementation of > the > + * forward DCT (Discrete Cosine Transform). The following code is based > + * directly on the IJG''s original jfdctint.c; see the jfdctint.c for > + * more details. > + */ > + > +#define CONST_BITS 13 > +#ifdef EIGHT_BIT_SAMPLES > +#define PASS1_BITS 2 > +#else > +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ > +#endif > + > +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) > +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) > + > +#define XFIX_P_0_298 v0.h[0] > +#define XFIX_N_0_390 v0.h[1] > +#define XFIX_P_0_541 v0.h[2] > +#define XFIX_P_0_765 v0.h[3] > +#define XFIX_N_0_899 v0.h[4] > +#define XFIX_P_1_175 v0.h[5] > +#define XFIX_P_1_501 v0.h[6] > +#define XFIX_N_1_847 v0.h[7] > +#define XFIX_N_1_961 v1.h[0] > +#define XFIX_P_2_053 v1.h[1] > +#define XFIX_N_2_562 v1.h[2] > +#define XFIX_P_3_072 v1.h[3] > + > +function ff_fdct_neon, export=1 > + > + DATA .req x0 > + TMP .req x9 > + > + /* Load constants */ > + movrel TMP, jsimd_fdct_islow_neon_consts > + ld1 {v0.8h, v1.8h}, [TMP] > + > + /* Load all DATA into Neon registers with the following allocation: > + * 0 1 2 3 | 4 5 6 7 > + * ---------+-------- > + * 0 | d16 | d17 | v16.8h > + * 1 | d18 | d19 | v17.8h > + * 2 | d20 | d21 | v18.8h > + * 3 | d22 | d23 | v19.8h > + * 4 | d24 | d25 | v20.8h > + * 5 | d26 | d27 | v21.8h > + * 6 | d28 | d29 | v22.8h > + * 7 | d30 | d31 | v23.8h > + */ > + > + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 > + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] > + sub DATA, DATA, #64 > + > + /* Transpose */ > + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v31, v2 > + /* 1-D FDCT */ > + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + > dataptr[7]; */ > + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - > dataptr[7]; */ > + 
add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + > dataptr[6]; */ > + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - > dataptr[6]; */ > + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + > dataptr[5]; */ > + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - > dataptr[5]; */ > + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + > dataptr[4]; */ > + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - > dataptr[4]; */ > + > + /* even part */ > + > + add v4.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ > + sub v5.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ > + add v6.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ > + sub v7.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ > + > + add v16.8h, v4.8h, v6.8h /* tmp10 + tmp11 */ > + sub v20.8h, v4.8h, v6.8h /* tmp10 - tmp11 */ > + > + add v18.8h, v7.8h, v5.8h /* tmp12 + tmp13 */ > + > + shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = > (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ > + shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = > (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ > + > + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 > + tmp13, XFIX_P_0_541); */ > + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 > + tmp13, XFIX_P_0_541); */ > + mov v22.16b, v18.16b > + mov v25.16b, v24.16b > + > + smlal v18.4s, v5.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, > XFIX_P_0_765) */ > + smlal2 v24.4s, v5.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, > XFIX_P_0_765) */ > + smlal v22.4s, v7.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, > XFIX_N_1_847) */ > + smlal2 v25.4s, v7.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, > XFIX_N_1_847) */ > + > + rshrn v18.4h, v18.4s, #DESCALE_P1 > + rshrn v22.4h, v22.4s, #DESCALE_P1 > + rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = > (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); > */ > + rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = > (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); > */ > + > + 
/* Odd part */ > + > + add v2.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ > + add v3.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ > + add v6.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ > + add v7.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ > + smull v4.4s, v6.4h, XFIX_P_1_175 /* z5 lo = z3 lo * > XFIX_P_1_175 */ > + smull2 v5.4s, v6.8h, XFIX_P_1_175 > + smlal v4.4s, v7.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, > FIX_1_175875602); */ > + smlal2 v5.4s, v7.8h, XFIX_P_1_175 > + > + smull2 v24.4s, v28.8h, XFIX_P_0_298 > + smull2 v25.4s, v29.8h, XFIX_P_2_053 > + smull2 v26.4s, v30.8h, XFIX_P_3_072 > + smull2 v27.4s, v31.8h, XFIX_P_1_501 > + smull v23.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, > FIX_0_298631336); */ > + smull v21.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, > FIX_2_053119869); */ > + smull v19.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, > FIX_3_072711026); */ > + smull v17.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, > FIX_1_501321110); */ > + > + smull2 v28.4s, v2.8h, XFIX_N_0_899 > + smull2 v29.4s, v3.8h, XFIX_N_2_562 > + smull2 v30.4s, v6.8h, XFIX_N_1_961 > + smull2 v31.4s, v7.8h, XFIX_N_0_390 > + smull v2.4s, v2.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, > -FIX_0_899976223); */ > + smull v3.4s, v3.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, > -FIX_2_562915447); */ > + smull v6.4s, v6.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, > -FIX_1_961570560); */ > + smull v7.4s, v7.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, > -FIX_0_390180644); */ > + > + add v6.4s, v6.4s, v4.4s /* z3 += z5 */ > + add v30.4s, v30.4s, v5.4s > + add v7.4s, v7.4s, v4.4s /* z4 += z5 */ > + add v31.4s, v31.4s, v5.4s > + > + add v23.4s, v23.4s, v2.4s /* tmp4 += z1 */ > + add v24.4s, v24.4s, v28.4s > + add v21.4s, v21.4s, v3.4s /* tmp5 += z2 */ > + add v25.4s, v25.4s, v29.4s > + add v19.4s, v19.4s, v6.4s /* tmp6 += z3 */ > + add v26.4s, v26.4s, v30.4s > + add v17.4s, v17.4s, v7.4s /* tmp7 += z4 */ > + add v27.4s, v27.4s, v31.4s > + > + add v23.4s, v23.4s, v6.4s /* tmp4 += z3 */ > + add 
v24.4s, v24.4s, v30.4s > + add v21.4s, v21.4s, v7.4s /* tmp5 += z4 */ > + add v25.4s, v25.4s, v31.4s > + add v19.4s, v19.4s, v3.4s /* tmp6 += z2 */ > + add v26.4s, v26.4s, v29.4s > + add v17.4s, v17.4s, v2.4s /* tmp7 += z1 */ > + add v27.4s, v27.4s, v28.4s > + > + rshrn v23.4h, v23.4s, #DESCALE_P1 > + rshrn v21.4h, v21.4s, #DESCALE_P1 > + rshrn v19.4h, v19.4s, #DESCALE_P1 > + rshrn v17.4h, v17.4s, #DESCALE_P1 > + rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = > (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ > + rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = > (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ > + rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = > (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ > + rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = > (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ > + > + /* Transpose */ > + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v31, v2 > + > + /* 1-D FDCT */ > + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + > dataptr[7]; */ > + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - > dataptr[7]; */ > + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + > dataptr[6]; */ > + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - > dataptr[6]; */ > + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + > dataptr[5]; */ > + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - > dataptr[5]; */ > + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + > dataptr[4]; */ > + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - > dataptr[4]; */ > + > + /* even part */ > + add v4.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ > + sub v5.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ > + add v6.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ > + sub v7.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ > + > + add v16.8h, v4.8h, v6.8h /* tmp10 + tmp11 */ > + sub v20.8h, v4.8h, v6.8h /* tmp10 - tmp11 */ > + > + add v18.8h, v7.8h, v5.8h /* tmp12 + tmp13 */ > + > + srshr v16.8h, v16.8h, 
#PASS1_BITS /* dataptr[0] = > (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */ > + srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = > (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */ > + > + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 > + tmp13, XFIX_P_0_541); */ > + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 > + tmp13, XFIX_P_0_541); */ > + mov v22.16b, v18.16b > + mov v25.16b, v24.16b > + > + smlal v18.4s, v5.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, > XFIX_P_0_765) */ > + smlal2 v24.4s, v5.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, > XFIX_P_0_765) */ > + smlal v22.4s, v7.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, > XFIX_N_1_847) */ > + smlal2 v25.4s, v7.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, > XFIX_N_1_847) */ > + > + rshrn v18.4h, v18.4s, #DESCALE_P2 > + rshrn v22.4h, v22.4s, #DESCALE_P2 > + rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = > (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); > */ > + rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = > (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); > */ > + > + /* Odd part */ > + add v2.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ > + add v3.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ > + add v6.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ > + add v7.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ > + > + smull v4.4s, v6.4h, XFIX_P_1_175 /* z5 lo = z3 lo * > XFIX_P_1_175 */ > + smull2 v5.4s, v6.8h, XFIX_P_1_175 > + smlal v4.4s, v7.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, > FIX_1_175875602); */ > + smlal2 v5.4s, v7.8h, XFIX_P_1_175 > + > + smull2 v24.4s, v28.8h, XFIX_P_0_298 > + smull2 v25.4s, v29.8h, XFIX_P_2_053 > + smull2 v26.4s, v30.8h, XFIX_P_3_072 > + smull2 v27.4s, v31.8h, XFIX_P_1_501 > + smull v23.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, > FIX_0_298631336); */ > + smull v21.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, > FIX_2_053119869); */ > + smull v19.4s, v30.4h, XFIX_P_3_072 /* tmp6 = 
MULTIPLY(tmp6, > FIX_3_072711026); */ > + smull v17.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, > FIX_1_501321110); */ > + > + smull2 v28.4s, v2.8h, XFIX_N_0_899 > + smull2 v29.4s, v3.8h, XFIX_N_2_562 > + smull2 v30.4s, v6.8h, XFIX_N_1_961 > + smull2 v31.4s, v7.8h, XFIX_N_0_390 > + smull v2.4s, v2.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, > -FIX_0_899976223); */ > + smull v3.4s, v3.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, > -FIX_2_562915447); */ > + smull v6.4s, v6.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, > -FIX_1_961570560); */ > + smull v7.4s, v7.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, > -FIX_0_390180644); */ > + > + add v6.4s, v6.4s, v4.4s /* z3 += z5 */ > + add v30.4s, v30.4s, v5.4s > + add v7.4s, v7.4s, v4.4s /* z4 += z5 */ > + add v31.4s, v31.4s, v5.4s > + > + add v23.4s, v23.4s, v2.4s /* tmp4 += z1 */ > + add v24.4s, v24.4s, v28.4s > + add v21.4s, v21.4s, v3.4s /* tmp5 += z2 */ > + add v25.4s, v25.4s, v29.4s > + add v19.4s, v19.4s, v6.4s /* tmp6 += z3 */ > + add v26.4s, v26.4s, v30.4s > + add v17.4s, v17.4s, v7.4s /* tmp7 += z4 */ > + add v27.4s, v27.4s, v31.4s > + > + add v23.4s, v23.4s, v6.4s /* tmp4 += z3 */ > + add v24.4s, v24.4s, v30.4s > + add v21.4s, v21.4s, v7.4s /* tmp5 += z4 */ > + add v25.4s, v25.4s, v31.4s > + add v19.4s, v19.4s, v3.4s /* tmp6 += z2 */ > + add v26.4s, v26.4s, v29.4s > + add v17.4s, v17.4s, v2.4s /* tmp7 += z1 */ > + add v27.4s, v27.4s, v28.4s > + > + rshrn v23.4h, v23.4s, #DESCALE_P2 > + rshrn v21.4h, v21.4s, #DESCALE_P2 > + rshrn v19.4h, v19.4s, #DESCALE_P2 > + rshrn v17.4h, v17.4s, #DESCALE_P2 > + rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = > (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */ > + rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = > (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */ > + rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = > (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */ > + rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = > (DCTELEM)DESCALE(tmp7 + z1 + z4, 
CONST_BITS+PASS1_BITS); */ > + > + /* store results */ > + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 > + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] > + > + ret > + > + .unreq DATA > + .unreq TMP > +endfunc > + > +#undef XFIX_P_0_298 > +#undef XFIX_N_0_390 > +#undef XFIX_P_0_541 > +#undef XFIX_P_0_765 > +#undef XFIX_N_0_899 > +#undef XFIX_P_1_175 > +#undef XFIX_P_1_501 > +#undef XFIX_N_1_847 > +#undef XFIX_N_1_961 > +#undef XFIX_P_2_053 > +#undef XFIX_N_2_562 > +#undef XFIX_P_3_072 > diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h > index 7fb44e28f4..f9b86f1d58 100644 > --- a/libavcodec/avcodec.h > +++ b/libavcodec/avcodec.h > @@ -1477,6 +1477,7 @@ typedef struct AVCodecContext { > #define FF_DCT_MMX 3 > #define FF_DCT_ALTIVEC 5 > #define FF_DCT_FAAN 6 > +#define FF_DCT_NEON 7 > > /** > * IDCT algorithm, see FF_IDCT_* below. > diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c > index f8ba17426c..d20558ce88 100644 > --- a/libavcodec/fdctdsp.c > +++ b/libavcodec/fdctdsp.c > @@ -42,7 +42,9 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, > AVCodecContext *avctx) > c->fdct248 = ff_fdct248_islow_8; > } > > -#if ARCH_PPC > +#if ARCH_AARCH64 > + ff_fdctdsp_init_aarch64(c, avctx, high_bit_depth); > +#elif ARCH_PPC > ff_fdctdsp_init_ppc(c, avctx, high_bit_depth); > #elif ARCH_X86 > ff_fdctdsp_init_x86(c, avctx, high_bit_depth); > diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h > index 7378eab870..cad99ed7ca 100644 > --- a/libavcodec/fdctdsp.h > +++ b/libavcodec/fdctdsp.h > @@ -32,6 +32,8 @@ typedef struct FDCTDSPContext { > > FF_VISIBILITY_PUSH_HIDDEN > void ff_fdctdsp_init(FDCTDSPContext *c, struct AVCodecContext *avctx); > +void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, struct AVCodecContext *avctx, > + unsigned high_bit_depth); > void ff_fdctdsp_init_ppc(FDCTDSPContext *c, struct AVCodecContext *avctx, > unsigned high_bit_depth); > void ff_fdctdsp_init_x86(FDCTDSPContext *c, struct AVCodecContext *avctx, > diff --git 
a/libavcodec/options_table.h b/libavcodec/options_table.h > index ee243d9894..d9a3c92f28 100644 > --- a/libavcodec/options_table.h > +++ b/libavcodec/options_table.h > @@ -159,6 +159,7 @@ static const AVOption avcodec_options[] = { > {"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, > V|E, "dct"}, > {"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, > INT_MAX, V|E, "dct"}, > {"faan", "floating point AAN DCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN > }, INT_MIN, INT_MAX, V|E, "dct"}, > +{"neon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_NEON }, INT_MIN, > INT_MAX, V|E, "dct"}, > {"lumi_mask", "compresses bright areas stronger than medium ones", > OFFSET(lumi_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, > {"tcplx_mask", "temporal complexity masking", OFFSET(temporal_cplx_masking), > AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, > {"scplx_mask", "spatial complexity masking", OFFSET(spatial_cplx_masking), > AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, > diff --git a/libavcodec/tests/aarch64/dct.c b/libavcodec/tests/aarch64/dct.c > index 9e477328d5..e98a887cd5 100644 > --- a/libavcodec/tests/aarch64/dct.c > +++ b/libavcodec/tests/aarch64/dct.c > @@ -19,9 +19,11 @@ > #include "config.h" > > #include "libavutil/cpu.h" > +#include "libavcodec/aarch64/fdct.h" > #include "libavcodec/aarch64/idct.h" > > static const struct algo fdct_tab_arch[] = { > + { "neon", ff_fdct_neon, FF_IDCT_PERM_NONE, AV_CPU_FLAG_NEON }, > { 0 } > }; > > -- > 2.30.2 > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".