PR #21148 opened by george.zaguri URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21148 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21148.patch
RPi4 (auto-vectorisation is on) put_luma_v_8_4x4_c: 332.6 ( 1.00x) put_luma_v_8_4x4_neon: 54.1 ( 6.14x) put_luma_v_8_8x8_c: 583.9 ( 1.00x) put_luma_v_8_8x8_neon: 161.5 ( 3.61x) put_luma_v_8_16x16_c: 1544.9 ( 1.00x) put_luma_v_8_16x16_neon: 603.5 ( 2.56x) put_luma_v_8_32x32_c: 4444.4 ( 1.00x) put_luma_v_8_32x32_neon: 2387.3 ( 1.86x) put_luma_v_8_64x64_c: 14699.9 ( 1.00x) put_luma_v_8_64x64_neon: 11128.0 ( 1.32x) put_luma_v_8_128x128_c: 52626.9 ( 1.00x) put_luma_v_8_128x128_neon: 43070.2 ( 1.22x) put_luma_v_10_4x4_c: 303.3 ( 1.00x) put_luma_v_10_4x4_neon: 55.7 ( 5.45x) put_luma_v_10_8x8_c: 1106.7 ( 1.00x) put_luma_v_10_8x8_neon: 163.8 ( 6.76x) put_luma_v_10_16x16_c: 2242.1 ( 1.00x) put_luma_v_10_16x16_neon: 672.7 ( 3.33x) put_luma_v_10_32x32_c: 7057.3 ( 1.00x) put_luma_v_10_32x32_neon: 2731.3 ( 2.58x) put_luma_v_10_64x64_c: 25699.8 ( 1.00x) put_luma_v_10_64x64_neon: 12145.6 ( 2.12x) put_luma_v_10_128x128_c: 90694.6 ( 1.00x) put_luma_v_10_128x128_neon: 44862.4 ( 2.02x) Apple M4 (auto-vectorisation is on): put_luma_v_10_4x4_c: 25.6 ( 1.00x) put_luma_v_10_4x4_neon: 3.1 ( 8.18x) put_luma_v_10_8x8_c: 34.7 ( 1.00x) put_luma_v_10_8x8_neon: 10.5 ( 3.32x) put_luma_v_10_16x16_c: 103.9 ( 1.00x) put_luma_v_10_16x16_neon: 42.3 ( 2.45x) put_luma_v_10_32x32_c: 399.7 ( 1.00x) put_luma_v_10_32x32_neon: 161.8 ( 2.47x) put_luma_v_10_64x64_c: 1276.7 ( 1.00x) put_luma_v_10_64x64_neon: 840.1 ( 1.52x) put_luma_v_10_128x128_c: 4981.3 ( 1.00x) put_luma_v_10_128x128_neon: 3008.0 ( 1.66x) put_luma_v_12_4x4_c: 23.6 ( 1.00x) put_luma_v_12_4x4_neon: 2.0 (11.84x) put_luma_v_12_8x8_c: 31.8 ( 1.00x) put_luma_v_12_8x8_neon: 12.4 ( 2.55x) put_luma_v_12_16x16_c: 100.8 ( 1.00x) put_luma_v_12_16x16_neon: 44.9 ( 2.25x) put_luma_v_12_32x32_c: 331.1 ( 1.00x) put_luma_v_12_32x32_neon: 175.2 ( 1.89x) put_luma_v_12_64x64_c: 1227.1 ( 1.00x) put_luma_v_12_64x64_neon: 712.7 ( 1.72x) put_luma_v_12_128x128_c: 5149.1 ( 1.00x) put_luma_v_12_128x128_neon: 2809.3 ( 1.83x) From 1ca5d962e9653eef69d32ebbb0496a62065c6df7 
Mon Sep 17 00:00:00 2001 From: Georgii Zagoruiko <[email protected]> Date: Tue, 9 Dec 2025 22:17:37 +0000 Subject: [PATCH] aarch64/vvc: Optimisations of put_luma_v() functions for 10/12-bit RPi4 (auto-vectorisation is on) put_luma_v_8_4x4_c: 332.6 ( 1.00x) put_luma_v_8_4x4_neon: 54.1 ( 6.14x) put_luma_v_8_8x8_c: 583.9 ( 1.00x) put_luma_v_8_8x8_neon: 161.5 ( 3.61x) put_luma_v_8_16x16_c: 1544.9 ( 1.00x) put_luma_v_8_16x16_neon: 603.5 ( 2.56x) put_luma_v_8_32x32_c: 4444.4 ( 1.00x) put_luma_v_8_32x32_neon: 2387.3 ( 1.86x) put_luma_v_8_64x64_c: 14699.9 ( 1.00x) put_luma_v_8_64x64_neon: 11128.0 ( 1.32x) put_luma_v_8_128x128_c: 52626.9 ( 1.00x) put_luma_v_8_128x128_neon: 43070.2 ( 1.22x) put_luma_v_10_4x4_c: 303.3 ( 1.00x) put_luma_v_10_4x4_neon: 55.7 ( 5.45x) put_luma_v_10_8x8_c: 1106.7 ( 1.00x) put_luma_v_10_8x8_neon: 163.8 ( 6.76x) put_luma_v_10_16x16_c: 2242.1 ( 1.00x) put_luma_v_10_16x16_neon: 672.7 ( 3.33x) put_luma_v_10_32x32_c: 7057.3 ( 1.00x) put_luma_v_10_32x32_neon: 2731.3 ( 2.58x) put_luma_v_10_64x64_c: 25699.8 ( 1.00x) put_luma_v_10_64x64_neon: 12145.6 ( 2.12x) put_luma_v_10_128x128_c: 90694.6 ( 1.00x) put_luma_v_10_128x128_neon: 44862.4 ( 2.02x) Apple M4 (auto-vectorisation is on): put_luma_v_10_4x4_c: 25.6 ( 1.00x) put_luma_v_10_4x4_neon: 3.1 ( 8.18x) put_luma_v_10_8x8_c: 34.7 ( 1.00x) put_luma_v_10_8x8_neon: 10.5 ( 3.32x) put_luma_v_10_16x16_c: 103.9 ( 1.00x) put_luma_v_10_16x16_neon: 42.3 ( 2.45x) put_luma_v_10_32x32_c: 399.7 ( 1.00x) put_luma_v_10_32x32_neon: 161.8 ( 2.47x) put_luma_v_10_64x64_c: 1276.7 ( 1.00x) put_luma_v_10_64x64_neon: 840.1 ( 1.52x) put_luma_v_10_128x128_c: 4981.3 ( 1.00x) put_luma_v_10_128x128_neon: 3008.0 ( 1.66x) put_luma_v_12_4x4_c: 23.6 ( 1.00x) put_luma_v_12_4x4_neon: 2.0 (11.84x) put_luma_v_12_8x8_c: 31.8 ( 1.00x) put_luma_v_12_8x8_neon: 12.4 ( 2.55x) put_luma_v_12_16x16_c: 100.8 ( 1.00x) put_luma_v_12_16x16_neon: 44.9 ( 2.25x) put_luma_v_12_32x32_c: 331.1 ( 1.00x) put_luma_v_12_32x32_neon: 175.2 ( 1.89x) put_luma_v_12_64x64_c: 
1227.1 ( 1.00x) put_luma_v_12_64x64_neon: 712.7 ( 1.72x) put_luma_v_12_128x128_c: 5149.1 ( 1.00x) put_luma_v_12_128x128_neon: 2809.3 ( 1.83x) --- libavcodec/aarch64/vvc/dsp_init.c | 31 +++ libavcodec/aarch64/vvc/inter.S | 392 ++++++++++++++++++++++++++++++ 2 files changed, 423 insertions(+) diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index aa75d22b78..bc2677945e 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -43,6 +43,23 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); /* new NEON entry points, bodies are in inter.S; with the AAPCS64 argument order the asm sees dst=x0, _src=x1, _src_stride=x2, height=w3, hf=x4, vf=x5, width=w6, and only vf is used as a filter */ +void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void 
ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); + void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps); #define BIT_DEPTH 8 @@ -263,6 +280,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.put[0][5][0][1] = c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon; + c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon; + c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon; + c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon; + c->inter.put[0][4][1][0] = + c->inter.put[0][5][1][0] = + c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_10_neon; /* slots 4..6 share the routine that walks the block in 16-column strips */ + c->alf.filter[LUMA] = alf_filter_luma_10_neon; c->alf.filter[CHROMA] = alf_filter_chroma_10_neon; c->alf.classify = alf_classify_10_neon; @@ -279,6 +303,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.put[0][5][0][1] = c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon; + c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_12_neon; + c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_12_neon; + c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_12_neon; + c->inter.put[0][4][1][0] = + c->inter.put[0][5][1][0] = + c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon; /* same slot layout as the _10_ assignments */ + c->alf.filter[LUMA] = alf_filter_luma_12_neon; c->alf.filter[CHROMA] = alf_filter_chroma_12_neon; c->alf.classify = alf_classify_12_neon; diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 41444ec44c..887e456a66 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -1832,3 +1832,395 @@ endfunc function ff_vvc_put_luma_h_x16_12_neon, export=1 put_luma_h_x16_xx_neon 4 endfunc + +.macro put_luma_v4_xx_neon shift + mov x9, #(VVC_MAX_PB_SIZE * 2) /* x9: post-increment applied to dst (x0) by every st1, in bytes */ + sub x1, x1, x2, lsl #1 + ld1 {v0.8b}, [x5] + sub x1, x1, x2 /* with the lsl #1 subtract above: src starts 3 rows earlier */ + sxtl v0.8h, v0.8b /* the 8 int8 taps read from vf (x5) become int16 lanes v0.h[0..7] */ + ld1 {v20.4h}, [x1], x2 + ld1 {v21.4h}, [x1], x2 
+ ld1 {v22.4h}, [x1], x2 + ld1 {v23.4h}, [x1], x2 + ld1 {v24.4h}, [x1], x2 + ld1 {v25.4h}, [x1], x2 + ld1 {v26.4h}, [x1], x2 /* v20-v26: seven rows preloaded; the loop pulls in four more per pass */ +1: + ld1 {v27.4h}, [x1], x2 + + smull v1.4s, v20.4h, v0.h[0] + smull v2.4s, v21.4h, v0.h[1] + smlal v1.4s, v22.4h, v0.h[2] + smlal v2.4s, v23.4h, v0.h[3] + smlal v1.4s, v24.4h, v0.h[4] + smlal v2.4s, v25.4h, v0.h[5] + smlal v1.4s, v26.4h, v0.h[6] + smlal v2.4s, v27.4h, v0.h[7] + + ld1 {v28.4h}, [x1], x2 + + smull v3.4s, v21.4h, v0.h[0] + smull v4.4s, v22.4h, v0.h[1] + smlal v3.4s, v23.4h, v0.h[2] + smlal v4.4s, v24.4h, v0.h[3] + smlal v3.4s, v25.4h, v0.h[4] + smlal v4.4s, v26.4h, v0.h[5] + smlal v3.4s, v27.4h, v0.h[6] + smlal v4.4s, v28.4h, v0.h[7] + add v1.4s, v1.4s, v2.4s /* v1 collected taps 0,2,4,6 and v2 taps 1,3,5,7; join the two chains */ + add v3.4s, v3.4s, v4.4s + sqshrn v1.4h, v1.4s, #(\shift) + sqshrn v3.4h, v3.4s, #(\shift) + + st1 {v1.4h}, [x0], x9 + ld1 {v29.4h}, [x1], x2 + st1 {v3.4h}, [x0], x9 + + smull v1.4s, v22.4h, v0.h[0] + smull v2.4s, v23.4h, v0.h[1] + smlal v1.4s, v24.4h, v0.h[2] + smlal v2.4s, v25.4h, v0.h[3] + smlal v1.4s, v26.4h, v0.h[4] + smlal v2.4s, v27.4h, v0.h[5] + smlal v1.4s, v28.4h, v0.h[6] + smlal v2.4s, v29.4h, v0.h[7] + + ld1 {v30.4h}, [x1], x2 + + smull v3.4s, v23.4h, v0.h[0] + smull v4.4s, v24.4h, v0.h[1] + smlal v3.4s, v25.4h, v0.h[2] + smlal v4.4s, v26.4h, v0.h[3] + smlal v3.4s, v27.4h, v0.h[4] + smlal v4.4s, v28.4h, v0.h[5] + smlal v3.4s, v29.4h, v0.h[6] + smlal v4.4s, v30.4h, v0.h[7] + add v1.4s, v1.4s, v2.4s + add v3.4s, v3.4s, v4.4s + sqshrn v1.4h, v1.4s, #(\shift) + sqshrn v3.4h, v3.4s, #(\shift) + + st1 {v1.4h}, [x0], x9 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + mov v24.16b, v28.16b + mov v25.16b, v29.16b + mov v26.16b, v30.16b /* slide the window: the rows in v24-v30 become v20-v26 for the next pass */ + + subs w3, w3, #4 /* height counts down by the four rows written per pass */ + st1 {v3.4h}, [x0], x9 + b.gt 1b + ret +.endm + +function ff_vvc_put_luma_v4_10_neon, export=1 + put_luma_v4_xx_neon 2 /* argument = sqshrn shift: 2 for the _10_ symbol, 4 for _12_ */ +endfunc + +function ff_vvc_put_luma_v4_12_neon, export=1 + put_luma_v4_xx_neon 4 +endfunc + +.macro put_luma_v8_xx_neon shift + mov x9, 
#(VVC_MAX_PB_SIZE * 2) + sub x1, x1, x2, lsl #1 + ld1 {v0.8b}, [x5] + sub x1, x1, x2 + sxtl v0.8h, v0.8b + ld1 {v20.8h}, [x1], x2 + ld1 {v21.8h}, [x1], x2 + ld1 {v22.8h}, [x1], x2 + ld1 {v23.8h}, [x1], x2 + ld1 {v24.8h}, [x1], x2 + ld1 {v25.8h}, [x1], x2 + ld1 {v26.8h}, [x1], x2 +1: + ld1 {v27.8h}, [x1], x2 + + smull v1.4s, v20.4h, v0.h[0] + smull2 v2.4s, v20.8h, v0.h[0] /* smull takes lanes 0-3 of the row, smull2 lanes 4-7 */ + smlal v1.4s, v21.4h, v0.h[1] + smlal2 v2.4s, v21.8h, v0.h[1] + smlal v1.4s, v22.4h, v0.h[2] + smlal2 v2.4s, v22.8h, v0.h[2] + smlal v1.4s, v23.4h, v0.h[3] + smlal2 v2.4s, v23.8h, v0.h[3] + smlal v1.4s, v24.4h, v0.h[4] + smlal2 v2.4s, v24.8h, v0.h[4] + smlal v1.4s, v25.4h, v0.h[5] + smlal2 v2.4s, v25.8h, v0.h[5] + smlal v1.4s, v26.4h, v0.h[6] + smlal2 v2.4s, v26.8h, v0.h[6] + smlal v1.4s, v27.4h, v0.h[7] + smlal2 v2.4s, v27.8h, v0.h[7] + sqshrn v1.4h, v1.4s, #(\shift) + sqshrn v2.4h, v2.4s, #(\shift) + + ld1 {v28.8h}, [x1], x2 + st1 {v1.4h-v2.4h}, [x0], x9 /* both 4-lane halves go out as one 8-sample row */ + + smull v3.4s, v21.4h, v0.h[0] + smull2 v4.4s, v21.8h, v0.h[0] + smlal v3.4s, v22.4h, v0.h[1] + smlal2 v4.4s, v22.8h, v0.h[1] + smlal v3.4s, v23.4h, v0.h[2] + smlal2 v4.4s, v23.8h, v0.h[2] + smlal v3.4s, v24.4h, v0.h[3] + smlal2 v4.4s, v24.8h, v0.h[3] + smlal v3.4s, v25.4h, v0.h[4] + smlal2 v4.4s, v25.8h, v0.h[4] + smlal v3.4s, v26.4h, v0.h[5] + smlal2 v4.4s, v26.8h, v0.h[5] + smlal v3.4s, v27.4h, v0.h[6] + smlal2 v4.4s, v27.8h, v0.h[6] + smlal v3.4s, v28.4h, v0.h[7] + smlal2 v4.4s, v28.8h, v0.h[7] + sqshrn v3.4h, v3.4s, #(\shift) + sqshrn v4.4h, v4.4s, #(\shift) + + ld1 {v29.8h}, [x1], x2 + st1 {v3.4h-v4.4h}, [x0], x9 + + smull v1.4s, v22.4h, v0.h[0] + smull2 v2.4s, v22.8h, v0.h[0] + smlal v1.4s, v23.4h, v0.h[1] + smlal2 v2.4s, v23.8h, v0.h[1] + smlal v1.4s, v24.4h, v0.h[2] + smlal2 v2.4s, v24.8h, v0.h[2] + smlal v1.4s, v25.4h, v0.h[3] + smlal2 v2.4s, v25.8h, v0.h[3] + smlal v1.4s, v26.4h, v0.h[4] + smlal2 v2.4s, v26.8h, v0.h[4] + smlal v1.4s, v27.4h, v0.h[5] + smlal2 v2.4s, v27.8h, v0.h[5] + smlal v1.4s, v28.4h, v0.h[6] + smlal2 v2.4s, 
v28.8h, v0.h[6] + smlal v1.4s, v29.4h, v0.h[7] + smlal2 v2.4s, v29.8h, v0.h[7] + sqshrn v1.4h, v1.4s, #(\shift) + sqshrn v2.4h, v2.4s, #(\shift) + + ld1 {v30.8h}, [x1], x2 + st1 {v1.4h-v2.4h}, [x0], x9 + + smull v3.4s, v23.4h, v0.h[0] + smull2 v4.4s, v23.8h, v0.h[0] + smlal v3.4s, v24.4h, v0.h[1] + smlal2 v4.4s, v24.8h, v0.h[1] + smlal v3.4s, v25.4h, v0.h[2] + smlal2 v4.4s, v25.8h, v0.h[2] + smlal v3.4s, v26.4h, v0.h[3] + smlal2 v4.4s, v26.8h, v0.h[3] + smlal v3.4s, v27.4h, v0.h[4] + smlal2 v4.4s, v27.8h, v0.h[4] + smlal v3.4s, v28.4h, v0.h[5] + smlal2 v4.4s, v28.8h, v0.h[5] + smlal v3.4s, v29.4h, v0.h[6] + smlal2 v4.4s, v29.8h, v0.h[6] + smlal v3.4s, v30.4h, v0.h[7] + smlal2 v4.4s, v30.8h, v0.h[7] + sqshrn v3.4h, v3.4s, #(\shift) + sqshrn v4.4h, v4.4s, #(\shift) + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + mov v24.16b, v28.16b + mov v25.16b, v29.16b + mov v26.16b, v30.16b /* same 4-row window slide as in the v4 macro */ + + subs w3, w3, #4 + st1 {v3.4h-v4.4h}, [x0], x9 + b.gt 1b + ret +.endm + +function ff_vvc_put_luma_v8_10_neon, export=1 + put_luma_v8_xx_neon 2 +endfunc + +function ff_vvc_put_luma_v8_12_neon, export=1 + put_luma_v8_xx_neon 4 +endfunc + +.macro put_luma_v_x16_vector_filter shift + smull v2.4s, v16.4h, v1.h[0] /* taps come from v1 here, not v0; even registers v16..v30 build v6, odd registers v17..v31 build v7 */ + smull2 v3.4s, v16.8h, v1.h[0] + smlal v2.4s, v18.4h, v1.h[1] + smlal2 v3.4s, v18.8h, v1.h[1] + smlal v2.4s, v20.4h, v1.h[2] + smlal2 v3.4s, v20.8h, v1.h[2] + smlal v2.4s, v22.4h, v1.h[3] + smlal2 v3.4s, v22.8h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal v2.4s, v26.4h, v1.h[5] + smlal2 v3.4s, v26.8h, v1.h[5] + smlal v2.4s, v28.4h, v1.h[6] + smlal2 v3.4s, v28.8h, v1.h[6] + smlal v2.4s, v30.4h, v1.h[7] + smlal2 v3.4s, v30.8h, v1.h[7] + + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v4.4s, v19.4h, v1.h[1] + smlal2 v5.4s, v19.8h, v1.h[1] + smlal v4.4s, v21.4h, v1.h[2] + smlal2 v5.4s, v21.8h, v1.h[2] + smlal v4.4s, v23.4h, v1.h[3] + smlal2 v5.4s, v23.8h, v1.h[3] + smlal v4.4s, v25.4h, 
v1.h[4] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal v4.4s, v27.4h, v1.h[5] + smlal2 v5.4s, v27.8h, v1.h[5] + smlal v4.4s, v29.4h, v1.h[6] + smlal2 v5.4s, v29.8h, v1.h[6] + smlal v4.4s, v31.4h, v1.h[7] + smlal2 v5.4s, v31.8h, v1.h[7] + + sqshrn v6.4h, v2.4s, #(\shift) + sqshrn v7.4h, v4.4s, #(\shift) + sqshrn2 v6.8h, v3.4s, #(\shift) + sqshrn2 v7.8h, v5.4s, #(\shift) +.endm + +.macro put_luma_v16_xx_neon shift + mov x9, #(VVC_MAX_PB_SIZE * 2) + sub x1, x1, x2, lsl #1 + ld1 {v0.8b}, [x5] + sub x1, x1, x2 + sxtl v0.8h, v0.8b + ld1 {v16.8h-v17.8h}, [x1], x2 + ld1 {v18.8h-v19.8h}, [x1], x2 + ld1 {v20.8h-v21.8h}, [x1], x2 + ld1 {v22.8h-v23.8h}, [x1], x2 + ld1 {v24.8h-v25.8h}, [x1], x2 + ld1 {v26.8h-v27.8h}, [x1], x2 + ld1 {v28.8h-v29.8h}, [x1], x2 +1: + mov v1.16b, v0.16b /* first row of a pass: taps in natural order */ + ld1 {v30.8h-v31.8h}, [x1], x2 + + put_luma_v_x16_vector_filter \shift + + ld1 {v16.8h-v17.8h}, [x1], x2 + ext v1.16b, v0.16b, v0.16b, #14 /* rotate the taps one lane instead of moving rows: v16/v17 was just refilled with the newest row and now lines up with tap 7 */ + st1 {v6.8h-v7.8h}, [x0], x9 + + put_luma_v_x16_vector_filter \shift + + ld1 {v18.8h-v19.8h}, [x1], x2 + ext v1.16b, v0.16b, v0.16b, #12 + st1 {v6.8h-v7.8h}, [x0], x9 + + put_luma_v_x16_vector_filter \shift + + ld1 {v20.8h-v21.8h}, [x1], x2 + ext v1.16b, v0.16b, v0.16b, #10 + st1 {v6.8h-v7.8h}, [x0], x9 + + put_luma_v_x16_vector_filter \shift + + subs w3, w3, #4 + st1 {v6.8h-v7.8h}, [x0], x9 + + mov v2.16b, v16.16b + mov v3.16b, v17.16b + mov v16.16b, v24.16b + mov v17.16b, v25.16b + mov v24.16b, v2.16b + mov v25.16b, v3.16b + + mov v2.16b, v18.16b + mov v3.16b, v19.16b + mov v18.16b, v26.16b + mov v19.16b, v27.16b + mov v26.16b, v2.16b + mov v27.16b, v3.16b + + mov v2.16b, v20.16b + mov v3.16b, v21.16b + mov v20.16b, v28.16b + mov v21.16b, v29.16b + mov v28.16b, v2.16b + mov v29.16b, v3.16b + + mov v22.16b, v30.16b + mov v23.16b, v31.16b /* after the swaps through v2/v3, v16-v29 again hold the seven most recent rows oldest first */ + b.gt 1b + ret +.endm + +function ff_vvc_put_luma_v16_10_neon, export=1 + put_luma_v16_xx_neon 2 +endfunc + +function ff_vvc_put_luma_v16_12_neon, export=1 + put_luma_v16_xx_neon 4 +endfunc + + +.macro put_luma_v_x16_xx_neon shift + 
mov x9, #(VVC_MAX_PB_SIZE * 2) + sub x1, x1, x2, lsl #1 + ld1 {v0.8b}, [x5] + sub x1, x1, x2 + sxtl v0.8h, v0.8b +1: + mov w8, #0 /* x8: column offset of the current 16-wide strip */ +2: + add x11, x1, x8, lsl #1 + add x10, x0, x8, lsl #1 /* x11 and x10: src and dst cursors for this strip, two bytes per sample */ + ld1 {v16.8h-v17.8h}, [x11], x2 + add x8, x8, #16 + ld1 {v18.8h-v19.8h}, [x11], x2 + cmp w8, w6 /* next column offset against width (w6); the result is used by b.lt 2b */ + ld1 {v20.8h-v21.8h}, [x11], x2 + mov v1.16b, v0.16b + ld1 {v22.8h-v23.8h}, [x11], x2 + ld1 {v24.8h-v25.8h}, [x11], x2 + ld1 {v26.8h-v27.8h}, [x11], x2 + ld1 {v28.8h-v29.8h}, [x11], x2 + ld1 {v30.8h-v31.8h}, [x11], x2 /* all eight rows are fetched again for every strip and pass */ + + put_luma_v_x16_vector_filter \shift + + ld1 {v16.8h-v17.8h}, [x11], x2 + ext v1.16b, v0.16b, v0.16b, #14 + st1 {v6.8h-v7.8h}, [x10], x9 + + put_luma_v_x16_vector_filter \shift + + st1 {v6.8h-v7.8h}, [x10], x9 + ext v1.16b, v0.16b, v0.16b, #12 + ld1 {v18.8h-v19.8h}, [x11], x2 + + put_luma_v_x16_vector_filter \shift + + ld1 {v20.8h-v21.8h}, [x11], x2 + ext v1.16b, v0.16b, v0.16b, #10 + st1 {v6.8h-v7.8h}, [x10], x9 + + put_luma_v_x16_vector_filter \shift + + st1 {v6.8h-v7.8h}, [x10], x9 + b.lt 2b + add x0, x0, x9, lsl #2 /* dst moves down four rows */ + subs w3, w3, #4 + add x1, x1, x2, lsl #2 + b.gt 1b + ret +.endm + +function ff_vvc_put_luma_v_x16_10_neon, export=1 + put_luma_v_x16_xx_neon 2 +endfunc + +function ff_vvc_put_luma_v_x16_12_neon, export=1 + put_luma_v_x16_xx_neon 4 +endfunc -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
