PR #22500 opened by george.zaguri URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22500 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22500.patch
Apple M4: put_chroma_v_10_2x2_c: 5.8 ( 1.00x) put_chroma_v_10_4x4_c: 9.0 ( 1.00x) put_chroma_v_10_4x4_neon: 1.7 ( 5.29x) put_chroma_v_10_8x8_c: 22.1 ( 1.00x) put_chroma_v_10_8x8_neon: 5.8 ( 3.79x) put_chroma_v_10_16x16_c: 56.3 ( 1.00x) put_chroma_v_10_16x16_neon: 21.2 ( 2.66x) put_chroma_v_10_32x32_c: 181.6 ( 1.00x) put_chroma_v_10_32x32_neon: 86.9 ( 2.09x) put_chroma_v_10_64x64_c: 680.3 ( 1.00x) put_chroma_v_10_64x64_neon: 337.4 ( 2.02x) put_chroma_v_10_128x128_c: 2567.3 ( 1.00x) put_chroma_v_10_128x128_neon: 1374.8 ( 1.87x) put_chroma_v_12_2x2_c: 6.4 ( 1.00x) put_chroma_v_12_4x4_c: 8.2 ( 1.00x) put_chroma_v_12_4x4_neon: 1.5 ( 5.56x) put_chroma_v_12_8x8_c: 18.9 ( 1.00x) put_chroma_v_12_8x8_neon: 5.7 ( 3.29x) put_chroma_v_12_16x16_c: 52.6 ( 1.00x) put_chroma_v_12_16x16_neon: 19.9 ( 2.65x) put_chroma_v_12_32x32_c: 185.7 ( 1.00x) put_chroma_v_12_32x32_neon: 81.9 ( 2.27x) put_chroma_v_12_64x64_c: 661.8 ( 1.00x) put_chroma_v_12_64x64_neon: 342.1 ( 1.93x) put_chroma_v_12_128x128_c: 2547.8 ( 1.00x) put_chroma_v_12_128x128_neon: 1368.0 ( 1.86x) RPi4: put_chroma_v_10_2x2_c: 64.8 ( 1.00x) put_chroma_v_10_4x4_c: 157.2 ( 1.00x) put_chroma_v_10_4x4_neon: 39.7 ( 3.96x) put_chroma_v_10_8x8_c: 562.1 ( 1.00x) put_chroma_v_10_8x8_neon: 98.8 ( 5.69x) put_chroma_v_10_16x16_c: 1170.7 ( 1.00x) put_chroma_v_10_16x16_neon: 380.7 ( 3.07x) put_chroma_v_10_32x32_c: 3696.6 ( 1.00x) put_chroma_v_10_32x32_neon: 1723.8 ( 2.14x) put_chroma_v_10_64x64_c: 13170.9 ( 1.00x) put_chroma_v_10_64x64_neon: 7284.1 ( 1.81x) put_chroma_v_10_128x128_c: 46068.3 ( 1.00x) put_chroma_v_10_128x128_neon: 27219.5 ( 1.69x) put_chroma_v_12_2x2_c: 63.8 ( 1.00x) put_chroma_v_12_4x4_c: 156.5 ( 1.00x) put_chroma_v_12_4x4_neon: 39.3 ( 3.98x) put_chroma_v_12_8x8_c: 560.9 ( 1.00x) put_chroma_v_12_8x8_neon: 98.7 ( 5.68x) put_chroma_v_12_16x16_c: 1169.9 ( 1.00x) put_chroma_v_12_16x16_neon: 380.8 ( 3.07x) put_chroma_v_12_32x32_c: 3693.9 ( 1.00x) put_chroma_v_12_32x32_neon: 1728.4 ( 2.14x) put_chroma_v_12_64x64_c: 13170.9 ( 
1.00x) put_chroma_v_12_64x64_neon: 7284.9 ( 1.81x) put_chroma_v_12_128x128_c: 46068.0 ( 1.00x) put_chroma_v_12_128x128_neon: 27224.6 ( 1.69x) >From 92ae528b4fdcbf79971e578155be696408806b61 Mon Sep 17 00:00:00 2001 From: Georgii Zagoruiko <[email protected]> Date: Sat, 14 Mar 2026 14:16:48 +0000 Subject: [PATCH] aarch64/vvc: Optimisations of put_chroma_v() functions for 10/12-bit Apple M4: put_chroma_v_10_2x2_c: 5.8 ( 1.00x) put_chroma_v_10_4x4_c: 9.0 ( 1.00x) put_chroma_v_10_4x4_neon: 1.7 ( 5.29x) put_chroma_v_10_8x8_c: 22.1 ( 1.00x) put_chroma_v_10_8x8_neon: 5.8 ( 3.79x) put_chroma_v_10_16x16_c: 56.3 ( 1.00x) put_chroma_v_10_16x16_neon: 21.2 ( 2.66x) put_chroma_v_10_32x32_c: 181.6 ( 1.00x) put_chroma_v_10_32x32_neon: 86.9 ( 2.09x) put_chroma_v_10_64x64_c: 680.3 ( 1.00x) put_chroma_v_10_64x64_neon: 337.4 ( 2.02x) put_chroma_v_10_128x128_c: 2567.3 ( 1.00x) put_chroma_v_10_128x128_neon: 1374.8 ( 1.87x) put_chroma_v_12_2x2_c: 6.4 ( 1.00x) put_chroma_v_12_4x4_c: 8.2 ( 1.00x) put_chroma_v_12_4x4_neon: 1.5 ( 5.56x) put_chroma_v_12_8x8_c: 18.9 ( 1.00x) put_chroma_v_12_8x8_neon: 5.7 ( 3.29x) put_chroma_v_12_16x16_c: 52.6 ( 1.00x) put_chroma_v_12_16x16_neon: 19.9 ( 2.65x) put_chroma_v_12_32x32_c: 185.7 ( 1.00x) put_chroma_v_12_32x32_neon: 81.9 ( 2.27x) put_chroma_v_12_64x64_c: 661.8 ( 1.00x) put_chroma_v_12_64x64_neon: 342.1 ( 1.93x) put_chroma_v_12_128x128_c: 2547.8 ( 1.00x) put_chroma_v_12_128x128_neon: 1368.0 ( 1.86x) RPi4: put_chroma_v_10_2x2_c: 64.8 ( 1.00x) put_chroma_v_10_4x4_c: 157.2 ( 1.00x) put_chroma_v_10_4x4_neon: 39.7 ( 3.96x) put_chroma_v_10_8x8_c: 562.1 ( 1.00x) put_chroma_v_10_8x8_neon: 98.8 ( 5.69x) put_chroma_v_10_16x16_c: 1170.7 ( 1.00x) put_chroma_v_10_16x16_neon: 380.7 ( 3.07x) put_chroma_v_10_32x32_c: 3696.6 ( 1.00x) put_chroma_v_10_32x32_neon: 1723.8 ( 2.14x) put_chroma_v_10_64x64_c: 13170.9 ( 1.00x) put_chroma_v_10_64x64_neon: 7284.1 ( 1.81x) put_chroma_v_10_128x128_c: 46068.3 ( 1.00x) put_chroma_v_10_128x128_neon: 27219.5 ( 1.69x) 
put_chroma_v_12_2x2_c: 63.8 ( 1.00x) put_chroma_v_12_4x4_c: 156.5 ( 1.00x) put_chroma_v_12_4x4_neon: 39.3 ( 3.98x) put_chroma_v_12_8x8_c: 560.9 ( 1.00x) put_chroma_v_12_8x8_neon: 98.7 ( 5.68x) put_chroma_v_12_16x16_c: 1169.9 ( 1.00x) put_chroma_v_12_16x16_neon: 380.8 ( 3.07x) put_chroma_v_12_32x32_c: 3693.9 ( 1.00x) put_chroma_v_12_32x32_neon: 1728.4 ( 2.14x) put_chroma_v_12_64x64_c: 13170.9 ( 1.00x) put_chroma_v_12_64x64_neon: 7284.9 ( 1.81x) put_chroma_v_12_128x128_c: 46068.0 ( 1.00x) put_chroma_v_12_128x128_neon: 27224.6 ( 1.69x)
---
 libavcodec/aarch64/vvc/dsp_init.c |  31 +++++
 libavcodec/aarch64/vvc/inter.S    | 223 ++++++++++++++++++++++++++++++
 2 files changed, 254 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c541695a5c..e5ed0ea244 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -56,6 +56,23 @@ void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrd
 void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                      const int height, const int8_t *hf, const int8_t *vf, const int width);
 
+void ff_vvc_put_chroma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v4_xx_neon 2
+void ff_vvc_put_chroma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v8_xx_neon 2
+void ff_vvc_put_chroma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v16_xx_neon 2
+void ff_vvc_put_chroma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v_x16_xx_neon 2 (the only variant that reads width)
+void ff_vvc_put_chroma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v4_xx_neon 4
+void ff_vvc_put_chroma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v8_xx_neon 4
+void ff_vvc_put_chroma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v16_xx_neon 4
+void ff_vvc_put_chroma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width); // inter.S: put_chroma_v_x16_xx_neon 4 (the only variant that reads width)
+
 void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                 const int height, const int8_t *hf, const int8_t *vf, const int width);
 void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
@@ -316,6 +333,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[0][5][0][1] =
             c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
 
+            c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_10_neon;   // first index 1 receives the chroma kernels; the luma_v kernels just below use first index 0
+            c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_10_neon;
+            c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_10_neon;
+            c->inter.put[1][4][1][0] =                                 // entries 4..6 share the kernel that walks the width in 16-sample strips
+            c->inter.put[1][5][1][0] =
+            c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_10_neon;
+
             c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon;
             c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon;
             c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon;
@@ -368,6 +392,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[0][5][1][0] =
             c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon;
 
+            c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_12_neon;   // first index 1 receives the chroma kernels; the luma_v kernels just above use first index 0
+            c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_12_neon;
+            c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_12_neon;
+            c->inter.put[1][4][1][0] =                                 // entries 4..6 share the kernel that walks the width in 16-sample strips
+            c->inter.put[1][5][1][0] =
+            c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_12_neon;
+
             c->alf.filter[LUMA] = alf_filter_luma_12_neon;
             c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
             c->alf.classify = alf_classify_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 092aad3b48..0dc1c59631 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -2356,6 +2356,229 @@ function ff_vvc_put_luma_v_x16_12_neon, export=1
         put_luma_v_x16_xx_neon 4
 endfunc
 
+.macro put_chroma_v4_xx_neon shift
+        // dst         .req x0   (one 4-sample row written per st1, then advanced by x9)
+        // _src        .req x1   (16-bit samples, read 4 per row)
+        // _src_stride .req x2   (added to x1 after every row load)
+        // height      .req x3   (decremented by 2 per loop iteration)
+        // hf          .req x4   (not read by this macro)
+        // vf          .req x5   (its first 4 bytes are the filter taps)
+        // width       .req x6   (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // post-increment applied to dst after every stored row
+        ldr             s0, [x5]                        // load the 4 tap bytes
+        sub             x1, x1, x2                      // start one row above _src
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bit; v0.h[0..3] are used below
+        ld1             {v20.4h}, [x1], x2              // preload three rows of history into v20, v21, v22
+        ld1             {v21.4h}, [x1], x2
+        ld1             {v22.4h}, [x1], x2
+1:
+        ld1             {v23.4h}, [x1], x2              // fourth row for the first output row
+
+        smull           v1.4s, v20.4h, v0.h[0]          // first output row, split over two accumulators:
+        smull           v2.4s, v21.4h, v0.h[1]          //   v1 = row0*tap0 + row2*tap2, v2 = row1*tap1 + row3*tap3
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal           v2.4s, v23.4h, v0.h[3]
+
+        ld1             {v24.4h}, [x1], x2              // fifth row, only needed by the second output row
+
+        smull           v3.4s, v21.4h, v0.h[0]          // second output row: same taps, rows shifted down by one
+        smull           v4.4s, v22.4h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal           v4.4s, v24.4h, v0.h[3]
+
+        add             v1.4s, v1.4s, v2.4s             // combine the partial sums of each row
+        add             v3.4s, v3.4s, v4.4s
+        sqshrn          v1.4h, v1.4s, #(\shift)         // >> shift with signed saturation to 16 bit
+        sqshrn          v3.4h, v3.4s, #(\shift)
+
+        st1             {v1.4h}, [x0], x9
+        mov             v20.16b, v22.16b                // slide the row history down by two rows for the next iteration
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2                      // two rows done; flags feed the b.gt below
+        st1             {v3.4h}, [x0], x9
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v4_10_neon, export=1
+        put_chroma_v4_xx_neon 2
+endfunc
+
+function ff_vvc_put_chroma_v4_12_neon, export=1
+        put_chroma_v4_xx_neon 4
+endfunc
+
+.macro put_chroma_v8_xx_neon shift
+        // dst         .req x0   (one 8-sample row written per st1, then advanced by x9)
+        // _src        .req x1   (16-bit samples, read 8 per row)
+        // _src_stride .req x2   (added to x1 after every row load)
+        // height      .req x3   (decremented by 2 per loop iteration)
+        // hf          .req x4   (not read by this macro)
+        // vf          .req x5   (its first 4 bytes are the filter taps)
+        // width       .req x6   (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // post-increment applied to dst after every stored row
+        ldr             s0, [x5]                        // load the 4 tap bytes
+        sub             x1, x1, x2                      // start one row above _src
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bit
+        ld1             {v20.8h}, [x1], x2              // preload three rows of history into v20, v21, v22
+        ld1             {v21.8h}, [x1], x2
+        ld1             {v22.8h}, [x1], x2
+1:
+        ld1             {v23.8h}, [x1], x2              // fourth row for the first output row
+
+        smull           v1.4s, v20.4h, v0.h[0]          // first output row: v1 accumulates lanes 0..3,
+        smull2          v2.4s, v20.8h, v0.h[0]          //   v2 accumulates lanes 4..7
+        smlal           v1.4s, v21.4h, v0.h[1]
+        smlal2          v2.4s, v21.8h, v0.h[1]
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal2          v2.4s, v22.8h, v0.h[2]
+        smlal           v1.4s, v23.4h, v0.h[3]
+        smlal2          v2.4s, v23.8h, v0.h[3]
+        sqshrn          v1.4h, v1.4s, #(\shift)         // >> shift with signed saturation to 16 bit
+        sqshrn          v2.4h, v2.4s, #(\shift)
+
+        ld1             {v24.8h}, [x1], x2              // fifth row, only needed by the second output row
+        st1             {v1.4h-v2.4h}, [x0], x9         // two 4h registers back to back = 8 output samples
+
+        smull           v3.4s, v21.4h, v0.h[0]          // second output row: same taps, rows shifted down by one
+        smull2          v4.4s, v21.8h, v0.h[0]
+        smlal           v3.4s, v22.4h, v0.h[1]
+        smlal2          v4.4s, v22.8h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal2          v4.4s, v23.8h, v0.h[2]
+        smlal           v3.4s, v24.4h, v0.h[3]
+        smlal2          v4.4s, v24.8h, v0.h[3]
+        sqshrn          v3.4h, v3.4s, #(\shift)
+        sqshrn          v4.4h, v4.4s, #(\shift)
+
+        mov             v20.16b, v22.16b                // slide the row history down by two rows for the next iteration
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2                      // two rows done; flags feed the b.gt below
+        st1             {v3.4h-v4.4h}, [x0], x9
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v8_10_neon, export=1
+        put_chroma_v8_xx_neon 2
+endfunc
+
+function ff_vvc_put_chroma_v8_12_neon, export=1
+        put_chroma_v8_xx_neon 4
+endfunc
+
+.macro put_chroma_v_x16_horizontal_filter shift, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           v2.4s, \src0\().4h, v0.h[0]     // v2/v3 = 4-tap sum over src0, src2, src4, src6 (both callers pass the first 8 columns of four successive rows)
+        smull2          v3.4s, \src0\().8h, v0.h[0]     // NOTE(review): the taps run across rows here, so "horizontal" in the macro name looks like a misnomer
+        smlal           v2.4s, \src2\().4h, v0.h[1]
+        smlal2          v3.4s, \src2\().8h, v0.h[1]
+        smlal           v2.4s, \src4\().4h, v0.h[2]
+        smlal2          v3.4s, \src4\().8h, v0.h[2]
+        smlal           v2.4s, \src6\().4h, v0.h[3]
+        smlal2          v3.4s, \src6\().8h, v0.h[3]
+
+        smull           v4.4s, \src1\().4h, v0.h[0]     // v4/v5 = the same sum over src1, src3, src5, src7 (columns 8..15 in both callers)
+        smull2          v5.4s, \src1\().8h, v0.h[0]
+        smlal           v4.4s, \src3\().4h, v0.h[1]
+        smlal2          v5.4s, \src3\().8h, v0.h[1]
+        smlal           v4.4s, \src5\().4h, v0.h[2]
+        smlal2          v5.4s, \src5\().8h, v0.h[2]
+        smlal           v4.4s, \src7\().4h, v0.h[3]
+        smlal2          v5.4s, \src7\().8h, v0.h[3]
+
+        sqshrn          v6.4h, v2.4s, #(\shift)         // results land in v6 (from v2/v3) and v7 (from v4/v5); v2-v5 are overwritten as scratch
+        sqshrn          v7.4h, v4.4s, #(\shift)
+        sqshrn2         v6.8h, v3.4s, #(\shift)         // upper halves filled by the "2" forms
+        sqshrn2         v7.8h, v5.4s, #(\shift)
+.endm
+
+.macro put_chroma_v16_xx_neon shift
+        // dst         .req x0   (one 16-sample row written per st1, then advanced by x9)
+        // _src        .req x1   (16-bit samples, read 16 per row as a register pair)
+        // _src_stride .req x2   (added to x1 after every row load)
+        // height      .req x3   (decremented by 2 per loop iteration)
+        // hf          .req x4   (not read by this macro)
+        // vf          .req x5   (its first 4 bytes are the filter taps)
+        // width       .req x6   (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // post-increment applied to dst after every stored row
+        ldr             s0, [x5]                        // load the 4 tap bytes
+        sub             x1, x1, x2                      // start one row above _src
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bit
+        ld1             {v16.8h-v17.8h}, [x1], x2       // preload three rows of history: v16/v17, v18/v19, v20/v21
+        ld1             {v18.8h-v19.8h}, [x1], x2
+        ld1             {v20.8h-v21.8h}, [x1], x2
+1:
+        ld1             {v22.8h-v23.8h}, [x1], x2       // fourth row for the first output row
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1             {v24.8h-v25.8h}, [x1], x2       // fifth row, only needed by the second output row
+        st1             {v6.8h-v7.8h}, [x0], x9         // must be stored before the next macro call overwrites v6/v7
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        subs            w3, w3, #2                      // two rows done; flags feed the b.gt after the moves
+        st1             {v6.8h-v7.8h}, [x0], x9
+
+        mov             v16.16b, v20.16b                // slide the row history down by two rows for the next iteration
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v16_10_neon, export=1
+        put_chroma_v16_xx_neon 2
+endfunc
+
+function ff_vvc_put_chroma_v16_12_neon, export=1
+        put_chroma_v16_xx_neon 4
+endfunc
+
+.macro put_chroma_v_x16_xx_neon shift
+        // dst         .req x0   (advanced by two rows per outer iteration)
+        // _src        .req x1   (advanced by two rows per outer iteration)
+        // _src_stride .req x2
+        // height      .req x3   (decremented by 2 per outer iteration)
+        // hf          .req x4   (not read by this macro)
+        // vf          .req x5   (its first 4 bytes are the filter taps)
+        // width       .req x6   (compared against the column offset, which grows in steps of 16)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // dst row pitch used for the per-row store post-increment
+        ldr             s0, [x5]                        // load the 4 tap bytes
+        sub             x1, x1, x2                      // start one row above _src
+        sxtl            v0.8h, v0.8b                    // sign-extend the taps to 16 bit
+1:
+        mov             w8, #0                          // column offset in samples, restarted for every pair of rows
+2:
+        add             x11, x1, x8, lsl #1             // x11 = source pointer for this 16-column strip (offset scaled by 2)
+        add             x10, x0, x8, lsl #1             // x10 = destination pointer for this strip
+        ld1             {v16.8h-v17.8h}, [x11], x2      // all five rows are loaded afresh for every strip
+        add             x8, x8, #16
+        ld1             {v18.8h-v19.8h}, [x11], x2
+        cmp             w8, w6                          // sets the flags consumed by the b.lt at the end of the strip
+        ld1             {v20.8h-v21.8h}, [x11], x2
+        ld1             {v22.8h-v23.8h}, [x11], x2
+        ld1             {v24.8h-v25.8h}, [x11], x2
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        st1             {v6.8h-v7.8h}, [x10], x9        // first output row of this strip
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        st1             {v6.8h-v7.8h}, [x10], x9        // second output row of this strip
+        b.lt            2b                              // more strips while the offset is below width
+        add             x0, x0, x9, lsl #1              // dst += 2 rows
+        subs            w3, w3, #2                      // two rows done; flags feed the b.gt below
+        add             x1, x1, x2, lsl #1              // src += 2 rows
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v_x16_10_neon, export=1
+        put_chroma_v_x16_xx_neon 2
+endfunc
+
+function ff_vvc_put_chroma_v_x16_12_neon, export=1
+        put_chroma_v_x16_xx_neon 4
+endfunc
 
 .macro put_luma_hv_x8_horizontal_filter shift, dst, src0, src1
         ext             v2.16b, \src0\().16b, \src1\().16b, #2
-- 
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
