PR #22500 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22500
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22500.patch

Apple M4:
put_chroma_v_10_2x2_c:                                   5.8 ( 1.00x)
put_chroma_v_10_4x4_c:                                   9.0 ( 1.00x)
put_chroma_v_10_4x4_neon:                                1.7 ( 5.29x)
put_chroma_v_10_8x8_c:                                  22.1 ( 1.00x)
put_chroma_v_10_8x8_neon:                                5.8 ( 3.79x)
put_chroma_v_10_16x16_c:                                56.3 ( 1.00x)
put_chroma_v_10_16x16_neon:                             21.2 ( 2.66x)
put_chroma_v_10_32x32_c:                               181.6 ( 1.00x)
put_chroma_v_10_32x32_neon:                             86.9 ( 2.09x)
put_chroma_v_10_64x64_c:                               680.3 ( 1.00x)
put_chroma_v_10_64x64_neon:                            337.4 ( 2.02x)
put_chroma_v_10_128x128_c:                            2567.3 ( 1.00x)
put_chroma_v_10_128x128_neon:                         1374.8 ( 1.87x)
put_chroma_v_12_2x2_c:                                   6.4 ( 1.00x)
put_chroma_v_12_4x4_c:                                   8.2 ( 1.00x)
put_chroma_v_12_4x4_neon:                                1.5 ( 5.56x)
put_chroma_v_12_8x8_c:                                  18.9 ( 1.00x)
put_chroma_v_12_8x8_neon:                                5.7 ( 3.29x)
put_chroma_v_12_16x16_c:                                52.6 ( 1.00x)
put_chroma_v_12_16x16_neon:                             19.9 ( 2.65x)
put_chroma_v_12_32x32_c:                               185.7 ( 1.00x)
put_chroma_v_12_32x32_neon:                             81.9 ( 2.27x)
put_chroma_v_12_64x64_c:                               661.8 ( 1.00x)
put_chroma_v_12_64x64_neon:                            342.1 ( 1.93x)
put_chroma_v_12_128x128_c:                            2547.8 ( 1.00x)
put_chroma_v_12_128x128_neon:                         1368.0 ( 1.86x)

RPi4:
put_chroma_v_10_2x2_c:                                  64.8 ( 1.00x)
put_chroma_v_10_4x4_c:                                 157.2 ( 1.00x)
put_chroma_v_10_4x4_neon:                               39.7 ( 3.96x)
put_chroma_v_10_8x8_c:                                 562.1 ( 1.00x)
put_chroma_v_10_8x8_neon:                               98.8 ( 5.69x)
put_chroma_v_10_16x16_c:                              1170.7 ( 1.00x)
put_chroma_v_10_16x16_neon:                            380.7 ( 3.07x)
put_chroma_v_10_32x32_c:                              3696.6 ( 1.00x)
put_chroma_v_10_32x32_neon:                           1723.8 ( 2.14x)
put_chroma_v_10_64x64_c:                             13170.9 ( 1.00x)
put_chroma_v_10_64x64_neon:                           7284.1 ( 1.81x)
put_chroma_v_10_128x128_c:                           46068.3 ( 1.00x)
put_chroma_v_10_128x128_neon:                        27219.5 ( 1.69x)
put_chroma_v_12_2x2_c:                                  63.8 ( 1.00x)
put_chroma_v_12_4x4_c:                                 156.5 ( 1.00x)
put_chroma_v_12_4x4_neon:                               39.3 ( 3.98x)
put_chroma_v_12_8x8_c:                                 560.9 ( 1.00x)
put_chroma_v_12_8x8_neon:                               98.7 ( 5.68x)
put_chroma_v_12_16x16_c:                              1169.9 ( 1.00x)
put_chroma_v_12_16x16_neon:                            380.8 ( 3.07x)
put_chroma_v_12_32x32_c:                              3693.9 ( 1.00x)
put_chroma_v_12_32x32_neon:                           1728.4 ( 2.14x)
put_chroma_v_12_64x64_c:                             13170.9 ( 1.00x)
put_chroma_v_12_64x64_neon:                           7284.9 ( 1.81x)
put_chroma_v_12_128x128_c:                           46068.0 ( 1.00x)
put_chroma_v_12_128x128_neon:                        27224.6 ( 1.69x)


From 92ae528b4fdcbf79971e578155be696408806b61 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <[email protected]>
Date: Sat, 14 Mar 2026 14:16:48 +0000
Subject: [PATCH] aarch64/vvc: Optimisations of put_chroma_v() functions for
 10/12-bit

Apple M4:
put_chroma_v_10_2x2_c:                                   5.8 ( 1.00x)
put_chroma_v_10_4x4_c:                                   9.0 ( 1.00x)
put_chroma_v_10_4x4_neon:                                1.7 ( 5.29x)
put_chroma_v_10_8x8_c:                                  22.1 ( 1.00x)
put_chroma_v_10_8x8_neon:                                5.8 ( 3.79x)
put_chroma_v_10_16x16_c:                                56.3 ( 1.00x)
put_chroma_v_10_16x16_neon:                             21.2 ( 2.66x)
put_chroma_v_10_32x32_c:                               181.6 ( 1.00x)
put_chroma_v_10_32x32_neon:                             86.9 ( 2.09x)
put_chroma_v_10_64x64_c:                               680.3 ( 1.00x)
put_chroma_v_10_64x64_neon:                            337.4 ( 2.02x)
put_chroma_v_10_128x128_c:                            2567.3 ( 1.00x)
put_chroma_v_10_128x128_neon:                         1374.8 ( 1.87x)
put_chroma_v_12_2x2_c:                                   6.4 ( 1.00x)
put_chroma_v_12_4x4_c:                                   8.2 ( 1.00x)
put_chroma_v_12_4x4_neon:                                1.5 ( 5.56x)
put_chroma_v_12_8x8_c:                                  18.9 ( 1.00x)
put_chroma_v_12_8x8_neon:                                5.7 ( 3.29x)
put_chroma_v_12_16x16_c:                                52.6 ( 1.00x)
put_chroma_v_12_16x16_neon:                             19.9 ( 2.65x)
put_chroma_v_12_32x32_c:                               185.7 ( 1.00x)
put_chroma_v_12_32x32_neon:                             81.9 ( 2.27x)
put_chroma_v_12_64x64_c:                               661.8 ( 1.00x)
put_chroma_v_12_64x64_neon:                            342.1 ( 1.93x)
put_chroma_v_12_128x128_c:                            2547.8 ( 1.00x)
put_chroma_v_12_128x128_neon:                         1368.0 ( 1.86x)

RPi4:
put_chroma_v_10_2x2_c:                                  64.8 ( 1.00x)
put_chroma_v_10_4x4_c:                                 157.2 ( 1.00x)
put_chroma_v_10_4x4_neon:                               39.7 ( 3.96x)
put_chroma_v_10_8x8_c:                                 562.1 ( 1.00x)
put_chroma_v_10_8x8_neon:                               98.8 ( 5.69x)
put_chroma_v_10_16x16_c:                              1170.7 ( 1.00x)
put_chroma_v_10_16x16_neon:                            380.7 ( 3.07x)
put_chroma_v_10_32x32_c:                              3696.6 ( 1.00x)
put_chroma_v_10_32x32_neon:                           1723.8 ( 2.14x)
put_chroma_v_10_64x64_c:                             13170.9 ( 1.00x)
put_chroma_v_10_64x64_neon:                           7284.1 ( 1.81x)
put_chroma_v_10_128x128_c:                           46068.3 ( 1.00x)
put_chroma_v_10_128x128_neon:                        27219.5 ( 1.69x)
put_chroma_v_12_2x2_c:                                  63.8 ( 1.00x)
put_chroma_v_12_4x4_c:                                 156.5 ( 1.00x)
put_chroma_v_12_4x4_neon:                               39.3 ( 3.98x)
put_chroma_v_12_8x8_c:                                 560.9 ( 1.00x)
put_chroma_v_12_8x8_neon:                               98.7 ( 5.68x)
put_chroma_v_12_16x16_c:                              1169.9 ( 1.00x)
put_chroma_v_12_16x16_neon:                            380.8 ( 3.07x)
put_chroma_v_12_32x32_c:                              3693.9 ( 1.00x)
put_chroma_v_12_32x32_neon:                           1728.4 ( 2.14x)
put_chroma_v_12_64x64_c:                             13170.9 ( 1.00x)
put_chroma_v_12_64x64_neon:                           7284.9 ( 1.81x)
put_chroma_v_12_128x128_c:                           46068.0 ( 1.00x)
put_chroma_v_12_128x128_neon:                        27224.6 ( 1.69x)
---
 libavcodec/aarch64/vvc/dsp_init.c |  31 +++++
 libavcodec/aarch64/vvc/inter.S    | 223 ++++++++++++++++++++++++++++++
 2 files changed, 254 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c541695a5c..e5ed0ea244 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -56,6 +56,23 @@ void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrd
 void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                      const int height, const int8_t *hf, const int8_t *vf, const int width);
 
+void ff_vvc_put_chroma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_chroma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                     const int height, const int8_t *hf, const int8_t *vf, const int width);
+
 void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                 const int height, const int8_t *hf, const int8_t *vf, const int width);
 void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
@@ -316,6 +333,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][0][1] =
         c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
 
+        c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_10_neon;
+        c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_10_neon;
+        c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_10_neon;
+        c->inter.put[1][4][1][0] =
+        c->inter.put[1][5][1][0] =
+        c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_10_neon;
+
         c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon;
         c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon;
         c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon;
@@ -368,6 +392,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][1][0] =
         c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon;
 
+        c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_12_neon;
+        c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_12_neon;
+        c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_12_neon;
+        c->inter.put[1][4][1][0] =
+        c->inter.put[1][5][1][0] =
+        c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_12_neon;
+
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
         c->alf.classify = alf_classify_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 092aad3b48..0dc1c59631 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -2356,6 +2356,229 @@ function ff_vvc_put_luma_v_x16_12_neon, export=1
         put_luma_v_x16_xx_neon 4
 endfunc
 
+.macro put_chroma_v4_xx_neon shift
+        // dst         .req x0  (output rows, VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1  (4 x 16-bit samples are read per row)
+        // _src_stride .req x2  (added to x1 after every row load, in bytes)
+        // height      .req x3  (row count, consumed two at a time via w3)
+        // hf          .req x4  (not read by this macro)
+        // vf          .req x5  (four 8-bit taps, sign-extended into v0.h[0..3])
+        // width       .req x6  (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // byte distance between dst rows
+        ldr             s0, [x5]                        // the 4 tap bytes
+        sub             x1, x1, x2                      // tap 0 applies one row above _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v20.4h}, [x1], x2              // prime the window: rows -1, 0, +1
+        ld1             {v21.4h}, [x1], x2
+        ld1             {v22.4h}, [x1], x2
+1:
+        ld1             {v23.4h}, [x1], x2              // 4th row for the first output row
+
+        smull           v1.4s, v20.4h, v0.h[0]          // first row: taps 0+2 accumulate in v1,
+        smull           v2.4s, v21.4h, v0.h[1]          // taps 1+3 in v2 (two independent chains)
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal           v2.4s, v23.4h, v0.h[3]
+
+        ld1             {v24.4h}, [x1], x2              // 5th row, needed by the second output row
+
+        smull           v3.4s, v21.4h, v0.h[0]          // second row: same taps, window one row lower
+        smull           v4.4s, v22.4h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal           v4.4s, v24.4h, v0.h[3]
+
+        add             v1.4s, v1.4s, v2.4s             // merge the partial sums
+        add             v3.4s, v3.4s, v4.4s
+        sqshrn          v1.4h, v1.4s, #(\shift)         // >> shift, saturated to signed 16 bit
+        sqshrn          v3.4h, v3.4s, #(\shift)
+
+        st1             {v1.4h}, [x0], x9
+        mov             v20.16b, v22.16b                // slide the 3-row history down by two rows
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2                      // two rows finished
+        st1             {v3.4h}, [x0], x9               // store does not disturb the flags from subs
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v4_10_neon, export=1
+        put_chroma_v4_xx_neon 2                         // _10_ variant: narrowing shift of 2
+endfunc
+
+function ff_vvc_put_chroma_v4_12_neon, export=1
+        put_chroma_v4_xx_neon 4                         // _12_ variant: narrowing shift of 4
+endfunc
+
+.macro put_chroma_v8_xx_neon shift
+        // dst         .req x0  (output rows, VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1  (8 x 16-bit samples are read per row)
+        // _src_stride .req x2  (added to x1 after every row load, in bytes)
+        // height      .req x3  (row count, consumed two at a time via w3)
+        // hf          .req x4  (not read by this macro)
+        // vf          .req x5  (four 8-bit taps, sign-extended into v0.h[0..3])
+        // width       .req x6  (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // byte distance between dst rows
+        ldr             s0, [x5]                        // the 4 tap bytes
+        sub             x1, x1, x2                      // tap 0 applies one row above _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v20.8h}, [x1], x2              // prime the window: rows -1, 0, +1
+        ld1             {v21.8h}, [x1], x2
+        ld1             {v22.8h}, [x1], x2
+1:
+        ld1             {v23.8h}, [x1], x2              // 4th row for the first output row
+
+        smull           v1.4s, v20.4h, v0.h[0]          // first row: v1 = lanes 0-3,
+        smull2          v2.4s, v20.8h, v0.h[0]          // v2 = lanes 4-7
+        smlal           v1.4s, v21.4h, v0.h[1]
+        smlal2          v2.4s, v21.8h, v0.h[1]
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal2          v2.4s, v22.8h, v0.h[2]
+        smlal           v1.4s, v23.4h, v0.h[3]
+        smlal2          v2.4s, v23.8h, v0.h[3]
+        sqshrn          v1.4h, v1.4s, #(\shift)         // >> shift, saturated to signed 16 bit
+        sqshrn          v2.4h, v2.4s, #(\shift)
+
+        ld1             {v24.8h}, [x1], x2              // 5th row, needed by the second output row
+        st1             {v1.4h-v2.4h}, [x0], x9         // 8 results stored as two 64-bit halves
+
+        smull           v3.4s, v21.4h, v0.h[0]          // second row: same taps, window one row lower
+        smull2          v4.4s, v21.8h, v0.h[0]
+        smlal           v3.4s, v22.4h, v0.h[1]
+        smlal2          v4.4s, v22.8h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal2          v4.4s, v23.8h, v0.h[2]
+        smlal           v3.4s, v24.4h, v0.h[3]
+        smlal2          v4.4s, v24.8h, v0.h[3]
+        sqshrn          v3.4h, v3.4s, #(\shift)
+        sqshrn          v4.4h, v4.4s, #(\shift)
+
+        mov             v20.16b, v22.16b                // slide the 3-row history down by two rows
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        subs            w3, w3, #2                      // two rows finished
+        st1             {v3.4h-v4.4h}, [x0], x9         // store does not disturb the flags from subs
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v8_10_neon, export=1
+        put_chroma_v8_xx_neon 2                         // _10_ variant: narrowing shift of 2
+endfunc
+
+function ff_vvc_put_chroma_v8_12_neon, export=1
+        put_chroma_v8_xx_neon 4                         // _12_ variant: narrowing shift of 4
+endfunc
+
+.macro put_chroma_v_x16_horizontal_filter shift, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           v2.4s, \src0\().4h, v0.h[0]     // src0/2/4/6 x taps v0.h[0..3]:
+        smull2          v3.4s, \src0\().8h, v0.h[0]     // v2 = lanes 0-3, v3 = lanes 4-7
+        smlal           v2.4s, \src2\().4h, v0.h[1]
+        smlal2          v3.4s, \src2\().8h, v0.h[1]
+        smlal           v2.4s, \src4\().4h, v0.h[2]
+        smlal2          v3.4s, \src4\().8h, v0.h[2]
+        smlal           v2.4s, \src6\().4h, v0.h[3]
+        smlal2          v3.4s, \src6\().8h, v0.h[3]
+
+        smull           v4.4s, \src1\().4h, v0.h[0]     // src1/3/5/7 x the same taps:
+        smull2          v5.4s, \src1\().8h, v0.h[0]     // v4 = lanes 0-3, v5 = lanes 4-7
+        smlal           v4.4s, \src3\().4h, v0.h[1]
+        smlal2          v5.4s, \src3\().8h, v0.h[1]
+        smlal           v4.4s, \src5\().4h, v0.h[2]
+        smlal2          v5.4s, \src5\().8h, v0.h[2]
+        smlal           v4.4s, \src7\().4h, v0.h[3]
+        smlal2          v5.4s, \src7\().8h, v0.h[3]
+
+        sqshrn          v6.4h, v2.4s, #(\shift)         // results: v6 = narrowed v2:v3,
+        sqshrn          v7.4h, v4.4s, #(\shift)         // v7 = narrowed v4:v5 (>> shift, saturated);
+        sqshrn2         v6.8h, v3.4s, #(\shift)         // v2-v5 are left clobbered
+        sqshrn2         v7.8h, v5.4s, #(\shift)
+.endm
+
+.macro put_chroma_v16_xx_neon shift
+        // dst         .req x0  (output rows, VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1  (16 x 16-bit samples are read per row)
+        // _src_stride .req x2  (added to x1 after every row load, in bytes)
+        // height      .req x3  (row count, consumed two at a time via w3)
+        // hf          .req x4  (not read by this macro)
+        // vf          .req x5  (four 8-bit taps, sign-extended into v0.h[0..3])
+        // width       .req x6  (not read by this macro)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // byte distance between dst rows
+        ldr             s0, [x5]                        // the 4 tap bytes
+        sub             x1, x1, x2                      // tap 0 applies one row above _src
+        sxtl            v0.8h, v0.8b
+        ld1             {v16.8h-v17.8h}, [x1], x2       // prime the window: rows -1, 0, +1
+        ld1             {v18.8h-v19.8h}, [x1], x2
+        ld1             {v20.8h-v21.8h}, [x1], x2
+1:
+        ld1             {v22.8h-v23.8h}, [x1], x2       // 4th row for the first output row
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1             {v24.8h-v25.8h}, [x1], x2       // 5th row, needed by the second output row
+        st1             {v6.8h-v7.8h}, [x0], x9
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        subs            w3, w3, #2                      // two rows finished; flags survive to b.gt
+        st1             {v6.8h-v7.8h}, [x0], x9
+
+        mov             v16.16b, v20.16b                // slide the 3-row history down by two rows
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v16_10_neon, export=1
+        put_chroma_v16_xx_neon 2                        // _10_ variant: narrowing shift of 2
+endfunc
+
+function ff_vvc_put_chroma_v16_12_neon, export=1
+        put_chroma_v16_xx_neon 4                        // _12_ variant: narrowing shift of 4
+endfunc
+
+.macro put_chroma_v_x16_xx_neon shift
+        // dst         .req x0  (output rows, VVC_MAX_PB_SIZE * 2 bytes apart)
+        // _src        .req x1  (16-bit samples, 16 per inner-loop pass)
+        // _src_stride .req x2  (byte step between source rows)
+        // height      .req x3  (row count, consumed two at a time via w3)
+        // hf          .req x4  (not read by this macro)
+        // vf          .req x5  (four 8-bit taps, sign-extended into v0.h[0..3])
+        // width       .req x6  (column count, compared as w6 in steps of 16)
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)      // byte distance between dst rows
+        ldr             s0, [x5]                        // the 4 tap bytes
+        sub             x1, x1, x2                      // tap 0 applies one row above _src
+        sxtl            v0.8h, v0.8b
+1:
+        mov             w8, #0                          // w8 = column offset in samples
+2:
+        add             x11, x1, x8, lsl #1             // 2 bytes per sample on both sides
+        add             x10, x0, x8, lsl #1
+        ld1             {v16.8h-v17.8h}, [x11], x2      // five source rows feed two output rows
+        add             x8, x8, #16
+        ld1             {v18.8h-v19.8h}, [x11], x2
+        cmp             w8, w6                          // flags are consumed by b.lt below
+        ld1             {v20.8h-v21.8h}, [x11], x2
+        ld1             {v22.8h-v23.8h}, [x11], x2
+        ld1             {v24.8h-v25.8h}, [x11], x2
+        put_chroma_v_x16_horizontal_filter \shift, v16, v17, v18, v19, v20, v21, v22, v23
+        st1             {v6.8h-v7.8h}, [x10], x9
+        put_chroma_v_x16_horizontal_filter \shift, v18, v19, v20, v21, v22, v23, v24, v25
+        st1             {v6.8h-v7.8h}, [x10], x9
+        b.lt            2b                              // next 16 columns of this row pair
+        add             x0, x0, x9, lsl #1              // advance dst by two rows
+        subs            w3, w3, #2
+        add             x1, x1, x2, lsl #1              // advance src by two rows
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_chroma_v_x16_10_neon, export=1
+        put_chroma_v_x16_xx_neon 2                      // _10_ variant: narrowing shift of 2
+endfunc
+
+function ff_vvc_put_chroma_v_x16_12_neon, export=1
+        put_chroma_v_x16_xx_neon 4                      // _12_ variant: narrowing shift of 4
+endfunc
 
 .macro put_luma_hv_x8_horizontal_filter shift, dst, src0, src1
         ext             v2.16b, \src0\().16b, \src1\().16b, #2
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to