[FFmpeg-devel] [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128) functions for 8-bit (PR #21194)

george.zaguri via ffmpeg-devel Sun, 14 Dec 2025 08:00:42 -0800

PR #21194 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194.patch


Apple M4:
put_luma_h_8_64x64_c:                                  644.5 ( 1.00x)
put_luma_h_8_64x64_neon:                               520.3 ( 1.24x)
put_luma_h_8_64x64_i8mm:                               440.9 ( 1.46x)
put_luma_h_8_64x64_sme:                                405.7 ( 1.59x)
put_luma_h_8_128x128_c:                               2340.3 ( 1.00x)
put_luma_h_8_128x128_neon:                            2078.7 ( 1.13x)
put_luma_h_8_128x128_i8mm:                            1711.9 ( 1.37x)
put_luma_h_8_128x128_sme:                             1604.5 ( 1.46x)


From 151199038279cbe8b7100ce2c41a73791f71bd45 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <[email protected]>
Date: Sun, 14 Dec 2025 15:58:39 +0000
Subject: [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128)
 functions for 8-bit

Apple M4:
put_luma_h_8_64x64_c:                                  644.5 ( 1.00x)
put_luma_h_8_64x64_neon:                               520.3 ( 1.24x)
put_luma_h_8_64x64_i8mm:                               440.9 ( 1.46x)
put_luma_h_8_64x64_sme:                                405.7 ( 1.59x)
put_luma_h_8_128x128_c:                               2340.3 ( 1.00x)
put_luma_h_8_128x128_neon:                            2078.7 ( 1.13x)
put_luma_h_8_128x128_i8mm:                            1711.9 ( 1.37x)
put_luma_h_8_128x128_sme:                             1604.5 ( 1.46x)
---
 libavcodec/aarch64/vvc/Makefile    |   1 +
 libavcodec/aarch64/vvc/dsp_init.c  |   6 ++
 libavcodec/aarch64/vvc/inter_sme.S | 132 +++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/inter_sme.S

diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index ed80338969..56282478a7 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -8,3 +8,4 @@ NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
                                            aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
                                            aarch64/h26x/sao_neon.o
+SME-OBJS-$(CONFIG_VVC_DECODER)          += aarch64/vvc/inter_sme.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index aa75d22b78..d86e431215 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -42,6 +42,8 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
                                  const int height, const int8_t *hf, const int8_t *vf, const int width);
 void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_8_sme(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                             const int height, const int8_t *hf, const int8_t *vf, const int width);
 
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
 
@@ -251,6 +253,10 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
             c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
+        if (have_sme(cpu_flags)) {
+            c->inter.put[0][5][0][1] =
+            c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_8_sme;
+        }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
diff --git a/libavcodec/aarch64/vvc/inter_sme.S b/libavcodec/aarch64/vvc/inter_sme.S
new file mode 100644
index 0000000000..d3592518cb
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter_sme.S
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+#if HAVE_SME
+ENABLE_SME
+
+/*
+ * void ff_vvc_put_luma_h_8_sme(int16_t *dst, const uint8_t *_src,
+ *                              const ptrdiff_t _src_stride, const int height,
+ *                              const int8_t *hf, const int8_t *vf,
+ *                              const int width);
+ *
+ * Horizontal 8-tap luma filter for 8-bit sources using SME outer products.
+ * hf points to the 8 filter taps; vf is not used. Consecutive dst rows are
+ * VVC_MAX_PB_SIZE int16_t elements apart.
+ *
+ * The block is processed in column strips of at most one streaming vector
+ * of bytes. Each source row is loaded at byte offsets 0..7; USMOPA combines
+ * groups of four bytes with taps 0..3 (z30) or taps 4..7 (z31), so row i of
+ * column 0 of ZA tile N accumulates output pixel 4 * i + N of the strip.
+ * The inner loop handles two rows per iteration.
+ */
+function ff_vvc_put_luma_h_8_sme, export=1
+        // dst           .req x0
+        // _src          .req x1
+        // _src_stride   .req x2
+        // height        .req w3
+        // hf            .req x4
+        // vf            .req x5
+        // width         .req w6
+
+        // Changing PSTATE.SM resets the SIMD&FP/SVE registers, but AAPCS64
+        // requires the callee to preserve d8-d15.
+        stp             d8,  d9,  [sp, #-64]!
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+        // The upper halves of 32-bit arguments are unspecified, and x3/x6
+        // are used as 64-bit operands below.
+        mov             w3, w3
+        mov             w6, w6
+        smstart
+        zero            {za}                        // smstart only clears ZA if it was off
+        cntb            x8                          // bytes per streaming vector
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // dst stride in bytes
+        mov             w13, #0                     // ZA slice index
+        mov             w14, #1                     // byte offsets for the shifted loads
+        mov             w15, #2
+        mov             w16, #3
+        ptrue           p0.b, VL8
+        ptrue           p1.s
+        ld1b            z30.b, p0/z, [x4]           // z30 = taps 0..7, rest zero
+        eor             z0.d, z0.d, z0.d
+        mov             z31.d, z30.d
+        sub             x1, x1, #3                  // filter window starts 3 bytes before src
+        ext             z31.b, z31.b, z0.b, #4      // z31 = taps 4..7, rest zero
+1:      // next column strip
+        cmp             w6, w8
+        csel            w11, w6, w8, ls             // w11 = min(width left, x8)
+        whilelo         p0.b, xzr, x6
+        mov             w10, w3                     // w10 = rows left in this strip
+        asr             w12, w11, #1
+        whilelo         p2.h, xzr, x12              // st2h stores 2 * (w11 / 2) elements
+2:      // next pair of rows
+        ld1b            z0.b, p0/z, [x1]            // first row at byte offsets 0..3
+        ld1b            z1.b, p0/z, [x1, x14]
+        ld1b            z2.b, p0/z, [x1, x15]
+        ld1b            z3.b, p0/z, [x1, x16]
+        add             x1, x1, #4
+        ld1b            z4.b, p0/z, [x1]            // first row at byte offsets 4..7
+        ld1b            z5.b, p0/z, [x1, x14]
+        ld1b            z6.b, p0/z, [x1, x15]
+        ld1b            z7.b, p0/z, [x1, x16]
+        sub             x1, x1, #4
+        usmopa          za0.s, p0/m, p0/m, z0.b, z30.b  // taps 0..3
+        usmopa          za1.s, p0/m, p0/m, z1.b, z30.b
+        usmopa          za2.s, p0/m, p0/m, z2.b, z30.b
+        usmopa          za3.s, p0/m, p0/m, z3.b, z30.b
+        usmopa          za0.s, p0/m, p0/m, z4.b, z31.b  // taps 4..7
+        usmopa          za1.s, p0/m, p0/m, z5.b, z31.b
+        usmopa          za2.s, p0/m, p0/m, z6.b, z31.b
+        usmopa          za3.s, p0/m, p0/m, z7.b, z31.b
+        mova            z22.s, p1/m, za0v.s[w13, 0] // pixels 4 * i
+        mova            z24.s, p1/m, za1v.s[w13, 0] // pixels 4 * i + 1
+        mova            z26.s, p1/m, za2v.s[w13, 0] // pixels 4 * i + 2
+        mova            z28.s, p1/m, za3v.s[w13, 0] // pixels 4 * i + 3
+        add             x1, x1, x2
+        zero            {za}
+        ld1b            z0.b, p0/z, [x1]            // second row at byte offsets 0..3
+        ld1b            z1.b, p0/z, [x1, x14]
+        ld1b            z2.b, p0/z, [x1, x15]
+        ld1b            z3.b, p0/z, [x1, x16]
+        add             x1, x1, #4
+        ld1b            z4.b, p0/z, [x1]            // second row at byte offsets 4..7
+        ld1b            z5.b, p0/z, [x1, x14]
+        ld1b            z6.b, p0/z, [x1, x15]
+        ld1b            z7.b, p0/z, [x1, x16]
+        sub             x1, x1, #4
+        sqxtnb          z21.h, z22.s                // z21 = pixels 4 * i, 4 * i + 2
+        sqxtnb          z22.h, z24.s                // z22 = pixels 4 * i + 1, 4 * i + 3
+        sqxtnt          z21.h, z26.s
+        sqxtnt          z22.h, z28.s
+        st2h            {z21.h-z22.h}, p2, [x0]     // interleave back into pixel order
+        add             x1, x1, x2
+        add             x0, x0, x9
+
+        usmopa          za0.s, p0/m, p0/m, z0.b, z30.b  // taps 0..3
+        usmopa          za1.s, p0/m, p0/m, z1.b, z30.b
+        usmopa          za2.s, p0/m, p0/m, z2.b, z30.b
+        usmopa          za3.s, p0/m, p0/m, z3.b, z30.b
+        usmopa          za0.s, p0/m, p0/m, z4.b, z31.b  // taps 4..7
+        usmopa          za1.s, p0/m, p0/m, z5.b, z31.b
+        usmopa          za2.s, p0/m, p0/m, z6.b, z31.b
+        usmopa          za3.s, p0/m, p0/m, z7.b, z31.b
+        mova            z22.s, p1/m, za0v.s[w13, 0]
+        mova            z24.s, p1/m, za1v.s[w13, 0]
+        mova            z26.s, p1/m, za2v.s[w13, 0]
+        mova            z28.s, p1/m, za3v.s[w13, 0]
+        sqxtnb          z21.h, z22.s
+        sqxtnb          z22.h, z24.s
+        sqxtnt          z21.h, z26.s
+        sqxtnt          z22.h, z28.s
+        zero            {za}
+        st2h            {z21.h-z22.h}, p2, [x0]
+        subs            w10, w10, #2
+        add             x0, x0, x9
+        b.gt            2b
+        msub            x0, x3, x9, x0              // dst back up by height rows
+        msub            x1, x3, x2, x1              // src back up by height rows
+        add             x0, x0, x11, lsl #1         // dst forward by w11 int16_t
+        subs            w6, w6, w11
+        add             x1, x1, x11                 // src forward by w11 bytes
+        b.gt            1b
+        smstop
+        ldp             d10, d11, [sp, #16]
+        ldp             d12, d13, [sp, #32]
+        ldp             d14, d15, [sp, #48]
+        ldp             d8,  d9,  [sp], #64
+        ret
+endfunc
+
+DISABLE_SME
+#endif
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128) functions for 8-bit (PR #21194)

Reply via email to