This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 510999f6b07b6996983c7ef24c3cf41a06241261
Author:     Georgii Zagoruiko <[email protected]>
AuthorDate: Tue Mar 3 18:49:47 2026 +0000
Commit:     Martin Storsjö <[email protected]>
CommitDate: Wed Mar 4 23:52:58 2026 +0200

    aarch64/vvc: sme2 optimisation of alf_filter_luma() 8/10/12 bit
    
    Apple M4:
    vvc_alf_filter_luma_8x8_8_c:                           347.3 ( 1.00x)
    vvc_alf_filter_luma_8x8_8_neon:                        138.7 ( 2.50x)
    vvc_alf_filter_luma_8x8_8_sme2:                        134.5 ( 2.58x)
    vvc_alf_filter_luma_8x8_10_c:                          299.8 ( 1.00x)
    vvc_alf_filter_luma_8x8_10_neon:                       129.8 ( 2.31x)
    vvc_alf_filter_luma_8x8_10_sme2:                       128.6 ( 2.33x)
    vvc_alf_filter_luma_8x8_12_c:                          293.0 ( 1.00x)
    vvc_alf_filter_luma_8x8_12_neon:                       126.8 ( 2.31x)
    vvc_alf_filter_luma_8x8_12_sme2:                       126.3 ( 2.32x)
    vvc_alf_filter_luma_16x16_8_c:                        1386.1 ( 1.00x)
    vvc_alf_filter_luma_16x16_8_neon:                      560.3 ( 2.47x)
    vvc_alf_filter_luma_16x16_8_sme2:                      540.1 ( 2.57x)
    vvc_alf_filter_luma_16x16_10_c:                       1200.3 ( 1.00x)
    vvc_alf_filter_luma_16x16_10_neon:                     515.6 ( 2.33x)
    vvc_alf_filter_luma_16x16_10_sme2:                     531.3 ( 2.26x)
    vvc_alf_filter_luma_16x16_12_c:                       1223.8 ( 1.00x)
    vvc_alf_filter_luma_16x16_12_neon:                     510.7 ( 2.40x)
    vvc_alf_filter_luma_16x16_12_sme2:                     524.9 ( 2.33x)
    vvc_alf_filter_luma_32x32_8_c:                        5488.8 ( 1.00x)
    vvc_alf_filter_luma_32x32_8_neon:                     2233.4 ( 2.46x)
    vvc_alf_filter_luma_32x32_8_sme2:                     1093.6 ( 5.02x)
    vvc_alf_filter_luma_32x32_10_c:                       4738.0 ( 1.00x)
    vvc_alf_filter_luma_32x32_10_neon:                    2057.5 ( 2.30x)
    vvc_alf_filter_luma_32x32_10_sme2:                    1053.6 ( 4.50x)
    vvc_alf_filter_luma_32x32_12_c:                       4808.3 ( 1.00x)
    vvc_alf_filter_luma_32x32_12_neon:                    1981.2 ( 2.43x)
    vvc_alf_filter_luma_32x32_12_sme2:                    1047.7 ( 4.59x)
    vvc_alf_filter_luma_64x64_8_c:                       22116.8 ( 1.00x)
    vvc_alf_filter_luma_64x64_8_neon:                     8951.0 ( 2.47x)
    vvc_alf_filter_luma_64x64_8_sme2:                     4225.2 ( 5.23x)
    vvc_alf_filter_luma_64x64_10_c:                      19072.8 ( 1.00x)
    vvc_alf_filter_luma_64x64_10_neon:                    8448.1 ( 2.26x)
    vvc_alf_filter_luma_64x64_10_sme2:                    4225.8 ( 4.51x)
    vvc_alf_filter_luma_64x64_12_c:                      19312.6 ( 1.00x)
    vvc_alf_filter_luma_64x64_12_neon:                    8270.9 ( 2.34x)
    vvc_alf_filter_luma_64x64_12_sme2:                    4245.4 ( 4.55x)
    vvc_alf_filter_luma_128x128_8_c:                     88530.5 ( 1.00x)
    vvc_alf_filter_luma_128x128_8_neon:                  35686.3 ( 2.48x)
    vvc_alf_filter_luma_128x128_8_sme2:                  16961.2 ( 5.22x)
    vvc_alf_filter_luma_128x128_10_c:                    76904.9 ( 1.00x)
    vvc_alf_filter_luma_128x128_10_neon:                 32439.5 ( 2.37x)
    vvc_alf_filter_luma_128x128_10_sme2:                 16845.6 ( 4.57x)
    vvc_alf_filter_luma_128x128_12_c:                    77363.3 ( 1.00x)
    vvc_alf_filter_luma_128x128_12_neon:                 32907.5 ( 2.35x)
    vvc_alf_filter_luma_128x128_12_sme2:                 17018.1 ( 4.55x)
---
 libavcodec/aarch64/vvc/Makefile       |   1 +
 libavcodec/aarch64/vvc/alf_template.c |  26 ++
 libavcodec/aarch64/vvc/dsp_init.c     |   9 +
 libavcodec/aarch64/vvc/inter_sme2.S   | 657 ++++++++++++++++++++++++++++++++++
 4 files changed, 693 insertions(+)

diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index ed80338969..7c336bc031 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -8,3 +8,4 @@ NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
                                            aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
                                            aarch64/h26x/sao_neon.o
+SME2-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/inter_sme2.o
diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
index 364bd9cded..0b63879c1f 100644
--- a/libavcodec/aarch64/vvc/alf_template.c
+++ b/libavcodec/aarch64/vvc/alf_template.c
@@ -241,3 +241,29 @@ static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose
     FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
     FUNC(alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
 }
+
+
+void FUNC2(ff_vvc_alf_filter_luma, BIT_DEPTH, _sme2)(uint8_t *dst, const uint8_t *src, const uint64_t strides,
+                                                     const uint64_t dims, const int16_t *filter, const int16_t *clip,
+                                                     const int vb_pos);
+
+#define ALF_ALIGN_BY_4(x) (4*((x - 1) >> 2u)+4)
+
+static void FUNC2(alf_filter_luma, BIT_DEPTH, _sme2)(uint8_t *_dst,
+                                                     ptrdiff_t dst_stride,
+                                                     const uint8_t *_src,
+                                                     ptrdiff_t src_stride,
+                                                     const int width, const int height,
+                                                     const int16_t *filter,
+                                                     const int16_t *clip,
+                                                     const int vb_pos)
+{
+    if ((width >= 16) && (height >= 16)) {
+        int aligned_width = ALF_ALIGN_BY_4(width); // align width by 4
+        uint64_t dims = ((uint64_t)height << 32u) | (uint64_t)aligned_width;
+        uint64_t strides = ((uint64_t)src_stride << 32u) | (uint64_t)dst_stride;
+        FUNC2(ff_vvc_alf_filter_luma, BIT_DEPTH, _sme2)(_dst, _src, strides, dims, filter, clip, vb_pos);
+    } else {
+        FUNC2(alf_filter_luma, BIT_DEPTH, _neon)(_dst, dst_stride, _src, src_stride, width, height, filter, clip, vb_pos);
+    }
+}
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 8375ee71c2..956fa0779c 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -281,6 +281,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
             c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
+        if (have_sme2(cpu_flags) && have_sme_i16i64(cpu_flags)) {
+            c->alf.filter[LUMA] = alf_filter_luma_8_sme2;
+        }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
@@ -309,6 +312,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
         c->alf.classify = alf_classify_10_neon;
+        if (have_sme2(cpu_flags) && have_sme_i16i64(cpu_flags)) {
+            c->alf.filter[LUMA] = alf_filter_luma_10_sme2;
+        }
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
         c->inter.w_avg = vvc_w_avg_12;
@@ -338,6 +344,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
         c->alf.classify = alf_classify_12_neon;
+        if (have_sme2(cpu_flags) && have_sme_i16i64(cpu_flags)) {
+            c->alf.filter[LUMA] = alf_filter_luma_12_sme2;
+        }
     }
 
     c->inter.sad = ff_vvc_sad_neon;
diff --git a/libavcodec/aarch64/vvc/inter_sme2.S b/libavcodec/aarch64/vvc/inter_sme2.S
new file mode 100644
index 0000000000..093f823823
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter_sme2.S
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+ENABLE_SME2
+
+#if HAVE_SME_I16I64
+ENABLE_SME_I16I64
+.macro first_group_filter_luma_offsets breg, shift
+        // x20-x23: p5[0],p3[-1],p1[0],p0[3]
+        // x24-x27: p6[0],p4[1],p2[0],p0[-3]
+        neg             x26, x11
+        ubfx            x20, \breg, #(3+\shift), #2
+        ubfx            x21, \breg, #(1+\shift), #2
+        mul             x24, x20, x26
+        mul             x20, x20, x11
+        mul             x25, x21, x26
+        mul             x21, x21, x11
+        ubfx            x22, \breg, #(\shift), #1
+        sub             x21, x21, #1
+        mul             x26, x22, x26
+        mul             x22, x22, x11
+        mov             x23, #3
+        add             x25, x25, #1
+        mov             x27, #-3
+.endm
+
+.macro second_group_filter_luma_offsets breg, shift
+        // x20-x23: p3[ 1],p1[ 2],p1[-1],p0[ 2]
+        // x24-x27: p4[-1],p2[-2],p2[ 1],p0[-2]
+        neg             x26, x11
+        ubfx            x20, \breg, #(1+\shift), #2
+        ubfx            x21, \breg, #(\shift), #1
+        mul             x24, x20, x26
+        mul             x20, x20, x11
+        mul             x25, x21, x26
+        mul             x26, x21, x26
+        mul             x21, x21, x11
+        add             x20, x20, #1
+        sub             x22, x21, #1
+        add             x21, x21, #2
+        mov             x23, #2
+        sub             x24, x24, #1
+        sub             x25, x25, #2
+        add             x26, x26, #1
+        mov             x27, #-2
+.endm
+
+.macro third_group_filter_luma_offsets breg, shift
+        // x20-x23: p3[0],p1[ 1],p1[-2],p0[ 1]
+        // x24-x27: p4[0],p2[-1],p2[ 2],p0[-1]
+        neg             x26, x11
+        ubfx            x21, \breg, #(\shift), #1
+        ubfx            x20, \breg, #(1+\shift), #2
+        mul             x25, x21, x26
+        mul             x26, x21, x26
+        mul             x21, x21, x11
+        mul             x24, x20, x26
+        mul             x20, x20, x11
+        sub             x22, x21, #2
+        add             x21, x21, #1
+        mov             x23, #1
+        sub             x25, x25, #1
+        mov             x27, #-1
+        add             x26, x26, #2
+.endm
+
+.macro kernel_filter_luma_8_sme2 src, zreg, idx
+        ld1b            z20.h, p0/z, [\src, x20]
+        ld1b            z21.h, p0/z, [\src, x21]
+        ld1b            z22.h, p0/z, [\src, x22]
+        ld1b            z23.h, p0/z, [\src, x23]
+        ld1b            z24.h, p0/z, [\src, x24]
+        ld1b            z25.h, p0/z, [\src, x25]
+        neg             z8.h, p0/m, \zreg // -p0
+        ld1b            z26.h, p0/z, [\src, x26]
+        ld1b            z27.h, p0/z, [\src, x27]
+        add             {z20.h-z23.h}, {z20.h-z23.h}, z8.h
+        add             {z24.h-z27.h}, {z24.h-z27.h}, z8.h
+        // transpose data vectors
+        zip             {z20.h-z23.h}, {z20.h-z23.h}
+        zip             {z24.h-z27.h}, {z24.h-z27.h}
+        // clip data
+        sclamp          z20.h, z16.h, z12.h
+        sclamp          z24.h, z16.h, z12.h
+        sclamp          z21.h, z17.h, z13.h
+        sclamp          z25.h, z17.h, z13.h
+        sclamp          z22.h, z18.h, z14.h
+        sclamp          z26.h, z18.h, z14.h
+        sclamp          z23.h, z19.h, z15.h
+        sclamp          z27.h, z19.h, z15.h
+        sdot            za.d[w10, \idx], {z20.h-z23.h}, {z28.h-z31.h}
+        sdot            za.d[w10, \idx], {z24.h-z27.h}, {z28.h-z31.h}
+.endm
+
+function ff_vvc_alf_filter_luma_8_sme2, export=1
+        // dst           .req x0
+        // src           .req x1
+        // strides       .req x2
+        // dims          .req x3
+        // filter        .req x4
+        // clip          .req x5
+        // vb            .req x6
+        sme_entry
+        stp             x29, x30, [sp, #-96]!
+        mov             x29, sp
+        stp             x19, x20, [sp, #16]
+        stp             x21, x22, [sp, #32]
+        stp             x23, x24, [sp, #48]
+        stp             x25, x26, [sp, #64]
+        stp             x27, x28, [sp, #80]
+
+        lsr             x7, x3, #32
+        cnth            x11
+        mov             w8, w3
+        sub             w9, w8, #1
+        sdiv            w9, w9, w11
+        msub            w9, w9, w11, w8
+        whilelo         p10.h, xzr, x9
+        ptrue           p1.h
+        lsr             x11, x2, #32 // src stride
+        lsr             w2, w2, #0 // leave dst stride only
+        mov             w10, #0
+        mov             w12, #255
+        dup             z9.h, w10
+        dup             z10.h, w12
+1:
+        lsr             x20, x3, #32
+        mov             p0.b, p10.b
+        sub             w20, w20, w7
+        mov             w12, w9
+        sub             w6, w6, #6
+        // offsets are packed into the format: (M<<3)|(N<<1)|K, where M is p5/p6 offset (multiply), N is p3/p4 offset, K is p1/p2 offset
+        mov             w21, #0
+        mov             w22, #0xB
+        mov             w23, #0x15
+        mov             w13, #0x1D // 0x1D == (3<<3)|(2<<1)|1
+        mov             w14, #0x1D
+        mov             w15, #0x1D
+        mov             w16, #0x1D
+        // y == vb_pos - 6
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w16, w16, w23, ne
+        // y == vb_pos - 5
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w15, w15, w23, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 4
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w14, w14, w23, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 3
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w23, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 2
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w23, ne
+        // y == vb_pos
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w23, ne
+        // y == vb_pos + 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w23, ne
+        // y == vb_pos + 2
+        cmp             w20, w6
+        sub             w6, w6, #2
+        csel            w13, w13, w23, ne
+        orr             w13, w13, w14, lsl #8
+        orr             w13, w13, w15, lsl #16
+        orr             w13, w13, w16, lsl #24
+        mov             x14, x1
+        mov             x19, x0
+2:
+        // Load clip [12=>3x4 memory layout]
+        ld3h            {z0.h-z2.h}, p0/z, [x5]
+        // Load filter [12=>3x4 memory layout]
+        ld3h            {z3.h-z5.h}, p0/z, [x4]
+        add             x15, x14, x11
+        add             x16, x14, x11, lsl #1
+        add             x17, x15, x11, lsl #1
+        add             x30, x19, x2, lsl #1
+
+        mov             z12.d, z0.d
+        mov             z13.d, z0.d
+        mov             z14.d, z0.d
+        mov             z15.d, z0.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z3.d
+        mov             z29.d, z3.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z3.d
+        mov             z31.d, z3.d
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // p0 (curr)
+        ld1b            z6.h, p0/z, [x14]
+        ld1b            z7.h, p0/z, [x15]
+        ld1b            z0.h, p0/z, [x16]
+        ld1b            z3.h, p0/z, [x17]
+        // clip & filter (first group): a0,a3,a6,a9, a12...
+        // {p5[0],p3[-1],p1[0],p0[3]} -> left operand in clip
+        // {p6[0],p4[1],p2[0],p0[-3]} -> right operand in clip
+        first_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_8_sme2 x14, z6.h, 0
+        first_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_8_sme2 x15, z7.h, 1
+        first_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_8_sme2 x16, z0.h, 2
+        first_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_8_sme2 x17, z3.h, 3
+
+        mov             z12.d, z1.d
+        mov             z13.d, z1.d
+        mov             z14.d, z1.d
+        mov             z15.d, z1.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z4.d
+        mov             z29.d, z4.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z4.d
+        mov             z31.d, z4.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // clip & filter (second group): a1,a4,a7,a10,a13...
+        // left:  {p3[ 1],p1[ 2],p1[-1],p0[ 2]}
+        // right: {p4[-1],p2[-2],p2[ 1],p0[-2]}
+        second_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_8_sme2 x14, z6.h, 0
+        second_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_8_sme2 x15, z7.h, 1
+        second_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_8_sme2 x16, z0.h, 2
+        second_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_8_sme2 x17, z3.h, 3
+
+        mov             z12.d, z2.d
+        mov             z13.d, z2.d
+        mov             z14.d, z2.d
+        mov             z15.d, z2.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z5.d
+        mov             z29.d, z5.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z5.d
+        mov             z31.d, z5.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // clip & filter (third group): a2,a5,a8,a11,a14...
+        // left:  {p3[0],p1[ 1],p1[-2],p0[ 1]}
+        // right: {p4[0],p2[-1],p2[ 2],p0[-1]}
+        third_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_8_sme2 x14, z6.h, 0
+        third_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_8_sme2 x15, z7.h, 1
+        third_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_8_sme2 x16, z0.h, 2
+        third_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_8_sme2 x17, z3.h, 3
+        mova            {z16.d-z19.d}, za.d[w10, 0]
+        mova            {z20.d-z23.d}, za.d[w10, 1]
+        mova            {z24.d-z27.d}, za.d[w10, 2]
+        mova            {z28.d-z31.d}, za.d[w10, 3]
+        sqrshr          z12.h, {z16.d-z19.d}, #7
+        sqrshr          z13.h, {z20.d-z23.d}, #7
+        sqrshr          z14.h, {z24.d-z27.d}, #7
+        sqrshr          z15.h, {z28.d-z31.d}, #7
+        tbnz            x13, #0, 10f
+        sqrshr          z12.h, {z16.d-z19.d}, #10
+10:
+        tbnz            x13, #8, 11f
+        sqrshr          z13.h, {z20.d-z23.d}, #10
+11:
+        tbnz            x13, #16, 12f
+        sqrshr          z14.h, {z24.d-z27.d}, #10
+12:
+        tbnz            x13, #24, 13f
+        sqrshr          z15.h, {z28.d-z31.d}, #10
+13:
+        add             z12.h, z12.h, z6.h
+        add             z13.h, z13.h, z7.h
+        add             z14.h, z14.h, z0.h
+        add             z15.h, z15.h, z3.h
+        sclamp          {z12.h-z15.h}, z9.h, z10.h
+        st1b            z12.h, p0, [x19]
+        st1b            z13.h, p0, [x19, x2]
+        st1b            z14.h, p0, [x30]
+        st1b            z15.h, p0, [x30, x2]
+        zero            {za}
+        add             x14, x14, x12
+        add             x19, x19, x12
+        ptrue           p0.h
+        subs            w8, w8, w12
+        add             w12, w12, w12, lsl #1
+        add             x4, x4, x12, lsl #1
+        add             x5, x5, x12, lsl #1
+        cnth            x12
+        b.gt            2b
+        mov             w8, w3
+        subs            w7, w7, #4
+        add             x1, x1, x11, lsl #2
+        add             x0, x0, x2, lsl #2
+        b.gt            1b
+
+        ldp             x19, x20, [sp, #16]
+        ldp             x21, x22, [sp, #32]
+        ldp             x23, x24, [sp, #48]
+        ldp             x25, x26, [sp, #64]
+        ldp             x27, x28, [sp, #80]
+        ldp             x29, x30, [sp], #96
+        sme_exit
+        ret
+endfunc
+
+.macro kernel_filter_luma_16_sme2 src, zreg, idx
+        ld1h            z20.h, p0/z, [\src, x20, lsl #1]
+        ld1h            z21.h, p0/z, [\src, x21, lsl #1]
+        ld1h            z22.h, p0/z, [\src, x22, lsl #1]
+        ld1h            z23.h, p0/z, [\src, x23, lsl #1]
+        ld1h            z24.h, p0/z, [\src, x24, lsl #1]
+        ld1h            z25.h, p0/z, [\src, x25, lsl #1]
+        neg             z8.h, p0/m, \zreg // -p0
+        ld1h            z26.h, p0/z, [\src, x26, lsl #1]
+        ld1h            z27.h, p0/z, [\src, x27, lsl #1]
+        add             {z20.h-z23.h}, {z20.h-z23.h}, z8.h
+        add             {z24.h-z27.h}, {z24.h-z27.h}, z8.h
+        // transpose data vectors
+        zip             {z20.h-z23.h}, {z20.h-z23.h}
+        zip             {z24.h-z27.h}, {z24.h-z27.h}
+        // clip data
+        sclamp          z20.h, z16.h, z12.h
+        sclamp          z24.h, z16.h, z12.h
+        sclamp          z21.h, z17.h, z13.h
+        sclamp          z25.h, z17.h, z13.h
+        sclamp          z22.h, z18.h, z14.h
+        sclamp          z26.h, z18.h, z14.h
+        sclamp          z23.h, z19.h, z15.h
+        sclamp          z27.h, z19.h, z15.h
+        sdot            za.d[w10, \idx], {z20.h-z23.h}, {z28.h-z31.h}
+        sdot            za.d[w10, \idx], {z24.h-z27.h}, {z28.h-z31.h}
+.endm
+
+function ff_vvc_alf_filter_luma_12_sme2, export=1
+        mov             w12, #4095
+        b               0f
+endfunc
+
+function ff_vvc_alf_filter_luma_10_sme2, export=1
+        // dst           .req x0
+        // src           .req x1
+        // strides       .req x2
+        // dims          .req x3
+        // filter        .req x4
+        // clip          .req x5
+        // vb            .req x6
+        mov             w12, #1023
+0:
+        sme_entry
+        stp             x29, x30, [sp, #-96]!
+        mov             x29, sp
+        stp             x19, x20, [sp, #16]
+        stp             x21, x22, [sp, #32]
+        stp             x23, x24, [sp, #48]
+        stp             x25, x26, [sp, #64]
+        stp             x27, x28, [sp, #80]
+
+        lsr             x7, x3, #32
+        cnth            x11
+        mov             w8, w3
+        sub             w9, w8, #1
+        sdiv            w9, w9, w11
+        msub            w9, w9, w11, w8
+        whilelo         p10.h, xzr, x9
+        ptrue           p1.h
+        lsr             x11, x2, #33 // src stride
+        lsr             w2, w2, #1
+        mov             w10, #0
+        dup             z9.h, w10
+        dup             z10.h, w12
+1:
+        lsr             x20, x3, #32
+        mov             p0.b, p10.b
+        sub             w20, w20, w7
+        mov             w12, w9
+        sub             w6, w6, #6
+        // offsets are packed into the format: (M<<3)|(N<<1)|K, where M is p5/p6 offset (multiply), N is p3/p4 offset, K is p1/p2 offset
+        mov             w21, #0
+        mov             w22, #0xB
+        mov             w23, #0x15
+        mov             w13, #0x1D // 0x1D == (3<<3)|(2<<1)|1
+        mov             w14, #0x1D
+        mov             w15, #0x1D
+        mov             w16, #0x1D
+        // y == vb_pos - 6
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w16, w16, w23, ne
+        // y == vb_pos - 5
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w15, w15, w23, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 4
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w14, w14, w23, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 3
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w23, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 2
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w23, ne
+        // y == vb_pos
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w23, ne
+        // y == vb_pos + 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w23, ne
+        // y == vb_pos + 2
+        cmp             w20, w6
+        sub             w6, w6, #2
+        csel            w13, w13, w23, ne
+        orr             w13, w13, w14, lsl #8
+        orr             w13, w13, w15, lsl #16
+        orr             w13, w13, w16, lsl #24
+        mov             x14, x1
+        mov             x19, x0
+2:
+        // Load clip [12=>3x4 memory layout]
+        ld3h            {z0.h-z2.h}, p0/z, [x5]
+        // Load filter [12=>3x4 memory layout]
+        ld3h            {z3.h-z5.h}, p0/z, [x4]
+        add             x15, x14, x11, lsl #1
+        add             x16, x14, x11, lsl #2
+        add             x17, x15, x11, lsl #2
+        add             x30, x19, x2, lsl #2
+
+        mov             z12.d, z0.d
+        mov             z13.d, z0.d
+        mov             z14.d, z0.d
+        mov             z15.d, z0.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z3.d
+        mov             z29.d, z3.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z3.d
+        mov             z31.d, z3.d
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // p0 (curr)
+        ld1h            z6.h, p0/z, [x14]
+        ld1h            z7.h, p0/z, [x15]
+        ld1h            z0.h, p0/z, [x16]
+        ld1h            z3.h, p0/z, [x17]
+        // clip & filter (first group): a0,a3,a6,a9, a12...
+        // {p5[0],p3[-1],p1[0],p0[3]} -> left operand in clip
+        // {p6[0],p4[1],p2[0],p0[-3]} -> right operand in clip
+        first_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_16_sme2 x14, z6.h, 0
+        first_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_16_sme2 x15, z7.h, 1
+        first_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_16_sme2 x16, z0.h, 2
+        first_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_16_sme2 x17, z3.h, 3
+
+        mov             z12.d, z1.d
+        mov             z13.d, z1.d
+        mov             z14.d, z1.d
+        mov             z15.d, z1.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z4.d
+        mov             z29.d, z4.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z4.d
+        mov             z31.d, z4.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // clip & filter (second group): a1,a4,a7,a10,a13...
+        // left:  {p3[ 1],p1[ 2],p1[-1],p0[ 2]}
+        // right: {p4[-1],p2[-2],p2[ 1],p0[-2]}
+        second_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_16_sme2 x14, z6.h, 0
+        second_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_16_sme2 x15, z7.h, 1
+        second_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_16_sme2 x16, z0.h, 2
+        second_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_16_sme2 x17, z3.h, 3
+
+        mov             z12.d, z2.d
+        mov             z13.d, z2.d
+        mov             z14.d, z2.d
+        mov             z15.d, z2.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z5.d
+        mov             z29.d, z5.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z5.d
+        mov             z31.d, z5.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+
+        // clip & filter (third group): a2,a5,a8,a11,a14...
+        // left:  {p3[0],p1[ 1],p1[-2],p0[ 1]}
+        // right: {p4[0],p2[-1],p2[ 2],p0[-1]}
+        third_group_filter_luma_offsets x13, 0
+        kernel_filter_luma_16_sme2 x14, z6.h, 0
+        third_group_filter_luma_offsets x13, 8
+        kernel_filter_luma_16_sme2 x15, z7.h, 1
+        third_group_filter_luma_offsets x13, 16
+        kernel_filter_luma_16_sme2 x16, z0.h, 2
+        third_group_filter_luma_offsets x13, 24
+        kernel_filter_luma_16_sme2 x17, z3.h, 3
+        mova            {z16.d-z19.d}, za.d[w10, 0]
+        mova            {z20.d-z23.d}, za.d[w10, 1]
+        mova            {z24.d-z27.d}, za.d[w10, 2]
+        mova            {z28.d-z31.d}, za.d[w10, 3]
+        sqrshr          z12.h, {z16.d-z19.d}, #7
+        sqrshr          z13.h, {z20.d-z23.d}, #7
+        sqrshr          z14.h, {z24.d-z27.d}, #7
+        sqrshr          z15.h, {z28.d-z31.d}, #7
+        tbnz            x13, #0, 10f
+        sqrshr          z12.h, {z16.d-z19.d}, #10
+10:
+        tbnz            x13, #8, 11f
+        sqrshr          z13.h, {z20.d-z23.d}, #10
+11:
+        tbnz            x13, #16, 12f
+        sqrshr          z14.h, {z24.d-z27.d}, #10
+12:
+        tbnz            x13, #24, 13f
+        sqrshr          z15.h, {z28.d-z31.d}, #10
+13:
+        add             z12.h, z12.h, z6.h
+        add             z13.h, z13.h, z7.h
+        add             z14.h, z14.h, z0.h
+        add             z15.h, z15.h, z3.h
+        sclamp          {z12.h-z15.h}, z9.h, z10.h
+        st1h            z12.h, p0, [x19]
+        st1h            z13.h, p0, [x19, x2, lsl #1]
+        st1h            z14.h, p0, [x30]
+        st1h            z15.h, p0, [x30, x2, lsl #1]
+        zero            {za}
+        add             x14, x14, x12, lsl #1
+        add             x19, x19, x12, lsl #1
+        ptrue           p0.h
+        subs            w8, w8, w12
+        add             w12, w12, w12, lsl #1
+        add             x4, x4, x12, lsl #1
+        add             x5, x5, x12, lsl #1
+        cnth            x12
+        b.gt            2b
+        mov             w8, w3
+        subs            w7, w7, #4
+        add             x1, x1, x11, lsl #3
+        add             x0, x0, x2, lsl #3
+        b.gt            1b
+
+        ldp             x19, x20, [sp, #16]
+        ldp             x21, x22, [sp, #32]
+        ldp             x23, x24, [sp, #48]
+        ldp             x25, x26, [sp, #64]
+        ldp             x27, x28, [sp, #80]
+        ldp             x29, x30, [sp], #96
+        sme_exit
+        ret
+endfunc
+DISABLE_SME_I16I64
+#endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to