aarch64: Add neon implementation for pix_median_abs16

Martin Storsjö Fri, 16 Sep 2022 14:09:19 -0700

On Tue, 13 Sep 2022, Hubert Mazur wrote:

Provide optimized implementation for pix_median_abs16 function.


Performance comparison tests are shown below.
- median_sad_0_c: 722.0
- median_sad_0_neon: 144.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <h...@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
libavcodec/aarch64/me_cmp_neon.S         | 81 ++++++++++++++++++++++++
libavcodec/me_cmp.c                      |  5 +-
3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ade3e9a4c1..fb51a833be 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -53,6 +53,8 @@ int nsse16_neon(int multiplier, const uint8_t *s, const 
uint8_t *s2,
                ptrdiff_t stride, int h);
int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride, int h);
+int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const 
uint8_t *pix2,
+                          ptrdiff_t stride, int h);

av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -78,6 +80,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
        c->vsse[4] = vsse_intra16_neon;

        c->nsse[0] = nsse16_neon_wrapper;
+
+        c->median_sad[0] = pix_median_abs16_neon;
    }
}

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index f8998749a5..a4a4344f42 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -969,3 +969,84 @@ function nsse16_neon, export=1

        ret
endfunc
+
+function pix_median_abs16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        ld1             {v2.16b}, [x1], x3
+        ld1             {v3.16b}, [x2], x3
+        movi            v31.8h, #0
+        movi            v16.8h, #0
+        ext             v0.16b, v2.16b, v2.16b, #1
+        ext             v1.16b, v3.16b, v3.16b, #1
+        usubl           v28.8h, v2.8b, v3.8b
+        usubl2          v27.8h, v2.16b, v3.16b
+        usubl           v26.8h, v0.8b, v1.8b
+        usubl2          v25.8h, v0.16b, v1.16b
+        sub             w4, w4, #1                              // we need to 
make h-1 iterations
+        saba            v31.8h, v26.8h, v28.8h
+        saba            v16.8h, v25.8h, v27.8h
+        mov             h18, v28.h[0]

+        cmp             w4, #1
+        sqabs           h18, h18
+
+        b.lt            2f
+1:
+
+        ld1             {v6.16b}, [x1], x3                      // pix1 vector 
for V(j-1)
+        ld1             {v7.16b}, [x2], x3                      // pix2 vector 
for V(j-1)
+        subs            w4, w4, #1
+        mov             v2.16b, v6.16b
+        mov             v3.16b, v7.16b


These two mov instructions seem unnecessary?

+        ext             v4.16b, v6.16b, v6.16b, #1              // pix1 vector 
for V(j)
+        ext             v5.16b, v7.16b, v7.16b, #1              // pix2 vector 
for V(j)
+
+        // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
+        // scratch registers: v22, v21, v20, v19, v17
+
+        // To find median of three values, calculate sum of them
+        // and subtract max and min value from it.
+        usubl           v30.8h, v6.8b, v7.8b                    // V(j-1)
+        usubl2          v29.8h, v6.16b, v7.16b                  // V(j-1)
+        usubl           v24.8h, v4.8b, v5.8b                    // V(j)
+        usubl2          v23.8h, v4.16b, v5.16b                  // V(j)
+        mov             v0.16b, v4.16b
+        mov             v1.16b, v5.16b


These two movs are unused, too, right?

+        sabd            v20.8h, v30.8h, v28.8h
+        mov             h17, v20.h[0]
+        add             d18, d18, d17

These are quite suboptimally scheduled here. However, we shouldn't need them.

In general, try to avoid these single-element calculations if not strictly necessary. You can just keep using both the input (here, v20) and the accumulator (v18) as a .4h vector, where you only care about the first element. Then at the very end you can extract that first element from it, instead of doing it every round in the loop. Then you can potentially change sabd into saba too, provided the non-accumulated result is not needed as well.

+        add             v22.8h, v26.8h, v30.8h
+        smin            v20.8h, v26.8h, v30.8h
+        add             v21.8h, v25.8h, v29.8h
+        smax            v19.8h, v26.8h, v30.8h
+        sub             v22.8h, v22.8h, v28.8h
+        sub             v21.8h, v21.8h, v27.8h
+        smin            v17.8h, v19.8h, v22.8h
+        smin            v22.8h, v25.8h, v29.8h
+        mov             v28.16b, v30.16b
+        smax            v20.8h, v20.8h, v17.8h                  // median 
values lower half
+        smax            v19.8h, v25.8h, v29.8h
+        saba            v31.8h, v24.8h, v20.8h
+        mov             v27.16b, v29.16b
+        smin            v19.8h, v19.8h, v21.8h
+        mov             v26.16b, v24.16b
+        smax            v17.8h, v22.8h, v19.8h                  // median 
values upper half
+        mov             v25.16b, v23.16b
+        saba            v16.8h, v23.8h, v17.8h
+
+        b.ne            1b
+
+2:
+        ins             v16.h[7], wzr
+        add             v31.8h, v31.8h, v16.8h
+        uaddlv          s17, v31.8h
+        add             d18, d18, d17
+        fmov            w0, s18
+
+        ret
+
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index 4242fbc6e4..230e7ea54a 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1048,6 +1048,9 @@ av_cold void ff_me_cmp_init(MECmpContext *c, 
AVCodecContext *avctx)
    ff_dsputil_init_dwt(c);
#endif

+c->median_sad[0] = pix_median_abs16_c;
+c->median_sad[1] = pix_median_abs8_c;
+


These are incorrectly indented.

Other than that, this seems reasonable I think.


// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: Add neon implementation for pix_median_abs16

Reply via email to