From 3321950c109b416e63eda59c76e6365abc2072b8 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@amazon.com>
Date: Thu, 2 Jul 2020 16:57:58 +0000
Subject: [PATCH] [aarch64] improve hscale by up to 50% with multi-threading

hscale is bound by the number of multiply-adds a single core can execute.
This patch doubles the available multiply-add throughput by handing half of
the load to a helper thread.

Performance improves by up to 50% on Graviton2 Arm Neoverse-N1 processors.

$ ./ffmpeg_g -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: [bench @ 0xaaaad62c3d30] t:0.013293 avg:0.013315 max:0.013697 min:0.013293
after:  [bench @ 0xaaaae9346d30] t:0.009637 avg:0.009691 max:0.010005 min:0.009637
38% improvement

scale=1280x720  49% improvement
before: [bench @ 0xaaaadba88d30] t:0.015973 avg:0.016321 max:0.016917 min:0.015973
after:  [bench @ 0xaaaabc78dd30] t:0.010823 avg:0.010869 max:0.011552 min:0.010708

scale=852x480  45% improvement
before: [bench @ 0xaaaaeeed0d30] t:0.013731 avg:0.013727 max:0.013773 min:0.013279
after:  [bench @ 0xaaaaf5f5dd30] t:0.009279 avg:0.009296 max:0.009328 min:0.009187

scale=640x360  45% improvement
before: [bench @ 0xaaaacee25d30] t:0.012010 avg:0.012006 max:0.012053 min:0.011653
after:  [bench @ 0xaaaaea2b5d30] t:0.008077 avg:0.008084 max:0.008409 min:0.008057

scale=284x160  36% improvement
before: [bench @ 0xaaaadbb9ed30] t:0.008384 avg:0.008367 max:0.008421 min:0.008193
after:  [bench @ 0xaaaafb1d6d30] t:0.006099 avg:0.006100 max:0.006120 min:0.006026
---
 libswscale/aarch64/swscale.c  | 44 ++++++++++++++++++++++++++++++++++-
 libswscale/swscale_internal.h | 15 ++++++++++++
 libswscale/utils.c            | 14 +++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index eecbea88ca..56191ab612 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -20,6 +20,7 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
+#include "pthread.h"
 
 void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
                             const uint8_t *src, const int16_t *filter,
@@ -29,6 +30,47 @@ void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
 
+void *aarch64_thread_fun(void *ctx) {
+  SwsContext *c = (SwsContext *)ctx;
+  /* Helper thread: busy-poll c->has_work, run the posted hscale job, clear the flag.
+   * pthread_testcancel() gives a deferred pthread_cancel() a point at which to take effect. */
+  for (;; pthread_testcancel()) {
+    if (__atomic_load_n(&c->has_work, __ATOMIC_ACQUIRE)) {
+      ff_hscale_8_to_15_neon(c, c->dst_, c->dstW_, c->src_, c->filter_, c->filterPos_, c->filterSize_);
+      __atomic_store_n(&c->has_work, 0, __ATOMIC_RELEASE);
+    }
+  }
+}
+
+static void ff_hscale_8_to_15_neon_1(SwsContext *c, int16_t *dst, int dstW,
+                              const uint8_t *src, const int16_t *filter,
+                              const int32_t *filterPos, int filterSize)
+{
+    /* Split one hscale call in two: the helper thread polling c->has_work
+     * takes outputs [0, half), this thread takes [half, dstW). */
+    int half = dstW / 2;
+    if (dstW <= 0 || filterSize <= 0)
+        return;
+    /* No context to post a job on, or a single output pixel: halving would
+     * hand the helper a zero-width job, which the guard above rules out. */
+    if (!c || half == 0) {
+        ff_hscale_8_to_15_neon(c, dst, dstW, src, filter, filterPos, filterSize);
+        return;
+    }
+    c->dst_ = dst;
+    c->dstW_ = half;
+    c->src_ = src;
+    c->filter_ = filter;
+    c->filterPos_ = filterPos;
+    c->filterSize_ = filterSize;
+    /* Release store publishes the job fields written above to the helper. */
+    __atomic_store_n(&c->has_work, 1, __ATOMIC_RELEASE);
+
+    ff_hscale_8_to_15_neon(c, dst + half, dstW - half, src, filter + half * filterSize, filterPos + half, filterSize);
+    /* Spin until the helper clears has_work; acquire makes its output visible. */
+    do {} while (__atomic_load_n(&c->has_work, __ATOMIC_ACQUIRE));
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -38,7 +80,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
             (c->hLumFilterSize % 8) == 0 &&
             (c->hChrFilterSize % 8) == 0)
         {
-            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
+            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon_1;
         }
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 1a1b6f0dee..fcfb7f0ef0 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -625,9 +625,24 @@ typedef struct SwsContext {
     SwsDither dither;
 
     SwsAlphaBlend alphablend;
+
+#ifdef __aarch64__
+    pthread_t thread;
+    int has_work;
+    int16_t *dst_;
+    int dstW_;
+    const uint8_t *src_;
+    const int16_t *filter_;
+    const int32_t *filterPos_;
+    int filterSize_;
+#endif
 } SwsContext;
 //FIXME check init (where 0)
 
+#ifdef __aarch64__
+void *aarch64_thread_fun(void *ctx); /* pthread start routine; ctx is the SwsContext * to poll */
+#endif
+
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
 int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
                              int fullRange, int brightness,
diff --git a/libswscale/utils.c b/libswscale/utils.c
index dcd1dbaa76..75d40c11c5 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -38,6 +38,10 @@
 #include <windows.h>
 #endif
 
+#ifdef __aarch64__
+#include "pthread.h"
+#endif
+
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
@@ -1846,6 +1850,12 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
         }
     }
 
+#ifdef __aarch64__
+    c->has_work = 0;
+    if (0 != pthread_create (&c->thread, NULL, aarch64_thread_fun, (void *)c))
+        goto fail;
+#endif
+
     c->swscale = ff_getSwsFunc(c);
     return ff_init_filters(c);
 nomem:
@@ -2385,6 +2395,10 @@ void sws_freeContext(SwsContext *c)
 
     ff_free_filters(c);
 
+#ifdef __aarch64__
+    pthread_cancel(c->thread);
+#endif
+
     av_free(c);
 }
 
-- 
2.25.1