From 7c59062663e34539136b1353adb6636ada5dd136 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@amazon.com>
Date: Sun, 17 Nov 2019 14:13:13 -0600
Subject: [PATCH] [aarch64] use multiply-accumulate and increase vector factor to 4

This patch adds ff_hscale_8_to_15_neon_1, a NEON intrinsics implementation of
the horizontal scaler based on multiply-accumulate (vmlal), and bumps the
vectorization factor from 2 to 4 output pixels per iteration. Widths and
filter sizes the intrinsics do not handle fall back to the existing assembly.
I have seen speedups of up to 15% on AWS A1 instances (Graviton, Cortex-A72
cores).
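
For reference, the scalar computation being vectorized is roughly the
following (cf. the generic hScale8To15_c in libswscale/swscale.c): each
output sample is a weighted sum of source bytes, shifted right by 7 and
clipped to 15 bits:

    for (i = 0; i < dstW; i++) {
        int val = 0;
        for (j = 0; j < filterSize; j++)
            val += src[filterPos[i] + j] * filter[filterSize * i + j];
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
    }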

$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
after:  t:0.037339 avg:0.037327 max:0.037550 min:0.036992

Tested with `make check` on aarch64-linux.
---
 libswscale/aarch64/swscale.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 54a3beabe8..0a93fa7d5a 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -21,10 +21,81 @@
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
 
+#include "arm_neon.h"
+
 void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
                             const uint8_t *src, const int16_t *filter,
                             const int32_t *filterPos, int filterSize);
 
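+/* Compute one horizontally filtered output sample: multiply filterSize
+ * source bytes by the matching 16-bit filter coefficients and sum them.
+ * The loop consumes 8 taps per iteration; the two pairwise adds at the
+ * end leave the 32-bit total replicated in every lane of the result. */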
+static inline int32x4_t ff_hscale(const uint8_t *src, int srcPos,
+                                  const int16_t *filter, int idx,
+                                  int filterSize)
+{
+    int i;
+    int32x4_t x = vdupq_n_s32(0);
+    for (i = 0; i < filterSize; i += 8) {
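+        /* Load 8 source bytes and zero-extend them to 16 bits; u8 values
+         * always fit in s16, so the reinterpret is safe. */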
+        uint8x8_t a = vld1_u8(src + srcPos + i);
+        int16x8_t s = vreinterpretq_s16_u16(vshll_n_u8(a, 0));
+        int16x8_t f = vld1q_s16(filter + filterSize * idx + i);
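+        /* Widening multiply-accumulate of the low and high halves. */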
+        x = vmlal_s16(x, vget_low_s16(s), vget_low_s16(f));
+        x = vmlal_high_s16(x, s, f);
+    }
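+    /* Pairwise-add twice: [a,b,c,d] -> [a+b,c+d,a+b,c+d] -> all lanes
+     * hold a+b+c+d. */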
+    x = vpaddq_s32(x, x);
+    x = vpaddq_s32(x, x);
+    return x;
+}
+
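+/* Interleave the low 64-bit halves of a and b: { a0, a1, b0, b1 }. */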
+static inline int32x4_t ff_zip64(int32x4_t a, int32x4_t b)
+{
+    int64x2_t x0 = vreinterpretq_s64_s32(a);
+    int64x2_t x1 = vreinterpretq_s64_s32(b);
+    int64x2_t x2 = vzip1q_s64(x0, x1);
+    int32x4_t x3 = vreinterpretq_s32_s64(x2);
+    return x3;
+}
+
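+/* Horizontal scale, four output pixels per iteration. Each ff_hscale call
+ * returns its sum replicated across all lanes, so the zips below gather
+ * lane 0 of the four sums into a single vector. */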
+static void ff_hscale_8_to_15_neon_1(SwsContext *c, int16_t *dst, int dstW,
+                                     const uint8_t *src, const int16_t *filter,
+                                     const int32_t *filterPos, int filterSize)
+{
+    int i;
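+    /* The vectorized loop writes 4 output pixels at a time and reads the
+     * filter in chunks of 8 coefficients; fall back to the assembly
+     * version for sizes it cannot handle. */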
+    if (dstW % 4 || filterSize % 8) {
+        ff_hscale_8_to_15_neon(c, dst, dstW, src, filter, filterPos, filterSize);
+        return;
+    }
+
+    for (i = 0; i < dstW; i += 4) {
+        int32x4_t x0 = ff_hscale(src, filterPos[i], filter, i, filterSize);
+        int32x4_t x1 = ff_hscale(src, filterPos[i + 1], filter, i + 1, filterSize);
+        int32x4_t x2 = ff_hscale(src, filterPos[i + 2], filter, i + 2, filterSize);
+        int32x4_t x3 = ff_hscale(src, filterPos[i + 3], filter, i + 3, filterSize);
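+        /* Pack lane 0 of each of the four sums into { s0, s1, s2, s3 }. */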
+        int32x4_t x4 = vzip1q_s32(x0, x1);
+        int32x4_t x5 = vzip1q_s32(x2, x3);
+        int32x4_t x6 = ff_zip64(x4, x5);
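+        /* Shift right by 7 and narrow with signed saturation to 16 bits,
+         * matching the 15-bit clip of the scalar code. */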
+        int16x4_t x7 = vqshrn_n_s32(x6, 7);
+        vst1_s16(&dst[i], x7);
+    }
+}
+
 void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
@@ -35,7 +106,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 
     if (have_neon(cpu_flags)) {
         if (c->srcBpc == 8 && c->dstBpc <= 14) {
-            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
+            c->hyScale = c->hcScale = ff_hscale_8_to_15_neon_1;
         }
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
-- 
2.20.1

