From 98a40bd72055e0a1c4616f5168457d85fc3446c5 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Sat, 16 Dec 2017 18:53:39 +0100
Subject: [PATCH 9/9] avfilter/x86/vf_limiter: add AVX2 versions of the 8-bit
 and 16-bit limiter

---
 libavfilter/x86/vf_limiter.asm    | 26 ++++++++++++++++----------
 libavfilter/x86/vf_limiter_init.c | 14 ++++++++++++++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/libavfilter/x86/vf_limiter.asm b/libavfilter/x86/vf_limiter.asm
index b068ce3576..5963e3f503 100644
--- a/libavfilter/x86/vf_limiter.asm
+++ b/libavfilter/x86/vf_limiter.asm
@@ -28,12 +28,12 @@ cglobal limiter_8bit, 6, 7, 3, src, dst, slinesize, dlinesize, w, h, x
     add        srcq, wq
     add        dstq, wq
     neg          wq
-    movd         m1, r6m
-    punpcklbw    m1, m1
-    SPLATW       m1, m1
-    movd         m2, r7m
-    punpcklbw    m2, m2
-    SPLATW       m2, m2
+    movd         xm1, r6m
+    punpcklbw    xm1, xm1
+    SPLATW       m1, xm1
+    movd         xm2, r7m
+    punpcklbw    xm2, xm2
+    SPLATW       m2, xm2
 .nextrow:
     mov          xq, wq
 
@@ -57,10 +57,10 @@ cglobal limiter_16bit, 6, 7, 3, src, dst, slinesize, dlinesize, w, h, x
     add        srcq, wq
     add        dstq, wq
     neg          wq
-    movd         m1, r6m
-    SPLATW       m1, m1
-    movd         m2, r7m
-    SPLATW       m2, m2
+    movd         xm1, r6m
+    SPLATW       m1, xm1
+    movd         xm2, r7m
+    SPLATW       m2, xm2
 .nextrow:
     mov          xq, wq
 
@@ -84,3 +84,9 @@ LIMITER_8
 
 INIT_XMM sse4
 LIMITER_16
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+LIMITER_8
+LIMITER_16
+%endif
diff --git a/libavfilter/x86/vf_limiter_init.c b/libavfilter/x86/vf_limiter_init.c
index 07c733dc21..88e27f997c 100644
--- a/libavfilter/x86/vf_limiter_init.c
+++ b/libavfilter/x86/vf_limiter_init.c
@@ -23,9 +23,16 @@
 void ff_limiter_8bit_sse2(const uint8_t *src, uint8_t *dst,
                           ptrdiff_t slinesize, ptrdiff_t dlinesize,
                           int w, int h, int min, int max);
+void ff_limiter_8bit_avx2(const uint8_t *src, uint8_t *dst,
+                          ptrdiff_t slinesize, ptrdiff_t dlinesize,
+                          int w, int h, int min, int max);
+
 void ff_limiter_16bit_sse4(const uint8_t *src, uint8_t *dst,
                            ptrdiff_t slinesize, ptrdiff_t dlinesize,
                            int w, int h, int min, int max);
+void ff_limiter_16bit_avx2(const uint8_t *src, uint8_t *dst,
+                           ptrdiff_t slinesize, ptrdiff_t dlinesize,
+                           int w, int h, int min, int max);
 
 void ff_limiter_init_x86(LimiterDSPContext *dsp, int bpp)
 {
@@ -41,4 +48,11 @@ void ff_limiter_init_x86(LimiterDSPContext *dsp, int bpp)
             dsp->limiter = ff_limiter_16bit_sse4;
         }
     }
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        if (bpp <= 8) {
+            dsp->limiter = ff_limiter_8bit_avx2;
+        } else {
+            dsp->limiter = ff_limiter_16bit_avx2;
+        }
+    }
 }
-- 
2.14.3 (Apple Git-98)

