From b7e2238f20b3efdc920acdfc77849753170bf4be Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Fri, 22 Sep 2017 22:56:48 +0200
Subject: [PATCH] libavcodec/exr: add x86 SIMD for predictor

---
 libavcodec/exr.c             | 16 ++----------
 libavcodec/exrdsp.c          | 13 ++++++++++
 libavcodec/exrdsp.h          |  1 +
 libavcodec/x86/exrdsp.asm    | 62 +++++++++++++++++++++++++++++++++++++++++++-
 libavcodec/x86/exrdsp_init.c |  5 ++++
 tests/checkasm/exrdsp.c      | 23 ++++++++++++++++
 6 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index 230d5bbca8..0b755db3cb 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -265,18 +265,6 @@ static inline uint16_t exr_halflt2uint(uint16_t v)
     return (v + (1 << 16)) >> (exp + 1);
 }
 
-static void predictor(uint8_t *src, int size)
-{
-    uint8_t *t    = src + 1;
-    uint8_t *stop = src + size;
-
-    while (t < stop) {
-        int d = (int) t[-1] + (int) t[0] - 128;
-        t[0] = d;
-        ++t;
-    }
-}
-
 static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
                           int uncompressed_size, EXRThreadData *td)
 {
@@ -288,7 +276,7 @@ static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size
 
     av_assert1(uncompressed_size % 2 == 0);
 
-    predictor(td->tmp, uncompressed_size);
+    s->dsp.predictor(td->tmp, uncompressed_size);
     s->dsp.reorder_pixels(td->uncompressed_data, td->tmp, uncompressed_size);
 
     return 0;
@@ -335,7 +323,7 @@ static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int compressed_si
 
     av_assert1(uncompressed_size % 2 == 0);
 
-    predictor(td->tmp, uncompressed_size);
+    ctx->dsp.predictor(td->tmp, uncompressed_size);
     ctx->dsp.reorder_pixels(td->uncompressed_data, td->tmp, uncompressed_size);
 
     return 0;
diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c
index 871b6f1276..192bbb864e 100644
--- a/libavcodec/exrdsp.c
+++ b/libavcodec/exrdsp.c
@@ -38,9 +38,22 @@ static void reorder_pixels_scalar(uint8_t *dst, const uint8_t *src, ptrdiff_t si
     }
 }
 
+/* In-place running sum: src[i] = src[i - 1] + src[i] - 128 for 1 <= i < size. */
+static void predictor_scalar(uint8_t *src, ptrdiff_t size)
+{
+    ptrdiff_t i;
+
+    /* src[0] is the seed and is never written; nothing runs for size <= 1. */
+    for (i = 1; i < size; i++) {
+        /* the uint8_t store keeps only the low 8 bits of the int sum */
+        src[i] = src[i - 1] + src[i] - 128;
+    }
+}
+
 av_cold void ff_exrdsp_init(ExrDSPContext *c)
 {
     c->reorder_pixels   = reorder_pixels_scalar;
+    c->predictor        = predictor_scalar;
 
     if (ARCH_X86)
         ff_exrdsp_init_x86(c);
diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h
index d8cb002efc..2c4dc3af88 100644
--- a/libavcodec/exrdsp.h
+++ b/libavcodec/exrdsp.h
@@ -24,6 +24,7 @@
 
 typedef struct ExrDSPContext {
     void (*reorder_pixels)(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
+    void (*predictor)(uint8_t *src, ptrdiff_t size); /* in place: src[i] += src[i - 1] - 128 */
 } ExrDSPContext;
 
 void ff_exrdsp_init(ExrDSPContext *c);
diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
index 06c629e59e..e67ff63c95 100644
--- a/libavcodec/x86/exrdsp.asm
+++ b/libavcodec/x86/exrdsp.asm
@@ -2,7 +2,7 @@
 ;* X86 Optimized functions for Open Exr Decoder
 ;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
 ;*
-;* reorder_pixels based on patch by John Loy
+;* reorder_pixels, predictor based on patch by John Loy
 ;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
 ;*
 ;* This file is part of FFmpeg.
@@ -24,6 +24,11 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA 32
+
+neg_128: times 16 db -128
+shuffle_15: times 16 db 15
+
 SECTION .text
 
 ;------------------------------------------------------------------------------
@@ -60,3 +65,58 @@ REORDER_PIXELS
 INIT_YMM avx2
 REORDER_PIXELS
 %endif
+
+
+;------------------------------------------------------------------------------
+; void ff_predictor(uint8_t *src, ptrdiff_t size);
+;------------------------------------------------------------------------------
+
+INIT_XMM ssse3
+cglobal predictor, 2,2,5, src, size
+; In place: src[i] += src[i - 1] - 128 (mod 256) for i >= 1; src[0] is kept.
+; Data is loaded and stored in whole mmsize blocks with mova, so the loop
+; touches bytes up to the next multiple of mmsize past size.
+; NOTE(review): assumes src is mmsize-aligned and padded accordingly - confirm.
+
+;offset src by size
+    add                srcq, sizeq
+    neg               sizeq                ; size = offset for src
+
+;init mm
+    mova                 m0, [neg_128]     ; m0 = const for xor high byte
+    mova                 m1, [shuffle_15]  ; m1 = shuffle mask
+    mova                 m2, m0            ; m2 = carry-in; 0x80 cancels the xor on src[0]
+
+
+.loop:
+    mova                 m3, [srcq + sizeq]
+    pxor                 m3, m0            ; x ^ 0x80 == x - 128 (mod 256)
+
+    ;compute prefix sum
+    mova                 m4, m3
+    pslldq               m4, 1
+
+    paddb                m4, m3
+    mova                 m3, m4
+    pslldq               m3, 2
+
+    paddb                m3, m4
+    mova                 m4, m3
+    pslldq               m4, 4
+
+    paddb                m4, m3
+    mova                 m3, m4
+    pslldq               m3, 8
+
+    paddb                m4, m2
+    paddb                m4, m3
+
+    mova     [srcq + sizeq], m4
+
+    ;broadcast high byte for next iter
+    pshufb               m4, m1
+    mova                 m2, m4
+
+    add               sizeq, mmsize
+    jl .loop
+    RET
diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c
index 5669be3d97..4b047f6bd3 100644
--- a/libavcodec/x86/exrdsp_init.c
+++ b/libavcodec/x86/exrdsp_init.c
@@ -26,6 +26,8 @@ void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
 
 void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
 
+void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
+
 av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -33,6 +35,9 @@ av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->reorder_pixels = ff_reorder_pixels_sse2;
     }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        dsp->predictor = ff_predictor_ssse3;
+    }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         dsp->reorder_pixels = ff_reorder_pixels_avx2;
     }
diff --git a/tests/checkasm/exrdsp.c b/tests/checkasm/exrdsp.c
index 6637f6fdd2..754a079f83 100644
--- a/tests/checkasm/exrdsp.c
+++ b/tests/checkasm/exrdsp.c
@@ -55,6 +55,24 @@ static void check_reorder_pixels(void) {
     bench_new(dst_new, src, BUF_SIZE);
 }
 
+static void check_predictor(void) {
+    LOCAL_ALIGNED_32(uint8_t, src,     [PADDED_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PADDED_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, dst_new, [PADDED_BUF_SIZE]);
+    /* Runs the reference and the tested predictor on equal input and compares the results. */
+    declare_func(void, uint8_t *src, ptrdiff_t size);
+
+    memset(src,     0, PADDED_BUF_SIZE);
+    randomize_buffers(); /* NOTE(review): presumably fills src; defined outside this hunk */
+    memcpy(dst_ref, src, PADDED_BUF_SIZE); /* predictor takes a single writable buffer, */
+    memcpy(dst_new, src, PADDED_BUF_SIZE); /* so each implementation gets its own copy  */
+    call_ref(dst_ref, BUF_SIZE);
+    call_new(dst_new, BUF_SIZE);
+    if (memcmp(dst_ref, dst_new, BUF_SIZE)) /* only the first BUF_SIZE bytes are compared */
+        fail();
+    bench_new(dst_new, BUF_SIZE);
+}
+
 void checkasm_check_exrdsp(void)
 {
     ExrDSPContext h;
@@ -65,4 +83,9 @@ void checkasm_check_exrdsp(void)
         check_reorder_pixels();
 
     report("reorder_pixels");
+
+    if (check_func(h.predictor, "predictor"))
+        check_predictor();
+
+    report("predictor");
 }
-- 
2.11.0 (Apple Git-81)

