From: Christophe Gisquet <christophe.gisq...@gmail.com>

          C   MMX  SSE2
Cycles: 2972  587  302

Signed-off-by: Michael Niedermayer <michae...@gmx.at>
Signed-off-by: Janne Grunau <janne-li...@jannau.net>
---
 libavcodec/huffyuvdsp.h             |  2 +-
 libavcodec/huffyuvdsp.c             |  2 +-
 libavcodec/ppc/huffyuvdsp_altivec.c |  2 +-
 libavcodec/x86/huffyuvdsp_init.c    | 51 ++++++++++---------------------------
 libavcodec/x86/huffyuvdsp.asm       | 37 +++++++++++++++++++++++++++
 5 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index 5e84e3a..bf3005e 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -23,7 +23,7 @@
 
 typedef struct HuffYUVDSPContext {
     void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                      int w);
+                      intptr_t w);
     void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, int w,
                                  int *left, int *left_top);
diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index ff69b45..b21e71e 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -27,7 +27,7 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     long i;
 
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c
index 7c34a67..93a4d51 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@@ -33,7 +33,7 @@
 #include "libavcodec/huffyuvdsp.h"
 
 #if HAVE_ALTIVEC
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     register int i;
     register vector unsigned char vdst, vsrc;
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 80e6cfb..55786de 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -25,6 +25,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
+
 void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
                                     const uint8_t *diff, int w,
                                     int *left, int *left_top);
@@ -34,9 +37,7 @@ int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
 int  ff_add_hfyu_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                            int w, int left);
 
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
 static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                       const uint8_t *diff, int w,
                                       int *left, int *left_top)
@@ -72,49 +73,19 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
     *left     = l;
     *left_top = tl;
 }
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_INLINE_ASM && HAVE_7REGS */
 
 av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
     if (cpu_flags & AV_CPU_FLAG_CMOV)
         c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
+#endif /* HAVE_INLINE_ASM && HAVE_7REGS */
 
-    if (INLINE_MMX(cpu_flags))
-        c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (EXTERNAL_MMX(cpu_flags))
+        c->add_bytes = ff_add_bytes_mmx;
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         /* slower than cmov version on AMD */
@@ -122,6 +93,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes            = ff_add_bytes_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
     }
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 692162b..c877806 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+INIT_XMM sse2
+ADD_BYTES
-- 
2.10.1

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to