From: Christophe Gisquet <christophe.gisq...@gmail.com>

Benchmark of add_bytes (cycles): C 2972, MMX 587, SSE2 302
Signed-off-by: Michael Niedermayer <michae...@gmx.at> Signed-off-by: Janne Grunau <janne-li...@jannau.net> --- libavcodec/huffyuvdsp.h | 2 +- libavcodec/huffyuvdsp.c | 2 +- libavcodec/ppc/huffyuvdsp_altivec.c | 2 +- libavcodec/x86/huffyuvdsp_init.c | 51 ++++++++++--------------------------- libavcodec/x86/huffyuvdsp.asm | 37 +++++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 41 deletions(-) diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h index 5e84e3a..bf3005e 100644 --- a/libavcodec/huffyuvdsp.h +++ b/libavcodec/huffyuvdsp.h @@ -23,7 +23,7 @@ typedef struct HuffYUVDSPContext { void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */, - int w); + intptr_t w); void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c index ff69b45..b21e71e 100644 --- a/libavcodec/huffyuvdsp.c +++ b/libavcodec/huffyuvdsp.c @@ -27,7 +27,7 @@ #define pb_7f (~0UL / 255 * 0x7f) #define pb_80 (~0UL / 255 * 0x80) -static void add_bytes_c(uint8_t *dst, uint8_t *src, int w) +static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w) { long i; diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c index 7c34a67..93a4d51 100644 --- a/libavcodec/ppc/huffyuvdsp_altivec.c +++ b/libavcodec/ppc/huffyuvdsp_altivec.c @@ -33,7 +33,7 @@ #include "libavcodec/huffyuvdsp.h" #if HAVE_ALTIVEC -static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) +static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w) { register int i; register vector unsigned char vdst, vsrc; diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c index 80e6cfb..55786de 100644 --- a/libavcodec/x86/huffyuvdsp_init.c +++ b/libavcodec/x86/huffyuvdsp_init.c @@ -25,6 +25,9 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/huffyuvdsp.h" +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, 
intptr_t w); +void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w); + void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); @@ -34,9 +37,7 @@ int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src, int ff_add_hfyu_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); -#if HAVE_INLINE_ASM - -#if HAVE_7REGS +#if HAVE_INLINE_ASM && HAVE_7REGS static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) @@ -72,49 +73,19 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, *left = l; *left_top = tl; } -#endif /* HAVE_7REGS */ - -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) -{ - x86_reg i = 0; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - "js 1b \n\t" - : "+r" (i) - : "r" (src), "r" (dst), "r" ((x86_reg) w - 15)); - - for (; i < w; i++) - dst[i + 0] += src[i + 0]; -} - -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_7REGS && HAVE_INLINE_ASM */ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_INLINE_ASM -#if HAVE_7REGS +#if HAVE_INLINE_ASM && HAVE_7REGS if (cpu_flags & AV_CPU_FLAG_CMOV) c->add_hfyu_median_pred = add_hfyu_median_pred_cmov; -#endif /* HAVE_7REGS */ +#endif /* HAVE_INLINE_ASM && HAVE_7REGS */ - if (INLINE_MMX(cpu_flags)) - c->add_bytes = add_bytes_mmx; -#endif /* HAVE_INLINE_ASM */ + if (EXTERNAL_MMX(cpu_flags)) + c->add_bytes = ff_add_bytes_mmx; if (EXTERNAL_MMXEXT(cpu_flags)) { /* slower than cmov version on AMD */ @@ -122,6 +93,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c) 
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext; } + if (EXTERNAL_SSE2(cpu_flags)) { + c->add_bytes = ff_add_bytes_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3; } diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm index 692162b..c877806 100644 --- a/libavcodec/x86/huffyuvdsp.asm +++ b/libavcodec/x86/huffyuvdsp.asm @@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left ADD_HFYU_LEFT_LOOP 0, 1 .src_unaligned: ADD_HFYU_LEFT_LOOP 0, 0 + +%macro ADD_BYTES 0 +cglobal add_bytes, 3,4,2, dst, src, w, size + mov sizeq, wq + and sizeq, -2*mmsize + jz .2 + add dstq, sizeq + add srcq, sizeq + neg sizeq +.1: + mova m0, [srcq + sizeq] + mova m1, [srcq + sizeq + mmsize] + paddb m0, [dstq + sizeq] + paddb m1, [dstq + sizeq + mmsize] + mova [dstq + sizeq], m0 + mova [dstq + sizeq + mmsize], m1 + add sizeq, 2*mmsize + jl .1 +.2: + and wq, 2*mmsize-1 + jz .end + add dstq, wq + add srcq, wq + neg wq +.3: + mov sizeb, [srcq + wq] + add [dstq + wq], sizeb + inc wq + jl .3 +.end: + REP_RET +%endmacro + +INIT_MMX mmx +ADD_BYTES +INIT_XMM sse2 +ADD_BYTES -- 2.10.1 _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel