Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform:
C:    84fps
SSE2: 111fps
AVX2: 115fps

dd97 vertical hi
    sse2: 2.77x faster (31773 vs. 11457 decicycles) compared with C
    avx2: 3.83x faster (31773 vs. 8297 decicycles) compared with C
---
 libavcodec/x86/dirac_dwt_10bit.asm    | 39 +++++++++++++++++++++++++++
 libavcodec/x86/dirac_dwt_init_10bit.c | 29 ++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
index 0295e6f554..2ed77fe3b0 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,6 +25,7 @@ SECTION_RODATA 32
 
 cextern pd_1
 pd_2: times 8 dd 2
+pd_8: times 8 dd 8 ; eight dwords of 8: rounding term added before the >> 4 in DD97_VERTICAL_HI
 
 SECTION .text
 
@@ -246,7 +247,44 @@ RET
 
 %endmacro
 
+%macro DD97_VERTICAL_HI 0 ; per 32-bit lane: b2 += (9*b1 + 9*b3 - b0 - b4 + 8) >> 4
+
+cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w ; five row pointers and w, a count of 32-bit coefficients
+    mova m7, [pd_8]                ; m7 = 8 in every dword lane
+    shl wd, 2                      ; w *= 4: coefficient count -> byte count
+    add b0q, wq                    ; move every row pointer to the end of its row ...
+    add b1q, wq
+    add b2q, wq
+    add b3q, wq
+    add b4q, wq
+    neg wq                         ; ... and index with a negative byte offset that counts up to 0
+
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]        ; NOTE(review): mova presumably requires mmsize-aligned rows - confirm against callers
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        mova m3, [b3q + wq]
+        mova m4, [b4q + wq]
+        pslld m5, m1, 3            ; m5 = 8*b1
+        pslld m6, m3, 3            ; m6 = 8*b3
+        paddd m5, m1               ; m5 = 9*b1
+        paddd m6, m3               ; m6 = 9*b3
+        psubd m5, m0               ; m5 = 9*b1 - b0
+        psubd m6, m4               ; m6 = 9*b3 - b4
+        paddd m5, m7               ; m5 += 8 (rounding)
+        paddd m5, m6               ; m5 = 9*b1 + 9*b3 - b0 - b4 + 8
+        psrad m5, 4                ; arithmetic shift right by 4
+        paddd m2, m5               ; b2 += m5
+        mova [b2q + wq], m2
+        add wq, mmsize             ; advance one vector
+        jl .loop                   ; loop while the offset is still negative; the body has already run once, even when w == 0
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
@@ -257,6 +295,7 @@ HAAR_HORIZONTAL
 HAAR_VERTICAL
 
 INIT_YMM avx2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
index d1234efac5..a9ac603bc5 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width);
+void ff_dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width);
+
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, 
int32_t *b2, int width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
 void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
@@ -36,6 +39,24 @@ void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_ali
 void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
 void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
 
+static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
+                                  int32_t *b3, int32_t *b4, int width)
+{ /* b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]) for i in [0, width): asm for the prefix, C for the tail */
+    int i = width & ~3; /* largest multiple of 4 (int32 lanes per XMM register) not above width */
+    if (i) ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); /* the asm loop runs once even for length 0, so skip it then */
+    for(; i<width; i++) /* scalar tail for the remaining coefficients */
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+}
+
+static void dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2,
+                                  int32_t *b3, int32_t *b4, int width)
+{ /* b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]) for i in [0, width): asm for the prefix, C for the tail */
+    int i = width & ~7; /* largest multiple of 8 (int32 lanes per YMM register) not above width */
+    if (i) ff_dd97_vertical_hi_avx2(b0, b1, b2, b3, b4, i); /* the asm loop runs once even for length 0, so skip it then */
+    for(; i<width; i++) /* scalar tail for the remaining coefficients */
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+}
+
 av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 {
 #if HAVE_X86ASM
@@ -43,6 +64,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         switch (type) {
+        case DWT_DIRAC_DD9_7:
+            d->vertical_compose_h0 = (void*)dd97_vertical_hi_sse2;
+            d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
+            break;
         case DWT_DIRAC_LEGALL5_3:
             d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2;
             d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
@@ -71,6 +96,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_AVX2(cpu_flags)) {
         switch (type) {
+        case DWT_DIRAC_DD9_7:
+            d->vertical_compose_h0 = (void*)dd97_vertical_hi_avx2;
+            d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2;
+            break;
         case DWT_DIRAC_LEGALL5_3:
             d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_avx2;
             d->vertical_compose_l0 = 
(void*)ff_legall53_vertical_lo_avx2; -- 2.18.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel