Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform:
    C:    94 fps
    SSE2: 118 fps
    AVX2: 121 fps
legall vertical hi sse2: 3.86x faster (20201 vs. 5231 decicycles) compared with C avx2: 6.70x faster (20201 vs. 3014 decicycles) compared with C legall vertical lo sse2: 1.50x faster (28345 vs. 18908 decicycles) compared with C avx2: 1.63x faster (28345 vs. 17361 decicycles) compared with C --- libavcodec/x86/dirac_dwt_10bit.asm | 105 +++++++++++++++++++++++++- libavcodec/x86/dirac_dwt_init_10bit.c | 13 ++++ 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index baea91329e..0295e6f554 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -21,9 +21,10 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 cextern pd_1 +pd_2: times 8 dd 2 SECTION .text @@ -147,9 +148,109 @@ REP_RET %endmacro +%macro LEGALL53_VERTICAL_LO 0 + +cglobal legall53_vertical_lo, 4, 6, 4, b0, b1, b2, w + DECLARE_REG_TMP 3,4,5 + + mova m3, [pd_2] + mov t2d, wd + and wd, ~(mmsize/4 - 1) + shl wd, 2 + add b0q, wq + add b1q, wq + add b2q, wq + neg wq + + ALIGN 16 + .loop: + mova m0, [b0q + wq] + mova m1, [b1q + wq] + mova m2, [b2q + wq] + paddd m0, m2 + paddd m0, m3 + psrad m0, 2 + psubd m1, m0 + mova [b1q + wq], m1 + add wq, mmsize + jl .loop + + and t2d, mmsize/4 - 1 + jz .end + .loop_scalar: + mov t0d, [b0q] + mov t1d, [b1q] + add t0d, [b2q] + add t0d, 2 + sar t0d, 2 + sub t1d, t0d + mov [b1q], t1d + + add b0q, 4 + add b1q, 4 + add b2q, 4 + sub t2d, 1 + jg .loop_scalar + + .end: +RET + +%endmacro + +%macro LEGALL53_VERTICAL_HI 0 + +cglobal legall53_vertical_hi, 4, 6, 4, b0, b1, b2, w + DECLARE_REG_TMP 3,4,5 + + mova m3, [pd_1] + mov t2d, wd + and wd, ~(mmsize/4 - 1) + shl wd, 2 + add b0q, wq + add b1q, wq + add b2q, wq + neg wq + + ALIGN 16 + .loop: + mova m0, [b0q + wq] + mova m1, [b1q + wq] + mova m2, [b2q + wq] + paddd m0, m2 + paddd m0, m3 + psrad m0, 1 + paddd m1, m0 + mova [b1q + wq], m1 + add wq, mmsize + jl .loop + + and t2d, mmsize/4 - 1 + jz 
.end + .loop_scalar: + mov t0d, [b0q] + mov t1d, [b1q] + add t0d, [b2q] + add t0d, 1 + sar t0d, 1 + add t1d, t0d + mov [b1q], t1d + + add b0q, 4 + add b1q, 4 + add b2q, 4 + sub t2d, 1 + jg .loop_scalar + + .end: +RET + +%endmacro + INIT_XMM sse2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO INIT_XMM avx HAAR_HORIZONTAL @@ -158,3 +259,5 @@ HAAR_VERTICAL INIT_YMM avx2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 289862d728..d1234efac5 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,11 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); + void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); @@ -38,6 +43,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_SSE2(cpu_flags)) { switch (type) { + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2; + d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2; + break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; break; @@ -62,6 +71,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_AVX2(cpu_flags)) { switch (type) { + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_h0 = 
(void*)ff_legall53_vertical_hi_avx2; + d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2; + break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2; break; -- 2.18.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel