On 19 July 2018 at 15:52, James Darnley <jdarn...@obe.tv> wrote: > Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the > relevant transform. > C: 84fps > SSE2: 111fps > AVX2: 115fps > --- > libavcodec/x86/dirac_dwt_10bit.asm | 38 +++++++++++++++++++++++++++ > libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++ > 2 files changed, 54 insertions(+) > > diff --git a/libavcodec/x86/dirac_dwt_10bit.asm > b/libavcodec/x86/dirac_dwt_10bit.asm > index c00de32bfe..681de5e1df 100644 > --- a/libavcodec/x86/dirac_dwt_10bit.asm > +++ b/libavcodec/x86/dirac_dwt_10bit.asm > @@ -25,6 +25,7 @@ SECTION_RODATA > > cextern pd_1 > pd_2: times 4 dd 2 > +pd_8: times 4 dd 8 > > SECTION .text > > @@ -153,7 +154,44 @@ RET > > %endmacro > > +%macro DD97_VERTICAL_HI 0 > + > +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w > + mova m7, [pd_8] > + shl wd, 2 > + add b0q, wq > + add b1q, wq > + add b2q, wq > + add b3q, wq > + add b4q, wq > + neg wq > + > + ALIGN 16 > + .loop: > + mova m0, [b0q + wq] > + mova m1, [b1q + wq] > + mova m2, [b2q + wq] > + mova m3, [b3q + wq] > + mova m4, [b4q + wq] > + pslld m5, m1, 3 > + pslld m6, m3, 3 > + paddd m5, m1 > + paddd m6, m3 > + psubd m5, m0 > + psubd m6, m4 > + paddd m5, m7 > + paddd m5, m6 > + psrad m5, 4 > + paddd m2, m5 > + mova [b2q + wq], m2 > + add wq, mmsize > + jl .loop > +RET > + > +%endmacro > + > INIT_XMM sse2 > +DD97_VERTICAL_HI > HAAR_HORIZONTAL > HAAR_VERTICAL > LEGALL53_VERTICAL_HI > diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c > b/libavcodec/x86/dirac_dwt_init_10bit.c > index 88cf267d14..e7e7534050 100644 > --- a/libavcodec/x86/dirac_dwt_init_10bit.c > +++ b/libavcodec/x86/dirac_dwt_init_10bit.c > @@ -23,6 +23,8 @@ > #include "libavutil/x86/cpu.h" > #include "libavcodec/dirac_dwt.h" > > +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int32_t *b3, int32_t *b4, int width); > + > void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int width); > void 
ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int width); > > @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, > int32_t *b1, int32_t *b2, int > b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); > } > > +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > + int32_t *b3, int32_t *b4, int width) > +{ > + int i = width & ~3; > + ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); > + for(; i<width; i++) > + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); > + > +} >
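This comment applies to the rest of the patchset as well: what's up with the hybrid implementations? Here the C wrapper calls the asm function for the width rounded down to a multiple of 4 and then finishes the remaining elements in a scalar C loop. Couldn't you handle that second part (the tail) in the asm code as well? As it stands there are 2 function calls (the C wrapper, then the asm function) instead of 1. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel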