James Almer:
> uyvytoyuv422_c: 23991.8
> uyvytoyuv422_sse2: 2817.8
> uyvytoyuv422_avx: 2819.3

Why don't you nuke the avx version in a follow-up patch?

> uyvytoyuv422_avx2: 1972.3
>
> Signed-off-by: James Almer <jamr...@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     |  6 ++++++
>  libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++--------
>  2 files changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index b325e5dbd5..21ccfafe51 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                           const uint8_t *src, int width, int height,
>                           int lumStride, int chromStride, int srcStride);
> +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                          const uint8_t *src, int width, int height,
> +                          int lumStride, int chromStride, int srcStride);
>  #endif
>
>  av_cold void rgb2rgb_init_x86(void)
> @@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void)
>      if (EXTERNAL_AVX(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx;
>      }
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> +    }
>  #endif
>  }
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 76ca1eec03..0bf1278718 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
>
>  SECTION .text
>
> -%macro RSHIFT_COPY 3
> +%macro RSHIFT_COPY 5
>  ; %1 dst ; %2 src ; %3 shift
> -%if cpuflag(avx)
> -    psrldq %1, %2, %3
> +%if mmsize == 32
> +    vperm2i128 %1, %2, %3, %5
> +    RSHIFT %1, %4
> +%elif cpuflag(avx)
> +    psrldq %1, %2, %4
>  %else
>      mova %1, %2
> -    RSHIFT %1, %3
> +    RSHIFT %1, %4
>  %endif
>  %endmacro
>
> @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      jge .end_line
>
>  .loop_simd:
> +%if mmsize == 32
> +    movu xm2, [srcq + wtwoq ]
> +    movu xm3, [srcq + wtwoq + 16 ]
> +    movu xm4, [srcq + wtwoq + 16 * 2]
> +    movu xm5, [srcq + wtwoq + 16 * 3]
> +    vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
> +    vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
> +    vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
> +    vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
> +%else
>      movu m2, [srcq + wtwoq ]
>      movu m3, [srcq + wtwoq + mmsize ]
>      movu m4, [srcq + wtwoq + mmsize * 2]
>      movu m5, [srcq + wtwoq + mmsize * 3]
> +%endif
>
>      ; extract y part 1
> -    RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
> +    RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>      pand m6, m1; YxYx YxYx...
>
> -    RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
> +    RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>      pand m7, m1 ; YxYx YxYx...
>
>      packuswb m6, m7 ; YYYY YYYY...
>      movu [ydstq + wq], m6
>
>      ; extract y part 2
> -    RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
> +    RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
>      pand m6, m1; YxYx YxYx...
>
> -    RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
> +    RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
>      pand m7, m1 ; YxYx YxYx...
>
>      packuswb m6, m7 ; YYYY YYYY...
> @@ -309,4 +323,6 @@ UYVY_TO_YUV422
>
>  INIT_XMM avx
>  UYVY_TO_YUV422
> +INIT_YMM avx2
> +UYVY_TO_YUV422
>  %endif

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".