--- libswscale/x86/scale.asm | 129 ++++++++++++++++++++++++++++++++++++- libswscale/x86/swscale_mmx.c | 23 +++++++ libswscale/x86/swscale_template.c | 76 ---------------------- 3 files changed, 151 insertions(+), 77 deletions(-)
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index d355894..085bd9c 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* x86-optimized horizontal line scaling functions +;* x86-optimized horizontal/vertical line scaling functions ;* Copyright (c) 2011 Ronald S. Bultje <[email protected]> ;* ;* This file is part of Libav. @@ -28,6 +28,11 @@ max_19bit_int: times 4 dd 0x7ffff max_19bit_flt: times 4 dd 524287.0 minshort: times 8 dw 0x8000 unicoeff: times 4 dd 0x20000000 +pd_4: times 4 dd 4 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pw_512: times 8 dw 512 +pw_1024: times 8 dw 1024 SECTION .text @@ -427,3 +432,125 @@ INIT_XMM SCALE_FUNCS2 sse2, 6, 7, 8 SCALE_FUNCS2 ssse3, 6, 6, 8 SCALE_FUNCS2 sse4, 6, 6, 8 + +;----------------------------------------------------------------------------- +; vertical line scaling +; +; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; and +; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize, +; const int16_t **src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; +; Scale one or $filterSize lines of source data to generate one line of output +; data. The input is 15-bit in int16_t if $output_size is 8, 9 or 10, and 19-bit in +; int32_t if $output_size is 16. $filter is 12-bit. $filterSize is a multiple +; of 2. $offset is either 0 or 3. $dither holds 8 values. 
+;----------------------------------------------------------------------------- + +%macro yuv2plane1_fn 4 +cglobal yuv2plane1_%1_%2, %4, %4, %3 +%if %1 == 8 + add r1, r2 +%else ; %1 != 8 + lea r1, [r1+r2*2] +%endif ; %1 == 8 +%if %1 == 16 + lea r0, [r0+r2*4] +%else ; %1 != 16 + lea r0, [r0+r2*2] +%endif ; %1 == 16 + neg r2 + +%if %1 == 8 + pxor m4, m4 ; zero + + ; create registers holding dither + movq m3, [r3] ; dither + test r4, r4 + jz .no_rot +%if mmsize == 16 + punpcklqdq m3, m3 +%endif ; mmsize == 16 + PALIGNR_MMX m3, m3, 3, m2 +.no_rot: +%if mmsize == 8 + mova m2, m3 + punpckhbw m3, m4 ; byte->word + punpcklbw m2, m4 ; byte->word +%else + punpcklbw m3, m4 + mova m2, m3 +%endif +%elif %1 == 9 + pxor m4, m4 + mova m3, [pw_512] + mova m2, [pw_32] +%elif %1 == 10 + pxor m4, m4 + mova m3, [pw_1024] + mova m2, [pw_16] +%else ; %1 == 16 + mova m4, [pd_4] +%endif ; %1 == .. + + ; actual pixel scaling +.loop: +%if %1 == 8 + movu m0, [r0+r2*2+mmsize*0] + movu m1, [r0+r2*2+mmsize*1] + paddsw m0, m2 + paddsw m1, m3 + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 + movu [r1+r2], m0 +%elif %1 == 16 + movu m0, [r0+r2*4+mmsize*0] + movu m1, [r0+r2*4+mmsize*1] + movu m2, [r0+r2*4+mmsize*2] + movu m3, [r0+r2*4+mmsize*3] + paddd m0, m4 + paddd m1, m4 + paddd m2, m4 + paddd m3, m4 + psrad m0, 3 + psrad m1, 3 + psrad m2, 3 + psrad m3, 3 + packusdw m0, m1 + packusdw m2, m3 + movu [r1+r2*2], m0 + movu [r1+r2*2+mmsize], m2 +%else + movu m0, [r0+r2*2+mmsize*0] + movu m1, [r0+r2*2+mmsize*1] + paddw m0, m2 + paddw m1, m2 + psraw m0, 15 - %1 + psraw m1, 15 - %1 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m3 + pminsw m1, m3 + movu [r1+r2*2], m0 + movu [r1+r2*2+mmsize], m1 +%endif + add r2, mmsize + jl .loop + RET +%endmacro + +%ifdef ARCH_X86_32 +INIT_MMX +yuv2plane1_fn 8, mmx, 0, 5 +yuv2plane1_fn 9, mmx2, 0, 3 +yuv2plane1_fn 10, mmx2, 0, 3 +yuv2plane1_fn 16, mmx, 0, 3 +%endif +INIT_XMM +yuv2plane1_fn 8, sse2, 5, 5 +yuv2plane1_fn 9, sse2, 5, 3 +yuv2plane1_fn 10, sse2, 5, 3 +yuv2plane1_fn 
16, sse2, 5, 3 diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index dd7aea1..c7be8a6 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -211,6 +211,20 @@ SCALE_FUNCS_SSE(sse2); SCALE_FUNCS_SSE(ssse3); SCALE_FUNCS_SSE(sse4); +#define VSCALE_FUNC(size, opt) \ +extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \ + const uint8_t *dither, int offset) +#define VSCALE_FUNCS(opt1, opt2) \ + VSCALE_FUNC(8, opt1); \ + VSCALE_FUNC(9, opt2); \ + VSCALE_FUNC(10, opt2); \ + VSCALE_FUNC(16, opt1) + +#if ARCH_X86_32 +VSCALE_FUNCS(mmx, mmx2); +#endif +VSCALE_FUNCS(sse2, sse2); + void ff_sws_init_swScale_mmx(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); @@ -244,10 +258,18 @@ void ff_sws_init_swScale_mmx(SwsContext *c) case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ } +#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2) \ + switch(c->dstBpc){ \ + case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ + case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ + case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ + default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + } #if ARCH_X86_32 if (cpu_flags & AV_CPU_FLAG_MMX) { ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2); } #endif #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ @@ -261,6 +283,7 @@ void ff_sws_init_swScale_mmx(SwsContext *c) if (cpu_flags & AV_CPU_FLAG_SSE2) { ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2); } if (cpu_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, 
ssse3, ssse3); diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index ccf4f74..869509b 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -275,80 +275,6 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0) } -static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, - const int16_t *chrUSrc, const int16_t *chrVSrc, - const int16_t *alpSrc, - uint8_t *dst[4], int dstW, int chrDstW) -{ - int p= 4; - const int16_t *src[4]= { - lumSrc + dstW, chrUSrc + chrDstW, - chrVSrc + chrDstW, alpSrc + dstW - }; - x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; - - while (p--) { - if (dst[p]) { - __asm__ volatile( - "mov %2, %%"REG_a" \n\t" - ".p2align 4 \n\t" /* FIXME Unroll? */ - "1: \n\t" - "movq (%0, %%"REG_a", 2), %%mm0 \n\t" - "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "psraw $7, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - MOVNTQ(%%mm0, (%1, %%REGa)) - "add $8, %%"REG_a" \n\t" - "jnc 1b \n\t" - :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) - : "%"REG_a - ); - } - } -} - -static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, - const int16_t *chrUSrc, const int16_t *chrVSrc, - const int16_t *alpSrc, - uint8_t *dst[4], int dstW, int chrDstW) -{ - int p= 4; - const int16_t *src[4]= { - lumSrc + dstW, chrUSrc + chrDstW, - chrVSrc + chrDstW, alpSrc + dstW - }; - x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; - const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; - - while (p--) { - if (dst[p]) { - dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2); - __asm__ volatile( - "mov %2, %%"REG_a" \n\t" - "movq "DITHER16"+0(%3), %%mm6 \n\t" - "movq "DITHER16"+8(%3), %%mm7 \n\t" - ".p2align 4 \n\t" /* FIXME Unroll? 
*/ - "1: \n\t" - "movq (%0, %%"REG_a", 2), %%mm0 \n\t" - "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "paddsw %%mm6, %%mm0 \n\t" - "paddsw %%mm7, %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "psraw $7, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - MOVNTQ(%%mm0, (%1, %%REGa)) - "add $8, %%"REG_a" \n\t" - "jnc 1b \n\t" - :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]), "r"(&c->redDither) - : "%"REG_a - ); - } - } -} - #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -2103,7 +2029,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) { if (!(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { - //c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); //c->yuv2yuvX = RENAME(yuv2yuvX_ar ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { @@ -2116,7 +2041,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } else { - //c->yuv2yuv1 = RENAME(yuv2yuv1 ); //c->yuv2yuvX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { -- 1.7.2.1 _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
