From: Kieran Kunhya <[email protected]>
Signed-off-by: Ronald S. Bultje <[email protected]>
---
libswscale/x86/scale.asm | 61 ++++++++++++++++++++++++++++++++++++++++++
libswscale/x86/swscale_mmx.c | 15 ++++++++++
2 files changed, 76 insertions(+), 0 deletions(-)
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index c74a2b2..ec8f4ec 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -34,6 +34,8 @@ pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
+yuv2yuvX_10_start: times 4 dd 1 << 16       ; accumulator seed: half of the final >> 17 step
+yuv2yuvX_10_upper: times 8 dw (1 << 10) - 1 ; 0x3ff, the upper clamp applied with pminsw
SECTION .text
@@ -569,3 +571,62 @@ yuv2plane1_fn 10, sse2, 5, 3
yuv2plane1_fn 16, sse2, 6, 3
yuv2plane1_fn 16, sse4, 5, 3
+
+; void ff_yuv2planeX10_<opt>(const int16_t *filter, int filterSize,
+;                            const int16_t **src, uint8_t *dest, int dstW,
+;                            const uint8_t *dither, int offset);
+; Vertical filter producing 10-bit samples, mmsize/2 pixels per pass.
+; Taps are consumed in pairs, so filterSize must be even; dither/offset unread.
+%macro yuv2planeX10 1
+%ifdef ARCH_X86_32
+%define cntr_reg  r1
+%define cntr_load mov
+%else
+%define cntr_reg  r11
+%define cntr_load movsxd
+%endif
+; The symbol must match the ff_yuv2planeX10_* prototypes in swscale_mmx.c.
+cglobal yuv2planeX10_%1, 7, 7
+    xor         r5, r5                  ; r5 = byte offset into src rows/dest
+.pixelloop:
+    mova        m1, [yuv2yuvX_10_start] ; both accumulators start at the
+    mova        m2, m1                  ; rounding bias for the final >> 17
+    ; Reload the tap count from the argument slot: on x86-32 cntr_reg is r1
+    ; itself and the previous pass has already counted it down to zero.
+    cntr_load   cntr_reg, r1m
+.filterloop:
+    mov         r6, [r2+gprsize*cntr_reg-2*gprsize]
+    mova        m3, [r6+r5]             ; source row for tap cntr_reg-2
+    mov         r6, [r2+gprsize*cntr_reg-gprsize]
+    mova        m4, [r6+r5]             ; source row for tap cntr_reg-1
+    punpcklwd   m5, m3, m4              ; interleave the two rows so that
+    punpckhwd   m3, m4                  ; pmaddwd applies both taps at once
+    movd        m0, [r0+2*cntr_reg-4]   ; the matching coefficient pair
+    SPLATD      m0, m0
+    pmaddwd     m5, m0
+    pmaddwd     m3, m0
+    paddd       m2, m5                  ; m2 accumulates the low four pixels
+    paddd       m1, m3                  ; m1 accumulates the high four pixels
+    sub         cntr_reg, 2
+    jg .filterloop
+
+    psrad       m2, 17
+    psrad       m1, 17
+    ; packusdw is SSE4.1-only, so clamp by hand: after >> 17 every dword fits
+    ; in int16, hence packssdw is exact and pmaxsw supplies the lower bound.
+    pxor        m0, m0
+    packssdw    m2, m1
+    pmaxsw      m2, m0
+    pminsw      m2, [yuv2yuvX_10_upper]
+    mova        [r3+r5], m2
+
+    add         r5, mmsize
+    sub         r4d, mmsize/2
+    jg .pixelloop
+    REP_RET
+%endmacro
+
+INIT_XMM
+yuv2planeX10 sse2
+INIT_AVX
+yuv2planeX10 avx
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 009d5fd..c2cbdb6 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -226,6 +226,14 @@ VSCALE_FUNCS(mmx, mmx2);
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
+extern void ff_yuv2planeX10_sse2(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
void ff_sws_init_swScale_mmx(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -285,6 +293,8 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+ if (c->dstBpc == 10 && !isBE(c->dstFormat) && !(c->vChrFilterSize&1))
+ c->yuv2planeX_chroma = ff_yuv2planeX10_sse2;
}
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -297,5 +307,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
+
+ if (cpu_flags & AV_CPU_FLAG_AVX) {
+ if (c->dstBpc == 10 && !isBE(c->dstFormat) && !(c->vChrFilterSize&1))
+ c->yuv2planeX_chroma = ff_yuv2planeX10_avx;
+ }
#endif
}
--
1.7.2.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel