---
 libswscale/x86/scale.asm          |  129 ++++++++++++++++++++++++++++++++++++-
 libswscale/x86/swscale_mmx.c      |   23 +++++++
 libswscale/x86/swscale_template.c |   76 ----------------------
 3 files changed, 151 insertions(+), 77 deletions(-)

diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index d355894..085bd9c 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* x86-optimized horizontal line scaling functions
+;* x86-optimized horizontal/vertical line scaling functions
 ;* Copyright (c) 2011 Ronald S. Bultje <[email protected]>
 ;*
 ;* This file is part of Libav.
@@ -28,6 +28,11 @@ max_19bit_int: times 4 dd 0x7ffff
 max_19bit_flt: times 4 dd 524287.0
 minshort:      times 8 dw 0x8000
 unicoeff:      times 4 dd 0x20000000
+pd_4:          times 4 dd 4             ; rounding term added before the >>3 of the 16-bit output path
+pw_16:         times 8 dw 16            ; rounding term for 10-bit output (shifted right by 15-10)
+pw_32:         times 8 dw 32            ; rounding term for 9-bit output (shifted right by 15-9)
+pw_512:        times 8 dw 512           ; pminsw upper bound used for 9-bit output
+pw_1024:       times 8 dw 1024          ; pminsw upper bound used for 10-bit output
 
 SECTION .text
 
@@ -427,3 +432,139 @@ INIT_XMM
 SCALE_FUNCS2 sse2,  6, 7, 8
 SCALE_FUNCS2 ssse3, 6, 6, 8
 SCALE_FUNCS2 sse4,  6, 6, 8
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+;                                     const int16_t **src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12-bit. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;
+; yuv2plane1_fn arguments: %1 = output bit depth (8, 9, 10 or 16), %2 = name
+; suffix, %3 = number of XMM registers used, %4 = number of function arguments.
+;-----------------------------------------------------------------------------
+
+%macro yuv2plane1_fn 4
+cglobal yuv2plane1_%1_%2, %4, %4, %3
+%ifdef ARCH_X86_64
+    movsxd          r2, r2d              ; dstW is an int; upper half is undefined
+%endif ; ARCH_X86_64
+%if %1 == 8
+    add             r1, r2
+%else ; %1 != 8
+    lea             r1, [r1+r2*2]
+%endif ; %1 == 8
+%if %1 == 16
+    lea             r0, [r0+r2*4]
+%else ; %1 != 16
+    lea             r0, [r0+r2*2]
+%endif ; %1 == 16
+    neg             r2                   ; count up from -dstW to 0
+
+%if %1 == 8
+    pxor            m4, m4               ; zero
+
+    ; create registers holding dither
+    movq            m3, [r3]             ; dither
+    test            r4d, r4d             ; offset is an int, only test 32 bits
+    jz              .no_rot
+%if mmsize == 16
+    punpcklqdq      m3, m3
+%endif ; mmsize == 16
+    PALIGNR_MMX     m3, m3, 3, m2
+.no_rot:
+%if mmsize == 8
+    mova            m2, m3
+    punpckhbw       m3, m4               ; byte->word
+    punpcklbw       m2, m4               ; byte->word
+%else
+    punpcklbw       m3, m4
+    mova            m2, m3
+%endif
+%elif %1 == 9
+    pxor            m4, m4
+    mova            m3, [pw_512]
+    mova            m2, [pw_32]
+%elif %1 == 10
+    pxor            m4, m4
+    mova            m3, [pw_1024]
+    mova            m2, [pw_16]
+%else ; %1 == 16
+    ; packusdw needs SSE4.1, so bias the rounding term by -(0x8000 << 3), pack
+    ; with signed saturation and remove the bias again on the packed words
+    mova            m4, [pd_4]
+    pcmpeqd         m5, m5               ; -1
+    pslld           m5, 18               ; -0x40000
+    paddd           m4, m5               ; 4 - 0x40000
+    mova            m5, [minshort]
+%endif ; %1 == ..
+
+    ; actual pixel scaling
+.loop:
+%if %1 == 8
+    movu            m0, [r0+r2*2+mmsize*0]
+    movu            m1, [r0+r2*2+mmsize*1]
+    paddsw          m0, m2
+    paddsw          m1, m3
+    psraw           m0, 7
+    psraw           m1, 7
+    packuswb        m0, m1
+    movu       [r1+r2], m0
+%elif %1 == 16
+    movu            m0, [r0+r2*4+mmsize*0]
+    movu            m1, [r0+r2*4+mmsize*1]
+    movu            m2, [r0+r2*4+mmsize*2]
+    movu            m3, [r0+r2*4+mmsize*3]
+    paddd           m0, m4
+    paddd           m1, m4
+    paddd           m2, m4
+    paddd           m3, m4
+    psrad           m0, 3
+    psrad           m1, 3
+    psrad           m2, 3
+    psrad           m3, 3
+    packssdw        m0, m1               ; saturates to [-0x8000,0x7fff]
+    packssdw        m2, m3
+    paddw           m0, m5               ; +0x8000 -> [0,0xffff]
+    paddw           m2, m5
+    movu     [r1+r2*2], m0
+    movu [r1+r2*2+mmsize], m2
+%else
+    movu            m0, [r0+r2*2+mmsize*0]
+    movu            m1, [r0+r2*2+mmsize*1]
+    paddsw          m0, m2               ; saturate, inputs near 32767 must not wrap
+    paddsw          m1, m2
+    psraw           m0, 15 - %1
+    psraw           m1, 15 - %1
+    pmaxsw          m0, m4
+    pmaxsw          m1, m4
+    pminsw          m0, m3
+    pminsw          m1, m3
+    movu     [r1+r2*2], m0
+    movu [r1+r2*2+mmsize], m1
+%endif
+    add             r2, mmsize           ; every path handles mmsize pixels per pass
+    jl .loop
+    RET
+%endmacro
+
+%ifdef ARCH_X86_32
+INIT_MMX
+yuv2plane1_fn  8, mmx,  0, 5
+yuv2plane1_fn  9, mmx2, 0, 3
+yuv2plane1_fn 10, mmx2, 0, 3
+yuv2plane1_fn 16, mmx,  0, 3
+%endif
+INIT_XMM
+yuv2plane1_fn  8, sse2, 5, 5
+yuv2plane1_fn  9, sse2, 5, 3
+yuv2plane1_fn 10, sse2, 5, 3
+yuv2plane1_fn 16, sse2, 6, 3
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index dd7aea1..c7be8a6 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -211,6 +211,20 @@ SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
 
+#define VSCALE_FUNC(size, opt) \
+extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
+                                               const uint8_t *dither, int offset)
+#define VSCALE_FUNCS(opt1, opt2) \
+    VSCALE_FUNC(8,  opt1); \
+    VSCALE_FUNC(9,  opt2); \
+    VSCALE_FUNC(10, opt2); \
+    VSCALE_FUNC(16, opt1)
+
+#if ARCH_X86_32
+VSCALE_FUNCS(mmx, mmx2);
+#endif
+VSCALE_FUNCS(sse2, sse2);
+
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -244,10 +258,18 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
     case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
     default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
     }
+#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) /* opt2chk: CPU supports opt2 */ \
+    switch(c->dstBpc){ \
+    case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
+    case 10: if (!isBE(c->dstFormat) && (opt2chk)) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
+    case 9:  if (!isBE(c->dstFormat) && (opt2chk)) vscalefn = ff_yuv2plane1_9_ ## opt2;  break; \
+    default: vscalefn = ff_yuv2plane1_8_ ## opt1;  break; \
+    }
 #if ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
     }
 #endif
 #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
@@ -261,6 +283,7 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
     if (cpu_flags & AV_CPU_FLAG_SSE2) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1); /* opt2 == opt1, already checked */
     }
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index ccf4f74..869509b 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -275,80 +275,6 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
     YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
 }
 
-static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
-                             const int16_t *chrUSrc, const int16_t *chrVSrc,
-                             const int16_t *alpSrc,
-                             uint8_t *dst[4], int dstW, int chrDstW)
-{
-    int p= 4;
-    const int16_t *src[4]= {
-        lumSrc + dstW,     chrUSrc + chrDstW,
-        chrVSrc + chrDstW, alpSrc + dstW
-    };
-    x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
-
-    while (p--) {
-        if (dst[p]) {
-            __asm__ volatile(
-                "mov %2, %%"REG_a"                    \n\t"
-                ".p2align               4             \n\t" /* FIXME Unroll? */
-                "1:                                   \n\t"
-                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
-                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
-                "psraw                 $7, %%mm0      \n\t"
-                "psraw                 $7, %%mm1      \n\t"
-                "packuswb           %%mm1, %%mm0      \n\t"
-                MOVNTQ(%%mm0, (%1, %%REGa))
-                "add                   $8, %%"REG_a"  \n\t"
-                "jnc                   1b             \n\t"
-                :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                   "g" (-counter[p])
-                : "%"REG_a
-            );
-        }
-    }
-}
-
-static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
-                                const int16_t *chrUSrc, const int16_t *chrVSrc,
-                                const int16_t *alpSrc,
-                                uint8_t *dst[4], int dstW, int chrDstW)
-{
-    int p= 4;
-    const int16_t *src[4]= {
-        lumSrc + dstW,     chrUSrc + chrDstW,
-        chrVSrc + chrDstW, alpSrc + dstW
-    };
-    x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
-    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
-
-    while (p--) {
-        if (dst[p]) {
-            dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
-            __asm__ volatile(
-                "mov %2, %%"REG_a"                    \n\t"
-                "movq    "DITHER16"+0(%3), %%mm6      \n\t"
-                "movq    "DITHER16"+8(%3), %%mm7      \n\t"
-                ".p2align                4            \n\t" /* FIXME Unroll? */
-                "1:                                   \n\t"
-                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
-                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
-                "paddsw             %%mm6, %%mm0      \n\t"
-                "paddsw             %%mm7, %%mm1      \n\t"
-                "psraw                 $7, %%mm0      \n\t"
-                "psraw                 $7, %%mm1      \n\t"
-                "packuswb           %%mm1, %%mm0      \n\t"
-                MOVNTQ(%%mm0, (%1, %%REGa))
-                "add                   $8, %%"REG_a"  \n\t"
-                "jnc                   1b             \n\t"
-                :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                   "g" (-counter[p]), "r"(&c->redDither)
-                : "%"REG_a
-            );
-        }
-    }
-}
-
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
@@ -2103,7 +2029,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
         if (!(c->flags & SWS_BITEXACT)) {
             if (c->flags & SWS_ACCURATE_RND) {
-                //c->yuv2yuv1 = RENAME(yuv2yuv1_ar    );
                 //c->yuv2yuvX = RENAME(yuv2yuvX_ar    );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
@@ -2116,7 +2041,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             } else {
-                //c->yuv2yuv1 = RENAME(yuv2yuv1    );
                 //c->yuv2yuvX = RENAME(yuv2yuvX    );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
-- 
1.7.2.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to