Hi, On Mon, Oct 3, 2011 at 9:52 PM, Ronald S. Bultje <[email protected]> wrote: > On Mon, Oct 3, 2011 at 2:48 PM, Janne Grunau <[email protected]> wrote: >> On Mon, Oct 03, 2011 at 02:26:46PM -0700, Ronald S. Bultje wrote: >>> >>> >> --- a/libavcodec/x86/Makefile >>> >> +++ b/libavcodec/x86/Makefile >>> >> @@ -33,6 +33,9 @@ MMX-OBJS-$(CONFIG_ENCODERS) += >>> >> x86/dsputilenc_mmx.o >>> >> YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o >>> >> MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o >>> >> MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o >>> >> +YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o >>> >> +MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o >>> >> \ >>> >> + $(YASM-OBJS-yes) >>> > >>> > This is wrong, $(YASM-OBJS-yes) is already added to MMX-OBJS elsewhere. >>> >>> OK, but that's wrong, it should only be if CONFIG_PRORES_DECODER is >>> yes. How to do that? >> >> huh? There are other objects in $(YASM-OBJS-yes) and it won't have >> anything prores related if $(CONFIG_PRORES_DECODER) does not evaluate >> to yes. > > Oh right. Let me cook up a new patch then...
Attached. Ronald
From f86865ca16777c51cff546de9b4423cdbea74512 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" <[email protected]> Date: Fri, 30 Sep 2011 14:37:11 +0200 Subject: [PATCH 1/2] prores: idct sse2/sse4 optimizations. ~3.0-3.5x as fast as original C version, 1.6x as fast overall. --- libavcodec/dsputil.h | 3 +- libavcodec/proresdec.c | 2 +- libavcodec/proresdsp.c | 2 + libavcodec/proresdsp.h | 2 + libavcodec/x86/Makefile | 3 + libavcodec/x86/dsputil_mmx.c | 2 + libavcodec/x86/proresdsp-init.c | 45 ++++ libavcodec/x86/proresdsp.asm | 506 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 563 insertions(+), 2 deletions(-) create mode 100644 libavcodec/x86/proresdsp-init.c create mode 100644 libavcodec/x86/proresdsp.asm diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 73830f8..c22efc2 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -202,7 +202,8 @@ typedef struct ScanTable{ } ScanTable; void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable); - +void ff_init_scantable_permutation(uint8_t *idct_permutation, + int idct_permutation_type); #define EMULATED_EDGE(depth) \ void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, int linesize,\ int block_w, int block_h,\ diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec.c index 0424093..fed8bf0 100644 --- a/libavcodec/proresdec.c +++ b/libavcodec/proresdec.c @@ -478,7 +478,7 @@ static void decode_slice_plane(ProresContext *ctx, const uint8_t *buf, * that input I temporally introduced the coarse solution below... 
*/ for (j = 0; j < blocks_per_mb; j++) for (i = 0; i < 64; i++) - block_ptr[j * 64 + i] = (block_ptr[j * 64 + i] * qmat[i]) >> 2; + block_ptr[j * 64 + i] = (block_ptr[j * 64 + i] * qmat[ctx->dsp.idct_permutation[i]]) >> 2; ctx->dsp.idct_put(out_ptr, linesize, block_ptr); block_ptr += 64; diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c index 7f20c83..d1506be 100644 --- a/libavcodec/proresdsp.c +++ b/libavcodec/proresdsp.c @@ -56,6 +56,8 @@ void ff_proresdsp_init(ProresDSPContext *dsp) dsp->idct_put = prores_idct_put_c; dsp->idct_permutation_type = FF_NO_IDCT_PERM; + if (HAVE_MMX) ff_proresdsp_x86_init(dsp); + ff_init_scantable_permutation(dsp->idct_permutation, dsp->idct_permutation_type); } diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h index 96a5cb6..380f5b6 100644 --- a/libavcodec/proresdsp.h +++ b/libavcodec/proresdsp.h @@ -35,4 +35,6 @@ typedef struct { void ff_proresdsp_init(ProresDSPContext *dsp); +void ff_proresdsp_x86_init(ProresDSPContext *dsp); + #endif /* LIBAVCODEC_PRORESDSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a94f97a..7e0030b 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -33,6 +33,8 @@ MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o +YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o +MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 2fb75cb..58620d6 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -64,6 +64,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x00400 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; 
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; diff --git a/libavcodec/x86/proresdsp-init.c b/libavcodec/x86/proresdsp-init.c new file mode 100644 index 0000000..99fc44f --- /dev/null +++ b/libavcodec/x86/proresdsp-init.c @@ -0,0 +1,45 @@ +/* + * Apple ProRes compatible decoder + * + * Copyright (c) 2010-2011 Maxim Poliakovski + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/proresdsp.h" + +void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, + DCTELEM *block); +void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, + DCTELEM *block); + +void ff_proresdsp_x86_init(ProresDSPContext *dsp) +{ +#if ARCH_X86_64 + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_SSE2) { + dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + dsp->idct_put = ff_prores_idct_put_10_sse2; + } + + if (flags & AV_CPU_FLAG_SSE4) { + dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + dsp->idct_put = ff_prores_idct_put_10_sse4; + } +#endif +} diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm new file mode 100644 index 0000000..c87c3ff --- /dev/null +++ b/libavcodec/x86/proresdsp.asm @@ -0,0 +1,506 @@ +;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT except for the clip range +;* +;* Copyright (c) 2011 Ronald S. Bultje <[email protected]> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 +%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 +%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 +%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 +%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 +%define W6sh2 8867 ; W6 = 35468 = 8867<<2 +%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 + +%ifdef ARCH_X86_64 + +SECTION_RODATA + +w4_plus_w2: times 4 dw W4sh2, +W2sh2 +w4_min_w2: times 4 dw W4sh2, -W2sh2 +w4_plus_w6: times 4 dw W4sh2, +W6sh2 +w4_min_w6: times 4 dw W4sh2, -W6sh2 +w1_plus_w3: times 4 dw W1sh2, +W3sh2 +w3_min_w1: times 4 dw W3sh2, -W1sh2 +w7_plus_w3: times 4 dw W7sh2, +W3sh2 +w3_min_w7: times 4 dw W3sh2, -W7sh2 +w1_plus_w5: times 4 dw W1sh2, +W5sh2 +w5_min_w1: times 4 dw W5sh2, -W1sh2 +w5_plus_w7: times 4 dw W5sh2, +W7sh2 +w7_min_w5: times 4 dw W7sh2, -W5sh2 +row_round: times 8 dw (1<<14) + +cextern pw_4 +cextern pw_8 +cextern pw_512 +cextern pw_1019 + +section .text align=16 + +; %1 = row or col (for rounding variable) +; %2 = number of bits to shift at the end +%macro IDCT_1D 3 + ; a0 = (W4 * row[0]) + (1 << (15 - 1)); + ; a1 = a0; + ; a2 = a0; + ; a3 = a0; + ; a0 += W2 * row[2]; + ; a1 += W6 * row[2]; + ; a2 -= W6 * row[2]; + ; a3 -= W2 * row[2]; +%ifidn %1, col + paddw m0, [pw_8] +%endif + movdqa m10, m0 + movdqa m1, m0 +; OK for row + punpcklwd m0, m8 ; { row[0], row[2] }[0-3] + punpckhwd m1, m8 ; { row[0], row[2] }[4-7] +%ifidn %1, row + psllw m0, 2 + psllw m1, 2 + psubw m10,[row_round] +%endif + signextend m8, m9, m14 ; { row[2] }[0-3] / [4-7] + signextend m10, m11, m14 ; { row[0] }[0-3] / [4-7] + movdqa m2, m0 + movdqa m4, m0 + movdqa m6, m0 + movdqa m3, 
m1 + movdqa m5, m1 + movdqa m7, m1 + pmaddwd m0, [w4_plus_w2] + pmaddwd m1, [w4_plus_w2] + pmaddwd m2, [w4_plus_w6] + pmaddwd m3, [w4_plus_w6] + pmaddwd m4, [w4_min_w6] + pmaddwd m5, [w4_min_w6] + pmaddwd m6, [w4_min_w2] + pmaddwd m7, [w4_min_w2] +%ifidn %1, col + pslld m0, 2 + pslld m1, 2 + pslld m2, 2 + pslld m3, 2 + pslld m4, 2 + pslld m5, 2 + pslld m6, 2 + pslld m7, 2 +%endif + + ; a0: -1*row[0]-1*row[2] + ; a1: -1*row[0] + ; a2: -1*row[0] + ; a3: -1*row[0]+1*row[2] + psubd m0, m10 + psubd m1, m11 + psubd m2, m10 ; a1[0-3] + psubd m3, m11 ; a1[4-7] + psubd m4, m10 ; a2[0-3] + psubd m5, m11 ; a2[4-7] + psubd m6, m10 + psubd m7, m11 + psubd m0, m8 ; a0[0-3] + psubd m1, m9 ; a0[4-7] + paddd m6, m8 ; a3[0-3] + paddd m7, m9 ; a3[4-7] + + ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] + ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] + ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] + ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4] + movdqa m8, m13 + movdqa m9, m13 + punpcklwd m8, m12 ; { row[4], row[6] }[0-3] + punpckhwd m9, m12 ; { row[4], row[6] }[4-7] +%ifidn %1, row + psllw m8, 2 + psllw m9, 2 +%endif + signextend m13, m14, m10 ; { row[4] }[0-3] / [4-7] + movdqa m10, m8 + movdqa m11, m9 + pmaddwd m8, [w4_plus_w6] + pmaddwd m9, [w4_plus_w6] +%ifidn %1, col + pslld m8, 2 + pslld m9, 2 +%endif + psubd m8, m13 + psubd m9, m14 + paddd m0, m8 ; a0[0-3] + paddd m1, m9 ; a0[4-7] + movdqa m8, m10 + movdqa m9, m11 + pmaddwd m10,[w4_min_w6] + pmaddwd m11,[w4_min_w6] +%ifidn %1, col + pslld m10, 2 + pslld m11, 2 +%endif + psubd m10, m13 + psubd m11, m14 + paddd m6, m10 ; a3[0-3] + paddd m7, m11 ; a3[4-7] + movdqa m10, m8 + movdqa m11, m9 + pmaddwd m8, [w4_plus_w2] + pmaddwd m9, [w4_plus_w2] +%ifidn %1, col + pslld m8, 2 + pslld m9, 2 +%endif + psubd m8, m13 + psubd m9, m14 + psubd m2, m8 ; a1[0-3] intermediate + psubd m3, m9 ; a1[4-7] intermediate + pmaddwd m10,[w4_min_w2] + pmaddwd m11,[w4_min_w2] +%ifidn %1, col + pslld m10, 2 + pslld m11, 2 +%endif + psubd 
m10, m13 + psubd m11, m14 + psubd m4, m10 ; a2[0-3] intermediate + psubd m5, m11 ; a2[4-7] intermediate + signextend m12, m13, m10 ; { row[6] }[0-3] / [4-7] + paddd m2, m12 ; a1[0-3] + psubd m4, m12 ; a2[0-3] + paddd m3, m13 ; a1[4-7] + psubd m5, m13 ; a2[4-7] + + ; load/store + movdqa [r2+ 0], m0 + movdqa [r2+32], m2 + movdqa [r2+64], m4 + movdqa [r2+96], m6 + movdqa m0, [r2+16] ; { row[1] }[0-7] + movdqa m8, [r2+48] ; { row[3] }[0-7] + movdqa m13,[r2+80] ; { row[5] }[0-7] + movdqa m14,[r2+112] ; { row[7] }[0-7] + movdqa [r2+16], m1 + movdqa [r2+48], m3 + movdqa [r2+80], m5 + movdqa [r2+112], m7 + + ; b0 = MUL(W1, row[1]); + ; MAC(b0, W3, row[3]); + ; b1 = MUL(W3, row[1]); + ; MAC(b1, -W7, row[3]); + ; b2 = MUL(W5, row[1]); + ; MAC(b2, -W1, row[3]); + ; b3 = MUL(W7, row[1]); + ; MAC(b3, -W5, row[3]); + movdqa m1, m0 + movdqa m10, m0 + punpcklwd m0, m8 ; { row[1], row[3] }[0-3] + punpckhwd m1, m8 ; { row[1], row[3] }[4-7] +%ifidn %1, row + psllw m0, 2 + psllw m1, 2 +%endif + signextend m10, m11, m12 ; { row[1] }[0-3] / [4-7] + signextend m8, m9, m12 ; { row[3] }[0-3] / [4-7] + movdqa m2, m0 + movdqa m4, m0 + movdqa m6, m0 + movdqa m3, m1 + movdqa m5, m1 + movdqa m7, m1 + pmaddwd m0, [w1_plus_w3] + pmaddwd m1, [w1_plus_w3] + pmaddwd m2, [w3_min_w7] + pmaddwd m3, [w3_min_w7] + pmaddwd m4, [w5_min_w1] + pmaddwd m5, [w5_min_w1] + pmaddwd m6, [w7_min_w5] + pmaddwd m7, [w7_min_w5] +%ifidn %1, col + pslld m0, 2 + pslld m1, 2 + pslld m2, 2 + pslld m3, 2 + pslld m4, 2 + pslld m5, 2 + pslld m6, 2 + pslld m7, 2 +%endif + + ; b0: +1*row[1]+2*row[3] + ; b1: +2*row[1]-1*row[3] + ; b2: -1*row[1]-1*row[3] + ; b3: +1*row[1]+1*row[3] + paddd m0, m8 + paddd m1, m9 + psubd m2, m8 + psubd m3, m9 + paddd m8, m10 ; { row[1] + row[3] }[0-3] + paddd m9, m11 ; { row[1] + row[3] }[4-7] + pslld m10, 1 + pslld m11, 1 + paddd m0, m8 ; b0[0-3] + paddd m1, m9 ; b0[4-7] + paddd m2, m10 ; b1[0-3] + paddd m3, m11 ; b1[4-7] + psubd m4, m8 ; b2[0-3] + psubd m5, m9 ; b2[4-7] + paddd m6, m8 ; b3[0-3] + 
paddd m7, m9 ; b3[4-7] + + ; MAC(b0, W5, row[5]); + ; MAC(b0, W7, row[7]); + ; MAC(b1, -W1, row[5]); + ; MAC(b1, -W5, row[7]); + ; MAC(b2, W7, row[5]); + ; MAC(b2, W3, row[7]); + ; MAC(b3, W3, row[5]); + ; MAC(b3, -W1, row[7]); + movdqa m8, m13 + movdqa m9, m13 + punpcklwd m8, m14 ; { row[5], row[7] }[0-3] + punpckhwd m9, m14 ; { row[5], row[7] }[4-7] +%ifidn %1, row + psllw m8, 2 + psllw m9, 2 +%endif + signextend m13, m12, m11 ; { row[5] }[0-3] / [4-7] + signextend m14, m11, m10 ; { row[7] }[0-3] / [4-7] + + ; b0: -1*row[5]+1*row[7] + ; b1: -1*row[5]+1*row[7] + ; b2: +1*row[5]+2*row[7] + ; b3: +2*row[5]-1*row[7] + paddd m4, m13 + paddd m5, m12 + paddd m6, m13 + paddd m7, m12 + psubd m13, m14 ; { row[5] - row[7] }[0-3] + psubd m12, m11 ; { row[5] - row[7] }[4-7] + pslld m14, 1 + pslld m11, 1 + psubd m0, m13 + psubd m1, m12 + psubd m2, m13 + psubd m3, m12 + paddd m4, m14 + paddd m5, m11 + paddd m6, m13 + paddd m7, m12 + + movdqa m10, m8 + movdqa m11, m9 + movdqa m12, m8 + movdqa m13, m9 + pmaddwd m8, [w5_plus_w7] + pmaddwd m9, [w5_plus_w7] + pmaddwd m10,[w1_plus_w5] + pmaddwd m11,[w1_plus_w5] +%ifidn %1, col + pslld m8, 2 + pslld m9, 2 + pslld m10, 2 + pslld m11, 2 +%endif + paddd m0, m8 ; b0[0-3] + paddd m1, m9 ; b0[4-7] + psubd m2, m10 ; b1[0-3] + psubd m3, m11 ; b1[4-7] + movdqa m8, m12 + movdqa m9, m13 + pmaddwd m12,[w7_plus_w3] + pmaddwd m13,[w7_plus_w3] + pmaddwd m8, [w3_min_w1] + pmaddwd m9, [w3_min_w1] +%ifidn %1, col + pslld m12, 2 + pslld m13, 2 + pslld m8, 2 + pslld m9, 2 +%endif + paddd m4, m12 ; b2[0-3] + paddd m5, m13 ; b2[4-7] + paddd m6, m8 ; b3[0-3] + paddd m7, m9 ; b3[4-7] + + ; row[0] = (a0 + b0) >> 15; + ; row[7] = (a0 - b0) >> 15; + ; row[1] = (a1 + b1) >> 15; + ; row[6] = (a1 - b1) >> 15; + ; row[2] = (a2 + b2) >> 15; + ; row[5] = (a2 - b2) >> 15; + ; row[3] = (a3 + b3) >> 15; + ; row[4] = (a3 - b3) >> 15; + movdqa m8, [r2+ 0] ; a0[0-3] + movdqa m9, [r2+16] ; a0[4-7] + movdqa m10, m8 + movdqa m11, m9 + paddd m8, m0 ; { a0 + b0 }[0-3] + paddd 
m9, m1 ; { a0 + b0 }[4-7] + psubd m10, m0 ; { a0 - b0 }[0-3] + psubd m11, m1 ; { a0 - b0 }[4-7] + psrad m8, %2 + psrad m9, %2 + psrad m10, %2 + psrad m11, %2 + packssdw m8, m9 ; row[0] + packssdw m10, m11 ; row[7] + movdqa m0, [r2+32] ; a1[0-3] + movdqa m1, [r2+48] ; a1[4-7] + movdqa m9, m0 + movdqa m11, m1 + paddd m0, m2 ; { a1 + b1 }[0-3] + paddd m1, m3 ; { a1 + b1 }[4-7] + psubd m9, m2 ; { a1 - b1 }[0-3] + psubd m11, m3 ; { a1 - b1 }[4-7] + psrad m0, %2 + psrad m1, %2 + psrad m9, %2 + psrad m11, %2 + packssdw m0, m1 ; row[1] + packssdw m9, m11 ; row[6] + movdqa m1, [r2+64] ; a2[0-3] + movdqa m2, [r2+80] ; a2[4-7] + movdqa m11, m1 + movdqa m3, m2 + paddd m1, m4 ; { a2 + b2 }[0-3] + paddd m2, m5 ; { a2 + b2 }[4-7] + psubd m11, m4 ; { a2 - b2 }[0-3] + psubd m3, m5 ; { a2 - b2 }[4-7] + psrad m1, %2 + psrad m2, %2 + psrad m11, %2 + psrad m3, %2 + packssdw m1, m2 ; row[2] + packssdw m11, m3 ; row[5] + movdqa m2, [r2+96] ; a3[0-3] + movdqa m3, [r2+112] ; a3[4-7] + movdqa m4, m2 + movdqa m5, m3 + paddd m2, m6 ; { a3 + b3 }[0-3] + paddd m3, m7 ; { a3 + b3 }[4-7] + psubd m4, m6 ; { a3 - b3 }[0-3] + psubd m5, m7 ; { a3 - b3 }[4-7] + psrad m2, %2 + psrad m3, %2 + psrad m4, %2 + psrad m5, %2 + packssdw m2, m3 ; row[3] + packssdw m4, m5 ; row[4] +%endmacro + +; void prores_idct_put_10_<opt>(uint16_t *pixels, int stride, DCTELEM *block); +%macro idct_put_fn 2 +cglobal prores_idct_put_10_%1, 3, 3, %2 + movsxd r1, r1d + pxor m15, m15 ; zero + + ; for (i = 0; i < 8; i++) + ; idctRowCondDC(block + i*8); + movdqa m0, [r2+ 0] ; { row[0] }[0-7] + movdqa m8, [r2+32] ; { row[2] }[0-7] + movdqa m13,[r2+64] ; { row[4] }[0-7] + movdqa m12,[r2+96] ; { row[6] }[0-7] + + IDCT_1D row, 15, %1 + + ; transpose for second part of IDCT + TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 + SWAP 8, 0 + SWAP 1, 8 + SWAP 4, 13 + SWAP 9, 12 + + ; for (i = 0; i < 8; i++) + ; idctSparseColAdd(dest + i, line_size, block + i); + movdqa [r2+16], m1 + movdqa [r2+48], m2 + movdqa [r2+80], m11 + movdqa [r2+112],m10 + + 
IDCT_1D col, 20, %1 + + ; clip/store + movdqa m6, [pw_512] + movdqa m3, [pw_4] + movdqa m5, [pw_1019] + paddw m8, m6 + paddw m0, m6 + paddw m1, m6 + paddw m2, m6 + paddw m4, m6 + paddw m11, m6 + paddw m9, m6 + paddw m10, m6 + pmaxsw m8, m3, m7 + pmaxsw m0, m3, m7 + pmaxsw m1, m3, m7 + pmaxsw m2, m3, m7 + pmaxsw m4, m3, m7 + pmaxsw m11, m3, m7 + pmaxsw m9, m3, m7 + pmaxsw m10, m3, m7 + pminsw m8, m5, m7 + pminsw m0, m5, m7 + pminsw m1, m5, m7 + pminsw m2, m5, m7 + pminsw m4, m5, m7 + pminsw m11, m5, m7 + pminsw m9, m5, m7 + pminsw m10, m5, m7 + + lea r2, [r1*3] + movdqa [r0 ], m8 + movdqa [r0+r1], m0 + movdqa [r0+r1*2],m1 + movdqa [r0+r2], m2 + lea r0, [r0+r1*4] + movdqa [r0 ], m4 + movdqa [r0+r1], m11 + movdqa [r0+r1*2],m9 + movdqa [r0+r2], m10 + RET +%endmacro + +%macro signextend_sse2 3 ; dstlow, dsthigh, tmp + pxor %3, %3 + pcmpgtw %3, %1 + movdqa %2, %1 + punpcklwd %1, %3 + punpckhwd %2, %3 +%endmacro + +%macro signextend_sse4 2-3 ; dstlow, dsthigh + movhlps %2, %1 + pmovsxwd %1, %1 + pmovsxwd %2, %2 +%endmacro + +INIT_XMM +%define signextend signextend_sse2 +idct_put_fn sse2, 16 +INIT_XMM +%define signextend signextend_sse4 +idct_put_fn sse4, 16 + +%endif -- 1.7.6
_______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
