# HG changeset patch
# User Nabajit Deka
# Date 1385557692 -19800
#      Wed Nov 27 18:38:12 2013 +0530
# Branch stable
# Node ID e4206a37c20f531312013d2a5879f6dbb58c05c5
# Parent  648c669afd7476f30e4f432d839b36fbb5390332
asm: Adding asm routine for idct4

; Rounding terms used by idct4; declared external here, defined elsewhere.
cextern pd_64
cextern pd_2048

;-------------------------------------------------------
;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
;-------------------------------------------------------
; Two-pass 4x4 inverse transform.
;
; In:    r0 = src    64 bytes read as four consecutive 16-byte vectors
;                    (four rows of four int32 coefficients)
;        r1 = dst    four rows of four int16 are written
;        r2 = stride distance between dst rows, in int16 elements
;                    (doubled below to obtain a byte distance)
; Regs:  r3 = address of tab_dct4 (coefficient table, defined elsewhere in
;             this file; four 16-byte rows are used)
;        m6 = rounding term of the current pass
;        m0-m5 = scratch
;
; Each pass does, on packed int16 data:
;   E1/E2 = pmaddwd of interleaved rows 0,2 with table rows 0/2, plus rounding
;   O1/O2 = pmaddwd of interleaved rows 1,3 with table rows 1/3
;   out0 = (E1 + O1) >> s   out1 = (E2 + O2) >> s
;   out2 = (E2 - O2) >> s   out3 = (E1 - O1) >> s
; followed by a 4x4 word transpose. s = 7 in the first pass, 12 in the second.
;
; NOTE(review): pd_64 / pd_2048 are presumably packed dwords holding
; 1 << (s - 1) for each pass - confirm against their definition.
; TODO: further optimize this routine (carried over from the patch's
; file-level "TO-DO" note).
INIT_XMM sse2
cglobal idct4, 3, 4, 7
%define IDCT4_SHIFT_1ST 7
%define IDCT4_SHIFT_2ND 12

    ; stride is intptr_t: double the full-width register. Doubling only r2d
    ; would zero-extend on x86-64 and turn a negative stride into a huge
    ; positive offset.
    add         r2, r2
    lea         r3, [tab_dct4]

    mova        m6, [pd_64]                 ; first-pass rounding term

    ; Load the four int32 rows and narrow them to int16 (signed saturation).
    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]
    packssdw    m0, m1                      ; m0 = row0 | row1

    movu        m1, [r0 + 2 * 16]
    movu        m2, [r0 + 3 * 16]
    packssdw    m1, m2                      ; m1 = row2 | row3

    ;---------------- first pass ----------------
    punpcklwd   m2, m0, m1                  ; m2 = rows 0 and 2, word-interleaved
    pmaddwd     m3, m2, [r3 + 0 * 16]
    paddd       m3, m6                      ; m3 = E1 + rounding

    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, m6                      ; m2 = E2 + rounding

    punpckhwd   m0, m1                      ; m0 = rows 1 and 3, word-interleaved
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, IDCT4_SHIFT_1ST         ; m4 = (E1 + O1) >> 7
    paddd       m5, m2, m0
    psrad       m5, IDCT4_SHIFT_1ST         ; m5 = (E2 + O2) >> 7
    packssdw    m4, m5                      ; m4 = out0 | out1

    psubd       m2, m0
    psrad       m2, IDCT4_SHIFT_1ST         ; m2 = (E2 - O2) >> 7
    psubd       m3, m1
    psrad       m3, IDCT4_SHIFT_1ST         ; m3 = (E1 - O1) >> 7
    packssdw    m2, m3                      ; m2 = out2 | out3

    ; 4x4 word transpose so the second pass runs along the other dimension.
    punpcklwd   m1, m4, m2                  ; out0/out2 interleaved
    punpckhwd   m4, m2                      ; out1/out3 interleaved

    punpcklwd   m0, m1, m4                  ; m0 = transposed row0 | row1
    punpckhwd   m1, m4                      ; m1 = transposed row2 | row3

    ;---------------- second pass ---------------
    mova        m6, [pd_2048]               ; second-pass rounding term

    punpcklwd   m2, m0, m1                  ; rows 0 and 2, word-interleaved
    pmaddwd     m3, m2, [r3 + 0 * 16]
    paddd       m3, m6                      ; m3 = E1 + rounding

    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, m6                      ; m2 = E2 + rounding

    punpckhwd   m0, m1                      ; rows 1 and 3, word-interleaved
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, IDCT4_SHIFT_2ND         ; m4 = (E1 + O1) >> 12
    paddd       m5, m2, m0
    psrad       m5, IDCT4_SHIFT_2ND         ; m5 = (E2 + O2) >> 12
    packssdw    m4, m5                      ; m4 = out0 | out1

    psubd       m2, m0
    psrad       m2, IDCT4_SHIFT_2ND         ; m2 = (E2 - O2) >> 12
    psubd       m3, m1
    psrad       m3, IDCT4_SHIFT_2ND         ; m3 = (E1 - O1) >> 12
    packssdw    m2, m3                      ; m2 = out2 | out3

    ; Transpose back and store one 8-byte row (four int16) per dst line.
    punpcklwd   m1, m4, m2
    punpckhwd   m4, m2

    punpcklwd   m0, m1, m4                  ; m0 = dst row0 | dst row1
    movlps      [r1 + 0 * r2], m0
    movhps      [r1 + 1 * r2], m0

    punpckhwd   m1, m4                      ; m1 = dst row2 | dst row3
    movlps      [r1 + 2 * r2], m1
    lea         r1, [r1 + 2 * r2]           ; advance two rows to reach row 3
    movhps      [r1 + r2], m1

    RET

%undef IDCT4_SHIFT_1ST
%undef IDCT4_SHIFT_2ND
