# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1392374441 -19800 # Fri Feb 14 16:10:41 2014 +0530 # Node ID 831536babdc08f1553a10754bf2a4f4af6aa1695 # Parent ed310b17ff6681f191c85341cf6efe7a50770143 asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Feb 14 02:30:52 2014 -0600 +++ b/source/common/x86/asm-primitives.cpp Fri Feb 14 16:10:41 2014 +0530 @@ -726,6 +726,10 @@ p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2; p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2; p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2; + + p.dct[DCT_4x4] = x265_dct4_sse2; + p.idct[IDCT_4x4] = x265_idct4_sse2; + p.idct[IDST_4x4] = x265_idst4_sse2; } if (cpuMask & X265_CPU_SSSE3) { @@ -740,9 +744,12 @@ SETUP_INTRA_ANG32(2, 2, ssse3); SETUP_INTRA_ANG32(34, 2, ssse3); + + p.dct[DST_4x4] = x265_dst4_ssse3; } if (cpuMask & X265_CPU_SSE4) { + p.dct[DCT_8x8] = x265_dct8_sse4; p.cvt16to32_shl = x265_cvt16to32_shl_sse4; p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4; diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Fri Feb 14 02:30:52 2014 -0600 +++ b/source/common/x86/const-a.asm Fri Feb 14 16:10:41 2014 +0530 @@ -72,6 +72,8 @@ const pd_1, times 4 dd 1 const pd_2, times 4 dd 2 +const pd_4, times 4 dd 4 +const pd_8, times 4 dd 8 const pd_16, times 4 dd 16 const pd_32, times 4 dd 32 const pd_64, times 4 dd 64 diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Fri Feb 14 02:30:52 2014 -0600 +++ b/source/common/x86/dct8.asm Fri Feb 14 16:10:41 2014 +0530 @@ -67,6 +67,10 @@ cextern pd_1 cextern pd_2 +cextern pd_4 +cextern pd_8 +cextern pd_16 +cextern pd_32 cextern pd_64 cextern pd_128 cextern pd_256 @@ -79,6 +83,15 @@ ;------------------------------------------------------ INIT_XMM sse2 cglobal dct4, 3, 4, 8 +%if BIT_DEPTH == 10 + %define DCT_SHIFT 3 + mova m7, [pd_4] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 1 + mova m7, [pd_1] +%else + %error Unsupported BIT_DEPTH! 
+%endif add r2d, r2d lea r3, [tab_dct4] @@ -87,8 +100,6 @@ mova m5, [r3 + 1 * 16] mova m6, [r3 + 2 * 16] - mova m7, [pd_1] - movh m0, [r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 @@ -110,11 +121,11 @@ pmaddwd m0, m1, m4 paddd m0, m7 - psrad m0, 1 + psrad m0, DCT_SHIFT pmaddwd m3, m2, m5 paddd m3, m7 - psrad m3, 1 + psrad m3, DCT_SHIFT packssdw m0, m3 pshufd m0, m0, 0xD8 @@ -122,11 +133,11 @@ pmaddwd m1, m6 paddd m1, m7 - psrad m1, 1 + psrad m1, DCT_SHIFT pmaddwd m2, [r3 + 3 * 16] paddd m2, m7 - psrad m2, 1 + psrad m2, DCT_SHIFT packssdw m1, m2 pshufd m1, m1, 0xD8 @@ -179,7 +190,7 @@ %define IDCT4_OFFSET [pd_512] %define IDCT4_SHIFT 10 %else - %error Unsupport BIT_DEPTH! + %error Unsupported BIT_DEPTH! %endif add r2d, r2d lea r3, [tab_dct4] @@ -268,25 +279,28 @@ INIT_XMM ssse3 %if ARCH_X86_64 cglobal dst4, 3, 4, 8+2 + %define coef2 m8 + %define coef3 m9 %else ; ARCH_X86_64 = 0 cglobal dst4, 3, 4, 8 + %define coef2 [r3 + 2 * 16] + %define coef3 [r3 + 3 * 16] %endif ; ARCH_X86_64 - %define coef0 m6 - %define coef1 m7 -%if ARCH_X86_64 - %define coef2 m8 - %define coef3 m9 -%else - %define coef2 [r3 + 2 * 16] - %define coef3 [r3 + 3 * 16] -%endif +%define coef0 m6 +%define coef1 m7 + +%if BIT_DEPTH == 8 + %define DST_SHIFT 1 + mova m5, [pd_1] +%elif BIT_DEPTH == 10 + %define DST_SHIFT 3 + mova m5, [pd_4] +%endif add r2d, r2d lea r3, [tab_dst4] - mova m5, [pd_1] - mova coef0, [r3 + 0 * 16] mova coef1, [r3 + 1 * 16] %if ARCH_X86_64 @@ -294,7 +308,7 @@ mova coef3, [r3 + 3 * 16] %endif - movh m0, [r0 + 0 * r2] ;load + movh m0, [r0 + 0 * r2] ; load movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 @@ -303,30 +317,30 @@ movh m2, [r0 + r2] punpcklqdq m1, m2 - pmaddwd m2, m0, coef0 ;DST1 + pmaddwd m2, m0, coef0 ; DST1 pmaddwd m3, m1, coef0 phaddd m2, m3 paddd m2, m5 - psrad m2, 1 + psrad m2, DST_SHIFT pmaddwd m3, m0, coef1 pmaddwd m4, m1, coef1 phaddd m3, m4 paddd m3, m5 - psrad m3, 1 + psrad m3, DST_SHIFT packssdw m2, m3 ; m2 = T70 pmaddwd m3, m0, coef2 pmaddwd m4, m1, 
coef2 phaddd m3, m4 paddd m3, m5 - psrad m3, 1 + psrad m3, DST_SHIFT pmaddwd m0, coef3 pmaddwd m1, coef3 phaddd m0, m1 paddd m0, m5 - psrad m0, 1 + psrad m0, DST_SHIFT packssdw m3, m0 ; m3 = T71 mova m5, [pd_128] @@ -365,8 +379,16 @@ ;void idst4(int32_t *src, int16_t *dst, intptr_t stride) ;------------------------------------------------------- INIT_XMM sse2 -cglobal idst4, 3, 4, 6 - +cglobal idst4, 3, 4, 7 +%if BIT_DEPTH == 8 + %define m6 [pd_2048] + %define IDCT4_SHIFT 12 +%elif BIT_DEPTH == 10 + %define m6 [pd_512] + %define IDCT4_SHIFT 10 +%else + %error Unsupported BIT_DEPTH! +%endif add r2d, r2d lea r3, [tab_idst4] @@ -415,35 +437,33 @@ punpcklwd m2, m0, m1 punpckhwd m0, m1 - mova m5, [pd_2048] - punpcklwd m1, m2, m0 punpckhwd m2, m0 pmaddwd m0, m1, [r3 + 0 * 16] pmaddwd m3, m2, [r3 + 1 * 16] paddd m0, m3 - paddd m0, m5 - psrad m0, 12 ; m1 = S0 + paddd m0, m6 + psrad m0, IDCT4_SHIFT ; m0 = S0 pmaddwd m3, m1, [r3 + 2 * 16] pmaddwd m4, m2, [r3 + 3 * 16] paddd m3, m4 - paddd m3, m5 - psrad m3, 12 ; m3 = S8 + paddd m3, m6 + psrad m3, IDCT4_SHIFT ; m3 = S8 packssdw m0, m3 ; m0 = m128iA pmaddwd m3, m1, [r3 + 4 * 16] pmaddwd m4, m2, [r3 + 5 * 16] paddd m3, m4 - paddd m3, m5 - psrad m3, 12 ; m3 = S0 + paddd m3, m6 + psrad m3, IDCT4_SHIFT ; m3 = S0 pmaddwd m1, [r3 + 6 * 16] pmaddwd m2, [r3 + 7 * 16] paddd m1, m2 - paddd m1, m5 - psrad m1, 12 ; m1 = S8 + paddd m1, m6 + psrad m1, IDCT4_SHIFT ; m1 = S8 packssdw m3, m1 ; m3 = m128iD punpcklwd m1, m0, m3 @@ -476,11 +496,20 @@ ; Row6[4-7] Row7[4-7] ;------------------------ +%if BIT_DEPTH == 10 + %define DCT_SHIFT 4 + mova m6, [pd_8] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT 2 + mova m6, [pd_2] +%else + %error Unsupported BIT_DEPTH! 
+%endif + add r2, r2 lea r3, [r2 * 3] mov r5, rsp - mova m6, [pd_2] %assign x 0 %rep 2 movu m0, [r0] @@ -518,7 +547,7 @@ pmaddwd m5, m0, [r4 + 0*16] phaddd m1, m5 paddd m1, m6 - psrad m1, 2 + psrad m1, DCT_SHIFT %if x == 1 pshufd m1, m1, 0x1B %endif @@ -528,7 +557,7 @@ pmaddwd m5, m0, [r4 + 1*16] phaddd m1, m5 paddd m1, m6 - psrad m1, 2 + psrad m1, DCT_SHIFT %if x == 1 pshufd m1, m1, 0x1B %endif @@ -538,7 +567,7 @@ pmaddwd m5, m0, [r4 + 2*16] phaddd m1, m5 paddd m1, m6 - psrad m1, 2 + psrad m1, DCT_SHIFT %if x == 1 pshufd m1, m1, 0x1B %endif @@ -548,7 +577,7 @@ pmaddwd m0, [r4 + 3*16] phaddd m4, m0 paddd m4, m6 - psrad m4, 2 + psrad m4, DCT_SHIFT %if x == 1 pshufd m4, m4, 0x1B %endif @@ -564,7 +593,7 @@ pmaddwd m3, m0, [r4 + 0*16] paddd m3, m6 - psrad m3, 2 + psrad m3, DCT_SHIFT %if x == 1 pshufd m3, m3, 0x1B %endif @@ -572,7 +601,7 @@ pmaddwd m0, [r4 + 2*16] paddd m0, m6 - psrad m0, 2 + psrad m0, DCT_SHIFT %if x == 1 pshufd m0, m0, 0x1B %endif @@ -580,7 +609,7 @@ pmaddwd m3, m2, [r4 + 1*16] paddd m3, m6 - psrad m3, 2 + psrad m3, DCT_SHIFT %if x == 1 pshufd m3, m3, 0x1B %endif @@ -588,7 +617,7 @@ pmaddwd m2, [r4 + 3*16] paddd m2, m6 - psrad m2, 2 + psrad m2, DCT_SHIFT %if x == 1 pshufd m2, m2, 0x1B %endif diff -r ed310b17ff66 -r 831536babdc0 source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Fri Feb 14 02:30:52 2014 -0600 +++ b/source/test/mbdstharness.cpp Fri Feb 14 16:10:41 2014 +0530 @@ -173,6 +173,10 @@ bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width) { +#if HIGH_BIT_DEPTH + int old_depth = X265_DEPTH; + X265_DEPTH = 10; +#endif int j = 0; int cmp_size = sizeof(int) * width * width; @@ -189,6 +193,11 @@ ref(short_test_buff[index] + j, mintbuf3, width); opt(short_test_buff[index] + j, mintbuf4, width); #endif + +#if HIGH_BIT_DEPTH + X265_DEPTH = old_depth; +#endif + return false; } @@ -199,11 +208,20 @@ #endif } +#if HIGH_BIT_DEPTH + X265_DEPTH = old_depth; +#endif + return true; } bool 
MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width) { +#if HIGH_BIT_DEPTH + int old_depth = X265_DEPTH; + X265_DEPTH = 10; +#endif + int j = 0; int cmp_size = sizeof(int16_t) * width * width; @@ -220,6 +238,11 @@ ref(int_test_buff[index] + j, mbuf2, width); opt(int_test_buff[index] + j, mbuf3, width); #endif + +#if HIGH_BIT_DEPTH + X265_DEPTH = old_depth; +#endif + return false; } @@ -230,6 +253,9 @@ #endif } +#if HIGH_BIT_DEPTH + X265_DEPTH = old_depth; +#endif return true; } _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel