At 2015-03-05 07:54:02,[email protected] wrote:
># HG changeset patch
># User David T Yuen<[email protected]>
># Date 1425512599 28800
># Node ID 16880e791046ef8470f8307b76aae57c3be573c1
># Parent c53b456ad909eeab8d83f8e0817e641d174cc706
>asm:intra pred planar8 sse2
>
>This replaces the C code on processors that support SSE2 through SSSE3.
>The code is backported from the SSE4 version of intrapred planar8.
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_planar_8x8
>intra_planar_8x8 3.34x 997.49 3330.22
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_planar_8x8
>intra_planar_8x8 3.87x 1052.48 4072.68
>
>diff -r c53b456ad909 -r 16880e791046 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Mar 03 18:40:21 2015
-0800
>+++ b/source/common/x86/asm-primitives.cpp Wed Mar 04 15:43:19 2015
-0800
>@@ -1210,6 +1210,7 @@
> p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2;
>
> p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] =
x265_intra_pred_planar4_sse2;
>+ p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] =
x265_intra_pred_planar8_sse2;
>
> p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
>diff -r c53b456ad909 -r 16880e791046 source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Tue Mar 03 18:40:21 2015 -0800
>+++ b/source/common/x86/intrapred.h Wed Mar 04 15:43:19 2015 -0800
>@@ -36,6 +36,7 @@
> void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int filter);
>
> void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int);
>+void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int);
> void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int);
> void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int);
> void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const
pixel* srcPix, int, int);
>diff -r c53b456ad909 -r 16880e791046 source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Tue Mar 03 18:40:21 2015 -0800
>+++ b/source/common/x86/intrapred8.asm Wed Mar 04 15:43:19 2015 -0800
>@@ -124,6 +124,7 @@
> cextern pw_32
> cextern pw_257
> cextern pw_1024
>+cextern pw_00ff
> cextern pb_unpackbd1
> cextern multiL
> cextern multiH
>@@ -588,6 +589,63 @@
> movd [r0 + r1], m1
> RET
>
>+;---------------------------------------------------------------------------------------
>+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix,
int, int filter)
>+;---------------------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal intra_pred_planar8, 3,3,6
>+ pxor m0, m0
>+ movh m1, [r2 + 1]
>+ punpcklbw m1, m0
>+ movh m2, [r2 + 17]
>+ punpcklbw m2, m0
>+
>+ movd m3, [r2 + 9] ; topRight = above[8];
>+ movd m4, [r2 + 25] ; bottomLeft = left[8];
>+
>+ pand m3, [pw_00ff]
>+ pand m4, [pw_00ff]
How about using mov + and + movd instead? According to the documentation, that takes fewer cycles.