# HG changeset patch # User Vignesh Vijayakumar<vign...@multicorewareinc.com> # Date 1515999160 -19800 # Mon Jan 15 12:22:40 2018 +0530 # Node ID 1107c2def5f9dbee9947a2c9c41f50961fa31bc6 # Parent 3a310b157fdf345023ff4e96e7de316cee79b954 x86: AVX512 planecopy_sp_shl for input 10bit, output 12bit
AVX2 performance : 16.49x AVX512 performance : 20.44x diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jan 15 10:36:54 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jan 15 12:22:40 2018 +0530 @@ -3149,6 +3149,7 @@ p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512); p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512); + p.planecopy_sp_shl = PFX(upShift_16_avx512); } #endif @@ -5362,6 +5363,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512); + p.planecopy_sp_shl = PFX(upShift_16_avx512); } #endif diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Jan 15 10:36:54 2018 +0530 +++ b/source/common/x86/pixel-a.asm Mon Jan 15 12:22:40 2018 +0530
@@ -8763,8 +8763,72 @@
 .end:
     RET
-
-
+;---------------------------------------------------------------------------------------------------------------------
+;void upShift_16(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride,
+;                int width, int height, int shift, uint16_t mask)
+;
+; Writes dst[x] = (src[x] << shift) & pw_pixel_max for every row (16-bit lanes).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4d = width, r5d = height, xm0 = shift.
+; The mask argument is never read; pw_pixel_max is used as the AND mask instead.
+;
+; All rows but the last are processed in whole vectors from x = 0 (32 pixels each when SIZEOF_PIXEL is 2),
+; so up to one vector minus one pixel past 'width' is read and written there (presumably covered by stride
+; padding - confirm against callers). The last row is walked backwards from its right edge in 32-pixel steps.
+;
+; NOTE(review): the last-row path addresses [row + (width - 32) * 2], which lies before the
+; start of the row when width < 32.
+;---------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal upShift_16, 4,7,4
+    mov         r4d, r4m                        ; r4d = width
+    mov         r5d, r5m                        ; r5d = height
+    movd        xm0, r6m                        ; m0 = shift
+    vbroadcasti32x4 m3, [pw_pixel_max]          ; m3 = pw_pixel_max replicated to every 128-bit lane
+    FIX_STRIDES r1d, r3d                        ; presumably scales strides to bytes - macro defined elsewhere
+    dec         r5d                             ; full-vector loop covers height - 1 rows; last row is done below
+    jz          .loop32                         ; height == 1: go straight to the last-row path (counter would wrap)
+    js          .end                            ; height <= 0: nothing to do
+.loopH:
+    xor         r6d, r6d                        ; r6 = x, restart at the left edge of the row
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+
+    add         r6, mmsize / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+    ; processing last row of every frame [To handle width which is not a multiple of 32]
+
+.loop32:
+    ; handle the 32 pixels that end at x = r4, then step r4 back by 32
+    movu        m1, [r0 + (r4 - mmsize/2) * 2]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2 + (r4 - mmsize/2) * 2], m1
+
+    sub         r4d, mmsize/2
+    jz          .end
+    cmp         r4d, mmsize/2
+    jge         .loop32
+
+    ; process partial pixels
+    ; fewer than 32 pixels remain at the row start: redo pixels [0, 32), overlapping the chunk above
+    movu        m1, [r0]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2], m1
+
+.end:
+    RET
;--------------------------------------------------------------------------------------------------------------------- ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) ;--------------------------------------------------------------------------------------------------------------------- diff -r 3a310b157fdf -r 1107c2def5f9 source/common/x86/pixel.h --- a/source/common/x86/pixel.h Mon Jan 15 10:36:54 2018 +0530 +++ b/source/common/x86/pixel.h Mon Jan 15 12:22:40 2018 +0530 @@ -34,6 +34,7 @@ void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); +void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel