# HG changeset patch # User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com> # Date 1512036841 -19800 # Thu Nov 30 15:44:01 2017 +0530 # Node ID f86b11b8c629b0e4bf8342d42a0e9c475d7c3a7d # Parent e77ef4964dd04de6a8b84378f7a46219f34bf1b5 [x265-avx512]x86: AVX512 denoise DCT
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 15:44:01 2017 +0530 @@ -2888,6 +2888,7 @@ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); p.nquant = PFX(nquant_avx512); + p.denoiseDct = PFX(denoise_dct_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); @@ -5068,6 +5069,7 @@ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); p.nquant = PFX(nquant_avx512); + p.denoiseDct = PFX(denoise_dct_avx512); } #endif } diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Thu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/dct8.asm Thu Nov 30 15:44:01 2017 +0530 @@ -2357,6 +2357,67 @@ dec r3d jnz .loop RET +%if ARCH_X86_64 == 1 +INIT_ZMM avx512 +cglobal denoise_dct, 4, 4, 22 + pxor m16, m16 + sub r3d, 16 + je .coeff16 + add r3d, 16 + shr r3d, 5 + jmp .loop + +.coeff16: + movu ym19, [r0] + pabsw ym17, ym19 + movu m2, [r1] + pmovsxwd m18, ym17 + paddd m2, m18 + movu [r1], m2 + movu ym3, [r2] + psubusw ym17, ym3 + pcmpgtw ym18, ym17, ym16 + pand ym17, ym18 + psignw ym17, ym19 + movu [r0], ym17 + RET + +.loop: + movu m21, [r0] + pabsw m17, m21 + movu m2, [r1] + pmovsxwd m4, ym17 + paddd m2, m4 + movu [r1], m2 + vextracti64x4 ym4, m17, 1 + + movu m2, [r1 + mmsize] + pmovsxwd m3, ym4 + paddd m2, m3 + movu [r1 + mmsize], m2 + movu m3, [r2] + psubusw m17, m3 + + vextracti64x4 ym20, m17, 1 + pcmpgtw ym18, ym17, ym16 + pcmpgtw ym19, ym20, ym16 + vinserti64x4 m18, m18, ym19, 1 + + pand m17, m18 + vextracti64x4 ym19, m17, 1 + vextracti64x4 ym20, m21, 1 + psignw ym17, ym21 + psignw ym19, ym20 + vinserti64x4 m17, m17, ym19, 1 + + movu [r0], m17 + add r0, mmsize + add r1, mmsize * 2 + add r2, mmsize + dec r3d + jnz .loop + RET +%endif ; ARCH_X86_64 == 1 %if ARCH_X86_64 == 1 %macro DCT8_PASS_1 4 diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Thu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/dct8.h Thu Nov 30 15:44:01 2017 +0530 @@ -42,7 +42,7 @@ void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); - +void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel