# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1506325284 -19800
#      Mon Sep 25 13:11:24 2017 +0530
# Node ID b31fc8889e0f8a433be25fb6267552f7d03efeaf
# Parent  ffd4c1528b37332493c5fa4677e780dbef121a01
x86: Aligned routine implementation for low bit depth p2s primitive
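
Background for reviewers: with the HIGH_BIT_DEPTH guards removed, the 8-bit build now performs the same run-time dispatch between convert_p2s and convert_p2s_aligned that the 10/12-bit build already did. A condensed, illustrative sketch of that dispatch condition follows; the helper name and the raw-pointer form of the check are mine for clarity only (predict.cpp in the diff below derives the same 64-byte property from CTU/PU offsets and is the authoritative version):

    // Illustrative sketch only, not part of this patch. The aligned kernels
    // store with 64-byte 'mova', so the caller must prove 64-byte alignment
    // of both addresses and both strides, plus AVX-512 support, before
    // selecting convert_p2s_aligned over convert_p2s.
    #include <cstdint>

    static inline bool canUseAlignedP2S(const void* src, const void* dst,
                                        std::intptr_t srcStride, std::intptr_t dstStride,
                                        bool cpuHasAVX512)
    {
        return cpuHasAVX512 &&
               reinterpret_cast<std::uintptr_t>(src) % 64 == 0 &&
               reinterpret_cast<std::uintptr_t>(dst) % 64 == 0 &&
               srcStride % 64 == 0 && dstStride % 64 == 0;
    }
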
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/predict.cpp
--- a/source/common/predict.cpp	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/predict.cpp	Mon Sep 25 13:11:24 2017 +0530
@@ -284,16 +284,12 @@
     if (!(yFrac | xFrac))
     {
-#if HIGH_BIT_DEPTH
         bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
         bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
         if (srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck && (refPic.m_param->cpuid & X265_CPU_AVX512))
             primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
         else
             primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#else
-        primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#endif
     }
     else if (!yFrac)
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
@@ -386,7 +382,6 @@
         if (!(yFrac | xFrac))
         {
-#if HIGH_BIT_DEPTH
             bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
             bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
             if (refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC && (refPic.m_param->cpuid & X265_CPU_AVX512))
@@ -399,10 +394,6 @@
                 primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
                 primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
             }
-#else
-            primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
-            primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
-#endif
         }
         else if (!yFrac)
         {
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 25 13:11:24 2017 +0530
@@ -4217,6 +4217,106 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512);
 
+        p.pu[LUMA_4x4].convert_p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.pu[LUMA_4x8].convert_p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.pu[LUMA_4x16].convert_p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.pu[LUMA_8x8].convert_p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.pu[LUMA_8x4].convert_p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.pu[LUMA_8x16].convert_p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.pu[LUMA_8x32].convert_p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.pu[LUMA_12x16].convert_p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.pu[LUMA_16x4].convert_p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.pu[LUMA_16x8].convert_p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.pu[LUMA_16x12].convert_p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.pu[LUMA_16x16].convert_p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.pu[LUMA_16x32].convert_p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.pu[LUMA_16x64].convert_p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.pu[LUMA_24x32].convert_p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.pu[LUMA_64x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.pu[LUMA_64x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.pu[LUMA_64x48].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.pu[LUMA_64x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.pu[LUMA_32x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.pu[LUMA_48x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_48x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s_aligned = PFX(filterPixelToShort_4x2_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s_aligned = PFX(filterPixelToShort_8x2_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s_aligned = PFX(filterPixelToShort_8x6_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s_aligned = PFX(filterPixelToShort_2x4_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s_aligned = PFX(filterPixelToShort_6x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s_aligned = PFX(filterPixelToShort_2x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s_aligned = PFX(filterPixelToShort_4x32_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s_aligned = PFX(filterPixelToShort_6x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s_aligned = PFX(filterPixelToShort_8x12_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s_aligned = PFX(filterPixelToShort_8x64_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s_aligned = PFX(filterPixelToShort_12x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s_aligned = PFX(filterPixelToShort_16x24_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s_aligned = PFX(filterPixelToShort_24x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s_aligned = PFX(filterPixelToShort_aligned_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Sep 25 13:11:24 2017 +0530
@@ -1969,6 +1969,10 @@
 P2S_H_32xN_avx2 64
 P2S_H_32xN_avx2 48
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code start
+;-----------------------------------------------------------------------------
+
 %macro PROCESS_P2S_32x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + r1]
@@ -2099,6 +2103,138 @@
     PROCESS_P2S_32x4_AVX512
     RET
+%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    mova        [r2], m0
+    mova        [r2 + r3], m1
+    mova        [r2 + r3 * 2], m2
+    mova        [r2 + r6], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 3
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 5
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 7
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 11
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 15
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code end
+;-----------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2414,6 +2550,9 @@
 P2S_H_64xN_avx2 32
 P2S_H_64xN_avx2 48
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code start
+;-----------------------------------------------------------------------------
 %macro PROCESS_P2S_64x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + mmsize/2]
@@ -2452,6 +2591,43 @@
     movu        [r2 + r6 + mmsize], m3
 %endmacro
 
+%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + mmsize/2]
+    pmovzxbw    m2, [r0 + r1]
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2], m0
+    mova        [r2 + mmsize], m1
+    mova        [r2 + r3], m2
+    mova        [r2 + r3 + mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 * 2]
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw    m2, [r0 + r5]
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2 + r3 * 2], m0
+    mova        [r2 + r3 * 2 + mmsize], m1
+    mova        [r2 + r6], m2
+    mova        [r2 + r6 + mmsize], m3
+%endmacro
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2527,6 +2703,81 @@
     PROCESS_P2S_64x4_AVX512
     RET
 
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 15
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 11
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 7
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+%rep 3
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code end
+;-----------------------------------------------------------------------------
+
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2948,6 +3199,9 @@
     jnz         .loop
     RET
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code start
+;-----------------------------------------------------------------------------
 %macro PROCESS_P2S_48x8_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + r1]
@@ -3021,6 +3275,78 @@
     movu        [r2 + r6 + 64], ym3
 %endmacro
 
+%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2], m0
+    mova        [r2 + r3], m1
+    mova        [r2 + r3 * 2], m2
+    mova        [r2 + r6], m3
+
+    pmovzxbw    ym0, [r0 + 32]
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    mova        [r2 + 64], ym0
+    mova        [r2 + r3 + 64], ym1
+    mova        [r2 + r3 * 2 + 64], ym2
+    mova        [r2 + r6 + 64], ym3
+
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2], m0
+    mova        [r2 + r3], m1
+    mova        [r2 + r3 * 2], m2
+    mova        [r2 + r6], m3
+
+    pmovzxbw    ym0, [r0 + 32]
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    mova        [r2 + 64], ym0
+    mova        [r2 + r3 + 64], ym1
+    mova        [r2 + r3 * 2 + 64], ym2
+    mova        [r2 + r6 + 64], ym3
+%endmacro
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -3058,6 +3384,43 @@
     PROCESS_P2S_48x8_AVX512
     RET
 
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_48x64, 3,7,5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code end
+;-----------------------------------------------------------------------------
+
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
     movd        m1, [r0 + r1]