# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1506512312 -19800
#      Wed Sep 27 17:08:32 2017 +0530
# Node ID 762682acf5c25bdecbfec2d0f4f32da7dea3a9e2
# Parent  b31fc8889e0f8a433be25fb6267552f7d03efeaf
x86: Aligned routine implementation for addavg primitive
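For reviewers who want the semantics in one place: addAvg averages two blocks of 16-bit weighted-prediction intermediates, removes the internal bias, and clips the result to the pixel range; the new addAvg_aligned pointers compute exactly the same values and differ only in the buffer alignment they may assume. The sketch below is a scalar model in the spirit of the addAvg<W, H> template in pixel.cpp (which this patch reuses as the C fallback for addAvg_aligned); the constants are written out for a 10-bit build and the clip helper is simplified, so treat it as illustrative rather than a copy of the source.

    #include <cstdint>
    #include <algorithm>

    typedef uint16_t pixel;                 // 8-bit builds use uint8_t instead

    // Scalar model of the addAvg primitive: average two 16-bit intermediate
    // (weighted-prediction) blocks, remove the internal offset/precision,
    // and clamp the result to the valid pixel range.
    template<int W, int H>
    void addAvg_c(const int16_t* src0, const int16_t* src1, pixel* dst,
                  intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
    {
        const int IF_INTERNAL_PREC = 14;                 // x265 intermediate precision
        const int IF_INTERNAL_OFFS = 1 << (IF_INTERNAL_PREC - 1);
        const int X265_DEPTH = 10;                       // 10-bit build assumed here
        const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
        const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
        const int maxVal = (1 << X265_DEPTH) - 1;

        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = (pixel)std::min(std::max((src0[x] + src1[x] + offset) >> shiftNum, 0), maxVal);
            src0 += src0Stride;
            src1 += src1Stride;
            dst += dstStride;
        }
    }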
diff -r b31fc8889e0f -r 762682acf5c2 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/pixel.cpp	Wed Sep 27 17:08:32 2017 +0530
@@ -987,6 +987,7 @@
 #define LUMA_PU(W, H) \
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
@@ -1103,6 +1104,7 @@
 #define CHROMA_PU_420(W, H) \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
 
     CHROMA_PU_420(2, 2);
@@ -1180,6 +1182,7 @@
 #define CHROMA_PU_422(W, H) \
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
 
     CHROMA_PU_422(2, 4);
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.cpp	Wed Sep 27 17:08:32 2017 +0530
@@ -115,6 +115,7 @@
     {
         p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
         p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
+        p.chroma[X265_CSP_I444].pu[i].addAvg_aligned = p.pu[i].addAvg_aligned;
         p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
         p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
     }
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.h
--- a/source/common/primitives.h	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.h	Wed Sep 27 17:08:32 2017 +0530
@@ -245,6 +245,7 @@
     pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
     addAvg_t      addAvg;      // bidir motion compensation, uses 16bit values
+    addAvg_t      addAvg_aligned;
     copy_pp_t     copy_pp;
     filter_p2s_t  convert_p2s;
@@ -386,6 +387,7 @@
         filter_pp_t  filter_hpp;
         filter_hps_t filter_hps;
         addAvg_t     addAvg;
+        addAvg_t     addAvg_aligned;
         copy_pp_t    copy_pp;
         filter_p2s_t p2s;
         filter_p2s_t p2s_aligned;
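The hunks above only add the new function-pointer fields and wire the C fallback; the encoder call sites that decide when the aligned variant is safe to use are not part of this diff. A plausible dispatch wrapper is sketched below — the isAligned64 predicate and doBidirAvg helper are purely hypothetical names for illustration, not part of the patch:

    #include <cstdint>

    // Hypothetical call-site helper: pick the aligned kernel only when both
    // sources, the destination, and every stride (in bytes) keep 64-byte
    // alignment from row to row.
    template <typename T>
    static inline bool isAligned64(const T* p, intptr_t strideInElems)
    {
        return ((uintptr_t)p % 64 == 0) && ((strideInElems * sizeof(T)) % 64 == 0);
    }

    // Matches the addAvg_t signature declared in primitives.h (pixel == uint16_t
    // on a high-bit-depth build).
    typedef void (*addAvg_t)(const int16_t*, const int16_t*, uint16_t*,
                             intptr_t, intptr_t, intptr_t);

    struct PU { addAvg_t addAvg; addAvg_t addAvg_aligned; };

    static void doBidirAvg(const PU& pu, const int16_t* s0, const int16_t* s1,
                           uint16_t* dst, intptr_t str0, intptr_t str1, intptr_t dstStr)
    {
        if (isAligned64(s0, str0) && isAligned64(s1, str1) && isAligned64(dst, dstStr))
            pu.addAvg_aligned(s0, s1, dst, str0, str1, dstStr);  // aligned fast path
        else
            pu.addAvg(s0, s1, dst, str0, str1, dstStr);          // safe unaligned path
    }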
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 27 17:08:32 2017 +0530
@@ -2510,6 +2510,65 @@
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
 
+        p.pu[LUMA_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.pu[LUMA_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.pu[LUMA_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.pu[LUMA_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+        p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512);
+        p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+        p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512);
+        p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+        p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+        p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512);
+        p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_aligned_48x64_avx512);
+        p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2);
+        p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+        p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+        p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+        p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+        p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2);
+        p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512);
+        p.pu[LUMA_64x32].addAvg_aligned = PFX(addAvg_aligned_64x32_avx512);
+        p.pu[LUMA_64x48].addAvg_aligned = PFX(addAvg_aligned_64x48_avx512);
+        p.pu[LUMA_64x64].addAvg_aligned = PFX(addAvg_aligned_64x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg_aligned = PFX(addAvg_8x2_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg_aligned = PFX(addAvg_8x6_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg_aligned = PFX(addAvg_8x12_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg_aligned = PFX(addAvg_8x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg_aligned = PFX(addAvg_24x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg_aligned = PFX(addAvg_12x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg_aligned = PFX(addAvg_aligned_16x24_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -4176,6 +4235,64 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512);
 
+        p.pu[LUMA_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.pu[LUMA_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.pu[LUMA_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.pu[LUMA_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+        p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_16x4_avx2);
+        p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+        p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_16x12_avx2);
+        p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+        p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+        p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_16x64_avx2);
+        p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2);
+        p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+        p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+        p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+        p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+        p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2);
+        p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512);
+        p.pu[LUMA_64x32].addAvg_aligned = PFX(addAvg_aligned_64x32_avx512);
+        p.pu[LUMA_64x48].addAvg_aligned = PFX(addAvg_aligned_64x48_avx512);
+        p.pu[LUMA_64x64].addAvg_aligned = PFX(addAvg_aligned_64x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg_aligned = PFX(addAvg_8x2_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg_aligned = PFX(addAvg_8x6_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg_aligned = PFX(addAvg_16x4_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg_aligned = PFX(addAvg_16x12_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg_aligned = PFX(addAvg_8x12_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg_aligned = PFX(addAvg_8x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg_aligned = PFX(addAvg_12x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg_aligned = PFX(addAvg_16x24_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg_aligned = PFX(addAvg_16x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg_aligned = PFX(addAvg_24x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
         p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
 
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
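The high-bit-depth kernels added to mc-a.asm below all share one five-instruction lane recipe: paddw, pmulhrsw by ADDAVG_FACTOR, paddw of ADDAVG_ROUND, then pmaxsw/pminsw against zero and pw_pixel_max. The following scalar model of one lane is illustrative only: ADDAVG_FACTOR and ADDAVG_ROUND are bit-depth-dependent constants defined elsewhere in mc-a.asm, and the values written here are what they work out to for a 10-bit build (shiftNum = 5), so treat them as assumptions.

    #include <cstdint>
    #include <algorithm>

    // Scalar model of one lane of the HBD AVX-512 kernels. Inputs carry
    // x265's internal -IF_INTERNAL_OFFS bias, which the ADDAVG_ROUND add
    // cancels after the scaled shift.
    static inline uint16_t addAvgLaneHBD(int16_t a, int16_t b)
    {
        const int16_t ADDAVG_FACTOR = 1024;  // assumed: 1 << (15 - shiftNum) for 10-bit
        const int16_t ADDAVG_ROUND  = 512;   // assumed bias undoing the internal offsets
        const int16_t PIXEL_MAX     = 1023;  // (1 << 10) - 1

        int16_t sum = (int16_t)(a + b);                                     // paddw
        // pmulhrsw: signed high multiply with rounding, i.e. round(sum * F / 2^15)
        int16_t scaled = (int16_t)(((int32_t)sum * ADDAVG_FACTOR + (1 << 14)) >> 15);
        int16_t biased = (int16_t)(scaled + ADDAVG_ROUND);                  // paddw
        return (uint16_t)std::min<int16_t>(std::max<int16_t>(biased, 0), PIXEL_MAX); // pmaxsw/pminsw
    }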
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/mc-a.asm	Wed Sep 27 17:08:32 2017 +0530
@@ -2002,6 +2002,352 @@
 %endrep
     PROCESS_ADDAVG_48x4_HBD_AVX512
     RET
+
+%macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0
+    movu            ym0, [r0]
+    vinserti32x8    m0, [r0 + r3], 1
+    movu            ym1, [r1]
+    vinserti32x8    m1, [r1 + r4], 1
+
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+
+    movu            [r2], ym0
+    vextracti32x8   [r2 + r5], m0, 1
+
+    movu            ym0, [r0 + 2 * r3]
+    vinserti32x8    m0, [r0 + r6], 1
+    movu            ym1, [r1 + 2 * r4]
+    vinserti32x8    m1, [r1 + r7], 1
+
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+
+    movu            [r2 + 2 * r5], ym0
+    vextracti32x8   [r2 + r8], m0, 1
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512 0
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2], m0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r5], m0
+
+    movu            m0, [r0 + 2 * r3]
+    movu            m1, [r1 + 2 * r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + 2 * r5], m0
+
+    movu            m0, [r0 + r6]
+    movu            m1, [r1 + r7]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r8], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512 0
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2], m0
+
+    movu            m0, [r0 + mmsize]
+    movu            m1, [r1 + mmsize]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + mmsize], m0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r5], m0
+
+    movu            m0, [r0 + r3 + mmsize]
+    movu            m1, [r1 + r4 + mmsize]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r5 + mmsize], m0
+
+    movu            m0, [r0 + 2 * r3]
+    movu            m1, [r1 + 2 * r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + 2 * r5], m0
+
+    movu            m0, [r0 + 2 * r3 + mmsize]
+    movu            m1, [r1 + 2 * r4 + mmsize]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + 2 * r5 + mmsize], m0
+
+    movu            m0, [r0 + r6]
+    movu            m1, [r1 + r7]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r8], m0
+
+    movu            m0, [r0 + r6 + mmsize]
+    movu            m1, [r1 + r7 + mmsize]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r8 + mmsize], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512 0
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2], m0
+
+    movu            ym0, [r0 + mmsize]
+    movu            ym1, [r1 + mmsize]
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym3
+    paddw           ym0, ym4
+    pmaxsw          ym0, ym2
+    pminsw          ym0, ym5
+    movu            [r2 + mmsize], ym0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r5], m0
+
+    movu            ym0, [r0 + r3 + mmsize]
+    movu            ym1, [r1 + r4 + mmsize]
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym3
+    paddw           ym0, ym4
+    pmaxsw          ym0, ym2
+    pminsw          ym0, ym5
+    movu            [r2 + r5 + mmsize], ym0
+
+    movu            m0, [r0 + 2 * r3]
+    movu            m1, [r1 + 2 * r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + 2 * r5], m0
+
+    movu            ym0, [r0 + 2 * r3 + mmsize]
+    movu            ym1, [r1 + 2 * r4 + mmsize]
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym3
+    paddw           ym0, ym4
+    pmaxsw          ym0, ym2
+    pminsw          ym0, ym5
+    movu            [r2 + 2 * r5 + mmsize], ym0
+
+    movu            m0, [r0 + r6]
+    movu            m1, [r1 + r7]
+    paddw           m0, m1
+    pmulhrsw        m0, m3
+    paddw           m0, m4
+    pmaxsw          m0, m2
+    pminsw          m0, m5
+    movu            [r2 + r8], m0
+
+    movu            ym0, [r0 + r6 + mmsize]
+    movu            ym1, [r1 + r7 + mmsize]
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym3
+    paddw           ym0, ym4
+    pmaxsw          ym0, ym2
+    pminsw          ym0, ym5
+    movu            [r2 + r8 + mmsize], ym0
+%endmacro
+;-----------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal addAvg_aligned_16x4, 6,9,6
+    vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+    pxor            m2, m2
+    add             r3, r3
+    add             r4, r4
+    add             r5, r5
+    lea             r6, [3 * r3]
+    lea             r7, [3 * r4]
+    lea             r8, [3 * r5]
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+    RET
+
+%macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_16x%1, 6,9,6
+    vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+    pxor            m2, m2
+    add             r3, r3
+    add             r4, r4
+    add             r5, r5
+    lea             r6, [3 * r3]
+    lea             r7, [3 * r4]
+    lea             r8, [3 * r5]
+
+%rep %1/4 - 1
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+    lea             r2, [r2 + 4 * r5]
+    lea             r0, [r0 + 4 * r3]
+    lea             r1, [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+    RET
+%endmacro
+
+ADDAVG_ALIGNED_W16_HBD_AVX512 8
+ADDAVG_ALIGNED_W16_HBD_AVX512 12
+ADDAVG_ALIGNED_W16_HBD_AVX512 16
+ADDAVG_ALIGNED_W16_HBD_AVX512 24
+ADDAVG_ALIGNED_W16_HBD_AVX512 32
+ADDAVG_ALIGNED_W16_HBD_AVX512 64
+
+%macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_32x%1, 6,9,6
+    vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+    pxor            m2, m2
+    add             r3, r3
+    add             r4, r4
+    add             r5, r5
+    lea             r6, [3 * r3]
+    lea             r7, [3 * r4]
+    lea             r8, [3 * r5]
+
+%rep %1/4 - 1
+    PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
+    lea             r2, [r2 + 4 * r5]
+    lea             r0, [r0 + 4 * r3]
+    lea             r1, [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
+    RET
+%endmacro
+
+ADDAVG_ALIGNED_W32_HBD_AVX512 8
+ADDAVG_ALIGNED_W32_HBD_AVX512 16
+ADDAVG_ALIGNED_W32_HBD_AVX512 24
+ADDAVG_ALIGNED_W32_HBD_AVX512 32
+ADDAVG_ALIGNED_W32_HBD_AVX512 48
+ADDAVG_ALIGNED_W32_HBD_AVX512 64
+
+%macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_64x%1, 6,9,6
+    vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+    pxor            m2, m2
+    add             r3, r3
+    add             r4, r4
+    add             r5, r5
+    lea             r6, [3 * r3]
+    lea             r7, [3 * r4]
+    lea             r8, [3 * r5]
+
+%rep %1/4 - 1
+    PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
+    lea             r2, [r2 + 4 * r5]
+    lea             r0, [r0 + 4 * r3]
+    lea             r1, [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
+    RET
+%endmacro
+
+ADDAVG_ALIGNED_W64_HBD_AVX512 16
+ADDAVG_ALIGNED_W64_HBD_AVX512 32
+ADDAVG_ALIGNED_W64_HBD_AVX512 48
+ADDAVG_ALIGNED_W64_HBD_AVX512 64
+
+INIT_ZMM avx512
+cglobal addAvg_aligned_48x64, 6,9,6
+    vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+    pxor            m2, m2
+    add             r3, r3
+    add             r4, r4
+    add             r5, r5
+    lea             r6, [3 * r3]
+    lea             r7, [3 * r4]
+    lea             r8, [3 * r5]
+
+%rep 15
+    PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
+    lea             r2, [r2 + 4 * r5]
+    lea             r0, [r0 + 4 * r3]
+    lea             r1, [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
+    RET
 ;-----------------------------------------------------------------------------
 ;addAvg avx512 high bit depth code end
 ;-----------------------------------------------------------------------------
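The 8-bit kernels in the next hunk use a slightly different recipe: pmulhrsw by pw_256 implements (sum + 64) >> 7, the pw_128 add re-centers values that carry x265's internal -IF_INTERNAL_OFFS bias, and packuswb saturates to [0, 255]; the vpermq through shuf_avx512 only undoes packuswb's in-lane interleave and has no scalar counterpart. A per-lane model, again illustrative rather than taken from the source:

    #include <cstdint>
    #include <algorithm>

    // Scalar model of one lane of the 8-bit AVX-512 kernels below.
    static inline uint8_t addAvgLaneLBD(int16_t a, int16_t b)
    {
        int16_t sum    = (int16_t)(a + b);                                  // paddw
        // pmulhrsw by pw_256: round(sum * 256 / 2^15) == (sum + 64) >> 7
        int16_t scaled = (int16_t)(((int32_t)sum * 256 + (1 << 14)) >> 15);
        int16_t biased = (int16_t)(scaled + 128);                           // paddw pw_128
        return (uint8_t)std::min<int16_t>(std::max<int16_t>(biased, 0), 255); // packuswb saturation
    }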
@@ -3424,6 +3770,112 @@
 ADDAVG_W32_AVX512 32
 ADDAVG_W32_AVX512 48
 ADDAVG_W32_AVX512 64
+
+%macro PROCESS_ADDAVG_ALIGNED_64x2_AVX512 0
+    mova            m0, [r0]
+    mova            m1, [r1]
+    mova            m2, [r0 + mmsize]
+    mova            m3, [r1 + mmsize]
+
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m6, m0
+    mova            [r2], m0
+
+    mova            m0, [r0 + r3]
+    mova            m1, [r1 + r4]
+    mova            m2, [r0 + r3 + mmsize]
+    mova            m3, [r1 + r4 + mmsize]
+
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m6, m0
+    mova            [r2 + r5], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_32x2_AVX512 0
+    mova            m0, [r0]
+    mova            m1, [r1]
+    mova            m2, [r0 + r3]
+    mova            m3, [r1 + r4]
+
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m6, m0
+    mova            [r2], ym0
+    vextracti32x8   [r2 + r5], m0, 1
+%endmacro
+;--------------------------------------------------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;--------------------------------------------------------------------------------------------------------------------
+%macro ADDAVG_ALIGNED_W64_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_64x%1, 6,6,7
+    vbroadcasti32x8 m4, [pw_256]
+    vbroadcasti32x8 m5, [pw_128]
+    mova            m6, [shuf_avx512]
+
+    add             r3, r3
+    add             r4, r4
+
+%rep %1/2 - 1
+    PROCESS_ADDAVG_ALIGNED_64x2_AVX512
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_64x2_AVX512
+    RET
+%endmacro
+
+ADDAVG_ALIGNED_W64_AVX512 16
+ADDAVG_ALIGNED_W64_AVX512 32
+ADDAVG_ALIGNED_W64_AVX512 48
+ADDAVG_ALIGNED_W64_AVX512 64
+
+%macro ADDAVG_ALIGNED_W32_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_32x%1, 6,6,7
+    vbroadcasti32x8 m4, [pw_256]
+    vbroadcasti32x8 m5, [pw_128]
+    mova            m6, [shuf_avx512]
+    add             r3, r3
+    add             r4, r4
+
+%rep %1/2 - 1
+    PROCESS_ADDAVG_ALIGNED_32x2_AVX512
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+%endrep
+    PROCESS_ADDAVG_ALIGNED_32x2_AVX512
+    RET
+%endmacro
+
+ADDAVG_ALIGNED_W32_AVX512 8
+ADDAVG_ALIGNED_W32_AVX512 16
+ADDAVG_ALIGNED_W32_AVX512 24
+ADDAVG_ALIGNED_W32_AVX512 32
+ADDAVG_ALIGNED_W32_AVX512 48
+ADDAVG_ALIGNED_W32_AVX512 64
 ;-----------------------------------------------------------------------------
 ; addAvg avx512 code end
 ;-----------------------------------------------------------------------------
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/pixel.h	Wed Sep 27 17:08:32 2017 +0530
@@ -50,6 +50,7 @@
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
     FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
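The test-harness changes below exist because AVX-512 mova faults on any address that is not 64-byte aligned, so every buffer handed to an *_aligned kernel must be over-aligned; that is why check_addAvg_aligned and the PixelHarness members switch to ALIGN_VAR_64. Roughly, and ignoring the MSVC branch of the real macro in x265's common.h, ALIGN_VAR_64 expands as sketched here:

    #include <cstdint>

    // Rough GCC/Clang-only equivalent of x265's ALIGN_VAR_64(type, var) macro;
    // the real definition also carries a __declspec(align(64)) branch for MSVC.
    #define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))

    // Usage mirroring the harness: a 64-byte-aligned destination large
    // enough for the biggest (64x64) partition.
    ALIGN_VAR_64(uint16_t, opt_dest[64 * 64]);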
diff -r b31fc8889e0f -r 762682acf5c2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/test/pixelharness.cpp	Wed Sep 27 17:08:32 2017 +0530
@@ -873,8 +873,8 @@
 bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
 {
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
 
     int j = 0;
 
@@ -898,6 +898,32 @@
     return true;
 }
 
+bool PixelHarness::check_addAvg_aligned(addAvg_t ref, addAvg_t opt)
+{
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+    int j = 0;
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    intptr_t stride = STRIDE;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index1 = rand() % TEST_CASES;
+        int index2 = rand() % TEST_CASES;
+        ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, stride, stride, stride);
+        checked(opt, short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, stride, stride, stride);
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += INCR * 2;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
 {
     ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
@@ -2140,6 +2166,15 @@
         }
     }
 
+    if (opt.pu[part].addAvg_aligned)
+    {
+        if (!check_addAvg_aligned(ref.pu[part].addAvg_aligned, opt.pu[part].addAvg_aligned))
+        {
+            printf("addAvg_aligned[%s] failed\n", lumaPartStr[part]);
+            return false;
+        }
+    }
+
     if (part < NUM_CU_SIZES)
     {
         if (opt.cu[part].sse_pp)
@@ -2224,6 +2259,14 @@
                 return false;
             }
         }
+        if (opt.chroma[i].pu[part].addAvg_aligned)
+        {
+            if (!check_addAvg_aligned(ref.chroma[i].pu[part].addAvg_aligned, opt.chroma[i].pu[part].addAvg_aligned))
+            {
+                printf("chroma_addAvg_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+                return false;
+            }
+        }
         if (opt.chroma[i].pu[part].satd)
         {
             if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd))
@@ -2869,6 +2912,11 @@
         HEADER("addAvg[%s]", lumaPartStr[part]);
         REPORT_SPEEDUP(opt.pu[part].addAvg, ref.pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
     }
+    if (opt.pu[part].addAvg_aligned)
+    {
+        HEADER("addAvg_aligned[%s]", lumaPartStr[part]);
+        REPORT_SPEEDUP(opt.pu[part].addAvg_aligned, ref.pu[part].addAvg_aligned, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+    }
 
     if (part < NUM_CU_SIZES)
     {
@@ -2922,6 +2970,11 @@
             HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
             REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
         }
+        if (opt.chroma[i].pu[part].addAvg_aligned)
+        {
+            HEADER("[%s] addAvg_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+            REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg_aligned, ref.chroma[i].pu[part].addAvg_aligned, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+        }
         if (opt.chroma[i].pu[part].satd)
         {
             HEADER("[%s] satd[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
diff -r b31fc8889e0f -r 762682acf5c2 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Sep 25 13:11:24 2017 +0530
+++ b/source/test/pixelharness.h	Wed Sep 27 17:08:32 2017 +0530
@@ -44,30 +44,30 @@
     enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
     enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
 
-    ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
-    pixel    pbuf2[BUFFSIZE];
-    pixel    pbuf3[BUFFSIZE];
-    pixel    pbuf4[BUFFSIZE];
-    int      ibuf1[BUFFSIZE];
-    int8_t   psbuf1[BUFFSIZE];
-    int8_t   psbuf2[BUFFSIZE];
-    int8_t   psbuf3[BUFFSIZE];
-    int8_t   psbuf4[BUFFSIZE];
-    int8_t   psbuf5[BUFFSIZE];
+    ALIGN_VAR_64(pixel, pbuf1[BUFFSIZE]);
+    ALIGN_VAR_64(pixel, pbuf2[BUFFSIZE]);
+    ALIGN_VAR_64(pixel, pbuf3[BUFFSIZE]);
+    ALIGN_VAR_64(pixel, pbuf4[BUFFSIZE]);
+    ALIGN_VAR_64(int, ibuf1[BUFFSIZE]);
+    ALIGN_VAR_64(int8_t, psbuf1[BUFFSIZE]);
+    ALIGN_VAR_64(int8_t, psbuf2[BUFFSIZE]);
+    ALIGN_VAR_64(int8_t, psbuf3[BUFFSIZE]);
+    ALIGN_VAR_64(int8_t, psbuf4[BUFFSIZE]);
+    ALIGN_VAR_64(int8_t, psbuf5[BUFFSIZE]);
 
-    int16_t  sbuf1[BUFFSIZE];
-    int16_t  sbuf2[BUFFSIZE];
-    int16_t  sbuf3[BUFFSIZE];
+    ALIGN_VAR_64(int16_t, sbuf1[BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, sbuf2[BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, sbuf3[BUFFSIZE]);
 
-    pixel    pixel_test_buff[TEST_CASES][BUFFSIZE];
-    int16_t  short_test_buff[TEST_CASES][BUFFSIZE];
-    int16_t  short_test_buff1[TEST_CASES][BUFFSIZE];
-    int16_t  short_test_buff2[TEST_CASES][BUFFSIZE];
-    int      int_test_buff[TEST_CASES][BUFFSIZE];
-    uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
-    uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
-    double   double_test_buff[TEST_CASES][BUFFSIZE];
-    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
+    ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, short_test_buff1[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, short_test_buff2[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(int, int_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(uint16_t, ushort_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(uint8_t, uchar_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(double, double_test_buff[TEST_CASES][BUFFSIZE]);
+    ALIGN_VAR_64(int16_t, residual_test_buff[TEST_CASES][BUFFSIZE]);
 
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
@@ -99,6 +99,7 @@
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
     bool check_addAvg(addAvg_t, addAvg_t);
+    bool check_addAvg_aligned(addAvg_t, addAvg_t);
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
     bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
     bool check_saoCuOrgE2_t(saoCuOrgE2_t ref[], saoCuOrgE2_t opt[]);