# HG changeset patch # User Vignesh Vijayakumar # Date 1507278656 -19800 # Fri Oct 06 14:00:56 2017 +0530 # Node ID 44433ded38d00c79fa52e69e7c5c5127009f9ede # Parent ba20a08181382a2fb18a0d1aff7637d66fa41ac7 x86: Aligned routine implementation of add_ps primitive
diff -r ba20a0818138 -r 44433ded38d0 source/common/pixel.cpp --- a/source/common/pixel.cpp Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/pixel.cpp Fri Oct 06 14:00:56 2017 +0530 @@ -996,6 +996,7 @@ #define LUMA_CU(W, H) \ p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \ + p.cu[BLOCK_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \ @@ -1169,7 +1170,8 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \ - p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \ + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>; CHROMA_CU_420(2, 2) CHROMA_CU_420(4, 4) @@ -1247,7 +1249,8 @@ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \ - p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \ + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>; CHROMA_CU_422(2, 4) CHROMA_CU_422(4, 8) diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.cpp --- a/source/common/primitives.cpp Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/primitives.cpp Fri Oct 06 14:00:56 2017 +0530 @@ 
-126,6 +126,7 @@ p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp; p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps; p.chroma[X265_CSP_I444].cu[i].add_ps = p.cu[i].add_ps; + p.chroma[X265_CSP_I444].cu[i].add_ps_aligned = p.cu[i].add_ps_aligned; p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps; p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp; p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss; diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.h --- a/source/common/primitives.h Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/primitives.h Fri Oct 06 14:00:56 2017 +0530 @@ -271,6 +271,7 @@ calcresidual_t calcresidual_aligned; pixel_sub_ps_t sub_ps; pixel_add_ps_t add_ps; + pixel_add_ps_t add_ps_aligned; blockfill_s_t blockfill_s; // block fill, for DC transforms blockfill_s_t blockfill_s_aligned; // block fill, for DC transforms copy_cnt_t copy_cnt; // copy coeff while counting non-zero @@ -405,6 +406,7 @@ pixel_sse_t sse_pp; pixel_sub_ps_t sub_ps; pixel_add_ps_t add_ps; + pixel_add_ps_t add_ps_aligned; copy_ps_t copy_ps; copy_sp_t copy_sp; diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Oct 06 14:00:56 2017 +0530 @@ -2202,6 +2202,20 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512); + p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2); + p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2); + p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2); + p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512); + p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512); + p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2); + 
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2); + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2); + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512); + p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse2); + p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse2); + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2); + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512); + // 64 X N p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512); @@ -4306,6 +4320,20 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512); + p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4); + p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4); + p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2); + p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512); + p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512); + p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4); + p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4); + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2); + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512); + p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse4); + p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse4); + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2); + 
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512); + p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixel.h --- a/source/common/x86/pixel.h Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/x86/pixel.h Fri Oct 06 14:00:56 2017 +0530 @@ -45,6 +45,7 @@ FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \ FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \ FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \ + FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \ FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \ FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \ FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \ diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Wed Oct 04 17:02:59 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Fri Oct 06 14:00:56 2017 +0530 @@ -1150,27 +1150,27 @@ ;----------------------------------------------------------------------------- %macro PROCESS_ADD_PS_64x4_AVX512 0 pmovzxbw m0, [r2] - pmovzxbw m1, [r2 + 32] + pmovzxbw m1, [r2 + mmsize/2] movu m2, [r3] - movu m3, [r3 + 64] + movu m3, [r3 + mmsize] paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m4, m0 movu [r0], m0 pmovzxbw m0, [r2 + r4] - pmovzxbw m1, [r2 
+ r4 + 32] + pmovzxbw m1, [r2 + r4 + mmsize/2] movu m2, [r3 + r5] - movu m3, [r3 + r5 + 64] + movu m3, [r3 + r5 + mmsize] paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m4, m0 movu [r0 + r1], m0 pmovzxbw m0, [r2 + 2 * r4] - pmovzxbw m1, [r2 + 2 * r4 + 32] + pmovzxbw m1, [r2 + 2 * r4 + mmsize/2] movu m2, [r3 + 2 * r5] - movu m3, [r3 + 2 * r5 + 64] + movu m3, [r3 + 2 * r5 + mmsize] paddw m0, m2 paddw m1, m3 packuswb m0, m1 @@ -1178,15 +1178,16 @@ movu [r0 + 2 * r1], m0 pmovzxbw m0, [r2 + r7] - pmovzxbw m1, [r2 + r7 + 32] + pmovzxbw m1, [r2 + r7 + mmsize/2] movu m2, [r3 + r8] - movu m3, [r3 + r8 + 64] + movu m3, [r3 + r8 + mmsize] paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m4, m0 movu [r0 + r6], m0 %endmacro + %macro PROCESS_ADD_PS_64x4_HBD_AVX512 0 movu m0, [r2] movu m1, [r2 + mmsize] @@ -1233,6 +1234,92 @@ movu [r0 + r8 + mmsize], m1 %endmacro +%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0 + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + mmsize/2] + mova m2, [r3] + mova m3, [r3 + mmsize] + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m4, m0 + mova [r0], m0 + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + mmsize/2] + mova m2, [r3 + r5] + mova m3, [r3 + r5 + mmsize] + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m4, m0 + mova [r0 + r1], m0 + pmovzxbw m0, [r2 + 2 * r4] + pmovzxbw m1, [r2 + 2 * r4 + mmsize/2] + mova m2, [r3 + 2 * r5] + mova m3, [r3 + 2 * r5 + mmsize] + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m4, m0 + mova [r0 + 2 * r1], m0 + + pmovzxbw m0, [r2 + r7] + pmovzxbw m1, [r2 + r7 + mmsize/2] + mova m2, [r3 + r8] + mova m3, [r3 + r8 + mmsize] + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m4, m0 + mova [r0 + r6], m0 +%endmacro + +%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0 + mova m0, [r2] + mova m1, [r2 + mmsize] + mova m2, [r3] + mova m3, [r3 + mmsize] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0], m0 + mova [r0 + mmsize], m1 + + mova m0, [r2 + r4] + mova m1, [r2 + 
r4 + mmsize] + mova m2, [r3 + r5] + mova m3, [r3 + r5 + mmsize] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0 + r1], m0 + mova [r0 + r1 + mmsize], m1 + + mova m0, [r2 + r4 * 2] + mova m1, [r2 + r4 * 2 + mmsize] + mova m2, [r3 + r5 * 2] + mova m3, [r3 + r5 * 2 + mmsize] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0 + r1 * 2], m0 + mova [r0 + r1 * 2 + mmsize], m1 + + mova m0, [r2 + r6] + mova m1, [r2 + r6 + mmsize] + mova m2, [r3 + r7] + mova m3, [r3 + r7 + mmsize] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0 + r8], m0 + mova [r0 + r8 + mmsize], m1 +%endmacro + ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- @@ -1256,6 +1343,25 @@ %endrep PROCESS_ADD_PS_64x4_HBD_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_64x64, 6, 9, 6 + vbroadcasti32x8 m5, [pw_pixel_max] + pxor m4, m4 + add r4d, r4d + add r5d, r5d + add r1d, r1d + lea r6, [r4 * 3] + lea r7, [r5 * 3] + lea r8, [r1 * 3] +%rep 15 + PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 + RET %endif %else %if ARCH_X86_64 @@ -1274,8 +1380,25 @@ %endrep PROCESS_ADD_PS_64x4_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_64x64, 6, 9, 4 + add r5, r5 + lea r6, [3 * r1] + lea r7, [3 * r4] + lea r8, [3 * r5] + mova m4, [store_shuf1_avx512] +%rep 15 + PROCESS_ADD_PS_64x4_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_64x4_ALIGNED_AVX512 + RET %endif %endif + %macro PROCESS_ADD_PS_32x4_AVX512 0 pmovzxbw m0, [r2] movu m1, [r3] @@ -1298,6 +1421,7 @@ movu [r0 + r1 * 2], ym0 vextracti32x8 [r0 + r8], m0, 1 %endmacro + %macro 
PROCESS_ADD_PS_32x4_HBD_AVX512 0 movu m0, [r2] movu m1, [r2 + r4] @@ -1322,6 +1446,53 @@ movu [r0 + r8], m1 %endmacro +%macro PROCESS_ADD_PS_32x4_ALIGNED_AVX512 0 + pmovzxbw m0, [r2] + mova m1, [r3] + pmovzxbw m2, [r2 + r4] + mova m3, [r3 + r5] + paddw m0, m1 + paddw m2, m3 + packuswb m0, m2 + vpermq m0, m4, m0 + mova [r0], ym0 + vextracti32x8 [r0 + r1], m0, 1 + pmovzxbw m0, [r2 + r4 * 2] + mova m1, [r3 + r5 * 2] + pmovzxbw m2, [r2 + r6] + mova m3, [r3 + r7] + paddw m0, m1 + paddw m2, m3 + packuswb m0, m2 + vpermq m0, m4, m0 + mova [r0 + r1 * 2], ym0 + vextracti32x8 [r0 + r8], m0, 1 +%endmacro + +%macro PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 0 + mova m0, [r2] + mova m1, [r2 + r4] + mova m2, [r3] + mova m3, [r3 + r5] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0], m0 + mova [r0 + r1], m1 + + mova m0, [r2 + r4 * 2] + mova m1, [r2 + r6] + mova m2, [r3 + r5 * 2] + mova m3, [r3 + r7] + paddw m0, m2 + paddw m1, m3 + + CLIPW2 m0, m1, m4, m5 + mova [r0 + r1 * 2], m0 + mova [r0 + r8], m1 +%endmacro + ;----------------------------------------------------------------------------- ; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- @@ -1345,6 +1516,7 @@ %endrep PROCESS_ADD_PS_32x4_HBD_AVX512 RET + INIT_ZMM avx512 cglobal pixel_add_ps_32x64, 6, 9, 6 vbroadcasti32x8 m5, [pw_pixel_max] @@ -1363,6 +1535,44 @@ %endrep PROCESS_ADD_PS_32x4_HBD_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_32x32, 6, 9, 6 + vbroadcasti32x8 m5, [pw_pixel_max] + pxor m4, m4 + add r4d, r4d + add r5d, r5d + add r1d, r1d + lea r6, [r4 * 3] + lea r7, [r5 * 3] + lea r8, [r1 * 3] +%rep 7 + PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 + RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_32x64, 6, 9, 6 + 
vbroadcasti32x8 m5, [pw_pixel_max] + pxor m4, m4 + add r4d, r4d + add r5d, r5d + add r1d, r1d + lea r6, [r4 * 3] + lea r7, [r5 * 3] + lea r8, [r1 * 3] +%rep 15 + PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 + RET %endif %else %if ARCH_X86_64 @@ -1398,6 +1608,39 @@ %endrep PROCESS_ADD_PS_32x4_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_32x32, 6, 9, 5 + add r5, r5 + lea r6, [r4 * 3] + lea r7, [r5 * 3] + lea r8, [r1 * 3] + mova m4, [store_shuf1_avx512] +%rep 7 + PROCESS_ADD_PS_32x4_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_32x4_ALIGNED_AVX512 + RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_aligned_32x64, 6, 9, 5 + add r5, r5 + lea r6, [r4 * 3] + lea r7, [r5 * 3] + lea r8, [r1 * 3] + mova m4, [store_shuf1_avx512] + +%rep 15 + PROCESS_ADD_PS_32x4_ALIGNED_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] +%endrep + PROCESS_ADD_PS_32x4_ALIGNED_AVX512 + RET %endif %endif ;----------------------------------------------------------------------------- diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Wed Oct 04 17:02:59 2017 +0530 +++ b/source/test/pixelharness.cpp Fri Oct 06 14:00:56 2017 +0530 @@ -876,6 +876,31 @@ return true; } +bool PixelHarness::check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt) +{ + ALIGN_VAR_64(pixel, ref_dest[64 * 64]); + ALIGN_VAR_64(pixel, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride2 = 64, stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride); + ref(ref_dest, stride2, 
pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride); + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) + return false; + + reportfail(); + j += 2 * INCR; + } + return true; +} + bool PixelHarness::check_pixel_var(var_t ref, var_t opt) { int j = 0; @@ -2288,6 +2313,15 @@ } } + if (opt.cu[part].add_ps_aligned) + { + if (!check_pixel_add_ps_aligned(ref.cu[part].add_ps_aligned, opt.cu[part].add_ps_aligned)) + { + printf("add_ps_aligned[%s] failed\n", lumaPartStr[part]); + return false; + } + } + if (opt.cu[part].copy_ss) { if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss)) @@ -2376,6 +2410,14 @@ return false; } } + if (opt.chroma[i].cu[part].add_ps_aligned) + { + if (!check_pixel_add_ps_aligned(ref.chroma[i].cu[part].add_ps_aligned, opt.chroma[i].cu[part].add_ps_aligned)) + { + printf("chroma_add_ps_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } if (opt.chroma[i].cu[part].copy_sp) { if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp)) @@ -3042,6 +3084,11 @@ HEADER("add_ps[%s]", lumaPartStr[part]); REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); } + if (opt.cu[part].add_ps_aligned) + { + HEADER("add_ps_aligned[%s]", lumaPartStr[part]); + REPORT_SPEEDUP(opt.cu[part].add_ps_aligned, ref.cu[part].add_ps_aligned, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); + } if (opt.cu[part].copy_ss) { HEADER("copy_ss[%s]", lumaPartStr[part]); @@ -3113,6 +3160,11 @@ HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); } + if (opt.chroma[i].cu[part].add_ps_aligned) + { + HEADER("[%s] add_ps_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps_aligned, ref.chroma[i].cu[part].add_ps_aligned, pbuf1,
FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); + } if (opt.chroma[i].cu[part].sa8d) { HEADER("[%s] sa8d[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.h --- a/source/test/pixelharness.h Wed Oct 04 17:02:59 2017 +0530 +++ b/source/test/pixelharness.h Fri Oct 06 14:00:56 2017 +0530 @@ -81,6 +81,7 @@ bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt); bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt); bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt); + bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt); bool check_scale1D_pp(scale1D_t ref, scale1D_t opt); bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt); bool check_scale2D_pp(scale2D_t ref, scale2D_t opt); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel