# HG changeset patch # User Jayashri Murugan <jayas...@multicorewareinc.com> # Date 1507112703 -19800 # Wed Oct 04 15:55:03 2017 +0530 # Node ID 14c93ddbd598128b43a96ff21221e2dbb189d275 # Parent ddc227597df3335e30cec9a50489f3fd87391274 x86: Aligned routine implementation for blockfill_s primitive
diff -r ddc227597df3 -r 14c93ddbd598 source/common/pixel.cpp --- a/source/common/pixel.cpp Wed Oct 04 14:03:32 2017 +0530 +++ b/source/common/pixel.cpp Wed Oct 04 15:55:03 2017 +0530 @@ -1000,6 +1000,7 @@ p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \ + p.cu[BLOCK_ ## W ## x ## H].blockfill_s_aligned = blockfill_s_c<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \ diff -r ddc227597df3 -r 14c93ddbd598 source/common/primitives.h --- a/source/common/primitives.h Wed Oct 04 14:03:32 2017 +0530 +++ b/source/common/primitives.h Wed Oct 04 15:55:03 2017 +0530 @@ -271,6 +271,7 @@ pixel_sub_ps_t sub_ps; pixel_add_ps_t add_ps; blockfill_s_t blockfill_s; // block fill, for DC transforms + blockfill_s_t blockfill_s_aligned; // block fill, for DC transforms; same signature as blockfill_s, but the AVX-512 32x32 version uses aligned (mova) 64-byte stores copy_cnt_t copy_cnt; // copy coeff while counting non-zero count_nonzero_t count_nonzero; cpy2Dto1D_shl_t cpy2Dto1D_shl; diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Oct 04 14:03:32 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 15:55:03 2017 +0530 @@ -2569,6 +2569,9 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512); + p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2); + p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -4294,6 +4297,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512); p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512); + 
p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2); + p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512); p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Oct 04 14:03:32 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Oct 04 15:55:03 2017 +0530 @@ -2574,6 +2574,24 @@ %endrep RET +;-------------------------------------------------------------------- +; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val) -- broadcasts val and writes 32 rows with one full-ZMM mova store each; dst and the row pitch in bytes (2 * dstride) are expected to be multiples of 64 +;-------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal blockfill_s_aligned_32x32, 3, 4, 1 +add r1, r1 +lea r3, [3 * r1] +movd xm0, r2d +vpbroadcastw m0, xm0 + +%rep 8 +mova [r0], m0 +mova [r0 + r1], m0 +mova [r0 + 2 * r1], m0 +mova [r0 + r3], m0 +lea r0, [r0 + 4 * r1] +%endrep +RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.h Wed Oct 04 14:03:32 2017 +0530 +++ b/source/common/x86/blockcopy8.h Wed Oct 04 15:55:03 2017 +0530 @@ -51,6 +51,7 @@ FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val); +FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t 
dstStride, const int16_t* src, intptr_t srcStride); diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Wed Oct 04 14:03:32 2017 +0530 +++ b/source/test/pixelharness.cpp Wed Oct 04 15:55:03 2017 +0530 @@ -645,8 +645,33 @@ bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt) { - ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); - ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); + ALIGN_VAR_64(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_64(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + intptr_t stride = 64; + for (int i = 0; i < ITERS; i++) + { + int16_t value = (rand() % SHORT_MAX) + 1; + + checked(opt, opt_dest, stride, value); + ref(ref_dest, stride, value); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return false; + + reportfail(); + } + + return true; +} + +bool PixelHarness::check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt) +{ + ALIGN_VAR_64(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_64(int16_t, opt_dest[64 * 64]); memset(ref_dest, 0xCD, sizeof(ref_dest)); memset(opt_dest, 0xCD, sizeof(opt_dest)); @@ -2388,6 +2413,14 @@ } } + if (opt.cu[i].blockfill_s_aligned) + { + if (!check_blockfill_s_aligned(ref.cu[i].blockfill_s_aligned, opt.cu[i].blockfill_s_aligned)) + { + printf("blockfill_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i); + return false; + } + } if (opt.cu[i].var) { if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var)) @@ -3081,6 +3114,12 @@ REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX); } + if (opt.cu[i].blockfill_s_aligned) + { + HEADER("blkfill_aligned[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cu[i].blockfill_s_aligned, ref.cu[i].blockfill_s_aligned, sbuf1, 64, SHORT_MAX); + } + if (opt.cu[i].transpose) { HEADER("transpose[%dx%d]", 4 << i, 4 << i); diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.h --- a/source/test/pixelharness.h Wed Oct 04 
14:03:32 2017 +0530 +++ b/source/test/pixelharness.h Wed Oct 04 15:55:03 2017 +0530 @@ -85,6 +85,7 @@ bool check_scale2D_pp(scale2D_t ref, scale2D_t opt); bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt); bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt); + bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt); bool check_calresidual(calcresidual_t ref, calcresidual_t opt); bool check_transpose(transpose_t ref, transpose_t opt); bool check_weightp(weightp_pp_t ref, weightp_pp_t opt); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel