On Thu, Oct 31, 2013 at 8:03 AM, Min Chen <chenm...@163.com> wrote: > # HG changeset patch > # User Min Chen <chenm...@163.com> > # Date 1383224503 -28800 > # Node ID 4a40c4069ad12bc72a1c443b45a91c65d319d35d > # Parent 21dbf988079b0e33265ae48578c26347cc779fbe > asm: chroma_p2s to replace ipfilter_p2s >
The testbench is reporting failures after this change. I'll see if I can patch this myself. > > diff -r 21dbf988079b -r 4a40c4069ad1 > source/Lib/TLibCommon/TComPrediction.cpp > --- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Oct 31 21:01:29 2013 > +0800 > +++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Oct 31 21:01:43 2013 > +0800 > @@ -619,10 +619,13 @@ > uint32_t cxWidth = width >> 1; > uint32_t cxHeight = height >> 1; > > + assert(dstStride == MAX_CU_SIZE / 2); > + assert(((cxWidth | cxHeight) % 2) == 0); > + > if ((yFrac | xFrac) == 0) > { > - primitives.ipfilter_p2s(refCb, refStride, dstCb, dstStride, > cxWidth, cxHeight); > - primitives.ipfilter_p2s(refCr, refStride, dstCr, dstStride, > cxWidth, cxHeight); > + primitives.chroma_p2s(refCb, refStride, dstCb, cxWidth, cxHeight); > + primitives.chroma_p2s(refCr, refStride, dstCr, cxWidth, cxHeight); > } > else if (yFrac == 0) > { > diff -r 21dbf988079b -r 4a40c4069ad1 source/common/ipfilter.cpp > --- a/source/common/ipfilter.cpp Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/common/ipfilter.cpp Thu Oct 31 21:01:43 2013 +0800 > @@ -264,6 +264,7 @@ > } > } > > +template<int dstStride> > void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t > *dst, int width, int height) > { > int shift = IF_INTERNAL_PREC - X265_DEPTH; > @@ -278,7 +279,7 @@ > } > > src += srcStride; > - dst += MAX_CU_SIZE; > + dst += dstStride; > } > } > > @@ -489,7 +490,8 @@ > > p.ipfilter_p2s = filterConvertPelToShort_c; > p.ipfilter_s2p = filterConvertShortToPel_c; > - p.luma_p2s = filterConvertPelToShort_c; > + p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>; > + p.chroma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE/2>; > > p.extendRowBorder = extendCURowColBorder; > } > diff -r 21dbf988079b -r 4a40c4069ad1 source/common/primitives.h > --- a/source/common/primitives.h Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/common/primitives.h Thu Oct 31 21:01:43 2013 +0800 > @@ -254,6 +254,7 @@ > filter_pp_t 
luma_vpp[NUM_LUMA_PARTITIONS]; > filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS]; > filter_p2s_t luma_p2s; > + filter_p2s_t chroma_p2s; > > intra_dc_t intra_pred_dc; > intra_planar_t intra_pred_planar; > diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Thu Oct 31 21:01:29 2013 > +0800 > +++ b/source/common/x86/asm-primitives.cpp Thu Oct 31 21:01:43 2013 > +0800 > @@ -318,6 +318,7 @@ > p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3; > p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3; > p.luma_p2s = x265_luma_p2s_ssse3; > + p.chroma_p2s = x265_chroma_p2s_ssse3; > } > if (cpuMask & X265_CPU_SSE4) > { > diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.asm > --- a/source/common/x86/ipfilter8.asm Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/common/x86/ipfilter8.asm Thu Oct 31 21:01:43 2013 +0800 > @@ -2124,3 +2124,61 @@ > jnz .loopH > > RET > + > + > +; TODO: combin of U and V is more performance, but need more register > +; TODO: use two path for height alignment to 4 and otherwise may > improvement 10% performance, but code is more complex, so I disable it > +INIT_XMM ssse3 > +cglobal chroma_p2s, 3, 7, 6 > + > + ; load width and height > + mov r3d, r3m > + mov r4d, r4m > + > + ; load constant > + mova m4, [tab_c_128] > + mova m5, [tab_c_64_n64] > + > +.loopH: > + > + xor r5d, r5d > +.loopW: > + lea r6, [r0 + r5] > + > + movh m0, [r6] > + punpcklbw m0, m4 > + pmaddubsw m0, m5 > + > + movh m1, [r6 + r1] > + punpcklbw m1, m6 > + pmaddubsw m1, m7 > + > + add r5d, 8 > + cmp r5d, r3d > + lea r6, [r2 + r5 * 2] > + jg .width2 > + movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 > + movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 > + je .nextH > + jmp .loopW > + > +.width4: > + cmp r3d, 4 > + jl .width2 > + movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 > + movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 > + lea r6, [r6 + 8] > + jz .nextH > + > +.width2: > + movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 > 
+ movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 > + > +.nextH: > + lea r0, [r0 + r1 * 2] > + add r2, FENC_STRIDE / 2 * 4 > + > + sub r4d, 2 > + jnz .loopH > + > + RET > diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.h > --- a/source/common/x86/ipfilter8.h Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/common/x86/ipfilter8.h Thu Oct 31 21:01:43 2013 +0800 > @@ -91,6 +91,7 @@ > void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, > pixel * dst, intptr_t dstStride, int idxX, int idxY); > void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel > *dst, intptr_t dstStride, int width, int height, const int coeffIdx); > void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, > int width, int height); > +void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, > int width, int height); > > #undef SETUP_CHROMA_FUNC_DEF > #undef SETUP_LUMA_FUNC_DEF > diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.cpp > --- a/source/test/ipfilterharness.cpp Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/test/ipfilterharness.cpp Thu Oct 31 21:01:43 2013 +0800 > @@ -240,14 +240,15 @@ > return true; > } > > -bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, > filter_p2s_t opt) > +bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, > filter_p2s_t opt, int isChroma) > { > - int16_t rand_srcStride; > + intptr_t rand_srcStride; > + const int min_size = isChroma ? 
2 : 4; > > for (int i = 0; i <= 1000; i++) > { > - int16_t rand_height = (int16_t)rand() % 100; // > Randomly generated Height > - int16_t rand_width = (int16_t)rand() % 100; // > Randomly generated Width > + int rand_height = (int16_t)rand() % 100; // > Randomly generated Height > + int rand_width = (int16_t)rand() % 100; // > Randomly generated Width > > memset(IPF_vec_output_s, 0, ipf_t_size); // Initialize > output buffer to zero > memset(IPF_C_output_s, 0, ipf_t_size); // Initialize > output buffer to zero > @@ -256,13 +257,13 @@ > if (rand_srcStride < rand_width) > rand_srcStride = rand_width; > > - rand_width %= 4; > - if (rand_width < 4) > - rand_width = 4; > + rand_width %= min_size; > + if (rand_width < min_size) > + rand_width = min_size; > > - rand_height %= 4; > - if (rand_height < 4) > - rand_height = 4; > + rand_height %= min_size; > + if (rand_height < min_size) > + rand_height = min_size; > > ref(pixel_buff, > rand_srcStride, > @@ -461,7 +462,16 @@ > > if (opt.luma_p2s) > { > - if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s)) > + if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s, 0)) > + { > + printf("ipfilter_p2s failed\n"); > + return false; > + } > + } > + > + if (opt.chroma_p2s) > + { > + if (!check_IPFilter_primitive(ref.chroma_p2s, opt.chroma_p2s, 1)) > { > printf("ipfilter_p2s failed\n"); > return false; > @@ -586,6 +596,13 @@ > pixel_buff, srcStride, IPF_vec_output_s, width, > height); > } > > + if (opt.chroma_p2s) > + { > + printf("chroma_p2s\t"); > + REPORT_SPEEDUP(opt.chroma_p2s, ref.chroma_p2s, > + pixel_buff, srcStride, IPF_vec_output_s, width, > height); > + } > + > if (opt.ipfilter_s2p) > { > printf("ipfilter_s2p\t"); > diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.h > --- a/source/test/ipfilterharness.h Thu Oct 31 21:01:29 2013 +0800 > +++ b/source/test/ipfilterharness.h Thu Oct 31 21:01:43 2013 +0800 > @@ -45,7 +45,7 @@ > bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt); > bool 
check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt); > bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt); > - bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt); > + bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int > isChroma); > bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt); > bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt); > bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt); > > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel