# HG changeset patch
# User Jayashree
# Date 1517285149 28800
#      Mon Jan 29 20:05:49 2018 -0800
# Node ID 3a08a957d4cd2bf0eb57524651a824513378e0a3
# Parent  3c6e5ce07dbca7f967e4b5b62fe450979da3bf81
x86: AVX512 'count_nonzero_32x32' avx-512 kernel

diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 29 20:05:49 2018 -0800
@@ -5376,6 +5376,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.planecopy_sp_shl = PFX(upShift_16_avx512);
         p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
+        p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx512);
 
     }
 #endif
diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/pixel-util.h Mon Jan 29 20:05:49 2018 -0800
@@ -62,5 +62,6 @@
 
 uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
+int PFX(count_nonzero_32x32_avx512(const int16_t* quantCoeff));
 
 #endif // ifndef X265_PIXEL_UTIL_H
diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/pixel-util8.asm Mon Jan 29 20:05:49 2018 -0800
@@ -1932,6 +1932,30 @@
 
     RET
 
+;-----------------------------------------------------------------------------
+; int x265_count_nonzero_32x32_avx512(const int16_t *quantCoeff);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal count_nonzero_32x32, 1,4,2                  ; uses r0 (quantCoeff) plus r1-r3 as scratch, and m0/m1
+    mov             r1, 0xFFFFFFFFFFFFFFFF          ; all 64 bits set ...
+    kmovq           k2, r1                          ; ... so the {k2}-masked compare below covers every byte lane
+    xor             r3, r3                          ; r3 = running count of nonzero coefficients
+    pxor            m0, m0                          ; m0 = all zeros, the value each byte is compared against
+
+%assign x 0
+%rep 16                                             ; fully unrolled: 16 steps * 128 bytes = 2048 bytes = 32*32 int16 values
+    movu            m1, [r0 + x]                    ; load 64 bytes = 32 coefficients
+    vpacksswb       m1, [r0 + x + 64]               ; pack with the next 32 coefficients into 64 bytes; signed saturation keeps a nonzero word nonzero
+%assign x x+128                                     ; step past both 64-byte halves consumed above
+    vpcmpb          k1 {k2}, m1, m0, 00000100b      ; predicate 4 = not-equal: one k1 bit set per nonzero byte
+    kmovq           r1, k1                          ; move the 64-bit compare mask to a general register
+    popcnt          r2, r1                          ; number of nonzero coefficients in this 64-value group
+    add             r3d, r2d                        ; accumulate into the total
+%endrep
+    mov             eax, r3d                        ; return the total in eax
+
+    RET
+
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
 ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel