# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500980858 -19800
#      Tue Jul 25 16:37:38 2017 +0530
# Node ID 984cad60283b474ed756238cf904b08df290e103
# Parent  09159f73f47b7eda15c8d0294774fe6eafdadea7
x86: AVX512 cleanup addAvg, copy_ps and copy_sp
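The cleanup replaces the counted ".loop / dec / jnz" form of the AVX512 addAvg_64xN, blockcopy_sp_64x64 and blockcopy_ps_32xN/64x64 kernels with fully unrolled PROCESS_* macro expansions (an assemble-time %rep in the addAvg case). Dropping the run-time loop counter lets each cglobal declaration request one fewer GPR (7 -> 6), and the ADDAVG_W48_H2_AVX2 macro moves ahead of the AVX512 block so the AVX2 and AVX512 sections stay grouped.

For reference, the arithmetic the addAvg kernels in mc-a.asm below implement is sketched in C here. This is a hedged sketch rather than a quote of x265's C primitive: the shift/offset constants assume an 8-bit build with IF_INTERNAL_PREC = 14, and the names addAvg_c_sketch / clip_to_pixel are illustrative only.

    #include <stdint.h>

    typedef uint8_t pixel;

    /* Clamp an intermediate value to the 8-bit pixel range. */
    static pixel clip_to_pixel(int v)
    {
        return (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* dst[x] = clip((src0[x] + src1[x] + offset) >> shift), row by row.
     * Constants assume bit depth 8: shift = IF_INTERNAL_PREC + 1 - 8 = 7,
     * offset = rounding term + 2 * IF_INTERNAL_OFFS (IF_INTERNAL_OFFS assumed 1 << 13). */
    static void addAvg_c_sketch(const int16_t* src0, const int16_t* src1, pixel* dst,
                                intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                                int bx, int by)
    {
        const int shift  = 7;
        const int offset = (1 << (shift - 1)) + 2 * (1 << 13);

        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = clip_to_pixel((src0[x] + src1[x] + offset) >> shift);

            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }

In the assembly the same result comes from paddw (src0 + src1), pmulhrsw against pw_256 (effectively a rounded (x + 64) >> 7), paddw pw_128 for the remaining bias, and packuswb for the final clamp to [0, 255].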
diff -r 09159f73f47b -r 984cad60283b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Tue Jul 25 12:58:16 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Tue Jul 25 16:37:38 2017 +0530
@@ -2162,15 +2162,7 @@
 
 BLOCKCOPY_SP_W64_H4_avx2 64, 64
 
-%macro BLOCKCOPY_SP_W64_H4_avx512 2
-INIT_ZMM avx512
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
-    mov     r4d, %2/4
-    add     r3, r3
-    lea     r5, [3 * r3]
-    lea     r6, [3 * r1]
-
-.loop:
+%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0
     movu    m0, [r2]
     movu    m1, [r2 + 64]
     movu    m2, [r2 + r3]
@@ -2187,8 +2179,8 @@
 
     movu    m0, [r2 + 2 * r3]
     movu    m1, [r2 + 2 * r3 + 64]
-    movu    m2, [r2 + r5]
-    movu    m3, [r2 + r5 + 64]
+    movu    m2, [r2 + r4]
+    movu    m3, [r2 + r4 + 64]
 
     packuswb    m0, m1
     packuswb    m2, m3
@@ -2197,17 +2189,69 @@
     vshufi64x2  m0, m0, 11011000b
     vshufi64x2  m2, m2, 11011000b
     movu    [r0 + 2 * r1], m0
-    movu    [r0 + r6], m2
+    movu    [r0 + r5], m2
 
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
-    dec     r4d
-    jnz     .loop
+
+    movu    m0, [r2]
+    movu    m1, [r2 + 64]
+    movu    m2, [r2 + r3]
+    movu    m3, [r2 + r3 + 64]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+    vpermq      m0, m0, 11011000b
+    vpermq      m2, m2, 11011000b
+    vshufi64x2  m0, m0, 11011000b
+    vshufi64x2  m2, m2, 11011000b
+    movu    [r0], m0
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + 2 * r3]
+    movu    m1, [r2 + 2 * r3 + 64]
+    movu    m2, [r2 + r4]
+    movu    m3, [r2 + r4 + 64]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+    vpermq      m0, m0, 11011000b
+    vpermq      m2, m2, 11011000b
+    vshufi64x2  m0, m0, 11011000b
+    vshufi64x2  m2, m2, 11011000b
+    movu    [r0 + 2 * r1], m0
+    movu    [r0 + r5], m2
+%endmacro
+
+INIT_ZMM avx512
+cglobal blockcopy_sp_64x64, 4, 6, 4
+    add     r3, r3
+    lea     r4, [3 * r3]
+    lea     r5, [3 * r1]
+
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_SP_64x8_AVX512
     RET
-%endmacro
-
-BLOCKCOPY_SP_W64_H4_avx512 64, 64
 
 ;-----------------------------------------------------------------------------
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
@@ -3184,35 +3228,78 @@
 BLOCKCOPY_PS_W32_H4_avx2 32, 32
 BLOCKCOPY_PS_W32_H4_avx2 32, 64
 
-%macro BLOCKCOPY_PS_W32_H4_avx512 2
-INIT_ZMM avx512
-cglobal blockcopy_ps_%1x%2, 4, 7, 4
-    add     r1, r1
-    mov     r4d, %2/8
-    lea     r5, [3 * r3]
-    lea     r6, [3 * r1]
-.loop:
-%rep 2
+%macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0
     pmovzxbw    m0, [r2]
     pmovzxbw    m1, [r2 + r3]
    pmovzxbw    m2, [r2 + r3 * 2]
-    pmovzxbw    m3, [r2 + r5]
+    pmovzxbw    m3, [r2 + r4]
 
     movu    [r0], m0
     movu    [r0 + r1], m1
     movu    [r0 + r1 * 2], m2
-    movu    [r0 + r6], m3
+    movu    [r0 + r5], m3
 
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
-%endrep
-    dec     r4d
-    jnz     .loop
+
+    pmovzxbw    m0, [r2]
+    pmovzxbw    m1, [r2 + r3]
+    pmovzxbw    m2, [r2 + r3 * 2]
+    pmovzxbw    m3, [r2 + r4]
+
+    movu    [r0], m0
+    movu    [r0 + r1], m1
+    movu    [r0 + r1 * 2], m2
+    movu    [r0 + r5], m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal blockcopy_ps_32x32, 4, 6, 4
+    add     r1, r1
+    lea     r4, [3 * r3]
+    lea     r5, [3 * r1]
+
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
     RET
-%endmacro
-
-BLOCKCOPY_PS_W32_H4_avx512 32, 32
-BLOCKCOPY_PS_W32_H4_avx512 32, 64
+
+INIT_ZMM avx512
+cglobal blockcopy_ps_32x64, 4, 6, 4
+    add     r1, r1
+    lea     r4, [3 * r3]
+    lea     r5, [3 * r1]
+
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
+    RET
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -3399,17 +3486,7 @@
     jnz     .loop
     RET
 
-;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal blockcopy_ps_64x64, 4, 7, 4
-    add     r1, r1
-    mov     r4d, 64/8
-    lea     r5, [3 * r3]
-    lea     r6, [3 * r1]
-.loop:
-%rep 2
+%macro PROCESS_BLOCKCOPY_PS_64x8_AVX512 0
     pmovzxbw    m0, [r2]
     pmovzxbw    m1, [r2 + 32]
     pmovzxbw    m2, [r2 + r3]
@@ -3421,18 +3498,65 @@
 
     pmovzxbw    m0, [r2 + r3 * 2]
     pmovzxbw    m1, [r2 + r3 * 2 + 32]
-    pmovzxbw    m2, [r2 + r5]
-    pmovzxbw    m3, [r2 + r5 + 32]
+    pmovzxbw    m2, [r2 + r4]
+    pmovzxbw    m3, [r2 + r4 + 32]
     movu    [r0 + r1 * 2], m0
     movu    [r0 + r1 * 2 + 64], m1
-    movu    [r0 + r6], m2
-    movu    [r0 + r6 + 64], m3
+    movu    [r0 + r5], m2
+    movu    [r0 + r5 + 64], m3
 
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
-%endrep
-    dec     r4d
-    jnz     .loop
+
+    pmovzxbw    m0, [r2]
+    pmovzxbw    m1, [r2 + 32]
+    pmovzxbw    m2, [r2 + r3]
+    pmovzxbw    m3, [r2 + r3 + 32]
+    movu    [r0], m0
+    movu    [r0 + 64], m1
+    movu    [r0 + r1], m2
+    movu    [r0 + r1 + 64], m3
+
+    pmovzxbw    m0, [r2 + r3 * 2]
+    pmovzxbw    m1, [r2 + r3 * 2 + 32]
+    pmovzxbw    m2, [r2 + r4]
+    pmovzxbw    m3, [r2 + r4 + 32]
+    movu    [r0 + r1 * 2], m0
+    movu    [r0 + r1 * 2 + 64], m1
+    movu    [r0 + r5], m2
+    movu    [r0 + r5 + 64], m3
+%endmacro
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockcopy_ps_64x64, 4, 6, 4
+    add     r1, r1
+    lea     r4, [3 * r3]
+    lea     r5, [3 * r1]
+
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
     RET
 
 ;-----------------------------------------------------------------------------
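For completeness, what these blockcopy kernels compute is a plain format conversion: copy_sp packs an int16_t block down to pixels (packuswb, pack with unsigned saturation), and copy_ps widens a pixel block to int16_t (pmovzxbw, zero extension); the vpermq/vshufi64x2 steps only restore the lane order that the in-lane pack disturbs. A hedged C-level sketch is below. The blockcopy_ps signature matches the prototype quoted in the comments above; the blockcopy_sp signature is inferred from the cglobal argument names, and the *_sketch names plus the 8-bit pixel typedef are assumptions.

    #include <stdint.h>

    typedef uint8_t pixel;

    /* copy_sp: int16_t -> pixel.  The asm packs with unsigned saturation;
     * inputs are expected to already be valid pixel values. */
    static void blockcopy_sp_sketch(pixel* dst, intptr_t dstStride,
                                    const int16_t* src, intptr_t srcStride,
                                    int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (pixel)src[x];
            dst += dstStride;
            src += srcStride;
        }
    }

    /* copy_ps: pixel -> int16_t, zero-extended. */
    static void blockcopy_ps_sketch(int16_t* dst, intptr_t dstStride,
                                    const pixel* src, intptr_t srcStride,
                                    int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (int16_t)src[x];
            dst += dstStride;
            src += srcStride;
        }
    }

Note that the asm works with byte strides (hence the add r1, r1 / add r3, r3 doubling on the int16_t side), while the sketch above indexes in elements.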
diff -r 09159f73f47b -r 984cad60283b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm  Tue Jul 25 12:58:16 2017 +0530
+++ b/source/common/x86/mc-a.asm  Tue Jul 25 16:37:38 2017 +0530
@@ -2892,17 +2892,85 @@
 ADDAVG_W64_H2_AVX2 48
 ADDAVG_W64_H2_AVX2 64
 
-%macro ADDAVG_W64_H2_AVX512 1
-INIT_ZMM avx512
-cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
-    vbroadcasti32x8 m4, [pw_256]
-    vbroadcasti32x8 m5, [pw_128]
+%macro ADDAVG_W48_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
+    mova    m4, [pw_256]
+    mova    m5, [pw_128]
     add     r3, r3
     add     r4, r4
-    mov     r6d, %1/16
+    mov     r6d, %1/2
 
 .loop:
-%rep 8
+    movu    m0, [r0]
+    movu    m1, [r1]
+    paddw   m0, m1
+    pmulhrsw    m0, m4
+    paddw   m0, m5
+
+    movu    m1, [r0 + 32]
+    movu    m2, [r1 + 32]
+    paddw   m1, m2
+    pmulhrsw    m1, m4
+    paddw   m1, m5
+
+    packuswb    m0, m1
+    vpermq      m0, m0, 11011000b
+    movu    [r2], m0
+
+    movu    m0, [r0 + 64]
+    movu    m1, [r1 + 64]
+    paddw   m0, m1
+    pmulhrsw    m0, m4
+    paddw   m0, m5
+
+    packuswb    m0, m0
+    vpermq      m0, m0, 11011000b
+    vextracti128    [r2 + 32], m0, 0
+
+    movu    m0, [r0 + r3]
+    movu    m1, [r1 + r4]
+    paddw   m0, m1
+    pmulhrsw    m0, m4
+    paddw   m0, m5
+
+    movu    m1, [r0 + r3 + 32]
+    movu    m2, [r1 + r4 + 32]
+    paddw   m1, m2
+    pmulhrsw    m1, m4
+    paddw   m1, m5
+
+    packuswb    m0, m1
+    vpermq      m0, m0, 11011000b
+    movu    [r2 + r5], m0
+
+    movu    m0, [r0 + r3 + 64]
+    movu    m1, [r1 + r4 + 64]
+    paddw   m0, m1
+    pmulhrsw    m0, m4
+    paddw   m0, m5
+
+    packuswb    m0, m0
+    vpermq      m0, m0, 11011000b
+    vextracti128    [r2 + r5 + 32], m0, 0
+
+    lea     r2, [r2 + 2 * r5]
+    lea     r0, [r0 + 2 * r3]
+    lea     r1, [r1 + 2 * r4]
+
+    dec     r6d
+    jnz     .loop
+    RET
+%endmacro
+
+ADDAVG_W48_H2_AVX2 64
+
+;-----------------------------------------------------------------------------
+; addAvg avx2 code end
+;-----------------------------------------------------------------------------
+; addAvg avx512 code start
+;-----------------------------------------------------------------------------
+%macro PROCESS_ADDAVG_64x2_AVX512 0
     movu    m0, [r0]
     movu    m1, [r1]
     movu    m2, [r0 + 64]
@@ -2919,7 +2987,6 @@
 
     vshufi64x2  m0, m0, 11011000b
     movu    [r2], m0
-
     movu    m0, [r0 + r3]
     movu    m1, [r1 + r4]
     movu    m2, [r0 + r3 + 64]
@@ -2935,99 +3002,35 @@
     vpermq      m0, m0, 11011000b
     vshufi64x2  m0, m0, 11011000b
     movu    [r2 + r5], m0
-
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;--------------------------------------------------------------------------------------------------------------------
+%macro ADDAVG_W64_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,6,6
+    vbroadcasti32x8 m4, [pw_256]
+    vbroadcasti32x8 m5, [pw_128]
+    add     r3, r3
+    add     r4, r4
+
+%rep %1/2 - 1
+    PROCESS_ADDAVG_64x2_AVX512
     lea     r2, [r2 + 2 * r5]
     lea     r0, [r0 + 2 * r3]
     lea     r1, [r1 + 2 * r4]
 %endrep
-
-    dec     r6d
-    jnz     .loop
+    PROCESS_ADDAVG_64x2_AVX512
     RET
 %endmacro
 
-ADDAVG_W64_H2_AVX512 16
-ADDAVG_W64_H2_AVX512 32
-ADDAVG_W64_H2_AVX512 48
-ADDAVG_W64_H2_AVX512 64
-
-%macro ADDAVG_W48_H2_AVX2 1
-INIT_YMM avx2
-cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
-    mova    m4, [pw_256]
-    mova    m5, [pw_128]
-    add     r3, r3
-    add     r4, r4
-    mov     r6d, %1/2
-
-.loop:
-    movu    m0, [r0]
-    movu    m1, [r1]
-    paddw   m0, m1
-    pmulhrsw    m0, m4
-    paddw   m0, m5
-
-    movu    m1, [r0 + 32]
-    movu    m2, [r1 + 32]
-    paddw   m1, m2
-    pmulhrsw    m1, m4
-    paddw   m1, m5
-
-    packuswb    m0, m1
-    vpermq      m0, m0, 11011000b
-    movu    [r2], m0
-
-    movu    m0, [r0 + 64]
-    movu    m1, [r1 + 64]
-    paddw   m0, m1
-    pmulhrsw    m0, m4
-    paddw   m0, m5
-
-    packuswb    m0, m0
-    vpermq      m0, m0, 11011000b
-    vextracti128    [r2 + 32], m0, 0
-
-    movu    m0, [r0 + r3]
-    movu    m1, [r1 + r4]
-    paddw   m0, m1
-    pmulhrsw    m0, m4
-    paddw   m0, m5
-
-    movu    m1, [r0 + r3 + 32]
-    movu    m2, [r1 + r4 + 32]
-    paddw   m1, m2
-    pmulhrsw    m1, m4
-    paddw   m1, m5
-
-    packuswb    m0, m1
-    vpermq      m0, m0, 11011000b
-    movu    [r2 + r5], m0
-
-    movu    m0, [r0 + r3 + 64]
-    movu    m1, [r1 + r4 + 64]
-    paddw   m0, m1
-    pmulhrsw    m0, m4
-    paddw   m0, m5
-
-    packuswb    m0, m0
-    vpermq      m0, m0, 11011000b
-    vextracti128    [r2 + r5 + 32], m0, 0
-
-    lea     r2, [r2 + 2 * r5]
-    lea     r0, [r0 + 2 * r3]
-    lea     r1, [r1 + 2 * r4]
-
-    dec     r6d
-    jnz     .loop
-    RET
-%endmacro
-
-ADDAVG_W48_H2_AVX2 64
-
+
+ADDAVG_W64_AVX512 16
+ADDAVG_W64_AVX512 32
+ADDAVG_W64_AVX512 48
+ADDAVG_W64_AVX512 64
 ;-----------------------------------------------------------------------------
-; addAvg avx2 code end
-;-----------------------------------------------------------------------------
-
+; addAvg avx512 code end
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W24_H2 2
 INIT_XMM sse4

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel