# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1502103618 -19800
#      Mon Aug 07 16:30:18 2017 +0530
# Node ID ad756cf6d35f0d1460c5a079bea8781ffd67b7c7
# Parent  039ed71e123c3e14bfaabbe3aada944157784b36
[x265-avx512] x86: clean up line-ending issue in sad16-a.asm and asm-primitives.cpp
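
For context: every routine this patch de-duplicates computes the same high-bit-depth SAD primitive, a sum of absolute differences over a block of 16-bit pixels. A minimal scalar sketch (illustrative names, not x265's actual C reference):

    #include <cstdint>
    #include <cstdlib>

    // SAD over a width x height block of 16-bit pixels; strides are in
    // pixels. The AVX-512 routines compute the same value: they double the
    // incoming strides (add r1d, r1d) because each pixel is two bytes, and
    // widen the 16-bit absolute differences into 32-bit accumulators via
    // pmaddwd against a broadcast pw_1 before the final horizontal reduction.
    static int sad16(const uint16_t* pix1, intptr_t stride1,
                     const uint16_t* pix2, intptr_t stride2,
                     int width, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                sum += abs((int)pix1[x] - (int)pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

The sad_x3/sad_x4 variants below evaluate one encode block against three or four reference blocks per call and store the results through the int32_t* res argument, which is why their macros keep three or four dword accumulators (m0..m2 / m0..m3).
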
diff -r 039ed71e123c -r ad756cf6d35f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Aug 04 16:20:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Aug 07 16:30:18 2017 +0530
@@ -2267,17 +2267,6 @@
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
 
-        p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
-        p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
-        p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
-        p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
-        p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
-        p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
-        p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
-        p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
-        p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
-        p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
-
         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
@@ -2313,18 +2302,6 @@
         p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
         p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
 
-        p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
-        p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
-        p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
-        p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
-        p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
-
-        p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
-        p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
-        p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
-        p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
-        p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
-
         p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
         p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
         p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
diff -r 039ed71e123c -r ad756cf6d35f source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Fri Aug 04 16:20:38 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Mon Aug 07 16:30:18 2017 +0530
@@ -1235,85 +1235,6 @@
 %endmacro
 
-%macro PROCESS_SAD_64x8_AVX512 0
-    movu    m1, [r2]
-    movu    m2, [r2 + mmsize]
-    movu    m3, [r2 + r3]
-    movu    m4, [r2 + r3 + mmsize]
-    psubw   m1, [r0]
-    psubw   m2, [r0 + mmsize]
-    psubw   m3, [r0 + r1]
-    psubw   m4, [r0 + r1 + mmsize]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m5, m1, m3
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + 2 * r3 + mmsize]
-    movu    m3, [r2 + r5]
-    movu    m4, [r2 + r5 + mmsize]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + 2 * r1 + mmsize]
-    psubw   m3, [r0 + r4]
-    psubw   m4, [r0 + r4 + mmsize]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m1, m3
-
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    pmaddwd m5, m6
-    paddd   m0, m5
-    pmaddwd m1, m6
-    paddd   m0, m1
-
-    movu    m1, [r2]
-    movu    m2, [r2 + mmsize]
-    movu    m3, [r2 + r3]
-    movu    m4, [r2 + r3 + mmsize]
-    psubw   m1, [r0]
-    psubw   m2, [r0 + mmsize]
-    psubw   m3, [r0 + r1]
-    psubw   m4, [r0 + r1 + mmsize]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m5, m1, m3
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + 2 * r3 + mmsize]
-    movu    m3, [r2 + r5]
-    movu    m4, [r2 + r5 + mmsize]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + 2 * r1 + mmsize]
-    psubw   m3, [r0 + r4]
-    psubw   m4, [r0 + r4 + mmsize]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m1, m3
-
-    pmaddwd m5, m6
-    paddd   m0, m5
-    pmaddwd m1, m6
-    paddd   m0, m1
-%endmacro
-
 
 %macro PROCESS_SAD_32x8_AVX512 0
     movu    m1, [r2]
     movu    m2, [r2 + r3]
@@ -1368,61 +1289,6 @@
     movd    eax, xm0
 %endmacro
 
-%macro PROCESS_SAD_32x8_AVX512 0
-    movu    m1, [r2]
-    movu    m2, [r2 + r3]
-    movu    m3, [r2 + 2 * r3]
-    movu    m4, [r2 + r5]
-    psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
-    psubw   m3, [r0 + 2 * r1]
-    psubw   m4, [r0 + r4]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m5, m1, m3
-
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    movu    m1, [r2]
-    movu    m2, [r2 + r3]
-    movu    m3, [r2 + 2 * r3]
-    movu    m4, [r2 + r5]
-    psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
-    psubw   m3, [r0 + 2 * r1]
-    psubw   m4, [r0 + r4]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    pabsw   m4, m4
-    paddw   m1, m2
-    paddw   m3, m4
-    paddw   m1, m3
-
-    pmaddwd m5, m6
-    paddd   m0, m5
-    pmaddwd m1, m6
-    paddd   m0, m1
-%endmacro
-
-%macro PROCESS_SAD_AVX512_END 0
-    vextracti32x8 ym1, m0, 1
-    paddd   ym0, ym1
-    vextracti64x2 xm1, m0, 1
-    paddd   xm0, xm1
-    pshufd  xm1, xm0, 00001110b
-    paddd   xm0, xm1
-    pshufd  xm1, xm0, 00000001b
-    paddd   xm0, xm1
-    movd    eax, xm0
-%endmacro
-
-
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
@@ -1653,340 +1519,6 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal pixel_sad_64x16, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x32, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x48, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x64, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_64x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal pixel_sad_32x8, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_32x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x16, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x24, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x32, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x64, 4,6,7
-    pxor    m0, m0
-
-    vbroadcasti32x8 m6, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    lea     r2, [r2 + 4 * r3]
-    lea     r0, [r0 + 4 * r1]
-    PROCESS_SAD_32x8_AVX512
-    PROCESS_SAD_AVX512_END
-    RET
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal pixel_sad_48x64, 4, 7, 9
-    pxor    m0, m0
-    mov     r6d, 64/8
-
-    vbroadcasti32x8 m8, [pw_1]
-
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r4d, [r1 * 3]
-    lea     r5d, [r3 * 3]
-.loop:
-    movu    m1, [r2]
-    movu    m2, [r2 + r3]
-    movu    ym3, [r2 + mmsize]
-    vinserti32x8 m3, [r2 + r3 + mmsize], 1
-    movu    m4, [r0]
-    movu    m5, [r0 + r1]
-    movu    ym6, [r0 + mmsize]
-    vinserti32x8 m6, [r0 + r1 + mmsize], 1
-
-    psubw   m1, m4
-    psubw   m2, m5
-    psubw   m3, m6
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    paddw   m1, m2
-    paddw   m7, m3, m1
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r5]
-    movu    ym3, [r2 + 2 * r3 + mmsize]
-    vinserti32x8 m3, [r2 + r5 + mmsize], 1
-    movu    m4, [r0 + 2 * r1]
-    movu    m5, [r0 + r4]
-    movu    ym6, [r0 + 2 * r1 + mmsize]
-    vinserti32x8 m6, [r0 + r4 + mmsize], 1
-    psubw   m1, m4
-    psubw   m2, m5
-    psubw   m3, m6
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    paddw   m1, m2
-    paddw   m1, m3
-
-    pmaddwd m7, m8
-    paddd   m0, m7
-    pmaddwd m1, m8
-    paddd   m0, m1
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    movu    m1, [r2]
-    movu    m2, [r2 + r3]
-    movu    ym3, [r2 + mmsize]
-    vinserti32x8 m3, [r2 + r3 + mmsize], 1
-    movu    m4, [r0]
-    movu    m5, [r0 + r1]
-    movu    ym6, [r0 + mmsize]
-    vinserti32x8 m6, [r0 + r1 + mmsize], 1
-
-    psubw   m1, m4
-    psubw   m2, m5
-    psubw   m3, m6
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    paddw   m1, m2
-    paddw   m7, m3, m1
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r5]
-    movu    ym3, [r2 + 2 * r3 + mmsize]
-    vinserti32x8 m3, [r2 + r5 + mmsize], 1
-    movu    m4, [r0 + 2 * r1]
-    movu    m5, [r0 + r4]
-    movu    ym6, [r0 + 2 * r1 + mmsize]
-    vinserti32x8 m6, [r0 + r4 + mmsize], 1
-    psubw   m1, m4
-    psubw   m2, m5
-    psubw   m3, m6
-    pabsw   m1, m1
-    pabsw   m2, m2
-    pabsw   m3, m3
-    paddw   m1, m2
-    paddw   m1, m3
-
-    pmaddwd m7, m8
-    paddd   m0, m7
-    pmaddwd m1, m8
-    paddd   m0, m1
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    dec     r6d
-    jg      .loop
-
-    PROCESS_SAD_AVX512_END
-    RET
-
-;-----------------------------------------------------------------------------
 ; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_ZMM avx512
@@ -3281,789 +2813,3 @@
     PROCESS_SAD_X4_32x4_AVX512
     PROCESS_SAD_X4_END_AVX512
     RET
-
-
-;============================
-; SAD x3/x4 avx512 code start
-;============================
-
-%macro PROCESS_SAD_X4_32x4_AVX512 0
-    movu    m8, [r0]
-    movu    m4, [r1]
-    movu    m5, [r2]
-    movu    m6, [r3]
-    movu    m7, [r4]
-
-
-    psubw   m4, m8
-    psubw   m5, m8
-    psubw   m6, m8
-    psubw   m7, m8
-    pabsw   m4, m4
-    pabsw   m5, m5
-    pabsw   m6, m6
-    pabsw   m7, m7
-
-    pmaddwd m4, m9
-    paddd   m0, m4
-    pmaddwd m5, m9
-    paddd   m1, m5
-    pmaddwd m6, m9
-    paddd   m2, m6
-    pmaddwd m7, m9
-    paddd   m3, m7
-
-    movu    m8, [r0 + 2 * FENC_STRIDE]
-    movu    m4, [r1 + r5]
-    movu    m5, [r2 + r5]
-    movu    m6, [r3 + r5]
-    movu    m7, [r4 + r5]
-
-
-    psubw   m4, m8
-    psubw   m5, m8
-    psubw   m6, m8
-    psubw   m7, m8
-    pabsw   m4, m4
-    pabsw   m5, m5
-    pabsw   m6, m6
-    pabsw   m7, m7
-
-    pmaddwd m4, m9
-    paddd   m0, m4
-    pmaddwd m5, m9
-    paddd   m1, m5
-    pmaddwd m6, m9
-    paddd   m2, m6
-    pmaddwd m7, m9
-    paddd   m3, m7
-
-    movu    m8, [r0 + 4 * FENC_STRIDE]
-    movu    m4, [r1 + 2 * r5]
-    movu    m5, [r2 + 2 * r5]
-    movu    m6, [r3 + 2 * r5]
-    movu    m7, [r4 + 2 * r5]
-
-
-    psubw   m4, m8
-    psubw   m5, m8
-    psubw   m6, m8
-    psubw   m7, m8
-    pabsw   m4, m4
-    pabsw   m5, m5
-    pabsw   m6, m6
-    pabsw   m7, m7
-
-    pmaddwd m4, m9
-    paddd   m0, m4
-    pmaddwd m5, m9
-    paddd   m1, m5
-    pmaddwd m6, m9
-    paddd   m2, m6
-    pmaddwd m7, m9
-    paddd   m3, m7
-
-    movu    m8, [r0 + 6 * FENC_STRIDE]
-    movu    m4, [r1 + r7]
-    movu    m5, [r2 + r7]
-    movu    m6, [r3 + r7]
-    movu    m7, [r4 + r7]
-
-
-    psubw   m4, m8
-    psubw   m5, m8
-    psubw   m6, m8
-    psubw   m7, m8
-    pabsw   m4, m4
-    pabsw   m5, m5
-    pabsw   m6, m6
-    pabsw   m7, m7
-
-    pmaddwd m4, m9
-    paddd   m0, m4
-    pmaddwd m5, m9
-    paddd   m1, m5
-    pmaddwd m6, m9
-    paddd   m2, m6
-    pmaddwd m7, m9
-    paddd   m3, m7
-%endmacro
-
-
-%macro PROCESS_SAD_X4_END_AVX512 0
-    vextracti32x8 ym4, m0, 1
-    vextracti32x8 ym5, m1, 1
-    vextracti32x8 ym6, m2, 1
-    vextracti32x8 ym7, m3, 1
-
-    paddd   ym0, ym4
-    paddd   ym1, ym5
-    paddd   ym2, ym6
-    paddd   ym3, ym7
-
-    vextracti64x2 xm4, m0, 1
-    vextracti64x2 xm5, m1, 1
-    vextracti64x2 xm6, m2, 1
-    vextracti64x2 xm7, m3, 1
-
-    paddd   xm0, xm4
-    paddd   xm1, xm5
-    paddd   xm2, xm6
-    paddd   xm3, xm7
-
-    pshufd  xm4, xm0, 00001110b
-    pshufd  xm5, xm1, 00001110b
-    pshufd  xm6, xm2, 00001110b
-    pshufd  xm7, xm3, 00001110b
-
-    paddd   xm0, xm4
-    paddd   xm1, xm5
-    paddd   xm2, xm6
-    paddd   xm3, xm7
-
-    pshufd  xm4, xm0, 00000001b
-    pshufd  xm5, xm1, 00000001b
-    pshufd  xm6, xm2, 00000001b
-    pshufd  xm7, xm3, 00000001b
-
-    paddd   xm0, xm4
-    paddd   xm1, xm5
-    paddd   xm2, xm6
-    paddd   xm3, xm7
-
-    mov     r0, r6mp
-    movd    [r0 + 0], xm0
-    movd    [r0 + 4], xm1
-    movd    [r0 + 8], xm2
-    movd    [r0 + 12], xm3
-%endmacro
-
-
-%macro PROCESS_SAD_X3_32x4_AVX512 0
-    movu    m6, [r0]
-    movu    m3, [r1]
-    movu    m4, [r2]
-    movu    m5, [r3]
-
-
-    psubw   m3, m6
-    psubw   m4, m6
-    psubw   m5, m6
-    pabsw   m3, m3
-    pabsw   m4, m4
-    pabsw   m5, m5
-
-    pmaddwd m3, m7
-    paddd   m0, m3
-    pmaddwd m4, m7
-    paddd   m1, m4
-    pmaddwd m5, m7
-    paddd   m2, m5
-
-    movu    m6, [r0 + 2 * FENC_STRIDE]
-    movu    m3, [r1 + r4]
-    movu    m4, [r2 + r4]
-    movu    m5, [r3 + r4]
-
-    psubw   m3, m6
-    psubw   m4, m6
-    psubw   m5, m6
-    pabsw   m3, m3
-    pabsw   m4, m4
-    pabsw   m5, m5
-
-    pmaddwd m3, m7
-    paddd   m0, m3
-    pmaddwd m4, m7
-    paddd   m1, m4
-    pmaddwd m5, m7
-    paddd   m2, m5
-
-    movu    m6, [r0 + 4 * FENC_STRIDE]
-    movu    m3, [r1 + 2 * r4]
-    movu    m4, [r2 + 2 * r4]
-    movu    m5, [r3 + 2 * r4]
-
-    psubw   m3, m6
-    psubw   m4, m6
-    psubw   m5, m6
-    pabsw   m3, m3
-    pabsw   m4, m4
-    pabsw   m5, m5
-
-    pmaddwd m3, m7
-    paddd   m0, m3
-    pmaddwd m4, m7
-    paddd   m1, m4
-    pmaddwd m5, m7
-    paddd   m2, m5
-
-    movu    m6, [r0 + 6 * FENC_STRIDE]
-    movu    m3, [r1 + r6]
-    movu    m4, [r2 + r6]
-    movu    m5, [r3 + r6]
-
-    psubw   m3, m6
-    psubw   m4, m6
-    psubw   m5, m6
-    pabsw   m3, m3
-    pabsw   m4, m4
-    pabsw   m5, m5
-
-    pmaddwd m3, m7
-    paddd   m0, m3
-    pmaddwd m4, m7
-    paddd   m1, m4
-    pmaddwd m5, m7
-    paddd   m2, m5
-%endmacro
-
-
-%macro PROCESS_SAD_X3_END_AVX512 0
-    vextracti32x8 ym3, m0, 1
-    vextracti32x8 ym4, m1, 1
-    vextracti32x8 ym5, m2, 1
-
-    paddd   ym0, ym3
-    paddd   ym1, ym4
-    paddd   ym2, ym5
-
-    vextracti64x2 xm3, m0, 1
-    vextracti64x2 xm4, m1, 1
-    vextracti64x2 xm5, m2, 1
-
-    paddd   xm0, xm3
-    paddd   xm1, xm4
-    paddd   xm2, xm5
-
-    pshufd  xm3, xm0, 00001110b
-    pshufd  xm4, xm1, 00001110b
-    pshufd  xm5, xm2, 00001110b
-
-    paddd   xm0, xm3
-    paddd   xm1, xm4
-    paddd   xm2, xm5
-
-    pshufd  xm3, xm0, 00000001b
-    pshufd  xm4, xm1, 00000001b
-    pshufd  xm5, xm2, 00000001b
-
-    paddd   xm0, xm3
-    paddd   xm1, xm4
-    paddd   xm2, xm5
-
-    movd    [r5 + 0], xm0
-    movd    [r5 + 4], xm1
-    movd    [r5 + 8], xm2
-%endmacro
-
-
-;------------------------------------------------------------------------------------------------------------------------------------------
-; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
-;------------------------------------------------------------------------------------------------------------------------------------------
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x8, 6,7,8
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-
-    vbroadcasti32x8 m7, [pw_1]
-
-    add     r4d, r4d
-    lea     r6d, [r4 * 3]
-
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    PROCESS_SAD_X3_END_AVX512
-    RET
-
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x16, 6,7,8
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-
-    vbroadcasti32x8 m7, [pw_1]
-
-    add     r4d, r4d
-    lea     r6d, [r4 * 3]
-
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    PROCESS_SAD_X3_END_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x24, 6,7,8
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-
-    vbroadcasti32x8 m7, [pw_1]
-
-    add     r4d, r4d
-    lea     r6d, [r4 * 3]
-
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    PROCESS_SAD_X3_END_AVX512
-    RET
-
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x32, 6,7,8
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-
-    vbroadcasti32x8 m7, [pw_1]
-
-    add     r4d, r4d
-    lea     r6d, [r4 * 3]
-
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    PROCESS_SAD_X3_END_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x64, 6,7,8
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-
-    vbroadcasti32x8 m7, [pw_1]
-
-    add     r4d, r4d
-    lea     r6d, [r4 * 3]
-
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r4 * 4]
-    lea     r2, [r2 + r4 * 4]
-    lea     r3, [r3 + r4 * 4]
-    PROCESS_SAD_X3_32x4_AVX512
-    PROCESS_SAD_X3_END_AVX512
-    RET
-
-
-;------------------------------------------------------------------------------------------------------------------------------------------------------------
-; void pixel_sad_x4_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
-;------------------------------------------------------------------------------------------------------------------------------------------------------------
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x8, 6,8,10
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-    pxor    m3, m3
-
-    vbroadcasti32x8 m9, [pw_1]
-
-    add     r5d, r5d
-    lea     r7d, [r5 * 3]
-
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    PROCESS_SAD_X4_END_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x16, 6,8,10
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-    pxor    m3, m3
-
-    vbroadcasti32x8 m9, [pw_1]
-
-    add     r5d, r5d
-    lea     r7d, [r5 * 3]
-
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    PROCESS_SAD_X4_END_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x24, 6,8,10
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-    pxor    m3, m3
-
-    vbroadcasti32x8 m9, [pw_1]
-
-    add     r5d, r5d
-    lea     r7d, [r5 * 3]
-
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    PROCESS_SAD_X4_END_AVX512
-    RET
-
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x32, 6,8,10
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-    pxor    m3, m3
-
-    vbroadcasti32x8 m9, [pw_1]
-
-    add     r5d, r5d
-    lea     r7d, [r5 * 3]
-
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    PROCESS_SAD_X4_END_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x64, 6,8,10
-    pxor    m0, m0
-    pxor    m1, m1
-    pxor    m2, m2
-    pxor    m3, m3
-
-    vbroadcasti32x8 m9, [pw_1]
-
-    add     r5d, r5d
-    lea     r7d, [r5 * 3]
-
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    add     r0, FENC_STRIDE * 8
-    lea     r1, [r1 + r5 * 4]
-    lea     r2, [r2 + r5 * 4]
-    lea     r3, [r3 + r5 * 4]
-    lea     r4, [r4 + r5 * 4]
-    PROCESS_SAD_X4_32x4_AVX512
-    PROCESS_SAD_X4_END_AVX512
-    RET
-