Re: [x265] [PATCH 000 of 307 ] AVX-512 implementation in x265

2018-04-06 Thread Praveen Tiwari
Your request is being worked on; we will share the performance-related
details soon. Thanks.

Regards,
Praveen Tiwari

On Fri, Apr 6, 2018 at 9:36 PM, Vittorio Giovara  wrote:

> just curious, what kind of general speed improvement does this give?
> I could have missed them in the series, but it would be nice to have some
> sort of benchmarks
> thanks
> Vittorio
>
> On Sat, Apr 7, 2018 at 4:29 AM,  wrote:
>
>> This series of patches enables AVX-512 in x265. Use the CLI option --asm
>> avx512 to enable the AVX-512 kernels.
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Vittorio
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH 300 of 307] x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2

2018-04-06 Thread chen
Sorry, I missed a line; resending with an additional comment.

At 2018-04-07 01:27:34, "chen"  wrote:


At 2018-04-06 21:17:37, mythr...@multicorewareinc.com wrote:
># HG changeset patch
># User Jayashree
># Date 1517283539 28800
>#  Mon Jan 29 19:38:59 2018 -0800
># Node ID 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81
># Parent  624c83571d1df840e1206c46e589044fbf87ff32
>x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2
>
>count_nonzero[16x16]   18.88x ->  23.04x
>
>+;-
>+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff);
>+;-
>+INIT_ZMM avx512
>+cglobal count_nonzero_16x16, 1,4,2
>+mov r1, 0xFFFFFFFFFFFFFFFF
>+kmovq   k2, r1



https://www.cs.utexas.edu/~hunt/class/2017-spring/cs350c/documents/Intel-x86-Docs/64-ia-32-architectures-instruction-set-extensions-reference-manual.pdf
2.5.1.1 Opmask Register K0
The only exception to the opmask rules described above is that opmask k0 
cannot be used as a predicate operand.
Opmask k0 cannot be encoded as a predicate operand for a vector operation; the 
encoding value that would select
opmask k0 will instead select an implicit opmask value of 0xFFFFFFFFFFFFFFFF, 
thereby effectively disabling
masking. Opmask register k0 can still be used for any instruction that takes 
opmask register(s) as operand(s)
(either source or destination).



>+xor r3, r3
>+pxorm0, m0
>+
>+%assign x 0

>+%rep 4
unroll 4 times only, so unnecessary unroll in here
I suggest loading all of the bytes at the same time; the memory latency can 
then be hidden behind the calculation instructions.


>+movum1, [r0 + x]

>+vpacksswb   m1, [r0 + x + 64]
>+%assign x x+128
>+vpcmpb  k1 {k2}, m1, m0, 0100b
Could you please declare a new macro/const? It is difficult for developers to 
understand that '0100b' (4) means NE (per Intel's documentation).


>+kmovq   r1, k1
>+popcnt  r2, r1
>+add r3d, r2d
>+%endrep
>+mov eax, r3d
>+
>+RET
>+

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 105 of 307] x86: AVX512 interp_4tap_horiz_pp_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504169664 -19800
#  Thu Aug 31 14:24:24 2017 +0530
# Node ID 9928b3e5b4d4235bea9ffb22434446e68c3aacdb
# Parent  052b8b5061d84b791489c01e114a0441f96e4ec2
x86: AVX512 interp_4tap_horiz_pp_48x64 for high bit depth

AVX2 performance   : 9.46x
AVX512 performance : 18.97x

diff -r 052b8b5061d8 -r 9928b3e5b4d4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 31 13:03:39 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 31 14:24:24 2017 +0530
@@ -2389,6 +2389,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_48x64_avx512);
 
 }
 }
diff -r 052b8b5061d8 -r 9928b3e5b4d4 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Thu Aug 31 13:03:39 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Thu Aug 31 14:24:24 2017 +0530
@@ -5175,6 +5175,90 @@
 movu[r2 + r3], m7
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3  shuffle order table
+; m4 - pd_32
+; m5 - zero
+; m6 - pw_pixel_max
+
+movum7,[r0]
+movum8,[r0 + 8]
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+pshufb  m9,m8,m3
+pshufb  m8,m2
+pmaddwd m8,m0
+pmaddwd m9,m1
+paddd   m8,m9
+paddd   m8,m4
+psrad   m8,6
+
+packusdwm7,m8
+CLIPW   m7,m5,m6
+pshufb  m7,m10
+movu[r2],  m7
+
+movum7,[r0 + r1]
+movum8,[r0 + r1 + 8]
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+pshufb  m9,m8,m3
+pshufb  m8,m2
+pmaddwd m8,m0
+pmaddwd m9,m1
+paddd   m8,m9
+paddd   m8,m4
+psrad   m8,6
+
+packusdwm7,m8
+CLIPW   m7,m5,m6
+pshufb  m7,m10
+movu[r2 + r3], m7
+
+movuym7,   [r0 + mmsize]
+vinserti32x8m7,[r0 + r1 + mmsize], 1
+movuym8,   [r0 + mmsize + 8]
+vinserti32x8m8,[r0 + r1 + mmsize + 8],  1
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+pshufb  m9,m8,m3
+pshufb  m8,m2
+pmaddwd m8,m0
+pmaddwd m9,m1
+paddd   m8,m9
+paddd   m8,m4
+psrad   m8,6
+
+packusdwm7,m8
+CLIPW   m7,m5,m6
+pshufb  m7,m10
+movu[r2 + mmsize],  ym7
+vextracti32x8   [r2 + r3 + mmsize], m7,1
+%endmacro
+
 %macro PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512 0
 ; register map
 ; m0 , m1 interpolate coeff
@@ -5394,6 +5478,35 @@
 IPFILTER_CHROMA_AVX512_64xN 32
 IPFILTER_CHROMA_AVX512_64xN 48
 IPFILTER_CHROMA_AVX512_64xN 64
+
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_48x64, 5,6,11
+add r1d, r1d
+add r3d, r3d
+sub r0, 2
+mov r4d, r4m
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+vpbroadcastdm0, [r5 + r4 * 8]
+vpbroadcastdm1, [r5 + r4 * 8 + 4]
+%else
+vpbroadcastdm0, [tab_ChromaCoeff + r4 * 8]
+vpbroadcastdm1, [tab_ChromaCoeff + r4 * 8 + 4]
+%endif
+vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
+vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
+vbroadcasti32x8 m4, [pd_32]
+pxorm5, m5
+vbroadcasti32x8 m6, [pw_pixel_max]
+vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
+
+%rep 31
+PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
+lea r0, [r0 + 2 * r1]
+lea 

[x265] [PATCH 095 of 307] x86: AVX512 copy_pp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503462961 -19800
#  Wed Aug 23 10:06:01 2017 +0530
# Node ID 31a180bcef33fae436ad7e3aa4378b283a86d56a
# Parent  7868f1cb521d554dc77d768ec1f838e0b29824e4
x86: AVX512 copy_pp_32xN

Size  |  AVX2 performance | AVX512 performance
--
32x16 |  1.63x   |  2.58x
32x24 |  2.51x   |  2.87x
32x32 |  2.48x   |  2.95x
32x64 |  2.03x   |  2.53x

This patch also cleans up the code for 64xN

diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 23 10:06:01 2017 +0530
@@ -3965,6 +3965,18 @@
 p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
 p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
 p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
+p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
+p.pu[LUMA_32x32].copy_pp  = PFX(blockcopy_pp_32x32_avx512);
+p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
+
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = 
PFX(blockcopy_pp_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = 
PFX(blockcopy_pp_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = 
PFX(blockcopy_pp_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = 
PFX(blockcopy_pp_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = 
PFX(blockcopy_pp_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = 
PFX(blockcopy_pp_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = 
PFX(blockcopy_pp_32x64_avx512);
 
 p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
 p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Wed Aug 23 10:06:01 2017 +0530
@@ -1107,7 +1107,7 @@
 BLOCKCOPY_PP_W64_H4_avx 64, 64
 
 
;--
-; Macro to calculate blockcopy_pp_64x4_avx512
+; blockcopy_pp avx512 code start
 
;--
 %macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
 movum0, [r2]
@@ -1121,16 +1121,28 @@
 movu[r0 + r5] , m3
 %endmacro
 
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0
+movu   ym0, [r2]
+vinserti32x8   m0,  [r2 + r3], 1
+movu   ym1, [r2 + 2 * r3]
+vinserti32x8   m1,  [r2 + r4], 1
+
+movu   [r0] ,  ym0
+vextracti32x8  [r0 + r1] , m0,1
+movu   [r0 + 2 * r1]  ,ym1
+vextracti32x8  [r0 + r5] , m1,1
+%endmacro
+
 
;--
 ; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, 
intptr_t srcStride)
 
;--
 %macro BLOCKCOPY_PP_W64_H4_avx512 1
 INIT_ZMM avx512
-cglobal blockcopy_pp_64x%1, 4, 4, 6
+cglobal blockcopy_pp_64x%1, 4, 6, 4
 lear4,  [3 * r3]
 lear5,  [3 * r1]
 
-%rep %1/4 - 1 
+%rep %1/4 - 1
 PROCESS_BLOCKCOPY_PP_64X4_avx512
 lea r2, [r2 + 4 * r3]
 lea r0, [r0 + 4 * r1] 
@@ -1145,7 +1157,30 @@
 BLOCKCOPY_PP_W64_H4_avx512 48
 BLOCKCOPY_PP_W64_H4_avx512 64
 
-
+%macro BLOCKCOPY_PP_W32_H4_avx512 1
+INIT_ZMM avx512
+cglobal blockcopy_pp_32x%1, 4, 6, 2
+lear4,  [3 * r3]
+lear5,  [3 * r1]
+
+%rep %1/4 - 1
+PROCESS_BLOCKCOPY_PP_32X4_avx512
+lea r2, [r2 + 4 * r3]
+lea r0, [r0 + 4 * r1] 
+%endrep
+PROCESS_BLOCKCOPY_PP_32X4_avx512
+RET
+%endmacro
+
+BLOCKCOPY_PP_W32_H4_avx512 8
+BLOCKCOPY_PP_W32_H4_avx512 16
+BLOCKCOPY_PP_W32_H4_avx512 24
+BLOCKCOPY_PP_W32_H4_avx512 32
+BLOCKCOPY_PP_W32_H4_avx512 48
+BLOCKCOPY_PP_W32_H4_avx512 64
+;--
+; blockcopy_pp avx512 code end
+;--
 
 ;-
 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, 
intptr_t srcStride)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 101 of 307] x86: revoke some changes in ipfilter16

2018-04-06 Thread mythreyi
# HG changeset patch
# User Aasaipriya Chandran 
# Date 1522962868 25200
#  Thu Apr 05 14:14:28 2018 -0700
# Node ID 1a31df496144c526fd5eba9d960bb286a81ae2d5
# Parent  562c00d2153193eec85ab907b60eeb5aca7cc609
x86: revoke some changes in ipfilter16

diff -r 562c00d21531 -r 1a31df496144 source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asmMon Aug 28 14:59:38 2017 +0530
+++ b/source/common/x86/h-ipfilter16.asmThu Apr 05 14:14:28 2018 -0700
@@ -47,7 +47,7 @@
 
 h_pd_524800:times 8 dd 524800
 
-tab_LumaCoeff:dw   0, 0,  0,  64,  0,   0,  0,  0
+h_tab_LumaCoeff:dw   0, 0,  0,  64,  0,   0,  0,  0
   dw  -1, 4, -10, 58,  17, -5,  1,  0
   dw  -1, 4, -11, 40,  40, -11, 4, -1
   dw   0, 1, -5,  17,  58, -10, 4, -1
@@ -207,10 +207,10 @@
 add r3d,r3d
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 
 %ifidn %3, pp
@@ -625,10 +625,10 @@
 add r3, r3
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 
 %ifidn %3, pp
@@ -712,10 +712,10 @@
 shl r4d, 4
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 
 %ifidn %3, pp
@@ -815,10 +815,10 @@
 shl r4d, 4
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 %ifidn %3, pp
 movam1, [INTERP_OFFSET_PP]
@@ -936,10 +936,10 @@
 shl r4d, 4
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 
 %ifidn %3, pp
@@ -1132,10 +1132,10 @@
 shl r4d, 4
 
 %ifdef PIC
-lea r6, [tab_LumaCoeff]
+lea r6, [h_tab_LumaCoeff]
 movam0, [r6 + r4]
 %else
-movam0, [tab_LumaCoeff + r4]
+movam0, [h_tab_LumaCoeff + r4]
 %endif
 %ifidn %3, pp
 movam1, [pd_32]
@@ -1307,12 +1307,12 @@
 mov  r4d, r4m
 shl  r4d, 4
 %ifdef PIC
-lea  r5, [tab_LumaCoeff]
+lea  r5, [h_tab_LumaCoeff]
 vpbroadcastq m0, [r5 + r4]
 vpbroadcastq m1, [r5 + r4 + 8]
 %else
-vpbroadcastq m0, [tab_LumaCoeff + r4]
-vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
+vpbroadcastq m0, [h_tab_LumaCoeff + r4]
+vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
 %endif
 lea  r6, [pw_pixel_max]
 mova m3, [interp8_hpp_shuf]
@@ -1385,11 +1385,11 @@
 mov  r4d, r4m
 shl  r4d, 4
 %ifdef PIC
-lea  r5, [tab_LumaCoeff]
+lea  r5, [h_tab_LumaCoeff]
 vpbroadcastq m0, [r5 + r4]
 vpbroadcastq m1, [r5 + r4 + 8]
 %else
-vpbroadcastq m0, [tab_LumaCoeff + r4]
+vpbroadcastq m0, [h_tab_LumaCoeff + r4]
 vpbroadcastq m1, [h_ab_LumaCoeff + r4 + 8]
 %endif
 mova m3, [interp8_hpp_shuf]
@@ -1481,12 +1481,12 @@
 mov  r4d, r4m
 shl  r4d, 4
 %ifdef PIC
-lea  r5, [tab_LumaCoeff]
+lea  r5, [h_tab_LumaCoeff]
 vpbroadcastq m0, [r5 + r4]
 vpbroadcastq m1, [r5 + r4 + 8]
 %else
-vpbroadcastq m0, [tab_LumaCoeff + r4]
-vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
+vpbroadcastq m0, [h_tab_LumaCoeff + r4]
+vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
 %endif
 mova m3, [interp8_hpp_shuf]
 mova m7, [pd_32]
@@ -1579,12 +1579,12 @@
 mov  r4d, r4m
 shl  r4d, 4
 %ifdef PIC
-lea  r5, [tab_LumaCoeff]
+lea  r5, [h_tab_LumaCoeff]
 vpbroadcastq m0, [r5 + r4]
 vpbroadcastq m1, [r5 + r4 + 8]
 %else
-vpbroadcastq m0, [tab_LumaCoeff + r4]
-vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
+vpbroadcastq m0, [h_tab_LumaCoeff + r4]
+vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
 %endif
 mova m3, [interp8_hpp_shuf]
 mova m7, [pd_32]
@@ -1684,12 +1684,12 @@
 mov  r4d, r4m
 shl  r4d, 4
 %ifdef PIC
-lea  r5, [tab_LumaCoeff]
+lea  r5, 

[x265] [PATCH 099 of 307] x86: AVX512 interp_4tap_horiz_pp_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503911788 -19800
#  Mon Aug 28 14:46:28 2017 +0530
# Node ID a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd
# Parent  45e4dd746cfd9380dbe2344a5754a6ff6e9feed5
x86: AVX512 interp_4tap_horiz_pp_48x64

AVX2 performance: 17.53x
AVX512 performance  : 33.60x

diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 28 13:46:50 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 28 14:46:28 2017 +0530
@@ -4094,6 +4094,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x12_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x4_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_48x64_avx512);
 
 //i422 chroma_hpp
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Aug 28 13:46:50 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Aug 28 14:46:28 2017 +0530
@@ -9949,6 +9949,103 @@
 vextracti32x4 [r2 + r7], m5,   3
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0
+; register map
+; m0 - interpolate coeff
+; m1, m2 - shuffle order table
+; m3 - constant word 1
+; m4 - constant word 512
+movu  ym5,   [r0]
+vinserti32x8   m5,   [r0 + r1], 1
+movu  ym7,   [r0 + 4]
+vinserti32x8   m7,   [r0 + r1 + 4], 1
+
+pshufb m6,   m5,   m2
+pshufb m5,   m1
+pshufb m8,   m7,   m2
+pshufb m7,   m1
+
+pmaddubsw  m5,   m0
+pmaddubsw  m7,   m0
+pmaddwdm5,   m3
+pmaddwdm7,   m3
+
+pmaddubsw  m6,   m0
+pmaddubsw  m8,   m0
+pmaddwdm6,   m3
+pmaddwdm8,   m3
+
+packssdw   m5,   m7
+packssdw   m6,   m8
+pmulhrsw   m5,   m4
+pmulhrsw   m6,   m4
+packuswb   m5,   m6
+movu [r2],  ym5
+vextracti32x8[r2 + r3],  m5,1
+
+movu  ym5,   [r0 + 2 * r1]
+vinserti32x8   m5,   [r0 + r6], 1
+movu  ym7,   [r0 + 2 * r1 + 4]
+vinserti32x8   m7,   [r0 + r6 + 4], 1
+
+pshufb m6,   m5,   m2
+pshufb m5,   m1
+pshufb m8,   m7,   m2
+pshufb m7,   m1
+
+pmaddubsw  m5,   m0
+pmaddubsw  m7,   m0
+pmaddwdm5,   m3
+pmaddwdm7,   m3
+
+pmaddubsw  m6,   m0
+pmaddubsw  m8,   m0
+pmaddwdm6,   m3
+pmaddwdm8,   m3
+
+packssdw   m5,   m7
+packssdw   m6,   m8
+pmulhrsw   m5,   m4
+pmulhrsw   m6,   m4
+packuswb   m5,   m6
+movu [r2 + 2 * r3], ym5
+vextracti32x8[r2 + r7],  m5,1
+
+movu  xm5,   [r0 + mmsize/2]
+vinserti32x4   m5,   [r0 + r1 + mmsize/2],1
+vinserti32x4   m5,   [r0 + 2 * r1 + mmsize/2],2
+vinserti32x4   m5,   [r0 + r6 + mmsize/2],3
+pshufb m6,   m5,   m2
+pshufb m5,   m1
+
+movu  xm7,   [r0 + 36]
+vinserti32x4   m7,   [r0 + r1 + 36],1
+vinserti32x4   m7,   [r0 + 2 * r1 + 36],2
+vinserti32x4   m7,   [r0 + r6 + 36],3
+pshufb m8,   m7,   m2
+pshufb m7,   m1
+
+pmaddubsw  m5,   m0
+pmaddubsw  m7,   m0
+pmaddwdm5,   m3
+pmaddwdm7,   m3
+
+pmaddubsw  m6,   m0
+pmaddubsw  m8,   m0
+pmaddwdm6,   m3
+pmaddwdm8,   m3
+
+packssdw   m5,   m7
+packssdw   m6,   m8
+pmulhrsw   m5,   m4
+pmulhrsw   m6,   m4
+packuswb   m5,   m6
+movu  [r2 + mmsize/2],  xm5
+vextracti32x4 [r2 + r3 + 

[x265] [PATCH 112 of 307] x86: Aligned routine implementation for addavg primitive

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1506512312 -19800
#  Wed Sep 27 17:08:32 2017 +0530
# Node ID 762682acf5c25bdecbfec2d0f4f32da7dea3a9e2
# Parent  b31fc8889e0f8a433be25fb6267552f7d03efeaf
x86: Aligned routine implementation for addavg primitive

diff -r b31fc8889e0f -r 762682acf5c2 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/pixel.cpp   Wed Sep 27 17:08:32 2017 +0530
@@ -987,6 +987,7 @@
 #define LUMA_PU(W, H) \
 p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c; \
 p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg; \
+p.pu[LUMA_ ## W ## x ## H].addAvg_aligned = addAvg; \
 p.pu[LUMA_ ## W ## x ## H].sad = sad; \
 p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3; \
 p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4; \
@@ -1103,6 +1104,7 @@
 
 #define CHROMA_PU_420(W, H) \
 p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg  = addAvg; \
+p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg_aligned  = 
addAvg; \
 p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = 
blockcopy_pp_c; \
 
 CHROMA_PU_420(2, 2);
@@ -1180,6 +1182,7 @@
 
 #define CHROMA_PU_422(W, H) \
 p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg  = addAvg; \
+p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg_aligned  = 
addAvg; \
 p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = 
blockcopy_pp_c; \
 
 CHROMA_PU_422(2, 4);
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.cpp
--- a/source/common/primitives.cpp  Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.cpp  Wed Sep 27 17:08:32 2017 +0530
@@ -115,6 +115,7 @@
 {
 p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
 p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].addAvg;
+p.chroma[X265_CSP_I444].pu[i].addAvg_aligned = p.pu[i].addAvg_aligned;
 p.chroma[X265_CSP_I444].pu[i].satd= p.pu[i].satd;
 p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
 }
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.h
--- a/source/common/primitives.hMon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.hWed Sep 27 17:08:32 2017 +0530
@@ -245,6 +245,7 @@
 
 pixelavg_pp_t  pixelavg_pp; // quick bidir using pixels (borrowed from 
x264)
 addAvg_t   addAvg;  // bidir motion compensation, uses 16bit 
values
+addAvg_t   addAvg_aligned;
 
 copy_pp_t  copy_pp;
 filter_p2s_t   convert_p2s;
@@ -386,6 +387,7 @@
 filter_pp_t  filter_hpp;
 filter_hps_t filter_hps;
 addAvg_t addAvg;
+addAvg_t addAvg_aligned;
 copy_pp_tcopy_pp;
 filter_p2s_t p2s;
 filter_p2s_t p2s_aligned;
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Sep 27 17:08:32 2017 +0530
@@ -2510,6 +2510,65 @@
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_24x32_avx512);
 
+p.pu[LUMA_8x4].addAvg_aligned   = PFX(addAvg_8x4_avx2);
+p.pu[LUMA_8x8].addAvg_aligned   = PFX(addAvg_8x8_avx2);
+p.pu[LUMA_8x16].addAvg_aligned  = PFX(addAvg_8x16_avx2);
+p.pu[LUMA_8x32].addAvg_aligned  = PFX(addAvg_8x32_avx2);
+p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512);
+p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512);
+p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512);
+p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_aligned_48x64_avx512);
+p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2);
+p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2);
+p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512);
+p.pu[LUMA_64x32].addAvg_aligned = 

[x265] [PATCH 088 of 307] x86: AVX512 interp_8tap_horiz_pp_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502452096 -19800
#  Fri Aug 11 17:18:16 2017 +0530
# Node ID 354f848c3793b459c005667cdf7158eb6394eb0f
# Parent  2fa52ac34d8a8248d183fccfc78393c45a5f0839
x86: AVX512 interp_8tap_horiz_pp_64xN

Size  |  AVX2 performance | AVX512 performance
--
64x16 |  18.05x   |  39.92x
64x32 |  18.10x   |  40.28x
64x48 |  18.16x   |  40.02x
64x64 |  18.03x   |  40.43x

diff -r 2fa52ac34d8a -r 354f848c3793 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 15 11:24:19 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 11 17:18:16 2017 +0530
@@ -4052,6 +4052,10 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = 
PFX(interp_4tap_horiz_ps_32x8_avx512);
+p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
+p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
+p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
+p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
 
 }
 #endif
diff -r 2fa52ac34d8a -r 354f848c3793 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Tue Aug 15 11:24:19 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Fri Aug 11 17:18:16 2017 +0530
@@ -147,8 +147,8 @@
 const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
 
 const interp4_horiz_shuf_load1_avx512,  times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 
3, 4, 5, 3, 4, 5, 6
-
 const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 
12, 10, 11, 12, 13, 11, 12, 13, 14
+const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 
7, 8, 9, 7, 8, 9, 10
 
 const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 
11, 6, 14, 7, 15
 
@@ -10130,3 +10130,90 @@
 
;-
 ;ipfilter_chroma_avx512 code end
 
;-
+;-
+;ipfilter_luma_avx512 code start
+;-
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3, m4  shuffle order table
+; m5 - pw_1
+; m6 - pw_512
+
+movu  m7,[r0]
+movu  m9,[r0 + 8]
+
+pshufbm8,m7,m3
+pshufbm7,m2
+pshufbm10,   m9,m3
+pshufbm11,   m9,m4
+pshufbm9,m2
+
+
+pmaddubsw m7,m0
+pmaddubsw m12,   m8,m1
+pmaddwd   m7,m5
+pmaddwd   m12,   m5
+paddd m7,m12
+
+pmaddubsw m8,m0
+pmaddubsw m12,   m9,m1
+pmaddwd   m8,m5
+pmaddwd   m12,   m5
+paddd m8,m12
+
+pmaddubsw m9,m0
+pmaddubsw m12,   m10,   m1
+pmaddwd   m9,m5
+pmaddwd   m12,   m5
+paddd m9,m12
+
+pmaddubsw m10,   m0
+pmaddubsw m12,  m11,m1
+pmaddwd   m10,  m5
+pmaddwd   m12,  m5
+paddd m10,  m12
+
+packssdw  m7,   m8
+packssdw  m9,   m10
+pmulhrsw  m7,   m6
+pmulhrsw  m9,   m6
+packuswb  m7,   m9
+movu  [r2], m7
+%endmacro
+
+%macro IPFILTER_LUMA_64xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
+sub   r0,3
+mov   r4d,   r4m
+%ifdef PIC
+lea   r5,[tab_LumaCoeff]
+vpbroadcastd  m0,[r5 + r4 * 8]
+vpbroadcastd  m1,[r5 + r4 * 8 + 4]
+%else
+vpbroadcastd  m0,[tab_LumaCoeff + r4 * 8]
+vpbroadcastd  m1,[tab_LumaCoeff + r4 * 8 + 4]
+%endif
+vbroadcasti32x8   m2,[interp4_horiz_shuf_load1_avx512]
+vbroadcasti32x8   m3,[interp4_horiz_shuf_load3_avx512]
+vbroadcasti32x8   m4,[interp4_horiz_shuf_load2_avx512]
+vpbroadcastd  m5,[pw_1]
+vbroadcasti32x8   m6,[pw_512]
+
+%rep %1-1
+PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
+lea   r0,   

[x265] [PATCH 087 of 307] x86: AVX512 cleanup addAvg low bit depth code

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502776459 -19800
#  Tue Aug 15 11:24:19 2017 +0530
# Node ID 2fa52ac34d8a8248d183fccfc78393c45a5f0839
# Parent  2db192bac0f14d55f7f82b8964d6c67c3a3637c3
x86: AVX512 cleanup addAvg low bit depth code

diff -r 2db192bac0f1 -r 2fa52ac34d8a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmTue Aug 15 10:32:52 2017 +0530
+++ b/source/common/x86/mc-a.asmTue Aug 15 11:24:19 2017 +0530
@@ -46,13 +46,10 @@
 %error Unsupport bit depth!
 %endif
 
-SECTION_RODATA 32
-
-ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
-ch_shuf_adj: times 8 db 0
- times 8 db 2
- times 8 db 4
- times 8 db 6
+SECTION_RODATA 64
+
+ALIGN 64
+const shuf_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
 
 SECTION .text
 
@@ -3289,8 +3286,9 @@
 %macro PROCESS_ADDAVG_64x2_AVX512 0
 movum0, [r0]
 movum1, [r1]
-movum2, [r0 + 64]
-movum3, [r1 + 64]
+movum2, [r0 + mmsize]
+movum3, [r1 + mmsize]
+
 paddw   m0, m1
 pmulhrswm0, m4
 paddw   m0, m5
@@ -3299,14 +3297,14 @@
 paddw   m2, m5
 
 packuswbm0, m2
-vpermq  m0, m0, 11011000b
-vshufi64x2  m0, m0, 11011000b
+vpermq  m0, m6, m0
 movu[r2], m0
 
 movum0, [r0 + r3]
 movum1, [r1 + r4]
-movum2, [r0 + r3 + 64]
-movum3, [r1 + r4 + 64]
+movum2, [r0 + r3 + mmsize]
+movum3, [r1 + r4 + mmsize]
+
 paddw   m0, m1
 pmulhrswm0, m4
 paddw   m0, m5
@@ -3315,8 +3313,7 @@
 paddw   m2, m5
 
 packuswbm0, m2
-vpermq  m0, m0, 11011000b
-vshufi64x2  m0, m0, 11011000b
+vpermq  m0, m6, m0
 movu[r2 + r5], m0
 %endmacro
 
@@ -3325,9 +3322,11 @@
 
;
 %macro ADDAVG_W64_AVX512 1
 INIT_ZMM avx512
-cglobal addAvg_64x%1, 6,6,6
+cglobal addAvg_64x%1, 6,6,7
 vbroadcasti32x8 m4, [pw_256]
 vbroadcasti32x8 m5, [pw_128]
+movam6, [shuf_avx512]
+
 add r3, r3
 add r4, r4
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 092 of 307] x86: AVX512 interp_4tap_horiz_pp_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502628508 -19800
#  Sun Aug 13 18:18:28 2017 +0530
# Node ID ed1932a414bf5962bbeccfd5c9e208b7db90f77f
# Parent  dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
x86: AVX512 interp_4tap_horiz_pp_16xN

Color Space i444
Size  |  AVX2 performance | AVX512 performance
--
16x4  |  12.87x   |  20.91x
16x8  |  18.03x   |  27.40x
16x12 |  16.95x   |  24.97x
16x16 |  18.82x   |  27.13x
16x32 |  16.21x   |  25.76x
16x64 |  17.41x   |  26.04x

diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Sep 01 10:33:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Sun Aug 13 18:18:28 2017 +0530
@@ -4021,14 +4021,30 @@
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x12_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x4_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x64_avx512);
 
 //i422 chroma_hpp
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x24_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
 
 //i420 chroma_hpp
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x4_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x12_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Fri Sep 01 10:33:48 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Sun Aug 13 18:18:28 2017 +0530
@@ -9907,6 +9907,48 @@
 vextracti32x8[r2 + r3],  m5,1
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0
+; register map
+; m0 - interpolate coeff
+; m1, m2 - shuffle order table
+; m3 - constant word 1
+; m4 - constant word 512
+
+movu  xm5,   [r0]
+vinserti32x4   m5,   [r0 + r1],1
+vinserti32x4   m5,   [r0 + 2 * r1],2
+vinserti32x4   m5,   [r0 + r6],3
+pshufb m6,   m5,   m2
+pshufb m5,   m1
+
+movu  xm7,   [r0 + 4]
+vinserti32x4   m7,   [r0 + r1 + 4],1
+vinserti32x4   m7,   [r0 + 2 * r1 + 4],2
+vinserti32x4   m7,   [r0 + r6 + 4],3
+pshufb m8,   m7,   m2
+pshufb m7,   m1
+
+pmaddubsw  m5,   m0
+pmaddubsw  m7,   m0
+pmaddwdm5,   m3
+pmaddwdm7,   m3
+
+pmaddubsw  

[x265] [PATCH 090 of 307] x86: AVX512 interp_8tap_horiz_pp_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502617345 -19800
#  Sun Aug 13 15:12:25 2017 +0530
# Node ID d9200885420957bccd4edea62bf87bbe8831bc62
# Parent  4be3c35eb7510f269a548f248e4f5904b4107d74
x86: AVX512 interp_8tap_horiz_pp_16xN

Size  |  AVX2 performance | AVX512 performance
--
16x4  |  19.10x   |  26.27x
16x8  |  19.37x   |  26.59x
16x12 |  19.99x   |  32.66x
16x16 |  19.13x   |  31.47x
16x32 |  18.94x   |  33.38x
16x64 |  18.07x   |  29.97x

diff -r 4be3c35eb751 -r d92008854209 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Sep 01 10:24:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Sun Aug 13 15:12:25 2017 +0530
@@ -4053,6 +4053,12 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = 
PFX(interp_4tap_horiz_ps_32x8_avx512);
 
+p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
+p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
+p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
+p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
+p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
+p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
 p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
 p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
 p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
diff -r 4be3c35eb751 -r d92008854209 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Fri Sep 01 10:24:43 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Sun Aug 13 15:12:25 2017 +0530
@@ -10233,6 +10233,65 @@
 vextracti32x8 [r2 + r3], m7, 1
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3, m4  shuffle order table
+; m5 - pw_1
+; m6 - pw_512
+
+movu xm7,[r0]
+vinserti32x4  m7,[r0 + r1],  1
+vinserti32x4  m7,[r0 + 2 * r1],  2
+vinserti32x4  m7,[r0 + r6],  3
+
+pshufbm8,m7,m3
+pshufbm7,m2
+
+movu xm9,[r0 + 8]
+vinserti32x4  m9,[r0 + r1 + 8],  1
+vinserti32x4  m9,[r0 + 2 * r1 + 8],  2
+vinserti32x4  m9,[r0 + r6 + 8],  3
+
+pshufbm10,   m9,m3
+pshufbm11,   m9,m4
+pshufbm9,m2
+
+pmaddubsw m7,m0
+pmaddubsw m12,   m8,m1
+pmaddwd   m7,m5
+pmaddwd   m12,   m5
+paddd m7,m12
+
+pmaddubsw m8,m0
+pmaddubsw m12,   m9,m1
+pmaddwd   m8,m5
+pmaddwd   m12,   m5
+paddd m8,m12
+
+pmaddubsw m9,m0
+pmaddubsw m12,   m10,   m1
+pmaddwd   m9,m5
+pmaddwd   m12,   m5
+paddd m9,m12
+
+pmaddubsw m10,   m0
+pmaddubsw m12,  m11,m1
+pmaddwd   m10,  m5
+pmaddwd   m12,  m5
+paddd m10,  m12
+
+packssdw  m7,   m8
+packssdw  m9,   m10
+pmulhrsw  m7,   m6
+pmulhrsw  m9,   m6
+packuswb  m7,   m9
+movu  [r2], xm7
+vextracti32x4 [r2 + r3], m7,1
+vextracti32x4 [r2 + 2 * r3], m7,2
+vextracti32x4 [r2 + r7], m7,3
+%endmacro
+
 %macro IPFILTER_LUMA_64xN_AVX512 1
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
@@ -10299,6 +10358,43 @@
 IPFILTER_LUMA_32xN_AVX512 24
 IPFILTER_LUMA_32xN_AVX512 32
 IPFILTER_LUMA_32xN_AVX512 64
+
+%macro IPFILTER_LUMA_16xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_16x%1, 4,8,14
+sub   r0,3
+mov   r4d,   r4m
+lea   r6,[3 * r1]
+lea   r7,[3 * r3]
+%ifdef PIC
+lea   r5,[tab_LumaCoeff]
+vpbroadcastd  m0,[r5 + r4 * 8]
+vpbroadcastd  m1,[r5 + r4 * 8 + 4]
+%else
+vpbroadcastd  m0,[tab_LumaCoeff + r4 * 8]
+vpbroadcastd  m1,[tab_LumaCoeff + r4 * 8 + 4]
+%endif
+vbroadcasti32x8   m2,[interp4_horiz_shuf_load1_avx512]
+vbroadcasti32x8   m3,[interp4_horiz_shuf_load3_avx512]
+vbroadcasti32x8   m4,[interp4_horiz_shuf_load2_avx512]
+

[x265] [PATCH 089 of 307] x86: AVX512 interp_8tap_horiz_pp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504241683 -19800
#  Fri Sep 01 10:24:43 2017 +0530
# Node ID 4be3c35eb7510f269a548f248e4f5904b4107d74
# Parent  354f848c3793b459c005667cdf7158eb6394eb0f
x86: AVX512 interp_8tap_horiz_pp_32xN

Size  |  AVX2 performance | AVX512 performance
--
32x8  |  18.92x   |  37.84x
32x16 |  17.46x   |  36.15x
32x24 |  17.77x   |  35.98x
32x32 |  17.91x   |  36.69x
32x64 |  18.10x   |  35.47x

diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 11 17:18:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Sep 01 10:24:43 2017 +0530
@@ -4052,6 +4052,12 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = 
PFX(interp_4tap_horiz_ps_32x8_avx512);
+
+p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
+p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
+p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
+p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512);
+p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512);
 p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
 p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
 p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Fri Aug 11 17:18:16 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Fri Sep 01 10:24:43 2017 +0530
@@ -10182,6 +10182,57 @@
 movu  [r2], m7
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3, m4  shuffle order table
+; m5 - pw_1
+; m6 - pw_512
+
+movu ym7,[r0]
+vinserti32x8  m7,[r0 + r1], 1
+movu ym9,[r0 + 8]
+vinserti32x8  m9,[r0 + r1 + 8], 1
+
+pshufbm8,m7,m3
+pshufbm7,m2
+pshufbm10,   m9,m3
+pshufbm11,   m9,m4
+pshufbm9,m2
+
+pmaddubsw m7,m0
+pmaddubsw m12,   m8,m1
+pmaddwd   m7,m5
+pmaddwd   m12,   m5
+paddd m7,m12
+
+pmaddubsw m8,m0
+pmaddubsw m12,   m9,m1
+pmaddwd   m8,m5
+pmaddwd   m12,   m5
+paddd m8,m12
+
+pmaddubsw m9,m0
+pmaddubsw m12,   m10,   m1
+pmaddwd   m9,m5
+pmaddwd   m12,   m5
+paddd m9,m12
+
+pmaddubsw m10,   m0
+pmaddubsw m12,  m11,m1
+pmaddwd   m10,  m5
+pmaddwd   m12,  m5
+paddd m10,  m12
+
+packssdw  m7,   m8
+packssdw  m9,   m10
+pmulhrsw  m7,   m6
+pmulhrsw  m9,   m6
+packuswb  m7,   m9
+movu  [r2], ym7
+vextracti32x8 [r2 + r3], m7, 1
+%endmacro
+
 %macro IPFILTER_LUMA_64xN_AVX512 1
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
@@ -10214,6 +10265,40 @@
 IPFILTER_LUMA_64xN_AVX512 32
 IPFILTER_LUMA_64xN_AVX512 48
 IPFILTER_LUMA_64xN_AVX512 64
+
+%macro IPFILTER_LUMA_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_32x%1, 4,6,13
+sub   r0,3
+mov   r4d,   r4m
+%ifdef PIC
+lea   r5,[tab_LumaCoeff]
+vpbroadcastd  m0,[r5 + r4 * 8]
+vpbroadcastd  m1,[r5 + r4 * 8 + 4]
+%else
+vpbroadcastd  m0,[tab_LumaCoeff + r4 * 8]
+vpbroadcastd  m1,[tab_LumaCoeff + r4 * 8 + 4]
+%endif
+vbroadcasti32x8   m2,[interp4_horiz_shuf_load1_avx512]
+vbroadcasti32x8   m3,[interp4_horiz_shuf_load3_avx512]
+vbroadcasti32x8   m4,[interp4_horiz_shuf_load2_avx512]
+vpbroadcastd  m5,[pw_1]
+vbroadcasti32x8   m6,[pw_512]
+
+%rep %1/2 -1
+PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
+lea   r0,[r0 + 2 * r1]
+lea   r2,[r2 + 2 * r3]
+%endrep
+PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
+RET
+%endmacro
+
+IPFILTER_LUMA_32xN_AVX512 8
+IPFILTER_LUMA_32xN_AVX512 16
+IPFILTER_LUMA_32xN_AVX512 24
+IPFILTER_LUMA_32xN_AVX512 32

[x265] [PATCH 096 of 307] x86: AVX512 copy_cnt_32 and copy_cnt_16

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503557407 -19800
#  Thu Aug 24 12:20:07 2017 +0530
# Node ID 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47
# Parent  31a180bcef33fae436ad7e3aa4378b283a86d56a
x86: AVX512 copy_cnt_32 and copy_cnt_16

Size | BitDepth | AVX2 performance | AVX512 performance
---
16x16|8 | 6.92x|   8.07x
16x16|10| 6.72x|   7.75x
32x32|8 | 6.08x|  10.33x
32x32|10| 6.04x|  10.16x

diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 24 12:20:07 2017 +0530
@@ -2342,6 +2342,9 @@
 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 p.weight_pp = PFX(weight_pp_avx512);
 
+p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
 }
 }
 #else // if HIGH_BIT_DEPTH
@@ -4054,6 +4057,9 @@
 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
+p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
 //i444 chroma_hpp
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Thu Aug 24 12:20:07 2017 +0530
@@ -5958,7 +5958,91 @@
 movd eax, xm4
 RET
 
-
+;--
+; copy_cnt avx512 code start
+;--
+%macro PROCESS_COPY_CNT_32x4_AVX512 0
+movum0,  [r1]
+movum1,  [r1 + r2]
+movu[r0],m0
+movu[r0 + mmsize],   m1
+packsswbm0,  m1
+pminub  m0,  m3
+
+movum1,  [r1 + 2 * r2]
+movum2,  [r1 + r3]
+movu[r0 + 2 * mmsize],   m1
+movu[r0 + 3 * mmsize],   m2
+packsswbm1,  m2
+pminub  m1,  m3
+
+paddb   m0,  m1
+paddb   m4,  m0
+%endmacro
+
+%macro PROCESS_COPY_CNT_16x4_AVX512 0
+movu  ym0,   [r1]
+vinserti32x8   m0,   [r1 + r2],1
+movu  ym1,   [r1 + 2 * r2]
+vinserti32x8   m1,   [r1 + r3],1
+movu [r0],   m0
+movu [r0 + mmsize],  m1
+packsswb   m0,   m1
+pminub m0,   m3
+paddb  m4,   m0
+%endmacro
+
+%macro PROCESS_COPY_CNT_END_AVX512 0
+pxor   m0,  m0
+vextracti32x8  ym1, m4, 1
+paddb  ym4, ym1
+vextracti32x4  xm1, ym4, 1
+paddb  xm4, xm1
+psadbw xm4, xm0
+movhlpsxm1, xm4
+paddd  xm4, xm1
+movd   eax, xm4
+%endmacro
+
+;--
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t stride);
+;--
+INIT_ZMM avx512
+cglobal copy_cnt_32, 3, 4, 5
+add  r2d,  r2d
+lea  r3,   [3 * r2]
+
+vbroadcasti32x8  m3,   [pb_1]
+pxor m4,   m4
+
+%rep 7
+PROCESS_COPY_CNT_32x4_AVX512
+add  r0,  4 * mmsize
+lea  r1,  [r1 + 4 * r2]
+%endrep
+PROCESS_COPY_CNT_32x4_AVX512
+PROCESS_COPY_CNT_END_AVX512
+RET
+
+INIT_ZMM avx512
+cglobal copy_cnt_16, 3, 4, 5
+add  r2d,  r2d
+lea  r3,   [3 * r2]
+
+vbroadcasti32x8  m3,   [pb_1]
+pxor m4,   m4
+
+%rep 3
+PROCESS_COPY_CNT_16x4_AVX512
+add  r0,  2 * mmsize
+lea  r1,  [r1 + 4 * r2]
+%endrep
+PROCESS_COPY_CNT_16x4_AVX512
+PROCESS_COPY_CNT_END_AVX512
+RET
+;--
+; copy_cnt avx512 code end
+;--
 
;--
 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 

[x265] [PATCH 106 of 307] x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504171458 -19800
#  Thu Aug 31 14:54:18 2017 +0530
# Node ID 1fb1948309a0a9218a07e060300b9d5a7ff58321
# Parent  9928b3e5b4d4235bea9ffb22434446e68c3aacdb
x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth

Color Space i444
Size  |  AVX2 performance | AVX512 performance
--
8x4   |  5.14x|   9.51x
8x8   |  6.20x|  12.75x
8x16  |  6.32x|  12.44x
8x32  |  6.01x|  13.68x

diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 31 14:24:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 31 14:54:18 2017 +0530
@@ -2354,6 +2354,10 @@
 p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
 p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x4_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x4_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x12_avx512);
@@ -2364,6 +2368,12 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x4_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x12_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x24_avx512);
@@ -2374,6 +2384,10 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x4_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x4_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x12_avx512);
diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Thu Aug 31 14:24:24 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Thu Aug 31 14:54:18 2017 +0530
@@ -5082,6 +5082,49 @@
 
;-
 ;ipfilter_chroma_avx512 code start
 
;-
+%macro PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3  shuffle order table
+; m4 - pd_32
+; m5 - zero
+; m6 - pw_pixel_max
+
+movuxm7,   [r0]
+vinserti32x4m7,[r0 + r1],  1
+vinserti32x4m7,[r0 + 2 * r1],  2
+vinserti32x4m7,[r0 + r6],  3
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+movuxm8,   [r0 + 8]
+

[x265] [PATCH 100 of 307] x86: AVX512 interp_8tap_horiz_pp_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503912578 -19800
#  Mon Aug 28 14:59:38 2017 +0530
# Node ID 562c00d2153193eec85ab907b60eeb5aca7cc609
# Parent  a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd
x86: AVX512 interp_8tap_horiz_pp_48x64

AVX2 performance: 19.57x
AVX512 performance  : 35.25x

diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 28 14:46:28 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 28 14:59:38 2017 +0530
@@ -4159,6 +4159,7 @@
 p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
 p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
 p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
+p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
 
 p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
 p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Aug 28 14:46:28 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Aug 28 14:59:38 2017 +0530
@@ -10489,6 +10489,151 @@
 vextracti32x4 [r2 + r7], m7,3
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3, m4  shuffle order table
+; m5 - pw_1
+; m6 - pw_512
+
+movu ym7,[r0]
+vinserti32x8  m7,[r0 + r1], 1
+movu ym9,[r0 + 8]
+vinserti32x8  m9,[r0 + r1 + 8], 1
+
+pshufbm8,m7,m3
+pshufbm7,m2
+pshufbm10,   m9,m3
+pshufbm11,   m9,m4
+pshufbm9,m2
+
+pmaddubsw m7,m0
+pmaddubsw m12,   m8,m1
+pmaddwd   m7,m5
+pmaddwd   m12,   m5
+paddd m7,m12
+
+pmaddubsw m8,m0
+pmaddubsw m12,   m9,m1
+pmaddwd   m8,m5
+pmaddwd   m12,   m5
+paddd m8,m12
+
+pmaddubsw m9,m0
+pmaddubsw m12,   m10,   m1
+pmaddwd   m9,m5
+pmaddwd   m12,   m5
+paddd m9,m12
+
+pmaddubsw m10,   m0
+pmaddubsw m12,  m11,m1
+pmaddwd   m10,  m5
+pmaddwd   m12,  m5
+paddd m10,  m12
+
+packssdw  m7,   m8
+packssdw  m9,   m10
+pmulhrsw  m7,   m6
+pmulhrsw  m9,   m6
+packuswb  m7,   m9
+movu  [r2], ym7
+vextracti32x8 [r2 + r3], m7, 1
+
+movu ym7,[r0 + 2 * r1]
+vinserti32x8  m7,[r0 + r6],  1
+movu ym9,[r0 + 2 * r1 + 8]
+vinserti32x8  m9,[r0 + r6 + 8],  1
+
+pshufbm8,m7,m3
+pshufbm7,m2
+pshufbm10,   m9,m3
+pshufbm11,   m9,m4
+pshufbm9,m2
+
+pmaddubsw m7,m0
+pmaddubsw m12,   m8,m1
+pmaddwd   m7,m5
+pmaddwd   m12,   m5
+paddd m7,m12
+
+pmaddubsw m8,m0
+pmaddubsw m12,   m9,m1
+pmaddwd   m8,m5
+pmaddwd   m12,   m5
+paddd m8,m12
+
+pmaddubsw m9,m0
+pmaddubsw m12,   m10,   m1
+pmaddwd   m9,m5
+pmaddwd   m12,   m5
+paddd m9,m12
+
+pmaddubsw m10,   m0
+pmaddubsw m12,  m11,m1
+pmaddwd   m10,  m5
+pmaddwd   m12,  m5
+paddd m10,  m12
+
+packssdw  m7,   m8
+packssdw  m9,   m10
+pmulhrsw  m7,   m6
+pmulhrsw  m9,   m6
+packuswb  m7,   m9
+movu  [r2 + 2 * r3], ym7
+vextracti32x8 [r2 + r7],  m7,1
+
+movu xm7,[r0 + mmsize/2]
+vinserti32x4  m7,[r0 + r1 + mmsize/2],  1
+vinserti32x4  m7,[r0 + 2 * r1 + mmsize/2],  2
+vinserti32x4  m7,[r0 + r6 + mmsize/2],  3
+
+pshufbm8,m7,m3
+pshufbm7,m2
+
+movu xm9,[r0 + 40]
+vinserti32x4  m9,[r0 + r1 + 40],  1
+vinserti32x4  m9,[r0 + 2 * r1 + 40],  2
+vinserti32x4  m9,[r0 + r6 + 

[x265] [PATCH 107 of 307] x86: AVX512 interp_4tap_horiz_pp_24xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504173085 -19800
#  Thu Aug 31 15:21:25 2017 +0530
# Node ID c726239a07580fd13c4177f0206d615ee02c5975
# Parent  1fb1948309a0a9218a07e060300b9d5a7ff58321
x86: AVX512 interp_4tap_horiz_pp_24xN for high bit depth

i444 24x32
AVX2 performance : 8.85x
AVX512 performance   : 19.37x

diff -r 1fb1948309a0 -r c726239a0758 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 31 14:54:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 31 15:21:25 2017 +0530
@@ -2367,6 +2367,7 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_24x32_avx512);
 
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x4_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x8_avx512);
@@ -2383,6 +2384,7 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_24x64_avx512);
 
 p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x4_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x8_avx512);
@@ -2404,6 +2406,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_48x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_24x32_avx512);
 
 }
 }
diff -r 1fb1948309a0 -r c726239a0758 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Thu Aug 31 14:54:18 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Thu Aug 31 15:21:25 2017 +0530
@@ -5161,6 +5161,103 @@
 vextracti32x8   [r2 + r3], m7,1
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512 0
+; register map
+; m0 , m1 interpolate coeff
+; m2 , m3  shuffle order table
+; m4 - pd_32
+; m5 - zero
+; m6 - pw_pixel_max
+
+movuym7,   [r0]
+vinserti32x8m7,[r0 + r1],  1
+movuym8,   [r0 + 8]
+vinserti32x8m8,[r0 + r1 + 8],  1
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+pshufb  m9,m8,m3
+pshufb  m8,m2
+pmaddwd m8,m0
+pmaddwd m9,m1
+paddd   m8,m9
+paddd   m8,m4
+psrad   m8,6
+
+packusdwm7,m8
+CLIPW   m7,m5,m6
+pshufb  m7,m10
+movu[r2],  ym7
+vextracti32x8   [r2 + r3], m7,1
+
+movuym7,   [r0 + 2 * r1]
+vinserti32x8m7,[r0 + r6],  1
+movuym8,   [r0 + 2 * r1 + 8]
+vinserti32x8m8,[r0 + r6 + 8],  1
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m9,m1
+paddd   m7,m9
+paddd   m7,m4
+psrad   m7,6
+
+pshufb  m9,m8,m3
+pshufb  m8,m2
+pmaddwd m8,m0
+pmaddwd m9,m1
+paddd   m8,m9
+paddd   m8,m4
+psrad   m8,6
+
+packusdwm7,m8
+CLIPW   m7,m5,m6
+pshufb  m7,m10
+movu[r2 + 2 * r3],ym7
+vextracti32x8   [r2 + r7], m7,1
+
+movuxm7,   [r0 + mmsize/2]
+vinserti32x4m7,[r0 + r1 + mmsize/2],  1
+vinserti32x4m7,[r0 + 2 * r1 + mmsize/2],  2
+vinserti32x4m7,[r0 + r6 + mmsize/2],  3
+
+pshufb  m9,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+

[x265] [PATCH 098 of 307] x86: AVX512 pixel_avg_weight_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503908210 -19800
#  Mon Aug 28 13:46:50 2017 +0530
# Node ID 45e4dd746cfd9380dbe2344a5754a6ff6e9feed5
# Parent  bf199a5eca5be148be8a0c91cd9f2e8e0e908059
x86: AVX512 pixel_avg_weight_64xN

Size  |  AVX2 performance | AVX512 performance
--
64x16 |  41.70x   |  60.98x
64x32 |  36.75x   |  68.91x
64x48 |  37.31x   |  59.07x
64x64 |  37.92x   |  58.85x

diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 28 11:58:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 28 13:46:50 2017 +0530
@@ -4159,6 +4159,11 @@
 p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
 p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
 
+p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
+p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
+p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
+p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
+
 }
 #endif
 }
diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmMon Aug 28 11:58:37 2017 +0530
+++ b/source/common/x86/mc-a.asmMon Aug 28 13:46:50 2017 +0530
@@ -5020,6 +5020,58 @@
 RET
 %endif
 
+;-
+;pixel_avg_pp avx512 code start
+;-
+%macro PROCESS_PIXELAVG_64x4_AVX512 0
+movum0, [r2]
+movum2, [r2 + r3]
+movum1, [r4]
+movum3, [r4 + r5]
+pavgb   m0, m1
+pavgb   m2, m3
+movu[r0],   m0
+movu[r0 + r1],  m2
+
+movum0, [r2 + 2 * r3]
+movum2, [r2 + r7]
+movum1, [r4 + 2 * r5]
+movum3, [r4 + r8]
+pavgb   m0, m1
+pavgb   m2, m3
+movu[r0 + 2 * r1],  m0
+movu[r0 + r6],  m2
+%endmacro
+
+;---
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;---
+%if ARCH_X86_64 && BIT_DEPTH == 8
+%macro PIXEL_AVG_64xN_AVX512 1
+INIT_ZMM avx512
+cglobal pixel_avg_64x%1, 6, 9, 4
+lea r6, [3 * r1]
+lea r7, [3 * r3]
+lea r8, [3 * r5]
+
+%rep %1/4 - 1
+PROCESS_PIXELAVG_64x4_AVX512
+lea r2, [r2 + r3 * 4]
+lea r4, [r4 + r5 * 4]
+lea r0, [r0 + r1 * 4]
+%endrep
+PROCESS_PIXELAVG_64x4_AVX512
+RET
+%endmacro
+
+PIXEL_AVG_64xN_AVX512 16
+PIXEL_AVG_64xN_AVX512 32
+PIXEL_AVG_64xN_AVX512 48
+PIXEL_AVG_64xN_AVX512 64
+%endif
+;-
+;pixel_avg_pp avx512 code end
+;-
 ;=
 ; pixel avg2
 ;=
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 178 of 307] x86: AVX512 interp_4tap_vert_pp_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510823813 -19800
#  Thu Nov 16 14:46:53 2017 +0530
# Node ID 963884afd8f38dbcc8335ff1d3a39385e317d6d4
# Parent  2c24c0aadbe3e76eabde711a94c57aed077b7347
x86: AVX512 interp_4tap_vert_pp_16xN

i444
Size  |  AVX2 performance | AVX512 performance
--
16x8  |  31.15x   |  36.85x
16x16 |  29.18x   |  41.50x
16x32 |  30.14x   |  43.30x
16x64 |  31.79x   |  45.30x

This patch also optimises coeffIdx load to register for chroma_vpp

diff -r 2c24c0aadbe3 -r 963884afd8f3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 06 17:13:17 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 16 14:46:53 2017 +0530
@@ -4816,20 +4816,33 @@
 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = 
PFX(interp_4tap_horiz_ps_48x64_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
+
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = 
PFX(interp_4tap_vert_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
+
+p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
-
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = 
PFX(interp_4tap_vert_pp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = 
PFX(interp_4tap_vert_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
diff -r 2c24c0aadbe3 -r 963884afd8f3 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Nov 06 17:13:17 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Nov 16 14:46:53 2017 +0530
@@ -10797,6 +10797,134 @@
 IPFILTER_CHROMA_PS_48xN_AVX512 64
 %endif
 
+;-
+;avx512 chroma_vpp code start
+;-
+%macro PROCESS_CHROMA_VERT_PP_16x8_AVX512 0
+movu  xm1,[r0]
+lea   r8, [r0 + 4 * r1]
+lea   r9, [r8 + 2 * r1]
+vinserti32x4  m1, [r0 + 2 * r1],1
+vinserti32x4  m1, [r8],2
+vinserti32x4  m1, [r9],3
+movu  xm3,[r0 + 

[x265] [PATCH 175 of 307] [x265-avx512]x86: AVX512 idct16x16

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1509948596 -19800
#  Mon Nov 06 11:39:56 2017 +0530
# Node ID 8bbcc1bd3c1381e936695a6eff30a17cc2633b6f
# Parent  df3c576cd32c50b0412ad3d70eeebfe8fb511da1
[x265-avx512]x86: AVX512 idct16x16

AVX2 Performance:11.67x
AVX512 Performance  :12.80x

diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 06 11:39:56 2017 +0530
@@ -2837,6 +2837,8 @@
 
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
+
 
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = 
PFX(interp_4tap_horiz_ps_32x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
@@ -4835,6 +4837,7 @@
 
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
 
 }
 #endif
diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmMon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/dct8.asmMon Nov 06 11:39:56 2017 +0530
@@ -218,6 +218,27 @@
 
 idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
 
+
+tab_AVX512_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
+   dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
+   dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
+   dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
+
+tab_AVX512_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
+   dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
+   dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
+   dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
+
+idct16_AVX512_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
+
+idct16_AVX512_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
+
+idct16_AVX512_shuff2:   dq 0, 1, 8, 9, 4, 5, 12, 13
+idct16_AVX512_shuff3:   dq 2, 3, 10, 11, 6, 7, 14, 15
+idct16_AVX512_shuff4:   dq 4, 5, 12, 13, 0, 1, 8, 9
+idct16_AVX512_shuff5:   dq 6, 7, 14, 15, 2, 3, 10, 11
+idct16_AVX512_shuff6:   times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+
 tab_idct32_1:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 
4
 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, 
-61, -38, -13
 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 
85, 61, 22
@@ -3671,6 +3692,599 @@
 jnz .pass2
 RET
 
+
+%macro IDCT16_AVX512_PASS1 3
+movum5,  [tab_AVX512_idct16_2 + %1 * 64]
+pmaddwd m9, m0, m5
+pmaddwd m10, m7, m5
+
+vpsrldq m16,   m9, 4
+padddm9,  m16
+vpslldq m17,   m10, 4
+padddm10,  m17
+vmovdqu32m9   {k1}, m10
+
+pmaddwd m10, m6, m5
+pmaddwd m11, m8, m5
+
+vpsrldq m16,   m10, 4
+padddm10,  m16
+vpslldq m17,   m11, 4
+padddm11,  m17
+vmovdqu32m10   {k1}, m11
+
+vpsrldq m16,   m9, 8
+padddm9,  m16
+vpslldq m17,   m10, 8
+padddm10,  m17
+vmovdqu32m9   {k2}, m10
+
+movum5,  [tab_AVX512_idct16_1 + %1 * 64]
+pmaddwd m10, m1, m5
+pmaddwd m11, m3, m5
+
+vpsrldq m16,   m10, 4
+padddm10,  m16
+vpslldq m17,   m11, 4
+padddm11,  m17
+vmovdqu32m10   {k1}, m11
+
+pmaddwd m11, m4, m5
+pmaddwd m12, m2, m5
+
+vpsrldq m16,   m11, 4
+padddm11,  m16
+vpslldq m17,   m12, 4
+padddm12,  m17
+vmovdqu32m11   {k1}, m12
+
+vpsrldq m16,   m10, 8
+padddm10,  m16
+vpslldq m17,   m11, 8
+

[x265] [PATCH 173 of 307] x86: AVX512 optimise intermediate register load in chroma_vsp, chroma_vss, chroma_vps

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1522976950 25200
#  Thu Apr 05 18:09:10 2018 -0700
# Node ID ab41c6957bc2f359e5df82f9936c3fd00a5d2ea5
# Parent  71f7869fac602953ef5e14c344f10adc374d7bfa
x86: AVX512 optimise intermediate register load in chroma_vsp, chroma_vss, 
chroma_vps

diff -r 71f7869fac60 -r ab41c6957bc2 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Wed Nov 15 14:35:17 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Thu Apr 05 18:09:10 2018 -0700
@@ -7527,10 +7527,10 @@
 vinserti32x4  m4, [r8 + 4 * r1],   2
 vinserti32x4  m4, [r9 + 4 * r1],   3
 punpcklwd m6, m5,  m4
-pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m6, m9
 paddd m2, m6
 punpckhwd m5, m4
-pmaddwd   m5, [r5 + mmsize]
+pmaddwd   m5, m9
 paddd m3, m5
 
 paddd m0, m7
@@ -7561,7 +7561,7 @@
 
;-
 %if ARCH_X86_64
 INIT_ZMM avx512
-cglobal interp_4tap_vert_ps_8x8, 5, 11, 8
+cglobal interp_4tap_vert_ps_8x8, 5, 11, 10
 add   r1d,r1d
 add   r3d,r3d
 sub   r0, r1
@@ -7576,13 +7576,15 @@
 vbroadcasti32x4   m7, [INTERP_OFFSET_PS]
 lea   r10,[3 * r1]
 lea   r7, [3 * r3]
+mova  m8, [r5]
+mova  m9, [r5 + mmsize]
 PROCESS_CHROMA_VERT_PS_8x8_AVX512
 RET
 %endif
 
 %macro FILTER_VER_PS_CHROMA_8xN_AVX512 1
 INIT_ZMM avx512
-cglobal interp_4tap_vert_ps_8x%1, 5, 11, 8
+cglobal interp_4tap_vert_ps_8x%1, 5, 11, 10
 add   r1d,r1d
 add   r3d,r3d
 sub   r0, r1
@@ -7597,6 +7599,8 @@
 vbroadcasti32x4   m7, [INTERP_OFFSET_PS]
 lea   r10,[3 * r1]
 lea   r7, [3 * r3]
+mova  m8, [r5]
+mova  m9, [r5 + mmsize]
 %rep %1/8 - 1
 PROCESS_CHROMA_VERT_PS_8x8_AVX512
 lea   r0, [r8 + 4 * r1]
@@ -7619,33 +7623,33 @@
 movu  ym3,[r0 + r1]
 vinserti32x8  m3, [r6 + r1],   1
 punpcklwd m0, m1,  m3
-pmaddwd   m0, [r5]
+pmaddwd   m0, m8
 punpckhwd m1, m3
-pmaddwd   m1, [r5]
+pmaddwd   m1, m8
 
 movu  ym4,[r0 + 2 * r1]
 vinserti32x8  m4, [r6 + 2 * r1],   1
 punpcklwd m2, m3,  m4
-pmaddwd   m2, [r5]
+pmaddwd   m2, m8
 punpckhwd m3, m4
-pmaddwd   m3, [r5]
+pmaddwd   m3, m8
 
 movu  ym5,[r0 + r8]
 vinserti32x8  m5, [r6 + r8],   1
 punpcklwd m6, m4,  m5
-pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m6, m9
 paddd m0, m6
 punpckhwd m4, m5
-pmaddwd   m4, [r5 + mmsize]
+pmaddwd   m4, m9
 paddd m1, m4
 
 movu  ym4,[r0 + 4 * r1]
 vinserti32x8  m4, [r6 + 4 * r1],   1
 punpcklwd m6, m5,  m4
-pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m6, m9
 paddd m2, m6
 punpckhwd m5, m4
-pmaddwd   m5, [r5 + mmsize]
+pmaddwd   m5, m9
 paddd m3, m5
 
 paddd m0, m7
@@ -7671,7 +7675,7 @@
 
;-
 %if 

[x265] [PATCH 164 of 307] Disable all avx512 Kernels with negative IPC gains over avx2 Kernels

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1510736734 -19800
#  Wed Nov 15 14:35:34 2017 +0530
# Node ID 7d41838f5d06ad4fbdabd08af99d724fcd599193
# Parent  e1dedfae074d765c26efca976538cd06e1ef7cab
Disable all avx512 Kernels with negative IPC gains over avx2 Kernels.

diff -r e1dedfae074d -r 7d41838f5d06 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 14 02:11:35 2017 -0800
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 15 14:35:34 2017 +0530
@@ -4328,10 +4328,10 @@
 if (cpuMask & X265_CPU_AVX512)
 {
 p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
-p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
+  //  p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
 p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
 p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
-p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+//p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
 p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
 p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
 p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
@@ -4400,14 +4400,14 @@
 p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx512);
 p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512);
 p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512);
-p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
+//p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
 
 p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512);
 p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512);
 p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512);
 p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512);
 p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512);
-p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512);
+   // p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512);
 p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
 p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 165 of 307] x86: AVX512 interp_4tap_vert_sp_64xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510729226 -19800
#  Wed Nov 15 12:30:26 2017 +0530
# Node ID 3f4b7399d14ba72aba0692e61681276f09df8ada
# Parent  7d41838f5d06ad4fbdabd08af99d724fcd599193
x86: AVX512 interp_4tap_vert_sp_64xN for high bit depth

i444
Size  |  AVX2 performance  | AVX512 performance
--
64x16 |   23.46x   |  43.98x
64x32 |   23.54x   |  40.59x
64x48 |   23.71x   |  40.46x
64x64 |   23.59x   |  40.33x

This patch also cleans up horiz_ps_8xN for better code readability

diff -r 7d41838f5d06 -r 3f4b7399d14b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Nov 15 14:35:34 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 15 12:30:26 2017 +0530
@@ -2643,6 +2643,11 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = 
PFX(interp_4tap_vert_ps_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = 
PFX(interp_4tap_vert_ps_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = 
PFX(interp_4tap_vert_ps_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = 
PFX(interp_4tap_vert_sp_64x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = 
PFX(interp_4tap_vert_sp_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = 
PFX(interp_4tap_vert_sp_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = 
PFX(interp_4tap_vert_sp_64x64_avx512);
+
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = 
PFX(interp_4tap_vert_pp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = 
PFX(interp_4tap_vert_ps_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
diff -r 7d41838f5d06 -r 3f4b7399d14b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Wed Nov 15 14:35:34 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Wed Nov 15 12:30:26 2017 +0530
@@ -7342,6 +7342,131 @@
 RET
 %endif
 
+%macro PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512 0
+; register map (m6, m7 and m8 are scratch and are overwritten by this macro)
+; m0 , m1 - interpolate coeff
+; m2 , m3 - shuffle load order table
+; m4  - INTERP_OFFSET_PS
+; m5  - shuffle store order table
+
+    movu            xm6, [r0]                      ; lane 0 <- 16 bytes at r0
+    vinserti32x4    m6, [r0 + r1], 1               ; lane 1 <- 16 bytes at r0 + r1
+    vinserti32x4    m6, [r0 + 2 * r1], 2           ; lane 2 <- 16 bytes at r0 + 2 * r1
+    vinserti32x4    m6, [r0 + r6], 3               ; lane 3 <- 16 bytes at r0 + r6 (IPFILTER_CHROMA_PS_AVX512_8xN sets r6 = 3 * r1)
+
+    pshufb          m8, m6, m3                     ; m8 = loaded bytes arranged by the m3 load-order table
+    pshufb          m6, m2                         ; m6 = loaded bytes arranged by the m2 load-order table
+    pmaddwd         m6, m0                         ; multiply word pairs by m0 coefficients, summing each pair into a dword
+    pmaddwd         m8, m1                         ; same for the m3-ordered copy with m1 coefficients
+    paddd           m6, m8                         ; combine both partial sums
+    paddd           m6, m4                         ; add INTERP_OFFSET_PS
+    psrad           m6, INTERP_SHIFT_PS            ; arithmetic shift right of every dword
+
+    movu            xm7, [r0 + 8]                  ; second group: same four addresses, 8 bytes further on
+    vinserti32x4    m7, [r0 + r1 + 8], 1
+    vinserti32x4    m7, [r0 + 2 * r1 + 8], 2
+    vinserti32x4    m7, [r0 + r6 + 8], 3
+
+    pshufb          m8, m7, m3                     ; identical filter sequence, result left in m7
+    pshufb          m7, m2
+    pmaddwd         m7, m0
+    pmaddwd         m8, m1
+    paddd           m7, m8
+    paddd           m7, m4
+    psrad           m7, INTERP_SHIFT_PS
+
+    packssdw        m6, m7                         ; pack both dword results to words with signed saturation
+    pshufb          m6, m5                         ; reorder with the m5 store-order table
+    movu            [r2], xm6                      ; store lane 0 (16 bytes) at r2
+    vextracti32x4   [r2 + r3], m6, 1               ; store lane 1 at r2 + r3
+    vextracti32x4   [r2 + 2 * r3], m6, 2           ; store lane 2 at r2 + 2 * r3
+    vextracti32x4   [r2 + r7], m6, 3               ; store lane 3 at r2 + r7 (IPFILTER_CHROMA_PS_AVX512_8xN sets r7 = 3 * r3)
+%endmacro
+
+%macro PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512 0 ; three-address variant of PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512: same m0 - m5 inputs, m6 - m8 scratch
+    movu            xm6, [r0]                      ; lane 0 <- 16 bytes at r0
+    vinserti32x4    m6, [r0 + r1], 1               ; lane 1 <- 16 bytes at r0 + r1
+    vinserti32x4    m6, [r0 + 2 * r1], 2           ; lane 2 <- 16 bytes at r0 + 2 * r1; lane 3 is never loaded or stored here
+
+    pshufb          m8, m6, m3                     ; m8 = loaded bytes arranged by the m3 table
+    pshufb          m6, m2                         ; m6 = loaded bytes arranged by the m2 table
+    pmaddwd         m6, m0                         ; multiply word pairs by m0, summing each pair into a dword
+    pmaddwd         m8, m1                         ; same for the m3-ordered copy with m1
+    paddd           m6, m8                         ; combine both partial sums
+    paddd           m6, m4                         ; add the offset held in m4
+    psrad           m6, INTERP_SHIFT_PS            ; arithmetic shift right of every dword
+
+    movu            xm7, [r0 + 8]                  ; second group: same three addresses, 8 bytes further on
+    vinserti32x4    m7, [r0 + r1 + 8], 1
+    vinserti32x4    m7, [r0 + 2 * r1 + 8], 2
+
+    pshufb          m8, m7, m3                     ; identical filter sequence, result left in m7
+    pshufb          m7, m2
+    pmaddwd         m7, m0
+    pmaddwd         m8, m1
+    paddd           m7, m8
+    paddd           m7, m4
+    psrad           m7, INTERP_SHIFT_PS
+
+    packssdw        m6, m7                         ; pack both dword results to words with signed saturation
+    pshufb          m6, m5                         ; reorder with the m5 table before storing
+    movu            [r2], xm6                      ; store lane 0 (16 bytes) at r2
+    vextracti32x4   [r2 + r3], m6, 1               ; store lane 1 at r2 + r3
+    vextracti32x4   [r2 + 2 * r3], m6, 2           ; store lane 2 at r2 + 2 * r3
+%endmacro
+
+%macro IPFILTER_CHROMA_PS_AVX512_8xN 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_8x%1, 4,9,9
+add r1d, r1d
+add r3d, r3d
+mov r4d, r4m
+mov r5d, r5m
+
+lea r6, [3 * r1]
+lea r7, [3 * r3]
+%ifdef PIC
+lea 

[x265] [PATCH 181 of 307] x86: AVX512 interp_4tap_vert_ss_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511333868 -19800
#  Wed Nov 22 12:27:48 2017 +0530
# Node ID ad1814e2ff60904208508512af07472dee380c51
# Parent  83f75ffc0773a2448efa7e6485cb009825edae41
x86: AVX512 interp_4tap_vert_ss_16xN

i444
Size  |  AVX2 performance | AVX512 performance
--
16x4  |  13.31x   |  32.24x
16x8  |  16.43x   |  31.07x
16x12 |  17.26x   |  30.29x
16x16 |  17.62x   |  31.74x
16x32 |  16.66x   |  35.61x
16x64 |  17.09x   |  37.18x

diff -r 83f75ffc0773 -r ad1814e2ff60 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Nov 22 11:56:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 22 12:27:48 2017 +0530
@@ -4824,6 +4824,11 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = 
PFX(interp_4tap_vert_ss_16x4_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = 
PFX(interp_4tap_vert_ss_16x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = 
PFX(interp_4tap_vert_ss_16x12_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = 
PFX(interp_4tap_vert_ss_16x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = 
PFX(interp_4tap_vert_ss_16x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = 
PFX(interp_4tap_vert_ss_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
@@ -4839,6 +4844,11 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = 
PFX(interp_4tap_vert_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = 
PFX(interp_4tap_vert_ss_16x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = 
PFX(interp_4tap_vert_ss_16x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = 
PFX(interp_4tap_vert_ss_16x24_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = 
PFX(interp_4tap_vert_ss_16x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = 
PFX(interp_4tap_vert_ss_16x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = 
PFX(interp_4tap_vert_ss_32x48_avx512);
@@ -4858,6 +4868,12 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = 
PFX(interp_4tap_vert_pp_64x16_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = 
PFX(interp_4tap_vert_ss_16x4_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = 
PFX(interp_4tap_vert_ss_16x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = 
PFX(interp_4tap_vert_ss_16x12_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = 
PFX(interp_4tap_vert_ss_16x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = 
PFX(interp_4tap_vert_ss_16x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = 
PFX(interp_4tap_vert_ss_16x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = 
PFX(interp_4tap_vert_ss_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
diff -r 83f75ffc0773 -r ad1814e2ff60 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Nov 22 11:56:13 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Nov 22 12:27:48 2017 +0530
@@ -11148,6 +11148,118 @@
 
;-
 ;avx512 chroma_vss code start
 
;-
+%macro PROCESS_CHROMA_VERT_SS_16x4_AVX512 0
+movu  ym1,[r0]
+lea   r6, [r0 + 2 * r1]
+vinserti32x8  m1, [r6],1
+movu  ym3,[r0 + r1]
+vinserti32x8  m3, [r6 + 

[x265] [PATCH 180 of 307] x86: AVX512 interp_4tap_vert_ss_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511331973 -19800
#  Wed Nov 22 11:56:13 2017 +0530
# Node ID 83f75ffc0773a2448efa7e6485cb009825edae41
# Parent  635fbc26941a08a2829a473e13fb5052f5a8471a
x86: AVX512 interp_4tap_vert_ss_64xN

i444
Size  |  AVX2 performance  | AVX512 performance
--
64x16 |   15.89x   |  32.95x
64x32 |   16.11x   |  37.31x
64x48 |   16.04x   |  36.33x
64x64 |   16.63x   |  39.27x

diff -r 635fbc26941a -r 83f75ffc0773 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Nov 22 10:51:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 22 11:56:13 2017 +0530
@@ -4863,6 +4863,10 @@
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = 
PFX(interp_4tap_vert_ss_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = 
PFX(interp_4tap_vert_ss_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = 
PFX(interp_4tap_vert_ss_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = 
PFX(interp_4tap_vert_ss_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = 
PFX(interp_4tap_vert_ss_64x16_avx512);
 
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
diff -r 635fbc26941a -r 83f75ffc0773 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Nov 22 10:51:33 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Nov 22 11:56:13 2017 +0530
@@ -11261,6 +11261,116 @@
 FILTER_VER_SS_CHROMA_32xN_AVX512 64
 %endif
 
+%macro PROCESS_CHROMA_VERT_SS_64x2_AVX512 0
+movu m1,  [r0]
+movu m3,  [r0 + r1]
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu m9,  [r0 + mmsize]
+movu m11, [r0 + r1 + mmsize]
+punpcklwdm8,  m9, m11
+pmaddwd  m8,  m15
+punpckhwdm9,  m11
+pmaddwd  m9,  m15
+movu m4,  [r0 + 2 * r1]
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+movu m12, [r0 + 2 * r1 + mmsize]
+punpcklwdm10, m11,m12
+pmaddwd  m10, m15
+punpckhwdm11, m12
+pmaddwd  m11, m15
+
+lea  r0,  [r0 + 2 * r1]
+movu m5,  [r0 + r1]
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+padddm0,  m6
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+padddm1,  m4
+
+movu m13, [r0 + r1 + mmsize]
+punpcklwdm14, m12,m13
+pmaddwd  m14, m16
+padddm8,  m14
+punpckhwdm12, m13
+pmaddwd  m12, m16
+padddm9,  m12
+
+movu m4,  [r0 + 2 * r1]
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  m16
+padddm2,  m6
+punpckhwdm5,  m4
+pmaddwd  m5,  m16
+padddm3,  m5
+
+movu m12, [r0 + 2 * r1 + mmsize]
+punpcklwdm14, m13,m12
+pmaddwd  m14, m16
+padddm10, m14
+punpckhwdm13, m12
+pmaddwd  m13, m16
+padddm11, m13
+
+psradm0,  6
+psradm1,  6
+psradm2,   

[x265] [PATCH 179 of 307] x86: AVX512 interp_4tap_vert_ss_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511328093 -19800
#  Wed Nov 22 10:51:33 2017 +0530
# Node ID 635fbc26941a08a2829a473e13fb5052f5a8471a
# Parent  963884afd8f38dbcc8335ff1d3a39385e317d6d4
x86: AVX512 interp_4tap_vert_ss_32xN

i444
Size  |  AVX2 performance | AVX512 performance
--
32x8  |  15.51x   |  34.64x
32x16 |  17.04x   |  37.82x
32x24 |  15.81x   |  35.75x
32x32 |  16.64x   |  40.20x
32x64 |  16.85x   |  35.51x

diff -r 963884afd8f3 -r 635fbc26941a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 16 14:46:53 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 22 10:51:33 2017 +0530
@@ -4824,6 +4824,11 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = 
PFX(interp_4tap_vert_ss_32x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
+
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
@@ -4834,6 +4839,11 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = 
PFX(interp_4tap_vert_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = 
PFX(interp_4tap_vert_ss_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = 
PFX(interp_4tap_vert_ss_32x64_avx512);
+
 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
@@ -4848,6 +4858,12 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = 
PFX(interp_4tap_vert_pp_64x16_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = 
PFX(interp_4tap_vert_ss_32x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = 
PFX(interp_4tap_vert_ss_32x64_avx512);
+
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
 p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
diff -r 963884afd8f3 -r 635fbc26941a source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Nov 16 14:46:53 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Nov 22 10:51:33 2017 +0530
@@ -167,6 +167,31 @@
 times 32 db -2, 10
 times 32 db 58, -2
 
+ALIGN 64 ; start the table on a 64-byte boundary; every row below is 16 word pairs = 64 bytes
+const pw_ChromaCoeffVer_32_avx512,  times 16 dw 0, 64 ; entry 0, first row: word pair (0, 64) repeated 16 times
+times 16 dw 0, 0 ; entry 0, second row; each entry is two rows, so entries are 128 bytes apart
+
+times 16 dw -2, 58 ; entry 1, first row
+times 16 dw 10, -2 ; entry 1, second row
+
+times 16 dw -4, 54 ; entry 2, first row
+times 16 dw 16, -2 ; entry 2, second row
+
+times 16 dw -6, 46 ; entry 3, first row
+times 16 dw 28, -4 ; entry 3, second row
+
+times 16 dw -4, 36 ; entry 4, first row
+times 16 dw 36, -4 ; entry 4, second row
+
+times 16 dw -4, 28 ; entry 5, first row
+times 16 dw 46, -6 ; entry 5, second row
+
+times 16 dw -2, 16 ; entry 6, first row
+times 16 dw 54, -4 ; entry 6, second row
+
+times 16 dw -2, 10 ; entry 7, first row
+times 16 dw 58, -2 ; entry 7, second row; NOTE(review): the two rows of an entry look like the four taps of a 4-tap filter split into (tap0, tap1) and (tap2, tap3) - confirm against the consumer, which is not visible here
+
 const 

[x265] [PATCH 177 of 307] x86: AVX512 optimise interp_4tap_vert_pp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1509968597 -19800
#  Mon Nov 06 17:13:17 2017 +0530
# Node ID 2c24c0aadbe3e76eabde711a94c57aed077b7347
# Parent  67e149415f9f8be0d5b7832fde9e02cc592bbf28
x86: AVX512 optimise interp_4tap_vert_pp_32xN

diff -r 67e149415f9f -r 2c24c0aadbe3 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Nov 20 15:07:31 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Nov 06 17:13:17 2017 +0530
@@ -142,6 +142,7 @@
 times 16 db 58, -10
 times 16 db 4, -1
 
+ALIGN 64
 const tab_ChromaCoeffVer_32_avx512, times 32 db 0, 64
 times 32 db 0, 0
 
@@ -10796,151 +10797,94 @@
 IPFILTER_CHROMA_PS_48xN_AVX512 64
 %endif
 
-%macro PROCESS_CHROMA_VERT_PP_32x8_AVX512 0
-movu ym0,  [r0]; m0 = row 0
-lea   r6,  [r0 + 4 * r1]
-lea   r7,  [r2 + 4 * r3]
-vinserti32x8  m0,  [r6],  1; m0 = row 4
-movu ym1,  [r0 + r1]   ; m1 = row 1
-vinserti32x8  m1,  [r6 + r1], 1; m1 = row 5
-punpcklbw m2,  m0,m1
-punpckhbw m3,  m0,m1
-pmaddubsw m2,  m10
-pmaddubsw m3,  m10
-
-movu ym0,  [r0 + r1 * 2]   ; m0 = row 2
-vinserti32x8  m0,  [r6 + r1 * 2], 1; m0 = row 6
-punpcklbw m4,  m1,m0
-punpckhbw m5,  m1,m0
-pmaddubsw m4,  m10
-pmaddubsw m5,  m10
-
-movu ym1,  [r0 + r4]   ; m1 = row 3
-vinserti32x8  m1,  [r6 + r4], 1; m1 = row 7
-punpcklbw m6,  m0,m1
-punpckhbw m7,  m0,m1
-pmaddubsw m8,  m6,m11
-pmaddubsw m9,  m7,m11
-pmaddubsw m6,  m10
-pmaddubsw m7,  m10
-
-paddw m2,  m8
-paddw m3,  m9
-
-pmulhrsw  m2,  m12
-pmulhrsw  m3,  m12
-packuswb  m2,  m3
-movu  [r2],ym2
-vextracti32x8 [r7],m2,1
-lea   r0,  [r0 + r1 * 4]
-lea   r6,  [r6 + r1 * 4]
-
-movu ym0,  [r0]; m0 = row 4
-vinserti32x8  m0,  [r6],  1; m0 = row 8
-punpcklbw m2,  m1,m0
-punpckhbw m3,  m1,m0
-pmaddubsw m8,  m2,m11
-pmaddubsw m9,  m3,m11
-pmaddubsw m2,  m10
-pmaddubsw m3,  m10
-
-paddw m4,  m8
-paddw m5,  m9
-pmulhrsw  m4,  m12
-pmulhrsw  m5,  m12
-packuswb  m4,  m5
-movu  [r2 + r3],   ym4
-vextracti32x8 [r7 + r3],   m4,1
-
-movu ym1,  [r0 + r1]   ; m1 = row 5
-vinserti32x8  m1,  [r6 + r1], 1; m1 = row 9
-punpcklbw m4,  m0,m1
-punpckhbw m5,  m0,m1
-pmaddubsw m4,  m11
-pmaddubsw m5,  m11
-paddw m6,  m4
-paddw m7,  m5
-
-pmulhrsw  m6,  m12
-pmulhrsw  m7,  m12
-packuswb  m6,  m7
-movu  [r2 + r3 * 2],   ym6
-vextracti32x8 [r7 + r3 * 2],   m6,1
-
-movu ym0,  [r0 + r1 * 2]   ; m0 = row 6
-vinserti32x8  m0,  [r6 + r1 * 2], 1; m0 = row 
10
-punpcklbw m6,  m1,m0
-punpckhbw m7,  m1,m0
-pmaddubsw m6,  m11
-pmaddubsw m7,  m11
-paddw m2,  m6
-paddw m3,  m7
-pmulhrsw  m2,  m12
-pmulhrsw  m3,  m12
-packuswb  m2,  m3
-movu  [r2 + r5],   ym2
-

[x265] [PATCH 159 of 307] x86: dct8 PASS2 optimize for shuffle instructions

2018-04-06 Thread mythreyi
# HG changeset patch
# User Praveen Tiwari 
# Date 1510583185 28800
#  Mon Nov 13 06:26:25 2017 -0800
# Node ID 8bfedd92563a0e1da365c4d64a0e565e35f6025a
# Parent  a7ce91c5db95ac0eb3f58b5c993ace3bfe0bbe2f
x86: dct8 PASS2 optimize for shuffle instructions

diff -r a7ce91c5db95 -r 8bfedd92563a source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmMon Nov 13 04:08:07 2017 -0800
+++ b/source/common/x86/dct8.asmMon Nov 13 06:26:25 2017 -0800
@@ -35,9 +35,11 @@
 dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
 dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 
13, 15
 dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
+dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 
8, 9
 dct8_shuf_AVX512:  times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 
10, 11
 
+
 tab_dct8:   dw 64, 64, 64, 64, 64, 64, 64, 64
 dw 89, 75, 50, 18, -18, -50, -75, -89
 dw 83, 36, -36, -83, -83, -36, 36, 83
@@ -2325,77 +2327,83 @@
 %macro DCT8_AVX512_PASS_2 4
 vpmaddwd m0,   m9,  m%1
 vpmaddwd m1,   m10, m%1
-vpshufb  m2,   m0,  m6
-vpshufb  m3,   m1,  m6
+vpsrldq  m2,   m0,  8
+vpsrldq  m3,   m1,  8
 vpaddd   m0,   m2
 vpaddd   m1,   m3
-vpermd   m0,   m18, m0
-vpermd   m1,   m18, m1
-vinserti64x4 m0,   m0, ym1, 1
-vpshufb  m1,   m0, m6
-vpaddd   m0,   m1
-vpermd   m0,   m18, m0
-
-vpmaddwd m1,   m9, m%2
+vpsrlq   m2,   m0,  32
+vpsrlq   m3,   m1,  32
+vpaddd   m0,   m2
+vpaddd   m1,   m3
+vpaddd   m0,   m5
+vpsrad   m0,   DCT8_SHIFT2
+vpaddd   m1,   m5
+vpsrad   m1,   DCT8_SHIFT2
+vpackssdwm0,   m1
+vpermw   m0,   m19, m0
+
+vpmaddwd m1,   m9,  m%2
 vpmaddwd m2,   m10, m%2
-vpshufb  m3,   m1, m6
-vpshufb  m4,   m2, m6
+vpsrldq  m3,   m1,  8
+vpsrldq  m4,   m2,  8
 vpaddd   m1,   m3
 vpaddd   m2,   m4
-vpermd   m1,   m18, m1
-vpermd   m2,   m18, m2
-vinserti64x4 m1,   m1, ym2, 1
-vpshufb  m2,   m1, m6
-vpaddd   m1,   m2
-vpermd   m1,   m18, m1
-
-vinserti64x4 m0,   m0, ym1, 1
-vpaddd   m0,   m5
-vpsrad   m0,   DCT8_SHIFT2
+vpsrlq   m3,   m1,  32
+vpsrlq   m4,   m2,  32
+vpaddd   m1,   m3
+vpaddd   m2,   m4
+vpaddd   m1,   m5
+vpsrad   m1,   DCT8_SHIFT2
+vpaddd   m2,   m5
+vpsrad   m2,   DCT8_SHIFT2
+vpackssdwm1,   m2
+vpermw   m1,   m19, m1
+vinserti128  ym0,  ym0, xm1, 1
 
 vpmaddwd m1,   m9,  m%3
 vpmaddwd m2,   m10, m%3
-vpshufb  m3,   m1,  m6
-vpshufb  m4,   m2,  m6
+vpsrldq  m3,   m1,  8
+vpsrldq  m4,   m2,  8
 vpaddd   m1,   m3
 vpaddd   m2,   m4
-vpermd   m1,   m18, m1
-vpermd   m2,   m18, m2
-vinserti64x4 m1,   m1, ym2, 1
-vpshufb  m2,   m1, m6
-vpaddd   m1,   m2
-vpermd   m1,   m18, m1
-
-vpmaddwd m2,   m9, m%4
-vpmaddwd m3,   m10, m%4
-vpshufb  m4,   m2, m6
-vpshufb  m7,   m3, m6
+vpsrlq   m3,   m1,  32
+vpsrlq   m4,   m2,  32
+vpaddd   m1,   m3
 vpaddd   m2,   m4
-vpaddd   m3,   m7
-vpermd   m2,   m18, m2
-vpermd   m3,   m18, m3
-vinserti64x4 m2,   m2, ym3, 1
-vpshufb  m3,   m2, m6
-vpaddd   m2,

[x265] [PATCH 174 of 307] x86: AVX512 interp_4tap_horiz_ps_24xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1510569160 -19800
#  Mon Nov 13 16:02:40 2017 +0530
# Node ID df3c576cd32c50b0412ad3d70eeebfe8fb511da1
# Parent  ab41c6957bc2f359e5df82f9936c3fd00a5d2ea5
x86: AVX512 interp_4tap_horiz_ps_24xN for high bit depth

Color Space i420
Size  |  AVX2 performance  | AVX512 performance
--
24x32 |  24.21x|  34.11x

Color Space i422
Size  |  AVX2 performance  | AVX512 performance
--
24x64 |  24.99x|  35.13x

Color Space i444
Size  |  AVX2 performance  | AVX512 performance
--
24x32 |  24.40x|  34.42x

diff -r ab41c6957bc2 -r df3c576cd32c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Apr 05 18:09:10 2018 -0700
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 13 16:02:40 2017 +0530
@@ -2897,6 +2897,10 @@
 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = 
PFX(interp_4tap_horiz_ps_8x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = 
PFX(interp_4tap_horiz_ps_8x32_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = 
PFX(interp_4tap_horiz_ps_24x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = 
PFX(interp_4tap_horiz_ps_24x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = 
PFX(interp_4tap_horiz_ps_24x32_avx512);
+
 }
 #endif
 }
diff -r ab41c6957bc2 -r df3c576cd32c source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Thu Apr 05 18:09:10 2018 -0700
+++ b/source/common/x86/ipfilter16.asm  Mon Nov 13 16:02:40 2017 +0530
@@ -7479,6 +7479,228 @@
 IPFILTER_CHROMA_PS_AVX512_8xN 32
 IPFILTER_CHROMA_PS_AVX512_8xN 64
 %endif
+
+%macro PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512 0
+; register map
+; m0 , m1 - interpolate coeff
+; m2 , m3 - shuffle order table
+; m4  - INTERP_OFFSET_PS
+; m5  - shuffle store order table
+
+movuym6,   [r0]
+vinserti32x8m6,[r0 + r1],  1
+movuym7,   [r0 + 8]
+vinserti32x8m7,[r0 + r1 + 8],  1
+
+pshufb  m8,m6,m3
+pshufb  m6,m2
+pmaddwd m6,m0
+pmaddwd m8,m1
+paddd   m6,m8
+paddd   m6,m4
+psrad   m6,INTERP_SHIFT_PS
+
+pshufb  m8,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m8,m1
+paddd   m7,m8
+paddd   m7,m4
+psrad   m7,INTERP_SHIFT_PS
+
+packssdwm6,m7
+pshufb  m6,m5
+movu[r2],  ym6
+vextracti32x8   [r2 + r3], m6,1
+
+movuym6,   [r0 + 2 * r1]
+vinserti32x8m6,[r0 + r6],  1
+movuym7,   [r0 + 2 * r1 + 8]
+vinserti32x8m7,[r0 + r6 + 8],  1
+
+pshufb  m8,m6,m3
+pshufb  m6,m2
+pmaddwd m6,m0
+pmaddwd m8,m1
+paddd   m6,m8
+paddd   m6,m4
+psrad   m6,INTERP_SHIFT_PS
+
+pshufb  m8,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m8,m1
+paddd   m7,m8
+paddd   m7,m4
+psrad   m7,INTERP_SHIFT_PS
+
+packssdwm6,m7
+pshufb  m6,m5
+movu[r2 + 2 * r3],ym6
+vextracti32x8   [r2 + r7], m6,1
+
+movuxm6,   [r0 + mmsize/2]
+vinserti32x4m6,[r0 + r1 + mmsize/2],  1
+vinserti32x4m6,[r0 + 2 * r1 + mmsize/2],  2
+vinserti32x4m6,[r0 + r6 + mmsize/2],  3
+
+pshufb  m8,m6,m3
+pshufb  m6,m2
+pmaddwd m6,m0
+pmaddwd m8,m1
+paddd   m6,m8
+paddd   m6,m4
+psrad   m6,INTERP_SHIFT_PS
+
+movuxm7,   [r0 + mmsize/2 + 8]
+vinserti32x4m7,[r0 + r1 + mmsize/2 + 8],  1
+vinserti32x4m7,[r0 + 2 * r1 + mmsize/2 + 8],  2
+vinserti32x4m7,[r0 + r6 + mmsize/2 + 8],  3
+
+pshufb  m8,m7,m3
+pshufb  m7,m2
+pmaddwd m7,m0
+pmaddwd m8,m1
+paddd   m7,m8
+paddd   m7,m4
+psrad   m7,INTERP_SHIFT_PS
+
+packssdwm6,m7
+pshufb  m6,m5
+movu[r2 + 

[x265] [PATCH 182 of 307] x86: AVX512 interp_4tap_vert_ss_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511337892 -19800
#  Wed Nov 22 13:34:52 2017 +0530
# Node ID 3d6605772d179c329fffc669cbecc64afd8c8dff
# Parent  ad1814e2ff60904208508512af07472dee380c51
x86: AVX512 interp_4tap_vert_ss_48x64

AVX2 performance   : 16.34x
AVX512 performance : 35.69x

diff -r ad1814e2ff60 -r 3d6605772d17 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Nov 22 12:27:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 22 13:34:52 2017 +0530
@@ -4883,6 +4883,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = 
PFX(interp_4tap_vert_ss_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = 
PFX(interp_4tap_vert_ss_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = 
PFX(interp_4tap_vert_ss_64x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = 
PFX(interp_4tap_vert_ss_48x64_avx512);
 
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
diff -r ad1814e2ff60 -r 3d6605772d17 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Nov 22 12:27:48 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Nov 22 13:34:52 2017 +0530
@@ -11373,6 +11373,155 @@
 FILTER_VER_SS_CHROMA_32xN_AVX512 64
 %endif
 
+%macro PROCESS_CHROMA_VERT_SS_48x4_AVX512 0
+movu  m1, [r0]
+lea   r6, [r0 + 2 * r1]
+movu  m10,[r6]
+movu  m3, [r0 + r1]
+movu  m12,[r6 + r1]
+punpcklwd m0, m1,  m3
+punpcklwd m9, m10, m12
+pmaddwd   m0, m16
+pmaddwd   m9, m16
+punpckhwd m1, m3
+punpckhwd m10,m12
+pmaddwd   m1, m16
+pmaddwd   m10,m16
+
+movu  m4, [r0 + 2 * r1]
+movu  m13,[r6 + 2 * r1]
+punpcklwd m2, m3,  m4
+punpcklwd m11,m12, m13
+pmaddwd   m2, m16
+pmaddwd   m11,m16
+punpckhwd m3, m4
+punpckhwd m12,m13
+pmaddwd   m3, m16
+pmaddwd   m12,m16
+
+movu  m5, [r0 + r7]
+movu  m14,[r6 + r7]
+punpcklwd m6, m4,  m5
+punpcklwd m15,m13, m14
+pmaddwd   m6, m17
+pmaddwd   m15,m17
+paddd m0, m6
+paddd m9, m15
+punpckhwd m4, m5
+punpckhwd m13,m14
+pmaddwd   m4, m17
+pmaddwd   m13,m17
+paddd m1, m4
+paddd m10,m13
+
+movu  m4, [r0 + 4 * r1]
+movu  m13,[r6 + 4 * r1]
+punpcklwd m6, m5,  m4
+punpcklwd m15,m14, m13
+pmaddwd   m6, m17
+pmaddwd   m15,m17
+paddd m2, m6
+paddd m11,m15
+punpckhwd m5, m4
+punpckhwd m14,m13
+pmaddwd   m5, m17
+pmaddwd   m14,m17
+paddd m3, m5
+paddd m12,m14
+
+psrad m0, 6
+psrad m1, 6
+psrad m2, 6
+psrad m3, 6
+psrad m9, 6
+psrad m10,6
+psrad m11,6
+psrad m12,6
+packssdw  m0, m1
+packssdw  m2, m3
+packssdw  m9, m10
+packssdw  m11,m12
+
+movu  [r2],   m0
+movu  [r2 + r3],  m2
+movu  

[x265] [PATCH 195 of 307] x86: AVX512 interp_8tap_vert_sp_24x32 and interp_8tap_vert_ss_24x32

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511505545 -19800
#  Fri Nov 24 12:09:05 2017 +0530
# Node ID 47b99c09008b1921881b0dfa00d80cce1f8d15eb
# Parent  ecaf36f641dd1428d556f172e83cf7078f0287fb
x86: AVX512 interp_8tap_vert_sp_24x32 and interp_8tap_vert_ss_24x32

luma_vss
AVX2 performance   : 10.98x
AVX512 performance : 16.36x

luma_vsp
AVX2 performance   : 12.19x
AVX512 performance : 17.20x

diff -r ecaf36f641dd -r 47b99c09008b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Nov 24 11:34:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Nov 24 12:09:05 2017 +0530
@@ -2844,6 +2844,7 @@
 p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512);
 p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512);
 p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512);
+p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512);
 p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512);
 p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512);
 p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512);
@@ -2864,6 +2865,7 @@
 p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
 p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
 p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
+p.pu[LUMA_24x32].luma_vsp = PFX(interp_8tap_vert_sp_24x32_avx512);
 p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
 p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
 p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
diff -r ecaf36f641dd -r 47b99c09008b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Fri Nov 24 11:34:33 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Fri Nov 24 12:09:05 2017 +0530
@@ -11134,6 +11134,284 @@
 FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
 %endif
 
+%macro PROCESS_LUMA_VERT_S_24x8_AVX512 1
+PROCESS_LUMA_VERT_S_16x4_AVX512 %1
+lea  r4,  [r6 + 4 * r1]
+lea  r8,  [r4 + 4 * r1]
+movu ym1, [r6]
+movu ym3, [r6 + r1]
+vinserti32x8 m1,  [r6 + 2 * r1],  1
+vinserti32x8 m3,  [r6 + r7],  1
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu ym4, [r6 + 2 * r1]
+vinserti32x8 m4,  [r4],   1
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+
+movu ym5, [r6 + r7]
+vinserti32x8 m5,  [r4 + r1],  1
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+
+padddm0,  m6
+padddm1,  m4
+
+movu ym4, [r4]
+vinserti32x8 m4,  [r4 + 2 * r1],  1
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  m16
+punpckhwdm5,  m4
+pmaddwd  m5,  m16
+
+padddm2,  m6
+padddm3,  m5
+
+movu ym11,[r4 + r1]
+vinserti32x8 m11, [r4 + r7],  1
+punpcklwdm8,  m4, m11
+pmaddwd  m8,  m17
+punpckhwdm4,  m11
+pmaddwd  m4,  m17
+
+movu ym12,[r4 + 2 * r1]
+vinserti32x8 m12, [r4 + 4 * r1],  1
+punpcklwdm10, m11,m12
+pmaddwd  m10, m17
+punpckhwdm11, m12
+pmaddwd  m11, m17
+
+movu ym13,[r4 + r7]
+vinserti32x8 m13, [r8 + r1],  1
+punpcklwdm14, m12,m13
+pmaddwd  m14,  

[x265] [PATCH 228 of 307] x86: AVX512 interp_4tap_vert_sp_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512041776 -19800
#  Thu Nov 30 17:06:16 2017 +0530
# Node ID e77ef4964dd04de6a8b84378f7a46219f34bf1b5
# Parent  9c652d9062d29607cdb3392567817e4e2ab7f6bb
x86: AVX512 interp_4tap_vert_sp_48x64

AVX2 performance   : 11.93x
AVX512 performance : 23.59x

diff -r 9c652d9062d2 -r e77ef4964dd0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 30 17:01:28 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 30 17:06:16 2017 +0530
@@ -4998,6 +4998,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = 
PFX(interp_4tap_vert_sp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = 
PFX(interp_4tap_vert_sp_32x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = 
PFX(interp_4tap_vert_sp_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = 
PFX(interp_4tap_vert_sp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = 
PFX(interp_4tap_vert_sp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = 
PFX(interp_4tap_vert_sp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = 
PFX(interp_4tap_vert_sp_64x32_avx512);
diff -r 9c652d9062d2 -r e77ef4964dd0 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Nov 30 17:01:28 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Nov 30 17:06:16 2017 +0530
@@ -11728,114 +11728,122 @@
 FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
 FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
 %endif
-%macro PROCESS_CHROMA_VERT_SS_48x4_AVX512 0
-movu  m1, [r0]
+
+%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
+PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
 lea   r6, [r0 + 2 * r1]
-movu  m10,[r6]
-movu  m3, [r0 + r1]
-movu  m12,[r6 + r1]
+
+movu  m1, [r6]
+movu  m3, [r6 + r1]
 punpcklwd m0, m1,  m3
-punpcklwd m9, m10, m12
-pmaddwd   m0, m16
-pmaddwd   m9, m16
+pmaddwd   m0, m7
 punpckhwd m1, m3
-punpckhwd m10,m12
-pmaddwd   m1, m16
-pmaddwd   m10,m16
-
-movu  m4, [r0 + 2 * r1]
-movu  m13,[r6 + 2 * r1]
+pmaddwd   m1, m7
+movu  m4, [r6 + 2 * r1]
 punpcklwd m2, m3,  m4
-punpcklwd m11,m12, m13
-pmaddwd   m2, m16
-pmaddwd   m11,m16
+pmaddwd   m2, m7
 punpckhwd m3, m4
-punpckhwd m12,m13
-pmaddwd   m3, m16
-pmaddwd   m12,m16
-
-movu  m5, [r0 + r7]
-movu  m14,[r6 + r7]
+pmaddwd   m3, m7
+
+movu  m5, [r6 + r4]
 punpcklwd m6, m4,  m5
-punpcklwd m15,m13, m14
-pmaddwd   m6, m17
-pmaddwd   m15,m17
+pmaddwd   m6, m8
 paddd m0, m6
-paddd m9, m15
 punpckhwd m4, m5
-punpckhwd m13,m14
-pmaddwd   m4, m17
-pmaddwd   m13,m17
+pmaddwd   m4, m8
 paddd m1, m4
-paddd m10,m13
-
-movu  m4, [r0 + 4 * r1]
-movu  m13,[r6 + 4 * r1]
+
+movu  m4, [r6 + 4 * r1]
 punpcklwd m6, m5,  m4
-punpcklwd m15,m14, m13
-pmaddwd   m6, m17
-pmaddwd   m15,m17
+pmaddwd   m6, m8
 paddd m2, m6
-paddd m11,m15
 punpckhwd m5, m4
- 

[x265] [PATCH 229 of 307] [x265-avx512]x86: AVX512 denoise DCT

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1512036841 -19800
#  Thu Nov 30 15:44:01 2017 +0530
# Node ID f86b11b8c629b0e4bf8342d42a0e9c475d7c3a7d
# Parent  e77ef4964dd04de6a8b84378f7a46219f34bf1b5
[x265-avx512]x86: AVX512 denoise DCT

diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 30 15:44:01 2017 +0530
@@ -2888,6 +2888,7 @@
 p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
 p.quant = PFX(quant_avx512);
 p.nquant = PFX(nquant_avx512);
+p.denoiseDct = PFX(denoise_dct_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = 
PFX(interp_4tap_horiz_ps_32x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
@@ -5068,6 +5069,7 @@
 p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
 p.quant = PFX(quant_avx512);
 p.nquant = PFX(nquant_avx512);
+p.denoiseDct = PFX(denoise_dct_avx512);
 }
 #endif
 }
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmThu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.asmThu Nov 30 15:44:01 2017 +0530
@@ -2357,6 +2357,67 @@
 dec  r3d
 jnz .loop
 RET
+%if ARCH_X86_64 == 1                     ; body uses zmm16-zmm21, which are only addressable in 64-bit mode
+INIT_ZMM avx512
+cglobal denoise_dct, 4, 4, 22            ; per coefficient: sum += |dct|; dct = sign(dct) * max(|dct| - offset, 0). r0 = dct (int16), r1 = sum (uint32), r2 = offset (uint16), r3d = size
+pxor m16,  m16                           ; m16 = 0, reference operand for the "> 0" compares below
+sub  r3d,   16                           ; sets ZF when size == 16
+je   .coeff16                            ; 16 coefficients: single 256-bit pass, no loop
+add  r3d,   16                           ; undo the subtract so r3d holds size again
+shr  r3d,5                               ; iteration count = size / 32 (one zmm = 32 words); assumes size is a multiple of 32 here - TODO confirm against callers
+jmp  .loop
+
+.coeff16:
+movu  ym19,  [r0]                        ; load 16 coefficients
+pabsw ym17, ym19                         ; ym17 = |dct|
+movum2, [r1]                             ; load the 16 dword sums
+pmovsxwd   m18, ym17                     ; widen |dct| words to dwords
+paddd   m2,  m18                         ; sum += |dct|
+movu  [r1],   m2                         ; store updated sums
+movu   ym3, [r2]                         ; load 16 offsets
+psubusw   ym17, ym3                      ; |dct| - offset with unsigned saturation (floors at 0)
+pcmpgtw   ym18, ym17, ym16               ; all-ones in lanes that compare > 0 as signed words
+pand  ym17, ym18                         ; zero every lane that did not compare > 0
+psignwym17, ym19                         ; reapply the sign of the original coefficient
+movu  [r0], ym17                         ; write denoised coefficients back in place
+RET
+
+.loop:
+movu  m21, [r0]                          ; load 32 coefficients
+pabsw m17, m21                           ; m17 = |dct|
+movu   m2, [r1]                          ; sums for coefficients 0..15 of this group
+pmovsxwd   m4, ym17                      ; widen low 16 |dct| words to dwords
+paddd  m2,  m4                           ; sum += |dct| (low half)
+movu [r1],  m2
+vextracti64x4 ym4, m17, 1                ; upper 16 |dct| words
+
+movu   m2, [r1 + mmsize]                 ; sums for coefficients 16..31 of this group
+pmovsxwd   m3, ym4                       ; widen high 16 |dct| words to dwords
+paddd  m2, m3                            ; sum += |dct| (high half)
+movu   [r1 + mmsize], m2
+movu   m3, [r2]                          ; load 32 offsets
+psubusw   m17, m3                        ; |dct| - offset with unsigned saturation (floors at 0)
+
+vextracti64x4 ym20,  m17,1               ; upper half of the thresholded values
+pcmpgtw   ym18, ym17, ym16               ; "> 0" mask for the low 256 bits
+pcmpgtw   ym19, ym20, ym16               ; "> 0" mask for the high 256 bits
+vinserti64x4   m18,  m18, ym19, 1        ; merge both half masks into one 512-bit vector mask
+
+pand   m17,  m18                         ; zero every lane that did not compare > 0
+vextracti64x4 ym19,  m17, 1              ; upper half of masked magnitudes
+vextracti64x4 ym20,  m21, 1              ; upper half of original coefficients (sign source)
+psignwym17, ym21                         ; restore sign, low half
+psignwym19, ym20                         ; restore sign, high half
+vinserti64x4   m17,  m17, ym19, 1        ; recombine the two signed halves
+
+movu  [r0],  m17                         ; write 32 denoised coefficients back in place
+add r0,  mmsize                          ; advance dct by 32 int16
+add r1,  mmsize * 2                      ; advance sum by 32 uint32
+add r2,  mmsize                          ; advance offset by 32 uint16
+dec r3d
+jnz .loop
+RET
+%endif ; ARCH_X86_64 == 1
 
 %if ARCH_X86_64 == 1
 %macro DCT8_PASS_1 4
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h  Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.h  Thu Nov 30 15:44:01 2017 +0530
@@ -42,7 +42,7 @@
 void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* 
offset, int size);
 void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* 
offset, int size);
-
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* 
offset, int size);
 void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
 void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 245 of 307] x86: AVX512 interp_4tap_vert_ps_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1512455478 -19800
#  Tue Dec 05 12:01:18 2017 +0530
# Node ID c335a7ca4304001e245dea7977cde1c2e0c0a8ee
# Parent  81a870948ac446b36c248325e0c7264cf8f3f09e
x86: AVX512 interp_4tap_vert_ps_32xN

i420
Size  |  AVX2 performance | AVX512 performance
--
32x8  |  36.28x   |  47.86x
32x16 |  40.43x   |  51.57x
32x24 |  40.96x   |  54.05x
32x32 |  40.12x   |  54.27x

i422
Size  |  AVX2 performance | AVX512 performance
--
32x16 |  39.84x   |  51.35x
32x32 |  39.86x   |  54.17x
32x48 |  41.14x   |  54.85x
32x64 |  42.00x   |  56.50x


i444
Size  |  AVX2 performance | AVX512 performance
--
32x8  |  36.08x   |  47.61x
32x16 |  39.96x   |  51.41x
32x24 |  40.38x   |  54.51x
32x32 |  40.07x   |  54.56x
32x64 |  41.94x   |  56.59x

diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Dec 07 15:31:54 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 05 12:01:18 2017 +0530
@@ -5158,6 +5158,23 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = 
PFX(interp_4tap_vert_ps_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = 
PFX(interp_4tap_vert_ps_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = 
PFX(interp_4tap_vert_ps_64x16_avx512);
+
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = 
PFX(interp_4tap_vert_ps_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = 
PFX(interp_4tap_vert_ps_32x8_avx512);
+
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = 
PFX(interp_4tap_vert_ps_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = 
PFX(interp_4tap_vert_ps_32x48_avx512);
+
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = 
PFX(interp_4tap_vert_ps_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = 
PFX(interp_4tap_vert_ps_32x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = 
PFX(interp_4tap_vert_ps_32x64_avx512);
+
 }
 #endif
 }
diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Dec 07 15:31:54 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Tue Dec 05 12:01:18 2017 +0530
@@ -10951,7 +10951,7 @@
 FILTER_VER_PP_CHROMA_16xN_AVX512 64
 %endif
 
-%macro PROCESS_CHROMA_VERT_PP_32x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
 movu  ym1,[r0]
 movu  ym3,[r0 + r1]
 vinserti32x8  m1, [r0 + 2 * r1],   1
@@ -10988,25 +10988,45 @@
 pmaddubsw m5, m9
 paddw m3, m5
 
+%ifidn %1,pp
 pmulhrsw  m0, m7
 pmulhrsw  m1, m7
 pmulhrsw  m2, m7
 pmulhrsw  m3, m7
-
 packuswb  m0, m1
 packuswb  m2, m3
 movu  [r2],   ym0
 movu  [r2 + r3],  ym2
 vextracti32x8 [r2 + 2 * r3],  m0,  1
 vextracti32x8 [r2 + r7],  m2,  1
+%else
+psubw m0, m7
+psubw m1, m7
+psubw m2, m7
+psubw m3, m7
+
+mova  m4, m10
+mova  m5, m11
+vpermi2q  m4, m0,m1
+vpermi2q  m5, m0,m1
+mova  m6, m10
+mova  m12,m11
+vpermi2q  m6, m2,m3
+vpermi2q  m12, m2,m3
+
+movu  

[x265] [PATCH 232 of 307] x86: AVX512 optimise interp_4tap_vert_pp_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512377610 -19800
#  Mon Dec 04 14:23:30 2017 +0530
# Node ID 3e8615bc86537e07754a1c023ade702a837042a8
# Parent  465682e66d91ecf207feae78c33e32f0eaaf45c4
x86: AVX512 optimise interp_4tap_vert_pp_16xN

i444
Size  |  AVX2 performance | AVX512 performance
--
16x4  |   26.22x  |  32.07x
16x12 |   30.95x  |  40.01x

diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Dec 04 12:33:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Dec 04 14:23:30 2017 +0530
@@ -4893,9 +4893,11 @@
 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = 
PFX(interp_4tap_horiz_ps_48x64_avx512);
 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = 
PFX(interp_4tap_vert_pp_16x4_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = 
PFX(interp_4tap_vert_pp_16x12_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
@@ -4927,6 +4929,7 @@
 
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512);
@@ -4960,7 +4963,9 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = 
PFX(interp_4tap_vert_sp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = 
PFX(interp_4tap_vert_sp_32x64_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = 
PFX(interp_4tap_vert_pp_16x4_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = 
PFX(interp_4tap_vert_pp_16x12_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Dec 04 12:33:32 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Dec 04 14:23:30 2017 +0530
@@ -10866,96 +10866,50 @@
 
;-
 ;avx512 chroma_vpp code start
 
;-
-%macro PROCESS_CHROMA_VERT_PP_16x8_AVX512 0
+%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
+lea   r5, [r0 + 4 * r1]
 movu  xm1,[r0]
-lea   r8, [r0 + 4 * r1]
-lea   r9, [r8 + 2 * r1]
-vinserti32x4  m1, [r0 + 2 * r1],1
-vinserti32x4  m1, [r8],2
-vinserti32x4  m1, [r9],3
 movu  xm3,[r0 + r1]
-vinserti32x4  m3, [r0 + r6],   1
-vinserti32x4  m3, [r8 + r1],   2
-vinserti32x4  m3, [r9 + r1],   3
+vinserti32x4  m1, [r0 + r1],   1
+vinserti32x4  m3, [r0 + 2 * r1],   1
+vinserti32x4  m1, [r0 + 2 * r1],   2
+

[x265] [PATCH 243 of 307] [x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1512637265 -19800
#  Thu Dec 07 14:31:05 2017 +0530
# Node ID 0ffc9c56a0a7361e98e6388e3067e4a78e8cd252
# Parent  931dd781dc0c6de76bb31d0215db7a7af885f9bf
[x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth

Size  |  AVX2 performance | AVX512 performance
--
16x8  |  16.34x   |  17.91x
16x12 |  17.38x   |  18.82x
16x16 |  17.90x   |  20.07x
16x32 |  18.39x   |  21.77x
16x64 |  18.00x   |  22.43x

diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Dec 07 11:07:35 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Dec 07 14:31:05 2017 +0530
@@ -2495,6 +2495,11 @@
 p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
 p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512);
 
+p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512);
+p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512);
+p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512);
+p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx512);
+p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx512);
 p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
 p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
 p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Dec 07 11:07:35 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Dec 07 14:31:05 2017 +0530
@@ -2443,6 +2443,54 @@
 %endmacro
 
 
+%macro PROCESS_SAD_X3_16x4_AVX512 0      ; adds the absolute differences of 4 rows x 16 words for three references into m0/m1/m2; expects r0 = fenc, r1-r3 = refs, r4 = ref row stride in bytes, r6 = 3 * r4, m7 = multiplier loaded from pw_1 by the callers
+movuym6, [r0]                            ; fenc row 0 -> low 256 bits
+vinserti64x4 m6, [r0 + 2 * FENC_STRIDE],  1 ; fenc row 1 -> high 256 bits
+movuym3, [r1]                            ; ref0 row 0
+vinserti64x4 m3, [r1 + r4],  1           ; ref0 row 1
+movuym4, [r2]                            ; ref1 row 0
+vinserti64x4 m4, [r2 + r4],  1           ; ref1 row 1
+movuym5, [r3]                            ; ref2 row 0
+vinserti64x4 m5, [r3 + r4],  1           ; ref2 row 1
+
+psubw   m3, m6                           ; ref - fenc, per word
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3                           ; |ref - fenc|
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7                           ; multiply by m7 and add adjacent word pairs into dwords (a pairwise sum if pw_1 is all ones - presumably; defined elsewhere)
+paddd   m0, m3                           ; accumulate ref0
+pmaddwd m4, m7
+paddd   m1, m4                           ; accumulate ref1
+pmaddwd m5, m7
+paddd   m2, m5                           ; accumulate ref2
+
+movuym6, [r0 + 4 * FENC_STRIDE]          ; fenc row 2
+vinserti64x4 m6, [r0 + 6 * FENC_STRIDE],  1 ; fenc row 3
+movuym3, [r1 + 2 * r4]                   ; ref0 row 2
+vinserti64x4 m3, [r1 + r6],  1           ; ref0 row 3 (r6 = 3 * r4)
+movuym4, [r2 + 2 * r4]                   ; ref1 row 2
+vinserti64x4 m4, [r2 + r6],  1           ; ref1 row 3
+movuym5, [r3 + 2 * r4]                   ; ref2 row 2
+vinserti64x4 m5, [r3 + r6],  1           ; ref2 row 3
+
+psubw   m3, m6                           ; ref - fenc, per word
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3                           ; |ref - fenc|
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7                           ; widen to dword partial sums, as above
+paddd   m0, m3                           ; accumulate ref0
+pmaddwd m4, m7
+paddd   m1, m4                           ; accumulate ref1
+pmaddwd m5, m7
+paddd   m2, m5                           ; accumulate ref2
+%endmacro
+
 
 %macro PROCESS_SAD_X3_32x4_AVX512 0
 movum6, [r0]
@@ -2700,6 +2748,118 @@
 
 
 
;--
+; void pixel_sad_x3_16x%1( const pixel* pix1, const pixel* pix2, const pixel* 
pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
+;--
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x8, 6,7,8         ; SAD of one 16x8 block against three references; r0 = fenc, r1-r3 = refs, r4 = frefstride, r5 = res (per the prototype comment above)
+pxorm0,  m0                              ; clear accumulator for ref0
+pxorm1,  m1                              ; clear accumulator for ref1
+pxorm2,  m2                              ; clear accumulator for ref2
+
+vbroadcasti32x8 m7, [pw_1]               ; pmaddwd multiplier used by the row macro (pw_1 defined elsewhere; presumably words of 1)
+
+add r4d, r4d                             ; stride * 2: samples are 16-bit in this high-bit-depth file, so this yields bytes per row; 32-bit op zero-extends into r4
+lea r6d, [r4 * 3]                        ; r6 = 3 * byte stride, used for row 3 addressing
+
+PROCESS_SAD_X3_16x4_AVX512               ; rows 0..3
+add r0, FENC_STRIDE * 8                  ; advance fenc by 4 rows (2 * FENC_STRIDE bytes per row)
+lea r1, [r1 + r4 * 4]                    ; advance each reference by 4 rows
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+PROCESS_SAD_X3_16x4_AVX512               ; rows 4..7
+PROCESS_SAD_X3_END_AVX512                ; defined elsewhere; presumably reduces m0-m2 and stores the three sums via r5 - TODO confirm
+RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x12, 6,7,8        ; SAD of one 16x12 block against three references; r0 = fenc, r1-r3 = refs, r4 = frefstride, r5 = res (per the prototype comment above)
+pxorm0,  m0                              ; clear accumulator for ref0
+pxorm1,  m1                              ; clear accumulator for ref1
+pxorm2,  m2                              ; clear accumulator for ref2
+
+vbroadcasti32x8 m7, [pw_1]               ; pmaddwd multiplier used by the row macro (pw_1 defined elsewhere; presumably words of 1)
+
+add r4d, r4d                             ; stride * 2: samples are 16-bit in this high-bit-depth file, so this yields bytes per row; 32-bit op zero-extends into r4
+lea r6d, [r4 * 3]                        ; r6 = 3 * byte stride, used for row 3 addressing
+%rep 2                                   ; two 4-row groups followed by a pointer advance (rows 0..7)
+PROCESS_SAD_X3_16x4_AVX512
+add r0, FENC_STRIDE * 8                  ; advance fenc by 4 rows (2 * FENC_STRIDE bytes per row)
+lea r1, [r1 + r4 * 4]                    ; advance each reference by 4 rows
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+%endrep
+PROCESS_SAD_X3_16x4_AVX512               ; final group, rows 8..11 (no advance needed afterwards)
+PROCESS_SAD_X3_END_AVX512                ; defined elsewhere; presumably reduces m0-m2 and stores the three sums via r5 - TODO confirm
+RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x16, 6,7,8        ; SAD of one 16x16 block against three references; r0 = fenc, r1-r3 = refs, r4 = frefstride, r5 = res (per the prototype comment above)
+pxorm0,  m0                              ; clear accumulator for ref0
+pxorm1,  m1                              ; clear accumulator for ref1
+pxorm2,  m2                              ; clear accumulator for ref2
+
+vbroadcasti32x8 m7, [pw_1]               ; pmaddwd multiplier used by the row macro (pw_1 defined elsewhere; presumably words of 1)
+
+add r4d, r4d                             ; stride * 2: samples are 16-bit in this high-bit-depth file, so this yields bytes per row; 32-bit op zero-extends into r4
+lea r6d, [r4 * 3]                        ; r6 = 3 * byte stride, used for row 3 addressing
+
+%rep 3                                   ; three 4-row groups followed by a pointer advance (rows 0..11)
+PROCESS_SAD_X3_16x4_AVX512
+add r0, FENC_STRIDE * 8                  ; advance fenc by 4 rows (2 * FENC_STRIDE bytes per row)
+lea r1, [r1 + r4 * 4]                    ; advance each reference by 4 rows
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+%endrep
+PROCESS_SAD_X3_16x4_AVX512               ; final group, rows 12..15 (no advance needed afterwards)
+PROCESS_SAD_X3_END_AVX512                ; defined elsewhere; presumably reduces m0-m2 and stores the three sums via r5 - TODO confirm
+RET
+
+INIT_ZMM avx512

[x265] [PATCH 221 of 307] x86: AVX512 interp_8tap_vert_sp_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511953084 -19800
#  Wed Nov 29 16:28:04 2017 +0530
# Node ID 834a8f52a976a6c5da294267392bcd6da1aa6d6e
# Parent  9f2c4a0d09f3405f9c28cd3ebf229617c2278681
x86: AVX512 interp_8tap_vert_sp_64xN

Size  |  AVX2 performance  | AVX512 performance
--
64x16 |   12.23x   |  21.04x
64x32 |   12.24x   |  22.10x
64x48 |   12.28x   |  22.19x
64x64 |   12.26x   |  22.23x

diff -r 9f2c4a0d09f3 -r 834a8f52a976 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Nov 29 15:08:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 29 16:28:04 2017 +0530
@@ -2886,7 +2886,10 @@
 p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
 p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
 p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
-
+p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
+p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512);
+p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512);
+p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512);
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
 p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
diff -r 9f2c4a0d09f3 -r 834a8f52a976 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Nov 29 15:08:25 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Nov 29 16:28:04 2017 +0530
@@ -13702,86 +13702,8 @@
 PROCESS_LUMA_VERT_SS_48x4_AVX512
 RET
 %endif
-
-%macro PROCESS_LUMA_VERT_SS_64x2_AVX512 0
-movu m1,  [r0]   
;0 row
-movu m3,  [r0 + r1]  
;1 row
-punpcklwdm0,  m1, m3
-pmaddwd  m0,  m15
-punpckhwdm1,  m3
-pmaddwd  m1,  m15
-
-movu m4,  [r0 + 2 * r1]  
;2 row
-punpcklwdm2,  m3, m4
-pmaddwd  m2,  m15
-punpckhwdm3,  m4
-pmaddwd  m3,  m15
-
-movu m5,  [r0 + r7]  
;3 row
-punpcklwdm6,  m4, m5
-pmaddwd  m6,  m16
-punpckhwdm4,  m5
-pmaddwd  m4,  m16
-
-padddm0,  m6
-padddm1,  m4
-
-movu m4,  [r0 + 4 * r1]  
;4 row
-punpcklwdm6,  m5, m4
-pmaddwd  m6,  m16
-punpckhwdm5,  m4
-pmaddwd  m5,  m16
-
-padddm2,  m6
-padddm3,  m5
-
-lea  r6,  [r0 + 4 * r1]
-
-movu m11, [r6 + r1]  
;5 row
-punpcklwdm8,  m4, m11
-pmaddwd  m8,  m17
-punpckhwdm4,  m11
-pmaddwd  m4,  m17
-
-movu m12, [r6 + 2 * r1]  
;6 row
-punpcklwdm10, m11,m12
-pmaddwd  m10, m17
-punpckhwdm11, m12
-pmaddwd  m11, m17
-
-movu m13, [r6 + r7]  
;7 row
-punpcklwdm14, m12,m13
-pmaddwd  m14, m18
-punpckhwdm12, m13
-pmaddwd  m12, m18
-
-padddm8,  m14
-padddm4,  m12
-padddm0,  m8
-padddm1,  m4
-
-movu m12, [r6 + 4 * r1] ; 
8 row
-punpcklwdm14, m13,m12
-pmaddwd  m14, m18
-punpckhwdm13, m12
-pmaddwd  m13, m18
-
-padddm10, m14
-padddm11, m13
- 

[x265] [PATCH 238 of 307] x86: AVX512 interp_8tap_vert_pp_16xN and interp_8tap_vert_ps_16xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512465067 -19800
#  Tue Dec 05 14:41:07 2017 +0530
# Node ID f92128e41ac3c1da210c1c665d97061539821aaf
# Parent  ca6bb5919227672e0cf98b785acf099531c32945
x86: AVX512 interp_8tap_vert_pp_16xN  and interp_8tap_vert_ps_16xN for high bit 
depth

luma_vpp
Size  |  AVX2 performance | AVX512 performance
--
16x4  |   8.32x   |  13.14x
16x8  |  10.69x   |  15.14x
16x12 |  11.62x   |  15.94x
16x16 |  12.19x   |  15.97x
16x32 |  12.24x   |  16.59x
16x64 |  12.57x   |  16.50x

luma_vps
Size  |  AVX2 performance | AVX512 performance
--
16x4  |   8.04x   |  15.37x
16x8  |   9.72x   |  14.97x
16x12 |  10.47x   |  14.71x
16x16 |   9.79x   |  15.21x
16x32 |   9.66x   |  15.60x
16x64 |  11.16x   |  15.67x

diff -r ca6bb5919227 -r f92128e41ac3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 05 13:28:42 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 05 14:41:07 2017 +0530
@@ -2882,6 +2882,12 @@
 p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
 p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
 
+p.pu[LUMA_16x4].luma_vpp = PFX(interp_8tap_vert_pp_16x4_avx512);
+p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512);
+p.pu[LUMA_16x12].luma_vpp = PFX(interp_8tap_vert_pp_16x12_avx512);
+p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
+p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
+p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
 p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
 p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
 p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
@@ -2892,6 +2898,12 @@
 p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
 p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
 
+p.pu[LUMA_16x4].luma_vps = PFX(interp_8tap_vert_ps_16x4_avx512);
+p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512);
+p.pu[LUMA_16x12].luma_vps = PFX(interp_8tap_vert_ps_16x12_avx512);
+p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
+p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
+p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
 p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
 p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
 p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
diff -r ca6bb5919227 -r f92128e41ac3 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Dec 05 13:28:42 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Dec 05 14:41:07 2017 +0530
@@ -12930,6 +12930,169 @@
 
;-
 ;avx512 luma_vpp and luma_vps code start
 
;-
+%macro PROCESS_LUMA_VERT_P_16x4_AVX512 1
+lea  r6,  [r0 + 4 * r1]
+movu ym1, [r0]
+movu ym3, [r0 + r1]
+vinserti32x8 m1,  [r0 + 2 * r1],  1
+vinserti32x8 m3,  [r0 + r7],  1
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu ym4, [r0 + 2 * r1]
+vinserti32x8 m4,  [r0 + 4 * r1],  1
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+
+movu ym5, [r0 + r7]
+vinserti32x8 m5,  [r6 + r1],  1
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+
+padddm0,  m6
+padddm1,  m4
+
+movu ym4, [r6]
+vinserti32x8 m4,  [r6 + 2 * r1],  

[x265] [PATCH 240 of 307] x86: AVX512 interp_8tap_vert_pp_24xN and interp_vert_ps_24xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512537795 -19800
#  Wed Dec 06 10:53:15 2017 +0530
# Node ID 2d298099a8d6b266a32b975de4b6a369988d3887
# Parent  8b1c9d9c5bd8135dc11b6d031b990bfe47e3bcd8
x86: AVX512 interp_8tap_vert_pp_24xN and interp_vert_ps_24xN for high bit depth

luma_vpp
AVX2 performance   : 11.91x
AVX512 performance : 15.77x

luma_vps
AVX2 performance   : 10.36x
AVX512 performance : 14.20x

diff -r 8b1c9d9c5bd8 -r 2d298099a8d6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 05 17:30:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Dec 06 10:53:15 2017 +0530
@@ -2888,6 +2888,7 @@
 p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
 p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
 p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
+p.pu[LUMA_24x32].luma_vpp = PFX(interp_8tap_vert_pp_24x32_avx512);
 p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
 p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
 p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
@@ -2905,6 +2906,7 @@
 p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
 p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
 p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
+p.pu[LUMA_24x32].luma_vps = PFX(interp_8tap_vert_ps_24x32_avx512);
 p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
 p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
 p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
diff -r 8b1c9d9c5bd8 -r 2d298099a8d6 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Dec 05 17:30:30 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Wed Dec 06 10:53:15 2017 +0530
@@ -12931,7 +12931,7 @@
 ;avx512 luma_vpp and luma_vps code start
 
;-
 %macro PROCESS_LUMA_VERT_P_16x4_AVX512 1
-lea  r6,  [r0 + 4 * r1]
+lea  r5,  [r0 + 4 * r1]
 movu ym1, [r0]
 movu ym3, [r0 + r1]
 vinserti32x8 m1,  [r0 + 2 * r1],  1
@@ -12949,7 +12949,7 @@
 pmaddwd  m3,  m15
 
 movu ym5, [r0 + r7]
-vinserti32x8 m5,  [r6 + r1],  1
+vinserti32x8 m5,  [r5 + r1],  1
 punpcklwdm6,  m4, m5
 pmaddwd  m6,  m16
 punpckhwdm4,  m5
@@ -12958,8 +12958,8 @@
 padddm0,  m6
 padddm1,  m4
 
-movu ym4, [r6]
-vinserti32x8 m4,  [r6 + 2 * r1],  1
+movu ym4, [r5]
+vinserti32x8 m4,  [r5 + 2 * r1],  1
 punpcklwdm6,  m5, m4
 pmaddwd  m6,  m16
 punpckhwdm5,  m4
@@ -12968,22 +12968,22 @@
 padddm2,  m6
 padddm3,  m5
 
-lea  r4,  [r6 + 4 * r1]
-movu ym11,[r6 + r1]
-vinserti32x8 m11, [r6 + r7],  1
+lea  r4,  [r5 + 4 * r1]
+movu ym11,[r5 + r1]
+vinserti32x8 m11, [r5 + r7],  1
 punpcklwdm8,  m4, m11
 pmaddwd  m8,  m17
 punpckhwdm4,  m11
 pmaddwd  m4,  m17
 
-movu ym12,[r6 + 2 * r1]
+movu ym12,[r5 + 2 * r1]
 vinserti32x8 m12, [r4],   1
 punpcklwdm10, m11,m12
 pmaddwd  m10, m17
 punpckhwdm11, m12
 pmaddwd  m11, m17
 
-movu ym13,[r6 + r7]
+movu ym13,[r5 + r7]
 vinserti32x8 m13, [r4 + r1],  1
 punpcklwdm14, m12,m13
 pmaddwd  m14,   

[x265] [PATCH 234 of 307] x86: AVX512 interp_4tap_vert_pp_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512389309 -19800
#  Mon Dec 04 17:38:29 2017 +0530
# Node ID 283aa4d77cef296699167c041763d7115e7a88aa
# Parent  ae75b2d09d10f28391d573507c13512360593386
x86: AVX512 interp_4tap_vert_pp_48x64

AVX2 performance   : 43.04x
AVX512 performance : 51.46x

diff -r ae75b2d09d10 -r 283aa4d77cef source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Dec 04 15:05:04 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Dec 04 17:38:29 2017 +0530
@@ -4977,6 +4977,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = 
PFX(interp_4tap_vert_pp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = 
PFX(interp_4tap_vert_pp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = 
PFX(interp_4tap_vert_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
diff -r ae75b2d09d10 -r 283aa4d77cef source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Dec 04 15:05:04 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Dec 04 17:38:29 2017 +0530
@@ -11038,6 +11038,125 @@
 FILTER_VER_PP_CHROMA_32xN_AVX512 64
 %endif
 
+%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0
+movu  ym1,[r0]
+movu  ym3,[r0 + r1]
+vinserti32x8  m1, [r0 + 2 * r1],   1
+vinserti32x8  m3, [r0 + r6],   1
+punpcklbw m0, m1,  m3
+pmaddubsw m0, m8
+punpckhbw m1, m3
+pmaddubsw m1, m8
+
+movu  ym4,[r0 + 2 * r1]
+vinserti32x8  m4, [r0 + 4 * r1],   1
+punpcklbw m2, m3,  m4
+pmaddubsw m2, m8
+punpckhbw m3, m4
+pmaddubsw m3, m8
+
+lea   r5, [r0 + 4 * r1]
+
+movu  ym5,[r0 + r6]
+vinserti32x8  m5, [r5 + r1],   1
+punpcklbw m6, m4,  m5
+pmaddubsw m6, m9
+paddw m0, m6
+punpckhbw m4, m5
+pmaddubsw m4, m9
+paddw m1, m4
+
+movu  ym4,[r0 + 4 * r1]
+vinserti32x8  m4, [r5 + 2 * r1],   1
+punpcklbw m6, m5,  m4
+pmaddubsw m6, m9
+paddw m2, m6
+punpckhbw m5, m4
+pmaddubsw m5, m9
+paddw m3, m5
+
+pmulhrsw  m0, m7
+pmulhrsw  m1, m7
+pmulhrsw  m2, m7
+pmulhrsw  m3, m7
+
+packuswb  m0, m1
+packuswb  m2, m3
+movu  [r2],   ym0
+movu  [r2 + r3],  ym2
+vextracti32x8 [r2 + 2 * r3],  m0,  1
+vextracti32x8 [r2 + r7],  m2,  1
+
+movu  xm1,[r0 + mmsize/2]
+movu  xm3,[r0 + r1 + mmsize/2]
+vinserti32x4  m1, [r0 + r1 + mmsize/2],   1
+vinserti32x4  m3, [r0 + 2 * r1 + mmsize/2],   1
+vinserti32x4  m1, [r0 + 2 * r1 + mmsize/2],   2
+vinserti32x4  m3, [r0 + r6 + mmsize/2],   2
+vinserti32x4  m1, [r0 + r6 + mmsize/2],   3
+vinserti32x4  m3, [r0 + 4 * r1 + mmsize/2],   3
+
+punpcklbw m0, m1,  m3
+pmaddubsw m0, m8
+punpckhbw m1, m3
+pmaddubsw m1, m8
+
+movu  xm4,[r0 + 2 * r1 + mmsize/2]
+movu  xm5,[r0 + r6 + mmsize/2]
+vinserti32x4  m4,   

[x265] [PATCH 244 of 307] [x265-avx512]x86: AVX512 sad_x4_16xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1512640914 -19800
#  Thu Dec 07 15:31:54 2017 +0530
# Node ID 81a870948ac446b36c248325e0c7264cf8f3f09e
# Parent  0ffc9c56a0a7361e98e6388e3067e4a78e8cd252
[x265-avx512]x86: AVX512 sad_x4_16xN for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x8  |  16.33x   |  18.34x
16x12 |  15.79x   |  19.91x
16x16 |  15.73x   |  18.82x
16x32 |  17.13x   |  20.72x
16x64 |  17.72x   |  23.04x

diff -r 0ffc9c56a0a7 -r 81a870948ac4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Dec 07 14:31:05 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Dec 07 15:31:54 2017 +0530
@@ -2511,6 +2511,11 @@
 p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
 p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
 
+p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx512);
+p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx512);
+p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx512);
+p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx512);
+p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx512);
 p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
diff -r 0ffc9c56a0a7 -r 81a870948ac4 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Dec 07 14:31:05 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Dec 07 15:31:54 2017 +0530
@@ -2124,6 +2124,67 @@
 ; SAD x3/x4 avx512 code start
 ;
 
+%macro PROCESS_SAD_X4_16x4_AVX512 0
+movuym8, [r0]
+vinserti64x4 m8, [r0 + 2 * FENC_STRIDE],  1
+movuym4, [r1]
+vinserti64x4 m4, [r1 + r5],  1
+movuym5, [r2]
+vinserti64x4 m5, [r2 + r5],  1
+movuym6, [r3]
+vinserti64x4 m6, [r3 + r5],  1
+movuym7, [r4]
+vinserti64x4 m7, [r4 + r5],  1
+
+
+psubw   m4, m8
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+movuym8, [r0 + 4 * FENC_STRIDE]
+vinserti64x4 m8, [r0 + 6 * FENC_STRIDE],  1
+movuym4, [r1 + 2 * r5]
+vinserti64x4 m4, [r1 + r7],  1
+movuym5, [r2 + 2 * r5]
+vinserti64x4 m5, [r2 + r7],  1
+movuym6, [r3 +  2 * r5]
+vinserti64x4 m6, [r3 + r7],  1
+movuym7, [r4 +  2 * r5]
+vinserti64x4 m7, [r4 + r7],  1
+
+psubw   m4, m8
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+%endmacro
+
 %macro PROCESS_SAD_X4_32x4_AVX512 0
 movum8, [r0]
 movum4, [r1]
@@ -3467,6 +3528,130 @@
 PROCESS_SAD_X3_END_AVX512
 RET
 %endif
+
+;
+; void pixel_sad_x4_16x%1( const pixel* pix1, const pixel* pix2, const pixel* 
pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
+;
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_sad_x4_16x8, 6,8,10
+pxorm0,  m0
+pxorm1,  m1
+pxorm2,  m2
+pxorm3,  m3
+
+vbroadcasti32x8 m9, [pw_1]
+
+add r5d, r5d
+lea r7d, [r5 * 3]
+
+PROCESS_SAD_X4_16x4_AVX512
+add r0, FENC_STRIDE * 8
+lea r1, [r1 + r5 * 4]
+lea r2, [r2 + r5 * 4]
+lea r3, [r3 + r5 * 4]
+lea r4, [r4 + r5 * 4]
+PROCESS_SAD_X4_16x4_AVX512
+PROCESS_SAD_X4_END_AVX512
+RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_16x12, 6,8,10
+pxorm0,  m0
+pxorm1,  m1
+pxorm2,  m2
+pxorm3,  m3
+
+vbroadcasti32x8 m9, [pw_1]
+
+add r5d, r5d
+lea r7d, [r5 * 3]
+
+%rep 2
+PROCESS_SAD_X4_16x4_AVX512
+add r0, FENC_STRIDE * 8
+lea r1, [r1 + r5 * 4]
+lea r2, [r2 + r5 * 4]
+lea r3, [r3 + r5 * 4]
+lea r4, [r4 + r5 * 4]
+

[x265] [PATCH 237 of 307] x86: AVX512 interp_8tap_vert_pp_64xN and interp_8tap_vert_ps_64xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512460722 -19800
#  Tue Dec 05 13:28:42 2017 +0530
# Node ID ca6bb5919227672e0cf98b785acf099531c32945
# Parent  c3a341391f0c777665e191a4cd172f08a5a313f9
x86: AVX512 interp_8tap_vert_pp_64xN and interp_8tap_vert_ps_64xN for high bit depth

luma_vpp
Size  |  AVX2 performance  | AVX512 performance
-----------------------------------------------
64x16 |   11.51x   |  19.67x
64x32 |   11.51x   |  19.42x
64x48 |   11.54x   |  19.42x
64x64 |   11.55x   |  19.72x

luma_vps
Size  |  AVX2 performance  | AVX512 performance
-----------------------------------------------
64x16 |9.92x   |  18.23x
64x32 |9.71x   |  18.13x
64x48 |9.81x   |  18.04x
64x64 |9.86x   |  18.14x

diff -r c3a341391f0c -r ca6bb5919227 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Nov 24 16:44:56 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 05 13:28:42 2017 +0530
@@ -2887,12 +2887,20 @@
 p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
 p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
 p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
+p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
+p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
+p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
+p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
 
 p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
 p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
 p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
 p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
 p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
+p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
+p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
+p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
+p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512);
 
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 /* TODO: Currently these kernels performance are similar to AVX2 
version, we need a to improve them further to ebable
diff -r c3a341391f0c -r ca6bb5919227 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Fri Nov 24 16:44:56 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Dec 05 13:28:42 2017 +0530
@@ -13078,6 +13078,152 @@
 FILTER_VER_P_LUMA_32xN_AVX512 pp, 24
 FILTER_VER_P_LUMA_32xN_AVX512 pp, 64
 %endif
+
+%macro PROCESS_LUMA_VERT_P_64x2_AVX512 1
+PROCESS_LUMA_VERT_P_32x2_AVX512 %1
+movu m1,  [r0 + mmsize]
+movu m3,  [r0 + r1 + mmsize]
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu m4,  [r0 + 2 * r1 + mmsize]
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+
+movu m5,  [r0 + r7 + mmsize]
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+
+padddm0,  m6
+padddm1,  m4
+
+movu m4,  [r0 + 4 * r1 + mmsize]
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  m16
+punpckhwdm5,  m4
+pmaddwd  m5,  m16
+
+padddm2,  m6
+padddm3,  m5
+
+movu m11, [r6 + r1 + mmsize]
+punpcklwdm8,  m4, m11
+pmaddwd  m8,  m17
+punpckhwdm4,  m11
+pmaddwd  m4,  m17
+
+movu m12, [r6 + 2 * r1 + mmsize]
+punpcklwdm10, m11,m12
+pmaddwd  m10, m17
+punpckhwdm11, m12
+pmaddwd  m11, m17
+
+movu   

[x265] [PATCH 236 of 307] x86: AVX512 interp_8tap_vert_pp_32xN and interp_8tap_vert_ps_32xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1511522096 -19800
#  Fri Nov 24 16:44:56 2017 +0530
# Node ID c3a341391f0c777665e191a4cd172f08a5a313f9
# Parent  1cd123613bbb28fd00da36a3cfe3765f8e07d00e
x86: AVX512 interp_8tap_vert_pp_32xN and interp_8tap_vert_ps_32xN for high bit depth

luma_vpp
Size  |  AVX2 performance  | AVX512 performance
-----------------------------------------------
32x8  |   10.54x   |  18.96x
32x16 |   11.70x   |  20.71x
32x24 |   11.34x   |  20.47x
32x32 |   11.76x   |  19.45x
32x64 |   11.87x   |  21.04x

luma_vps
Size  |  AVX2 performance  | AVX512 performance
-----------------------------------------------
32x8  |9.01x   |  17.10x
32x16 |   10.15x   |  18.05x
32x24 |9.78x   |  17.90x
32x32 |   10.19x   |  17.79x
32x64 |   10.14x   |  18.50x

diff -r 1cd123613bbb -r c3a341391f0c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 27 16:45:08 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Nov 24 16:44:56 2017 +0530
@@ -2882,6 +2882,18 @@
 p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
 p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
 
+p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
+p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
+p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
+p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
+p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
+
+p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
+p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
+p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
+p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
+p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
+
 p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
 /* TODO: Currently these kernels performance are similar to AVX2 
version, we need a to improve them further to ebable
 * it. Probably a Vtune analysis will help here.
diff -r 1cd123613bbb -r c3a341391f0c source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Mon Nov 27 16:45:08 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Fri Nov 24 16:44:56 2017 +0530
@@ -12928,5 +12928,159 @@
 ;avx512 luma_vss and luma_vsp code end
 
;-
 
;-
+;avx512 luma_vpp and luma_vps code start
+;-
+%macro PROCESS_LUMA_VERT_P_32x2_AVX512 1
+movu m1,  [r0]   
;0 row
+movu m3,  [r0 + r1]  
;1 row
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu m4,  [r0 + 2 * r1]  
;2 row
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+
+movu m5,  [r0 + r7]  
;3 row
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+
+padddm0,  m6
+padddm1,  m4
+
+movu m4,  [r0 + 4 * r1]  
;4 row
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  m16
+punpckhwdm5,  m4
+pmaddwd  m5,  m16
+
+padddm2,  m6
+padddm3,  m5
+
+lea  r6,  [r0 + 4 * r1]
+
+movu m11, [r6 + r1]  
;5 row
+punpcklwdm8,  m4, m11
+pmaddwd  m8,  m17
+punpckhwdm4,  m11
+pmaddwd  m4,  m17
+
+movu   

[x265] [PATCH 231 of 307] x86: AVX512 ssd_ss_16x16

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512371012 -19800
#  Mon Dec 04 12:33:32 2017 +0530
# Node ID 465682e66d91ecf207feae78c33e32f0eaaf45c4
# Parent  4f690222337dbc1757665729ea15f2380a11c329
x86: AVX512 ssd_ss_16x16
AVX2 performance   : 43.55x
AVX512 performance : 48.11x

This patch also cleans up the already existing ssd_ss AVX512 code.

diff -r 4f690222337d -r 465682e66d91 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Dec 04 12:33:32 2017 +0530
@@ -4743,6 +4743,7 @@
 
 p.cu[BLOCK_64x64].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
 p.cu[BLOCK_32x32].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+p.cu[BLOCK_16x16].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
 p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
 
diff -r 4f690222337d -r 465682e66d91 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm   Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/ssd-a.asm   Mon Dec 04 12:33:32 2017 +0530
@@ -1390,183 +1390,120 @@
 ;-
 ; ssd_ss avx512 code start
 ;-
-%macro PROCESS_SSD_SS_64x8_AVX512 0
+%if ARCH_X86_64
+%macro PROCESS_SSD_SS_64x4_AVX512 0
 movum0, [r0]
 movum1, [r0 + mmsize]
 movum2, [r0 + r1]
 movum3, [r0 + r1 + mmsize]
-
-psubw   m0, [r2]
-psubw   m1, [r2 + mmsize]
-psubw   m2, [r2 + r3]
-psubw   m3, [r2 + r3 + mmsize]
+movum4, [r2]
+movum5, [r2 + mmsize]
+movum6, [r2 + r3]
+movum7, [r2 + r3 + mmsize]
+
+psubw   m0, m4
+psubw   m1, m5
+psubw   m2, m6
+psubw   m3, m7
 pmaddwd m0, m0
 pmaddwd m1, m1
 pmaddwd m2, m2
 pmaddwd m3, m3
-paddd   m4, m0
-paddd   m5, m1
-paddd   m4, m2
-paddd   m5, m3
+paddd   m8, m0
+paddd   m8, m1
+paddd   m8, m2
+paddd   m8, m3
 
 movum0, [r0 + 2 * r1]
 movum1, [r0 + 2 * r1 + mmsize]
 movum2, [r0 + r5]
 movum3, [r0 + r5 + mmsize]
-
-psubw   m0, [r2 + 2 * r3]
-psubw   m1, [r2 + 2 * r3 + mmsize]
-psubw   m2, [r2 + r6]
-psubw   m3, [r2 + r6 + mmsize]
+movum4, [r2 + 2 * r3]
+movum5, [r2 + 2 * r3 + mmsize]
+movum6, [r2 + r6]
+movum7, [r2 + r6 + mmsize]
+
+psubw   m0, m4
+psubw   m1, m5
+psubw   m2, m6
+psubw   m3, m7
 pmaddwd m0, m0
 pmaddwd m1, m1
 pmaddwd m2, m2
 pmaddwd m3, m3
-paddd   m4, m0
-paddd   m5, m1
-paddd   m4, m2
-paddd   m5, m3
-
-lea r0, [r0 + 4 * r1]
-lea r2, [r2 + 4 * r3]
-
+paddd   m8, m0
+paddd   m8, m1
+paddd   m8, m2
+paddd   m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_32x4_AVX512 0
 movum0, [r0]
-movum1, [r0 + mmsize]
-movum2, [r0 + r1]
-movum3, [r0 + r1 + mmsize]
-
-psubw   m0, [r2]
-psubw   m1, [r2 + mmsize]
-psubw   m2, [r2 + r3]
-psubw   m3, [r2 + r3 + mmsize]
+movum1, [r0 + r1]
+movum2, [r0 + 2 * r1]
+movum3, [r0 + r5]
+movum4, [r2]
+movum5, [r2 + r3]
+movum6, [r2 + 2 * r3]
+movum7, [r2 + r6]
+
+psubw   m0, m4
+psubw   m1, m5
+psubw   m2, m6
+psubw   m3, m7
 pmaddwd m0, m0
 pmaddwd m1, m1
 pmaddwd m2, m2
 pmaddwd m3, m3
-paddd   m4, m0
-paddd   m5, m1
-paddd   m4, m2
-paddd   m5, m3
-
-movum0, [r0 + 2 * r1]
-movum1, [r0 + 2 * r1 + mmsize]
-movum2, [r0 + r5]
-movum3, [r0 + r5 + mmsize]
-
-psubw   m0, [r2 + 2 * r3]
-psubw   m1, [r2 + 2 * r3 + mmsize]
-psubw   m2, [r2 + r6]
-psubw   m3, [r2 + r6 + mmsize]
+paddd   m8, m0
+paddd   m8, m1
+paddd   m8, m2
+paddd   m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_16x4_AVX512 0
+movu   ym0, [r0]
+vinserti32x8m0, [r0 + r1],1
+movu   ym1, [r0 + 2 * r1]
+vinserti32x8m1, [r0 + r5],1
+movu   ym4, [r2]
+vinserti32x8m4, [r2 + r3],1
+movu   ym5, [r2 + 2 * r3]
+vinserti32x8m5, [r2 + r6],1
+
+psubw   m0, m4
+psubw   m1, m5
 pmaddwd m0, m0
 pmaddwd m1, m1
-pmaddwd m2, m2
-pmaddwd m3, m3
-paddd   m4, m0
-paddd  

[x265] [PATCH 235 of 307] x86: AVX512 interp_4tap_vert_ps_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1511781308 -19800
#  Mon Nov 27 16:45:08 2017 +0530
# Node ID 1cd123613bbb28fd00da36a3cfe3765f8e07d00e
# Parent  283aa4d77cef296699167c041763d7115e7a88aa
x86: AVX512 interp_4tap_vert_ps_64xN

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
64x16 |  39.17x   |  64.63x
64x32 |  40.14x   |  64.98x
64x48 |  39.97x   |  64.52x
64x64 |  40.32x   |  64.93x

diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 27 16:45:08 2017 +0530
@@ -5087,6 +5087,11 @@
 p.quant = PFX(quant_avx512);
 p.nquant = PFX(nquant_avx512);
 p.denoiseDct = PFX(denoise_dct_avx512);
+
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = 
PFX(interp_4tap_vert_ps_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = 
PFX(interp_4tap_vert_ps_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = 
PFX(interp_4tap_vert_ps_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = 
PFX(interp_4tap_vert_ps_64x16_avx512);
 }
 #endif
 }
diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Mon Nov 27 16:45:08 2017 +0530
@@ -243,10 +243,13 @@
 const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 
7, 8, 9, 7, 8, 9, 10
 
 ALIGN 64
+interp4_vps_store1_avx512:   dq 0, 1, 8, 9, 2, 3, 10, 11
+interp4_vps_store2_avx512:   dq 4, 5, 12, 13, 6, 7, 14, 15
 const interp4_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
 const interp4_hps_store_16xN_avx512,  dq 0, 2, 1, 3, 4, 6, 5, 7
 const interp8_hps_store_avx512,  dq 0, 1, 4, 5, 2, 3, 6, 7
 const interp8_vsp_store_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 cextern pb_128
 cextern pw_1
@@ -10864,7 +10867,7 @@
 %endif
 
 
;-
-;avx512 chroma_vpp code start
+;avx512 chroma_vpp and chroma_vps code start
 
;-
 %macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
 lea   r5, [r0 + 4 * r1]
@@ -11157,7 +11160,7 @@
 RET
 %endif
 
-%macro PROCESS_CHROMA_VERT_PP_64x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
 movu  m0,  [r0]; m0 = row 0
 movu  m1,  [r0 + r1]   ; m1 = row 1
 punpcklbw m2,  m0,m1
@@ -11179,10 +11182,21 @@
 paddw m2,  m8
 paddw m3,  m9
 
+%ifidn %1,pp
 pmulhrsw  m2,  m12
 pmulhrsw  m3,  m12
 packuswb  m2,  m3
 movu  [r2],m2
+%else
+psubw m2, m12
+psubw m3, m12
+movu  m8, m13
+movu  m9, m14
+vpermi2q  m8, m2, m3
+vpermi2q  m9, m2, m3
+movu  [r2], m8
+movu  [r2 + mmsize], m9
+%endif
 
 lea   r0,  [r0 + r1 * 4]
 movu  m0,  [r0]; m0 = row 4
@@ -11194,10 +11208,22 @@
 pmaddubsw m3,  m10
 paddw m4,  m8
 paddw m5,  m9
+
+%ifidn %1,pp
 pmulhrsw  m4,  m12
 pmulhrsw  m5,  m12
 packuswb  m4,  m5
 movu  [r2 + r3],   m4
+%else
+psubw m4, m12
+psubw m5, m12
+movu  m8, m13
+movu  m9, m14
+vpermi2q  m8, m4, m5
+vpermi2q  m9, m4, m5
+movu  [r2 + r3], m8
+movu  [r2 + r3 + mmsize], m9
+%endif
 
 movu  m1,  [r0 + r1]   ; m1 = row 5
 punpcklbw m4,  m0,m1
@@ -11207,11 +11233,21 @@
 paddw m6,  m4
 paddw m7,  m5
 
+%ifidn %1,pp
 pmulhrsw  m6,  m12
 pmulhrsw  m7,  m12
 packuswb  m6,  m7
 movu  [r2 + r3 * 2],   m6
-
+%else
+psubw m6, m12
+psubw m7, m12
+movu  m8, m13
+movu  m9, m14
+vpermi2q  m8, m6, m7
+vpermi2q  m9, m6, m7
+movu  [r2 + 2 * r3], m8
+movu  [r2 + 2 * r3 + mmsize], m9
+%endif

[x265] [PATCH 242 of 307] [x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1512625055 -19800
#  Thu Dec 07 11:07:35 2017 +0530
# Node ID 931dd781dc0c6de76bb31d0215db7a7af885f9bf
# Parent  9bd38bd06850914d1cbf617063ea0e1e60f66219
[x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x32 |  15.49x   |  16.89x
16x64 |  16.46x   |  17.84x

diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Dec 07 10:25:21 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Dec 07 11:07:35 2017 +0530
@@ -2434,6 +2434,8 @@
 
 p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 
+p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
+p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
 p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
 p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
 p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Dec 07 10:25:21 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Dec 07 11:07:35 2017 +0530
@@ -1277,6 +1277,46 @@
 paddd   m0, m1
 %endmacro
 
+%macro PROCESS_SAD_16x8_AVX512 0
+movuym1, [r2]
+vinserti64x4 m1, [r2 + r3],  1
+movuym2, [r2 + 2 * r3]
+vinserti64x4 m2, [r2 + r5],  1
+movuym3, [r0]
+vinserti64x4 m3, [r0 + r1],  1
+movuym4, [r0 + 2 * r1]
+vinserti64x4 m4, [r0 + r4],  1
+
+psubw   m1, m3
+psubw   m2, m4
+pabsw   m1, m1
+pabsw   m2, m2
+paddw   m5, m1, m2
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+
+movuym1, [r2]
+vinserti64x4 m1, [r2 + r3],  1
+movuym2, [r2 + 2 * r3]
+vinserti64x4 m2, [r2 + r5],  1
+movuym3, [r0]
+vinserti64x4 m3, [r0 + r1],  1
+movuym4, [r0 + 2 * r1]
+vinserti64x4 m4, [r0 + r4],  1
+
+psubw   m1, m3
+psubw   m2, m4
+pabsw   m1, m1
+pabsw   m2, m2
+paddw   m1, m2
+
+pmaddwd m5, m6
+paddd   m0, m5
+pmaddwd m1, m6
+paddd   m0, m1
+%endmacro
+
 %macro PROCESS_SAD_AVX512_END 0
 vextracti32x8  ym1, m0, 1
 paddd  ym0, ym1
@@ -1523,6 +1563,51 @@
 %endif
 
 ;-
+; int pixel_sad_16x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_sad_16x32, 4,6,7
+pxorm0, m0
+
+vbroadcasti32x8 m6, [pw_1]
+
+add r3d, r3d
+add r1d, r1d
+lea r4d, [r1 * 3]
+lea r5d, [r3 * 3]
+
+%rep 3
+PROCESS_SAD_16x8_AVX512
+lear2, [r2 + 4 * r3]
+lear0, [r0 + 4 * r1]
+%endrep
+PROCESS_SAD_16x8_AVX512
+PROCESS_SAD_AVX512_END
+RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x64, 4,6,7
+   pxorm0, m0
+
+vbroadcasti32x8 m6, [pw_1]
+
+add r3d, r3d
+add r1d, r1d
+lea r4d, [r1 * 3]
+lea r5d, [r3 * 3]
+
+%rep 7
+PROCESS_SAD_16x8_AVX512
+lear2, [r2 + 4 * r3]
+lear0, [r0 + 4 * r1]
+%endrep
+PROCESS_SAD_16x8_AVX512
+PROCESS_SAD_AVX512_END
+RET
+%endif
+
+;-
 ; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-
 %if ARCH_X86_64
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 225 of 307] x86: AVX512 interp_4tap_vert_sp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512021637 -19800
#  Thu Nov 30 11:30:37 2017 +0530
# Node ID 6137bed68dac85ab475b8be28fdba7f5787ac551
# Parent  a78e09e144582bd52c52d3475aa1922fc2ae8893
x86: AVX512 interp_4tap_vert_sp_32xN

i444
Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |  10.49x   |  24.74x
32x16 |  11.57x   |  25.34x
32x24 |  11.59x   |  25.55x
32x32 |  12.01x   |  25.72x
32x64 |  12.29x   |  26.02x

diff -r a78e09e14458 -r 6137bed68dac source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 30 10:35:20 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 30 11:30:37 2017 +0530
@@ -4906,6 +4906,10 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = 
PFX(interp_4tap_vert_ss_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = 
PFX(interp_4tap_vert_ss_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = 
PFX(interp_4tap_vert_sp_32x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = 
PFX(interp_4tap_vert_sp_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = 
PFX(interp_4tap_vert_sp_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = 
PFX(interp_4tap_vert_sp_32x32_avx512);
 
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
@@ -4931,6 +4935,10 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = 
PFX(interp_4tap_vert_ss_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = 
PFX(interp_4tap_vert_ss_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = 
PFX(interp_4tap_vert_ss_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = 
PFX(interp_4tap_vert_sp_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = 
PFX(interp_4tap_vert_sp_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = 
PFX(interp_4tap_vert_sp_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = 
PFX(interp_4tap_vert_sp_32x64_avx512);
 
 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
@@ -4966,6 +4974,11 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = 
PFX(interp_4tap_vert_ss_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = 
PFX(interp_4tap_vert_ss_64x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = 
PFX(interp_4tap_vert_ss_48x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = 
PFX(interp_4tap_vert_sp_32x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = 
PFX(interp_4tap_vert_sp_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = 
PFX(interp_4tap_vert_sp_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = 
PFX(interp_4tap_vert_sp_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = 
PFX(interp_4tap_vert_sp_32x64_avx512);
 
 p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512);
 p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512);
diff -r a78e09e14458 -r 6137bed68dac source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Nov 30 10:35:20 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Nov 30 11:30:37 2017 +0530
@@ -11614,120 +11614,109 @@
 FILTER_VER_SS_CHROMA_24xN_AVX512 32
 FILTER_VER_SS_CHROMA_24xN_AVX512 64
 %endif
-
-%macro PROCESS_CHROMA_VERT_SS_32x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1
 movu  m1, [r0]
-lea   r6, [r0 + 2 * r1]
-movu  m10,[r6]
 movu  m3, [r0 + r1]
-movu  m12,[r0 + r4]
 punpcklwd m0, m1,  m3
-punpcklwd m9, m10, m12
-pmaddwd   m0, m16
-pmaddwd   m9, m16
+pmaddwd   m0, m7
 punpckhwd m1, m3
-punpckhwd m10,m12
-pmaddwd   m1, m16
-pmaddwd   m10,m16
+pmaddwd   m1, m7

[x265] [PATCH 224 of 307] x86: AVX512 interp_8tap_vert_sp_16xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512018320 -19800
#  Thu Nov 30 10:35:20 2017 +0530
# Node ID a78e09e144582bd52c52d3475aa1922fc2ae8893
# Parent  3e14c3f607d0f9ec6dd3735d21fc2e698217fe71
x86: AVX512 interp_8tap_vert_sp_16xN

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x4  |   9.68x   |  16.45x
16x8  |  11.69x   |  16.93x
16x16 |  13.26x   |  18.58x
16x32 |  12.96x   |  19.23x
16x64 |  13.12x   |  16.84x

diff -r 3e14c3f607d0 -r a78e09e14458 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 30 16:00:14 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 30 10:35:20 2017 +0530
@@ -5002,7 +5002,12 @@
 p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
 p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
 p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
-
+p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512);
+p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512);
+p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512);
+p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
+p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
+p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
 p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512);
 p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
 p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
diff -r 3e14c3f607d0 -r a78e09e14458 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Nov 30 16:00:14 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Nov 30 10:35:20 2017 +0530
@@ -12985,8 +12985,7 @@
 FILTER_VER_SS_LUMA_8xN_AVX512 16
 FILTER_VER_SS_LUMA_8xN_AVX512 32
 %endif
-
-%macro PROCESS_LUMA_VERT_SS_16x4_AVX512 0
+%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
 movu ym1, [r0]
 movu ym3, [r0 + r1]
 vinserti32x8 m1,  [r0 + 2 * r1],  1
@@ -13062,7 +13061,26 @@
 padddm11, m13
 padddm2,  m10
 padddm3,  m11
-
+%ifidn %1, sp
+padddm0,  m19
+padddm1,  m19
+padddm2,  m19
+padddm3,  m19
+
+psradm0,  12
+psradm1,  12
+psradm2,  12
+psradm3,  12
+
+packssdw m0,  m1
+packssdw m2,  m3
+packuswb m0,  m2
+vpermq   m0,  m20,   m0
+movu [r2],xm0
+vextracti32x4[r2 + r3],   m0,2
+vextracti32x4[r2 + 2 * r3],   m0,1
+vextracti32x4[r2 + r5],   m0,3
+%else
 psradm0,  6
 psradm1,  6
 psradm2,  6
@@ -13075,15 +13093,15 @@
 movu [r2 + r3],   ym2
 vextracti32x8[r2 + 2 * r3],   m0,1
 vextracti32x8[r2 + r5],   m2,1
+%endif
 %endmacro
 
;-
 ; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, 
intptr_t dstStride, int coeffIdx)
 
;-
-%macro FILTER_VER_SS_LUMA_16xN_AVX512 1
+%macro FILTER_VER_S_LUMA_16xN_AVX512 2
 INIT_ZMM avx512
-cglobal interp_8tap_vert_ss_16x%1, 5, 8, 19
+cglobal interp_8tap_vert_%1_16x%2, 5, 8, 21
 add   r1d,r1d
-add   r3d,r3d
 lea   r7, [3 * r1]
 sub   r0, r7
 shl   r4d,8
@@ -13100,28 +13118,39 @@
 mova  m17,[r5 + 2 * mmsize]
 mova  m18,[r5 + 3 * mmsize]
 %endif
+%ifidn %1, sp
+vbroadcasti32x4   m19,[pd_526336]
+mova  m20,[interp8_vsp_store_avx512]
+%else
+add   r3d,r3d
+%endif
 
 lea

[x265] [PATCH 239 of 307] x86: AVX512 interp_8tap_vert_pp_48x64 and interp_8tap_vert_ps_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512475230 -19800
#  Tue Dec 05 17:30:30 2017 +0530
# Node ID 8b1c9d9c5bd8135dc11b6d031b990bfe47e3bcd8
# Parent  f92128e41ac3c1da210c1c665d97061539821aaf
x86: AVX512 interp_8tap_vert_pp_48x64 and interp_8tap_vert_ps_48x64 for high bit depth

luma_vpp_48x64
AVX2 performance   : 11.60x
AVX512 performance : 18.57x

luma_vps_48x64
AVX2 performance   :  9.97x
AVX512 performance : 17.28x

diff -r f92128e41ac3 -r 8b1c9d9c5bd8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 05 14:41:07 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 05 17:30:30 2017 +0530
@@ -2893,6 +2893,7 @@
 p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
 p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
 p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
+p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512);
 p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
 p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
 p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
@@ -2909,6 +2910,7 @@
 p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
 p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
 p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
+p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512);
 p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
 p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
 p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
diff -r f92128e41ac3 -r 8b1c9d9c5bd8 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Dec 05 14:41:07 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Dec 05 17:30:30 2017 +0530
@@ -13242,6 +13242,251 @@
 FILTER_VER_P_LUMA_32xN_AVX512 pp, 64
 %endif
 
+%macro PROCESS_LUMA_VERT_P_48x4_AVX512 1
+PROCESS_LUMA_VERT_P_32x2_AVX512 %1
+movu m1,  [r0 + 2 * r1]
+movu m3,  [r0 + r7]
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  m15
+punpckhwdm1,  m3
+pmaddwd  m1,  m15
+
+movu m4,  [r0 + 4 * r1]
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  m15
+punpckhwdm3,  m4
+pmaddwd  m3,  m15
+
+movu m5,  [r6 + r1]
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  m16
+punpckhwdm4,  m5
+pmaddwd  m4,  m16
+
+padddm0,  m6
+padddm1,  m4
+
+movu m4,  [r6 + 2 * r1]
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  m16
+punpckhwdm5,  m4
+pmaddwd  m5,  m16
+
+padddm2,  m6
+padddm3,  m5
+
+lea  r4,  [r6 + 4 * r1]
+
+movu m11, [r6 + r7]
+punpcklwdm8,  m4, m11
+pmaddwd  m8,  m17
+punpckhwdm4,  m11
+pmaddwd  m4,  m17
+
+movu m12, [r6 + 4 * r1]
+punpcklwdm10, m11,m12
+pmaddwd  m10, m17
+punpckhwdm11, m12
+pmaddwd  m11, m17
+
+movu m13, [r4 + r1]
+punpcklwdm14, m12,m13
+pmaddwd  m14, m18
+punpckhwdm12, m13
+pmaddwd  m12, m18
+
+padddm8,  m14
+padddm4,  m12
+padddm0,  m8
+padddm1,  m4
+
+movu m12, [r4 + 2 * r1]
+punpcklwdm14, m13,m12
+pmaddwd  m14, m18
+punpckhwdm13, m12
+pmaddwd  

[x265] [PATCH 241 of 307] x86: AVX512 pixel_satd_64xN and 32xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512622521 -19800
#  Thu Dec 07 10:25:21 2017 +0530
# Node ID 9bd38bd06850914d1cbf617063ea0e1e60f66219
# Parent  2d298099a8d6b266a32b975de4b6a369988d3887
x86: AVX512 pixel_satd_64xN and 32xN for high bit depth

Size  |  AVX2 performance  | AVX512 performance
--
32x8  |   10.99x   |  17.98x
32x16 |   12.18x   |  17.05x
32x24 |   13.11x   |  19.70x
32x32 |   13.21x   |  18.36x
32x64 |   13.27x   |  19.04x
64x16 |   12.36x   |  17.15x
64x32 |   11.63x   |  17.78x
64x48 |   12.00x   |  19.23x
64x64 |   12.12x   |  19.20x

diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Dec 07 10:25:21 2017 +0530
@@ -3015,6 +3015,24 @@
 //Luma_hps_48x64
 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
 
+p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
+p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
+p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
+p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
+p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
+
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = 
PFX(pixel_satd_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = 
PFX(pixel_satd_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = 
PFX(pixel_satd_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = 
PFX(pixel_satd_32x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = 
PFX(pixel_satd_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = 
PFX(pixel_satd_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = 
PFX(pixel_satd_32x16_avx512);
+
 }
 #endif
 }
diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/pixel-a.asm Thu Dec 07 10:25:21 2017 +0530
@@ -13958,6 +13958,192 @@
 paddd   xm6, xm7
 movdeax, xm6
 RET
+
+%macro SATD_HBD_AVX512_END 0
+vextracti32x8   ym7, m6, 1
+paddd   ym6, ym7
+vextracti128xm7, ym6, 1
+paddd   xm6, xm7
+pxorxm7, xm7
+movhlps xm7, xm6
+paddd   xm6, xm7
+pshufd  xm7, xm6, 1
+paddd   xm6, xm7
+movdeax, xm6
+%endmacro
+
+%macro PROCESS_SATD_32x8_HBD_AVX512 0; function to compute satd cost 
for 32 columns, 8 rows
+; rows 0-3
+movum0, [r0]
+movum4, [r2]
+psubw   m0, m4
+movum1, [r0 + r1]
+movum5, [r2 + r3]
+psubw   m1, m5
+movum2, [r0 + r1 * 2]
+movum4, [r2 + r3 * 2]
+psubw   m2, m4
+movum3, [r0 + r4]
+movum5, [r2 + r5]
+psubw   m3, m5
+lea r0, [r0 + r1 * 4]
+lea r2, [r2 + r3 * 4]
+paddw   m4, m0, m1
+psubw   m1, m0
+paddw   m0, m2, m3
+psubw   m3, m2
+punpckhwd   m2, m4, m1
+punpcklwd   m4, m1
+punpckhwd   m1, m0, m3
+punpcklwd   m0, m3
+paddw   m3, m4, m0
+psubw   m0, m4
+paddw   m4, m2, m1
+psubw   m1, m2
+punpckhdq   m2, m3, m0
+punpckldq   m3, m0
+paddw   m0, m3, m2
+psubw   m2, m3
+punpckhdq   m3, m4, m1
+punpckldq   m4, m1
+paddw   m1, m4, m3
+psubw   m3, m4
+punpckhqdq  m4, m0, m1
+punpcklqdq  m0, m1
+pabsw   m0, m0
+pabsw   m4, m4
+pmaxsw  m0, m0, m4
+punpckhqdq  m1, m2, m3
+punpcklqdq  m2, m3
+pabsw   m2, m2
+pabsw   m1, m1
+pmaxsw  m2, m1
+pxorm7, m7
+movam1, m0
+punpcklwd   m1, m7
+paddd   m6, m1
+movam1, m0
+punpckhwd   m1, m7
+paddd   m6, m1
+pxorm7, m7
+movam1, m2
+punpcklwd   m1, m7
+paddd   m6, m1
+movam1, m2
+punpckhwd   m1, m7
+paddd   m6, m1
+; rows 4-7
+movum0, [r0]
+movum4, [r2]
+psubw   m0, m4
+movu 

[x265] [PATCH 252 of 307] x86: AVX512 intra_pred_dc32 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1512723573 -19800
#  Fri Dec 08 14:29:33 2017 +0530
# Node ID ddd64f4b2ff382d05e86708750b20332ed93f3c9
# Parent  fa954ed4a1e7ce2741f3cac14006f78c3199191b
x86: AVX512 intra_pred_dc32 for high bit depth

AVX2 performance   : 15.53x
AVX512 performance : 23.96x

diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Dec 08 14:29:33 2017 +0530
@@ -3053,6 +3053,7 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = 
PFX(pixel_satd_32x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = 
PFX(pixel_satd_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = 
PFX(pixel_satd_32x16_avx512);
+p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
 
 }
 #endif
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred.h Fri Dec 08 14:29:33 2017 +0530
@@ -76,7 +76,7 @@
 FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const 
pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const 
pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const 
pixel*srcPix, int, int filter);
-
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, 
const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, 
const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, 
const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, 
const pixel*srcPix, int, int filter);
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred16.asm Fri Dec 08 14:29:33 2017 +0530
@@ -688,6 +688,68 @@
 movu[r0 + r2 * 1 +  0], m0
 movu[r0 + r2 * 1 + mmsize], m0
 RET
+
+INIT_ZMM avx512
+cglobal intra_pred_dc32, 3,3,17
+add  r2, 2
+add r1d, r1d
+movu m16, [r2]
+movu m1, [r2 + 2 * mmsize]
+paddwm16, m1
+vextracti32x8   ym1, m16, 1
+paddw   ym16, ym1
+vextracti32x4   xm1, m16, 1
+paddw   xm16, xm1
+pmaddwd xm16, [pw_1]
+movhlps xm1, xm16
+paddd   xm16, xm1
+phaddd  xm16, xm16
+paddd   xm16, [pd_32]; sum = sum + 32
+psrld   xm16, 6  ; sum = sum / 64
+vpbroadcastw m0, xm16
+
+lea  r2, [r1 * 3]
+; store DC 32x32
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+lea r0, [r0 + r1 * 4]
+movu[r0 + r1 * 0 +  0], m0
+movu[r0 + r1 * 1 +  0], m0
+movu[r0 + r1 * 2 +  0], m0
+movu[r0 + r2 * 1 +  0], m0
+RET
 %endif
 
 
;---
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 255 of 307] x86: AVX512 luma_hvpp

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1513072665 -19800
#  Tue Dec 12 15:27:45 2017 +0530
# Node ID 9ca6f6a66eabf5bfdecc3a8472c1137d16b1c722
# Parent  b858f80e3ff03118abb1ef3e4ea56059f9ec5af4
x86: AVX512 luma_hvpp

Call the interp_8tap_hv_pp_cpu C function (which calls the luma_hps and
luma_vsp asm functions individually) for luma_hvpp.
Include ALL_LUMA_PU_T for luma_hvpp, which calls the interp_8tap_hv_pp_cpu C
function.

diff -r b858f80e3ff0 -r 9ca6f6a66eab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 12 15:44:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 12 15:27:45 2017 +0530
@@ -3056,6 +3056,23 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = 
PFX(pixel_satd_32x16_avx512);
 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
 
+p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+
 }
 #endif
 }
@@ -5220,6 +5237,24 @@
 p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
 p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
 //p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
+
+p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu;
+
 }
 #endif
 }
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 293 of 307] x86 : AVX512 intra_pred_ang16 mode 5 and 31 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1515992814 -19800
#  Mon Jan 15 10:36:54 2018 +0530
# Node ID 3a310b157fdf345023ff4e96e7de316cee79b954
# Parent  c1daa99a8c14edbe5e9e5a59a74a6b0936c27a82
x86 : AVX512 intra_pred_ang16 mode 5 and 31 high bit depth
Mode | AVX2 performance | AVX512 performance
---
 5   |10.5x   |  16.61x
 31  |12.26x  |  20.3x

diff -r c1daa99a8c14 -r 3a310b157fdf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jan 15 09:53:46 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jan 15 10:36:54 2018 +0530
@@ -3111,7 +3111,8 @@
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
 p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512);
-
+p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512);
+p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512);
 p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
diff -r c1daa99a8c14 -r 3a310b157fdf source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jan 15 09:53:46 2018 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jan 15 10:36:54 2018 +0530
@@ -19283,10 +19283,29 @@
 callang16_mode_5_31
 
 add r2,18
-
 callang32_mode_5_31
 RET
-
+cglobal intra_pred_ang16_5, 3,7,13
+add r2,64
+xor r6d,   r6d
+vbroadcasti32x8  m15,  [pd_16]
+lea r3,[ang_table_avx2 + 16 * 32]
+add r1d,   r1d
+lea r4,[r1 * 3]
+
+callang16_mode_5_31
+RET
+
+cglobal intra_pred_ang16_31, 3,7,13
+xor r6d,   r6d
+inc r6d
+vbroadcasti32x8  m15,  [pd_16]
+lea r3,[ang_table_avx2 + 16 * 32]
+add r1d,   r1d
+lea r4,[r1 * 3]
+
+callang16_mode_5_31
+RET
 
;---
 ; avx512 code for intra_pred_ang32 mode 2 to 34 end
 
;---
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 304 of 307] Fix SIMD register count for intra_pred_ang modes

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1516361209 -19800
#  Fri Jan 19 16:56:49 2018 +0530
# Node ID e82bfd58acb99cd4c2e4767b1afdd3750881a68e
# Parent  f56354b2b542aaafa389a226f0fb3b41e4d33803
Fix SIMD register count for intra_pred_ang modes

diff -r f56354b2b542 -r e82bfd58acb9 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800
+++ b/source/common/x86/intrapred16.asm Fri Jan 19 16:56:49 2018 +0530
@@ -19236,8 +19236,7 @@
 packusdwm2, m1
 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
 ret
-
-cglobal intra_pred_ang32_5, 3,8,13
+cglobal intra_pred_ang32_5, 3,8,17
 add r2,128
 xor r6d,   r6d
 lea r3,[ang_table_avx2 + 16 * 32]
@@ -19259,11 +19258,9 @@
 vbroadcasti32x8  m15,  [pd_16]
 add r2,18
 lea r0,[r0 + 32]
-
 callang32_mode_5_31
 RET
-
-cglobal intra_pred_ang32_31, 3,7,13
+cglobal intra_pred_ang32_31, 3,7,17
 xor r6d,   r6d
 inc r6d
 lea r3,[ang_table_avx2 + 16 * 32]
@@ -19285,18 +19282,16 @@
 add r2,18
 callang32_mode_5_31
 RET
-cglobal intra_pred_ang16_5, 3,7,13
+cglobal intra_pred_ang16_5, 3,7,17
 add r2,64
 xor r6d,   r6d
 vbroadcasti32x8  m15,  [pd_16]
 lea r3,[ang_table_avx2 + 16 * 32]
 add r1d,   r1d
 lea r4,[r1 * 3]
-
 callang16_mode_5_31
 RET
-
-cglobal intra_pred_ang16_31, 3,7,13
+cglobal intra_pred_ang16_31, 3,7,17
 xor r6d,   r6d
 inc r6d
 vbroadcasti32x8  m15,  [pd_16]
@@ -19609,8 +19604,7 @@
 packusdwm3, m0
 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
 ret
-
-cglobal intra_pred_ang32_4, 3,8,13
+cglobal intra_pred_ang32_4, 3,8,17
 add r2,128
 xor r6d,   r6d
 lea r3,[ang_table_avx2 + 18 * 32]
@@ -19632,11 +19626,9 @@
 
 add r2,22
 lea r0,[r0 + 32]
-
 callang32_mode_4_32
 RET
-
-cglobal intra_pred_ang32_32, 3,7,13
+cglobal intra_pred_ang32_32, 3,7,17
 xor r6d,   r6d
 inc r6d
 lea r3,[ang_table_avx2 + 18 * 32]
@@ -19654,23 +19646,19 @@
 mov r0,r5
 
 callang16_mode_4_32
-
 add r2,22
-
 callang32_mode_4_32
 RET
-cglobal intra_pred_ang16_4, 3,7,13
+cglobal intra_pred_ang16_4, 3,7,17
 add r2,64
 xor r6d,   r6d
 vbroadcasti32x8  m15,  [pd_16]
 lea r3,[ang_table_avx2 + 18 * 32]
 add r1d,   r1d
 lea r4,[r1 * 3]
-
 callang16_mode_4_32
 RET
-
-cglobal intra_pred_ang16_32, 3,7,13
+cglobal intra_pred_ang16_32, 3,7,17
 xor r6d,   r6d
 inc r6d
 vbroadcasti32x8  m15,  [pd_16]
@@ -19949,8 +19937,7 @@
 packusdwm11, m3
 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
 ret
-
-cglobal intra_pred_ang32_6, 3,8,14
+cglobal intra_pred_ang32_6, 3,8,17
 add r2,128
 xor r6d,   r6d
 lea r3,[ang_table_avx2 + 15 * 32]
@@ -19972,11 +19959,9 @@
 
 add r2,12
 lea r0,[r0 + 32]
-
 callang32_mode_6_30
 RET
-
-cglobal intra_pred_ang32_30, 3,7,14
+cglobal intra_pred_ang32_30, 3,7,17
 xor r6d,   r6d
 inc r6d
 lea r3,[ang_table_avx2 + 15 * 32]
@@ -19998,18 +19983,16 @@
 add r2,12
 callang32_mode_6_30
 RET
-cglobal intra_pred_ang16_6, 3,7,14
+cglobal intra_pred_ang16_6, 3,7,17
 add r2,64
 xor r6d,   r6d
 vbroadcasti32x8  m15,  [pd_16]
 lea r3,[ang_table_avx2 + 15 * 32]
 shl r1d,   1
 lea r4,[r1 * 3]
-
 callang16_mode_6_30
 RET
-
-cglobal intra_pred_ang16_30, 3,7,14
+cglobal intra_pred_ang16_30, 3,7,17
 xor r6d,   r6d
 inc r6d
 vbroadcasti32x8  m15,  [pd_16]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 292 of 307] x86:AVX512 intra_pred_ang32 mode 5 and 31 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1515990226 -19800
#  Mon Jan 15 09:53:46 2018 +0530
# Node ID c1daa99a8c14edbe5e9e5a59a74a6b0936c27a82
# Parent  e4983d90f403d968d6760ae044f86a7a2e1865a2
x86:AVX512 intra_pred_ang32 mode 5 and 31 high bit depth
Mode | AVX2 performance | AVX512 performance
---
 5   |9.5x|  17.11x
 31  |11.5x   |   24.1x

diff -r e4983d90f403 -r c1daa99a8c14 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jan 12 15:17:56 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jan 15 09:53:46 2018 +0530
@@ -3105,11 +3105,13 @@
 p.cu[BLOCK_32x32].intra_pred[25]= PFX(intra_pred_ang32_25_avx512);
 p.cu[BLOCK_32x32].intra_pred[26]= PFX(intra_pred_ang32_26_avx512);
 p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512);
-
+p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512);
+p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512);
 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
 p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512);
+
 p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
diff -r e4983d90f403 -r c1daa99a8c14 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Jan 12 15:17:56 2018 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jan 15 09:53:46 2018 +0530
@@ -18977,6 +18977,316 @@
 lea r4,[r1 * 3]
 callang16_mode_11_25
 RET
+cglobal ang16_mode_5_31
+testr6d, r6d
+
+vbroadcasti32x8m0, [r2 + 2]; [16 15 14 13 
12 11 10  9  8  7  6  5  4  3  2  1]
+vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 
13 12 11 10  9  8  7  6  5  4  3  2]
+
+punpcklwd   m3, m0, m1  ; [13 12 12 11 11 10 10  9 
 5  4  4  3  3  2  2  1]
+punpckhwd   m0, m1  ; [17 16 16 15 15 14 14 13 
 9  8  8  7  7  6  6  5]
+
+vbroadcasti32x8m1, [r2 + 18]   ; [24 23 22 21 
20 19 18 17 16 15 14 13 12 11 10  9]
+vbroadcasti32x8m4, [r2 + 20]   ; [25 24 23 22 
21 20 19 18 17 16 15 14 13 12 11 10]
+punpcklwd   m2, m1, m4  ; [21 20 20 19 19 18 18 17 
13 12 12 11 11 10 10  9]
+punpckhwd   m1, m4  ; [25 24 24 23 23 22 22 21 
17 16 16 15 15 14 14 13]
+
+pmaddwd m4, m3, [r3 + 1 * 32]   ; [17]
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, [r3 + 1 * 32]
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+
+movuym16, [r3 - 14 * 32]  ; [2]
+vinserti32x8m16, [r3 + 3 * 32] ,1 ; [19]
+palignr m6, m0, m3, 4
+pmaddwd m5, m6, m16
+paddd   m5, m15
+psrld   m5, 5
+palignr m7, m2, m0, 4
+pmaddwd m8, m7, m16
+paddd   m8, m15
+psrld   m8, 5
+packusdwm5, m8
+vextracti32x8   ym6, m5, 1
+
+palignr m8, m0, m3, 8
+palignr m9, m2, m0, 8
+movuym16, [r3 - 12 * 32]  ; [4]
+vinserti32x8m16, [r3 + 5 * 32],1  ; [21]
+pmaddwd m7, m8, m16
+paddd   m7, m15
+psrld   m7, 5
+pmaddwd m10, m9,m16
+paddd   m10, m15
+psrld   m10, 5
+packusdwm7, m10
+vextracti32x8   ym8, m7, 1
+
+palignr m10, m0, m3, 12
+palignr m11, m2, m0, 12
+movuym16,[r3 - 10 * 32] ; [6]
+vinserti32x8m16,  [r3 + 7 * 32] ,1  ; [23]
+pmaddwd m9, m10, m16
+paddd   m9, m15
+psrld   m9, 5
+pmaddwd m3, m11, m16
+paddd   m3, m15
+psrld   m3, 5
+packusdwm9, m3
+vextracti32x8   ym10, m9, 1
+
+pmaddwd m11, m0, [r3 - 8 * 32]  ; [8]
+paddd   m11, m15
+psrld   m11, 5
+pmaddwd m3, m2, [r3 - 8 * 32]
+paddd   m3, m15
+psrld   m3, 5
+packusdwm11, m3
+
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
+
+pmaddwd m4, m0, [r3 + 9 * 32]   ; [25]
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m2, [r3 + 9  * 32]
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+
+palignr m6, m2, m0, 4
+movuym16, [r3 

[x265] [PATCH 305 of 307] X86: AVX512 intra_pred_ang16 mode 8 and 28 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1517294626 -19800
#  Tue Jan 30 12:13:46 2018 +0530
# Node ID b80e844209ecd0abc896df94306a5ef96b27b918
# Parent  e82bfd58acb99cd4c2e4767b1afdd3750881a68e
X86: AVX512 intra_pred_ang16 mode 8 and 28 high bit depth

Mode | AVX2 performance | AVX512 performance
---
 8   |9.31x |10.78x
 28  |12.80x|15.21x

diff -r e82bfd58acb9 -r b80e844209ec source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jan 19 16:56:49 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jan 30 12:13:46 2018 +0530
@@ -3113,14 +3113,14 @@
 p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512);
 p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512);
 p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512);
-
+p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512);
+p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512);
 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
 p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512);
-p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512);
-p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512);
-
+p.cu[BLOCK_16x16].intra_pred[8] = PFX(intra_pred_ang16_8_avx512);
+p.cu[BLOCK_16x16].intra_pred[28]= PFX(intra_pred_ang16_28_avx512);
 p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512);
 p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512);
 p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512);
diff -r e82bfd58acb9 -r b80e844209ec source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Jan 19 16:56:49 2018 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jan 30 12:13:46 2018 +0530
@@ -11843,6 +11843,27 @@
 packusdwm11, m3
 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
 ret
+cglobal intra_pred_ang16_8, 3,7,16
+add r2,64
+xor r6d,   r6d
+lea r3,[ang_table_avx2 + 15 * 32]
+add r1d,   r1d
+lea r4,[r1 * 3]
+vbroadcasti32x8  m15,  [pd_16]
+
+callang16_mode_8_28
+RET
+
+cglobal intra_pred_ang16_28, 3,7,16
+xor r6d,   r6d
+inc r6d
+lea r3,[ang_table_avx2 + 15 * 32]
+add r1d,   r1d
+lea r4,[r1 * 3]
+vbroadcasti32x8  m15,  [pd_16]
+
+callang16_mode_8_28
+RET
 
 ;; angle 16, modes 7 and 29
 cglobal ang16_mode_7_29
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 307 of 307] x86:AVX512 Set run time flag to enable/disable avx512

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree 
# Date 1522928767 -19800
#  Thu Apr 05 17:16:07 2018 +0530
# Node ID f6ad2fa637fd3c8f9e2811982b89aa28228e9f6b
# Parent  876b6e006f2080072c0684dbf75e7cfde974ba79
x86:AVX512 Set run time flag to enable/disable avx512

diff -r 876b6e006f20 -r f6ad2fa637fd source/common/cpu.cpp
--- a/source/common/cpu.cpp Mon Feb 05 10:39:00 2018 -0800
+++ b/source/common/cpu.cpp Thu Apr 05 17:16:07 2018 +0530
@@ -122,7 +122,7 @@
 #pragma warning(disable: 4309) // truncation of constant value
 #endif
 
-uint32_t cpu_detect(void)
+uint32_t cpu_detect(bool benableavx512 )
 {
 uint32_t cpu = 0;
 
@@ -184,11 +184,13 @@
 {
 if (ebx & 0x0020)
 cpu |= X265_CPU_AVX2;
-
-if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
+if (benableavx512)
 {
-if ((ebx & 0xD003) == 0xD003)
-cpu |= X265_CPU_AVX512;
+if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
+{
+if ((ebx & 0xD003) == 0xD003)
+cpu |= X265_CPU_AVX512;
+}
 }
 }
 }
@@ -327,7 +329,7 @@
 int PFX(cpu_fast_neon_mrc_test)(void);
 }
 
-uint32_t cpu_detect(void)
+uint32_t cpu_detect(bool benableavx512)
 {
 int flags = 0;
 
@@ -370,7 +372,7 @@
 
 #elif X265_ARCH_POWER8
 
-uint32_t cpu_detect(void)
+uint32_t cpu_detect(bool benableavx512)
 {
 #if HAVE_ALTIVEC
 return X265_CPU_ALTIVEC;
@@ -381,7 +383,7 @@
 
 #else // if X265_ARCH_POWER8
 
-uint32_t cpu_detect(void)
+uint32_t cpu_detect(bool benableavx512)
 {
 return 0;
 }
diff -r 876b6e006f20 -r f6ad2fa637fd source/common/cpu.h
--- a/source/common/cpu.h   Mon Feb 05 10:39:00 2018 -0800
+++ b/source/common/cpu.h   Thu Apr 05 17:16:07 2018 +0530
@@ -50,7 +50,7 @@
 #endif
 
 namespace X265_NS {
-uint32_t cpu_detect(void);
+uint32_t cpu_detect(bool);
 
 struct cpu_name_t
 {
diff -r 876b6e006f20 -r f6ad2fa637fd source/common/param.cpp
--- a/source/common/param.cpp   Mon Feb 05 10:39:00 2018 -0800
+++ b/source/common/param.cpp   Thu Apr 05 17:16:07 2018 +0530
@@ -99,13 +99,13 @@
 {
 x265_free(p);
 }
-
+bool  benableavx512 = false;
 void x265_param_default(x265_param* param)
 {
 memset(param, 0, sizeof(x265_param));
 
 /* Applying default values to all elements in the param structure */
-param->cpuid = X265_NS::cpu_detect();
+param->cpuid = X265_NS::cpu_detect(benableavx512);
 param->bEnableWavefront = 1;
 param->frameNumThreads = 0;
 
@@ -609,6 +609,17 @@
 if (0) ;
 OPT("asm")
 {
+sscanf(value, "%s", p->asmname);
+if (strcmp(value, "avx512")==0)
+{
+p->bEnableavx512 = 1;
+benableavx512 = true;
+}
+else
+{
+p->bEnableavx512 = 0;
+benableavx512 = false;
+}
 if (bValueWasNull)
 p->cpuid = atobool(value);
 else
@@ -1072,7 +1083,7 @@
 if (isdigit(value[0]))
 cpu = x265_atoi(value, bError);
 else
-cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? 
X265_NS::cpu_detect() : 0;
+cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? 
X265_NS::cpu_detect(benableavx512) : 0;
 
 if (bError)
 {
diff -r 876b6e006f20 -r f6ad2fa637fd source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Mon Feb 05 10:39:00 2018 -0800
+++ b/source/test/pixelharness.cpp  Thu Apr 05 17:16:07 2018 +0530
@@ -332,8 +332,9 @@
 memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
 memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
 int j = 0;
+bool enableavx512 = true;
 int width = 16 * (rand() % 4 + 1);
-int cpuid = X265_NS::cpu_detect();
+int cpuid = X265_NS::cpu_detect(enableavx512);
 if (cpuid & X265_CPU_AVX512)
 width = 32 * (rand() % 2 + 1);
 int height = 8;
diff -r 876b6e006f20 -r f6ad2fa637fd source/test/testbench.cpp
--- a/source/test/testbench.cpp Mon Feb 05 10:39:00 2018 -0800
+++ b/source/test/testbench.cpp Thu Apr 05 17:16:07 2018 +0530
@@ -96,7 +96,8 @@
 
 int main(int argc, char *argv[])
 {
-int cpuid = X265_NS::cpu_detect();
+bool enableavx512 = true;
+int cpuid = X265_NS::cpu_detect(enableavx512);
 const char *testname = 0;
 
 if (!(argc & 1))
diff -r 876b6e006f20 -r f6ad2fa637fd source/x265.h
--- a/source/x265.h Mon Feb 05 10:39:00 2018 -0800
+++ b/source/x265.h Thu Apr 05 17:16:07 2018 +0530
@@ -585,7 +585,14 @@
  * somehow flawed on your target hardware. The asm function tables are
  * process global, the first encoder configures them for all encoders */
 int   cpuid;
-
+ /*==Assembly features ==*/
+ /*  x265_param_parse() will detect if the avx512 is enabled (in cli )and 
set 
+ *  bEnableavx512 to 1 to use avx512 SIMD. By default this flag will not 
be set , 
+ *  hence the encoding 

[x265] [PATCH 295 of 307] x86: AVX512 intra_pred_ang32 mode 4 and 32 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1516018946 -19800
#  Mon Jan 15 17:52:26 2018 +0530
# Node ID a2b347ed81f90ac82f59d891deba7fa876df7f62
# Parent  1107c2def5f9dbee9947a2c9c41f50961fa31bc6
x86: AVX512 intra_pred_ang32 mode 4 and 32 high bit depth
Mode | AVX2 performance | AVX512 performance
---
 4   |9.1x |   14.6x
 32  |11.35x   |   20.85x

diff -r 1107c2def5f9 -r a2b347ed81f9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jan 15 12:22:40 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jan 15 17:52:26 2018 +0530
@@ -3107,6 +3107,9 @@
 p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512);
 p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512);
 p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512);
+p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx512);
+p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512);
+
 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
diff -r 1107c2def5f9 -r a2b347ed81f9 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jan 15 12:22:40 2018 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jan 15 17:52:26 2018 +0530
@@ -19303,9 +19303,363 @@
 lea r3,[ang_table_avx2 + 16 * 32]
 add r1d,   r1d
 lea r4,[r1 * 3]
-
 callang16_mode_5_31
 RET
+;; angle 16, modes 4 and 32
+cglobal ang16_mode_4_32
+testr6d, r6d
+
+vbroadcasti32x8m0, [r2 + 2]; [16 15 14 13 
12 11 10  9  8  7  6  5  4  3  2  1]
+vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 
13 12 11 10  9  8  7  6  5  4  3  2]
+
+punpcklwd   m3, m0, m1  ; [13 12 12 11 11 10 10  9 
 5  4  4  3  3  2  2  1]
+punpckhwd   m0, m1  ; [17 16 16 15 15 14 14 13 
 9  8  8  7  7  6  6  5]
+
+vbroadcasti32x8m1, [r2 + 18]   ; [24 23 22 21 
20 19 18 17 16 15 14 13 12 11 10  9]
+vbroadcasti32x8m4, [r2 + 20]   ; [25 24 23 22 
21 20 19 18 17 16 15 14 13 12 11 10]
+punpcklwd   m2, m1, m4  ; [21 20 20 19 19 18 18 17 
13 12 12 11 11 10 10  9]
+punpckhwd   m1, m4  ; [25 24 24 23 23 22 22 21 
17 16 16 15 15 14 14 13]
+
+pmaddwd m4, m3, [r3 + 3 * 32]   ; [21]
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, [r3 + 3 * 32]
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+
+palignr m6, m0, m3, 4   ; [14 13 13 12 12 11 11 10 
 6  5  5  4  4  3  3  2]
+palignr m7, m2, m0, 4   ; [18 17 17 16 16 15 15 14 
10  9  9  8  8  7  7  6]
+movuym16,[r3 - 8 * 32]   ; [10]
+vinserti32x8m16, [r3 + 13 * 32] ,1 ; [31]
+pmaddwd m5, m6, m16
+paddd   m5, m15
+psrld   m5, 5
+pmaddwd m8, m7,m16
+paddd   m8, m15
+psrld   m8, 5
+packusdwm5, m8
+vextracti32x8   ym6, m5, 1
+
+
+palignr m7, m0, m3, 8   ; [15 14 14 13 13 12 12 11 
 7  6  6  5  5  4  4  3]
+pmaddwd m7, [r3 + 2 * 32]   ; [20]
+paddd   m7, m15
+psrld   m7, 5
+palignr m8, m2, m0, 8   ; [19 18 18 17 17 16 16 15 
11 10 10  9  9  8  8  7]
+pmaddwd m8, [r3 + 2 * 32]
+paddd   m8, m15
+psrld   m8, 5
+packusdwm7, m8
+
+palignr m9, m0, m3, 12
+palignr m3, m2, m0, 12
+movuym16,[r3 - 9 * 32]   ; [9]
+vinserti32x8m16, [r3 + 12 * 32] ,1  ; [30]
+pmaddwd m8, m9, m16
+paddd   m8, m15
+psrld   m8, 5
+pmaddwd m10, m3,m16
+paddd   m10,m15
+psrld   m10, 5
+packusdwm8, m10
+vextracti32x8   ym9, m8, 1
+
+
+pmaddwd m10, m0, [r3 + 1 * 32]  ; [19]
+paddd   m10,m15
+psrld   m10, 5
+pmaddwd m3, m2, [r3 + 1 * 32]
+paddd   m3, m15
+psrld   m3, 5
+packusdwm10, m3
+
+palignr m11, m2, m0, 4
+pmaddwd m11, [r3 - 10 * 32] ; [8]
+paddd   m11, m15
+psrld   m11, 5
+palignr m3, m1, m2, 4
+pmaddwd m3, [r3 - 10 * 32]
+paddd   m3, m15
+psrld   m3, 5
+packusdwm11, m3
+
+TRANSPOSE_STORE_AVX2 4, 5, 6, 

[x265] [PATCH 302 of 307] X86:AVX512 intra_pred_ang32 mode 7 and 29 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1516212669 28800
#  Wed Jan 17 10:11:09 2018 -0800
# Node ID ae3f7bd65b45df716f1cd56b6b15d91643772621
# Parent  3a08a957d4cd2bf0eb57524651a824513378e0a3
X86:AVX512 intra_pred_ang32 mode 7 and 29 high bit depth

Mode | AVX2 performance | AVX512 performance
---
 7   |9.2x  | 11.45x
 29  |16.2x | 24.0x

diff -r 3a08a957d4cd -r ae3f7bd65b45 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jan 29 20:05:49 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp  Wed Jan 17 10:11:09 2018 -0800
@@ -3111,6 +3111,9 @@
 p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512);
 p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx512);
 p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512);
+p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512);
+p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512);
+
 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
diff -r 3a08a957d4cd -r ae3f7bd65b45 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jan 29 20:05:49 2018 -0800
+++ b/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800
@@ -20311,7 +20311,318 @@
 
 callang32_mode_8_28
 RET
-
+;; angle 16, modes 7 and 29
+cglobal ang16_mode_7_29
+testr6d, r6d
+
+vbroadcasti32x8 m0, [r2 + 2]; [16 15 14 13 
12 11 10  9  8  7  6  5  4  3  2  1]
+vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 
13 12 11 10  9  8  7  6  5  4  3  2]
+
+punpcklwd   m3, m0, m1  ; [13 12 12 11 11 10 10  9 
 5  4  4  3  3  2  2  1]
+punpckhwd   m0, m1  ; [17 16 16 15 15 14 14 13 
 9  8  8  7  7  6  6  5]
+
+vbroadcasti32x8 m2, [r2 + 18]   ; [24 23 22 21 
20 19 18 17 16 15 14 13 12 11 10  9]
+vbroadcasti32x8m4, [r2 + 20]   ; [25 24 23 22 
21 20 19 18 17 16 15 14 13 12 11 10]
+punpcklwd   m2, m4  ; [21 20 20 19 19 18 18 17 
13 12 12 11 11 10 10  9]
+
+movuym16, [r3 - 8 * 32]   ; [9]
+vinserti32x8m16,  [r3 + 1 * 32] ,1  ; [18]
+pmaddwd m4, m3,m16
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, m16
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+vextracti32x8   ym5, m4, 1
+
+pmaddwd m6, m3, [r3 + 10 * 32]  ; [27]
+paddd   m6, m15
+psrld   m6, 5
+pmaddwd m9, m0, [r3 + 10 * 32]
+paddd   m9, m15
+psrld   m9, 5
+packusdwm6, m9
+
+palignr m10, m0, m3, 4
+pmaddwd m7, m10, [r3 - 13 * 32] ; [4]
+paddd   m7, m15
+psrld   m7, 5
+palignr m11, m2, m0, 4
+pmaddwd m8, m11, [r3 - 13 * 32]
+paddd   m8, m15
+psrld   m8, 5
+packusdwm7, m8
+
+movuym16,  [r3 - 4 * 32]  ; [13]
+vinserti32x8m16,  [r3 + 5 * 32],1  ; [22]
+pmaddwd m8, m10, m16
+paddd   m8, m15
+psrld   m8, 5
+pmaddwd m9, m11, m16
+paddd   m9, m15
+psrld   m9, 5
+packusdwm8, m9
+vextracti32x8   ym9, m8, 1
+
+pmaddwd m10, [r3 + 14 * 32] ; [31]
+paddd   m10, m15
+psrld   m10, 5
+pmaddwd m11, [r3 + 14 * 32]
+paddd   m11, m15
+psrld   m11, 5
+packusdwm10, m11
+
+palignr m11, m0, m3, 8
+pmaddwd m11, [r3 - 9 * 32]  ; [8]
+paddd   m11, m15
+psrld   m11, 5
+palignr m12, m2, m0, 8
+pmaddwd m12, [r3 - 9 * 32]
+paddd   m12, m15
+psrld   m12, 5
+packusdwm11, m12
+
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
+
+palignr m5, m0, m3, 8
+palignr m6, m2, m0, 8
+movuym16, [r3]; [17]
+vinserti32x8m16,  [r3 + 9 * 32] ,1  ; [26]
+pmaddwd m4, m5, m16
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m7, m6, m16
+paddd   m7, m15
+psrld   m7, 5
+packusdwm4, m7
+vextracti32x8   ym5, m4, 1
+
+
+palignr m9, m0, m3, 12
+palignr m3, m2, m0, 12
+movuym16, [r3 - 14 * 32]  ; [3]
+vinserti32x8m16, [r3 - 5 * 32] ,1  ; 

[x265] [PATCH 299 of 307] X86: AVX512 intra_pred_ang32 mode 8 and 28 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1515789616 28800
#  Fri Jan 12 12:40:16 2018 -0800
# Node ID 624c83571d1df840e1206c46e589044fbf87ff32
# Parent  b0d00ca83af0cb2053d6eda82b6d4081236a0f5f
X86: AVX512 intra_pred_ang32 mode 8 and 28 high bit depth

Mode | AVX2 performance | AVX512 performance
---
 8   |9.15x |9.60x
 28  |11.30x|12.13x

diff -r b0d00ca83af0 -r 624c83571d1d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Jan 16 15:38:58 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jan 12 12:40:16 2018 -0800
@@ -3115,6 +3115,9 @@
 p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
 p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
 p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512);
+p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512);
+p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512);
+
 p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512);
 p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512);
 p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512);
diff -r b0d00ca83af0 -r 624c83571d1d source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jan 16 15:38:58 2018 +0530
+++ b/source/common/x86/intrapred16.asm Fri Jan 12 12:40:16 2018 -0800
@@ -20016,9 +20016,302 @@
 lea r3,[ang_table_avx2 + 15 * 32]
  shl r1d,   1
 lea r4,[r1 * 3]
-
 callang16_mode_6_30
 RET
+
+;; angle 16, modes 8 and 28
+cglobal ang16_mode_8_28
+testr6d, r6d
+
+vbroadcasti32x8m0, [r2 + 2] ; [16 15 14 13 12 11 10  9 
 8  7  6  5  4  3  2  1]
+vbroadcasti32x8m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 
 9  8  7  6  5  4  3  2]
+
+punpcklwd   m3, m0, m1  ; [13 12 12 11 11 10 10  9 
 5  4  4  3  3  2  2  1]
+punpckhwd   m0, m1  ; [17 16 16 15 15 14 14 13 
 9  8  8  7  7  6  6  5]
+
+vbroadcasti32x8m2, [r2 + 18]; [24 23 22 21 20 19 18 17 
16 15 14 13 12 11 10  9]
+vbroadcasti32x8m4, [r2 + 20]; [25 24 23 22 21 20 19 18 
17 16 15 14 13 12 11 10]
+punpcklwd   m2, m4  ; [21 20 20 19 19 18 18 17 
13 12 12 11 11 10 10  9]
+
+movuym14, [r3 - 10 * 32]
+vinserti32x8m14, [r3 - 5 * 32], 1
+pmaddwd m4, m3, m14; [5], [10]
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, m14
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+vextracti32x8   ym5, m4, 1
+
+movuym14, [r3]
+vinserti32x8m14, [r3 + 5 * 32], 1
+pmaddwd m6, m3, m14; [15], [20]
+paddd   m6, m15
+psrld   m6, 5
+pmaddwd m9, m0, m14
+paddd   m9, m15
+psrld   m9, 5
+packusdwm6, m9
+vextracti32x8   ym7, m6, 1
+
+movuym14, [r3 + 10 * 32]
+vinserti32x8m14, [r3 +  15 * 32], 1
+pmaddwd m8, m3, m14 ; [25], [30]
+paddd   m8, m15
+psrld   m8, 5
+pmaddwd m9, m0, m14
+paddd   m9, m15
+psrld   m9, 5
+packusdwm8, m9
+vextracti32x8   ym9, m8, 1
+
+palignr m11, m0, m3, 4
+movuym14, [r3 - 12 * 32]
+vinserti32x8m14, [r3 - 7 * 32], 1
+pmaddwd m10, m11, m14 ; [3], [8]
+paddd   m10, m15
+psrld   m10, 5
+palignr m1, m2, m0, 4
+pmaddwd m12, m1, m14
+paddd   m12, m15
+psrld   m12, 5
+packusdwm10, m12
+vextracti32x8   ym11, m10, 1
+
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
+
+palignr m7, m0, m3, 4
+movuym14, [r3 - 2 * 32]
+vinserti32x8m14, [r3 + 3 * 32], 1
+pmaddwd m4, m7, m14  ; [13], [18]
+paddd   m4, m15
+psrld   m4, 5
+palignr m1, m2, m0, 4
+pmaddwd m5, m1, m14
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+vextracti32x8   ym5, m4, 1
+
+movuym14, [r3 + 8 * 32]
+vinserti32x8m14, [r3 + 13 * 32], 1
+pmaddwd m6, m7, m14  ; [23], [28]
+paddd   m6, m15
+psrld   m6, 5
+pmaddwd m8, m1, m14
+paddd   m8, m15
+psrld   m8, 5
+packusdwm6, m8
+vextracti32x8   ym7, m6, 1
+
+movuym14, [r3 - 14 * 32]
+

[x265] [PATCH 289 of 307] x86: AVX512 intra_pred_ang32 mode 11 and 25, intra_pred_ang16 mode 11 and 25

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1515661273 -19800
#  Thu Jan 11 14:31:13 2018 +0530
# Node ID d43237051962eab3cd761cf24f3971de09c07aa5
# Parent  5a90661c7fbf2fbacbd6b8afde64368147c29674
x86: AVX512 intra_pred_ang32 mode 11 and 25, intra_pred_ang16 mode 11 and 25

Size | Mode | AVX2 performance | AVX512 performance
---
16   | 11   | 8.68x|   9.27x
16   | 25   | 11.11x   |   14.26x
32   | 11   | 6.54x|   11.19x
32   | 25   | 12.40x   |   14.86x

diff -r 5a90661c7fbf -r d43237051962 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jan 11 09:13:56 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jan 11 14:31:13 2018 +0530
@@ -3100,12 +3100,16 @@
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx512);
 p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512);
 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
+p.cu[BLOCK_32x32].intra_pred[11]= PFX(intra_pred_ang32_11_avx512);
 p.cu[BLOCK_32x32].intra_pred[18]= PFX(intra_pred_ang32_18_avx512);
+p.cu[BLOCK_32x32].intra_pred[25]= PFX(intra_pred_ang32_25_avx512);
 p.cu[BLOCK_32x32].intra_pred[26]= PFX(intra_pred_ang32_26_avx512);
 p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512);
+
 p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
+p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512);
+p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512);
 p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512);
-
 p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
diff -r 5a90661c7fbf -r d43237051962 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Jan 11 09:13:56 2018 +0530
+++ b/source/common/x86/intrapred16.asm Thu Jan 11 14:31:13 2018 +0530
@@ -18779,6 +18779,158 @@
 add r2,2
 callang16_mode_9_27
 RET
+;; angle 16, modes 11 and 25
+cglobal ang16_mode_11_25
+testr6d, r6d
+
+vbroadcasti32x8  m0, [r2]; [15 14 13 12 11 10  9  
8  7  6  5  4  3  2  1  0]
+vbroadcasti32x8  m1, [r2 + 2]; [16 15 14 13 12 11 10  
9  8  7  6  5  4  3  2  1]
+
+punpcklwd   m3, m0, m1  ; [12 11 11 10 10  9  9  8 
 4  3  3  2  2  1  1  0]
+punpckhwd   m0, m1  ; [16 15 15 14 14 13 13 12 
 8  7  7  6  6  5  5  4]
+
+movuym16, [r3 + 14 * 32]  ; [30]
+vinserti32x8m16, [r3 + 12 * 32], 1   ; [28]
+pmaddwd m4, m3, m16
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, m16
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+vextracti32x8   ym5, m4, 1
+movuym16, [r3 + 10 * 32]  ; [26]
+vinserti32x8m16, [r3 + 8 * 32], 1; [24]
+pmaddwd m6, m3, m16
+paddd   m6, m15
+psrld   m6, 5
+pmaddwd m9, m0, m16
+paddd   m9, m15
+psrld   m9, 5
+packusdwm6, m9
+vextracti32x8   ym7, m6, 1
+movuym16, [r3 + 6 * 32]   ; [22]
+vinserti32x8m16, [r3 + 4 * 32], 1; [20]
+pmaddwd m8, m3, m16
+paddd   m8, m15
+psrld   m8, 5
+pmaddwd m9, m0, m16
+paddd   m9, m15
+psrld   m9, 5
+packusdwm8, m9
+vextracti32x8   ym9, m8, 1
+movuym16, [r3 + 2 * 32]   ; [18]
+vinserti32x8m16, [r3], 1 ; [16]
+pmaddwd m10, m3, m16
+paddd   m10, m15
+psrld   m10, 5
+pmaddwd m1, m0, m16
+paddd   m1, m15
+psrld   m1, 5
+packusdwm10, m1
+vextracti32x8   ym11, m10, 1
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
+
+movuym16, [r3 - 2 * 32] ; [14]
+vinserti32x8m16, [r3 - 4 * 32], 1  ; [12]
+pmaddwd m4, m3, m16
+paddd   m4, m15
+psrld   m4, 5
+pmaddwd m5, m0, m16
+paddd   m5, m15
+psrld   m5, 5
+packusdwm4, m5
+vextracti32x8   ym5, m4, 1
+movuym16, [r3 - 6 * 32] ; [10]
+vinserti32x8m16, [r3 - 8 * 32], 1  ; [8]
+pmaddwd m6, m3, m16
+paddd   m6, m15
+psrld   m6, 5
+pmaddwd m8, m0, m16
+paddd   m8, m15
+psrld   m8, 5
+

[x265] [PATCH 303 of 307] X86:AVX512 intra_pred_ang16 mode 7 and 29 high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashree
# Date 1516212669 28800
#  Wed Jan 17 10:11:09 2018 -0800
# Node ID f56354b2b542aaafa389a226f0fb3b41e4d33803
# Parent  ae3f7bd65b45df716f1cd56b6b15d91643772621
X86:AVX512 intra_pred_ang16 mode 7 and 29 high bit depth
Mode | AVX2 performance | AVX512 performance
---
 7   |9.2x  | 11.9x
 29  |17.3x | 24.30x

diff -r ae3f7bd65b45 -r f56354b2b542 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Jan 17 10:11:09 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp  Wed Jan 17 10:11:09 2018 -0800
@@ -3127,6 +3127,8 @@
 p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx512);
 p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx512);
 p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx512);
+p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx512);
+p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx512);
 p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu;
 p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu;
diff -r ae3f7bd65b45 -r f56354b2b542 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800
+++ b/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800
@@ -20618,11 +20618,30 @@
 mov r0,r5
 
 callang16_mode_7_29
-
 add r2,8
-
 callang32_mode_7_29
 RET
+cglobal intra_pred_ang16_7, 3,7,17             ; 16x16 angular mode 7 entry point; the prediction itself is done by ang16_mode_7_29
+    add             r2, 64                     ; step 64 bytes into the reference buffer (presumably onto the left-neighbour samples - confirm against caller)
+    xor             r6d, r6d                   ; r6d = 0: flag examined by `test r6d, r6d` at the top of ang16_mode_7_29
+    vbroadcasti32x8 m15, [pd_16]               ; m15 = rounding term the kernel adds before its `psrld ..., 5`
+    lea             r3, [ang_table_avx2 + 17 * 32] ; biased table base: [r3 + k * 32] addresses table row 17 + k
+    add             r1d, r1d                   ; double the stride (presumably pixels -> bytes for 16-bit samples)
+    lea             r4, [r1 * 3]               ; r4 = 3 * stride
+
+    call            ang16_mode_7_29
+    RET
+
+cglobal intra_pred_ang16_29, 3,7,17            ; 16x16 angular mode 29 entry point; shares ang16_mode_7_29 with mode 7
+    xor             r6d, r6d
+    inc             r6d                        ; r6d = 1: flag examined by `test r6d, r6d` at the top of ang16_mode_7_29
+    vbroadcasti32x8 m15, [pd_16]               ; m15 = rounding term the kernel adds before its `psrld ..., 5`
+    lea             r3, [ang_table_avx2 + 17 * 32] ; biased table base: [r3 + k * 32] addresses table row 17 + k
+    add             r1d, r1d                   ; double the stride (presumably pixels -> bytes for 16-bit samples)
+    lea             r4, [r1 * 3]               ; r4 = 3 * stride
+
+    call            ang16_mode_7_29
+    RET
 
;---
 ; avx512 code for intra_pred_ang32 mode 2 to 34 end
 
;---
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 029 of 307] x86: AVX512 pixel_sad_x4_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500263597 -19800
#  Mon Jul 17 09:23:17 2017 +0530
# Node ID 576a93cba7d189fddba3466a21188f0ece3ed278
# Parent  229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77
x86: AVX512 pixel_sad_x4_48x64

AVX2 performance :  59.49x
AVX512 performance: 62.29x

diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jul 17 08:27:14 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jul 17 09:23:17 2017 +0530
@@ -3756,6 +3756,7 @@
 p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
 p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
 p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
+p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
 
 p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
 p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Mon Jul 17 08:27:14 2017 +0530
+++ b/source/common/x86/sad-a.asm   Mon Jul 17 09:23:17 2017 +0530
@@ -4348,6 +4348,154 @@
 paddd   m3, m4
 %endmacro
 
+%macro SAD_X4_48x8_AVX512 0
+movuym4, [r0]
+vinserti32x8m4, [r0 + FENC_STRIDE], 1
+movuym5, [r1]
+vinserti32x8m5, [r1 + r5], 1
+movuym6, [r2]
+vinserti32x8m6, [r2 + r5], 1
+movuym7, [r3]
+vinserti32x8m7, [r3 + r5], 1
+movuym8, [r4]
+vinserti32x8m8, [r4 + r5], 1
+
+psadbw  m9, m4, m5
+paddd   m0, m9
+psadbw  m5, m4, m6
+paddd   m1, m5
+psadbw  m6, m4, m7
+paddd   m2, m6
+psadbw  m4, m8
+paddd   m3, m4
+
+movuym4, [r0 + FENC_STRIDE * 2]
+vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
+movuym5, [r1 + r5 * 2]
+vinserti32x8 m5, [r1 + r7], 1
+movuym6, [r2 + r5 * 2]
+vinserti32x8 m6, [r2 + r7], 1
+movuym7, [r3 + r5 * 2]
+vinserti32x8 m7, [r3 + r7], 1
+movuym8, [r4 + r5 * 2]
+vinserti32x8 m8, [r4 + r7], 1
+
+psadbw  m9, m4, m5
+paddd   m0, m9
+psadbw  m5, m4, m6
+paddd   m1, m5
+psadbw  m6, m4, m7
+paddd   m2, m6
+psadbw  m4, m8
+paddd   m3, m4
+
+movu   xm4, [r0 + 32]
+vinserti32x4m4, [r0 + FENC_STRIDE + 32], 1
+vinserti32x4m4, [r0 + FENC_STRIDE * 2 + 32], 2
+vinserti32x4m4, [r0 + FENC_STRIDE * 3 + 32], 3
+movu   xm5, [r1 + 32]
+vinserti32x4m5, [r1 + r5 + 32], 1
+vinserti32x4m5, [r1 + r5 * 2 + 32], 2
+vinserti32x4m5, [r1 + r7 + 32], 3
+movu   xm6, [r2 + 32]
+vinserti32x4m6, [r2 + r5 + 32], 1
+vinserti32x4m6, [r2 + r5 * 2 + 32], 2
+vinserti32x4m6, [r2 + r7 + 32], 3
+movu   xm7, [r3 + 32]
+vinserti32x4m7, [r3 + r5 + 32], 1
+vinserti32x4m7, [r3 + r5 * 2 + 32], 2
+vinserti32x4m7, [r3 + r7 + 32], 3
+movu   xm8, [r4 + 32]
+vinserti32x4m8, [r4 + r5 + 32], 1
+vinserti32x4m8, [r4 + r5 * 2 + 32], 2
+vinserti32x4m8, [r4 + r7 + 32], 3
+
+psadbw  m9, m4, m5
+paddd   m0, m9
+psadbw  m5, m4, m6
+paddd   m1, m5
+psadbw  m6, m4, m7
+paddd   m2, m6
+psadbw  m4, m8
+paddd   m3, m4
+
+add r0, FENC_STRIDE * 4
+lea r1, [r1 + r5 * 4]
+lea r2, [r2 + r5 * 4]
+lea r3, [r3 + r5 * 4]
+lea r4, [r4 + r5 * 4]
+
+movuym4, [r0]
+vinserti32x8m4, [r0 + FENC_STRIDE], 1
+movuym5, [r1]
+vinserti32x8m5, [r1 + r5], 1
+movuym6, [r2]
+vinserti32x8m6, [r2 + r5], 1
+movuym7, [r3]
+vinserti32x8m7, [r3 + r5], 1
+movuym8, [r4]
+vinserti32x8m8, [r4 + r5], 1
+
+psadbw  m9, m4, m5
+paddd   m0, m9
+psadbw  m5, m4, m6
+paddd   m1, m5
+psadbw  m6, m4, m7
+paddd   m2, m6
+psadbw  m4, m8
+paddd   m3, m4
+
+movuym4, [r0 + FENC_STRIDE * 2]
+vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
+movuym5, [r1 + r5 * 2]
+vinserti32x8 m5, [r1 + r7], 1
+movuym6, [r2 + r5 * 2]
+vinserti32x8 m6, [r2 + r7], 1
+movuym7, [r3 + r5 * 2]
+vinserti32x8 m7, [r3 + r7], 1
+movuym8, [r4 + r5 * 2]
+vinserti32x8 m8, [r4 + r7], 1
+
+psadbw  m9, m4, m5
+paddd   m0, m9
+psadbw  m5, m4, m6
+paddd   m1, m5
+psadbw  m6, m4, m7
+paddd   m2, m6
+ 

[x265] [PATCH 026 of 307] x86: AVX512 pixel_sad_x3_W64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500013190 -19800
#  Fri Jul 14 11:49:50 2017 +0530
# Node ID 20ca79c2c6a803e2c6caf0c1dc87fb211ea9f708
# Parent  3183189cf8a0f1b95c31ecc39dd07b220ec53cea
x86: AVX512 pixel_sad_x3_W64

Size  | AVX2 performance | AVX512 performance
-
64x16 | 64.76x   |  95.17x
64x32 | 71.08x   | 106.10x
64x48 | 71.45x   | 108.12x
64x64 | 75.57x   | 110.06x

diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jul 14 11:21:54 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jul 14 11:49:50 2017 +0530
@@ -3736,6 +3736,11 @@
 p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
 p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
 
+p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
+p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
+p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
+p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
+
 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Fri Jul 14 11:21:54 2017 +0530
+++ b/source/common/x86/sad-a.asm   Fri Jul 14 11:49:50 2017 +0530
@@ -6129,6 +6129,263 @@
 RET
 %endif
 
+;
+;sad_x3 avx512 code start
+;
+; SAD_X3_64x8_AVX512: accumulate sums of absolute differences between one
+; source block and three candidate blocks over 8 rows, one full vector
+; register per row.
+;   In:   r0 = source block, rows FENC_STRIDE bytes apart
+;         r1, r2, r3 = the three candidate blocks, rows r4 bytes apart
+;         r6 = offset of the fourth row (presumably 3 * r4; it is set by the
+;              caller, which is not visible here - confirm)
+;         m0, m1, m2 = running SAD accumulators for candidates r1, r2, r3
+;   Out:  m0, m1, m2 updated; r0-r3 are left advanced by 4 rows (not 8)
+;   Clobbers: m3-m7
+%macro SAD_X3_64x8_AVX512 0
+    ; row 0
+    movu            m3, [r0]
+    movu            m4, [r1]
+    movu            m5, [r2]
+    movu            m6, [r3]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 1
+    movu            m3, [r0 + FENC_STRIDE]
+    movu            m4, [r1 + r4]
+    movu            m5, [r2 + r4]
+    movu            m6, [r3 + r4]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 2
+    movu            m3, [r0 + FENC_STRIDE * 2]
+    movu            m4, [r1 + r4 * 2]
+    movu            m5, [r2 + r4 * 2]
+    movu            m6, [r3 + r4 * 2]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 3
+    movu            m3, [r0 + FENC_STRIDE * 3]
+    movu            m4, [r1 + r6]
+    movu            m5, [r2 + r6]
+    movu            m6, [r3 + r6]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; step all four pointers down 4 rows
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r4 * 4]
+    lea             r2, [r2 + r4 * 4]
+    lea             r3, [r3 + r4 * 4]
+
+    ; row 4
+    movu            m3, [r0]
+    movu            m4, [r1]
+    movu            m5, [r2]
+    movu            m6, [r3]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 5
+    movu            m3, [r0 + FENC_STRIDE]
+    movu            m4, [r1 + r4]
+    movu            m5, [r2 + r4]
+    movu            m6, [r3 + r4]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 6
+    movu            m3, [r0 + FENC_STRIDE * 2]
+    movu            m4, [r1 + r4 * 2]
+    movu            m5, [r2 + r4 * 2]
+    movu            m6, [r3 + r4 * 2]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+
+    ; row 7
+    movu            m3, [r0 + FENC_STRIDE * 3]
+    movu            m4, [r1 + r6]
+    movu            m5, [r2 + r6]
+    movu            m6, [r3 + r6]
+
+    psadbw          m7, m3, m4
+    paddd           m0, m7
+    psadbw          m4, m3, m5
+    paddd           m1, m4
+    psadbw          m3, m6
+    paddd           m2, m3
+%endmacro
+
+%macro PIXEL_SAD_X3_END_AVX512 0
+vextracti32x8   ym3, m0, 1
+vextracti32x8   ym4, m1, 1
+vextracti32x8   ym5, m2, 1
+paddd   ym0, ym3
+paddd   ym1, ym4
+paddd   ym2, ym5
+vextracti64x2   xm3, m0, 1
+vextracti64x2   xm4, m1, 1
+vextracti64x2   xm5, m2, 1
+paddd   xm0, xm3
+paddd   xm1, xm4
+paddd   xm2, xm5
+pshufd  xm3, xm0, 2

[x265] [PATCH 030 of 307] x86: AVX512 convert_p2s 64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499858302 -19800
#  Wed Jul 12 16:48:22 2017 +0530
# Node ID a77082ebfa67b40f3dbb8cd45b54c17e710a104c
# Parent  576a93cba7d189fddba3466a21188f0ece3ed278
x86: AVX512 convert_p2s 64xN

Size| AVX2 performance | AVX512 performance

64x16   | 2.05x   |  3.77x
64x32   | 2.16x   |  3.88x
64x48   | 2.13x   |  3.91x
64x64   | 2.16x   |  4.00x

diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Jul 12 16:48:22 2017 +0530
@@ -3832,6 +3832,11 @@
 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = 
PFX(pixel_sub_ps_32x32_avx512);
 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = 
PFX(pixel_sub_ps_32x64_avx512);
 
+p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512);
+p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
+p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
+p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+
 }
 #endif
 }
diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Mon Jul 17 09:23:17 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Jul 12 16:48:22 2017 +0530
@@ -2269,6 +2269,186 @@
 P2S_H_64xN_avx2 48
 
 ;-
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-
+; PROCESS_P2S_64x8_AVX512: convert 8 rows of pixels to int16.
+; Each row is handled as two vectors: mmsize/2 source bytes are zero-extended
+; to mmsize bytes of words, shifted left by 6, and m4 is subtracted.
+;   In:   r0 = src, r1 = src row offset, r5 = 3 * r1
+;         r2 = dst, r3 = dst row offset in bytes, r6 = 3 * r3
+;         m4 = word offset subtracted from every sample
+;   Out:  r0 and r2 are left advanced by 4 rows (not 8); the caller must
+;         step another 4 rows before processing the next group
+;   Clobbers: m0-m3
+%macro PROCESS_P2S_64x8_AVX512 0
+    ; rows 0 and 1
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + mmsize/2]
+    pmovzxbw        m2, [r0 + r1]
+    pmovzxbw        m3, [r0 + r1 + mmsize/2]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + mmsize], m1
+    movu            [r2 + r3], m2
+    movu            [r2 + r3 + mmsize], m3
+
+    ; rows 2 and 3
+    pmovzxbw        m0, [r0 + r1 * 2]
+    pmovzxbw        m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw        m2, [r0 + r5]
+    pmovzxbw        m3, [r0 + r5 + mmsize/2]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r3 * 2 + mmsize], m1
+    movu            [r2 + r6], m2
+    movu            [r2 + r6 + mmsize], m3
+
+    ; step src and dst down 4 rows
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    ; rows 4 and 5
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + mmsize/2]
+    pmovzxbw        m2, [r0 + r1]
+    pmovzxbw        m3, [r0 + r1 + mmsize/2]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + mmsize], m1
+    movu            [r2 + r3], m2
+    movu            [r2 + r3 + mmsize], m3
+
+    ; rows 6 and 7
+    pmovzxbw        m0, [r0 + r1 * 2]
+    pmovzxbw        m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw        m2, [r0 + r5]
+    pmovzxbw        m3, [r0 + r5 + mmsize/2]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r3 * 2 + mmsize], m1
+    movu            [r2 + r6], m2
+    movu            [r2 + r6 + mmsize], m3
+%endmacro
+
+; 64x64 pixel -> int16 conversion: eight 8-row groups handled by
+; PROCESS_P2S_64x8_AVX512.
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x64, 3, 7, 5
+    mov             r3d, r3m                   ; fourth argument (dstStride) is fetched by hand
+    add             r3d, r3d                   ; dst stride in bytes (2 bytes per int16)
+    lea             r5, [r1 * 3]               ; r5 = 3 * src stride
+    lea             r6, [r3 * 3]               ; r6 = 3 * dst stride in bytes
+
+    ; m4 = offset subtracted from every shifted sample
+    vpbroadcastd    m4, [pw_2000]
+
+    ; The macro leaves r0/r2 advanced by 4 rows, so each of the first seven
+    ; groups is followed by a further 4-row step to reach the next group.
+%rep 7
+    PROCESS_P2S_64x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_64x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_64x48, 3, 7, 9
+mov r3d, r3m
+add r3d, r3d
+lea r5, [r1 * 3]
+lea r6, [r3 * 3]
+
+; load constant
+vpbroadcastd 

[x265] [PATCH 031 of 307] x86: AVX512 convert_p2s_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500445753 -19800
#  Wed Jul 19 11:59:13 2017 +0530
# Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
# Parent  a77082ebfa67b40f3dbb8cd45b54c17e710a104c
x86: AVX512 convert_p2s_32xN

Size| AVX2 performance | AVX512 performance

32x8| 1.51x   |  1.54x
32x16   | 2.18x   |  3.62x
32x24   | 2.26x   |  3.58x
32x32   | 2.28x   |  3.94x
32x64   | 2.20x   |  4.06x

diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Jul 19 11:59:13 2017 +0530
@@ -3836,6 +3836,19 @@
 p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
 p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
 p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
 
 }
 #endif
diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Jul 19 11:59:13 2017 +0530
@@ -1956,6 +1956,184 @@
 ;-
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-
+; PROCESS_P2S_32x8_AVX512: convert 8 rows of pixels to int16, one vector per
+; row: source bytes are zero-extended to words, shifted left by 6, and m4 is
+; subtracted.
+;   In:   r0 = src, r1 = src row offset, r5 = 3 * r1
+;         r2 = dst, r3 = dst row offset in bytes, r6 = 3 * r3
+;         m4 = word offset subtracted from every sample
+;   Out:  r0 and r2 are left advanced by 4 rows (not 8); the caller must
+;         step another 4 rows before processing the next group
+;   Clobbers: m0-m3
+%macro PROCESS_P2S_32x8_AVX512 0
+    ; rows 0-3
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+
+    ; step src and dst down 4 rows
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    ; rows 4-7
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+%endmacro
+
+; 32x8 pixel -> int16 conversion: a single PROCESS_P2S_32x8_AVX512 group.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x8, 3, 7, 5
+    lea             r5, [r1 * 3]               ; r5 = 3 * src stride
+    mov             r3d, r3m                   ; fourth argument (dstStride) is fetched by hand
+    add             r3d, r3d                   ; dst stride in bytes (2 bytes per int16)
+    lea             r6, [r3 * 3]               ; r6 = 3 * dst stride in bytes
+
+    ; m4 = offset subtracted from every shifted sample
+    vpbroadcastd    m4, [pw_2000]
+
+    PROCESS_P2S_32x8_AVX512
+    RET
+
+; 32x16 pixel -> int16 conversion: two PROCESS_P2S_32x8_AVX512 groups.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x16, 3, 7, 5
+    lea             r5, [r1 * 3]               ; r5 = 3 * src stride
+    mov             r3d, r3m                   ; fourth argument (dstStride) is fetched by hand
+    add             r3d, r3d                   ; dst stride in bytes (2 bytes per int16)
+    lea             r6, [r3 * 3]               ; r6 = 3 * dst stride in bytes
+
+    ; m4 = offset subtracted from every shifted sample
+    vpbroadcastd    m4, [pw_2000]
+
+    PROCESS_P2S_32x8_AVX512                    ; rows 0-7
+    lea             r0, [r0 + r1 * 4]          ; macro only advanced 4 rows; finish the 8-row step
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512                    ; rows 8-15
+    RET
+
+; 32x24 pixel -> int16 conversion: three PROCESS_P2S_32x8_AVX512 groups.
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x24, 3, 7, 5
+    lea             r5, [r1 * 3]               ; r5 = 3 * src stride
+    mov             r3d, r3m                   ; fourth argument (dstStride) is fetched by hand
+    add             r3d, r3d                   ; dst stride in bytes (2 bytes per int16)
+    lea             r6, [r3 * 3]               ; r6 = 3 * dst stride in bytes
+
+    ; m4 = offset subtracted from every shifted sample
+    vpbroadcastd    m4, [pw_2000]
+
+    PROCESS_P2S_32x8_AVX512                    ; rows 0-7
+    lea             r0, [r0 + r1 * 4]          ; macro only advanced 4 rows; finish the 8-row step
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512                    ; rows 8-15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512                    ; rows 16-23
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x32, 3, 7, 5
+mov r3d, r3m
+add r3d, r3d
+lea r5, 

[x265] [PATCH 033 of 307] x86: AVX512 fix convert_p2s_64xN,48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500536572 -19800
#  Thu Jul 20 13:12:52 2017 +0530
# Node ID bf9a9cd255216300408506d10d4ff8bc87a15845
# Parent  97d5ab44b6da2db69584875c2dde97aef5533d9b
x86: AVX512 fix convert_p2s_64xN,48x64

diff -r 97d5ab44b6da -r bf9a9cd25521 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Jul 19 12:25:43 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Jul 20 13:12:52 2017 +0530
@@ -1953,9 +1953,6 @@
 P2S_H_32xN_avx2 64
 P2S_H_32xN_avx2 48
 
-;-
-; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
-;-
 %macro PROCESS_P2S_32x8_AVX512 0
 pmovzxbwm0, [r0]
 pmovzxbwm1, [r0 + r1]
@@ -1999,6 +1996,9 @@
 movu[r2 + r6],  m3
 %endmacro
 
+;-
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-
 INIT_ZMM avx512
 cglobal filterPixelToShort_32x8, 3, 7, 5
 mov r3d, r3m
@@ -2446,9 +2446,6 @@
 P2S_H_64xN_avx2 32
 P2S_H_64xN_avx2 48
 
-;-
-; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
-;-
 %macro PROCESS_P2S_64x8_AVX512 0
 pmovzxbwm0, [r0]
 pmovzxbwm1, [r0 + mmsize/2]
@@ -2526,6 +2523,9 @@
 movu[r2 + r6 + mmsize], m3
 %endmacro
 
+;-
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-
 INIT_ZMM avx512
 cglobal filterPixelToShort_64x64, 3, 7, 5
 mov r3d, r3m
@@ -2561,14 +2561,14 @@
 RET
 
 INIT_ZMM avx512
-cglobal filterPixelToShort_64x48, 3, 7, 9
+cglobal filterPixelToShort_64x48, 3, 7, 5
 mov r3d, r3m
 add r3d, r3d
 lea r5, [r1 * 3]
 lea r6, [r3 * 3]
 
 ; load constant
-vpbroadcastd  m8, [pw_2000]
+vpbroadcastd  m4, [pw_2000]
 
 PROCESS_P2S_64x8_AVX512
 lea r0, [r0 + r1 * 4]
@@ -2589,14 +2589,14 @@
 RET
 
 INIT_ZMM avx512
-cglobal filterPixelToShort_64x32, 3, 7, 9
+cglobal filterPixelToShort_64x32, 3, 7, 5
 mov r3d, r3m
 add r3d, r3d
 lea r5, [r1 * 3]
 lea r6, [r3 * 3]
 
 ; load constant
-vpbroadcastd  m8, [pw_2000]
+vpbroadcastd  m4, [pw_2000]
 
 PROCESS_P2S_64x8_AVX512
 lea r0, [r0 + r1 * 4]
@@ -2611,14 +2611,14 @@
 RET
 
 INIT_ZMM avx512
-cglobal filterPixelToShort_64x16, 3, 7, 9
+cglobal filterPixelToShort_64x16, 3, 7, 5
 mov r3d, r3m
 add r3d, r3d
 lea r5, [r1 * 3]
 lea r6, [r3 * 3]
 
 ; load constant
-vpbroadcastd  m8, [pw_2000]
+vpbroadcastd  m4, [pw_2000]
 
 PROCESS_P2S_64x8_AVX512
 lea r0, [r0 + r1 * 4]
@@ -3047,9 +3047,6 @@
 jnz.loop
 RET
 
-;-
-; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
-;-
 %macro PROCESS_P2S_48x8_AVX512 0
 pmovzxbwm0, [r0]
 pmovzxbwm1, [r0 + r1]
@@ -3123,6 +3120,9 @@
 movu[r2 + r6 + 64],  ym3
 %endmacro
 
+;-
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-
 INIT_ZMM avx512
 cglobal filterPixelToShort_48x64, 3,7,5
 mov r3d, r3m
@@ -3131,7 +3131,7 @@
 lea r6, [r3 * 3]
 
 ; load constant
-vpbroadcastd m8, [pw_2000]
+vpbroadcastd m4, [pw_2000]
 
 PROCESS_P2S_48x8_AVX512
 lea r0, [r0 + r1 * 4]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 034 of 307] x86: AVX512 ssd_ss_64x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500528397 -19800
#  Thu Jul 20 10:56:37 2017 +0530
# Node ID 0320e60b3323546eb6767508f1c39cd088e9f03e
# Parent  bf9a9cd255216300408506d10d4ff8bc87a15845
x86: AVX512 ssd_ss_64x64

AVX2 performance   : 14.85x
AVX512 performance : 21.35x

diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jul 20 10:56:37 2017 +0530
@@ -3851,6 +3851,8 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = 
PFX(filterPixelToShort_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
 
+p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
+
 }
 #endif
 }
diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm   Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/ssd-a.asm   Thu Jul 20 10:56:37 2017 +0530
@@ -1377,7 +1377,124 @@
 HADDD   m2, m0
 movdeax, xm2
 RET
+;-
+; ssd_ss avx512 code start
+;-
+%macro PROCESS_SSD_SS_64x8_AVX512 0 ; m4/m5 += squared word differences of 8 rows ([r0] vs [r2], two vector loads per row); leaves r0/r2 advanced by 4 rows
+    movu            m0, [r0]                        ; row 0, first vector
+    movu            m1, [r0 + mmsize]               ; row 0, second vector
+    movu            m2, [r0 + r1]                   ; row 1
+    movu            m3, [r0 + r1 + mmsize]
 
+    psubw           m0, [r2]                        ; word-wise difference against the rows at r2
+    psubw           m1, [r2 + mmsize]
+    psubw           m2, [r2 + r3]
+    psubw           m3, [r2 + r3 + mmsize]
+    pmaddwd         m0, m0                          ; square each word and add adjacent pairs into dwords
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0                          ; two accumulators (m4, m5) keep the add chains independent
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+
+    movu            m0, [r0 + 2 * r1]               ; row 2
+    movu            m1, [r0 + 2 * r1 + mmsize]
+    movu            m2, [r0 + r5]                   ; row 3 (callers set r5 = 3 * r1)
+    movu            m3, [r0 + r5 + mmsize]
+
+    psubw           m0, [r2 + 2 * r3]
+    psubw           m1, [r2 + 2 * r3 + mmsize]
+    psubw           m2, [r2 + r6]                   ; callers set r6 = 3 * r3
+    psubw           m3, [r2 + r6 + mmsize]
+    pmaddwd         m0, m0
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+
+    lea             r0, [r0 + 4 * r1]               ; step both pointers to rows 4..7
+    lea             r2, [r2 + 4 * r3]
+
+    movu            m0, [r0]                        ; row 4
+    movu            m1, [r0 + mmsize]
+    movu            m2, [r0 + r1]                   ; row 5
+    movu            m3, [r0 + r1 + mmsize]
+
+    psubw           m0, [r2]
+    psubw           m1, [r2 + mmsize]
+    psubw           m2, [r2 + r3]
+    psubw           m3, [r2 + r3 + mmsize]
+    pmaddwd         m0, m0
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+
+    movu            m0, [r0 + 2 * r1]               ; row 6
+    movu            m1, [r0 + 2 * r1 + mmsize]
+    movu            m2, [r0 + r5]                   ; row 7
+    movu            m3, [r0 + r5 + mmsize]
+
+    psubw           m0, [r2 + 2 * r3]
+    psubw           m1, [r2 + 2 * r3 + mmsize]
+    psubw           m2, [r2 + r6]
+    psubw           m3, [r2 + r6 + mmsize]
+    pmaddwd         m0, m0
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_ssd_ss_64x64, 4,7,6 ; r0/r1 = first block and stride, r2/r3 = second block and stride; result returned in eax
+    add             r1d, r1d                        ; double both strides (presumably int16 element counts -> bytes)
+    add             r3d, r3d
+    lea             r5, [r1 * 3]                    ; r5 = 3 * r1, row-3 offset used by the macro
+    lea             r6, [r3 * 3]                    ; r6 = 3 * r3
+    pxor            m4, m4                          ; clear both dword accumulators
+    pxor            m5, m5
+
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 0..7 (macro leaves r0/r2 at its row 4)
+    lea             r0, [r0 + 4 * r1]               ; skip the remaining 4 rows of the group
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 8..15
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 16..23
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 24..31
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 32..39
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 40..47
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 48..55
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_64x8_AVX512                      ; rows 56..63
+    paddd           m4, m5                          ; merge the two accumulators
+    HADDD           m4, m0                          ; x86util horizontal dword add, m0 is scratch
+    movd            eax, xm4                        ; return the total
+    RET
+;-
+; ssd_ss avx512 code end
+;-
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 032 of 307] x86: AVX512 convert_p2s 48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500447343 -19800
#  Wed Jul 19 12:25:43 2017 +0530
# Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b
# Parent  60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
x86: AVX512 convert_p2s 48x64

AVX2 performance  : 2.22x
AVX512 performance: 3.01x

diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Jul 19 12:25:43 2017 +0530
@@ -3841,6 +3841,7 @@
 p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
 p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
 p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = 
PFX(filterPixelToShort_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = 
PFX(filterPixelToShort_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = 
PFX(filterPixelToShort_32x24_avx512);
diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Wed Jul 19 12:25:43 2017 +0530
@@ -3047,6 +3047,115 @@
 jnz.loop
 RET
 
+;-
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-
+%macro PROCESS_P2S_48x8_AVX512 0 ; 8 rows of 48 bytes at r0 -> words at r2, each (byte << 6) - m4; leaves r0/r2 advanced by 4 rows
+    pmovzxbw        m0, [r0]                        ; columns 0..31 of rows 0..3, bytes zero-extended to words
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]                   ; callers set r5 = 3 * r1
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4                          ; m4 holds the constant broadcast by the caller
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3                   ; callers set r6 = 3 * r3
+
+    pmovzxbw        ym0, [r0 + 32]                  ; columns 32..47 of rows 0..3 (half-width vectors)
+    pmovzxbw        ym1, [r0 + r1 + 32]
+    pmovzxbw        ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw        ym3, [r0 + r5 + 32]
+    psllw           ym0, 6
+    psllw           ym1, 6
+    psllw           ym2, 6
+    psllw           ym3, 6
+    psubw           ym0, ym4
+    psubw           ym1, ym4
+    psubw           ym2, ym4
+    psubw           ym3, ym4
+    movu            [r2 + 64], ym0                  ; 32 words already written = 64 bytes into the output row
+    movu            [r2 + r3 + 64], ym1
+    movu            [r2 + r3 * 2 + 64], ym2
+    movu            [r2 + r6 + 64], ym3
+
+    lea             r0, [r0 + r1 * 4]               ; move on to rows 4..7
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]                        ; columns 0..31 of rows 4..7
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+
+    pmovzxbw        ym0, [r0 + 32]                  ; columns 32..47 of rows 4..7
+    pmovzxbw        ym1, [r0 + r1 + 32]
+    pmovzxbw        ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw        ym3, [r0 + r5 + 32]
+    psllw           ym0, 6
+    psllw           ym1, 6
+    psllw           ym2, 6
+    psllw           ym3, 6
+    psubw           ym0, ym4
+    psubw           ym1, ym4
+    psubw           ym2, ym4
+    psubw           ym3, ym4
+    movu            [r2 + 64], ym0
+    movu            [r2 + r3 + 64], ym1
+    movu            [r2 + r3 * 2 + 64], ym2
+    movu            [r2 + r6 + 64], ym3
+%endmacro
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_48x64, 3,7,5 ; r0 = src, r1 = srcStride, r2 = dst; dstStride is fetched from r3m below
+    mov             r3d, r3m
+    add             r3d, r3d                        ; dst stride: int16 elements -> bytes
+    lea             r5, [r1 * 3]                    ; r5 = 3 * srcStride
+    lea             r6, [r3 * 3]                    ; r6 = 3 * dstStride (bytes)
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]                   ; must be m4: PROCESS_P2S_48x8_AVX512 subtracts m4, and only m0-m4 are declared above
+
+    PROCESS_P2S_48x8_AVX512                         ; rows 0..7 (macro leaves r0/r2 at its row 4)
+    lea             r0, [r0 + r1 * 4]               ; skip the remaining 4 rows of the group
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 8..15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 16..23
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 24..31
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 32..39
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 40..47
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 48..55
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512                         ; rows 56..63
+    RET
 
 %macro PROCESS_LUMA_W4_4R 0
 movdm0, [r0]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 036 of 307] x86: AVX512 blockcopy_ss_64x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1499162011 -19800
#  Tue Jul 04 15:23:31 2017 +0530
# Node ID 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392
# Parent  2eda6628c75302a10d59918a58740d6e27434293
x86: AVX512 blockcopy_ss_64x64

AVX2 performance over C code   : 1.32x
AVX512 performance over C code : 3.00x

diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jul 04 15:23:31 2017 +0530
@@ -3854,6 +3854,8 @@
 p.cu[BLOCK_64x64].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
 p.cu[BLOCK_32x32].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
 
+p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
+
 }
 #endif
 }
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Tue Jul 04 15:23:31 2017 +0530
@@ -4462,6 +4462,154 @@
 BLOCKCOPY_SS_W64_H4_avx 64, 48
 BLOCKCOPY_SS_W64_H4_avx 64, 64
 
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0 ; copy 8 rows (two vectors per row) from r2 to r0; both pointers end 8 rows further on
+    movu            m0, [r2]                        ; source rows 0 and 1
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]               ; source rows 2 and 3 (callers set r6 = 3 * r3)
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]
+
+    movu            [r0 + 2 * r1], m0               ; callers set r5 = 3 * r1
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]
+
+    movu            m0, [r2]                        ; source rows 4 and 5
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]               ; source rows 6 and 7
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]               ; ready for the next 8-row group
+
+    movu            [r0 + 2 * r1], m0
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]
+%endmacro
+
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 ; same 8-row copy as the H8 macro, minus the final pointer advance (for the last group)
+    movu            m0, [r2]                        ; source rows 0 and 1
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]               ; source rows 2 and 3 (callers set r6 = 3 * r3)
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]
+
+    movu            [r0 + 2 * r1], m0               ; callers set r5 = 3 * r1
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]
+
+    movu            m0, [r2]                        ; source rows 4 and 5
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]               ; source rows 6 and 7; r2 is not advanced afterwards
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+
+    movu            [r0 + 2 * r1], m0               ; r0 is not advanced afterwards either
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+%endmacro
+
+;-
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x16, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512         ; rows 8..15
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x32, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 16..23
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512         ; rows 24..31
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x48, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 16..23
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 24..31
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512              ; rows 32..39
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512         ; rows 40..47
+    RET
+
+INIT_ZMM avx512
+cglobal 

[x265] [PATCH 028 of 307] x86: AVX512 pixel_sad_x3_48x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500260234 -19800
#  Mon Jul 17 08:27:14 2017 +0530
# Node ID 229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77
# Parent  5a2d94db6fcaabf532f00848a72fa337bb5e65ac
x86: AVX512 pixel_sad_x3_48x64

AVX2 performance :  59.91x
AVX512 performance: 61.95x

diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Sun Jul 16 18:05:11 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jul 17 08:27:14 2017 +0530
@@ -3745,6 +3745,7 @@
 p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
 p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
 p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
+p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
 
 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Sun Jul 16 18:05:11 2017 +0530
+++ b/source/common/x86/sad-a.asm   Mon Jul 17 08:27:14 2017 +0530
@@ -6306,6 +6306,125 @@
 paddd   m2, m3
 %endmacro
 
+%macro SAD_X3_48x8_AVX512 0
+movuym3, [r0]
+vinserti32x8m3, [r0 + FENC_STRIDE], 1
+movuym4, [r1]
+vinserti32x8m4, [r1 + r4], 1
+movuym5, [r2]
+vinserti32x8m5, [r2 + r4], 1
+movuym6, [r3]
+vinserti32x8m6, [r3 + r4], 1
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movuym3, [r0 + FENC_STRIDE * 2]
+vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
+movuym4, [r1 + r4 * 2]
+vinserti32x8 m4, [r1 + r6], 1
+movuym5, [r2 + r4 * 2]
+vinserti32x8 m5, [r2 + r6], 1
+movuym6, [r3 + r4 * 2]
+vinserti32x8 m6, [r3 + r6], 1
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movu   xm3, [r0 + 32]
+vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1
+vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2
+vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3
+movu   xm4, [r1 + 32]
+vinserti32x4m4, [r1 + r4 + 32], 1
+vinserti32x4m4, [r1 + 2 * r4 + 32], 2
+vinserti32x4m4, [r1 + r6 + 32], 3
+movu   xm5, [r2 + 32]
+vinserti32x4m5, [r2 + r4 + 32], 1
+vinserti32x4m5, [r2 + 2 * r4 + 32], 2
+vinserti32x4m5, [r2 + r6 + 32], 3
+movu   xm6, [r3 + 32]
+vinserti32x4m6, [r3 + r4 + 32], 1
+vinserti32x4m6, [r3 + 2 * r4 + 32], 2
+vinserti32x4m6, [r3 + r6 + 32], 3
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+add r0, FENC_STRIDE * 4
+lea r1, [r1 + r4 * 4]
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+
+movuym3, [r0]
+vinserti32x8m3, [r0 + FENC_STRIDE], 1
+movuym4, [r1]
+vinserti32x8m4, [r1 + r4], 1
+movuym5, [r2]
+vinserti32x8m5, [r2 + r4], 1
+movuym6, [r3]
+vinserti32x8m6, [r3 + r4], 1
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movuym3, [r0 + FENC_STRIDE * 2]
+vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
+movuym4, [r1 + r4 * 2]
+vinserti32x8 m4, [r1 + r6], 1
+movuym5, [r2 + r4 * 2]
+vinserti32x8 m5, [r2 + r6], 1
+movuym6, [r3 + r4 * 2]
+vinserti32x8 m6, [r3 + r6], 1
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movu   xm3, [r0 + 32]
+vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1
+vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2
+vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3
+movu   xm4, [r1 + 32]
+vinserti32x4m4, [r1 + r4 + 32], 1
+vinserti32x4m4, [r1 + 2 * r4 + 32], 2
+vinserti32x4m4, [r1 + r6 + 32], 3
+movu   xm5, [r2 + 32]
+vinserti32x4m5, [r2 + r4 + 32], 1
+vinserti32x4m5, [r2 + 2 * r4 + 32], 2
+vinserti32x4m5, [r2 + r6 + 32], 3
+movu   xm6, [r3 + 32]
+vinserti32x4m6, [r3 + r4 + 32], 1
+vinserti32x4m6, [r3 + 2 * r4 + 32], 2
+vinserti32x4m6, [r3 + r6 + 32], 3
+
+psadbw  m7, 

[x265] [PATCH 035 of 307] x86: AVX512 ssd_ss_32x32

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500550192 -19800
#  Thu Jul 20 16:59:52 2017 +0530
# Node ID 2eda6628c75302a10d59918a58740d6e27434293
# Parent  0320e60b3323546eb6767508f1c39cd088e9f03e
x86: AVX512 ssd_ss_32x32

AVX2 performance   : 12.73x
AVX512 performance : 19.72x

diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jul 20 10:56:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jul 20 16:59:52 2017 +0530
@@ -3852,6 +3852,7 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
 
 p.cu[BLOCK_64x64].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
+p.cu[BLOCK_32x32].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
 
 }
 #endif
diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm   Thu Jul 20 10:56:37 2017 +0530
+++ b/source/common/x86/ssd-a.asm   Thu Jul 20 16:59:52 2017 +0530
@@ -1457,6 +1457,47 @@
 paddd   m5, m3
 %endmacro
 
+%macro PROCESS_SSD_SS_32x8_AVX512 0 ; m4/m5 += squared word differences of 8 rows ([r0] vs [r2], one vector per row); leaves r0/r2 advanced by 4 rows
+    movu            m0, [r0]                        ; rows 0..3 of the first block
+    movu            m1, [r0 + r1]
+    movu            m2, [r0 + 2 * r1]
+    movu            m3, [r0 + r5]                   ; callers set r5 = 3 * r1
+
+    psubw           m0, [r2]                        ; word-wise difference against the rows at r2
+    psubw           m1, [r2 + r3]
+    psubw           m2, [r2 + 2 * r3]
+    psubw           m3, [r2 + r6]                   ; callers set r6 = 3 * r3
+    pmaddwd         m0, m0                          ; square each word and add adjacent pairs into dwords
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0                          ; alternate between the two accumulators
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+
+    lea             r0, [r0 + 4 * r1]               ; move on to rows 4..7
+    lea             r2, [r2 + 4 * r3]
+
+    movu            m0, [r0]
+    movu            m1, [r0 + r1]
+    movu            m2, [r0 + 2 * r1]
+    movu            m3, [r0 + r5]
+
+    psubw           m0, [r2]
+    psubw           m1, [r2 + r3]
+    psubw           m2, [r2 + 2 * r3]
+    psubw           m3, [r2 + r6]
+    pmaddwd         m0, m0
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+%endmacro
+
 INIT_ZMM avx512
 cglobal pixel_ssd_ss_64x64, 4,7,6
 add r1d, r1d
@@ -1492,6 +1533,30 @@
 HADDD   m4, m0
 movdeax, xm4
 RET
+
+INIT_ZMM avx512
+cglobal pixel_ssd_ss_32x32, 4,7,6 ; r0/r1 = first block and stride, r2/r3 = second block and stride; result returned in eax
+    add             r1d, r1d                        ; double both strides (presumably int16 element counts -> bytes)
+    add             r3d, r3d
+    lea             r5, [r1 * 3]                    ; r5 = 3 * r1, row-3 offset used by the macro
+    lea             r6, [r3 * 3]                    ; r6 = 3 * r3
+    pxor            m4, m4                          ; clear both dword accumulators
+    pxor            m5, m5
+
+    PROCESS_SSD_SS_32x8_AVX512                      ; rows 0..7 (macro leaves r0/r2 at its row 4)
+    lea             r0, [r0 + 4 * r1]               ; skip the remaining 4 rows of the group
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_32x8_AVX512                      ; rows 8..15
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_32x8_AVX512                      ; rows 16..23
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    PROCESS_SSD_SS_32x8_AVX512                      ; rows 24..31
+    paddd           m4, m5                          ; merge the two accumulators
+    HADDD           m4, m0                          ; x86util horizontal dword add, m0 is scratch
+    movd            eax, xm4                        ; return the total
+    RET
 ;-
 ; ssd_ss avx512 code end
 ;-
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 038 of 307] x86: AVX512 getResidual32

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1500627732 -19800
#  Fri Jul 21 14:32:12 2017 +0530
# Node ID 49123506b563fd44378e856e6833c77812d0349e
# Parent  ef8989f43083cd5195ff3ba360959fe3900399e5
x86: AVX512 getResidual32

BIT_DEPTH = 8
AVX2 performance over C code   : 2.99x
AVX512 performance over C code : 5.46x

HIGH_BIT_DEPTH
AVX2 performance over C code   : 3.10x
AVX512 performance over C code : 5.60x

diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jul 21 14:32:12 2017 +0530
@@ -3723,6 +3723,7 @@
 p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
 p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
 
+p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
 }
 if (cpuMask & X265_CPU_AVX512)
 {
@@ -3859,6 +3860,8 @@
 p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = 
PFX(blockcopy_ss_32x64_avx512);
 p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
 
+p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+
 }
 #endif
 }
diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Jul 21 14:32:12 2017 +0530
@@ -554,6 +554,135 @@
 %endrep
 RET
 %endif
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0 ; 4 rows: [r2] = [r0] - [r1] word-wise, one vector per row; advances r0, r1, r2 by 4 rows
+    movu            m0, [r0]                        ; four rows of the minuend
+    movu            m1, [r0 + r3]
+    movu            m2, [r0 + r3 * 2]
+    movu            m3, [r0 + r4]                   ; caller sets r4 = 3 * r3
+    lea             r0, [r0 + r3 * 4]
+
+    movu            m4, [r1]                        ; four rows of the subtrahend
+    movu            m5, [r1 + r3]
+    movu            m6, [r1 + r3 * 2]
+    movu            m7, [r1 + r4]
+    lea             r1, [r1 + r3 * 4]
+
+    psubw           m0, m4
+    psubw           m1, m5
+    psubw           m2, m6
+    psubw           m3, m7
+
+    movu            [r2], m0                        ; output uses the same stride as the inputs
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r4], m3
+    lea             r2, [r2 + r3 * 4]
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0 ; final 4 rows: same subtraction, no pointer advance
+    movu            m0, [r0]                        ; four rows of the minuend
+    movu            m1, [r0 + r3]
+    movu            m2, [r0 + r3 * 2]
+    movu            m3, [r0 + r4]                   ; caller sets r4 = 3 * r3
+
+    movu            m4, [r1]                        ; four rows of the subtrahend
+    movu            m5, [r1 + r3]
+    movu            m6, [r1 + r3 * 2]
+    movu            m7, [r1 + r4]
+
+    psubw           m0, m4
+    psubw           m1, m5
+    psubw           m2, m6
+    psubw           m3, m7
+
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r4], m3
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0 ; 4 rows: bytes at [r0] and [r1] widened to words, difference stored at [r2]; advances all three pointers by 4 rows
+    pmovzxbw        m0, [r0]                        ; zero-extend four rows of byte input to words
+    pmovzxbw        m1, [r0 + r3]
+    pmovzxbw        m2, [r0 + r3 * 2]
+    pmovzxbw        m3, [r0 + r4]                   ; caller sets r4 = 3 * r3
+    lea             r0, [r0 + r3 * 4]
+
+    pmovzxbw        m4, [r1]
+    pmovzxbw        m5, [r1 + r3]
+    pmovzxbw        m6, [r1 + r3 * 2]
+    pmovzxbw        m7, [r1 + r4]
+    lea             r1, [r1 + r3 * 4]
+
+    psubw           m0, m4
+    psubw           m1, m5
+    psubw           m2, m6
+    psubw           m3, m7
+
+    movu            [r2], m0                        ; output rows are r3 * 2 bytes apart (word output, byte-unit stride)
+    movu            [r2 + r3 * 2], m1
+    lea             r2, [r2 + r3 * 4]               ; two output rows further
+    movu            [r2], m2
+    movu            [r2 + r3 * 2], m3
+    lea             r2, [r2 + r3 * 4]
+%endmacro
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0 ; final 4 rows of the byte-input variant; r0/r1 are not advanced, r2 only past its first two rows
+    pmovzxbw        m0, [r0]                        ; zero-extend four rows of byte input to words
+    pmovzxbw        m1, [r0 + r3]
+    pmovzxbw        m2, [r0 + r3 * 2]
+    pmovzxbw        m3, [r0 + r4]                   ; caller sets r4 = 3 * r3
+
+    pmovzxbw        m4, [r1]
+    pmovzxbw        m5, [r1 + r3]
+    pmovzxbw        m6, [r1 + r3 * 2]
+    pmovzxbw        m7, [r1 + r4]
+
+    psubw           m0, m4
+    psubw           m1, m5
+    psubw           m2, m6
+    psubw           m3, m7
+
+    movu            [r2], m0                        ; output rows are r3 * 2 bytes apart
+    movu            [r2 + r3 * 2], m1
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m2
+    movu            [r2 + r3 * 2], m3
+%endmacro
+
+
+%if HIGH_BIT_DEPTH ; word-sized input samples: inputs and output share one byte stride
+INIT_ZMM avx512
+cglobal getResidual32, 4,5,8 ; r0, r1 = inputs, r2 = output, r3 = stride; writes [r2] = [r0] - [r1] for 32 rows
+    add             r3, r3                          ; stride: elements -> bytes
+    lea             r4, [r3 * 3]                    ; row-3 offset used by the macros
+
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 0..3
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 4..7
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 8..11
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 12..15
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 16..19
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 20..23
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512             ; rows 24..27
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END         ; rows 28..31, no pointer advance needed
+    RET
+%else ; byte-sized input samples: r3 stays in bytes, the macros double it for the word output
+INIT_ZMM avx512
+cglobal getResidual32, 4,5,8 ; r0, r1 = inputs, r2 = output, r3 = stride; writes [r2] = [r0] - [r1] for 32 rows
+    lea             r4, [r3 * 3]                    ; row-3 offset used by the macros
+
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 0..3
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 4..7
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 8..11
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 12..15
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 16..19
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 20..23
+    PROCESS_GETRESIDUAL32_W4_AVX512                 ; rows 24..27
+    PROCESS_GETRESIDUAL32_W4_AVX512_END             ; rows 28..31
+    RET
+%endif
+
 ;-
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 

[x265] [PATCH 037 of 307] x86: AVX512 blockcopy_ss_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1499171579 -19800
#  Tue Jul 04 18:02:59 2017 +0530
# Node ID ef8989f43083cd5195ff3ba360959fe3900399e5
# Parent  3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392
x86: AVX512 blockcopy_ss_32xN

AVX2 performance over C code   : 1.82x
AVX512 performance over C code : 4.56x

diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Jul 04 15:23:31 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jul 04 18:02:59 2017 +0530
@@ -3854,6 +3854,9 @@
 p.cu[BLOCK_64x64].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
 p.cu[BLOCK_32x32].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
 
+p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
+p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = 
PFX(blockcopy_ss_32x32_avx512);
+p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = 
PFX(blockcopy_ss_32x64_avx512);
 p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
 
 }
diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Tue Jul 04 15:23:31 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Tue Jul 04 18:02:59 2017 +0530
@@ -4164,6 +4164,143 @@
 BLOCKCOPY_SS_W32_H4_avx 32, 48
 BLOCKCOPY_SS_W32_H4_avx 32, 64
 
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0 ; copy 8 rows (one vector per row) from r2 to r0; both pointers end 8 rows further on
+    movu            m0, [r2]                        ; source rows 0..3
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + 2 * r3]
+    movu            m3, [r2 + r6]                   ; callers set r6 = 3 * r3
+    lea             r2, [r2 + 4 * r3]
+
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+    movu            [r0 + 2 * r1], m2
+    movu            [r0 + r5], m3                   ; callers set r5 = 3 * r1
+    lea             r0, [r0 + 4 * r1]
+
+    movu            m0, [r2]                        ; source rows 4..7
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + 2 * r3]
+    movu            m3, [r2 + r6]
+    lea             r2, [r2 + 4 * r3]               ; ready for the next 8-row group
+
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+    movu            [r0 + 2 * r1], m2
+    movu            [r0 + r5], m3
+    lea             r0, [r0 + 4 * r1]
+%endmacro
+
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0 ; same 8-row copy as the H8 macro, minus the final pointer advance (for the last group)
+    movu            m0, [r2]                        ; source rows 0..3
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + 2 * r3]
+    movu            m3, [r2 + r6]                   ; callers set r6 = 3 * r3
+    lea             r2, [r2 + 4 * r3]
+
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+    movu            [r0 + 2 * r1], m2
+    movu            [r0 + r5], m3                   ; callers set r5 = 3 * r1
+    lea             r0, [r0 + 4 * r1]
+
+    movu            m0, [r2]                        ; source rows 4..7; r2 is not advanced afterwards
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + 2 * r3]
+    movu            m3, [r2 + r6]
+
+    movu            [r0], m0                        ; r0 is not advanced afterwards either
+    movu            [r0 + r1], m1
+    movu            [r0 + 2 * r1], m2
+    movu            [r0 + r5], m3
+%endmacro
+
+;-
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x8, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 0..7
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x16, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 8..15
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x24, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 16..23
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x32, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 16..23
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 24..31
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x48, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 16..23
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 24..31
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 32..39
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 40..47
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_32x64, 4, 7, 4 ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+
+    add             r1, r1                          ; strides: int16 elements -> bytes
+    add             r3, r3
+    lea             r5, [3 * r1]                    ; row-3 offsets used by the copy macros
+    lea             r6, [3 * r3]
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 0..7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 8..15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 16..23
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 24..31
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 32..39
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 40..47
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512              ; rows 48..55
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512         ; rows 56..63
+    RET
+
 ;-
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, 

[x265] [PATCH 079 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1502187312 -19800
#  Tue Aug 08 15:45:12 2017 +0530
# Node ID 95c8818a26eea8a17a6a9471f861b89ab9e210c6
# Parent  aa1747a46469afe6fc2d5e6295a4b43a14ea
[x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth

AVX2 performance:   20.10x
AVX512 performance: 36.00x

diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 08 11:18:41 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 08 15:45:12 2017 +0530
@@ -2302,6 +2302,7 @@
 p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
+p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
 p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
 p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
 p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530
+++ b/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530
@@ -2844,6 +2844,133 @@
 PROCESS_SAD_X3_END_AVX512
 RET
 
+;
+; int pixel_sad_x3_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
+;
+INIT_ZMM avx512
+cglobal pixel_sad_x3_48x64, 4, 8, 17 ; r0 = pix1, r1..r3 = pix2..pix4, r4 = frefstride. NOTE(review): only 4 args are declared but r4 is used - confirm x86inc loads it on every ABI
+    pxor            m0, m0                          ; m0..m2: dword SAD accumulators, one per reference block
+    pxor            m1, m1
+    pxor            m2, m2
+    mov             r7d, 64/4                       ; loop counter: 4 rows per iteration
+    vbroadcasti32x8 m16, [pw_1]                     ; multiplier for pmaddwd, used to widen word sums to dwords
+
+    add             r4d, r4d                        ; reference stride: 16-bit pixels -> bytes
+    lea             r6d, [r4 * 3]                   ; r6 = 3 * stride
+.loop:
+    movu            m4, [r0]                        ; pix1 row 0, first 64 bytes
+    movu            m5, [r0 + 2 * FENC_STRIDE]      ; pix1 row 1, first 64 bytes
+    movu            ym6, [r0 + mmsize]              ; remaining 32 bytes of rows 0 and 1 packed into one vector
+    vinserti32x8    m6, [r0 + 2 * FENC_STRIDE + mmsize], 1
+    movu            m7, [r1]                        ; same layout for pix2
+    movu            m8, [r1 + r4]
+    movu            ym9, [r1 + mmsize]
+    vinserti32x8    m9, [r1 + r4 + mmsize], 1
+    movu            m10, [r2]                       ; pix3
+    movu            m11, [r2 + r4]
+    movu            ym12, [r2 + mmsize]
+    vinserti32x8    m12, [r2 + r4 + mmsize], 1
+    movu            m13, [r3]                       ; pix4
+    movu            m14, [r3 + r4]
+    movu            ym15, [r3 + mmsize]
+    vinserti32x8    m15, [r3 + r4 + mmsize], 1
+
+    psubw           m7, m4                          ; reference - pix1, word-wise
+    psubw           m8, m5
+    psubw           m9, m6
+    psubw           m10, m4
+    psubw           m11, m5
+    psubw           m12, m6
+    psubw           m13, m4
+    psubw           m14, m5
+    psubw           m15, m6
+
+    pabsw           m7, m7                          ; absolute differences
+    pabsw           m8, m8
+    pabsw           m9, m9
+    pabsw           m10, m10
+    pabsw           m11, m11
+    pabsw           m12, m12
+    pabsw           m13, m13
+    pabsw           m14, m14
+    pabsw           m15, m15
+
+    paddw           m7, m8                          ; fold the three vectors of each reference into one (word sums)
+    paddw           m7, m9
+    paddw           m10, m11
+    paddw           m10, m12
+    paddw           m13, m14
+    paddw           m13, m15
+
+    pmaddwd         m7, m16                         ; multiply by m16 and pair-add -> dwords
+    paddd           m0, m7
+    pmaddwd         m10, m16
+    paddd           m1, m10
+    pmaddwd         m13, m16
+    paddd           m2, m13
+
+    movu            m4, [r0 + 4 * FENC_STRIDE]      ; pix1 rows 2 and 3
+    movu            m5, [r0 + 6 * FENC_STRIDE]
+    movu            ym6, [r0 + 4 * FENC_STRIDE + mmsize]
+    vinserti32x8    m6, [r0 + 6 * FENC_STRIDE + mmsize], 1
+    movu            m7, [r1 + 2 * r4]               ; pix2 rows 2 and 3
+    movu            m8, [r1 + r6]
+    movu            ym9, [r1 + 2 * r4 + mmsize]
+    vinserti32x8    m9, [r1 + r6 + mmsize], 1
+    movu            m10, [r2 + 2 * r4]              ; pix3 rows 2 and 3
+    movu            m11, [r2 + r6]
+    movu            ym12, [r2 + 2 * r4 + mmsize]
+    vinserti32x8    m12, [r2 + r6 + mmsize], 1
+    movu            m13, [r3 + 2 * r4]              ; pix4 rows 2 and 3
+    movu            m14, [r3 + r6]
+    movu            ym15, [r3 + 2 * r4 + mmsize]
+    vinserti32x8    m15, [r3 + r6 + mmsize], 1
+
+    psubw           m7, m4
+    psubw           m8, m5
+    psubw           m9, m6
+    psubw           m10, m4
+    psubw           m11, m5
+    psubw           m12, m6
+    psubw           m13, m4
+    psubw           m14, m5
+    psubw           m15, m6
+
+    pabsw           m7, m7
+    pabsw           m8, m8
+    pabsw           m9, m9
+    pabsw           m10, m10
+    pabsw           m11, m11
+    pabsw           m12, m12
+    pabsw           m13, m13
+    pabsw           m14, m14
+    pabsw           m15, m15
+
+    paddw           m7, m8
+    paddw           m7, m9
+    paddw           m10, m11
+    paddw           m10, m12
+    paddw           m13, m14
+    paddw           m13, m15
+
+    pmaddwd         m7, m16
+    paddd           m0, m7
+    pmaddwd         m10, m16
+    paddd           m1, m10
+    pmaddwd         m13, m16
+    paddd           m2, m13
+
+    add             r0, FENC_STRIDE * 8             ; 4 rows of pix1 (2 * FENC_STRIDE bytes per row)
+    lea             r1, [r1 + r4 * 4]               ; 4 rows of each reference block
+    lea             r2, [r2 + r4 * 4]
+    lea             r3, [r3 + r4 * 4]
+
+    dec             r7d
+    jg              .loop
+
+    PROCESS_SAD_X3_END_AVX512                       ; defined elsewhere in sad16-a.asm; presumably reduces m0..m2 and stores the three sums - confirm
+    RET
+
 

[x265] [PATCH 081 of 307] x86: AVX512 cleanup blockcopy_sp_64x64

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502709712 -19800
#  Mon Aug 14 16:51:52 2017 +0530
# Node ID 5c18b655a88a739b87c6b071d186a2b9286b8266
# Parent  4a643ecb8c3bcc4dab96bfe56217d4449564bae0
x86: AVX512 cleanup blockcopy_sp_64x64

diff -r 4a643ecb8c3b -r 5c18b655a88a source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Tue Aug 08 17:01:50 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Mon Aug 14 16:51:52 2017 +0530
@@ -26,7 +26,10 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+ALIGN 64
+const shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
 
 cextern pb_4
 cextern pb_1
@@ -2162,7 +2165,7 @@
 
 BLOCKCOPY_SP_W64_H4_avx2 64, 64
 
-%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0
+%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
 movu   m0, [r2]
 movu   m1, [r2 + 64]
 movu   m2, [r2 + r3]
@@ -2170,10 +2173,8 @@
 
 packuswb   m0, m1
 packuswb   m2, m3
-vpermq m0, m0, 11011000b
-vpermq m2, m2, 11011000b
-vshufi64x2 m0, m0, 11011000b
-vshufi64x2 m2, m2, 11011000b
+vpermq m0, m4, m0
+vpermq m2, m4, m2
 movu   [r0],   m0
 movu   [r0 + r1],  m2
 
@@ -2184,73 +2185,25 @@
 
 packuswb   m0, m1
 packuswb   m2, m3
-vpermq m0, m0, 11011000b
-vpermq m2, m2, 11011000b
-vshufi64x2 m0, m0, 11011000b
-vshufi64x2 m2, m2, 11011000b
-movu   [r0 + 2 * r1],  m0
-movu   [r0 + r5],  m2
-
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-
-movu   m0, [r2]
-movu   m1, [r2 + 64]
-movu   m2, [r2 + r3]
-movu   m3, [r2 + r3 + 64]
-
-packuswb   m0, m1
-packuswb   m2, m3
-vpermq m0, m0, 11011000b
-vpermq m2, m2, 11011000b
-vshufi64x2 m0, m0, 11011000b
-vshufi64x2 m2, m2, 11011000b
-movu   [r0],   m0
-movu   [r0 + r1],  m2
-
-movu   m0, [r2 + 2 * r3]
-movu   m1, [r2 + 2 * r3 + 64]
-movu   m2, [r2 + r4]
-movu   m3, [r2 + r4 + 64]
-
-packuswb   m0, m1
-packuswb   m2, m3
-vpermq m0, m0, 11011000b
-vpermq m2, m2, 11011000b
-vshufi64x2 m0, m0, 11011000b
-vshufi64x2 m2, m2, 11011000b
+vpermq m0, m4, m0
+vpermq m2, m4, m2
 movu   [r0 + 2 * r1],  m0
 movu   [r0 + r5],  m2
 %endmacro
 
 INIT_ZMM avx512
-cglobal blockcopy_sp_64x64, 4, 6, 4
+cglobal blockcopy_sp_64x64, 4, 6, 5
+mova   m4, [shuf1_avx512]
 addr3,  r3
 lear4,  [3 * r3]
 lear5,  [3 * r1]
 
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
+%rep 15
+PROCESS_BLOCKCOPY_SP_64x4_AVX512
 lear0, [r0 + 4 * r1]
 lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
-lear0, [r0 + 4 * r1]
-lear2, [r2 + 4 * r3]
-PROCESS_BLOCKCOPY_SP_64x8_AVX512
+%endrep
+PROCESS_BLOCKCOPY_SP_64x4_AVX512
 RET
 
 ;-
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 084 of 307] x86: AVX512 interp_4tap_horiz_ps_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan
# Date 1502430475 25200
#  Thu Aug 10 22:47:55 2017 -0700
# Node ID 951e9a16296e5d1e528c0083630fde8122bd15c1
# Parent  3d8c45642752803c560891fdfbe0a8b5c03ca76a
x86: AVX512 interp_4tap_horiz_ps_64xN

Size  |  AVX2 performance | AVX512 performance
--
64x16 |  26.50x   |  35.13x
64x32 |  25.48x   |  38.62x
64x48 |  27.52x   |  40.34x
64x64 |  27.85x   |  40.43x

diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 11 14:36:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 10 22:47:55 2017 -0700
@@ -4029,6 +4029,11 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
 p.weight_pp = PFX(weight_pp_avx512);
 
+//i444 chroma_hps
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = 
PFX(interp_4tap_horiz_ps_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = 
PFX(interp_4tap_horiz_ps_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = 
PFX(interp_4tap_horiz_ps_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = 
PFX(interp_4tap_horiz_ps_64x16_avx512);
 }
 #endif
 }
diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Fri Aug 11 14:36:18 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Thu Aug 10 22:47:55 2017 -0700
@@ -26,7 +26,7 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 const tab_Tm,db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
  db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
  db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
@@ -152,6 +152,9 @@
 
 const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 
11, 6, 14, 7, 15
 
+ALIGN 64
+const interp8_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
+
 SECTION .text
 
 cextern pb_128
@@ -9836,7 +9839,7 @@
 FILTER_VER_LUMA_S_AVX2_32x24 sp
 FILTER_VER_LUMA_S_AVX2_32x24 ss
 
;-
-;ipfilter_chroma_pp_avx512 code start
+;ipfilter_chroma_avx512 code start
 
;-
 %macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0
 ; register map
@@ -9976,6 +9979,86 @@
 IPFILTER_CHROMA_PP_32xN_AVX512 32
 IPFILTER_CHROMA_PP_32xN_AVX512 64
 IPFILTER_CHROMA_PP_32xN_AVX512 48
+
+%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
+movu   ym6,  [r0]
+vinserti32x8   m6,   [r0 + 4], 1
+pshufb m7,   m6,   m2
+pshufb m6,   m1
+pmaddubsw  m6,   m0
+pmaddubsw  m7,   m0
+pmaddwdm6,   m3
+pmaddwdm7,   m3
+
+movu   ym8,  [r0 + 32]
+vinserti32x8   m8,   [r0 + 36], 1
+pshufb m9,   m8,   m2
+pshufb m8,   m1
+pmaddubsw  m8,   m0
+pmaddubsw  m9,   m0
+pmaddwdm8,   m3
+pmaddwdm9,   m3
+
+packssdw   m6,   m7
+packssdw   m8,   m9
+psubw  m6,   m4
+psubw  m8,   m4
+vpermq m6,   m10,   m6
+vpermq m8,   m10,   m8
+movu   [r2], m6
+movu   [r2 + mmsize],m8
+%endmacro
+
 
;-
-;ipfilter_chroma_pp_avx512 code end
+; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, 
intptr_t dstStride, int coeffIdx, int isRowExt)
 
;-
+%macro IPFILTER_CHROMA_PS_64xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_64x%1, 4,7,11
+mov r4d, r4m
+mov r5d, r5m
+
+%ifdef PIC
+lea   r6,   [tab_ChromaCoeff]
+vpbroadcastd  m0,   [r6 + r4 * 4]
+%else
+vpbroadcastd  m0,   [tab_ChromaCoeff + r4 * 4]
+%endif
+
+vbroadcasti32x8m1,   [interp4_horiz_shuf_load1_avx512]
+vbroadcasti32x8m2,   [interp4_horiz_shuf_load2_avx512]
+vbroadcasti32x8m3,   [pw_1]
+vbroadcasti32x8m4,   [pw_2000]
+mova   m10,  [interp8_hps_shuf_avx512]
+
+; register map
+; m0- interpolate coeff
+; m1,m2 - load shuffle order table
+; 

[x265] [PATCH 082 of 307] x86: AVX512 blockcopy_sp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502711388 -19800
#  Mon Aug 14 17:19:48 2017 +0530
# Node ID b30539ebe5c9b2d9412d3a39458a90a7574ac744
# Parent  5c18b655a88a739b87c6b071d186a2b9286b8266
x86: AVX512 blockcopy_sp_32xN

Size       | AVX2 performance | AVX512 performance
-----------|------------------|-------------------
32x32      |  6.77x           |  11.27x
i420 32x32 |  6.73x           |  11.43x
i422 32x64 |  6.68x           |  12.19x

diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 14 17:19:48 2017 +0530
@@ -3948,6 +3948,10 @@
 p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
 
 p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
+p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = 
PFX(blockcopy_sp_32x32_avx512);
+p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = 
PFX(blockcopy_sp_32x64_avx512);
+
 p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
 p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = 
PFX(blockcopy_ps_32x32_avx512);
 p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = 
PFX(blockcopy_ps_32x64_avx512);
diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm  Mon Aug 14 17:19:48 2017 +0530
@@ -2191,6 +2191,25 @@
 movu   [r0 + r5],  m2
 %endmacro
 
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0
+movu   m0, [r2]
+movu   m1, [r2 + r3]
+movu   m2, [r2 + 2 * r3]
+movu   m3, [r2 + r4]
+
+packuswb   m0, m1
+packuswb   m2, m3
+vpermq m0, m4, m0
+vpermq m2, m4, m2
+movu   [r0],   ym0
+vextracti32x8  [r0 + r1],  m0, 1
+movu   [r0 + 2 * r1],  ym2
+vextracti32x8  [r0 + r5],  m2, 1
+%endmacro
+
+;-
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, 
intptr_t srcStride)
+;-
 INIT_ZMM avx512
 cglobal blockcopy_sp_64x64, 4, 6, 5
 mova   m4, [shuf1_avx512]
@@ -2206,6 +2225,26 @@
 PROCESS_BLOCKCOPY_SP_64x4_AVX512
 RET
 
+%macro BLOCKCOPY_SP_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal blockcopy_sp_32x%1, 4, 6, 5
+mova   m4, [shuf1_avx512]
+addr3,  r3
+lear4,  [3 * r3]
+lear5,  [3 * r1]
+
+%rep %1/4 - 1
+PROCESS_BLOCKCOPY_SP_32x4_AVX512
+lear0, [r0 + 4 * r1]
+lear2, [r2 + 4 * r3]
+%endrep
+PROCESS_BLOCKCOPY_SP_32x4_AVX512
+RET
+%endmacro
+
+BLOCKCOPY_SP_32xN_AVX512 32
+BLOCKCOPY_SP_32xN_AVX512 64
+
 ;-
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
 ;-
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 074 of 307] x86: AVX512 interp_4tap_horiz_pp_64xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501222403 -19800
#  Fri Jul 28 11:43:23 2017 +0530
# Node ID 563b3c4f91eb20374311ed18fb18ad12aeebaf26
# Parent  7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d
x86: AVX512 interp_4tap_horiz_pp_64xN

Size  |  AVX2 performance | AVX512 performance
------|-------------------|-------------------
64x16 |  21.45x           |  39.29x
64x32 |  22.27x           |  39.37x
64x48 |  22.76x           |  40.75x
64x64 |  22.76x           |  40.90x

diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 08 15:25:11 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jul 28 11:43:23 2017 +0530
@@ -3996,6 +3996,11 @@
 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x16_avx512);
+
 }
 #endif
 }
diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Tue Aug 08 15:25:11 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Fri Jul 28 11:43:23 2017 +0530
@@ -137,6 +137,10 @@
 
 const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
 
+const interp4_horiz_shuf_load1_avx512,  times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 
3, 4, 5, 3, 4, 5, 6
+
+const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 
12, 10, 11, 12, 13, 11, 12, 13, 14
+
 SECTION .text
 
 cextern pb_128
@@ -9820,3 +9824,75 @@
 
 FILTER_VER_LUMA_S_AVX2_32x24 sp
 FILTER_VER_LUMA_S_AVX2_32x24 ss
+;-
+;ipfilter_chroma_pp_avx512 code start
+;-
+%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0
+; register map
+; m0 - interpolate coeff
+; m1, m2 - shuffle order table
+; m3 - constant word 1
+; m4 - constant word 512
+
+movu   m5,   [r0]
+pshufb m6,   m5,   m2
+pshufb m5,   m5,   m1
+pmaddubsw  m5,   m0
+pmaddubsw  m6,   m0
+pmaddwdm5,   m3
+pmaddwdm6,   m3
+
+movu   m7,   [r0 + 4]
+pshufb m8,   m7,   m2
+pshufb m7,   m7,   m1
+pmaddubsw  m7,   m0
+pmaddubsw  m8,   m0
+pmaddwdm7,   m3
+pmaddwdm8,   m3
+
+packssdw   m5,   m7
+packssdw   m6,   m8
+pmulhrsw   m5,   m4
+pmulhrsw   m6,   m4
+packuswb   m5,   m6
+movu  [r2],  m5
+%endmacro
+
+;-
+; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int coeffIdx)
+;-
+%macro IPFILTER_CHROMA_PP_64xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_64x%1, 4,6,9
+mov   r4d,   r4m
+
+%ifdef PIC
+lea   r5,   [tab_ChromaCoeff]
+vpbroadcastd  m0,   [r5 + r4 * 4]
+%else
+vpbroadcastd  m0,   [tab_ChromaCoeff + r4 * 4]
+%endif
+
+vbroadcasti32x8   m1,   [interp4_horiz_shuf_load1_avx512]
+vbroadcasti32x8   m2,   [interp4_horiz_shuf_load2_avx512]
+vbroadcasti32x8   m3,   [pw_1]
+vbroadcasti32x8   m4,   [pw_512]
+dec   r0
+
+%rep %1 - 1
+PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
+lea   r2,   [r2 + r3]
+lea   r0,   [r0 + r1]
+%endrep
+PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
+RET
+%endmacro
+
+IPFILTER_CHROMA_PP_64xN_AVX512  64
+IPFILTER_CHROMA_PP_64xN_AVX512  32
+IPFILTER_CHROMA_PP_64xN_AVX512  48
+IPFILTER_CHROMA_PP_64xN_AVX512  16
+
+;-
+;ipfilter_chroma_pp_avx512 code end
+;-
___
x265-devel mailing list

[x265] [PATCH 080 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1502191910 -19800
#  Tue Aug 08 17:01:50 2017 +0530
# Node ID 4a643ecb8c3bcc4dab96bfe56217d4449564bae0
# Parent  95c8818a26eea8a17a6a9471f861b89ab9e210c6
[x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth

AVX2 performance:   19.96x
AVX512 performance: 34.24x

diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 08 15:45:12 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 08 17:01:50 2017 +0530
@@ -2313,6 +2313,7 @@
 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
+p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
 p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
 p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
 p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530
+++ b/source/common/x86/sad16-a.asm Tue Aug 08 17:01:50 2017 +0530
@@ -3487,6 +3487,165 @@
 RET
 
 
;
+; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* 
pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
+;
+INIT_ZMM avx512
+cglobal pixel_sad_x4_48x64, 4, 9, 20
+pxorm0,  m0
+pxorm1,  m1
+pxorm2,  m2
+pxorm3,  m3
+mov r8d,  64/4
+
+vbroadcasti32x8 m19, [pw_1]
+
+add r5d, r5d
+lea r7d, [r5 * 3]
+.loop:
+movum4,   [r0]
+movum5,   [r0 + 2 * FENC_STRIDE]
+movu   ym6,   [r0 + mmsize]
+vinserti32x8m6,   [r0 + 2 * FENC_STRIDE + mmsize], 1
+movum7,   [r1]
+movum8,   [r1 + r5]
+movu   ym9,   [r1 + mmsize]
+vinserti32x8m9,   [r1 + r5 + mmsize], 1
+movum10,  [r2]
+movum11,  [r2 + r5]
+movu   ym12,  [r2 + mmsize]
+vinserti32x8m12,  [r2 + r5 + mmsize], 1
+movum13,  [r3]
+movum14,  [r3 + r5]
+movu   ym15,  [r3 + mmsize]
+vinserti32x8m15,  [r3 + r5 + mmsize], 1
+movum16,  [r4]
+movum17,  [r4 + r5]
+movu   ym18,  [r4 + mmsize]
+vinserti32x8m18,  [r4 + r5 + mmsize], 1
+
+psubw   m7,  m4
+psubw   m8,  m5
+psubw   m9,  m6
+psubw   m10, m4
+psubw   m11, m5
+psubw   m12, m6
+psubw   m13, m4
+psubw   m14, m5
+psubw   m15, m6
+psubw   m16, m4
+psubw   m17, m5
+psubw   m18, m6
+
+pabsw   m7,  m7
+pabsw   m8,  m8
+pabsw   m9,  m9
+pabsw   m10, m10
+pabsw   m11, m11
+pabsw   m12, m12
+pabsw   m13, m13
+pabsw   m14, m14
+pabsw   m15, m15
+pabsw   m16, m16
+pabsw   m17, m17
+pabsw   m18, m18
+
+paddw   m7,  m8
+paddw   m7,  m9
+paddw   m10, m11
+paddw   m10, m12
+paddw   m13, m14
+paddw   m13, m15
+paddw   m16, m17
+paddw   m16, m18
+
+pmaddwd m7,  m19
+paddd   m0,  m7
+pmaddwd m10, m19
+paddd   m1,  m10
+pmaddwd m13, m19
+paddd   m2,  m13
+pmaddwd m16, m19
+paddd   m3,  m16
+
+movum4,   [r0 + 4 * FENC_STRIDE]
+movum5,   [r0 + 6 * FENC_STRIDE]
+movu   ym6,   [r0 + 4 * FENC_STRIDE + mmsize]
+vinserti32x8m6,   [r0 + 6 * FENC_STRIDE + mmsize], 1
+movum7,   [r1 + 2 * r5]
+movum8,   [r1 + r7]
+movu   ym9,   [r1 + 2 * r5 + mmsize]
+vinserti32x8m9,   [r1 + r7 + mmsize], 1
+movum10,  [r2 + 2 * r5]
+movum11,  [r2 + r7]
+movu   ym12,  [r2 + 2 * r5 + mmsize]
+vinserti32x8m12,  [r2 + r7 + mmsize], 1
+movum13,  [r3 + 2 * r5]
+movum14,  [r3 + r7]
+movu   ym15,  [r3 + 2 * r5 + mmsize]
+vinserti32x8m15,  [r3 + r7 + mmsize], 1
+movum16,  [r4 + 2 * r5]
+movum17,  [r4 + r7]
+movu   ym18,  [r4 + 2 * r5 + mmsize]
+vinserti32x8m18,  [r4 + r7 + mmsize], 1
+
+
+psubw   m7,  m4
+psubw   m8,  m5
+psubw   m9,  m6
+psubw   m10, m4
+psubw   m11, m5
+psubw   m12, m6
+psubw   m13, m4
+psubw   m14, m5
+psubw   m15, m6
+psubw   m16, m4
+psubw   m17, m5
+psubw   m18, m6
+
+pabsw   m7,  m7
+pabsw   m8, 

[x265] [PATCH 083 of 307] [x265-avx512]x86: AVX512 weight_pp

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1502442378 -19800
#  Fri Aug 11 14:36:18 2017 +0530
# Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a
# Parent  b30539ebe5c9b2d9412d3a39458a90a7574ac744
[x265-avx512]x86: AVX512 weight_pp

BitDepth | AVX2 performance | AVX512 performance
---------|------------------|-------------------
  8      | 6.23x            |   10.60x
  10     | 9.43x            |   14.59x

diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 11 14:36:18 2017 +0530
@@ -2322,6 +2322,7 @@
 p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+p.weight_pp = PFX(weight_pp_avx512);
 
 }
 }
@@ -4026,6 +4027,7 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
+p.weight_pp = PFX(weight_pp_avx512);
 
 }
 #endif
diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Aug 11 14:36:18 2017 +0530
@@ -1662,6 +1662,116 @@
 jnz .loopH
 RET
 %endif
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 7
+%define correction  (14 - BIT_DEPTH)
+mov  r6d, r6m
+shl  r6d, 16 - correction
+or   r6d, r5d
+
+movd xm0, r6d
+vpbroadcastd  m0, xm0
+mov  r5d, r7m
+sub  r5d, correction
+movd xm1, r5d
+
+vpbroadcastdm2, r8m
+vbroadcasti32x8 m5, [pw_1]
+vbroadcasti32x8 m6, [pw_pixel_max]
+
+add r2d, r2d
+add r3d, r3d
+sub r2d, r3d
+shr r3d, 6
+
+.loopH:
+mov  r5d, r3d
+
+.loopW:
+movum4, [r0]
+punpcklwd   m3, m4, m5
+pmaddwd m3, m0
+psrad   m3, xm1
+paddd   m3, m2
+
+punpckhwd   m4, m5
+pmaddwd m4, m0
+psrad   m4, xm1
+paddd   m4, m2
+
+packusdwm3,   m4
+pminuw  m3,   m6
+movu[r1], m3
+
+add r0, 64
+add r1, 64
+
+dec r5d
+jnz .loopW
+
+lea r0, [r0 + r2]
+lea r1, [r1 + r2]
+
+dec r4d
+jnz .loopH
+%undef correction
+RET
+%else
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 6
+
+shl  r5d, 6
+mov  r6d, r6m
+shl  r6d, 16
+or   r6d, r5d
+
+movd xm0, r6d
+vpbroadcastd  m0, xm0
+movd xm1, r7m
+vpbroadcastd  m2, r8m
+
+vbroadcasti32x8 m5, [pw_1]
+
+sub  r2d, r3d
+shr  r3d, 5
+
+.loopH:
+mov  r5d, r3d
+
+.loopW:
+pmovzxbwm4, [r0]
+punpcklwd   m3, m4, m5
+pmaddwd m3, m0
+psrad   m3, xm1
+paddd   m3, m2
+
+punpckhwd   m4, m5
+pmaddwd m4, m0
+psrad   m4, xm1
+paddd   m4, m2
+
+packssdw   m3,  m4
+vextracti64x4 ym4,  m3, 1
+packuswb  ym3,  ym4
+vpermqym3,  ym3, q3120
+movu  [r1], ym3
+
+add r0, 32
+add r1, 32
+
+dec r5d
+jnz .loopW
+
+lea r0, [r0 + r2]
+lea r1, [r1 + r2]
+
+dec r4d
+jnz .loopH
+RET
+%endif
+
 
;-
 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t 
dstStride, int width, int height, int w0, int round, int shift, int offset)
 
;-
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp
--- a/source/encoder/reference.cpp  Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/reference.cpp  Fri Aug 11 14:36:18 2017 +0530
@@ -155,12 +155,10 @@
 
 const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight 
* stride;
 pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
-
 // Computing weighted CU rows
 int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate 
interpolation depth
-int padwidth = (width + 15) & ~15;  // weightp assembly 
needs even 16 byte widths
+int padwidth = (width + 31) & ~31;  // 

[x265] [PATCH 086 of 307] x86: AVX512 cleanup add_ps code

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502773372 -19800
#  Tue Aug 15 10:32:52 2017 +0530
# Node ID 2db192bac0f14d55f7f82b8964d6c67c3a3637c3
# Parent  6f811dfd5690866f4c432911982a30665dc0e91c
x86: AVX512 cleanup add_ps code

diff -r 6f811dfd5690 -r 2db192bac0f1 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm   Fri Aug 11 12:32:50 2017 +0530
+++ b/source/common/x86/pixeladd8.asm   Tue Aug 15 10:32:52 2017 +0530
@@ -24,11 +24,11 @@
 
 %include "x86inc.asm"
 %include "x86util.asm"
+SECTION_RODATA 64
 
-SECTION_RODATA 32
-
+ALIGN 64
+const store_shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
 SECTION .text
-
 cextern pw_pixel_max
 
 ;-
@@ -1148,157 +1148,46 @@
 ;-
 ; pixel_add_ps avx512 code start
 ;-
-%macro PROCESS_ADD_PS_64x8_AVX512 0
+%macro PROCESS_ADD_PS_64x4_AVX512 0
 pmovzxbwm0, [r2]
 pmovzxbwm1, [r2 + 32]
 movum2, [r3]
 movum3, [r3 + 64]
-pmovzxbwm4, [r2 + r4]
-pmovzxbwm5, [r2 + r4 + 32]
-movum6, [r3 + r5]
-movum7, [r3 + r5 + 64]
-
 paddw   m0, m2
 paddw   m1, m3
-paddw   m4, m6
-paddw   m5, m7
 packuswbm0, m1
-packuswbm4, m5
-vpermq  m0, m0, 11011000b
-vpermq  m4, m4, 11011000b
-vshufi64x2  m0, m0, 11011000b
-vshufi64x2  m4, m4, 11011000b
+vpermq  m0, m4,  m0
 movu[r0],   m0
-movu[r0 + r1],  m4
-
-lea r2, [r2 + r4 * 2]
-lea r3, [r3 + r5 * 2]
-lea r0, [r0 + r1 * 2]
-
-pmovzxbwm0, [r2]
-pmovzxbwm1, [r2 + 32]
-movum2, [r3]
-movum3, [r3 + 64]
-pmovzxbwm4, [r2 + r4]
-pmovzxbwm5, [r2 + r4 + 32]
-movum6, [r3 + r5]
-movum7, [r3 + r5 + 64]
-
+pmovzxbwm0, [r2 + r4]
+pmovzxbwm1, [r2 + r4 + 32]
+movum2, [r3 + r5]
+movum3, [r3 + r5 + 64]
 paddw   m0, m2
 paddw   m1, m3
-paddw   m4, m6
-paddw   m5, m7
 packuswbm0, m1
-packuswbm4, m5
-vpermq  m0, m0, 11011000b
-vpermq  m4, m4, 11011000b
-vshufi64x2  m0, m0, 11011000b
-vshufi64x2  m4, m4, 11011000b
-movu[r0],   m0
-movu[r0 + r1],  m4
-
-lea r2, [r2 + r4 * 2]
-lea r3, [r3 + r5 * 2]
-lea r0, [r0 + r1 * 2]
-
-pmovzxbwm0, [r2]
-pmovzxbwm1, [r2 + 32]
-movum2, [r3]
-movum3, [r3 + 64]
-pmovzxbwm4, [r2 + r4]
-pmovzxbwm5, [r2 + r4 + 32]
-movum6, [r3 + r5]
-movum7, [r3 + r5 + 64]
-
+vpermq  m0, m4,  m0
+movu[r0 + r1],  m0
+pmovzxbwm0, [r2 + 2 * r4]
+pmovzxbwm1, [r2 + 2 * r4 + 32]
+movum2, [r3 + 2 * r5]
+movum3, [r3 + 2 * r5 + 64]
 paddw   m0, m2
 paddw   m1, m3
-paddw   m4, m6
-paddw   m5, m7
 packuswbm0, m1
-packuswbm4, m5
-vpermq  m0, m0, 11011000b
-vpermq  m4, m4, 11011000b
-vshufi64x2  m0, m0, 11011000b
-vshufi64x2  m4, m4, 11011000b
-movu[r0],   m0
-movu[r0 + r1],  m4
+vpermq  m0, m4,  m0
+movu[r0 + 2 * r1],   m0
 
-lea r2, [r2 + r4 * 2]
-lea r3, [r3 + r5 * 2]
-lea r0, [r0 + r1 * 2]
-
-pmovzxbwm0, [r2]
-pmovzxbwm1, [r2 + 32]
-movum2, [r3]
-movum3, [r3 + 64]
-pmovzxbwm4, [r2 + r4]
-pmovzxbwm5, [r2 + r4 + 32]
-movum6, [r3 + r5]
-movum7, [r3 + r5 + 64]
-
+pmovzxbwm0, [r2 + r7]
+pmovzxbwm1, [r2 + r7 + 32]
+movum2, [r3 + r8]
+movum3, [r3 + r8 + 64]
 paddw   m0, m2
 paddw   m1, m3
-paddw   m4, m6
-paddw   m5, m7
 packuswbm0, m1
-packuswbm4, m5
-vpermq  m0, m0, 11011000b
-vpermq  m4, m4, 11011000b
-vshufi64x2  m0,  

[x265] [PATCH 071 of 307] x86: AVX512 addAvg_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501589225 -19800
#  Tue Aug 01 17:37:05 2017 +0530
# Node ID aac415b7223acced7fc844c4a07225704b811df0
# Parent  ad756cf6d35f0d1460c5a079bea8781ffd67b7c7
x86: AVX512 addAvg_48x64 for high bit depth

AVX2 performance:   10.61x
AVX512 performance: 13.18x

diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 01 17:37:05 2017 +0530
@@ -2276,6 +2276,7 @@
 p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
 p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
 p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = 
PFX(addAvg_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = 
PFX(addAvg_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = 
PFX(addAvg_32x24_avx512);
diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmMon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/mc-a.asmTue Aug 01 17:37:05 2017 +0530
@@ -1812,6 +1812,79 @@
 movu[r2 + r8 + mmsize],   m0
 %endmacro
 
+%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0
+movum0,  [r0]
+movum1,  [r1]
+paddw   m0,  m1
+pmulhrswm0,  m3
+paddw   m0,  m4
+pmaxsw  m0,  m2
+pminsw  m0,  m5
+movu[r2],m0
+
+movuym0,  [r0 + mmsize]
+movuym1,  [r1 + mmsize]
+paddw   ym0,  ym1
+pmulhrswym0,  ym3
+paddw   ym0,  ym4
+pmaxsw  ym0,  ym2
+pminsw  ym0,  ym5
+movu[r2 + mmsize],ym0
+
+movum0,  [r0 + r3]
+movum1,  [r1 + r4]
+paddw   m0,  m1
+pmulhrswm0,  m3
+paddw   m0,  m4
+pmaxsw  m0,  m2
+pminsw  m0,  m5
+movu[r2 + r5],   m0
+
+movuym0,  [r0 + r3 + mmsize]
+movuym1,  [r1 + r4 + mmsize]
+paddw   ym0,  ym1
+pmulhrswym0,  ym3
+paddw   ym0,  ym4
+pmaxsw  ym0,  ym2
+pminsw  ym0,  ym5
+movu[r2 + r5 + mmsize],   ym0
+
+movum0,  [r0 + 2 * r3]
+movum1,  [r1 + 2 * r4]
+paddw   m0,  m1
+pmulhrswm0,  m3
+paddw   m0,  m4
+pmaxsw  m0,  m2
+pminsw  m0,  m5
+movu[r2 + 2 * r5],   m0
+
+movuym0,  [r0 + 2 * r3 + mmsize]
+movuym1,  [r1 + 2 * r4 + mmsize]
+paddw   ym0,  ym1
+pmulhrswym0,  ym3
+paddw   ym0,  ym4
+pmaxsw  ym0,  ym2
+pminsw  ym0,  ym5
+movu[r2 + 2 * r5 + mmsize],   ym0
+
+movum0,  [r0 + r6]
+movum1,  [r1 + r7]
+paddw   m0,  m1
+pmulhrswm0,  m3
+paddw   m0,  m4
+pmaxsw  m0,  m2
+pminsw  m0,  m5
+movu[r2 + r8],   m0
+
+movuym0,  [r0 + r6 + mmsize]
+movuym1,  [r1 + r7 + mmsize]
+paddw   ym0,  ym1
+pmulhrswym0,  ym3
+paddw   ym0,  ym4
+pmaxsw  ym0,  ym2
+pminsw  ym0,  ym5
+movu[r2 + r8 + mmsize],   ym0
+%endmacro
 ;-
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, 
intptr_t src1Stride, intptr_t dstStride)
 ;-
@@ -1874,6 +1947,28 @@
 ADDAVG_W64_HBD_AVX512 32
 ADDAVG_W64_HBD_AVX512 48
 ADDAVG_W64_HBD_AVX512 64
+
+INIT_ZMM avx512
+cglobal addAvg_48x64, 6,9,6
+vbroadcasti32x8m4,  [pw_ %+ ADDAVG_ROUND]
+vbroadcasti32x8m5,  [pw_pixel_max]
+vbroadcasti32x8m3,  [pw_ %+ ADDAVG_FACTOR]
+pxorm2,  m2
+add r3,  r3
+add r4,  r4
+add r5,  r5
+lea r6,  [3 * r3]
+lea r7,  [3 * r4]
+lea r8,  [3 * r5]
+
+%rep 15
+

[x265] [PATCH 076 of 307] x86: AVX512 interp_4tap_horiz_pp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502347959 -19800
#  Thu Aug 10 12:22:39 2017 +0530
# Node ID f489bc0b864c48f557cc40b739e84fe1040e8728
# Parent  7bdf20f62d02f5714c1332695ffa8c7c6a9d8a5a
x86: AVX512 interp_4tap_horiz_pp_32xN

Color Space i444
Size    | AVX2 performance | AVX512 performance
--------|------------------|-------------------
32x8    | 23.96x           |  31.57x
32x16   | 24.38x           |  33.22x
32x24   | 22.41x           |  36.92x
32x32   | 21.54x           |  34.09x
32x64   | 23.27x           |  29.14x

Color Space i422
Size    | AVX2 performance | AVX512 performance
--------|------------------|-------------------
32x16   | 25.55x           |  33.16x
32x32   | 22.08x           |  35.13x
32x48   | 24.01x           |  34.53x
32x64   | 23.76x           |  35.21x

diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Apr 04 16:47:58 2018 -0700
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 10 12:22:39 2017 +0530
@@ -4001,6 +4001,18 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x16_avx512);
 
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x8_avx512);
+
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
+
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
+
 }
 #endif
 }
diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Wed Apr 04 16:47:58 2018 -0700
+++ b/source/common/x86/ipfilter8.asm   Thu Aug 10 12:22:39 2017 +0530
@@ -150,6 +150,8 @@
 
 const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 
12, 10, 11, 12, 13, 11, 12, 13, 14
 
+const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 
11, 6, 14, 7, 15
+
 SECTION .text
 
 cextern pb_128
@@ -9867,6 +9869,44 @@
 movu  [r2],  m5
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
+; register map
+; m0 - interpolate coeff
+; m1, m2 - shuffle order table
+; m3 - constant word 1
+; m4 - constant word 512
+; m9 - store shuffle order table
+
+movu  ym5,   [r0]
+vinserti32x8   m5,   [r0 + 4], 1
+
+pshufb m6,   m5,   m2
+pshufb m5,   m5,   m1
+pmaddubsw  m5,   m0
+pmaddubsw  m6,   m0
+pmaddwdm5,   m3
+pmaddwdm6,   m3
+
+movu  ym7,   [r0 + r1]
+vinserti32x8   m7,   [r0 + r1 + 4], 1
+
+pshufb m8,   m7,   m2
+pshufb m7,   m7,   m1
+pmaddubsw  m7,   m0
+pmaddubsw  m8,   m0
+pmaddwdm7,   m3
+pmaddwdm8,   m3
+
+packssdw   m5,   m6
+packssdw   m7,   m8
+pmulhrsw   m5,   m4
+pmulhrsw   m7,   m4
+packuswb   m5,   m7
+vpermd m5,   m9,   m5
+movu [r2],  ym5
+vextracti32x8[r2 + r3],  m5,1
+%endmacro
+
 
;-
 ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int coeffIdx)
 
;-
@@ -9902,6 +9942,40 @@
 IPFILTER_CHROMA_PP_64xN_AVX512  48
 IPFILTER_CHROMA_PP_64xN_AVX512  16
 
+%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,10
+mov   r4d,   r4m
+
+%ifdef PIC
+lea   r5,   [tab_ChromaCoeff]
+vpbroadcastd  m0,   [r5 + r4 * 4]
+%else
+vpbroadcastd  m0,   [tab_ChromaCoeff + r4 * 4]
+%endif
+
+vbroadcasti32x8   m1,

[x265] [PATCH 078 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1502171321 -19800
#  Tue Aug 08 11:18:41 2017 +0530
# Node ID aa1747a46469afe6fc2d5e6295a4b43a14ea
# Parent  d0e43a0e3b531f3e4f42be169c224563753b0210
[x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth

Size| AVX2 performance | AVX512 performance

64x16   | 19.41x   |  33.30x
64x32   | 19.75x   |  33.22x
64x48   | 20.39x   |  35.05x
64x64   | 20.25x   |  36.72x

diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 07 17:04:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 08 11:18:41 2017 +0530
@@ -2312,6 +2312,10 @@
 p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
 p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
 p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
+p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
+p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
+p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
+p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
 
 p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530
+++ b/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530
@@ -2136,6 +2136,172 @@
 paddd   m3, m7
 %endmacro
 
+%macro PROCESS_SAD_X4_64x4_AVX512 0
+movum8,  [r0]
+movum10, [r0 + mmsize]
+movum4,  [r1]
+movum11, [r1 + mmsize]
+movum5,  [r2]
+movum12, [r2 + mmsize]
+movum6,  [r3]
+movum13, [r3 + mmsize]
+movum7,  [r4]
+movum14, [r4 + mmsize]
+
+psubw   m4,  m8
+psubw   m5,  m8
+psubw   m6,  m8
+psubw   m7,  m8
+psubw   m11, m10
+psubw   m12, m10
+psubw   m13, m10
+psubw   m14, m10
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m6,  m6
+pabsw   m7,  m7
+pabsw   m11, m11
+pabsw   m12, m12
+pabsw   m13, m13
+pabsw   m14, m14
+paddw   m4,  m11
+paddw   m5,  m12
+paddw   m6,  m13
+paddw   m7,  m14
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+
+movum8,  [r0 + 2 * FENC_STRIDE]
+movum10, [r0 + 2 * FENC_STRIDE + mmsize]
+movum4,  [r1 + r5]
+movum11, [r1 + r5 + mmsize]
+movum5,  [r2 + r5]
+movum12, [r2 + r5 + mmsize]
+movum6,  [r3 + r5]
+movum13, [r3 + r5 + mmsize]
+movum7,  [r4 + r5]
+movum14, [r4 + r5 + mmsize]
+
+psubw   m4,  m8
+psubw   m5,  m8
+psubw   m6,  m8
+psubw   m7,  m8
+psubw   m11, m10
+psubw   m12, m10
+psubw   m13, m10
+psubw   m14, m10
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m6,  m6
+pabsw   m7,  m7
+pabsw   m11, m11
+pabsw   m12, m12
+pabsw   m13, m13
+pabsw   m14, m14
+paddw   m4,  m11
+paddw   m5,  m12
+paddw   m6,  m13
+paddw   m7,  m14
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+movum8,  [r0 + 4 * FENC_STRIDE]
+movum10, [r0 + 4 * FENC_STRIDE + mmsize]
+movum4,  [r1 + 2 * r5]
+movum11, [r1 + 2 * r5 + mmsize]
+movum5,  [r2 + 2 * r5]
+movum12, [r2 + 2 * r5 + mmsize]
+movum6,  [r3 + 2 * r5]
+movum13, [r3 + 2 * r5 + mmsize]
+movum7,  [r4 + 2 * r5]
+movum14, [r4 + 2 * r5 + mmsize]
+
+psubw   m4,  m8
+psubw   m5,  m8
+psubw   m6,  m8
+psubw   m7,  m8
+psubw   m11, m10
+psubw   m12, m10
+psubw   m13, m10
+psubw   m14, m10
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m6,  m6
+pabsw   m7,  m7
+pabsw   m11, m11
+pabsw   m12, m12
+pabsw   m13, m13
+pabsw   m14, m14
+paddw   m4,  m11
+paddw   m5,  m12
+paddw   m6,  m13
+paddw   m7,  m14
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+movum8,  [r0 + 6 * FENC_STRIDE]
+movum10, [r0 + 6 * FENC_STRIDE + mmsize]
+movum4,  [r1 + r7]
+movum11, [r1 + r7 + mmsize]
+movum5,  [r2 + r7]
+movum12, [r2 + r7 + mmsize]
+movum6,  [r3 + r7]
+movum13, [r3 + r7 + mmsize]
+movum7,  [r4 + r7]
+movum14, [r4 + r7 + mmsize]
+
+psubw   m4,  m8
+psubw   m5,  m8
+psubw   m6,  m8
+psubw   m7,  

[x265] [PATCH 066 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1501765251 -19800
#  Thu Aug 03 18:30:51 2017 +0530
# Node ID 241f318be574498b7bb77939937a907e4721dc32
# Parent  df45017fca906d5f3370dcc78e43284622753a73
[x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth

Size| AVX2 performance | AVX512 performance

32x8| 20.72x   |  29.20x
32x16   | 19.31x   |  30.53x
32x24   | 19.78x   |  33.32x
32x32   | 20.02x   |  32.71x
32x64   | 20.40x   |  33.30x

diff -r df45017fca90 -r 241f318be574 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 03 18:30:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 03 18:30:51 2017 +0530
@@ -2313,6 +2313,12 @@
 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
 
+p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
+p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
+p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
+p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
+p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
+
 }
 }
 #else // if HIGH_BIT_DEPTH
diff -r df45017fca90 -r 241f318be574 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530
@@ -2856,3 +2856,362 @@
 PROCESS_SAD_X3_END_AVX512
 RET
 
+;
+; SAD x3/x4 avx512 code start
+;
+
+%macro PROCESS_SAD_X3_32x4_AVX512 0 ; accumulate SAD of 4 rows (taken at 0/2/4/6 * FENC_STRIDE) against 3 references
+movu    m6, [r0]                    ; m6 = source row read from r0
+movu    m3, [r1]                    ; m3/m4/m5 = matching row of the references at r1/r2/r3
+movu    m4, [r2]
+movu    m5, [r3]
+
+
+psubw   m3, m6                      ; word-wise difference reference - source
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3                      ; absolute difference per word
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7                      ; assumes m7 = pw_1 (loaded by the caller): sums adjacent words into dwords
+paddd   m0, m3                      ; m0/m1/m2 = running dword SAD for reference 0/1/2
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6, [r0 + 2 * FENC_STRIDE]  ; next source row; source rows are 2 * FENC_STRIDE bytes apart
+movu    m3, [r1 + r4]               ; r4 = reference row offset in bytes (already doubled by the caller)
+movu    m4, [r2 + r4]
+movu    m5, [r3 + r4]
+
+psubw   m3, m6
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6, [r0 + 4 * FENC_STRIDE]  ; third row
+movu    m3, [r1 + 2 * r4]
+movu    m4, [r2 + 2 * r4]
+movu    m5, [r3 + 2 * r4]
+
+psubw   m3, m6
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6, [r0 + 6 * FENC_STRIDE]  ; fourth row
+movu    m3, [r1 + r6]               ; expects r6 = 3 * r4 (set up by the caller)
+movu    m4, [r2 + r6]
+movu    m5, [r3 + r6]
+
+psubw   m3, m6
+psubw   m4, m6
+psubw   m5, m6
+pabsw   m3, m3
+pabsw   m4, m4
+pabsw   m5, m5
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+%endmacro                           ; pointers r0-r3 are not advanced here; the caller steps them between invocations
+
+
+%macro PROCESS_SAD_X3_END_AVX512 0 ; reduce the dword accumulators m0/m1/m2 to scalars and store them at [r5]
+vextracti32x8  ym3, m0, 1          ; upper 256 bits of each accumulator
+vextracti32x8  ym4, m1, 1
+vextracti32x8  ym5, m2, 1
+
+paddd  ym0, ym3                    ; fold 512 -> 256 bits
+paddd  ym1, ym4
+paddd  ym2, ym5
+
+vextracti64x2  xm3, m0, 1          ; bits 128..255 of the folded sums
+vextracti64x2  xm4, m1, 1
+vextracti64x2  xm5, m2, 1
+
+paddd  xm0, xm3                    ; fold 256 -> 128 bits
+paddd  xm1, xm4
+paddd  xm2, xm5
+
+pshufd xm3, xm0, 1110b             ; bring dwords 2,3 down to positions 0,1
+pshufd xm4, xm1, 1110b
+pshufd xm5, xm2, 1110b
+
+paddd  xm0, xm3                    ; fold 4 -> 2 dwords
+paddd  xm1, xm4
+paddd  xm2, xm5
+
+pshufd xm3, xm0, 0001b             ; bring dword 1 down to position 0
+pshufd xm4, xm1, 0001b
+pshufd xm5, xm2, 0001b
+
+paddd  xm0, xm3                    ; dword 0 now holds the full sum
+paddd  xm1, xm4
+paddd  xm2, xm5
+
+movd   [r5 + 0], xm0               ; three consecutive 32-bit results; r5 is the 6th cglobal argument (res in the prototype comment)
+movd   [r5 + 4], xm1
+movd   [r5 + 8], xm2
+%endmacro
+
+
+;--
+; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
+;--
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x8, 6,7,8
+pxorm0,  m0
+pxorm1,  m1
+pxorm2,  m2
+
+vbroadcasti32x8 m7, [pw_1]
+
+add r4d, r4d
+lea r6d, [r4 * 3]
+
+PROCESS_SAD_X3_32x4_AVX512
+add r0, FENC_STRIDE * 8
+lea r1, [r1 + r4 * 4]
+lea r2, [r2 + r4 * 4]

[x265] [PATCH 067 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1501837071 -19800
#  Fri Aug 04 14:27:51 2017 +0530
# Node ID c3a2abd8e46f8db3ba7c276f39fe41ed002ce295
# Parent  241f318be574498b7bb77939937a907e4721dc32
[x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth

Size| AVX2 performance | AVX512 performance

32x8| 16.73x   |  25.16x
32x16   | 18.36x   |  29.04x
32x24   | 19.52x   |  31.03x
32x32   | 18.78x   |  31.95x
32x64   | 19.01x   |  34.20x

diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 03 18:30:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 04 14:27:51 2017 +0530
@@ -2319,6 +2319,12 @@
 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
 
+p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
+p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
+p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
+p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
+p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
+
 }
 }
 #else // if HIGH_BIT_DEPTH
diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530
+++ b/source/common/x86/sad16-a.asm Fri Aug 04 14:27:51 2017 +0530
@@ -2501,6 +2501,160 @@
 ; SAD x3/x4 avx512 code start
 ;
 
+%macro PROCESS_SAD_X4_32x4_AVX512 0 ; accumulate SAD of 4 rows (taken at 0/2/4/6 * FENC_STRIDE) against 4 references
+movu    m8, [r0]                    ; m8 = source row read from r0
+movu    m4, [r1]                    ; m4..m7 = matching row of the references at r1..r4
+movu    m5, [r2]
+movu    m6, [r3]
+movu    m7, [r4]
+
+
+psubw   m4, m8                      ; word-wise difference reference - source
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4                      ; absolute difference per word
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9                      ; presumably m9 = pw_1 (loaded by the caller, not visible here): pair-sum words into dwords
+paddd   m0, m4                      ; m0..m3 = running dword SAD for reference 0..3
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+
+movu    m8, [r0 + 2 * FENC_STRIDE]  ; next source row; source rows are 2 * FENC_STRIDE bytes apart
+movu    m4, [r1 + r5]               ; r5 = reference row offset (presumably the byte stride; set by the caller)
+movu    m5, [r2 + r5]
+movu    m6, [r3 + r5]
+movu    m7, [r4 + r5]
+
+
+psubw   m4, m8
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+movu    m8, [r0 + 4 * FENC_STRIDE]  ; third row
+movu    m4, [r1 + 2 * r5]
+movu    m5, [r2 + 2 * r5]
+movu    m6, [r3 + 2 * r5]
+movu    m7, [r4 + 2 * r5]
+
+
+psubw   m4, m8
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+
+movu    m8, [r0 + 6 * FENC_STRIDE]  ; fourth row
+movu    m4, [r1 + r7]               ; presumably r7 = 3 * r5 (set by the caller) - TODO confirm
+movu    m5, [r2 + r7]
+movu    m6, [r3 + r7]
+movu    m7, [r4 + r7]
+
+
+psubw   m4, m8
+psubw   m5, m8
+psubw   m6, m8
+psubw   m7, m8
+pabsw   m4, m4
+pabsw   m5, m5
+pabsw   m6, m6
+pabsw   m7, m7
+
+pmaddwd m4, m9
+paddd   m0, m4
+pmaddwd m5, m9
+paddd   m1, m5
+pmaddwd m6, m9
+paddd   m2, m6
+pmaddwd m7, m9
+paddd   m3, m7
+%endmacro                           ; pointers r0-r4 are not advanced here
+
+
+%macro PROCESS_SAD_X4_END_AVX512 0 ; reduce the dword accumulators m0..m3 to scalars and store the four results
+vextracti32x8  ym4, m0, 1          ; upper 256 bits of each accumulator
+vextracti32x8  ym5, m1, 1
+vextracti32x8  ym6, m2, 1
+vextracti32x8  ym7, m3, 1
+
+paddd  ym0, ym4                    ; fold 512 -> 256 bits
+paddd  ym1, ym5
+paddd  ym2, ym6
+paddd  ym3, ym7
+
+vextracti64x2  xm4, m0, 1          ; bits 128..255 of the folded sums
+vextracti64x2  xm5, m1, 1
+vextracti64x2  xm6, m2, 1
+vextracti64x2  xm7, m3, 1
+
+paddd  xm0, xm4                    ; fold 256 -> 128 bits
+paddd  xm1, xm5
+paddd  xm2, xm6
+paddd  xm3, xm7
+
+pshufd xm4, xm0, 1110b             ; bring dwords 2,3 down to positions 0,1
+pshufd xm5, xm1, 1110b
+pshufd xm6, xm2, 1110b
+pshufd xm7, xm3, 1110b
+
+paddd  xm0, xm4                    ; fold 4 -> 2 dwords
+paddd  xm1, xm5
+paddd  xm2, xm6
+paddd  xm3, xm7
+
+pshufd xm4, xm0, 0001b             ; bring dword 1 down to position 0
+pshufd xm5, xm1, 0001b
+pshufd xm6, xm2, 0001b
+pshufd xm7, xm3, 0001b
+
+paddd  xm0, xm4                    ; dword 0 now holds the full sum
+paddd  xm1, xm5
+paddd  xm2, xm6
+paddd  xm3, xm7
+
+mov  r0,  r6mp                     ; overwrites r0 with the 7th argument (x86inc r6mp); presumably the int32_t* result pointer - TODO confirm
+movd   [r0 + 0],  xm0              ; four consecutive 32-bit results
+movd   [r0 + 4],  xm1
+movd   [r0 + 8],  xm2
+movd   [r0 + 12], xm3
+%endmacro
+
+
+
 %macro PROCESS_SAD_X3_32x4_AVX512 0
 movum6, [r0]
 movum3, [r1]
@@ 

[x265] [PATCH 077 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Gopi Satykrishna Akisetty 
# Date 1502105663 -19800
#  Mon Aug 07 17:04:23 2017 +0530
# Node ID d0e43a0e3b531f3e4f42be169c224563753b0210
# Parent  f489bc0b864c48f557cc40b739e84fe1040e8728
[x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth

Size| AVX2 performance | AVX512 performance

64x16   | 19.69x   |  36.23x
64x32   | 20.33x   |  37.94x
64x48   | 20.64x   |  38.48x
64x64   | 20.51x   |  38.49x

diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 10 12:22:39 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 07 17:04:23 2017 +0530
@@ -2302,6 +2302,10 @@
 p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
+p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
+p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
+p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
+p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
 
 p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
 p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Aug 10 12:22:39 2017 +0530
+++ b/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530
@@ -2266,6 +2266,135 @@
 paddd   m2, m5
 %endmacro
 
+%macro PROCESS_SAD_X3_64x4_AVX512 0 ; accumulate SAD of 4 rows, two vector registers per row, against 3 references
+movu    m6,  [r0]                   ; m6/m8 = first/second vector of the source row
+movu    m8,  [r0 + mmsize]
+movu    m3,  [r1]                   ; m3/m9, m4/m10, m5/m11 = first/second vector of references r1, r2, r3
+movu    m9,  [r1 + mmsize]
+movu    m4,  [r2]
+movu    m10, [r2 + mmsize]
+movu    m5,  [r3]
+movu    m11, [r3 + mmsize]
+
+psubw   m3,  m6                     ; word-wise difference reference - source
+psubw   m9,  m8
+psubw   m4,  m6
+psubw   m10, m8
+psubw   m5,  m6
+psubw   m11, m8
+pabsw   m3,  m3                     ; absolute difference per word
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m9,  m9
+pabsw   m10, m10
+pabsw   m11, m11
+paddw   m3,  m9                     ; merge both halves of the row while still in 16-bit lanes
+paddw   m4,  m10
+paddw   m5,  m11
+
+pmaddwd m3, m7                      ; presumably m7 = pw_1 (loaded by the caller): pair-sum words into dwords
+paddd   m0, m3                      ; m0/m1/m2 = running dword SAD for reference 0/1/2
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6,  [r0 + 2 * FENC_STRIDE] ; next source row; source rows are 2 * FENC_STRIDE bytes apart
+movu    m8,  [r0 + 2 * FENC_STRIDE + mmsize]
+movu    m3,  [r1 + r4]              ; r4 = reference row offset (set by the caller)
+movu    m9,  [r1 + r4 + mmsize]
+movu    m4,  [r2 + r4]
+movu    m10, [r2 + r4 + mmsize]
+movu    m5,  [r3 + r4]
+movu    m11, [r3 + r4 + mmsize]
+
+psubw   m3,  m6
+psubw   m9,  m8
+psubw   m4,  m6
+psubw   m10, m8
+psubw   m5,  m6
+psubw   m11, m8
+pabsw   m3,  m3
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m9,  m9
+pabsw   m10, m10
+pabsw   m11, m11
+paddw   m3,  m9
+paddw   m4,  m10
+paddw   m5,  m11
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6,  [r0 + 4 * FENC_STRIDE] ; third row
+movu    m8,  [r0 + 4 * FENC_STRIDE + mmsize]
+movu    m3,  [r1 + 2 * r4]
+movu    m9,  [r1 + 2 * r4 + mmsize]
+movu    m4,  [r2 + 2 * r4]
+movu    m10, [r2 + 2 * r4 + mmsize]
+movu    m5,  [r3 + 2 * r4]
+movu    m11, [r3 + 2 * r4 + mmsize]
+
+psubw   m3,  m6
+psubw   m9,  m8
+psubw   m4,  m6
+psubw   m10, m8
+psubw   m5,  m6
+psubw   m11, m8
+pabsw   m3,  m3
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m9,  m9
+pabsw   m10, m10
+pabsw   m11, m11
+paddw   m3,  m9
+paddw   m4,  m10
+paddw   m5,  m11
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+
+movu    m6,  [r0 + 6 * FENC_STRIDE] ; fourth row
+movu    m8,  [r0 + 6 * FENC_STRIDE + mmsize]
+movu    m3,  [r1 + r6]              ; presumably r6 = 3 * r4 (set by the caller) - TODO confirm
+movu    m9,  [r1 + r6 + mmsize]
+movu    m4,  [r2 + r6]
+movu    m10, [r2 + r6 + mmsize]
+movu    m5,  [r3 + r6]
+movu    m11, [r3 + r6 + mmsize]
+
+psubw   m3,  m6
+psubw   m9,  m8
+psubw   m4,  m6
+psubw   m10, m8
+psubw   m5,  m6
+psubw   m11, m8
+pabsw   m3,  m3
+pabsw   m4,  m4
+pabsw   m5,  m5
+pabsw   m9,  m9
+pabsw   m10, m10
+pabsw   m11, m11
+paddw   m3,  m9
+paddw   m4,  m10
+paddw   m5,  m11
+
+pmaddwd m3, m7
+paddd   m0, m3
+pmaddwd m4, m7
+paddd   m1, m4
+pmaddwd m5, m7
+paddd   m2, m5
+%endmacro                           ; pointers r0-r3 are not advanced here
 
 %macro PROCESS_SAD_X3_END_AVX512 0
 vextracti32x8  ym3, m0, 1
@@ -2300,9 +2429,16 @@
 paddd  xm1, xm4
 paddd  xm2, xm5
 
-movd   [r5 + 0], xm0
-movd   [r5 + 4], xm1
-movd   [r5 + 8], xm2
+%if UNIX64
+movd [r5 + 0], xm0
+movd [r5 + 4], xm1
+movd [r5 + 8], 

[x265] [PATCH 069 of 307] x86: AVX512 pixel_var_32x32

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501843838 -19800
#  Fri Aug 04 16:20:38 2017 +0530
# Node ID 039ed71e123c3e14bfaabbe3aada944157784b36
# Parent  c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0
x86: AVX512 pixel_var_32x32

AVX2 performance   : 9.15x
AVX512 performance : 13.49x

diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 04 16:20:38 2017 +0530
@@ -3929,6 +3929,7 @@
 
 p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
 p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
 p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
 p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
 p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Aug 04 16:20:38 2017 +0530
@@ -7105,6 +7105,82 @@
 RET
 %endif ; !HIGH_BIT_DEPTH
 
+%macro PROCESS_VAR_32x8_AVX512 0    ; add 8 rows to the sum (m4, words) and sum-of-squares (m5, dwords) accumulators
+pmovzxbw    m0, [r0]                ; zero-extend one row of bytes to words; r0 = pixels, r1 = stride
+pmovzxbw    m1, [r0 + r1]
+pmovzxbw    m2, [r0 + 2 * r1]
+pmovzxbw    m3, [r0 + r2]           ; expects r2 = 3 * r1 (set by the caller)
+
+paddw m4, m0                        ; m4 += samples (16-bit lanes)
+paddw m4, m1
+paddw m4, m2
+paddw m4, m3
+pmaddwd   m0, m0                    ; square each word and add adjacent pairs -> dwords
+pmaddwd   m1, m1
+pmaddwd   m2, m2
+pmaddwd   m3, m3
+paddd m5, m0                        ; m5 += squared samples (32-bit lanes)
+paddd m5, m1
+paddd m5, m2
+paddd m5, m3
+
+lea r0, [r0 + r1 * 4]               ; step to rows 4..7 of this group
+
+pmovzxbw    m0, [r0]
+pmovzxbw    m1, [r0 + r1]
+pmovzxbw    m2, [r0 + 2 * r1]
+pmovzxbw    m3, [r0 + r2]
+
+paddw m4, m0
+paddw m4, m1
+paddw m4, m2
+paddw m4, m3
+pmaddwd   m0, m0
+pmaddwd   m1, m1
+pmaddwd   m2, m2
+pmaddwd   m3, m3
+paddd m5, m0
+paddd m5, m1
+paddd m5, m2
+paddd m5, m3
+%endmacro                           ; r0 is left at row 4 of the group; the caller adds another 4 * r1 before the next call
+
+%macro PROCESS_VAR_AVX512_END 0    ; reduce m4 (word sums) and m5 (dword sums of squares) and return both packed in rax
+vextracti32x8  ym0, m4, 1          ; upper 256 bits of each accumulator
+vextracti32x8  ym1, m5, 1
+paddw  ym4, ym0                    ; fold 512 -> 256 bits (sum stays in 16-bit lanes)
+paddd  ym5, ym1
+vextracti32x4  xm0, m4, 1          ; bits 128..255 of the folded values
+vextracti32x4  xm1, m5, 1
+paddw  xm4, xm0                    ; fold 256 -> 128 bits
+paddd  xm5, xm1
+HADDW  xm4, xm2                    ; helper macro defined outside this hunk; presumably horizontal word add with xm2 as scratch
+HADDD  xm5, xm1                    ; helper macro defined outside this hunk; presumably horizontal dword add with xm1 as scratch
+punpckldq  xm4, xm5                ; low qword = { dword 0 of xm4 (sum), dword 0 of xm5 (sum of squares) }
+movq   rax, xm4                    ; NOTE(review): rax is hard-coded, so this line only assembles for x86-64 targets
+%endmacro
+
+%if HIGH_BIT_DEPTH==0
+;-
+; int pixel_var_wxh( uint8_t *, intptr_t ) -- NOTE(review): PROCESS_VAR_AVX512_END returns a 64-bit value in rax (sum in the low dword, sum of squares in the high dword), so the C return type is presumably uint64_t; confirm
+;-
+INIT_ZMM avx512
+cglobal pixel_var_32x32, 2,4,6
+pxor  m4, m4; sum (16-bit lanes)
+pxor  m5, m5; sum squared (32-bit lanes)
+lea   r2, [3 * r1]                 ; r2 = 3 * stride, used by the row macro for its fourth row
+
+PROCESS_VAR_32x8_AVX512            ; rows 0..7 (the macro itself advances r0 by 4 rows)
+lea   r0, [r0 + r1 * 4]            ; advance the remaining 4 rows
+PROCESS_VAR_32x8_AVX512            ; rows 8..15
+lea   r0, [r0 + r1 * 4]
+PROCESS_VAR_32x8_AVX512            ; rows 16..23
+lea   r0, [r0 + r1 * 4]
+PROCESS_VAR_32x8_AVX512            ; rows 24..31
+PROCESS_VAR_AVX512_END             ; horizontal reduction; result left in rax
+RET
+%endif
+
 %macro VAR_AVX512_CORE 1 ; accum
 %if %1
 paddwm0, m2
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 085 of 307] x86: AVX512 interp_4tap_horiz_ps_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Jayashri Murugan 
# Date 1502434970 -19800
#  Fri Aug 11 12:32:50 2017 +0530
# Node ID 6f811dfd5690866f4c432911982a30665dc0e91c
# Parent  951e9a16296e5d1e528c0083630fde8122bd15c1
x86: AVX512 interp_4tap_horiz_ps_32xN

Color Space i444
Size| AVX2 performance | AVX512 performance

32x8| 25.91x   |  38.35x
32x16   | 25.45x   |  32.02x
32x24   | 25.80x   |  32.73x
32x32   | 33.49x   |  38.02x
32x64   | 27.42x   |  36.20x
Color Space i422
Size| AVX2 performance | AVX512 performance

32x16   | 24.74x   |  33.95x
32x32   | 33.31x   |  34.28x
32x48   | 27.11x   |  35.98x
32x64   | 27.32x   |  35.02x
Color Space i420
Size| AVX2 performance | AVX512 performance

32x8| 27.16x   |  36.68x
32x16   | 24.87x   |  31.40x
32x24   | 25.98x   |  34.08x
32x32   | 33.01x   |  34.71x

diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 11 12:32:50 2017 +0530
@@ -4034,6 +4034,25 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = 
PFX(interp_4tap_horiz_ps_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = 
PFX(interp_4tap_horiz_ps_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = 
PFX(interp_4tap_horiz_ps_64x16_avx512);
+
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = 
PFX(interp_4tap_horiz_ps_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = 
PFX(interp_4tap_horiz_ps_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = 
PFX(interp_4tap_horiz_ps_32x8_avx512);
+
+//i422 chroma_hps
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = 
PFX(interp_4tap_horiz_ps_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = 
PFX(interp_4tap_horiz_ps_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = 
PFX(interp_4tap_horiz_ps_32x48_avx512);
+
+//i420 chroma_hps
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = 
PFX(interp_4tap_horiz_ps_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = 
PFX(interp_4tap_horiz_ps_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
PFX(interp_4tap_horiz_ps_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = 
PFX(interp_4tap_horiz_ps_32x8_avx512);
+
 }
 #endif
 }
diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/ipfilter8.asm   Fri Aug 11 12:32:50 2017 +0530
@@ -10010,7 +10010,7 @@
 %endmacro
 
 
;-
-; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 
;-
 %macro IPFILTER_CHROMA_PS_64xN_AVX512 1
 INIT_ZMM avx512
@@ -10059,6 +10059,74 @@
 IPFILTER_CHROMA_PS_64xN_AVX512 48
 IPFILTER_CHROMA_PS_64xN_AVX512 16
 
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0 ; filter one row from [r0] and store one full vector of words at [r2]
+movu   ym6,  [r0]                   ; low 256 bits: source bytes starting at r0
+vinserti32x8   m6,   [r0 + 4], 1    ; high 256 bits: source bytes starting at r0 + 4
+pshufb m7,   m6,   m2               ; m1/m2 = byte shuffle masks set up by the caller (contents not visible here)
+pshufb m6,   m6,   m1
+pmaddubsw  m6,   m0                 ; m0 presumably holds the filter coefficients - TODO confirm against the caller
+pmaddubsw  m7,   m0
+pmaddwd    m6,   m3                 ; m3 presumably pw_1: adds adjacent word products into dwords - TODO confirm
+pmaddwd    m7,   m3
+
+packssdw   m6,   m7                 ; narrow back to signed words (interleaved per 128-bit lane)
+psubw  m6,   m4                     ; subtract the per-word constant held in m4 (set by the caller)
+vpermq m6,   m8,   m6               ; reorder qwords using the index vector in m8 (set by the caller)
+movu   [r2], m6
+%endmacro
+
+;-
+; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)

[x265] [PATCH 091 of 307] x86: AVX512 cleanup interp_4tap_horiz_pp_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504242228 -19800
#  Fri Sep 01 10:33:48 2017 +0530
# Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
# Parent  d9200885420957bccd4edea62bf87bbe8831bc62
x86: AVX512 cleanup interp_4tap_horiz_pp_32xN

diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Sep 01 10:33:48 2017 +0530
@@ -4011,22 +4011,29 @@
 p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
 p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
+//i444 chroma_hpp
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_64x16_avx512);
-
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x8_avx512);
-
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 
+//i422 chroma_hpp
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x48_avx512);
+
+//i420 chroma_hpp
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x8_avx512);
+
 p.weight_pp = PFX(weight_pp_avx512);
 
 //i444 chroma_hps
diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/ipfilter8.asm   Fri Sep 01 10:33:48 2017 +0530
@@ -150,8 +150,6 @@
 const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 
12, 10, 11, 12, 13, 11, 12, 13, 14
 const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 
7, 8, 9, 7, 8, 9, 10
 
-const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 
11, 6, 14, 7, 15
-
 ALIGN 64
 const interp8_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
 
@@ -9881,31 +9879,30 @@
 ; m9 - store shuffle order table
 
 movu  ym5,   [r0]
-vinserti32x8   m5,   [r0 + 4], 1
+vinserti32x8   m5,   [r0 + r1], 1
+movu  ym7,   [r0 + 4]
+vinserti32x8   m7,   [r0 + r1 + 4], 1
 
 pshufb m6,   m5,   m2
-pshufb m5,   m5,   m1
+pshufb m5,   m1
+pshufb m8,   m7,   m2
+pshufb m7,   m1
+
 pmaddubsw  m5,   m0
+pmaddubsw  m7,   m0
+pmaddwdm5,   m3
+pmaddwdm7,   m3
+
 pmaddubsw  m6,   m0
-pmaddwdm5,   m3
+pmaddubsw  m8,   m0
 pmaddwdm6,   m3
-
-movu  ym7,   [r0 + r1]
-vinserti32x8   m7,   [r0 + r1 + 4], 1
-
-pshufb m8,   m7,   m2
-pshufb m7,   m7,   m1
-pmaddubsw  m7,   m0
-pmaddubsw  m8,   m0
-pmaddwdm7,   m3
 pmaddwdm8,   m3
 
-packssdw   m5,   m6
-packssdw   m7,   m8
+packssdw   m5,   m7
+packssdw   m6,   m8
 pmulhrsw   m5,   m4
-pmulhrsw   m7,   m4
-packuswb   m5,   m7
-vpermd m5,   m9,   m5
+pmulhrsw   m6,   m4
+packuswb   m5,   m6
 movu [r2],  ym5
 vextracti32x8[r2 + r3],  m5,

[x265] [PATCH 093 of 307] x86: AVX512 addAvg_32xN

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503385834 -19800
#  Tue Aug 22 12:40:34 2017 +0530
# Node ID 738f07186eb1d4bca84e9acdf70921ee9e2fee92
# Parent  ed1932a414bf5962bbeccfd5c9e208b7db90f77f
x86: AVX512 addAvg_32xN

Size  |  AVX2 performance | AVX512 performance
--
32x8  |  15.31x   |  19.98x
32x16 |  15.14x   |  23.25x
32x24 |  14.65x   |  23.95x
32x32 |  15.41x   |  24.76x
32x64 |  14.56x   |  24.53x

diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Sun Aug 13 18:18:28 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 22 12:40:34 2017 +0530
@@ -3964,6 +3964,19 @@
 p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
 p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
 p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512);
+p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512);
+p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
+p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
+p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = 
PFX(addAvg_32x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = 
PFX(addAvg_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = 
PFX(addAvg_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = 
PFX(addAvg_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = 
PFX(addAvg_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = 
PFX(addAvg_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = 
PFX(addAvg_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = 
PFX(addAvg_32x32_avx512);
 
 p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
 
diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmSun Aug 13 18:18:28 2017 +0530
+++ b/source/common/x86/mc-a.asmTue Aug 22 12:40:34 2017 +0530
@@ -3317,6 +3317,24 @@
 movu[r2 + r5], m0
 %endmacro
 
+%macro PROCESS_ADDAVG_32x2_AVX512 0 ; average two rows of int16 inputs (src0 + src1) into two rows of bytes at dst
+movu    m0, [r0]                    ; row 0 of src0 (r0) and src1 (r1)
+movu    m1, [r1]
+movu    m2, [r0 + r3]               ; row 1; r3/r4 = src0/src1 strides in bytes (doubled by ADDAVG_W32_AVX512)
+movu    m3, [r1 + r4]
+
+paddw   m0, m1                      ; src0 + src1
+pmulhrsw    m0, m4                  ; m4 = pw_256 (loaded by ADDAVG_W32_AVX512): rounded scale-down
+paddw   m0, m5                      ; m5 = pw_128 (loaded by ADDAVG_W32_AVX512): add offset
+paddw   m2, m3
+pmulhrsw    m2, m4
+paddw   m2, m5
+
+packuswb    m0, m2                  ; saturate to bytes; rows 0 and 1 end up interleaved per 128-bit lane
+vpermq  m0, m6, m0                  ; m6 = shuf_avx512 qword indices; presumably regroups row 0 low / row 1 high
+movu    [r2],   ym0                 ; low half -> dst row 0
+vextracti32x8   [r2 + r5],  m0, 1   ; high half -> dst row 1 (r5 = dst stride)
+%endmacro
 
;
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 
;
@@ -3344,6 +3362,32 @@
 ADDAVG_W64_AVX512 32
 ADDAVG_W64_AVX512 48
 ADDAVG_W64_AVX512 64
+
+%macro ADDAVG_W32_AVX512 1          ; %1 = block height; emits addAvg_32x%1
+INIT_ZMM avx512
+cglobal addAvg_32x%1, 6,6,7         ; r0 = src0, r1 = src1, r2 = dst, r3 = src0Stride, r4 = src1Stride, r5 = dstStride
+vbroadcasti32x8 m4, [pw_256]        ; multiplier consumed by pmulhrsw in the row macro
+vbroadcasti32x8 m5, [pw_128]        ; offset added after scaling
+mova    m6, [shuf_avx512]           ; qword permutation indices used after packuswb
+add r3, r3                          ; source strides are given in int16 elements -> convert to bytes
+add r4, r4
+
+%rep %1/2 - 1                       ; fully unrolled: two rows per step, pointer advance after all but the last step
+PROCESS_ADDAVG_32x2_AVX512
+lea r2, [r2 + 2 * r5]
+lea r0, [r0 + 2 * r3]
+lea r1, [r1 + 2 * r4]
+%endrep
+PROCESS_ADDAVG_32x2_AVX512          ; final two rows, no pointer advance needed
+RET
+%endmacro
+
+ADDAVG_W32_AVX512 8
+ADDAVG_W32_AVX512 16
+ADDAVG_W32_AVX512 24
+ADDAVG_W32_AVX512 32
+ADDAVG_W32_AVX512 48
+ADDAVG_W32_AVX512 64
 ;-
 ; addAvg avx512 code end
 ;-
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 097 of 307] x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503901717 -19800
#  Mon Aug 28 11:58:37 2017 +0530
# Node ID bf199a5eca5be148be8a0c91cd9f2e8e0e908059
# Parent  0355f0128b7d713c4a21c91d3cc5bed1e8b43c47
x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives

diff -r 0355f0128b7d -r bf199a5eca5b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Aug 24 12:20:07 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 28 11:58:37 2017 +0530
@@ -2253,6 +2253,15 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = 
PFX(filterPixelToShort_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = 
PFX(filterPixelToShort_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = 
PFX(filterPixelToShort_32x8_avx2);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = 
PFX(filterPixelToShort_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = 
PFX(filterPixelToShort_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = 
PFX(filterPixelToShort_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = 
PFX(filterPixelToShort_64x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = 
PFX(filterPixelToShort_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = 
PFX(filterPixelToShort_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = 
PFX(filterPixelToShort_64x64_avx512);
 
 p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 
@@ -4041,6 +4050,15 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = 
PFX(filterPixelToShort_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = 
PFX(filterPixelToShort_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = 
PFX(filterPixelToShort_32x8_avx2);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = 
PFX(filterPixelToShort_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = 
PFX(filterPixelToShort_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = 
PFX(filterPixelToShort_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = 
PFX(filterPixelToShort_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = 
PFX(filterPixelToShort_64x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = 
PFX(filterPixelToShort_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = 
PFX(filterPixelToShort_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = 
PFX(filterPixelToShort_64x64_avx512);
 
 p.cu[BLOCK_64x64].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
 p.cu[BLOCK_32x32].sse_ss = 
(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 146 of 307] x86: Fix crash in 32 bit main10 build from chroma_hps code

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1509681036 -19800
#  Fri Nov 03 09:20:36 2017 +0530
# Node ID 84dc38e191366e8b737d2a6014793afe830f3b35
# Parent  d3a1db4790b662306a3f1222cde66c006e10f604
x86: Fix crash in 32 bit main10 build from chroma_hps code

diff -r d3a1db4790b6 -r 84dc38e19136 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Thu Nov 02 14:10:18 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Fri Nov 03 09:20:36 2017 +0530
@@ -6447,9 +6447,9 @@
 movu[r2],  m6
 %endmacro
 
+%macro IPFILTER_CHROMA_PS_AVX512_32xN 1
+%if ARCH_X86_64 == 1
 INIT_ZMM avx512
-%if ARCH_X86_64 == 1
-%macro IPFILTER_CHROMA_PS_AVX512_32xN 1
 cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
 add r1d, r1d
 add r3d, r3d
@@ -6486,8 +6486,8 @@
 sub r6d, 2
 jnz .loop
 RET
+%endif
 %endmacro
-%endif
 
 IPFILTER_CHROMA_PS_AVX512_32xN 8
 IPFILTER_CHROMA_PS_AVX512_32xN 16
@@ -6645,9 +6645,9 @@
 movu[r2 + mmsize],m6
 %endmacro
 
+%macro IPFILTER_CHROMA_PS_AVX512_64xN 1
+%if ARCH_X86_64 == 1
 INIT_ZMM avx512
-%if ARCH_X86_64 == 1
-%macro IPFILTER_CHROMA_PS_AVX512_64xN 1
 cglobal interp_4tap_horiz_ps_64x%1, 4,7,9
 add r1d, r1d
 add r3d, r3d
@@ -6683,8 +6683,8 @@
 sub r6d, 2
 jnz .loop
 RET
+%endif
 %endmacro
-%endif
 
 IPFILTER_CHROMA_PS_AVX512_64xN 16
 IPFILTER_CHROMA_PS_AVX512_64xN 32
@@ -6750,10 +6750,9 @@
 movu[r2],   ym6
 %endmacro
 
-
+%macro IPFILTER_CHROMA_PS_AVX512_16xN 1
+%if ARCH_X86_64 == 1
 INIT_ZMM avx512
-%if ARCH_X86_64 == 1
-%macro IPFILTER_CHROMA_PS_AVX512_16xN 1
 cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
 add r1d, r1d
 add r3d, r3d
@@ -6789,8 +6788,8 @@
 sub r6d, 2
 jnz .loop
 RET
+%endif
 %endmacro
-%endif
 
 IPFILTER_CHROMA_PS_AVX512_16xN 4
 IPFILTER_CHROMA_PS_AVX512_16xN 8
@@ -6934,8 +6933,8 @@
 movu[r2 + mmsize],   ym6
 %endmacro
 
+%if ARCH_X86_64 == 1
 INIT_ZMM avx512
-%if ARCH_X86_64 == 1
 cglobal interp_4tap_horiz_ps_48x64, 4,7,9
 add r1d, r1d
 add r3d, r3d
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 150 of 307] x86: AVX512 interp_4tap_vert_ps_32xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510029383 -19800
#  Tue Nov 07 10:06:23 2017 +0530
# Node ID c983858deccb26e5b4c957fbff959c1e74f84756
# Parent  0775ffcdfc8a0c4ad078e8c4197f6bff7158efd8
x86: AVX512 interp_4tap_vert_ps_32xN for high bit depth

i444
Size  |  AVX2 performance | AVX512 performance
--
32x8  |  26.31x   |  43.62x
32x16 |  27.04x   |  45.52x
32x24 |  27.33x   |  43.80x
32x32 |  27.64x   |  44.25x
32x64 |  27.89x   |  44.69x

diff -r 0775ffcdfc8a -r c983858deccb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 06 15:41:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 07 10:06:23 2017 +0530
@@ -2645,6 +2645,11 @@
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = 
PFX(interp_4tap_vert_ps_32x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = 
PFX(interp_4tap_vert_ps_32x24_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = 
PFX(interp_4tap_vert_ps_32x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = 
PFX(interp_4tap_vert_pp_16x4_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = 
PFX(interp_4tap_vert_pp_16x12_avx512);
@@ -2659,6 +2664,10 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = 
PFX(interp_4tap_vert_pp_32x48_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = 
PFX(interp_4tap_vert_pp_32x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = 
PFX(interp_4tap_vert_ps_32x48_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = 
PFX(interp_4tap_vert_ps_32x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512);
@@ -2673,6 +2682,10 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = 
PFX(interp_4tap_vert_ps_32x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = 
PFX(interp_4tap_vert_ps_32x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = 
PFX(interp_4tap_vert_ps_32x24_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = 
PFX(interp_4tap_vert_pp_16x4_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = 
PFX(interp_4tap_vert_pp_16x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = 
PFX(interp_4tap_vert_pp_16x12_avx512);
diff -r 0775ffcdfc8a -r c983858deccb source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Mon Nov 06 15:41:43 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Nov 07 10:06:23 2017 +0530
@@ -7341,6 +7341,88 @@
 jnz .loop
 RET
 %endif
+
+%macro PROCESS_CHROMA_VERT_PS_32x2_AVX512 0
+movu  m1, [r0]
+movu  m3, [r0 + r1]
+punpcklwd m0, m1,  m3
+pmaddwd   m0, [r5]
+punpckhwd m1, m3
+pmaddwd   m1, [r5]
+
+movu  m4, [r0 + 2 * r1]
+punpcklwd m2, m3,  m4
+pmaddwd   m2, [r5]
+punpckhwd 

[x265] [PATCH 148 of 307] x86: AVX512 optimise interp_4tap_vert_pp_8xN high bit depth code

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1509862764 -19800
#  Sun Nov 05 11:49:24 2017 +0530
# Node ID 2d94e5d214922d0f6cb0126e4477db8dd33256e7
# Parent  410a223c2caa58321a3a6b3e0a91c1dee512667a
x86: AVX512 optimise interp_4tap_vert_pp_8xN high bit depth code

diff -r 410a223c2caa -r 2d94e5d21492 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Sat Nov 04 18:05:34 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Sun Nov 05 11:49:24 2017 +0530
@@ -5930,15 +5930,10 @@
 punpckhwd m3, m4
 pmaddwd   m3, [r5]
 
-lea   r0, [r0 + 2 * r1]
-lea   r6, [r6 + 2 * r1]
-lea   r8, [r8 + 2 * r1]
-lea   r9, [r9 + 2 * r1]
-
-movu  xm5,[r0 + r1]
-vinserti32x4  m5, [r6 + r1],   1
-vinserti32x4  m5, [r8 + r1],   2
-vinserti32x4  m5, [r9 + r1],   3
+movu  xm5,[r0 + r10]
+vinserti32x4  m5, [r6 + r10],  1
+vinserti32x4  m5, [r8 + r10],  2
+vinserti32x4  m5, [r9 + r10],  3
 punpcklwd m6, m4,  m5
 pmaddwd   m6, [r5 + mmsize]
 paddd m0, m6
@@ -5946,10 +5941,10 @@
 pmaddwd   m4, [r5 + mmsize]
 paddd m1, m4
 
-movu  xm4,[r0 + 2 * r1]
-vinserti32x4  m4, [r6 + 2 * r1],   1
-vinserti32x4  m4, [r8 + 2 * r1],   2
-vinserti32x4  m4, [r9 + 2 * r1],   3
+movu  xm4,[r0 + 4 * r1]
+vinserti32x4  m4, [r6 + 4 * r1],   1
+vinserti32x4  m4, [r8 + 4 * r1],   2
+vinserti32x4  m4, [r9 + 4 * r1],   3
 punpcklwd m6, m5,  m4
 pmaddwd   m6, [r5 + mmsize]
 paddd m2, m6
@@ -5987,7 +5982,7 @@
 
;-
 %if ARCH_X86_64
 INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_8x8, 5, 10, 9
+cglobal interp_4tap_vert_pp_8x8, 5, 11, 9
 add   r1d,r1d
 add   r3d,r3d
 sub   r0, r1
@@ -6001,6 +5996,7 @@
 %endif
 vbroadcasti32x8   m7, [INTERP_OFFSET_PP]
 vbroadcasti32x8   m8, [pw_pixel_max]
+lea   r10,[3 * r1]
 lea   r7, [3 * r3]
 PROCESS_CHROMA_VERT_PP_8x8_AVX512
 RET
@@ -6008,7 +6004,7 @@
 
 %macro FILTER_VER_PP_CHROMA_8xN_AVX512 1
 INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_8x%1, 5, 10, 9
+cglobal interp_4tap_vert_pp_8x%1, 5, 11, 9
 add   r1d,r1d
 add   r3d,r3d
 sub   r0, r1
@@ -6022,10 +6018,11 @@
 %endif
 vbroadcasti32x8   m7, [INTERP_OFFSET_PP]
 vbroadcasti32x8   m8, [pw_pixel_max]
+lea   r10,[3 * r1]
 lea   r7, [3 * r3]
 %rep %1/8 - 1
 PROCESS_CHROMA_VERT_PP_8x8_AVX512
-lea   r0, [r9]
+lea   r0, [r8 + 4 * r1]
 lea   r2, [r2 + 4 * r3]
 %endrep
 PROCESS_CHROMA_VERT_PP_8x8_AVX512
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 151 of 307] x86: AVX512 interp_4tap_vert_ps_64xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510030534 -19800
#  Tue Nov 07 10:25:34 2017 +0530
# Node ID 5517caaeb88b0f76a78706a867a4fa24fb17f64e
# Parent  c983858deccb26e5b4c957fbff959c1e74f84756
x86: AVX512 interp_4tap_vert_ps_64xN for high bit depth

i444
Size  |  AVX2 performance  | AVX512 performance
------+--------------------+-------------------
64x16 |   27.45x           |  42.45x
64x32 |   27.77x           |  43.65x
64x48 |   28.06x           |  43.04x
64x64 |   28.18x           |  43.34x

diff -r c983858deccb -r 5517caaeb88b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 07 10:06:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 07 10:25:34 2017 +0530
@@ -2639,6 +2639,10 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = 
PFX(interp_4tap_vert_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = 
PFX(interp_4tap_vert_pp_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = 
PFX(interp_4tap_vert_ps_64x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = 
PFX(interp_4tap_vert_ps_64x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = 
PFX(interp_4tap_vert_ps_64x48_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = 
PFX(interp_4tap_vert_ps_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = 
PFX(interp_4tap_vert_pp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
diff -r c983858deccb -r 5517caaeb88b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Nov 07 10:06:23 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Nov 07 10:25:34 2017 +0530
@@ -7423,6 +7423,128 @@
 FILTER_VER_PS_CHROMA_32xN_AVX512 48
 FILTER_VER_PS_CHROMA_32xN_AVX512 64
 %endif
+
+%macro PROCESS_CHROMA_VERT_PS_64x2_AVX512 0
+movu m1,  [r0]
+movu m3,  [r0 + r1]
+punpcklwdm0,  m1, m3
+pmaddwd  m0,  [r5]
+punpckhwdm1,  m3
+pmaddwd  m1,  [r5]
+
+movu m9,  [r0 + mmsize]
+movu m11, [r0 + r1 + mmsize]
+punpcklwdm8,  m9, m11
+pmaddwd  m8,  [r5]
+punpckhwdm9,  m11
+pmaddwd  m9,  [r5]
+
+movu m4,  [r0 + 2 * r1]
+punpcklwdm2,  m3, m4
+pmaddwd  m2,  [r5]
+punpckhwdm3,  m4
+pmaddwd  m3,  [r5]
+
+movu m12, [r0 + 2 * r1 + mmsize]
+punpcklwdm10, m11,m12
+pmaddwd  m10, [r5]
+punpckhwdm11, m12
+pmaddwd  m11, [r5]
+
+lea  r0,  [r0 + 2 * r1]
+movu m5,  [r0 + r1]
+punpcklwdm6,  m4, m5
+pmaddwd  m6,  [r5 + 1 * mmsize]
+padddm0,  m6
+punpckhwdm4,  m5
+pmaddwd  m4,  [r5 + 1 * mmsize]
+padddm1,  m4
+
+movu m13, [r0 + r1 + mmsize]
+punpcklwdm14, m12,m13
+pmaddwd  m14, [r5 + 1 * mmsize]
+padddm8,  m14
+punpckhwdm12, m13
+pmaddwd  m12, [r5 + 1 * mmsize]
+padddm9,  m12
+
+movu m4,  [r0 + 2 * r1]
+punpcklwdm6,  m5, m4
+pmaddwd  m6,  [r5 + 1 * mmsize]
+padddm2,  m6
+punpckhwdm5,  m4
+pmaddwd  m5,  [r5 + 1 * mmsize]
+padddm3,  m5
+
+movu m12, [r0 + 2 * r1 + mmsize]
+punpcklwdm14, m13,m12
+pmaddwd  m14, [r5 + 

[x265] [PATCH 147 of 307] x86: AVX512 interp_4tap_vert_pp_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1509798934 -19800
#  Sat Nov 04 18:05:34 2017 +0530
# Node ID 410a223c2caa58321a3a6b3e0a91c1dee512667a
# Parent  84dc38e191366e8b737d2a6014793afe830f3b35
x86: AVX512 interp_4tap_vert_pp_48x64 for high bit depth

AVX2 performance   : 26.37x
AVX512 performance : 42.37x

diff -r 84dc38e19136 -r 410a223c2caa source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Nov 03 09:20:36 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Sat Nov 04 18:05:34 2017 +0530
@@ -2639,6 +2639,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = 
PFX(interp_4tap_vert_pp_64x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = 
PFX(interp_4tap_vert_pp_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = 
PFX(interp_4tap_vert_pp_64x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = 
PFX(interp_4tap_vert_pp_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
diff -r 84dc38e19136 -r 410a223c2caa source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Fri Nov 03 09:20:36 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Sat Nov 04 18:05:34 2017 +0530
@@ -6242,6 +6242,172 @@
 FILTER_VER_PP_CHROMA_32xN_AVX512 64
 %endif
 
+%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0
+movu  m1, [r0]
+lea   r6, [r0 + 2 * r1]
+movu  m10,[r6]
+movu  m3, [r0 + r1]
+movu  m12,[r6 + r1]
+punpcklwd m0, m1,  m3
+punpcklwd m9, m10, m12
+pmaddwd   m0, [r5]
+pmaddwd   m9, [r5]
+punpckhwd m1, m3
+punpckhwd m10,m12
+pmaddwd   m1, [r5]
+pmaddwd   m10,[r5]
+
+movu  m4, [r0 + 2 * r1]
+movu  m13,[r6 + 2 * r1]
+punpcklwd m2, m3,  m4
+punpcklwd m11,m12, m13
+pmaddwd   m2, [r5]
+pmaddwd   m11,[r5]
+punpckhwd m3, m4
+punpckhwd m12,m13
+pmaddwd   m3, [r5]
+pmaddwd   m12,[r5]
+
+movu  m5, [r0 + r7]
+movu  m14,[r6 + r7]
+punpcklwd m6, m4,  m5
+punpcklwd m15,m13, m14
+pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m15,[r5 + mmsize]
+paddd m0, m6
+paddd m9, m15
+punpckhwd m4, m5
+punpckhwd m13,m14
+pmaddwd   m4, [r5 + mmsize]
+pmaddwd   m13,[r5 + mmsize]
+paddd m1, m4
+paddd m10,m13
+
+movu  m4, [r0 + 4 * r1]
+movu  m13,[r6 + 4 * r1]
+punpcklwd m6, m5,  m4
+punpcklwd m15,m14, m13
+pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m15,[r5 + mmsize]
+paddd m2, m6
+paddd m11,m15
+punpckhwd m5, m4
+punpckhwd m14,m13
+pmaddwd   m5, [r5 + mmsize]
+pmaddwd   m14,[r5 + mmsize]
+paddd m3, m5
+paddd m12,m14
+
+paddd m0, m7
+paddd m1, m7
+paddd m2, m7
+paddd m3, m7
+paddd m9, m7
+paddd m10,m7
+paddd m11,m7
+paddd m12,m7
+
+psrad m0, 

[x265] [PATCH 153 of 307] x86: AVX512 interp_4tap_vert_ps_48x64 for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510033854 -19800
#  Tue Nov 07 11:20:54 2017 +0530
# Node ID 9df6f8ae51300ebbb9d0941f7fc1cce1fdef4e94
# Parent  092438e72985dc1d75bf3be4f0c8c1485ec8
x86: AVX512 interp_4tap_vert_ps_48x64 for high bit depth

AVX2 performance   : 28.05x
AVX512 performance : 39.37x

diff -r 092438e7 -r 9df6f8ae5130 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 07 11:04:05 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 07 11:20:54 2017 +0530
@@ -2644,6 +2644,7 @@
 p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = 
PFX(interp_4tap_vert_ps_64x48_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = 
PFX(interp_4tap_vert_ps_64x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = 
PFX(interp_4tap_vert_pp_48x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = 
PFX(interp_4tap_vert_ps_48x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = 
PFX(interp_4tap_vert_pp_32x24_avx512);
diff -r 092438e7 -r 9df6f8ae5130 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Nov 07 11:04:05 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Nov 07 11:20:54 2017 +0530
@@ -7537,6 +7537,167 @@
 FILTER_VER_PS_CHROMA_32xN_AVX512 64
 %endif
 
+%macro PROCESS_CHROMA_VERT_PS_48x4_AVX512 0
+movu  m1, [r0]
+lea   r6, [r0 + 2 * r1]
+movu  m10,[r6]
+movu  m3, [r0 + r1]
+movu  m12,[r6 + r1]
+punpcklwd m0, m1,  m3
+punpcklwd m9, m10, m12
+pmaddwd   m0, [r5]
+pmaddwd   m9, [r5]
+punpckhwd m1, m3
+punpckhwd m10,m12
+pmaddwd   m1, [r5]
+pmaddwd   m10,[r5]
+
+movu  m4, [r0 + 2 * r1]
+movu  m13,[r6 + 2 * r1]
+punpcklwd m2, m3,  m4
+punpcklwd m11,m12, m13
+pmaddwd   m2, [r5]
+pmaddwd   m11,[r5]
+punpckhwd m3, m4
+punpckhwd m12,m13
+pmaddwd   m3, [r5]
+pmaddwd   m12,[r5]
+
+movu  m5, [r0 + r7]
+movu  m14,[r6 + r7]
+punpcklwd m6, m4,  m5
+punpcklwd m15,m13, m14
+pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m15,[r5 + mmsize]
+paddd m0, m6
+paddd m9, m15
+punpckhwd m4, m5
+punpckhwd m13,m14
+pmaddwd   m4, [r5 + mmsize]
+pmaddwd   m13,[r5 + mmsize]
+paddd m1, m4
+paddd m10,m13
+
+movu  m4, [r0 + 4 * r1]
+movu  m13,[r6 + 4 * r1]
+punpcklwd m6, m5,  m4
+punpcklwd m15,m14, m13
+pmaddwd   m6, [r5 + mmsize]
+pmaddwd   m15,[r5 + mmsize]
+paddd m2, m6
+paddd m11,m15
+punpckhwd m5, m4
+punpckhwd m14,m13
+pmaddwd   m5, [r5 + mmsize]
+pmaddwd   m14,[r5 + mmsize]
+paddd m3, m5
+paddd m12,m14
+
+paddd m0, m7
+paddd m1, m7
+paddd m2, m7
+paddd m3, m7
+paddd m9, m7
+paddd m10,m7
+paddd m11,m7
+paddd m12,m7
+
+psrad 

[x265] [PATCH 152 of 307] x86: AVX512 interp_4tap_vert_ps_16xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510032845 -19800
#  Tue Nov 07 11:04:05 2017 +0530
# Node ID 092438e72985dc1d75bf3be4f0c8c1485ec8
# Parent  5517caaeb88b0f76a78706a867a4fa24fb17f64e
x86: AVX512 interp_4tap_vert_ps_16xN for high bit depth

i444
Size  |  AVX2 performance | AVX512 performance
------+-------------------+-------------------
16x4  |  27.12x           |  33.94x
16x8  |  25.90x           |  30.27x
16x12 |  26.81x           |  34.40x
16x16 |  27.69x           |  33.72x
16x32 |  26.96x           |  36.42x
16x64 |  28.37x           |  35.85x

diff -r 5517caaeb88b -r 092438e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 07 10:25:34 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 07 11:04:05 2017 +0530
@@ -2660,6 +2660,12 @@
 p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = 
PFX(interp_4tap_vert_ps_16x4_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = 
PFX(interp_4tap_vert_ps_16x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = 
PFX(interp_4tap_vert_ps_16x12_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = 
PFX(interp_4tap_vert_ps_16x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = 
PFX(interp_4tap_vert_ps_16x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = 
PFX(interp_4tap_vert_ps_16x64_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = 
PFX(interp_4tap_vert_pp_8x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
@@ -2677,6 +2683,11 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = 
PFX(interp_4tap_vert_pp_16x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = 
PFX(interp_4tap_vert_ps_16x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = 
PFX(interp_4tap_vert_ps_16x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = 
PFX(interp_4tap_vert_ps_16x24_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = 
PFX(interp_4tap_vert_ps_16x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = 
PFX(interp_4tap_vert_ps_16x64_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = 
PFX(interp_4tap_vert_pp_8x8_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
@@ -2695,6 +2706,11 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = 
PFX(interp_4tap_vert_pp_16x12_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = 
PFX(interp_4tap_vert_pp_16x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = 
PFX(interp_4tap_vert_pp_16x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = 
PFX(interp_4tap_vert_ps_16x4_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = 
PFX(interp_4tap_vert_ps_16x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = 
PFX(interp_4tap_vert_ps_16x12_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = 
PFX(interp_4tap_vert_ps_16x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = 
PFX(interp_4tap_vert_ps_16x32_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = 
PFX(interp_4tap_vert_pp_8x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
diff -r 5517caaeb88b -r 092438e7 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Nov 07 10:25:34 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Nov 07 11:04:05 2017 +0530
@@ -7342,6 +7342,119 @@
 RET
 %endif
 
+%macro PROCESS_CHROMA_VERT_PS_16x4_AVX512 0
+movu  ym1,[r0]
+lea   r6, [r0 + 2 * r1]
+vinserti32x8  m1, [r6],1
+movu  ym3,

[x265] [PATCH 154 of 307] x86: AVX512 interp_4tap_vert_ps_8xN for high bit depth

2018-04-06 Thread mythreyi
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1510035839 -19800
#  Tue Nov 07 11:53:59 2017 +0530
# Node ID ae3775aa94f3acceb7d43ce7db2df6f8be6c6912
# Parent  9df6f8ae51300ebbb9d0941f7fc1cce1fdef4e94
x86: AVX512 interp_4tap_vert_ps_8xN for high bit depth

i444
Size  |  AVX2 performance | AVX512 performance
------+-------------------+-------------------
8x8   |  19.97x           |  28.50x
8x16  |  22.32x           |  27.74x
8x32  |  21.73x           |  29.04x

diff -r 9df6f8ae5130 -r ae3775aa94f3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 07 11:20:54 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 07 11:53:59 2017 +0530
@@ -2670,6 +2670,9 @@
 p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = 
PFX(interp_4tap_vert_pp_8x8_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = 
PFX(interp_4tap_vert_ps_8x8_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = 
PFX(interp_4tap_vert_ps_8x16_avx512);
+p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = 
PFX(interp_4tap_vert_ps_8x32_avx512);
 
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = 
PFX(interp_4tap_vert_pp_32x32_avx512);
@@ -2693,6 +2696,10 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = 
PFX(interp_4tap_vert_pp_8x64_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = 
PFX(interp_4tap_vert_ps_8x8_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = 
PFX(interp_4tap_vert_ps_8x16_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = 
PFX(interp_4tap_vert_ps_8x32_avx512);
+p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = 
PFX(interp_4tap_vert_ps_8x64_avx512);
 
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = 
PFX(interp_4tap_vert_pp_32x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = 
PFX(interp_4tap_vert_pp_32x16_avx512);
@@ -2715,6 +2722,9 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = 
PFX(interp_4tap_vert_pp_8x8_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = 
PFX(interp_4tap_vert_pp_8x16_avx512);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = 
PFX(interp_4tap_vert_pp_8x32_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = 
PFX(interp_4tap_vert_ps_8x8_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = 
PFX(interp_4tap_vert_ps_8x16_avx512);
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = 
PFX(interp_4tap_vert_ps_8x32_avx512);
 
 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = 
PFX(interp_4tap_vert_pp_24x32_avx512);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = 
PFX(interp_4tap_vert_pp_24x64_avx512);
diff -r 9df6f8ae5130 -r ae3775aa94f3 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm  Tue Nov 07 11:20:54 2017 +0530
+++ b/source/common/x86/ipfilter16.asm  Tue Nov 07 11:53:59 2017 +0530
@@ -7342,6 +7342,133 @@
 RET
 %endif
 
+%macro PROCESS_CHROMA_VERT_PS_8x8_AVX512 0
+movu  xm1,[r0]
+lea   r6, [r0 + 2 * r1]
+lea   r8, [r0 + 4 * r1]
+lea   r9, [r8 + 2 * r1]
+vinserti32x4  m1, [r6],1
+vinserti32x4  m1, [r8],2
+vinserti32x4  m1, [r9],3
+movu  xm3,[r0 + r1]
+vinserti32x4  m3, [r6 + r1],   1
+vinserti32x4  m3, [r8 + r1],   2
+vinserti32x4  m3, [r9 + r1],   3
+punpcklwd m0, m1,  m3
+pmaddwd   m0, [r5]
+punpckhwd m1, m3
+pmaddwd   m1, [r5]
+
+movu  xm4,[r0 + 2 * r1]
+vinserti32x4  m4, [r6 + 2 * r1],   1
+vinserti32x4  m4, [r8 + 2 * r1],   2
+vinserti32x4  m4, [r9 + 2 * r1],   3
+punpcklwd m2, m3,  m4
+  

  1   2   3   >