# HG changeset patch
# User Vignesh Vijayakumar
# Date 1507278656 -19800
#      Fri Oct 06 14:00:56 2017 +0530
# Node ID 44433ded38d00c79fa52e69e7c5c5127009f9ede
# Parent  ba20a08181382a2fb18a0d1aff7637d66fa41ac7
x86: Aligned routine implementation of add_ps primitive
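
For context: add_ps reconstructs a coding block by adding the 16-bit residual back onto the predicted pixels and clipping the result to the valid pixel range. The new add_ps_aligned entry points perform the same operation but are allowed to assume the stricter buffer alignment, which is why the AVX-512 kernels below use aligned loads/stores (mova) instead of unaligned ones (movu). A minimal scalar sketch of the behaviour, assuming an 8-bit pixel type (illustrative only; the actual C reference is the pixel_add_ps_c template in pixel.cpp):

    #include <cstdint>
    #include <cstddef>

    // Illustrative sketch only, not part of this patch: scalar add_ps for
    // 8-bit pixels. dst = clip(pred + resi) over a width x height block.
    static void add_ps_scalar(uint8_t* dst, intptr_t dstStride,
                              const uint8_t* pred, const int16_t* resi,
                              intptr_t predStride, intptr_t resiStride,
                              int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = pred[x] + resi[x];
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            dst  += dstStride;
            pred += predStride;
            resi += resiStride;
        }
    }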

diff -r ba20a0818138 -r 44433ded38d0 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/pixel.cpp   Fri Oct 06 14:00:56 2017 +0530
@@ -996,6 +996,7 @@
 #define LUMA_CU(W, H) \
     p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_c<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps        = pixel_add_ps_c<W, H>; \
+    p.cu[BLOCK_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_sp       = blockcopy_sp_c<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_c<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ss       = blockcopy_ss_c<W, H>; \
@@ -1169,7 +1170,8 @@
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>;  \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>;
 
     CHROMA_CU_420(2, 2)
     CHROMA_CU_420(4, 4)
@@ -1247,7 +1249,8 @@
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>;
 
     CHROMA_CU_422(2, 4)
     CHROMA_CU_422(4, 8)
diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.cpp
--- a/source/common/primitives.cpp      Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/primitives.cpp      Fri Oct 06 14:00:56 2017 +0530
@@ -126,6 +126,7 @@
         p.chroma[X265_CSP_I444].cu[i].sse_pp  = p.cu[i].sse_pp;
         p.chroma[X265_CSP_I444].cu[i].sub_ps  = p.cu[i].sub_ps;
         p.chroma[X265_CSP_I444].cu[i].add_ps  = p.cu[i].add_ps;
+        p.chroma[X265_CSP_I444].cu[i].add_ps_aligned = p.cu[i].add_ps_aligned;
         p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
         p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
         p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.h
--- a/source/common/primitives.h        Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/primitives.h        Fri Oct 06 14:00:56 2017 +0530
@@ -271,6 +271,7 @@
         calcresidual_t  calcresidual_aligned;
         pixel_sub_ps_t  sub_ps;
         pixel_add_ps_t  add_ps;
+        pixel_add_ps_t  add_ps_aligned;
         blockfill_s_t   blockfill_s;   // block fill, for DC transforms
         blockfill_s_t   blockfill_s_aligned;   // block fill, for DC transforms
         copy_cnt_t      copy_cnt;      // copy coeff while counting non-zero
@@ -405,6 +406,7 @@
             pixel_sse_t    sse_pp;
             pixel_sub_ps_t sub_ps;
             pixel_add_ps_t add_ps;
+            pixel_add_ps_t add_ps_aligned;
 
             copy_ps_t      copy_ps;
             copy_sp_t      copy_sp;
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp      Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp      Fri Oct 06 14:00:56 2017 +0530
@@ -2202,6 +2202,20 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
+        p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2);
+        p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2);
+        p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+        p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+        p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512);
+
         // 64 X N
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
@@ -4306,6 +4320,20 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
+        p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4);
+        p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4);
+        p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+        p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+        p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse4);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse4);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512);
+
         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/pixel.h Fri Oct 06 14:00:56 2017 +0530
@@ -45,6 +45,7 @@
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
     FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm   Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/pixeladd8.asm   Fri Oct 06 14:00:56 2017 +0530
@@ -1150,27 +1150,27 @@
 ;-----------------------------------------------------------------------------
 %macro PROCESS_ADD_PS_64x4_AVX512 0
     pmovzxbw    m0,         [r2]
-    pmovzxbw    m1,         [r2 + 32]
+    pmovzxbw    m1,         [r2 + mmsize/2]
     movu        m2,         [r3]
-    movu        m3,         [r3 + 64]
+    movu        m3,         [r3 + mmsize]
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
     vpermq      m0,         m4,      m0
     movu        [r0],       m0
     pmovzxbw    m0,         [r2 + r4]
-    pmovzxbw    m1,         [r2 + r4 + 32]
+    pmovzxbw    m1,         [r2 + r4 + mmsize/2]
     movu        m2,         [r3 + r5]
-    movu        m3,         [r3 + r5 + 64]
+    movu        m3,         [r3 + r5 + mmsize]
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
     vpermq      m0,         m4,      m0
     movu        [r0 + r1],  m0
     pmovzxbw    m0,         [r2 + 2 * r4]
-    pmovzxbw    m1,         [r2 + 2 * r4 + 32]
+    pmovzxbw    m1,         [r2 + 2 * r4 + mmsize/2]
     movu        m2,         [r3 + 2 * r5]
-    movu        m3,         [r3 + 2 * r5 + 64]
+    movu        m3,         [r3 + 2 * r5 + mmsize]
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
@@ -1178,15 +1178,16 @@
     movu        [r0 + 2 * r1],       m0
 
     pmovzxbw    m0,         [r2 + r7]
-    pmovzxbw    m1,         [r2 + r7 + 32]
+    pmovzxbw    m1,         [r2 + r7 + mmsize/2]
     movu        m2,         [r3 + r8]
-    movu        m3,         [r3 + r8 + 64]
+    movu        m3,         [r3 + r8 + mmsize]
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
     vpermq      m0,         m4,      m0
     movu        [r0 + r6],       m0
 %endmacro
+
 %macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
     movu    m0,     [r2]
     movu    m1,     [r2 + mmsize]
@@ -1233,6 +1234,92 @@
     movu    [r0 + r8 + mmsize],      m1
 %endmacro
 
+%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0
+    pmovzxbw    m0,         [r2]
+    pmovzxbw    m1,         [r2 + mmsize/2]
+    mova        m2,         [r3]
+    mova        m3,         [r3 + mmsize]
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m4,      m0
+    mova        [r0],       m0
+    pmovzxbw    m0,         [r2 + r4]
+    pmovzxbw    m1,         [r2 + r4 + mmsize/2]
+    mova        m2,         [r3 + r5]
+    mova        m3,         [r3 + r5 + mmsize]
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m4,      m0
+    mova        [r0 + r1],  m0
+    pmovzxbw    m0,         [r2 + 2 * r4]
+    pmovzxbw    m1,         [r2 + 2 * r4 + mmsize/2]
+    mova        m2,         [r3 + 2 * r5]
+    mova        m3,         [r3 + 2 * r5 + mmsize]
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m4,      m0
+    mova        [r0 + 2 * r1],       m0
+
+    pmovzxbw    m0,         [r2 + r7]
+    pmovzxbw    m1,         [r2 + r7 + mmsize/2]
+    mova        m2,         [r3 + r8]
+    mova        m3,         [r3 + r8 + mmsize]
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m4,      m0
+    mova        [r0 + r6],       m0
+%endmacro
+
+%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0
+    mova    m0,     [r2]
+    mova    m1,     [r2 + mmsize]
+    mova    m2,     [r3]
+    mova    m3,     [r3 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0],                m0
+    mova    [r0 + mmsize],       m1
+
+    mova    m0,     [r2 + r4]
+    mova    m1,     [r2 + r4 + mmsize]
+    mova    m2,     [r3 + r5]
+    mova    m3,     [r3 + r5 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0 + r1],           m0
+    mova    [r0 + r1 + mmsize],  m1
+
+    mova    m0,     [r2 + r4 * 2]
+    mova    m1,     [r2 + r4 * 2 + mmsize]
+    mova    m2,     [r3 + r5 * 2]
+    mova    m3,     [r3 + r5 * 2 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0 + r1 * 2],           m0
+    mova    [r0 + r1 * 2 + mmsize],  m1
+
+    mova    m0,     [r2 + r6]
+    mova    m1,     [r2 + r6 + mmsize]
+    mova    m2,     [r3 + r7]
+    mova    m3,     [r3 + r7 + mmsize]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0 + r8],               m0
+    mova    [r0 + r8 + mmsize],      m1
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1256,6 +1343,25 @@
 %endrep
     PROCESS_ADD_PS_64x4_HBD_AVX512
     RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 6
+    vbroadcasti32x8  m5,     [pw_pixel_max]
+    pxor             m4,     m4
+    add             r4d,     r4d
+    add             r5d,     r5d
+    add             r1d,     r1d
+    lea              r6,     [r4 * 3]
+    lea              r7,     [r5 * 3]
+    lea              r8,     [r1 * 3]
+%rep 15
+    PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
+    RET
 %endif
 %else
 %if ARCH_X86_64
@@ -1274,8 +1380,25 @@
 %endrep
     PROCESS_ADD_PS_64x4_AVX512
     RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 4
+    add         r5,         r5
+    lea         r6,         [3 * r1]
+    lea         r7,         [3 * r4]
+    lea         r8,         [3 * r5]
+    mova        m4,         [store_shuf1_avx512]
+%rep 15
+    PROCESS_ADD_PS_64x4_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_64x4_ALIGNED_AVX512
+    RET
 %endif
 %endif
+
 %macro PROCESS_ADD_PS_32x4_AVX512 0
     pmovzxbw    m0,         [r2]
     movu        m1,         [r3]
@@ -1298,6 +1421,7 @@
     movu           [r0 + r1 * 2],   ym0
     vextracti32x8  [r0 + r8],        m0,    1
 %endmacro
+
 %macro PROCESS_ADD_PS_32x4_HBD_AVX512 0
     movu    m0,     [r2]
     movu    m1,     [r2 + r4]
@@ -1322,6 +1446,53 @@
     movu    [r0 + r8],               m1
 %endmacro
 
+%macro PROCESS_ADD_PS_32x4_ALIGNED_AVX512 0
+    pmovzxbw    m0,         [r2]
+    mova        m1,         [r3]
+    pmovzxbw    m2,         [r2 + r4]
+    mova        m3,         [r3 + r5]
+    paddw       m0,         m1
+    paddw       m2,         m3
+    packuswb    m0,         m2
+    vpermq      m0,         m4,      m0
+    mova           [r0],       ym0
+    vextracti32x8  [r0 + r1],   m0,    1
+    pmovzxbw    m0,         [r2 + r4 * 2]
+    mova        m1,         [r3 + r5 * 2]
+    pmovzxbw    m2,         [r2 + r6]
+    mova        m3,         [r3 + r7]
+    paddw       m0,         m1
+    paddw       m2,         m3
+    packuswb    m0,         m2
+    vpermq      m0,         m4,      m0
+    mova           [r0 + r1 * 2],   ym0
+    vextracti32x8  [r0 + r8],        m0,    1
+%endmacro
+
+%macro PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 0
+    mova    m0,     [r2]
+    mova    m1,     [r2 + r4]
+    mova    m2,     [r3]
+    mova    m3,     [r3 + r5]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0],                m0
+    mova    [r0 + r1],           m1
+
+    mova    m0,     [r2 + r4 * 2]
+    mova    m1,     [r2 + r6]
+    mova    m2,     [r3 + r5 * 2]
+    mova    m3,     [r3 + r7]
+    paddw   m0,     m2
+    paddw   m1,     m3
+
+    CLIPW2  m0, m1, m4, m5
+    mova    [r0 + r1 * 2],           m0
+    mova    [r0 + r8],               m1
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
@@ -1345,6 +1516,7 @@
 %endrep
     PROCESS_ADD_PS_32x4_HBD_AVX512
     RET
+
 INIT_ZMM avx512
 cglobal pixel_add_ps_32x64, 6, 9, 6
     vbroadcasti32x8  m5,     [pw_pixel_max]
@@ -1363,6 +1535,44 @@
 %endrep
     PROCESS_ADD_PS_32x4_HBD_AVX512
     RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 6
+    vbroadcasti32x8  m5,     [pw_pixel_max]
+    pxor             m4,     m4
+    add             r4d,     r4d
+    add             r5d,     r5d
+    add             r1d,     r1d
+    lea              r6,     [r4 * 3]
+    lea              r7,     [r5 * 3]
+    lea              r8,     [r1 * 3]
+%rep 7
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 6
+    vbroadcasti32x8  m5,     [pw_pixel_max]
+    pxor             m4,     m4
+    add             r4d,     r4d
+    add             r5d,     r5d
+    add             r1d,     r1d
+    lea              r6,     [r4 * 3]
+    lea              r7,     [r5 * 3]
+    lea              r8,     [r1 * 3]
+%rep 15
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+    RET
 %endif
 %else
 %if ARCH_X86_64
@@ -1398,6 +1608,39 @@
 %endrep
     PROCESS_ADD_PS_32x4_AVX512
     RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 5
+    add         r5,         r5
+    lea         r6,         [r4 * 3]
+    lea         r7,         [r5 * 3]
+    lea         r8,         [r1 * 3]
+    mova        m4,         [store_shuf1_avx512]
+%rep 7
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 5
+    add         r5,         r5
+    lea         r6,         [r4 * 3]
+    lea         r7,         [r5 * 3]
+    lea         r8,         [r1 * 3]
+    mova        m4,         [store_shuf1_avx512]
+
+%rep 15
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
+%endrep
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+    RET
 %endif
 %endif
 ;-----------------------------------------------------------------------------
diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp      Wed Oct 04 17:02:59 2017 +0530
+++ b/source/test/pixelharness.cpp      Fri Oct 06 14:00:56 2017 +0530
@@ -876,6 +876,31 @@
     return true;
 }
 
+bool PixelHarness::check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt)
+{
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;
+    intptr_t stride2 = 64, stride = STRIDE;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index1 = rand() % TEST_CASES;
+        int index2 = rand() % TEST_CASES;
+        checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
+        ref(ref_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += 2 * INCR;
+    }
+    return true;
+}
+
 bool PixelHarness::check_pixel_var(var_t ref, var_t opt)
 {
     int j = 0;
@@ -2288,6 +2313,15 @@
             }
         }
 
+        if (opt.cu[part].add_ps_aligned)
+        {
+            if (!check_pixel_add_ps_aligned(ref.cu[part].add_ps_aligned, opt.cu[part].add_ps_aligned))
+            {
+                printf("add_ps_aligned[%s] failed\n", lumaPartStr[part]);
+                return false;
+            }
+        }
+
         if (opt.cu[part].copy_ss)
         {
             if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss))
@@ -2376,6 +2410,14 @@
                     return false;
                 }
             }
+            if (opt.chroma[i].cu[part].add_ps_aligned)
+            {
+                if (!check_pixel_add_ps_aligned(ref.chroma[i].cu[part].add_ps_aligned, opt.chroma[i].cu[part].add_ps_aligned))
+                {
+                    printf("chroma_add_ps_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+                    return false;
+                }
+            }
             if (opt.chroma[i].cu[part].copy_sp)
             {
                 if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp))
@@ -3042,6 +3084,11 @@
             HEADER("add_ps[%s]", lumaPartStr[part]);
             REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
         }
+        if (opt.cu[part].add_ps_aligned)
+        {
+            HEADER("add_ps[%s]", lumaPartStr[part]);
+            REPORT_SPEEDUP(opt.cu[part].add_ps_aligned, 
ref.cu[part].add_ps_aligned, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+        }
         if (opt.cu[part].copy_ss)
         {
             HEADER("copy_ss[%s]", lumaPartStr[part]);
@@ -3113,6 +3160,11 @@
                 HEADER("[%s]  add_ps[%s]", x265_source_csp_names[i], 
chromaPartStr[i][part]);
                 REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, 
ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, 
STRIDE);
             }
+            if (opt.chroma[i].cu[part].add_ps_aligned)
+            {
+                HEADER("[%s]  add_ps_aligned[%s]", x265_source_csp_names[i], 
chromaPartStr[i][part]);
+                REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps_aligned, 
ref.chroma[i].cu[part].add_ps_aligned, pbuf1, FENC_STRIDE, pbuf2, sbuf1, 
STRIDE, STRIDE);
+            }
             if (opt.chroma[i].cu[part].sa8d)
             {
                 HEADER("[%s] sa8d[%s]", x265_source_csp_names[i], 
chromaPartStr[i][part]);
diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.h
--- a/source/test/pixelharness.h        Wed Oct 04 17:02:59 2017 +0530
+++ b/source/test/pixelharness.h        Fri Oct 06 14:00:56 2017 +0530
@@ -81,6 +81,7 @@
     bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
     bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
     bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
+    bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);
     bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
     bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
     bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);