[x265] [PATCH] arm: Implement filterPixelToShort ARM NEON asm

2016-03-01 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456831820 -19800
#  Tue Mar 01 17:00:20 2016 +0530
# Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106
# Parent  79c00b9bc2b81afef2e41526fc3c390528f3174c
arm: Implement filterPixelToShort ARM NEON asm

diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Tue Mar 01 12:18:18 2016 +0530
+++ b/source/common/CMakeLists.txt  Tue Mar 01 17:00:20 2016 +0530
@@ -89,7 +89,7 @@
 set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
 
 # add ARM assembly/intrinsic files here
-set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
+set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S 
ipfilter8.S)
 set(VEC_PRIMITIVES)
 
 set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Tue Mar 01 12:18:18 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Tue Mar 01 17:00:20 2016 +0530
@@ -33,6 +33,7 @@
 #include "blockcopy8.h"
 #include "pixel.h"
 #include "pixel-util.h"
+#include "ipfilter8.h"
 }
 
 namespace X265_NS {
@@ -42,6 +43,33 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// filterPixelToShort
+p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
+p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
+p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
+p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
+p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
+p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
+p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
+p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
+p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
+p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
+p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
+p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
+p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
+p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
+p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
+p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
+p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
+p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
+p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
+p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
+p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
+p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
+p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
+p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
+p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+
 // Block_fill
 p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
 p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/ipfilter8.S Tue Mar 01 17:00:20 2016 +0530
@@ -0,0 +1,694 @@
+/*****
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ */
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* ds

[x265] [PATCH 0 of 3 ] Patch series for new primitive pelFilterChroma and ASM code

2016-02-26 Thread dnyaneshwar
Speed up =

pelFilterChroma_Vertical : 600c -> 300c
pelFilterChroma_Horizontal : 585c -> 160c
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 3] asm: separated pelFilterChroma function into horizontal & vertical primitives for asm

2016-02-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456466613 -19800
#  Fri Feb 26 11:33:33 2016 +0530
# Node ID 5ff8ee940ad7f4d34b106ae4999b996245c87919
# Parent  01782e7f0a8cb93efbe4ff1534602ff9055c8565
asm: separated pelFilterChroma function into horizontal & vertical primitives 
for asm

diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/deblock.cpp
--- a/source/common/deblock.cpp Thu Feb 25 12:17:57 2016 +0530
+++ b/source/common/deblock.cpp Fri Feb 26 11:33:33 2016 +0530
@@ -319,27 +319,6 @@
 }
 }
 
-/* Deblocking of one line/column for the chrominance component
- * \param src pointer to picture data
- * \param offset  offset value for picture data
- * \param tc  tc value
- * \param maskP   indicator to disable filtering on partP
- * \param maskQ   indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-{
-int16_t m4  = (int16_t)src[0];
-int16_t m3  = (int16_t)src[-offset];
-int16_t m5  = (int16_t)src[offset];
-int16_t m2  = (int16_t)src[-offset * 2];
-
-int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-src[-offset] = x265_clip(m3 + (delta & maskP));
-src[0] = x265_clip(m4 - (delta & maskQ));
-}
-}
-
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t 
depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
 PicYuv* reconPic = cuQ->m_encData->m_reconPic;
@@ -517,7 +496,7 @@
 int32_t tc = s_tcTable[indexTC] << bitdepthShift;
 pixel* srcC = srcChroma[chromaIdx];
 
-pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, 
maskQ);
+primitives.pelFilterChroma[dir](srcC + unitOffset, srcStep, 
offset, tc, maskP, maskQ);
 }
 }
 }
diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp  Thu Feb 25 12:17:57 2016 +0530
+++ b/source/common/loopfilter.cpp  Fri Feb 26 11:33:33 2016 +0530
@@ -158,6 +158,27 @@
 src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * 
m6 + 2 * m7 + 4) >> 3) - m6) + m6);
 }
 }
+
+/* Deblocking of one line/column for the chrominance component
+* \param src pointer to picture data
+* \param offset  offset value for picture data
+* \param tc  tc value
+* \param maskP   indicator to disable filtering on partP
+* \param maskQ   indicator to disable filtering on partQ */
+static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, 
int32_t tc, int32_t maskP, int32_t maskQ)
+{
+for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+{
+int16_t m4 = (int16_t)src[0];
+int16_t m3 = (int16_t)src[-offset];
+int16_t m5 = (int16_t)src[offset];
+int16_t m2 = (int16_t)src[-offset * 2];
+
+int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
+src[-offset]  = x265_clip(m3 + (delta & maskP));
+src[0]= x265_clip(m4 - (delta & maskQ));
+}
+}
 }
 
 namespace X265_NS {
@@ -176,5 +197,7 @@
 // C code is same for EDGE_VER and EDGE_HOR only asm code is different
 p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
 p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
+p.pelFilterChroma[0] = pelFilterChroma_c;
+p.pelFilterChroma[1] = pelFilterChroma_c;
 }
 }
diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/primitives.h
--- a/source/common/primitives.hThu Feb 25 12:17:57 2016 +0530
+++ b/source/common/primitives.hFri Feb 26 11:33:33 2016 +0530
@@ -197,6 +197,7 @@
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, 
uint8_t *baseCtxMod, intptr_t ctxOffset);
 
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tcP, int32_t tcQ);
+typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tc, int32_t maskP, int32_t maskQ);
 
 /* Function pointers to optimized encoder primitives. Each pointer can 
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -332,6 +333,7 @@
 costC1C2Flag_tcostC1C2Flag;
 
 pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
+pelFilterChroma_t pelFilterChroma[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
 /* There is one set of chroma primitives per color space. An encoder will
  * have just a single color space and thus it will only ever use one entry
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 3] asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12

2016-02-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456466696 -19800
#  Fri Feb 26 11:34:56 2016 +0530
# Node ID d7d0c03b5e6e7fd0258d609ad5e9f4d7c0a40390
# Parent  59d9eca3d144e71f11d509a5dd40b634bb9ab500
asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & 
main12

diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Feb 26 11:34:56 2016 +0530
@@ -1101,6 +1101,11 @@
 }
 if (cpuMask & X265_CPU_SSE4)
 {
+p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
+
 p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
 p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/const-a.asm Fri Feb 26 11:34:56 2016 +0530
@@ -69,6 +69,7 @@
 const pb_000F,   db 0xff
 times 15 db 0x00
 const pb_shuf_off4, times  2 db   0,   4,   1,   5,   2,   6,   3,   7
+const pw_shuf_off4, times  1 db   0,   1,   8,   9,   2,   3,  10,  
11,   4,   5,  12,  13,   6,   7,  14,  15
 
 ;; 16-bit constants
 
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/loopfilter.asm  Fri Feb 26 11:34:56 2016 +0530
@@ -51,6 +51,8 @@
 cextern hmul_16p
 cextern pw_1_
 cextern pb_shuf_off4
+cextern pw_shuf_off4
+
 
;
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* 
signLeft, intptr_t stride)
 
;
@@ -3758,6 +3760,9 @@
 
 INIT_XMM sse4
 cglobal pelFilterLumaStrong_H, 5,7,10
+%if HIGH_BIT_DEPTH
+add r2d, r2d
+%endif
 mov r1, r2
 neg r3d
 neg r4d
@@ -3766,6 +3771,16 @@
 lea r5, [r2 * 3]
 lea r6, [r1 * 3]
 
+%if HIGH_BIT_DEPTH
+movum4, [r0]; src[0]
+movum3, [r0 + r1]   ; src[-offset]
+movum2, [r0 + r1 * 2]   ; src[-offset * 2]
+movum1, [r0 + r6]   ; src[-offset * 3]
+movum0, [r0 + r1 * 4]   ; src[-offset * 4]
+movum5, [r0 + r2]   ; src[offset]
+movum6, [r0 + r2 * 2]   ; src[offset * 2]
+movum7, [r0 + r5]   ; src[offset * 3]
+%else
 pmovzxbwm4, [r0]; src[0]
 pmovzxbwm3, [r0 + r1]   ; src[-offset]
 pmovzxbwm2, [r0 + r1 * 2]   ; src[-offset * 2]
@@ -3774,6 +3789,7 @@
 pmovzxbwm5, [r0 + r2]   ; src[offset]
 pmovzxbwm6, [r0 + r2 * 2]   ; src[offset * 2]
 pmovzxbwm7, [r0 + r5]   ; src[offset * 3]
+%endif
 
 paddw   m0, m0  ; m0*2
 movam8, m2
@@ -3841,6 +3857,15 @@
 paddw   m0, m1
 paddw   m3, m4
 paddw   m9, m5
+
+%if HIGH_BIT_DEPTH
+movh[r0 + r6], m0
+movhps  [r0 + r1], m0
+movh[r0], m3
+movhps  [r0 + r2 * 2], m3
+movh[r0 + r2 * 1], m9
+movhps  [r0 + r1 * 2], m9
+%else
 packuswbm0, m0
 packuswbm3, m9
 
@@ -3850,14 +3875,41 @@
 pextrd  [r0 + r2 * 2], m3, 1
 pextrd  [r0 + r2 * 1], m3, 2
 pextrd  [r0 + r1 * 2], m3, 3
+%endif
 RET
 
 INIT_XMM sse4
 cglobal pelFilterLumaStrong_V, 5,5,10
+%if HIGH_BIT_DEPTH
+add r1d, r1d
+%endif
 neg r3d
 neg r4d
 lea r2, [r1 * 3]
 
+%if HIGH_BIT_DEPTH
+movum0, [r0 - 8]; src[-offset * 4] row 0
+movum1, [r0 + r1 * 1 - 8]   ; src[-offset * 4] row 1
+movum2, [r0 + r1 * 2 - 8]   ; src[-offset * 4] row 2
+movum3, [r0 + r2 * 1 - 8]   ; src[-offset * 4] row 3
+
+punpckhwd   m4, m0, m1  ; [m4 m4 m5 m5 m6 m6 m7 m7]
+punpcklwd   m0, m1  ; [m0 m0 m1 m1 m2 m2 m3 m3]
+
+punpckhwd   m5, m2, m3  ; [m4 m4 m5 m5 m6 m6 m7 m7]
+punpcklwd   m2, m3  ; [m0 m0 m1 m1 m2 m2 m3 m3]
+
+punpckhdq   m3, m0, m2  ; [m2 m2 m2 m2 m3 m3 m3 

[x265] [PATCH] arm: Implement pixel_ssd_s ARM NEON asm

2016-02-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456136894 -19800
#  Mon Feb 22 15:58:14 2016 +0530
# Node ID ed3dd1a26cb5801e306db8f1d4a52cd1f4d6620b
# Parent  4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
arm: Implement pixel_ssd_s ARM NEON asm

diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Mon Feb 22 15:58:14 2016 +0530
@@ -42,6 +42,12 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// ssd_s
+p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
+p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
+p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
+
 // sse_ss
 p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
 p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_neon);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/pixel.h Mon Feb 22 15:58:14 2016 +0530
@@ -123,6 +123,12 @@
 sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
 sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
 
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride);
+
 void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, 
const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, 
const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* 
b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/ssd-a.S Mon Feb 22 15:58:14 2016 +0530
@@ -371,4 +371,99 @@
 bx  lr
 endfunc
 
+function x265_pixel_ssd_s_4x4_neon
+add r1, r1
+vld1.s16{d4}, [r0], r1
+vld1.s16{d5}, [r0], r1
+vld1.s16{d6}, [r0], r1
+vld1.s16{d7}, [r0]
+vmull.s16   q0, d4, d4
+vmull.s16   q1, d5, d5
+vmlal.s16   q0, d6, d6
+vmlal.s16   q1, d7, d7
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
 
+function x265_pixel_ssd_s_8x8_neon
+add r1, r1
+vld1.s16{q8}, [r0], r1
+vld1.s16{q9}, [r0], r1
+vmull.s16   q0, d16, d16
+vmull.s16   q1, d17, d17
+vmlal.s16   q0, d18, d18
+vmlal.s16   q1, d19, d19
+.rept 3
+vld1.s16{q8}, [r0], r1
+vld1.s16{q9}, [r0], r1
+vmlal.s16   q0, d16, d16
+vmlal.s16   q1, d17, d17
+vmlal.s16   q0, d18, d18
+vmlal.s16   q1, d19, d19
+.endr
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_ssd_s_16x16_neon
+add r1, r1
+mov r12, #4
+veor.u8 q0, q0
+veor.u8 q1, q1
+
+.loop_ssd_s_16:
+subsr12, #1
+.rept 2
+vld1.s16{q8-q9}, [r0], r1
+vld1.s16{q10-q11}, [r0], r1
+vmlal.s16   q0, d16, d16
+vmlal.s16   q1, d17, d17
+vmlal.s16   q0, d18, d18
+vmlal.s16   q1, d19, d19
+vmlal.s16   q0, d20, d20
+vmlal.s16   q1, d21, d21
+vmlal.s16   q0, d22, d22
+vmlal.s16   q1, d23, d23
+.endr
+bne .loop_ssd_s_16
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_ssd_s_32x32_neon
+add r1, r1
+sub r1, #32
+mov r12, #8
+veor.u8 q0, q0
+veor.u8 q1, q1
+
+.loop_ssd_s_32:
+subsr12, #1
+.rept 4
+vld1.s16{q8-q9}, [r0]!
+vld1.s16{q10-q11}, [r0], r1
+vmlal.s16   q0, d16, d16
+vmlal.s16   q1, d17, d17
+vmlal.s16   q0, d18, d18
+vmlal.s16   q1, d19, d19
+vmlal.s16   q0, d20, d20
+vmlal.s16   q1, d21, d21
+vmlal.s16   q0, d22, d22
+vmlal.s16   q1, d23, d23
+.endr
+bne .loop_ssd_s_32
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/lis

[x265] [PATCH] arm: Implement pixel_sse_ss ARM NEON asm

2016-02-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456382751 -19800
#  Thu Feb 25 12:15:51 2016 +0530
# Node ID 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
# Parent  45c0dbd43dec24608199362a86bfba6ef91cacca
arm: Implement pixel_sse_ss ARM NEON asm

diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Thu Feb 25 12:15:51 2016 +0530
@@ -42,6 +42,13 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// sse_ss
+p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
+p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_neon);
+p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon);
+p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
+p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+
 // pixel_sub_ps
 p.cu[BLOCK_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
 p.cu[BLOCK_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530
@@ -117,6 +117,12 @@
 sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
 sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
 
+sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, 
const int16_t* pix2, intptr_t stride_pix2);
+
 void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, 
const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, 
const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* 
b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530
@@ -194,3 +194,181 @@
 vmov.32 r0, d0[0]
 bx  lr
 endfunc
+
+function x265_pixel_sse_ss_4x4_neon
+add r1, r1
+add r3, r3
+
+vld1.s16{d16}, [r0], r1
+vld1.s16{d18}, [r2], r3
+vsub.s16q2, q8, q9
+vld1.s16{d16}, [r0], r1
+vmull.s16   q0, d4, d4
+vld1.s16{d18}, [r2], r3
+
+vsub.s16q2, q8, q9
+vld1.s16{d16}, [r0], r1
+vmlal.s16   q0, d4, d4
+vld1.s16{d18}, [r2], r3
+
+vsub.s16q2, q8, q9
+vld1.s16{d16}, [r0], r1
+vmlal.s16   q0, d4, d4
+vld1.s16{d18}, [r2], r3
+
+vsub.s16q2, q8, q9
+vmlal.s16   q0, d4, d4
+
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_sse_ss_8x8_neon
+add r1, r1
+add r3, r3
+
+vld1.s16{q8}, [r0], r1
+vld1.s16{q9}, [r2], r3
+vsub.s16q8, q9
+vmull.s16   q0, d16, d16
+vmull.s16   q1, d17, d17
+
+.rept 7
+vld1.s16{q8}, [r0], r1
+vld1.s16{q9}, [r2], r3
+vsub.s16q8, q9
+vmlal.s16   q0, d16, d16
+vmlal.s16   q1, d17, d17
+.endr
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_sse_ss_16x16_neon
+add r1, r1
+add r3, r3
+
+mov r12, #4
+veor.u8 q0, q0
+veor.u8 q1, q1
+
+.loop_sse_ss_16:
+subsr12, #1
+.rept 4
+vld1.s16{q8-q9}, [r0], r1
+vld1.s16{q10-q11}, [r2], r3
+vsub.s16q8, q10
+vsub.s16q9, q11
+vmlal.s16   q0, d16, d16
+vmlal.s16   q1, d17, d17
+vmlal.s16   q0, d18, d18
+vmlal.s16   q1, d19, d19
+.endr
+bne .loop_sse_ss_16
+vadd.s32q0, q1
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_sse_ss_32x32_neon
+add r1, r1
+add r3, r3
+sub r1, #32
+sub r3, #32
+mov r12, #8
+veor.u8 q0, q0
+veor.u8 q1, q1
+
+.loop_sse_ss_32:
+subsr12, #1
+.rept 4
+vld1.s16{q

[x265] [PATCH] arm: Implement pixel_sse_pp ARM NEON asm

2016-02-18 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1455794242 -19800
#  Thu Feb 18 16:47:22 2016 +0530
# Node ID 5e4593ef30cc4bccc5eec2a0109b8dff397e5c93
# Parent  b31fa1a4ef43697e163d17dda0f4650de45d6ff9
arm: Implement pixel_sse_pp ARM NEON asm

diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/CMakeLists.txt  Thu Feb 18 16:47:22 2016 +0530
@@ -89,7 +89,7 @@
 set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
 
 # add ARM assembly/intrinsic files here
-set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S)
+set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
 set(VEC_PRIMITIVES)
 
 set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Thu Feb 18 16:47:22 2016 +0530
@@ -42,6 +42,13 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// sse_pp
+p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
+p.cu[BLOCK_8x8].sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
 // pixel_var
 p.cu[BLOCK_8x8].var   = PFX(pixel_var_8x8_neon);
 p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/pixel.h Thu Feb 18 16:47:22 2016 +0530
@@ -111,4 +111,10 @@
 void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const 
pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, 
int32_t* res);
 void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const 
pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, 
int32_t* res);
 
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, 
const pixel* pix2, intptr_t stride_pix2);
+
 #endif // ifndef X265_I386_PIXEL_ARM_H
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/ssd-a.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/ssd-a.S Thu Feb 18 16:47:22 2016 +0530
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ */
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+
+function x265_pixel_sse_pp_4x4_neon
+vld1.32 {d16[]}, [r0], r1
+vld1.32 {d17[]}, [r2], r3
+vsubl.u8q2, d16, d17
+vld1.32 {d16[]}, [r0], r1
+vmull.s16   q0, d4, d4
+vld1.32 {d17[]}, [r2], r3
+
+vsubl.u8q2, d16, d17
+vld1.32 {d16[]}, [r0], r1
+vmlal.s16   q0, d4, d4
+vld1.32 {d17[]}, [r2], r3
+
+vsubl.u8q2, d16, d17
+vld1.32 {d16[]}, [r0], r1
+vmlal.s16   q0, d4, d4
+vld1.32 {d17[]}, [r2], r3
+
+vsubl.u8q2, d16, d17
+vmlal.s16   q0, d4, d4
+vadd.s32d0, d0, d1
+vpadd.s32   d0, d0, d0
+vmov.32 r0, d0[0]
+bx  lr
+endfunc
+
+function x265_pixel_sse_pp_8x8_neon
+vld1.64  

[x265] [PATCH] arm: Implement pixel_var ARM NEON asm

2016-02-18 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1455793621 -19800
#  Thu Feb 18 16:37:01 2016 +0530
# Node ID b31fa1a4ef43697e163d17dda0f4650de45d6ff9
# Parent  cb8769b5ea70304d658173e02deb254fb8572bd6
arm: Implement pixel_var ARM NEON asm

diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Thu Feb 18 10:23:24 2016 +0530
+++ b/source/common/CMakeLists.txt  Thu Feb 18 16:37:01 2016 +0530
@@ -89,7 +89,7 @@
 set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
 
 # add ARM assembly/intrinsic files here
-set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S)
+set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S)
 set(VEC_PRIMITIVES)
 
 set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Thu Feb 18 10:23:24 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Thu Feb 18 16:37:01 2016 +0530
@@ -32,6 +32,7 @@
 extern "C" {
 #include "blockcopy8.h"
 #include "pixel.h"
+#include "pixel-util.h"
 }
 
 namespace X265_NS {
@@ -41,6 +42,12 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// pixel_var
+p.cu[BLOCK_8x8].var   = PFX(pixel_var_8x8_neon);
+p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
+p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
+p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
+
 // blockcopy
 p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
 p.pu[LUMA_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/arm/pixel-util.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/pixel-util.SThu Feb 18 16:37:01 2016 +0530
@@ -0,0 +1,243 @@
+/*****
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ */
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+.macro VAR_SQR_SUM qsqr_sum, qsqr_last, qsqr_temp, dsrc, num=0, 
vpadal=vpadal.u16
+vmull.u8\qsqr_temp, \dsrc, \dsrc
+vaddw.u8q\num, q\num, \dsrc
+\vpadal \qsqr_sum, \qsqr_last
+.endm
+
+function x265_pixel_var_8x8_neon
+vld1.u8 {d16}, [r0], r1
+vmull.u8q1, d16, d16
+vmovl.u8q0, d16
+vld1.u8 {d18}, [r0], r1
+vmull.u8q2, d18, d18
+vaddw.u8q0, q0, d18
+
+vld1.u8 {d20}, [r0], r1
+VAR_SQR_SUM q1, q1, q3, d20, 0, vpaddl.u16
+vld1.u8 {d22}, [r0], r1
+VAR_SQR_SUM q2, q2, q8, d22, 0, vpaddl.u16
+
+vld1.u8 {d24}, [r0], r1
+VAR_SQR_SUM q1, q3, q9, d24
+vld1.u8 {d26}, [r0], r1
+VAR_SQR_SUM q2, q8, q10, d26
+vld1.u8 {d24}, [r0], r1
+VAR_SQR_SUM q1, q9, q14, d24
+vld1.u8 {d26}, [r0], r1
+VAR_SQR_SUM q2, q10, q15, d26
+
+vpaddl.u16  q8, q14
+vpaddl.u16  q9, q15
+vadd.u32q1, q1, q8
+vadd.u16d0, d0, d1
+vadd.u32q1, q1, q9
+vadd.u32q1, q1, q2
+vpaddl.u16  d0, d0
+vadd.u32d2, d2, d3
+vpadd.u32   d0, d0, d2
+
+vmovr0, r1, d0
+bx  lr
+endfunc
+
+function x265_pixel_var_16x16_neon
+veor.u8 q0, q0
+veor.u8 q1, q1
+veor.u8 q2, q2
+veor.u8 q14, q14
+veor.u8 q15, q15
+mov ip, #4
+
+.var16_loop:
+subsip, ip, #1
+vld1.u8 {q8}, [r0], r1
+VAR_SQR_SUM q1, q14, q12, d16
+VAR_SQR_SUM q2, q15, q13, d17
+
+vld1.u8 {q9}, [r0], r1
+VAR_SQR_SUM q1, q12, q14, d18
+VAR_SQR_SUM q2, q13, q15, d19
+
+vld1.u8 {q8}, [r0], r1
+V

[x265] [PATCH] arm: Implement sad_x3 and sad_x4 ARM NEON asm

2016-02-15 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1455598958 -19800
#  Tue Feb 16 10:32:38 2016 +0530
# Node ID ac6c535109a43e9cdb69f30db1143c06400a19f4
# Parent  e3902c96c3c268ec4ab1a4976ee2feae7348b36f
arm: Implement sad_x3 and sad_x4 ARM NEON asm

diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Thu Feb 11 15:00:20 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Tue Feb 16 10:32:38 2016 +0530
@@ -41,6 +41,7 @@
 {
 if (cpuMask & X265_CPU_NEON)
 {
+// blockcopy
 p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
 p.pu[LUMA_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon);
 p.pu[LUMA_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
@@ -66,11 +67,65 @@
 p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_neon);
 p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
 p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+
+// sad_x3
+p.pu[LUMA_4x4].sad_x3   = PFX(sad_x3_4x4_neon);
+p.pu[LUMA_4x8].sad_x3   = PFX(sad_x3_4x8_neon);
+p.pu[LUMA_4x16].sad_x3  = PFX(sad_x3_4x16_neon);
+p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
+p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+p.pu[LUMA_12x16].sad_x3 = PFX(sad_x3_12x16_neon);
+p.pu[LUMA_16x4].sad_x3  = PFX(sad_x3_16x4_neon);
+p.pu[LUMA_16x8].sad_x3  = PFX(sad_x3_16x8_neon);
+p.pu[LUMA_16x12].sad_x3 = PFX(sad_x3_16x12_neon);
+p.pu[LUMA_16x16].sad_x3 = PFX(sad_x3_16x16_neon);
+p.pu[LUMA_16x32].sad_x3 = PFX(sad_x3_16x32_neon);
+p.pu[LUMA_16x64].sad_x3 = PFX(sad_x3_16x64_neon);
+p.pu[LUMA_24x32].sad_x3 = PFX(sad_x3_24x32_neon);
+p.pu[LUMA_32x8].sad_x3  = PFX(sad_x3_32x8_neon);
+p.pu[LUMA_32x16].sad_x3 = PFX(sad_x3_32x16_neon);
+p.pu[LUMA_32x24].sad_x3 = PFX(sad_x3_32x24_neon);
+p.pu[LUMA_32x32].sad_x3 = PFX(sad_x3_32x32_neon);
+p.pu[LUMA_32x64].sad_x3 = PFX(sad_x3_32x64_neon);
+p.pu[LUMA_48x64].sad_x3 = PFX(sad_x3_48x64_neon);
+p.pu[LUMA_64x16].sad_x3 = PFX(sad_x3_64x16_neon);
+p.pu[LUMA_64x32].sad_x3 = PFX(sad_x3_64x32_neon);
+p.pu[LUMA_64x48].sad_x3 = PFX(sad_x3_64x48_neon);
+p.pu[LUMA_64x64].sad_x3 = PFX(sad_x3_64x64_neon);
+
+// sad_x4
+p.pu[LUMA_4x4].sad_x4   = PFX(sad_x4_4x4_neon);
+p.pu[LUMA_4x8].sad_x4   = PFX(sad_x4_4x8_neon);
+p.pu[LUMA_4x16].sad_x4  = PFX(sad_x4_4x16_neon);
+p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+p.pu[LUMA_12x16].sad_x4 = PFX(sad_x4_12x16_neon);
+p.pu[LUMA_16x4].sad_x4  = PFX(sad_x4_16x4_neon);
+p.pu[LUMA_16x8].sad_x4  = PFX(sad_x4_16x8_neon);
+p.pu[LUMA_16x12].sad_x4 = PFX(sad_x4_16x12_neon);
+p.pu[LUMA_16x16].sad_x4 = PFX(sad_x4_16x16_neon);
+p.pu[LUMA_16x32].sad_x4 = PFX(sad_x4_16x32_neon);
+p.pu[LUMA_16x64].sad_x4 = PFX(sad_x4_16x64_neon);
+p.pu[LUMA_24x32].sad_x4 = PFX(sad_x4_24x32_neon);
+p.pu[LUMA_32x8].sad_x4  = PFX(sad_x4_32x8_neon);
+p.pu[LUMA_32x16].sad_x4 = PFX(sad_x4_32x16_neon);
+p.pu[LUMA_32x24].sad_x4 = PFX(sad_x4_32x24_neon);
+p.pu[LUMA_32x32].sad_x4 = PFX(sad_x4_32x32_neon);
+p.pu[LUMA_32x64].sad_x4 = PFX(sad_x4_32x64_neon);
+p.pu[LUMA_48x64].sad_x4 = PFX(sad_x4_48x64_neon);
+p.pu[LUMA_64x16].sad_x4 = PFX(sad_x4_64x16_neon);
+p.pu[LUMA_64x32].sad_x4 = PFX(sad_x4_64x32_neon);
+p.pu[LUMA_64x48].sad_x4 = PFX(sad_x4_64x48_neon);
+p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
 }
 if (cpuMask & X265_CPU_ARMV6)
 {
-p.pu[LUMA_4x4].sad=PFX(pixel_sad_4x4_armv6);
- p.pu[LUMA_4x8].sad=PFX(pixel_sad_4x8_armv6);
+p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_armv6);
+p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_armv6);
 }
 }
 } // namespace X265_NS
diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/asm.S
--- a/source/common/arm/asm.S   Thu Feb 11 15:00:20 2016 +0530
+++ b/source/common/arm/asm.S   Tue Feb 16 10:32:38 2016 +0530
@@ -108,7 +108,7 @@
 #define JOIN(a, b) GLUE(a, b)
 #define X(s) JOIN(EXTERN_ASM, s)
 
-#define FENC_STRIDE 16
+#define FENC_STRIDE 64
 #define FDEC_STRIDE 32
 
 .macro HORIZ_ADD dest, a, b
diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/mc-a.S
--- a/source/common/arm/mc-a.S  Thu Feb 11 15:00:20 2016 +0530
+++ b/source/common/arm/mc-a.S  Tue Feb 16 10:32:38 2016 +0530
@@ -34,7 +34,7 @@
  * r0   - dst
  * r1   - dstStride
  * r2   - src
- * d3   - srcStride */
+ * r3   - srcStride */

Re: [x265] [PATCH] arm: Implement blockcopy_pp_NxN_neon

2016-02-11 Thread Dnyaneshwar Gorade
On Thu, Feb 11, 2016 at 5:30 PM, chen  wrote:

>
> At 2016-02-11 17:54:45,radhakrish...@multicorewareinc.com wrote:
> ># HG changeset patch
> ># User radhakrish...@multicorewareinc.com
> ># Date 1455183020 -19800
> >#  Thu Feb 11 15:00:20 2016 +0530
> ># Node ID 4f5720ccaf1aa04868054636f14dce8ea65390ad
> ># Parent  a2ff6747eaf7b25102f27f808cf5526f441df488
> >arm: Implement blockcopy_pp_NxN_neon
> >
> >+function x265_blockcopy_pp_48x64_neon
> >+push{r4, r5}
> >+mov r4, #8
> >+mov r5, #32
> >+sub r3, r5
> >+sub r1, r5
> >+loop_48x64:
> >+.rept 8
> >+vld1.8  {q0, q1}, [r2]!
> >+vld1.8  {q2}, [r2], r3
> the ARM support format"vld1.8 {q0, q1, q1}, Rn, Rm"
>
>

> ​Load support maximum upto 4 double word vld1.8 {d0,d1,d2,d3}, [Rn], Rm
> OR 2 quadwords vld1.8 {q0,q1}, [Rn], Rm​
>
>

> >+vst1.8  {q0, q1}, [r0]!
> >+vst1.8  {q2}, [r0], r1
> >+.endr
> >+subsr4, r4, #1
> >+bne loop_48x64
> >+pop {r4, r5}
> >+bx  lr
> >+endfunc
>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] threadpool: utilize all processors on embedded ARM platforms

2016-02-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1455010589 -19800
#  Tue Feb 09 15:06:29 2016 +0530
# Node ID 18b83aaee1b56e2048a425c25a452aa62c39da89
# Parent  023e6051c4c63ab1633b2de0e8f37e6158796288
threadpool: utilize all processors on embedded ARM platforms

diff -r 023e6051c4c6 -r 18b83aaee1b5 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri Feb 05 15:13:57 2016 +0530
+++ b/source/common/threadpool.cpp  Tue Feb 09 15:06:29 2016 +0530
@@ -528,6 +528,10 @@
 SYSTEM_INFO sysinfo;
 GetSystemInfo(&sysinfo);
 return sysinfo.dwNumberOfProcessors;
+#elif __unix__ && X265_ARCH_ARM
+/* Return the number of processors configured by OS. Because, most 
embedded linux distributions
+ * uses only one processor as the scheduler doesn't have enough work to 
utilize all processors */
+return sysconf(_SC_NPROCESSORS_CONF);
 #elif __unix__
 return sysconf(_SC_NPROCESSORS_ONLN);
 #elif MACOS
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix

2016-02-02 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1454410744 -19800
#  Tue Feb 02 16:29:04 2016 +0530
# Node ID 5463e2b9f37e4952bb16e94673c6fd2991243145
# Parent  dc62b47dd0d98f732165345883edac55320baec1
arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix.

diff -r dc62b47dd0d9 -r 5463e2b9f37e source/CMakeLists.txt
--- a/source/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530
+++ b/source/CMakeLists.txt Tue Feb 02 16:29:04 2016 +0530
@@ -182,9 +182,11 @@
 add_definitions(-march=i686)
 endif()
 if(ARM AND CROSS_COMPILE_ARM)
-add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp)
+set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp)
+add_definitions(${ARM_ARGS})
 elseif(ARM)
-add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+set(ARM_ARGS -march=armv6 -mfloat-abi=hard -mfpu=vfp)
+add_definitions(${ARM_ARGS})
 endif()
 if(FPROFILE_GENERATE)
 if(INTEL_CXX)
@@ -418,7 +420,7 @@
 add_subdirectory(encoder)
 add_subdirectory(common)
 
-if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY)
+if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
 # this is required because of this cmake bug
 # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
 if(WIN32)
@@ -429,23 +431,33 @@
 
 if(ARM OR CROSS_COMPILE_ARM)
 # compile ARM arch asm files here
-
+enable_language(ASM)
+foreach(ASM ${ARM_ASMS})
+set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+list(APPEND ASM_SRCS ${ASM_SRC})
+list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+add_custom_command(
+OUTPUT ${ASM}.${SUFFIX}
+COMMAND ${CMAKE_CXX_COMPILER}
+ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+DEPENDS ${ASM_SRC})
+endforeach()
 elseif(X86)
 # compile X86 arch asm files here
 foreach(ASM ${MSVC_ASMS})
-set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
-list(APPEND YASM_SRCS ${YASM_SRC})
-list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
+set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
+list(APPEND ASM_SRCS ${ASM_SRC})
+list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
 add_custom_command(
 OUTPUT ${ASM}.${SUFFIX}
-COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o 
${ASM}.${SUFFIX}
-DEPENDS ${YASM_SRC})
+COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o 
${ASM}.${SUFFIX}
+DEPENDS ${ASM_SRC})
 endforeach()
 endif()
 endif()
 
-source_group(ASM FILES ${YASM_SRCS})
-add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> 
$<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+source_group(ASM FILES ${ASM_SRCS})
+add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> 
$<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
 if(NOT MSVC)
 set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
 endif()
@@ -479,7 +491,7 @@
 
 option(ENABLE_SHARED "Build shared library" ON)
 if(ENABLE_SHARED)
-add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" 
${YASM_OBJS}
+add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS}
 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> 
$<TARGET_OBJECTS:common>)
 if(EXTRA_LIB)
 target_link_libraries(x265-shared ${EXTRA_LIB})
@@ -575,7 +587,7 @@
 # Xcode seems unable to link the CLI with libs, so link as one targget
 add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
-   $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> 
${YASM_OBJS} ${YASM_SRCS})
+   $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> 
${ASM_OBJS} ${ASM_SRCS})
 else()
 add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} 
${X265_RC_FILE}
${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h 
x265-extras.cpp)
diff -r dc62b47dd0d9 -r 5463e2b9f37e source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/CMakeLists.txt  Tue Feb 02 16:29:04 2016 +0530
@@ -89,9 +89,10 @@
 set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
 
 # add ARM assembly/intrinsic files here
-set(A_SRCS)
+set(A_SRCS asm.S cpu-a.S mc-a.S)
 set(VEC_PRIMITIVES)
 
+set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
 foreach(SRC ${C_SRCS})
 set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
 endforeach()
diff -r dc62b47dd0d9 -r 5463e2b9f37e source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Tue Feb 02 16:29:04 2016 +0530
@@ -29,12 +29,18 @@
 #include "x265.h"
 #include "cpu.h"
 
+extern "C" {

[x265] [PATCH] arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix

2016-02-01 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1454327470 -19800
#  Mon Feb 01 17:21:10 2016 +0530
# Node ID 894e0fce5d14844d3c85cdb2a287f302fc8cffca
# Parent  dc62b47dd0d98f732165345883edac55320baec1
arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix.

diff -r dc62b47dd0d9 -r 894e0fce5d14 source/CMakeLists.txt
--- a/source/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530
+++ b/source/CMakeLists.txt Mon Feb 01 17:21:10 2016 +0530
@@ -182,9 +182,11 @@
 add_definitions(-march=i686)
 endif()
 if(ARM AND CROSS_COMPILE_ARM)
-add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp)
+set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp)
+add_definitions(${ARM_ARGS})
 elseif(ARM)
-add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+set(ARM_ARGS -march=armv6 -mfloat-abi=hard -mfpu=vfp)
+add_definitions(${ARM_ARGS})
 endif()
 if(FPROFILE_GENERATE)
 if(INTEL_CXX)
@@ -418,7 +420,7 @@
 add_subdirectory(encoder)
 add_subdirectory(common)
 
-if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY)
+if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
 # this is required because of this cmake bug
 # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
 if(WIN32)
@@ -429,7 +431,17 @@
 
 if(ARM OR CROSS_COMPILE_ARM)
 # compile ARM arch asm files here
-
+enable_language(ASM)
+foreach(ASM ${ARM_ASMS})
+set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+list(APPEND YASM_SRCS ${YASM_SRC})
+list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
+add_custom_command(
+OUTPUT ${ASM}.${SUFFIX}
+COMMAND ${CMAKE_CXX_COMPILER}
+ARGS ${ARM_ARGS} -c ${YASM_SRC} -o ${ASM}.${SUFFIX}
+DEPENDS ${YASM_SRC})
+endforeach()
 elseif(X86)
 # compile X86 arch asm files here
 foreach(ASM ${MSVC_ASMS})
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/CMakeLists.txt  Mon Feb 01 17:21:10 2016 +0530
@@ -89,9 +89,10 @@
 set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
 
 # add ARM assembly/intrinsic files here
-set(A_SRCS)
+set(ARM_SRCS asm.S cpu-a.S mc-a.S)
 set(VEC_PRIMITIVES)
 
+set(ARM_ASMS "${ARM_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
 foreach(SRC ${C_SRCS})
 set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
 endforeach()
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp  Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp  Mon Feb 01 17:21:10 2016 +0530
@@ -29,12 +29,18 @@
 #include "x265.h"
 #include "cpu.h"
 
+extern "C" {
+#include "blockcopy8.h"
+}
 
 namespace X265_NS {
 // private x265 namespace
 
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
 {
-
+if (cpuMask & X265_CPU_NEON)
+{
+p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+}
 }
 } // namespace X265_NS
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/asm.S
--- a/source/common/arm/asm.S   Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/arm/asm.S   Mon Feb 01 17:21:10 2016 +0530
@@ -25,8 +25,6 @@
  * For more information, contact us at license @ x265.com.
  */
 
-#include "x265_config.h"
-
 .syntax unified
 
 #if   HAVE_NEON
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.hMon Jan 25 14:59:50 2016 +0530
+++ b/source/common/arm/blockcopy8.hMon Feb 01 17:21:10 2016 +0530
@@ -23,7 +23,9 @@
  * For more information, contact us at license @ x265.com.
  */
 
-#ifndef X265_BLOCKCOPY8_H
-#define X265_BLOCKCOPY8_H
+#ifndef X265_BLOCKCOPY8_ARM_H
+#define X265_BLOCKCOPY8_ARM_H
 
-#endif // ifndef X265_I386_PIXEL_H
+void x265_blockcopy_pp_16x16_neon(pixel* dst, intptr_t dstStride, const pixel* 
src, intptr_t srcStride);
+
+#endif // ifndef X265_I386_PIXEL_ARM_H
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/dct8.h
--- a/source/common/arm/dct8.h  Mon Jan 25 14:59:50 2016 +0530
+++ b/source/common/arm/dct8.h  Mon Feb 01 17:21:10 2016 +0530
@@ -22,7 +22,7 @@
  * For more information, contact us at license @ x265.com.
  */
 
-#ifndef X265_DCT8_H
-#define X265_DCT8_H
+#ifndef X265_DCT8_ARM_H
+#define X265_DCT8_ARM_H
 
-#endif // ifndef X265_DCT8_H
+#endif // ifndef X265_DCT8_ARM_H
diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/intrapred.h
--- a/source/common/arm/in

[x265] [PATCH] testbench: port x264 stack & register check code for ARM arch

2016-01-27 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1453891819 -19800
#  Wed Jan 27 16:20:19 2016 +0530
# Node ID 14c4806a24eb277d31fa77c1c906838ffcb62395
# Parent  f548abe8eae8fb75513a85d1b09233e706c7b5ba
testbench: port x264 stack & register check code for ARM arch

diff -r f548abe8eae8 -r 14c4806a24eb source/common/arm/asm.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/arm/asm.S   Wed Jan 27 16:20:19 2016 +0530
@@ -0,0 +1,184 @@
+/*
+ * asm.S: arm utility macros
+ *
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Mans Rullgard <m...@mansr.com>
+ *  David Conrad <lesse...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ */
+
+#include "x265_config.h"
+
+.syntax unified
+
+#if   HAVE_NEON
+.arch   armv7-a
+#elif HAVE_ARMV6T2
+.arch   armv6t2
+#elif HAVE_ARMV6
+.arch   armv6
+#endif
+
+.fpu neon
+
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+
+.macro require8, val=1
+ELF .eabi_attribute 24, \val
+.endm
+
+.macro preserve8, val=1
+ELF .eabi_attribute 25, \val
+.endm
+
+.macro function name, export=1
+.macro endfunc
+ELF .size   \name, . - \name
+FUNC.endfunc
+.purgem endfunc
+.endm
+.align  2
+.if \export == 1
+.global EXTERN_ASM\name
+ELF .hidden EXTERN_ASM\name
+ELF .type   EXTERN_ASM\name, %function
+FUNC.func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF .hidden \name
+ELF .type   \name, %function
+FUNC.func   \name
+\name:
+.endif
+.endm
+
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !defined(PIC)
+movw\rd, #:lower16:\val
+movt\rd, #:upper16:\val
+#else
+ldr \rd, =\val
+#endif
+.endm
+
+.macro movconst rd, val
+#if HAVE_ARMV6T2
+movw\rd, #:lower16:\val
+.if \val >> 16
+movt\rd, #:upper16:\val
+.endif
+#else
+ldr \rd, =\val
+#endif
+.endm
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+vadd.u16\a, \a, \b
+.endif
+vpaddl.u16  \a, \a
+vpaddl.u32  \dest, \a
+.endm
+
+.macro SUMSUB_AB sum, diff, a, b
+vadd.s16\sum,  \a, \b
+vsub.s16\diff, \a, \b
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+SUMSUB_AB   \s1, \d1, \a, \b
+SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+
+.macro ABS2 a b
+vabs.s16 \a, \a
+vabs.s16 \b, \b
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+vtrn.16 \s1, \s2
+.else
+vtrn.32 \s1, \s2
+.endif
+.ifc \op, sumsub
+SUMSUB_AB   \d1, \d2, \s1, \s2
+.else
+vabs.s16\s1, \s1
+vabs.s16\s2, \s2
+vmax.s16\d1, \s1, \s2
+.endif
+.endm
+
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+vtrn.32 \r0, \r4
+vtrn.32 \r1, \r5
+vtrn.32 \r2, \r6
+vtrn.32 \r3, \r7
+vtrn.16 \r0, \r2
+vtrn.16 \r1, \r3
+vtrn.16 \r4, \r6
+vtrn.16 \r5, \r7
+vtrn.8  \r0, \r1
+vtrn.8  \r2, \r3
+vtrn.8  \r4, \r5
+vtrn.8  \r6, \r7
+.endm
+
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+vtrn.16 \r0, \r2
+vtrn.16 \r1, \r3
+vtrn.8  \r0, \r1
+vtrn.8  \r2, \r3
+.endm
+
+.macro TRANSPOSE4x4_16  d0 d1 d2 d3
+vtrn.32 \d0, \d2
+vtrn.32 \d1, \d3
+vtrn.16 \d0

[x265] [PATCH] testbench: port x264 stack & register check code for ARM arch

2016-01-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1453872887 -19800
#  Wed Jan 27 11:04:47 2016 +0530
# Node ID f98483674435cdb5cbd7acb655ee217feffdf976
# Parent  f548abe8eae8fb75513a85d1b09233e706c7b5ba
testbench: port x264 stack & register check code for ARM arch

diff -r f548abe8eae8 -r f98483674435 source/common/arm/asm.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/arm/asm.S   Wed Jan 27 11:04:47 2016 +0530
@@ -0,0 +1,184 @@
+/*
+ * asm.S: arm utility macros
+ *
+ * Copyright (C) 2008-2015 x264 project
+ *
+ * Authors: Mans Rullgard <m...@mansr.com>
+ *  David Conrad <lesse...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licens...@x264.com.
+ */
+
+#include "x265_config.h"
+
+.syntax unified
+
+#if   HAVE_NEON
+.arch   armv7-a
+#elif HAVE_ARMV6T2
+.arch   armv6t2
+#elif HAVE_ARMV6
+.arch   armv6
+#endif
+
+.fpu neon
+
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+
+.macro require8, val=1
+ELF .eabi_attribute 24, \val
+.endm
+
+.macro preserve8, val=1
+ELF .eabi_attribute 25, \val
+.endm
+
+.macro function name, export=1
+.macro endfunc
+ELF .size   \name, . - \name
+FUNC.endfunc
+.purgem endfunc
+.endm
+.align  2
+.if \export == 1
+.global EXTERN_ASM\name
+ELF .hidden EXTERN_ASM\name
+ELF .type   EXTERN_ASM\name, %function
+FUNC.func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF .hidden \name
+ELF .type   \name, %function
+FUNC.func   \name
+\name:
+.endif
+.endm
+
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !defined(PIC)
+movw\rd, #:lower16:\val
+movt\rd, #:upper16:\val
+#else
+ldr \rd, =\val
+#endif
+.endm
+
+.macro movconst rd, val
+#if HAVE_ARMV6T2
+movw\rd, #:lower16:\val
+.if \val >> 16
+movt\rd, #:upper16:\val
+.endif
+#else
+ldr \rd, =\val
+#endif
+.endm
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+vadd.u16\a, \a, \b
+.endif
+vpaddl.u16  \a, \a
+vpaddl.u32  \dest, \a
+.endm
+
+.macro SUMSUB_AB sum, diff, a, b
+vadd.s16\sum,  \a, \b
+vsub.s16\diff, \a, \b
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+SUMSUB_AB   \s1, \d1, \a, \b
+SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+
+.macro ABS2 a b
+vabs.s16 \a, \a
+vabs.s16 \b, \b
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+vtrn.16 \s1, \s2
+.else
+vtrn.32 \s1, \s2
+.endif
+.ifc \op, sumsub
+SUMSUB_AB   \d1, \d2, \s1, \s2
+.else
+vabs.s16\s1, \s1
+vabs.s16\s2, \s2
+vmax.s16\d1, \s1, \s2
+.endif
+.endm
+
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+vtrn.32 \r0, \r4
+vtrn.32 \r1, \r5
+vtrn.32 \r2, \r6
+vtrn.32 \r3, \r7
+vtrn.16 \r0, \r2
+vtrn.16 \r1, \r3
+vtrn.16 \r4, \r6
+vtrn.16 \r5, \r7
+vtrn.8  \r0, \r1
+vtrn.8  \r2, \r3
+vtrn.8  \r4, \r5
+vtrn.8  \r6, \r7
+.endm
+
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+vtrn.16 \r0, \r2
+vtrn.16 \r1, \r3
+vtrn.8  \r0, \r1
+vtrn.8  \r2, \r3
+.endm
+
+.macro TRANSPOSE4x4_16  d0 d1 d2 d3
+vtrn.32 \d0, \d2
+vtrn.32 \d1, \d3
+vtrn.16 \d0

[x265] [PATCH 2 of 2] asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm over 20% than previous AVX2 asm

2016-01-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1449813841 -19800
#  Fri Dec 11 11:34:01 2015 +0530
# Node ID ee47dd944e08ebb49fd54114979c65dadabfe0df
# Parent  593a1907e915c9bad7bd3ff608a30770289c249a
asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm over 20% than previous AVX2 
asm

diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Sat Dec 12 09:56:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Dec 11 11:34:01 2015 +0530
@@ -2932,6 +2932,7 @@
 p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
 p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
 p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
+p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2);
 p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
 p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
 p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Sat Dec 12 09:56:10 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Dec 11 11:34:01 2015 +0530
@@ -355,55 +355,55 @@
 times 8 db (32-22), 22
 times 8 db (32-11), 11
 
-const ang16_shuf_mode9,times 8 db 0, 1
-   times 8 db 1, 2
-
-const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 
10, (32-12), 12, (32-14), 14, (32-16), 16
- db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, 
(32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
-
-const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, 
(32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
- db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, 
(32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
-
-const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 
3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
-   db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 
2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
-
-const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, 
(32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
- db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, 
(32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
-
-const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 
5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
-   db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 
3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
-   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 
0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
-
-const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, 
(32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
- db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, 
(32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
-
-const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 
7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
-   db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 
4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
-   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
-
-const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, 
(32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
- db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, 
(32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
-
-const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 
9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
-   db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 
5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
-   db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 
0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
-
-const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, 
(32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
- db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, 
(32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
-
-const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 
6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
-   db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 
6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
-   db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 
2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
-
-const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, 
(32-23), 23, (32-2), 2, (32-13), 13, (32-2

[x265] [PATCH 1 of 2] asm: move common constants into const-a.asm, remove unused constants

2016-01-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1449894370 -19800
#  Sat Dec 12 09:56:10 2015 +0530
# Node ID 593a1907e915c9bad7bd3ff608a30770289c249a
# Parent  a5309338d1352978e79da6210a0d64eb88d60c8f
asm: move common constants into const-a.asm, remove unused constants

diff -r a5309338d135 -r 593a1907e915 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Sat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/blockcopy8.asm  Sat Dec 12 09:56:10 2015 +0530
@@ -28,8 +28,6 @@
 
 SECTION_RODATA 32
 
-tab_Vm:db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
 cextern pb_4
 cextern pb_1
 cextern pb_16
diff -r a5309338d135 -r 593a1907e915 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Sat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/const-a.asm Sat Dec 12 09:56:10 2015 +0530
@@ -40,8 +40,10 @@
 const pb_8, times 32 db 8
 const pb_15,times 32 db 15
 const pb_16,times 32 db 16
+const pb_31,times 32 db 31
 const pb_32,times 32 db 32
 const pb_64,times 32 db 64
+const pb_124,   times 32 db 124
 const pb_128,   times 32 db 128
 const pb_a1,times 16 db 0xa1
 
@@ -146,10 +148,6 @@
 const pd_planar16_mul2, times  1 dd  15,  14,  13,  12,  11,  10,   9,   
8,7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
-const popcnt_table
-%assign x 0
-%rep 256
-; population count
-db 
((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
+;; 64-bit constants
+
+const pq_1, times 1 dq 1
diff -r a5309338d135 -r 593a1907e915 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Sat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/loopfilter.asm  Sat Dec 12 09:56:10 2015 +0530
@@ -29,9 +29,6 @@
 %include "x86util.asm"
 
 SECTION_RODATA 32
-pb_31:  times 32 db 31
-pb_124: times 32 db 124
-pb_15:  times 32 db 15
 
 SECTION .text
 cextern pb_1
@@ -39,6 +36,9 @@
 cextern pb_3
 cextern pb_4
 cextern pb_01
+cextern pb_15
+cextern pb_31
+cextern pb_124
 cextern pb_128
 cextern pw_1
 cextern pw_n1
diff -r a5309338d135 -r 593a1907e915 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmSat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/mc-a.asmSat Dec 12 09:56:10 2015 +0530
@@ -53,7 +53,6 @@
  times 8 db 2
  times 8 db 4
  times 8 db 6
-sq_1: times 1 dq 1
 
 SECTION .text
 
@@ -74,6 +73,7 @@
 cextern pw_pixel_max
 cextern pd_32
 cextern pd_64
+cextern pq_1
 
 
;
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, 
intptr_t src1Stride, intptr_t dstStride)
@@ -3638,7 +3638,7 @@
 movam3, [r4+16]
 movdm2, [r4+32] ; denom
 movam4, [pw_pixel_max]
-paddw   m2, [sq_1]  ; denom+1
+paddw   m2, [pq_1]  ; denom+1
 %endmacro
 
 ; src1, src2
diff -r a5309338d135 -r 593a1907e915 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm   Sat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/mc-a2.asm   Sat Dec 12 09:56:10 2015 +0530
@@ -43,11 +43,7 @@
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif
-pw_1024: times 16 dw 1024
 
-pd_16: times 4 dd 16
-pd_0f: times 4 dd 0x
-pf_inv256: times 8 dd 0.00390625
 const pd_inv256,times 4 dq 0.00390625
 const pd_0_5,   times 4 dq 0.5
 
@@ -59,9 +55,11 @@
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
+cextern pw_1024
 cextern pw_3fff
 cextern pw_pixel_max
 cextern pd_ffff
+cextern pd_16
 
 ;The hpel_filter routines use non-temporal writes for output.
 ;The following defines may be uncommented for testing.
diff -r a5309338d135 -r 593a1907e915 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Sat Jan 09 13:45:00 2016 +0530
+++ b/source/common/x86/pixel-a.asm Sat Dec 12 09:56:10 2015 +0530
@@ -50,9 +50,6 @@
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
 
-sw_f0: dq 0xfff0, 0
-pd_f0: times 4 dd 0xffff0000
-
 SECTION .text
 
 cextern pb_0
@@ -67,7 +64,6 @@
 cextern pw_pmpmpmpm
 cextern pw_pmmp
 cextern pd_1
-cextern popcnt_table
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] testbench: setup testbench for ARM assembly

2016-01-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1452327300 -19800
#  Sat Jan 09 13:45:00 2016 +0530
# Node ID a5309338d1352978e79da6210a0d64eb88d60c8f
# Parent  d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2
testbench: setup testbench for ARM assembly

X86 intrinsics have been commented out of the ARM testbench.
This ARM testbench is for Linux and ARMv6 arch and above

diff -r d94f6c2b45f8 -r a5309338d135 source/CMakeLists.txt
--- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530
+++ b/source/CMakeLists.txt Sat Jan 09 13:45:00 2016 +0530
@@ -275,7 +275,9 @@
 endif(GCC)
 
 find_package(Yasm)
-if(YASM_FOUND AND X86)
+if(ARM OR CROSS_COMPILE_ARM)
+option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
+elseif(YASM_FOUND AND X86)
 if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
 message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 
or later required")
 option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
@@ -423,15 +425,22 @@
 else()
 set(SUFFIX o)
 endif()
-foreach(ASM ${MSVC_ASMS})
-set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
-list(APPEND YASM_SRCS ${YASM_SRC})
-list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
-add_custom_command(
-OUTPUT ${ASM}.${SUFFIX}
-COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o 
${ASM}.${SUFFIX}
-DEPENDS ${YASM_SRC})
-endforeach()
+
+if(ARM OR CROSS_COMPILE_ARM)
+# compile ARM arch asm files here
+
+elseif(X86)
+# compile X86 arch asm files here
+foreach(ASM ${MSVC_ASMS})
+set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
+list(APPEND YASM_SRCS ${YASM_SRC})
+list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
+add_custom_command(
+OUTPUT ${ASM}.${SUFFIX}
+COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o 
${ASM}.${SUFFIX}
+DEPENDS ${YASM_SRC})
+endforeach()
+endif()
 endif()
 
 source_group(ASM FILES ${YASM_SRCS})
diff -r d94f6c2b45f8 -r a5309338d135 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Sat Jan 09 11:32:33 2016 +0530
+++ b/source/common/CMakeLists.txt  Sat Jan 09 13:45:00 2016 +0530
@@ -16,12 +16,14 @@
 if(ENABLE_ASSEMBLY)
 set_source_files_properties(threading.cpp primitives.cpp PROPERTIES 
COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
 list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
+endif(ENABLE_ASSEMBLY)
 
+if(ENABLE_ASSEMBLY AND X86)
 set(SSE3  vec/dct-sse3.cpp)
 set(SSSE3 vec/dct-ssse3.cpp)
 set(SSE41 vec/dct-sse41.cpp)
 
-if(MSVC AND X86)
+if(MSVC)
 set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
 set(WARNDISABLE "/wd4100") # unreferenced formal parameter
 if(INTEL_CXX)
@@ -38,7 +40,7 @@
 set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES 
COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
 endif()
 endif()
-if(GCC AND X86)
+if(GCC)
 if(CLANG)
 # llvm intrinsic headers cause shadow warnings
 set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
@@ -81,7 +83,20 @@
 set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
 endforeach()
 source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY)
+endif(ENABLE_ASSEMBLY AND X86)
+
+if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
+
+# add ARM assembly/intrinsic files here
+set(A_SRCS)
+set(VEC_PRIMITIVES)
+
+foreach(SRC ${C_SRCS})
+set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+endforeach()
+source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
diff -r d94f6c2b45f8 -r a5309338d135 source/common/arm/asm-primitives.cpp
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/arm/asm-primitives.cpp  Sat Jan 09 13:45:00 2016 +0530
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <st...@borho.org>
+ *  Praveen Kumar Tiwari <prav...@multicorewareinc.com>
+ *  Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com>
+ *  Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is

[x265] [PATCH] testbench: setup testbench for ARM assembly

2016-01-08 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1452321658 -19800
#  Sat Jan 09 12:10:58 2016 +0530
# Node ID cd9318b1671bb24212321fcd005381e50642af4c
# Parent  d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2
testbench: setup testbench for ARM assembly

X86 intrinsics have been commented out of the ARM testbench.
This ARM testbench is for Linux and ARMv6 arch and above

diff -r d94f6c2b45f8 -r cd9318b1671b source/CMakeLists.txt
--- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530
+++ b/source/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530
@@ -275,7 +275,9 @@
 endif(GCC)
 
 find_package(Yasm)
-if(YASM_FOUND AND X86)
+if(ARM OR CROSS_COMPILE_ARM)
+option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
+elseif(YASM_FOUND AND X86)
 if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
 message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 
or later required")
 option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
@@ -423,15 +425,22 @@
 else()
 set(SUFFIX o)
 endif()
-foreach(ASM ${MSVC_ASMS})
-set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
-list(APPEND YASM_SRCS ${YASM_SRC})
-list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
-add_custom_command(
-OUTPUT ${ASM}.${SUFFIX}
-COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o 
${ASM}.${SUFFIX}
-DEPENDS ${YASM_SRC})
-endforeach()
+
+if(ARM OR CROSS_COMPILE_ARM)
+# compile ARM arch asm files here
+
+elseif(X86)
+# compile X86 arch asm files here
+foreach(ASM ${MSVC_ASMS})
+set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
+list(APPEND YASM_SRCS ${YASM_SRC})
+list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
+add_custom_command(
+OUTPUT ${ASM}.${SUFFIX}
+COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o 
${ASM}.${SUFFIX}
+DEPENDS ${YASM_SRC})
+endforeach()
+endif()
 endif()
 
 source_group(ASM FILES ${YASM_SRCS})
diff -r d94f6c2b45f8 -r cd9318b1671b source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Sat Jan 09 11:32:33 2016 +0530
+++ b/source/common/CMakeLists.txt  Sat Jan 09 12:10:58 2016 +0530
@@ -16,12 +16,14 @@
 if(ENABLE_ASSEMBLY)
 set_source_files_properties(threading.cpp primitives.cpp PROPERTIES 
COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
 list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
+endif(ENABLE_ASSEMBLY)
 
+if(ENABLE_ASSEMBLY AND X86)
 set(SSE3  vec/dct-sse3.cpp)
 set(SSSE3 vec/dct-ssse3.cpp)
 set(SSE41 vec/dct-sse41.cpp)
 
-if(MSVC AND X86)
+if(MSVC)
 set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
 set(WARNDISABLE "/wd4100") # unreferenced formal parameter
 if(INTEL_CXX)
@@ -38,7 +40,7 @@
 set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES 
COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
 endif()
 endif()
-if(GCC AND X86)
+if(GCC)
 if(CLANG)
 # llvm intrinsic headers cause shadow warnings
 set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
@@ -81,7 +83,20 @@
 set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
 endforeach()
 source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY)
+endif(ENABLE_ASSEMBLY AND X86)
+
+if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h)
+
+# add ARM assembly/intrinsic files here
+set(A_SRCS)
+set(VEC_PRIMITIVES)
+
+foreach(SRC ${C_SRCS})
+set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+endforeach()
+source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
diff -r d94f6c2b45f8 -r cd9318b1671b source/common/arm/asm-primitives.cpp
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/arm/asm-primitives.cpp  Sat Jan 09 12:10:58 2016 +0530
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <st...@borho.org>
+ *  Praveen Kumar Tiwari <prav...@multicorewareinc.com>
+ *  Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com>
+ *  Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is

Re: [x265] [PATCH] testbench: setup testbench for ARM assembly

2016-01-08 Thread Dnyaneshwar Gorade
Please ignore this patch. need little modifications.

On Sat, Jan 9, 2016 at 12:12 PM, <dnyanesh...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
> # Date 1452321658 -19800
> #  Sat Jan 09 12:10:58 2016 +0530
> # Node ID cd9318b1671bb24212321fcd005381e50642af4c
> # Parent  d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2
> testbench: setup testbench for ARM assembly
>
> X86 intrinsics have been commented out of the ARM testbench.
> This ARM testbench is for Linux and ARMv6 arch and above
>
> diff -r d94f6c2b45f8 -r cd9318b1671b source/CMakeLists.txt
> --- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530
> +++ b/source/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530
> @@ -275,7 +275,9 @@
>  endif(GCC)
>
>  find_package(Yasm)
> -if(YASM_FOUND AND X86)
> +if(ARM OR CROSS_COMPILE_ARM)
> +option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
> +elseif(YASM_FOUND AND X86)
>  if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
>  message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old.
> 1.2.0 or later required")
>  option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives"
> OFF)
> @@ -423,15 +425,22 @@
>  else()
>  set(SUFFIX o)
>  endif()
> -foreach(ASM ${MSVC_ASMS})
> -set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
> -list(APPEND YASM_SRCS ${YASM_SRC})
> -list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
> -add_custom_command(
> -OUTPUT ${ASM}.${SUFFIX}
> -COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o
> ${ASM}.${SUFFIX}
> -DEPENDS ${YASM_SRC})
> -endforeach()
> +
> +if(ARM OR CROSS_COMPILE_ARM)
> +# compile ARM arch asm files here
> +
> +elseif(X86)
> +# compile X86 arch asm files here
> +foreach(ASM ${MSVC_ASMS})
> +set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
> +list(APPEND YASM_SRCS ${YASM_SRC})
> +list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
> +add_custom_command(
> +OUTPUT ${ASM}.${SUFFIX}
> +COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC}
> -o ${ASM}.${SUFFIX}
> +DEPENDS ${YASM_SRC})
> +endforeach()
> +endif()
>  endif()
>
>  source_group(ASM FILES ${YASM_SRCS})
> diff -r d94f6c2b45f8 -r cd9318b1671b source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt  Sat Jan 09 11:32:33 2016 +0530
> +++ b/source/common/CMakeLists.txt  Sat Jan 09 12:10:58 2016 +0530
> @@ -16,12 +16,14 @@
>  if(ENABLE_ASSEMBLY)
>  set_source_files_properties(threading.cpp primitives.cpp PROPERTIES
> COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
>  list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
> +endif(ENABLE_ASSEMBLY)
>
> +if(ENABLE_ASSEMBLY AND X86)
>  set(SSE3  vec/dct-sse3.cpp)
>  set(SSSE3 vec/dct-ssse3.cpp)
>  set(SSE41 vec/dct-sse41.cpp)
>
> -if(MSVC AND X86)
> +if(MSVC)
>  set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
>  set(WARNDISABLE "/wd4100") # unreferenced formal parameter
>  if(INTEL_CXX)
> @@ -38,7 +40,7 @@
>  set_source_files_properties(${SSE3} ${SSSE3} ${SSE41}
> PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
>  endif()
>  endif()
> -if(GCC AND X86)
> +if(GCC)
>  if(CLANG)
>  # llvm intrinsic headers cause shadow warnings
>  set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
> @@ -81,7 +83,20 @@
>  set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
>  endforeach()
>  source_group(Assembly FILES ${ASM_PRIMITIVES})
> -endif(ENABLE_ASSEMBLY)
> +endif(ENABLE_ASSEMBLY AND X86)
> +
> +if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
> +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h
> dct8.h loopfilter.h)
> +
> +# add ARM assembly/intrinsic files here
> +set(A_SRCS)
> +set(VEC_PRIMITIVES)
> +
> +foreach(SRC ${C_SRCS})
> +set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
> +endforeach()
> +source_group(Assembly FILES ${ASM_PRIMITIVES})
> +endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
>
>  # set_target_properties can't do list expansion
>  string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
> diff -r d94f6c2b45f8 -r cd9318b1671b source/common/arm/asm-primitives.cpp
> --- /dev/null   Thu Jan 01 00:00:00 1970 +
> +++ b/source/common/arm/asm-primitives.cpp  Sat Jan 09 12:10:58 2016
&

[x265] [PATCH] enable arm-linux cross compile build

2016-01-05 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1450516372 -19800
#  Sat Dec 19 14:42:52 2015 +0530
# Node ID d4de155912366fb831021c9f6a0fde6757a168d7
# Parent  25f78ff3d8efaa1e9d85bc3e718c887ec9afa557
enable arm-linux cross compile build

diff -r 25f78ff3d8ef -r d4de15591236 build/arm-linux/make-Makefiles.bash
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/build/arm-linux/make-Makefiles.bash   Sat Dec 19 14:42:52 2015 +0530
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake -G "Unix Makefiles" ../../source 
&& ccmake ../../source
diff -r 25f78ff3d8ef -r d4de15591236 build/arm-linux/toolchain.cmake
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/build/arm-linux/toolchain.cmake   Sat Dec 19 14:42:52 2015 +0530
@@ -0,0 +1,12 @@
+# CMake toolchain file for cross compiling x265 for ARM arch
+
+set(CROSS_COMPILE_ARM 1)
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR armv6l)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc)
+set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH  /usr/arm-linux-gnueabi)
diff -r 25f78ff3d8ef -r d4de15591236 source/CMakeLists.txt
--- a/source/CMakeLists.txt Tue Dec 22 18:13:28 2015 +0530
+++ b/source/CMakeLists.txt Sat Dec 19 14:42:52 2015 +0530
@@ -59,6 +59,11 @@
 set(POWER 1)
 add_definitions(-DX265_ARCH_POWER=1)
 elseif(${SYSPROC} STREQUAL "armv6l")
+if(CROSS_COMPILE_ARM)
+message(STATUS "Cross compiling for ARM arch")
+else()
+set(CROSS_COMPILE_ARM 0)
+endif()
 message(STATUS "Detected ARM target processor")
 set(ARM 1)
 add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
@@ -175,7 +180,9 @@
 elseif(X86 AND NOT X64)
 add_definitions(-march=i686)
 endif()
-if(ARM)
+if(ARM AND CROSS_COMPILE_ARM)
+add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp)
+elseif(ARM)
 add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
 endif()
 if(FPROFILE_GENERATE)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 3] asm: psyCost_pp avx2 asm code for main12

2015-12-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1448963172 -19800
#  Tue Dec 01 15:16:12 2015 +0530
# Node ID 9357c1f448a7b987cebfd3cc5542cc6c65e63fe2
# Parent  e2b07541670331ab0cd94b5f312f8f7cac893f92
asm: psyCost_pp avx2 asm code for main12

psy_cost_pp[8x8]6.55x1254.76 8224.62
psy_cost_pp[16x16]  6.51x5087.56 33111.62
psy_cost_pp[32x32]  6.50x20230.92131523.63
psy_cost_pp[64x64]  6.57x80351.48528226.25

diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Dec 09 13:13:57 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:16:12 2015 +0530
@@ -1479,12 +1479,11 @@
 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
 p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_avx2);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_avx2);
-#if X265_DEPTH <= 10
+
 p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
 p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
 p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
 p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-#endif
 
 p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 09 13:13:57 2015 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:16:12 2015 +0530
@@ -10090,16 +10090,272 @@
 pabsd  xm1, xm1
 %endmacro
 
+%macro PSY_COST_PP_8x8_MAIN12 0
+; load source pixels
+lea r4, [r1 * 3]
+pmovzxwdm0, [r0]
+pmovzxwdm1, [r0 + r1]
+pmovzxwdm2, [r0 + r1 * 2]
+pmovzxwdm3, [r0 + r4]
+lea r5, [r0 + r1 * 4]
+pmovzxwdm4, [r5]
+pmovzxwdm5, [r5 + r1]
+pmovzxwdm6, [r5 + r1 * 2]
+pmovzxwdm7, [r5 + r4]
+
+; source SAD
+paddd   m8, m0, m1
+paddd   m8, m2
+paddd   m8, m3
+paddd   m8, m4
+paddd   m8, m5
+paddd   m8, m6
+paddd   m8, m7
+
+vextracti128xm9, m8, 1
+paddd   m8, m9  ; sad_8x8
+movhlps xm9, xm8
+paddd   xm8, xm9
+pshuflw xm9, xm8, 0Eh
+paddd   xm8, xm9
+psrld   m8, 2
+
+; source SA8D
+psubd   m9, m1, m0
+paddd   m0, m1
+psubd   m1, m3, m2
+paddd   m2, m3
+punpckhdq   m3, m0, m9
+punpckldq   m0, m9
+psubd   m9, m3, m0
+paddd   m0, m3
+punpckhdq   m3, m2, m1
+punpckldq   m2, m1
+psubd   m10, m3, m2
+paddd   m2, m3
+psubd   m3, m5, m4
+paddd   m4, m5
+psubd   m5, m7, m6
+paddd   m6, m7
+punpckhdq   m1, m4, m3
+punpckldq   m4, m3
+psubd   m7, m1, m4
+paddd   m4, m1
+punpckhdq   m3, m6, m5
+punpckldq   m6, m5
+psubd   m1, m3, m6
+paddd   m6, m3
+psubd   m3, m2, m0
+paddd   m0, m2
+psubd   m2, m10, m9
+paddd   m9, m10
+punpckhqdq  m5, m0, m3
+punpcklqdq  m0, m3
+psubd   m10, m5, m0
+paddd   m0, m5
+punpckhqdq  m3, m9, m2
+punpcklqdq  m9, m2
+psubd   m5, m3, m9
+paddd   m9, m3
+psubd   m3, m6, m4
+paddd   m4, m6
+psubd   m6, m1, m7
+paddd   m7, m1
+punpckhqdq  m2, m4, m3
+punpcklqdq  m4, m3
+psubd   m1, m2, m4
+paddd   m4, m2
+punpckhqdq  m3, m7, m6
+punpcklqdq  m7, m6
+psubd   m2, m3, m7
+paddd   m7, m3
+psubd   m3, m4, m0
+paddd   m0, m4
+psubd   m4, m1, m10
+paddd   m10, m1
+vinserti128 m6, m0, xm3, 1
+vperm2i128  m0, m0, m3, 00110001b
+pabsd   m0, m0
+pabsd   m6, m6
+pmaxsd  m0, m6
+vinserti128 m3, m10, xm4, 1
+vperm2i128  m10, m10, m4, 00110001b
+pabsd   m10, m10
+pabsd   m3, m3
+pmaxsd  m10, m3
+psubd   m3, m7, m9
+paddd   m9, m7
+psubd   m7, m2, m5
+paddd   m5, m2
+vinserti128 m4, m9, xm3, 1
+vperm2i128  m9, m9, m3, 00110001b
+pabsd   m9, m9
+pabsd   m4, m4
+pmaxsd  m9, m4
+vinserti128 m3, m5, xm7, 1
+vperm2i128  m5, m5, m7, 00110001b
+pabsd   m5, m5
+pabsd   m3, m3
+pmaxsd  m5, m3
+paddd   m0, m9
+

[x265] [PATCH 1 of 3] asm: SA8D avx2 asm code for main12

2015-12-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar Gorade <gorad...@gmail.com>
# Date 1449647037 -19800
#  Wed Dec 09 13:13:57 2015 +0530
# Node ID e2b07541670331ab0cd94b5f312f8f7cac893f92
# Parent  b80087c9bf25697c3d354d732323fc895a2ca11f
asm: SA8D avx2 asm code for main12

sa8d[  8x8]  4.70x564.58  2652.82
sa8d[ 8x16]  4.00x1358.06 5429.52
sa8d[16x16]  5.57x2013.70 11212.47
sa8d[16x32]  3.90x5610.47 21883.35
sa8d[32x32]  5.36x8274.18 44361.61
sa8d[32x64]  3.86x23024.0488901.80
sa8d[64x64]  4.35x45509.79198165.11

diff -r b80087c9bf25 -r e2b075416703 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 08 15:52:21 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Dec 09 13:13:57 2015 +0530
@@ -1313,6 +1313,9 @@
 }
 if (cpuMask & X265_CPU_AVX2)
 {
+#if X265_DEPTH == 12
+ASSIGN_SA8D(avx2);
+#endif
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
 // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
diff -r b80087c9bf25 -r e2b075416703 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 08 15:52:21 2015 +0530
+++ b/source/common/x86/pixel-a.asm Wed Dec 09 13:13:57 2015 +0530
@@ -6499,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
+cglobal sa8d_8x8_12bit
+pmovzxwdm0, [r0]
+pmovzxwdm9, [r2]
+psubd   m0, m9
+
+pmovzxwdm1, [r0 + r1]
+pmovzxwdm9, [r2 + r3]
+psubd   m1, m9
+
+pmovzxwdm2, [r0 + r1 * 2]
+pmovzxwdm9, [r2 + r3 * 2]
+psubd   m2, m9
+
+pmovzxwdm8, [r0 + r4]
+pmovzxwdm9, [r2 + r5]
+psubd   m8, m9
+
+lea r0, [r0 + r1 * 4]
+lea r2, [r2 + r3 * 4]
+
+pmovzxwdm4, [r0]
+pmovzxwdm9, [r2]
+psubd   m4, m9
+
+pmovzxwdm5, [r0 + r1]
+pmovzxwdm9, [r2 + r3]
+psubd   m5, m9
+
+pmovzxwdm3, [r0 + r1 * 2]
+pmovzxwdm9, [r2 + r3 * 2]
+psubd   m3, m9
+
+pmovzxwdm7, [r0 + r4]
+pmovzxwdm9, [r2 + r5]
+psubd   m7, m9
+
+movam6, m0
+paddd   m0, m1
+psubd   m1, m6
+movam6, m2
+paddd   m2, m8
+psubd   m8, m6
+movam6, m0
+
+punpckldq   m0, m1
+punpckhdq   m6, m1
+
+movam1, m0
+paddd   m0, m6
+psubd   m6, m1
+movam1, m2
+
+punpckldq   m2, m8
+punpckhdq   m1, m8
+
+movam8, m2
+paddd   m2, m1
+psubd   m1, m8
+movam8, m4
+paddd   m4, m5
+psubd   m5, m8
+movam8, m3
+paddd   m3, m7
+psubd   m7, m8
+movam8, m4
+
+punpckldq   m4, m5
+punpckhdq   m8, m5
+
+movam5, m4
+paddd   m4, m8
+psubd   m8, m5
+movam5, m3
+punpckldq   m3, m7
+punpckhdq   m5, m7
+
+movam7, m3
+paddd   m3, m5
+psubd   m5, m7
+movam7, m0
+paddd   m0, m2
+psubd   m2, m7
+movam7, m6
+paddd   m6, m1
+psubd   m1, m7
+movam7, m0
+
+punpcklqdq  m0, m2
+punpckhqdq  m7, m2
+
+movam2, m0
+paddd   m0, m7
+psubd   m7, m2
+movam2, m6
+
+punpcklqdq  m6, m1
+punpckhqdq  m2, m1
+
+movam1, m6
+paddd   m6, m2
+psubd   m2, m1
+movam1, m4
+paddd   m4, m3
+psubd   m3, m1
+movam1, m8
+paddd   m8, m5
+psubd   m5, m1
+movam1, m4
+
+punpcklqdq  m4, m3
+punpckhqdq  m1, m3
+
+movam3, m4
+paddd   m4, m1
+psubd   m1, m3
+movam3, m8
+
+punpcklqdq  m8, m5
+punpckhqdq  m3, m5
+
+movam5, m8
+paddd   m8, m3
+psubd   m3, m5
+movam5, m0
+paddd   m0, m4
+psubd   m4, m5
+movam5, m7
+paddd   m7, m1
+psubd   m1, m5
+movam5, m0
+
+vinserti128 m0, m0, xm4, 1
+vperm2i128  m5, m5, m4, 00110001b
+
+pxorm4, m4
+psubd   m4, m0
+pmaxsd  m0, m4
+pxorm4, m4
+psubd   m4, m5
+pmaxsd  m5, m4
+pmaxsd  m0, m5
+movam4, m7
+
+vinserti128 m7, m7, xm1, 1
+vperm2i128  m4, m4, m1, 00110001b
+
+

[x265] [PATCH 3 of 3] asm: fix dct[8x8] AVX2 asm for main12

2015-12-09 Thread dnyaneshwar
# HG changeset patch
# User Aasaipriya Chandran 
# Date 1449648215 -19800
#  Wed Dec 09 13:33:35 2015 +0530
# Node ID 9e3f71d784e59527a14702e83de474bc3f12fd15
# Parent  9357c1f448a7b987cebfd3cc5542cc6c65e63fe2
asm: fix dct[8x8] AVX2 asm for main12

diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:16:12 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Dec 09 13:33:35 2015 +0530
@@ -1573,9 +1573,8 @@
 p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
 
 ALL_LUMA_TU_S(idct, idct, avx2);
-#if X265_DEPTH <= 10
 ALL_LUMA_TU_S(dct, dct, avx2);
-#endif
+
 ALL_LUMA_CU_S(transpose, transpose, avx2);
 
 ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmTue Dec 01 15:16:12 2015 +0530
+++ b/source/common/x86/dct8.asmWed Dec 09 13:33:35 2015 +0530
@@ -2174,7 +2174,7 @@
 pmaddwd m0, m%4
 phaddd  m2, m0
 paddd   m2, m5
-psrad   m2, DCT_SHIFT
+psrad   m2, DCT8_SHIFT1
 packssdwm2, m2
 vpermq  m2, m2, 0x08
 mova[r5 + %2],  xm2
@@ -2190,7 +2190,7 @@
 phaddd  m8, m9
 phaddd  m6, m8
 paddd   m6, m5
-psrad   m6, DCT_SHIFT2
+psrad   m6, DCT8_SHIFT2
 
 vbroadcasti128  m4, [r6 + %2]
 pmaddwd m10,m0, m4
@@ -2201,7 +2201,7 @@
 phaddd  m8, m9
 phaddd  m10,m8
 paddd   m10,m5
-psrad   m10,DCT_SHIFT2
+psrad   m10,DCT8_SHIFT2
 
 packssdwm6, m10
 vpermq  m10,m6, 0xD8
@@ -2210,18 +2210,7 @@
 
 INIT_YMM avx2
 cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
-%define DCT_SHIFT  6
-vbroadcasti128  m5,[pd_16]
-%elif BIT_DEPTH == 10
-%define DCT_SHIFT  4
-vbroadcasti128  m5,[pd_8]
-%elif BIT_DEPTH == 8
-%define DCT_SHIFT  2
-vbroadcasti128  m5,[pd_2]
-%else
-%error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128  m5,[pd_ %+ DCT8_ROUND1]
 %define DCT_SHIFT2 9
 
 add r2d,   r2d
@@ -2265,7 +2254,7 @@
 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
 
 ;pass2
-vbroadcasti128  m5,[pd_256]
+vbroadcasti128  m5,[pd_ %+ DCT8_ROUND2]
 
 movam0,[r5]
 movam1,[r5 + 32]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: move common constants into const-a.asm, remove unused constants

2015-12-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1449723720 -19800
#  Thu Dec 10 10:32:00 2015 +0530
# Node ID ff08c87f20a7f3f36bfb0849bd2d10fc1f8da465
# Parent  33d04da2f68830ac51151cfbda8f38fb9a7e8bb9
asm: move common constants into const-a.asm, remove unused constants

diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Wed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/blockcopy8.asm  Thu Dec 10 10:32:00 2015 +0530
@@ -28,8 +28,6 @@
 
 SECTION_RODATA 32
 
-tab_Vm:db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
 cextern pb_4
 cextern pb_1
 cextern pb_16
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/const-a.asm Thu Dec 10 10:32:00 2015 +0530
@@ -40,8 +40,10 @@
 const pb_8, times 32 db 8
 const pb_15,times 32 db 15
 const pb_16,times 32 db 16
+const pb_31,times 32 db 31
 const pb_32,times 32 db 32
 const pb_64,times 32 db 64
+const pb_124,   times 32 db 124
 const pb_128,   times 32 db 128
 const pb_a1,times 16 db 0xa1
 
@@ -146,10 +148,6 @@
 const pd_planar16_mul2, times  1 dd  15,  14,  13,  12,  11,  10,   9,   
8,7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
-const popcnt_table
-%assign x 0
-%rep 256
-; population count
-db 
((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
+;; 64-bit constants
+
+const pq_1, times 1 dq 1
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Wed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Dec 10 10:32:00 2015 +0530
@@ -29,15 +29,15 @@
 %include "x86util.asm"
 
 SECTION_RODATA 32
-pb_31:  times 32 db 31
-pb_124: times 32 db 124
-pb_15:  times 32 db 15
 pb_movemask_32:  times 32 db 0x00
  times 32 db 0xFF
 
 SECTION .text
 cextern pb_1
 cextern pb_01
+cextern pb_15
+cextern pb_31
+cextern pb_124
 cextern pb_128
 cextern pb_2
 cextern pw_2
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmWed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/mc-a.asmThu Dec 10 10:32:00 2015 +0530
@@ -53,7 +53,6 @@
  times 8 db 2
  times 8 db 4
  times 8 db 6
-sq_1: times 1 dq 1
 
 SECTION .text
 
@@ -74,6 +73,7 @@
 cextern pw_pixel_max
 cextern pd_32
 cextern pd_64
+cextern pq_1
 
 
;
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, 
intptr_t src1Stride, intptr_t dstStride)
@@ -3638,7 +3638,7 @@
 movam3, [r4+16]
 movdm2, [r4+32] ; denom
 movam4, [pw_pixel_max]
-paddw   m2, [sq_1]  ; denom+1
+paddw   m2, [pq_1]  ; denom+1
 %endmacro
 
 ; src1, src2
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm   Wed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/mc-a2.asm   Thu Dec 10 10:32:00 2015 +0530
@@ -43,11 +43,7 @@
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif
-pw_1024: times 16 dw 1024
 
-pd_16: times 4 dd 16
-pd_0f: times 4 dd 0xffff
-pf_inv256: times 8 dd 0.00390625
 const pd_inv256,times 4 dq 0.00390625
 const pd_0_5,   times 4 dq 0.5
 
@@ -59,9 +55,11 @@
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
+cextern pw_1024
 cextern pw_3fff
 cextern pw_pixel_max
 cextern pd_ffff
+cextern pd_16
 
 ;The hpel_filter routines use non-temporal writes for output.
 ;The following defines may be uncommented for testing.
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 09 22:24:25 2015 +0530
+++ b/source/common/x86/pixel-a.asm Thu Dec 10 10:32:00 2015 +0530
@@ -50,9 +50,6 @@
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
 
-sw_f0: dq 0xfff0, 0
-pd_f0: times 4 dd 0xffff0000
-
 SECTION .text
 
 cextern pb_0
@@ -67,7 +64,6 @@
 cextern pw_pmpmpmpm
 cextern pw_pmmp
 cextern pd_1
-cextern popcnt_table
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12

2015-12-08 Thread Dnyaneshwar Gorade
Thanks, Min. I am re-sending these two patches with the above modifications.

On Wed, Dec 2, 2015 at 8:57 PM, chen <chenm...@163.com> wrote:

> I suggest just keep one name of sa8d_avx2
>
> At 2015-12-02 12:31:59,"Dnyaneshwar Gorade" <
> dnyanesh...@multicorewareinc.com> wrote:
>
> the real function name is sa8d_8x8_avx2 whereas the common function name
> is sa8d_8x8_avx2_avx2, that's why we got proper call. both are different.
>
> On Tue, Dec 1, 2015 at 9:08 PM, chen <chenm...@163.com> wrote:
>
>>
>>
>> At 2015-12-01 18:58:43,dnyanesh...@multicorewareinc.com wrote:
>> ># HG changeset patch
>> ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
>> ># Date 1448962785 -19800
>> >#  Tue Dec 01 15:09:45 2015 +0530
>> ># Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e
>> ># Parent  e2e507ffe752d6c193a219b242c433bdc55f39f7
>> >asm: SA8D avx2 asm code for main12
>> >
>> >sa8d[  8x8]  4.70x  564.58    2652.82
>> >sa8d[ 8x16]  4.00x  1358.06   5429.52
>> >sa8d[16x16]  5.57x  2013.70   11212.47
>> >sa8d[16x32]  3.90x  5610.47   21883.35
>> >sa8d[32x32]  5.36x  8274.18   44361.61
>> >sa8d[32x64]  3.86x  23024.04  88901.80
>> >sa8d[64x64]  4.35x  45509.79  198165.11
>> >
>> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp
>> >--- a/source/common/x86/asm-primitives.cpp  Mon Nov 30 11:23:38 2015 +0530
>> >+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:09:45 2015 +0530
>> >@@ -1313,6 +1313,9 @@
>> > }
>> > if (cpuMask & X265_CPU_AVX2)
>> > {
>> >+#if X265_DEPTH == 12
>> >+ASSIGN_SA8D(avx2);
>> >+#endif
>> > p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
>> >
>> > // TODO: the planecopy_sp is really planecopy_SC now, must be fix 
>> > it
>> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm
>> >--- a/source/common/x86/pixel-a.asm Mon Nov 30 11:23:38 2015 +0530
>> >+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530
>> >@@ -6499,6 +6499,1357 @@
>> > %endif ; !ARCH_X86_64
>> > %endmacro ; SA8D
>> >
>> >+
>> >+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
>> >+INIT_YMM avx2
>> >+cglobal sa8d_8x8_avx2
>> the real function name is sa8d_8x8_avx2_avx2; we are lucky, the call below
>> uses the correct name
>>
>>
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12

2015-12-01 Thread Dnyaneshwar Gorade
the real function name is sa8d_8x8_avx2 whereas the common function name is
sa8d_8x8_avx2_avx2, that's why we got proper call. both are different.

On Tue, Dec 1, 2015 at 9:08 PM, chen <chenm...@163.com> wrote:

>
>
> At 2015-12-01 18:58:43,dnyanesh...@multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
> ># Date 1448962785 -19800
> >#  Tue Dec 01 15:09:45 2015 +0530
> ># Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e
> ># Parent  e2e507ffe752d6c193a219b242c433bdc55f39f7
> >asm: SA8D avx2 asm code for main12
> >
> >sa8d[  8x8]  4.70x  564.58    2652.82
> >sa8d[ 8x16]  4.00x  1358.06   5429.52
> >sa8d[16x16]  5.57x  2013.70   11212.47
> >sa8d[16x32]  3.90x  5610.47   21883.35
> >sa8d[32x32]  5.36x  8274.18   44361.61
> >sa8d[32x64]  3.86x  23024.04  88901.80
> >sa8d[64x64]  4.35x  45509.79  198165.11
> >
> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp   Mon Nov 30 11:23:38 2015 +0530
> >+++ b/source/common/x86/asm-primitives.cpp   Tue Dec 01 15:09:45 2015 +0530
> >@@ -1313,6 +1313,9 @@
> > }
> > if (cpuMask & X265_CPU_AVX2)
> > {
> >+#if X265_DEPTH == 12
> >+ASSIGN_SA8D(avx2);
> >+#endif
> > p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
> >
> > // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm
> >--- a/source/common/x86/pixel-a.asm  Mon Nov 30 11:23:38 2015 +0530
> >+++ b/source/common/x86/pixel-a.asm  Tue Dec 01 15:09:45 2015 +0530
> >@@ -6499,6 +6499,1357 @@
> > %endif ; !ARCH_X86_64
> > %endmacro ; SA8D
> >
> >+
> >+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
> >+INIT_YMM avx2
> >+cglobal sa8d_8x8_avx2
> the really function name is sa8d_8x8_avx2_avx2, we are lucky, below call use 
> correct name
>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 2] asm: psyCost_pp avx2 asm code for main12

2015-12-01 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1448963172 -19800
#  Tue Dec 01 15:16:12 2015 +0530
# Node ID dbc004801f4734ba048a451d779c1c9c82f1b6ac
# Parent  f8b0ce4e9f4092a38d8095961825e734a34f112e
asm: psyCost_pp avx2 asm code for main12

psy_cost_pp[8x8]    6.55x  1254.76   8224.62
psy_cost_pp[16x16]  6.51x  5087.56   33111.62
psy_cost_pp[32x32]  6.50x  20230.92  131523.63
psy_cost_pp[64x64]  6.57x  80351.48  528226.25

diff -r f8b0ce4e9f40 -r dbc004801f47 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:09:45 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:16:12 2015 +0530
@@ -1479,12 +1479,11 @@
 p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
 p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_avx2);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_avx2);
-#if X265_DEPTH <= 10
+
 p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
 p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
 p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
 p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-#endif
 
 p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r f8b0ce4e9f40 -r dbc004801f47 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:16:12 2015 +0530
@@ -10090,16 +10090,272 @@
 pabsd  xm1, xm1
 %endmacro
 
+%macro PSY_COST_PP_8x8_MAIN12 0
+; load source pixels
+lea r4, [r1 * 3]
+pmovzxwdm0, [r0]
+pmovzxwdm1, [r0 + r1]
+pmovzxwdm2, [r0 + r1 * 2]
+pmovzxwdm3, [r0 + r4]
+lea r5, [r0 + r1 * 4]
+pmovzxwdm4, [r5]
+pmovzxwdm5, [r5 + r1]
+pmovzxwdm6, [r5 + r1 * 2]
+pmovzxwdm7, [r5 + r4]
+
+; source SAD
+paddd   m8, m0, m1
+paddd   m8, m2
+paddd   m8, m3
+paddd   m8, m4
+paddd   m8, m5
+paddd   m8, m6
+paddd   m8, m7
+
+vextracti128xm9, m8, 1
+paddd   m8, m9  ; sad_8x8
+movhlps xm9, xm8
+paddd   xm8, xm9
+pshuflw xm9, xm8, 0Eh
+paddd   xm8, xm9
+psrld   m8, 2
+
+; source SA8D
+psubd   m9, m1, m0
+paddd   m0, m1
+psubd   m1, m3, m2
+paddd   m2, m3
+punpckhdq   m3, m0, m9
+punpckldq   m0, m9
+psubd   m9, m3, m0
+paddd   m0, m3
+punpckhdq   m3, m2, m1
+punpckldq   m2, m1
+psubd   m10, m3, m2
+paddd   m2, m3
+psubd   m3, m5, m4
+paddd   m4, m5
+psubd   m5, m7, m6
+paddd   m6, m7
+punpckhdq   m1, m4, m3
+punpckldq   m4, m3
+psubd   m7, m1, m4
+paddd   m4, m1
+punpckhdq   m3, m6, m5
+punpckldq   m6, m5
+psubd   m1, m3, m6
+paddd   m6, m3
+psubd   m3, m2, m0
+paddd   m0, m2
+psubd   m2, m10, m9
+paddd   m9, m10
+punpckhqdq  m5, m0, m3
+punpcklqdq  m0, m3
+psubd   m10, m5, m0
+paddd   m0, m5
+punpckhqdq  m3, m9, m2
+punpcklqdq  m9, m2
+psubd   m5, m3, m9
+paddd   m9, m3
+psubd   m3, m6, m4
+paddd   m4, m6
+psubd   m6, m1, m7
+paddd   m7, m1
+punpckhqdq  m2, m4, m3
+punpcklqdq  m4, m3
+psubd   m1, m2, m4
+paddd   m4, m2
+punpckhqdq  m3, m7, m6
+punpcklqdq  m7, m6
+psubd   m2, m3, m7
+paddd   m7, m3
+psubd   m3, m4, m0
+paddd   m0, m4
+psubd   m4, m1, m10
+paddd   m10, m1
+vinserti128 m6, m0, xm3, 1
+vperm2i128  m0, m0, m3, 00110001b
+pabsd   m0, m0
+pabsd   m6, m6
+pmaxsd  m0, m6
+vinserti128 m3, m10, xm4, 1
+vperm2i128  m10, m10, m4, 00110001b
+pabsd   m10, m10
+pabsd   m3, m3
+pmaxsd  m10, m3
+psubd   m3, m7, m9
+paddd   m9, m7
+psubd   m7, m2, m5
+paddd   m5, m2
+vinserti128 m4, m9, xm3, 1
+vperm2i128  m9, m9, m3, 00110001b
+pabsd   m9, m9
+pabsd   m4, m4
+pmaxsd  m9, m4
+vinserti128 m3, m5, xm7, 1
+vperm2i128  m5, m5, m7, 00110001b
+pabsd   m5, m5
+pabsd   m3, m3
+pmaxsd  m5, m3
+paddd   m0, m9
+

[x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12

2015-12-01 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1448962785 -19800
#  Tue Dec 01 15:09:45 2015 +0530
# Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e
# Parent  e2e507ffe752d6c193a219b242c433bdc55f39f7
asm: SA8D avx2 asm code for main12

sa8d[  8x8]  4.70x  564.58    2652.82
sa8d[ 8x16]  4.00x  1358.06   5429.52
sa8d[16x16]  5.57x  2013.70   11212.47
sa8d[16x32]  3.90x  5610.47   21883.35
sa8d[32x32]  5.36x  8274.18   44361.61
sa8d[32x64]  3.86x  23024.04  88901.80
sa8d[64x64]  4.35x  45509.79  198165.11

diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 30 11:23:38 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Dec 01 15:09:45 2015 +0530
@@ -1313,6 +1313,9 @@
 }
 if (cpuMask & X265_CPU_AVX2)
 {
+#if X265_DEPTH == 12
+ASSIGN_SA8D(avx2);
+#endif
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
 // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 30 11:23:38 2015 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530
@@ -6499,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
+cglobal sa8d_8x8_avx2
+pmovzxwdm0, [r0]
+pmovzxwdm9, [r2]
+psubd   m0, m9
+
+pmovzxwdm1, [r0 + r1]
+pmovzxwdm9, [r2 + r3]
+psubd   m1, m9
+
+pmovzxwdm2, [r0 + r1 * 2]
+pmovzxwdm9, [r2 + r3 * 2]
+psubd   m2, m9
+
+pmovzxwdm8, [r0 + r4]
+pmovzxwdm9, [r2 + r5]
+psubd   m8, m9
+
+lea r0, [r0 + r1 * 4]
+lea r2, [r2 + r3 * 4]
+
+pmovzxwdm4, [r0]
+pmovzxwdm9, [r2]
+psubd   m4, m9
+
+pmovzxwdm5, [r0 + r1]
+pmovzxwdm9, [r2 + r3]
+psubd   m5, m9
+
+pmovzxwdm3, [r0 + r1 * 2]
+pmovzxwdm9, [r2 + r3 * 2]
+psubd   m3, m9
+
+pmovzxwdm7, [r0 + r4]
+pmovzxwdm9, [r2 + r5]
+psubd   m7, m9
+
+movam6, m0
+paddd   m0, m1
+psubd   m1, m6
+movam6, m2
+paddd   m2, m8
+psubd   m8, m6
+movam6, m0
+
+punpckldq   m0, m1
+punpckhdq   m6, m1
+
+movam1, m0
+paddd   m0, m6
+psubd   m6, m1
+movam1, m2
+
+punpckldq   m2, m8
+punpckhdq   m1, m8
+
+movam8, m2
+paddd   m2, m1
+psubd   m1, m8
+movam8, m4
+paddd   m4, m5
+psubd   m5, m8
+movam8, m3
+paddd   m3, m7
+psubd   m7, m8
+movam8, m4
+
+punpckldq   m4, m5
+punpckhdq   m8, m5
+
+movam5, m4
+paddd   m4, m8
+psubd   m8, m5
+movam5, m3
+punpckldq   m3, m7
+punpckhdq   m5, m7
+
+movam7, m3
+paddd   m3, m5
+psubd   m5, m7
+movam7, m0
+paddd   m0, m2
+psubd   m2, m7
+movam7, m6
+paddd   m6, m1
+psubd   m1, m7
+movam7, m0
+
+punpcklqdq  m0, m2
+punpckhqdq  m7, m2
+
+movam2, m0
+paddd   m0, m7
+psubd   m7, m2
+movam2, m6
+
+punpcklqdq  m6, m1
+punpckhqdq  m2, m1
+
+movam1, m6
+paddd   m6, m2
+psubd   m2, m1
+movam1, m4
+paddd   m4, m3
+psubd   m3, m1
+movam1, m8
+paddd   m8, m5
+psubd   m5, m1
+movam1, m4
+
+punpcklqdq  m4, m3
+punpckhqdq  m1, m3
+
+movam3, m4
+paddd   m4, m1
+psubd   m1, m3
+movam3, m8
+
+punpcklqdq  m8, m5
+punpckhqdq  m3, m5
+
+movam5, m8
+paddd   m8, m3
+psubd   m3, m5
+movam5, m0
+paddd   m0, m4
+psubd   m4, m5
+movam5, m7
+paddd   m7, m1
+psubd   m1, m5
+movam5, m0
+
+vinserti128 m0, m0, xm4, 1
+vperm2i128  m5, m5, m4, 00110001b
+
+pxorm4, m4
+psubd   m4, m0
+pmaxsd  m0, m4
+pxorm4, m4
+psubd   m4, m5
+pmaxsd  m5, m4
+pmaxsd  m0, m5
+movam4, m7
+
+vinserti128 m7, m7, xm1, 1
+vperm2i128  m4, m4, m1, 00110001b

[x265] [PATCH] use 32-bits multiply in mbtree_propagate_cost to avoid intraCost overflow

2015-11-24 Thread dnyaneshwar
# HG changeset patch
# User Min Chen 
# Date 1447865933 21600
#  Wed Nov 18 10:58:53 2015 -0600
# Node ID d4e8af415c2ea939f1c82cf2dc1561fee20847de
# Parent  ad15f3756ad888b99a4ba868b857e09909dae226
use 32-bits multiply in mbtree_propagate_cost to avoid intraCost overflow

diff -r ad15f3756ad8 -r d4e8af415c2e source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm   Fri Nov 06 12:33:51 2015 +0530
+++ b/source/common/x86/mc-a2.asm   Wed Nov 18 10:58:53 2015 -0600
@@ -1019,15 +1019,11 @@
 por m3, m1
 
 movdm1, [r1+r5*2]   ; prop
-%if (BIT_DEPTH <= 8)
-pmaddwd m0, m2
-%else
 punpckldq   m2, m2
 punpckldq   m0, m0
 pmuludq m0, m2
 pshufd  m2, m2, q3120
 pshufd  m0, m0, q3120
-%endif
 
 punpcklwd   m1, m4
 cvtdq2pdm0, m0
@@ -1072,15 +1068,11 @@
 por m3, m1
 
 movdm1, [r1+r5*2]   ; prop
-%if (BIT_DEPTH <= 8)
-pmaddwd m0, m2
-%else
-punpckldq   m2, m2  ; DWORD [- 1 - 0]
+punpckldq   m2, m2  ; DWORD [_ 1 _ 0]
 punpckldq   m0, m0
 pmuludq m0, m2  ; QWORD [m1 m0]
 pshufd  m2, m2, q3120
 pshufd  m0, m0, q3120
-%endif
 punpcklwd   m1, m4
 cvtdq2pdm0, m0
 mulpd   m0, m6  ; intra*invq*fps_factor>>8
@@ -1120,11 +1112,7 @@
 pminsd  xm3, xm2
 
 pmovzxwdxm1, [r1+r5*2]  ; prop
-%if (BIT_DEPTH <= 8)
-pmaddwd xm0, xm2
-%else
 pmulld  xm0, xm2
-%endif
 cvtdq2pdm0, xm0
 cvtdq2pdm1, xm1 ; prop
 %if cpuflag(avx2)
@@ -1166,11 +1154,7 @@
 
 movdxm1, [r1+r5*2]  ; prop
 pmovzxwdxm1, xm1
-%if (BIT_DEPTH <= 8)
-pmaddwd xm0, xm2
-%else
 pmulld  xm0, xm2
-%endif
 cvtdq2pdm0, xm0
 cvtdq2pdm1, xm1 ; prop
 %if cpuflag(avx2)
@@ -1204,11 +1188,7 @@
 
 movzx   r6d, word [r1+r5*2] ; prop
 movdxm1, r6d
-%if (BIT_DEPTH <= 8)
-pmaddwd xm0, xm2
-%else
 pmulld  xm0, xm2
-%endif
 cvtdq2pdm0, xm0
 cvtdq2pdm1, xm1 ; prop
 %if cpuflag(avx2)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: fix inconsistent crash due to unaligned NR buffer in denoiseDct SSE4 asm

2015-11-18 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1447829883 -19800
#  Wed Nov 18 12:28:03 2015 +0530
# Node ID 653430a3de3f9ba342922ee6ea46d4cf52c1eb39
# Parent  e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46
asm: fix inconsistent crash due to unaligned NR buffer in denoiseDct SSE4 asm

Also, fixes warning C4316: object allocated on the heap may not be aligned 16

diff -r e8f9a60d4cd9 -r 653430a3de3f source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmMon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/dct8.asmWed Nov 18 12:28:03 2015 +0530
@@ -2115,15 +2115,15 @@
 mova m0, [r0]
 pabswm1, m0
 
-mova m2, [r1]
+movu m2, [r1]
 pmovsxwd m3, m1
 padddm2, m3
-mova [r1], m2
-mova m2, [r1 + 16]
+movu [r1], m2
+movu m2, [r1 + 16]
 psrldq   m3, m1, 8
 pmovsxwd m4, m3
 padddm2, m4
-mova [r1 + 16], m2
+movu [r1 + 16], m2
 
 movu m3, [r2]
 psubusw  m1, m3
diff -r e8f9a60d4cd9 -r 653430a3de3f source/encoder/encoder.h
--- a/source/encoder/encoder.h  Mon Nov 16 16:44:33 2015 +0530
+++ b/source/encoder/encoder.h  Wed Nov 18 12:28:03 2015 +0530
@@ -79,7 +79,7 @@
 {
 public:
 
-ALIGN_VAR_16(uint32_t, 
m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
+uint32_t   
m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 uint32_t   m_countEmergency[MAX_NUM_TR_CATEGORIES];
 uint16_t   
(*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: fix output change due to overflow in mbtree_propagate_cost 10bit asm

2015-11-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1447828315 -19800
#  Wed Nov 18 12:01:55 2015 +0530
# Node ID 58c177d2e182e5b633670024c567b535eb49614f
# Parent  e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46
asm: fix output change due to overflow in mbtree_propagate_cost 10bit asm

diff -r e8f9a60d4cd9 -r 58c177d2e182 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm   Mon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/mc-a2.asm   Wed Nov 18 12:01:55 2015 +0530
@@ -1019,7 +1019,7 @@
 por m3, m1
 
 movdm1, [r1+r5*2]   ; prop
-%if (BIT_DEPTH <= 10)
+%if (BIT_DEPTH <= 8)
 pmaddwd m0, m2
 %else
 punpckldq   m2, m2
@@ -1072,7 +1072,7 @@
 por m3, m1
 
 movdm1, [r1+r5*2]   ; prop
-%if (BIT_DEPTH <= 10)
+%if (BIT_DEPTH <= 8)
 pmaddwd m0, m2
 %else
 punpckldq   m2, m2  ; DWORD [- 1 - 0]
@@ -1120,7 +1120,7 @@
 pminsd  xm3, xm2
 
 pmovzxwdxm1, [r1+r5*2]  ; prop
-%if (BIT_DEPTH <= 10)
+%if (BIT_DEPTH <= 8)
 pmaddwd xm0, xm2
 %else
 pmulld  xm0, xm2
@@ -1166,7 +1166,7 @@
 
 movdxm1, [r1+r5*2]  ; prop
 pmovzxwdxm1, xm1
-%if (BIT_DEPTH <= 10)
+%if (BIT_DEPTH <= 8)
 pmaddwd xm0, xm2
 %else
 pmulld  xm0, xm2
@@ -1204,7 +1204,7 @@
 
 movzx   r6d, word [r1+r5*2] ; prop
 movdxm1, r6d
-%if (BIT_DEPTH <= 10)
+%if (BIT_DEPTH <= 8)
 pmaddwd xm0, xm2
 %else
 pmulld  xm0, xm2
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: fix intrapred_planar16x16 SSE4 code for main12

2015-11-04 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1446700839 -19800
#  Thu Nov 05 10:50:39 2015 +0530
# Node ID 69bd13c0047d2c1a3b232bea40b72e436baa618e
# Parent  3103afbd31fa9b26533f06202516a511ee221439
asm: fix intrapred_planar16x16 SSE4 code for main12

diff -r 3103afbd31fa -r 69bd13c0047d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Nov 05 06:13:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Nov 05 10:50:39 2015 +0530
@@ -1144,9 +1144,9 @@
 
 p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
 p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
 
 #if X265_DEPTH <= 10
-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_sse4);
 #endif
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 3103afbd31fa -r 69bd13c0047d source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Nov 05 06:13:51 2015 +0530
+++ b/source/common/x86/intrapred16.asm Thu Nov 05 10:50:39 2015 +0530
@@ -2427,6 +2427,118 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, 
int filter)
 
;---
 INIT_XMM sse4
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,12
+add r1d, r1d
+
+pmovzxwdm2, [r2 + 2]
+pmovzxwdm7, [r2 + 10]
+pmovzxwdm10, [r2 + 18]
+pmovzxwdm0, [r2 + 26]
+
+movzx   r3d, word [r2 + 34] ; topRight   = 
above[16]
+lea r4, [pd_planar16_mul1]
+
+movdm3, r3d
+pshufd  m3, m3, 0   ; topRight
+
+pslld   m8, m3, 2
+pmulld  m3, m3, [r4 + 0*mmsize] ; (x + 1) * 
topRight
+paddd   m9, m3, m8
+paddd   m4, m9, m8
+paddd   m8, m4
+
+pslld   m1, m2, 4
+pslld   m6, m7, 4
+pslld   m5, m10, 4
+pslld   m11, m0, 4
+psubd   m1, m2
+psubd   m6, m7
+psubd   m5, m10
+psubd   m11, m0
+
+paddd   m4, m5
+paddd   m3, m1
+paddd   m8, m11
+paddd   m9, m6
+
+movam5, [pd_16]
+paddd   m3, m5
+paddd   m9, m5
+paddd   m4, m5
+paddd   m8, m5
+
+movzx   r4d, word [r2 + 98] ; bottomLeft = 
left[16]
+movdm6, r4d
+pshufd  m6, m6, 0   ; bottomLeft
+
+paddd   m4, m6
+paddd   m3, m6
+paddd   m8, m6
+paddd   m9, m6
+
+psubd   m1, m6, m0  ; column 12-15
+psubd   m11, m6, m10; column 8-11
+psubd   m10, m6, m7 ; column 4-7
+psubd   m6, m2  ; column 0-3
+
+add r2, 66
+lea r4, [pd_planar16_mul0]
+
+%macro INTRA_PRED_PLANAR16 1
+movzx   r3d, word [r2]
+movdm5, r3d
+pshufd  m5, m5, 0
+
+pmulld  m0, m5, [r4 + 3*mmsize] ; column 12-15
+pmulld  m2, m5, [r4 + 2*mmsize] ; column 8-11
+pmulld  m7, m5, [r4 + 1*mmsize] ; column 4-7
+pmulld  m5, m5, [r4 + 0*mmsize] ; column 0-3
+
+paddd   m0, m8
+paddd   m2, m4
+paddd   m7, m9
+paddd   m5, m3
+
+paddd   m8, m1
+paddd   m4, m11
+paddd   m9, m10
+paddd   m3, m6
+
+psrad   m0, 5
+psrad   m2, 5
+psrad   m7, 5
+psrad   m5, 5
+
+packusdwm2, m0
+packusdwm5, m7
+movu[r0], m5
+movu[r0 + mmsize], m2
+
+add r2, 2
+lea r0, [r0 + r1]
+%endmacro
+
+INTRA_PRED_PLANAR16 0
+INTRA_PRED_PLANAR16 1
+INTRA_PRED_PLANAR16 2
+INTRA_PRED_PLANAR16 3
+INTRA_PRED_PLANAR16 4
+INTRA_PRED_PLANAR16 5
+INTRA_PRED_PLANAR16 6
+INTRA_PRED_PLANAR16 7
+INTRA_PRED_PLANAR16 8
+INTRA_PRED_PLANAR16 9
+INTRA_PRED_PLANAR16 10
+INTRA_PRED_PLANAR16 11
+INTRA_PRED_PLANAR16 12
+INTRA_PRED_PLANAR16 13
+INTRA_PRED_PLANAR16 14
+INTRA_PRED_PLANAR16 15
+RET
+
+%else
+; code for BIT_DEPTH == 10
 cglobal intra_pred_planar16, 3,3,8
 add r1, r1
 movum2, [r2 + 2]
@@ -2504,6 +2616,7 @@
 INTRA

[x265] [PATCH] asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD

2015-11-04 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1446645042 -19800
#  Wed Nov 04 19:20:42 2015 +0530
# Node ID 25bada1bb5494fc12d62e87d1b7b788307dd963f
# Parent  c11dd97a8b999414c60dceef8620d3d9055cf4c1
asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD

The SSE2 asm code reads and write extra 4 bytes if loop counter is not multiple
of 2 as SSE2 asm code process 2 int values in single iteration

The AVX asm code reads and write extra 4,8 or 12 bytes if loop counter is not
multiple of 4 as AVX asm code process 4 int values in single iteration

diff -r c11dd97a8b99 -r 25bada1bb549 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm   Wed Nov 04 17:06:33 2015 +0530
+++ b/source/common/x86/mc-a2.asm   Wed Nov 04 19:20:42 2015 +0530
@@ -995,7 +995,8 @@
 ; uint16_t *inter_costs, int32_t *inv_qscales, 
double *fps_factor, int len )
 ;-
 INIT_XMM sse2
-cglobal mbtree_propagate_cost, 6,6,7
+cglobal mbtree_propagate_cost, 7,7,7
+dec r6d
 movsd   m6, [r5]
 mulpd   m6, [pd_inv256]
 xor r5d, r5d
@@ -1044,8 +1045,40 @@
 
 movh[r0+r5*4], m0
 add r5d, 2
-cmp r5d, r6m
+cmp r5d, r6d
 jl .loop
+
+xor r6d, r5d
+jnz .even
+movdm2, [r2+r5*4]   ; intra
+movdm0, [r4+r5*4]   ; invq
+movdm3, [r3+r5*2]   ; inter
+pandm3, m5
+punpcklwd   m3, m4
+
+; PMINSD
+pcmpgtd m1, m2, m3
+pandm3, m1
+pandn   m1, m2
+por m3, m1
+
+movdm1, [r1+r5*2]   ; prop
+pmaddwd m0, m2
+punpcklwd   m1, m4
+cvtdq2pdm0, m0
+mulpd   m0, m6  ; intra*invq*fps_factor>>8
+cvtdq2pdm1, m1  ; prop
+addpd   m0, m1  ; prop + (intra*invq*fps_factor>>8)
+cvtdq2pdm1, m2  ; intra
+psubd   m2, m3  ; intra - inter
+cvtdq2pdm2, m2  ; intra - inter
+mulpd   m0, m2  ; (prop + (intra*invq*fps_factor>>8)) * 
(intra - inter)
+
+divpd   m0, m1
+addpd   m0, [pd_0_5]
+cvttpd2dqm0, m0
+movd[r0+r5*4], m0
+.even:
 RET
 
 
@@ -1055,7 +1088,8 @@
 ;-
 ; FIXME: align loads/stores to 16 bytes
 %macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 6,6,7
+cglobal mbtree_propagate_cost, 7,7,7
+sub r6d, 3
 vbroadcastsdm6, [r5]
 mulpd   m6, [pd_inv256]
 xor r5d, r5d
@@ -1089,9 +1123,81 @@
 cvttpd2dq   xm0, m0
 
 movu[r0+r5*4], xm0
-add r5d, 4
-cmp r5d, r6m
+add r5d, 4  ; process 4 values in one iteration
+cmp r5d, r6d
 jl .loop
+
+add r6d, 3
+xor r6d, r5d
+jz  .even   ; if loop counter is multiple of 4, 
all values are processed
+
+and r6d, 3  ; otherwise, remaining unprocessed 
values must be 1, 2 or 3
+cmp r6d, 1
+je  .process1   ; if only 1 value is unprocessed
+
+; process 2 values here
+movqxm2, [r2+r5*4]  ; intra
+movqxm0, [r4+r5*4]  ; invq
+movdxm3, [r3+r5*2]  ; inter
+pmovzxwdxm3, xm3
+pandxm3, xm5
+pminsd  xm3, xm2
+
+movdxm1, [r1+r5*2]  ; prop
+pmovzxwdxm1, xm1
+pmaddwd xm0, xm2
+cvtdq2pdm0, xm0
+cvtdq2pdm1, xm1 ; prop
+%if cpuflag(avx2)
+fmaddpd m0, m0, m6, m1
+%else
+mulpd   m0, m6  ; intra*invq*fps_factor>>8
+addpd   m0, m1  ; prop + (intra*invq*fps_factor>>8)
+%endif
+cvtdq2pdm1, xm2 ; intra
+psubd   xm2, xm3; intra - inter
+cvtdq2pdm2, xm2 ; intra - inter
+mulpd   m0, m2  ; (prop + (intra*invq*fps_factor>>8)) 
* (intra - inter)
+
+divpd   m0, m1
+addpd   m0, [pd_0_5]
+cvttpd2dq   xm0, m0
+movq[r0+r5*4], xm0
+
+xor r6d, 2
+jz  .even
+add r5d, 2
+
+; process 1 value here
+.process1:
+movdxm2, [r2+r5*4]  ; intra
+movdxm0, [r4+r5*4]  ; invq
+movzx   r6d, word [r3+r5*2] ; inter
+movdxm3, r6d
+pandxm3, xm5
+pminsd  xm3, xm2
+
+movzx   r6d, word [r1+r5*2] ; prop
+movdxm1, r6d
+pmaddwd xm0, xm2
+cvtdq

Re: [x265] [PATCH] fix invalid Instruction Set provided in CLI if CPU doesn't support it

2015-11-02 Thread Dnyaneshwar Gorade
Ok. I will check if we can use AND mask and provide more information to
user.

On Tue, Nov 3, 2015 at 10:36 AM, Deepthi Nandakumar <
deep...@multicorewareinc.com> wrote:

> Since the idea here is to correctly log a user-generated error (user-cpuid
> > detected cpuid), the patch is headed in the right direction.
>
> Min's suggestion on using an AND mask sounds good, and can you also make
> the warning more informative (print user-cpuid, and the cpuid we're
> defaulting to) ?
>
> On Thu, Oct 29, 2015 at 11:16 PM, Steve Borho <st...@borho.org> wrote:
>
>> On 10/28, dnyanesh...@multicorewareinc.com wrote:
>> > # HG changeset patch
>> > # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
>> > # Date 1446021877 -19800
>> > #  Wed Oct 28 14:14:37 2015 +0530
>> > # Node ID 975087370d14e90cd63edecb34fb4bf2feda2468
>> > # Parent  6563218ce342c30bfd4f9bc172a1dab510e6e55b
>> > fix invalid Instruction Set provided in CLI if CPU doesn't support it
>> >
>> > This patch avoids crash/invalid instructions when we provide
>> instruction sets to
>> > be used are higher than the cpu capabilities.
>> >
>> > For example, if our cpu supports instruction sets upto AVX and we
>> provide
>> > --asm "avx2" (AVX2 is higher than AVX) then it will show warning and
>> use default
>> > x265 detected intruction sets.
>>
>> The whole point of having this override is in case our CPU detection is
>> somehow wrong. The user needs to be able to override the detection mask.
>>
>> That said.. if the user provided mask has bits set that were not
>> detected, it's ok to log a serious warning that says you think the
>> encoder is about to break and it is the user's fault.
>>
>> BTW: this feature is often used for benchmarking, to disable certain
>> optimizations piecemeal, but that is not the primary reason why it
>> exists.
>>
>> > diff -r 6563218ce342 -r 975087370d14 source/common/primitives.cpp
>> > --- a/source/common/primitives.cppMon Oct 26 12:13:53 2015 +0530
>> > +++ b/source/common/primitives.cppWed Oct 28 14:14:37 2015 +0530
>> > @@ -238,6 +238,15 @@
>> >  primitives.cu[i].intra_pred_allangs = NULL;
>> >
>> >  #if ENABLE_ASSEMBLY
>> > +
>> > +if ((uint32_t)param->cpuid > X265_NS::cpu_detect())
>> > +{
>> > +if (param->logLevel >= X265_LOG_INFO)
>> > +x265_log(param, X265_LOG_WARNING, "Unsupported CPUID
>> provided in CLI, so choosing x265 detected CPUID!\n");
>> > +
>> > +param->cpuid = X265_NS::cpu_detect();
>> > +}
>> > +
>> >  setupInstrinsicPrimitives(primitives, param->cpuid);
>> >  setupAssemblyPrimitives(primitives, param->cpuid);
>> >  #endif
>> > ___
>> > x265-devel mailing list
>> > x265-devel@videolan.org
>> > https://mailman.videolan.org/listinfo/x265-devel
>>
>> --
>> Steve Borho
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Deepthi Nandakumar
> Engineering Manager, x265
> Multicoreware, Inc
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] fix invalid Instruction Set provided in CLI if CPU doesn't support it

2015-10-28 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1446021877 -19800
#  Wed Oct 28 14:14:37 2015 +0530
# Node ID 975087370d14e90cd63edecb34fb4bf2feda2468
# Parent  6563218ce342c30bfd4f9bc172a1dab510e6e55b
fix invalid Instruction Set provided in CLI if CPU doesn't support it

This patch avoids crash/invalid instructions when we provide instruction sets to
be used are higher than the cpu capabilities.

For example, if our cpu supports instruction sets upto AVX and we provide
--asm "avx2" (AVX2 is higher than AVX) then it will show warning and use default
x265 detected instruction sets.

diff -r 6563218ce342 -r 975087370d14 source/common/primitives.cpp
--- a/source/common/primitives.cpp  Mon Oct 26 12:13:53 2015 +0530
+++ b/source/common/primitives.cpp  Wed Oct 28 14:14:37 2015 +0530
@@ -238,6 +238,15 @@
 primitives.cu[i].intra_pred_allangs = NULL;
 
 #if ENABLE_ASSEMBLY
+
+if ((uint32_t)param->cpuid > X265_NS::cpu_detect())
+{
+if (param->logLevel >= X265_LOG_INFO)
+x265_log(param, X265_LOG_WARNING, "Unsupported CPUID provided 
in CLI, so choosing x265 detected CPUID!\n");
+
+param->cpuid = X265_NS::cpu_detect();
+}
+
 setupInstrinsicPrimitives(primitives, param->cpuid);
 setupAssemblyPrimitives(primitives, param->cpuid);
 #endif
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12

2015-10-23 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1445588852 -19800
#  Fri Oct 23 13:57:32 2015 +0530
# Node ID 0fb5a67c2f5ea4f3fe1a7e0dcbc0c5c117dd6dfc
# Parent  a7251c3e0ef810b95bb25be5371035208e36996d
asm: fix intrapred_planar16x16 sse4 code for main12

diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Oct 22 09:12:28 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Oct 23 13:57:32 2015 +0530
@@ -1145,8 +1145,9 @@
 p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
 p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
 
+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
+
 #if X265_DEPTH <= 10
-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_sse4);
 #endif
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Oct 22 09:12:28 2015 +0530
+++ b/source/common/x86/const-a.asm Fri Oct 23 13:57:32 2015 +0530
@@ -122,6 +122,7 @@
 const pd_2, times  8 dd 2
 const pd_4, times  4 dd 4
 const pd_8, times  4 dd 8
+const pd_15,times  8 dd 15
 const pd_16,times  8 dd 16
 const pd_31,times  4 dd 31
 const pd_32,times  8 dd 32
@@ -136,7 +137,8 @@
 const pd_524416,times  4 dd 524416
const pd_n32768,times  8 dd 0xffff8000
const pd_n131072,   times  4 dd 0xfffe0000
-
+const pd_planar16_mul0, times  1 dd  15,  14,  13,  12,  11,  10,   9,   
8,   7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1, times  1 dd   1,   2,   3,   4,   5,   6,   7,   
8,   9,  10,  11,  12,  13,  14,  15,  16
 const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Oct 22 09:12:28 2015 +0530
+++ b/source/common/x86/intrapred16.asm Fri Oct 23 13:57:32 2015 +0530
@@ -109,6 +109,7 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
@@ -123,6 +124,8 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul0
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
 
 
;---
@@ -2216,6 +2219,114 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, 
int filter)
 
;---
 INIT_XMM sse4
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,12
+add r1d, r1d
+
+pmovzxwdm2, [r2 + 2]
+pmovzxwdm7, [r2 + 10]
+pmovzxwdm10, [r2 + 18]
+pmovzxwdm0, [r2 + 26]
+
+movzx   r3d, word [r2 + 34] ; topRight   = 
above[16]
+lea r4, [pd_planar16_mul1]
+
+movdm3, r3d
+pshufd  m3, m3, 0   ; topRight
+
+pmulld  m8, m3, [r4 + 3*mmsize] ; (x + 1) * 
topRight
+pmulld  m4, m3, [r4 + 2*mmsize] ; (x + 1) * 
topRight
+pmulld  m9, m3, [r4 + 1*mmsize] ; (x + 1) * 
topRight
+pmulld  m3, m3, [r4 + 0*mmsize] ; (x + 1) * 
topRight
+
+movam11, [pd_15]
+pmulld  m1, m2,  m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m6, m7,  m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m5, m10, m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m11, m0 ; (blkSize - 1 - 
y) * above[x]
+
+paddd   m4, m5
+paddd   m3, m1
+paddd   m8, m11
+paddd   m9, m6
+
+movam5, [pd_16]
+paddd   m3, m5
+paddd   m9, m5
+paddd   m4, m5
+paddd   m8, m5
+
+movzx   r4d, word [r2 + 98] ; bottomLeft = 
left[16]
+movdm6, r4d
+pshufd  m6, m6, 0   ; bottomLeft
+
+paddd   m4, m6
+paddd   m3, m6
+paddd   m8, m6
+paddd   m9, m6
+
+psubd   m1, m6, m0  ; column 12-15
+psubd   m11, m6, m10; column 8-11
+psubd   m10, m6, m7 ; column 4-7
+psubd

Re: [x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12

2015-10-22 Thread Dnyaneshwar Gorade
​​


On Wed, Oct 21, 2015 at 7:58 AM, chen <chenm...@163.com> wrote:

>
>
> At 2015-10-20 18:38:56,dnyanesh...@multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
> ># Date 1445337446 -19800
> >#  Tue Oct 20 16:07:26 2015 +0530
> ># Node ID 987b5f8c2c447dc5b0e410d37f6212470feecd1c
> ># Parent  f335a9a7b9083dcb2fc7a1cadc2dbeffdd6388f2
> >asm: fix intrapred_planar16x16 sse4 code for main12
> >
> >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp   Mon Oct 19 12:42:52 2015 +0530
> >+++ b/source/common/x86/asm-primitives.cpp   Tue Oct 20 16:07:26 2015 +0530
> >@@ -1145,8 +1145,9 @@
> > p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = 
> > PFX(intra_pred_planar4_sse4);
> > p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = 
> > PFX(intra_pred_planar8_sse4);
> >
> >+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
> >PFX(intra_pred_planar16_sse4);
> >+
> > #if X265_DEPTH <= 10
> >-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
> >PFX(intra_pred_planar16_sse4);
> > p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
> > PFX(intra_pred_planar32_sse4);
> > #endif
> > ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
> >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/const-a.asm
> >--- a/source/common/x86/const-a.asm  Mon Oct 19 12:42:52 2015 +0530
> >+++ b/source/common/x86/const-a.asm  Tue Oct 20 16:07:26 2015 +0530
> >@@ -122,6 +122,7 @@
> > const pd_2, times  8 dd 2
> > const pd_4, times  4 dd 4
> > const pd_8, times  4 dd 8
> >+const pd_15,times  8 dd 15
> > const pd_16,times  8 dd 16
> > const pd_31,times  4 dd 31
> > const pd_32,times  8 dd 32
> >@@ -136,7 +137,8 @@
> > const pd_524416,times  4 dd 524416
> > const pd_n32768,times  8 dd 0xffff8000
> > const pd_n131072,   times  4 dd 0xfffe0000
> >-
> >+const pd_planar16_mul,  times  1 dd  15,  14,  13,  12,  11,  10,   9,  
> > 8,   7,   6,   5,   4,   3,   2,   1,   0
> >+const pd_planar16_mul1, times  1 dd   1,   2,   3,   4,   5,   6,   7,  
> > 8,   9,  10,  11,  12,  13,  14,  15,  16
> > const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,  
> >  7
> >
> > const popcnt_table
> >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/intrapred16.asm
> >--- a/source/common/x86/intrapred16.asm  Mon Oct 19 12:42:52 2015 +0530
> >+++ b/source/common/x86/intrapred16.asm  Tue Oct 20 16:07:26 2015 +0530
> >@@ -109,6 +109,7 @@
> > cextern pw_16
> > cextern pw_31
> > cextern pw_32
> >+cextern pd_15
> > cextern pd_16
> > cextern pd_31
> > cextern pd_32
> >@@ -123,6 +124,8 @@
> > cextern pb_unpackwq1
> > cextern pb_unpackwq2
> > cextern pw_planar16_mul
> >+cextern pd_planar16_mul
> >+cextern pd_planar16_mul1
> > cextern pw_planar32_mul
> >
> > ;---
> >@@ -2216,6 +2219,114 @@
> > ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, 
> > int filter)
> > ;---
> > INIT_XMM sse4
> >+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
> >+cglobal intra_pred_planar16, 3,5,12
> >+add r1d, r1d
> >+
> >+pmovzxwdm2, [r2 + 2]
> >+pmovzxwdm7, [r2 + 10]
> >+pmovzxwdm10, [r2 + 18]
> >+pmovzxwdm0, [r2 + 26]
> >+
> >+movzx   r3d, word [r2 + 34] ; topRight   = 
> >above[16]
> >+lea r4, [pd_planar16_mul1]
> >+
> >+movdm3, r3d
> >+pshufd  m3, m3, 0   ; topRight
> >+
> >+pmulld  m8, m3, [r4 + 3*mmsize] ; (x + 1) * 
> >topRight
> >+pmulld  m4, m3, [r4 + 2*mmsize] ; (x + 1) * 
> >topRight
> >+pmulld  m9, m3, [r4 + 1*mmsize] ; (x + 1) * 
> >topRight
> >+pmulld  m3, m3, [r4 + 0*mmsize] ; (x + 1) * 
> >topRight
> >+
> >+movam11, [pd_15]
> >+pmulld  m1, m2,  m11; (blkSize - 1 
> >- y) * above[x]
> >+pmu

[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12

2015-10-20 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1445337446 -19800
#  Tue Oct 20 16:07:26 2015 +0530
# Node ID 987b5f8c2c447dc5b0e410d37f6212470feecd1c
# Parent  f335a9a7b9083dcb2fc7a1cadc2dbeffdd6388f2
asm: fix intrapred_planar16x16 sse4 code for main12

diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Oct 19 12:42:52 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Oct 20 16:07:26 2015 +0530
@@ -1145,8 +1145,9 @@
 p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
 p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
 
+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
+
 #if X265_DEPTH <= 10
-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_sse4);
 #endif
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Oct 19 12:42:52 2015 +0530
+++ b/source/common/x86/const-a.asm Tue Oct 20 16:07:26 2015 +0530
@@ -122,6 +122,7 @@
 const pd_2, times  8 dd 2
 const pd_4, times  4 dd 4
 const pd_8, times  4 dd 8
+const pd_15,times  8 dd 15
 const pd_16,times  8 dd 16
 const pd_31,times  4 dd 31
 const pd_32,times  8 dd 32
@@ -136,7 +137,8 @@
 const pd_524416,times  4 dd 524416
const pd_n32768,times  8 dd 0xffff8000
const pd_n131072,   times  4 dd 0xfffe0000
-
+const pd_planar16_mul,  times  1 dd  15,  14,  13,  12,  11,  10,   9,   
8,   7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1, times  1 dd   1,   2,   3,   4,   5,   6,   7,   
8,   9,  10,  11,  12,  13,  14,  15,  16
 const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Oct 19 12:42:52 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Oct 20 16:07:26 2015 +0530
@@ -109,6 +109,7 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
@@ -123,6 +124,8 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
 
 
;---
@@ -2216,6 +2219,114 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, 
int filter)
 
;---
 INIT_XMM sse4
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,12
+add r1d, r1d
+
+pmovzxwdm2, [r2 + 2]
+pmovzxwdm7, [r2 + 10]
+pmovzxwdm10, [r2 + 18]
+pmovzxwdm0, [r2 + 26]
+
+movzx   r3d, word [r2 + 34] ; topRight   = 
above[16]
+lea r4, [pd_planar16_mul1]
+
+movdm3, r3d
+pshufd  m3, m3, 0   ; topRight
+
+pmulld  m8, m3, [r4 + 3*mmsize] ; (x + 1) * 
topRight
+pmulld  m4, m3, [r4 + 2*mmsize] ; (x + 1) * 
topRight
+pmulld  m9, m3, [r4 + 1*mmsize] ; (x + 1) * 
topRight
+pmulld  m3, m3, [r4 + 0*mmsize] ; (x + 1) * 
topRight
+
+movam11, [pd_15]
+pmulld  m1, m2,  m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m6, m7,  m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m5, m10, m11; (blkSize - 1 - 
y) * above[x]
+pmulld  m11, m0 ; (blkSize - 1 - 
y) * above[x]
+
+paddd   m4, m5
+paddd   m3, m1
+paddd   m8, m11
+paddd   m9, m6
+
+movam5, [pd_16]
+paddd   m3, m5
+paddd   m9, m5
+paddd   m4, m5
+paddd   m8, m5
+
+movzx   r4d, word [r2 + 98] ; bottomLeft = 
left[16]
+movdm6, r4d
+pshufd  m6, m6, 0   ; bottomLeft
+
+paddd   m4, m6
+paddd   m3, m6
+paddd   m8, m6
+paddd   m9, m6
+
+psubd   m1, m6, m0  ; column 12-15
+psubd   m11, m6, m10; column 8-11
+psubd   m10, m6, m7 ; column 4-7
+psubd

[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12

2015-10-19 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1445245458 -19800
#  Mon Oct 19 14:34:18 2015 +0530
# Node ID 76d4fc7264a0d22218db30f65bb58095c294db1b
# Parent  04575a459a160162391fcf1a12e8e6f2e81e95b4
asm: fix intrapred_planar16x16 sse4 code for main12

diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Oct 19 14:34:18 2015 +0530
@@ -1145,8 +1145,9 @@
 p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
 p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
 
+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
+
 #if X265_DEPTH <= 10
-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar16_sse4);
 p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = 
PFX(intra_pred_planar32_sse4);
 #endif
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/const-a.asm Mon Oct 19 14:34:18 2015 +0530
@@ -122,6 +122,7 @@
 const pd_2, times  8 dd 2
 const pd_4, times  4 dd 4
 const pd_8, times  4 dd 8
+const pd_15,times  8 dd 15
 const pd_16,times  8 dd 16
 const pd_31,times  4 dd 31
 const pd_32,times  8 dd 32
@@ -136,7 +137,8 @@
 const pd_524416,times  4 dd 524416
const pd_n32768,times  8 dd 0xffff8000
const pd_n131072,   times  4 dd 0xfffe0000
-
+const pd_planar16_mul,  times  1 dd  15,  14,  13,  12,  11,  10,   9,   
8,   7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1, times  1 dd   1,   2,   3,   4,   5,   6,   7,   
8,   9,  10,  11,  12,  13,  14,  15,  16
 const trans8_shuf,  times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/intrapred16.asm Mon Oct 19 14:34:18 2015 +0530
@@ -109,6 +109,7 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
@@ -123,6 +124,8 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
 
 
;---
@@ -2216,6 +2219,110 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, 
int filter)
 
;---
 INIT_XMM sse4
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,12
+add r1d, r1d
+
+pmovzxwdm2, [r2 + 2]
+pmovzxwdm7, [r2 + 10]
+pmovzxwdm10, [r2 + 18]
+pmovzxwdm0, [r2 + 26]
+
+movzx   r3d, word [r2 + 34] ; topRight   = 
above[16]
+movzx   r4d, word [r2 + 98] ; bottomLeft = 
left[16]
+
+movdm3, r3d
+pshufd  m3, m3, 0   ; topRight
+
+pmulld  m8, m3, [pd_planar16_mul1 + 3*mmsize]   ; (x + 1) * 
topRight
+pmulld  m4, m3, [pd_planar16_mul1 + 2*mmsize]   ; (x + 1) * 
topRight
+pmulld  m9, m3, [pd_planar16_mul1 + 1*mmsize]   ; (x + 1) * 
topRight
+pmulld  m3, m3, [pd_planar16_mul1 + 0*mmsize]   ; (x + 1) * 
topRight
+
+pmulld  m1, m2,  [pd_15]; (blkSize - 1 - 
y) * above[x]
+pmulld  m6, m7,  [pd_15]; (blkSize - 1 - 
y) * above[x]
+pmulld  m5, m10, [pd_15]; (blkSize - 1 - 
y) * above[x]
+pmulld  m11, m0, [pd_15]; (blkSize - 1 - 
y) * above[x]
+
+paddd   m3, [pd_16]
+paddd   m9, [pd_16]
+paddd   m4, [pd_16]
+paddd   m8, [pd_16]
+
+paddd   m4, m5
+paddd   m3, m1
+paddd   m8, m11
+paddd   m9, m6
+
+movdm6, r4d
+pshufd  m6, m6, 0   ; bottomLeft
+
+paddd   m4, m6
+paddd   m3, m6
+paddd   m8, m6
+paddd   m9, m6
+
+psubd   m1, m6, m0  ; column 12-15
+psubd   m11, m6, m10; column 8-11
+psubd   m10, m6, m7 ; column 4-7
+psubd   m6, m2  ; column 0-3
+
+add r2, 66
+
+%macro INTR

[x265] [PATCH] multilib: fix multiple definition of pelFilterLumaStrong_c

2015-10-15 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1444972708 -19800
#  Fri Oct 16 10:48:28 2015 +0530
# Node ID 76a36eabd4be405fc4880d882499a754c3f190fa
# Parent  fe65544b6c40d7cd62c2b86275bf98b264b6edb0
multilib: fix multiple definition of pelFilterLumaStrong_c

diff -r fe65544b6c40 -r 76a36eabd4be source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp  Wed Oct 07 13:42:41 2015 +0530
+++ b/source/common/loopfilter.cpp  Fri Oct 16 10:48:28 2015 +0530
@@ -138,7 +138,7 @@
 }
 }
 
-void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, 
int32_t tcP, int32_t tcQ)
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tcP, int32_t tcQ)
 {
 for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
 {
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 2] asm: asm code for deblocking filter horizontal and vertical

2015-10-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1444286180 -19800
#  Thu Oct 08 12:06:20 2015 +0530
# Node ID 86627e458e6e2e357fe1746067392c6984b8915f
# Parent  38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
asm: asm code for deblocking filter horizontal and vertical

diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Oct 08 12:06:20 2015 +0530
@@ -2541,6 +2541,9 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = 
PFX(filterPixelToShort_6x16_sse4);
 
 #if X86_64
+p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+
 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
 p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
 p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/const-a.asm Thu Oct 08 12:06:20 2015 +0530
@@ -67,6 +67,7 @@
 
 ;; 16-bit constants
 
+const pw_n1,times 16 dw -1
 const pw_1, times 16 dw 1
 const pw_2, times 16 dw 2
 const pw_3, times 16 dw 3
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Oct 08 12:06:20 2015 +0530
@@ -37,6 +37,7 @@
 
 SECTION .text
 cextern pb_1
+cextern pb_01
 cextern pb_128
 cextern pb_2
 cextern pw_2
@@ -45,6 +46,8 @@
 cextern pw_1
 cextern hmul_16p
 cextern pb_4
+cextern pw_4
+cextern pw_n1
 
 
 
;
@@ -2231,6 +2234,248 @@
 RET
 %endif ; ARCH_X86_64
 
+%if ARCH_X86_64
+;; argument registers used -
+; r0- src
+; r1- srcStep
+; r2- offset
+; r3- tcP
+; r4- tcQ
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_H, 5,7,10
+mov r1, r2
+neg r3d
+neg r4d
+neg r1
+
+lea r5, [r2 * 3]
+lea r6, [r1 * 3]
+
+pmovzxbwm4, [r0]; src[0]
+pmovzxbwm3, [r0 + r1]   ; src[-offset]
+pmovzxbwm2, [r0 + r1 * 2]   ; src[-offset * 2]
+pmovzxbwm1, [r0 + r6]   ; src[-offset * 3]
+pmovzxbwm0, [r0 + r1 * 4]   ; src[-offset * 4]
+pmovzxbwm5, [r0 + r2]   ; src[offset]
+pmovzxbwm6, [r0 + r2 * 2]   ; src[offset * 2]
+pmovzxbwm7, [r0 + r5]   ; src[offset * 3]
+
+paddw   m0, m0  ; m0*2
+movam8, m2
+paddw   m8, m3  ; m2 + m3
+paddw   m8, m4  ; m2 + m3 + m4
+movam9, m8
+paddw   m9, m9  ; 2*m2 + 2*m3 + 2*m4
+paddw   m8, m1  ; m2 + m3 + m4 + m1
+paddw   m0, m8  ; 2*m0 + m2+ m3 + m4 + m1
+paddw   m9, m1
+paddw   m0, m1
+paddw   m9, m5  ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
+paddw   m0, m1  ; 2*m0 + 3*m1 + m2 + m3 + m4
+
+punpcklqdq  m0, m9
+punpcklqdq  m1, m3
+
+paddw   m3, m4
+movam9, m5
+paddw   m9, m6
+paddw   m7, m7  ; 2*m7
+paddw   m9, m3  ; m3 + m4 + m5 + m6
+movam3, m9
+paddw   m3, m3  ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+paddw   m7, m9  ; 2*m7 + m3 + m4 + m5 + m6
+paddw   m7, m6
+psubw   m3, m6  ; 2*m3 + 2*m4 + 2*m5 + m6
+paddw   m7, m6  ; m3 + m4 + m5 + 3*m6 + 2*m7
+paddw   m3, m2  ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
+
+punpcklqdq  m9, m8
+punpcklqdq  m3, m7
+punpcklqdq  m5, m2
+punpcklqdq  m4, m6
+
+movdm7, r3d ; -tcP
+movdm2, r4d ; -tcQ
+pshufb  m7, [pb_01]
+pshufb  m2, [pb_01]
+movam6, m2
+punpcklqdq  m6, m7
+
+paddw   m0, [pw_4]
+paddw   m3, [pw_4]
+paddw   m9, [pw_2]
+
+psraw   m0, 3
+psraw   m3, 3
+psraw   m9, 2
+
+psubw   m0, m1
+psubw   m3, m4
+psubw   m9, m5
+
+pmaxsw  m0, m7
+pmaxsw  m3, m2
+pmaxsw  m9, m6
+psignw  m7, [pw_n1]
+psignw  m2, [pw_n1]
+psignw  m6, [pw_n1]
+pminsw  m0, m7
+pminsw  m3, m2
+

[x265] [PATCH 1 of 2] asm: separated deblocking filter into horizontal & vertical primitives for asm

2015-10-09 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1444121396 -19800
#  Tue Oct 06 14:19:56 2015 +0530
# Node ID 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
# Parent  f8ad1ff7074aab85a6cf376886014c88f46b7275
asm: separated deblocking filter into horizontal & vertical primitives for asm

diff -r f8ad1ff7074a -r 38e4b94377fa source/common/deblock.cpp
--- a/source/common/deblock.cpp Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/deblock.cpp Tue Oct 06 14:19:56 2015 +0530
@@ -280,31 +280,6 @@
  * \param maskQ   indicator to enable filtering on partQ
  * \param maskP1  decision weak filter/no filter for partP
  * \param maskQ1  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-int32_t tc2 = 2 * tc;
-int32_t tcP = (tc2 & maskP);
-int32_t tcQ = (tc2 & maskQ);
-for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-{
-int16_t m4  = (int16_t)src[0];
-int16_t m3  = (int16_t)src[-offset];
-int16_t m5  = (int16_t)src[offset];
-int16_t m2  = (int16_t)src[-offset * 2];
-int16_t m6  = (int16_t)src[offset * 2];
-int16_t m1  = (int16_t)src[-offset * 3];
-int16_t m7  = (int16_t)src[offset * 3];
-int16_t m0  = (int16_t)src[-offset * 4];
-src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + 
m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 
2) >> 2) - m2) + m2);
-src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * 
m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-src[0]   = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * 
m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-src[offset]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 
2) >> 2) - m5) + m5);
-src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * 
m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-}
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tc, int32_t maskP, int32_t maskQ,
  int32_t maskP1, int32_t maskQ1)
 {
@@ -446,7 +421,12 @@
useStrongFiltering(offset, beta, tc, src + unitOffset + 
srcStep * 3));
 
 if (sw)
-pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, 
maskQ);
+{
+int32_t tc2 = 2 * tc;
+int32_t tcP = (tc2 & maskP);
+int32_t tcQ = (tc2 & maskQ);
+primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, 
offset, tcP, tcQ);
+}
 else
 {
 int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp  Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/loopfilter.cpp  Tue Oct 06 14:19:56 2015 +0530
@@ -137,6 +137,27 @@
 rec += stride;
 }
 }
+
+void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, 
int32_t tcP, int32_t tcQ)
+{
+for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+{
+int16_t m4  = (int16_t)src[0];
+int16_t m3  = (int16_t)src[-offset];
+int16_t m5  = (int16_t)src[offset];
+int16_t m2  = (int16_t)src[-offset * 2];
+int16_t m6  = (int16_t)src[offset * 2];
+int16_t m1  = (int16_t)src[-offset * 3];
+int16_t m7  = (int16_t)src[offset * 3];
+int16_t m0  = (int16_t)src[-offset * 4];
+src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + 
m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 
2) >> 2) - m2) + m2);
+src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * 
m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+src[0]   = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * 
m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+src[offset]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 
2) >> 2) - m5) + m5);
+src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * 
m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+}
+}
 }
 
 namespace X265_NS {
@@ -151,5 +172,9 @@
 p.saoCuOrgE3[1] = processSaoCUE3;
 p.saoCuOrgB0 = processSaoCUB0;
 p.sign = calSign;
+
+// C code is same for EDGE_VER and EDGE_HOR only asm code is different
+p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/primitives.h
--- a/source/common/primitives.hThu Oct 08 15:27:34 2015 -0500
+++ b/source/common/primitives.hTue Oct 06 14:19:5

[x265] [PATCH] add 64-byte alignment macro, align NR buffer & Encoder class to cache line of 64-byte

2015-10-05 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1444107449 -19800
#  Tue Oct 06 10:27:29 2015 +0530
# Node ID 93525c471023575d500c912284a3853ee8df8991
# Parent  f8b8ebdc54578e6735216d8b9abce5ba80c05bd8
add 64-byte alignment macro, align NR buffer & Encoder class to cache line of 
64-byte

diff -r f8b8ebdc5457 -r 93525c471023 source/common/common.h
--- a/source/common/common.hMon Sep 28 14:34:41 2015 +0530
+++ b/source/common/common.hTue Oct 06 10:27:29 2015 +0530
@@ -74,6 +74,7 @@
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
 
 #if defined(__MINGW32__)
 #define fseeko fseeko64
@@ -84,6 +85,7 @@
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
 #define fseeko _fseeki64
 
 #endif // if defined(__GNUC__)
diff -r f8b8ebdc5457 -r 93525c471023 source/encoder/encoder.h
--- a/source/encoder/encoder.h  Mon Sep 28 14:34:41 2015 +0530
+++ b/source/encoder/encoder.h  Tue Oct 06 10:27:29 2015 +0530
@@ -79,7 +79,7 @@
 {
 public:
 
-ALIGN_VAR_16(uint32_t, 
m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
+ALIGN_VAR_64(uint32_t, 
m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
 uint32_t   m_countEmergency[MAX_NUM_TR_CATEGORIES];
 uint16_t   
(*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: avx2 code for sad_x3_32xN, improved over 40% than SSE

2015-09-24 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1443156551 -19800
#  Fri Sep 25 10:19:11 2015 +0530
# Node ID 310d35ed0ba85174676d0b0bb91e6b8b5f475726
# Parent  975352b2c0223b9139aad233b43eaf2113ac8167
asm: avx2 code for sad_x3_32xN, improved over 40% than SSE

diff -r 975352b2c022 -r 310d35ed0ba8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Sep 23 16:19:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Sep 25 10:19:11 2015 +0530
@@ -3587,6 +3587,12 @@
 p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
 p.planeClipAndMax = PFX(planeClipAndMax_avx2);
 
+p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
+p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
+p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2);
+p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2);
+p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2);
+
 /* The following primitives have been disabled since performance 
compared to SSE is negligible/negative */
 #if 0
 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
diff -r 975352b2c022 -r 310d35ed0ba8 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Wed Sep 23 16:19:48 2015 +0530
+++ b/source/common/x86/sad-a.asm   Fri Sep 25 10:19:11 2015 +0530
@@ -4674,6 +4674,272 @@
 movd[r5 + 8], xm1
 RET
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+%macro SAD_X3_32x8_AVX2 0
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m7, m3, m5
+paddd   m1, m7
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE]
+movum4, [r1 + r4]
+movum5, [r2 + r4]
+movum6, [r3 + r4]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 2]
+movum4, [r1 + r4 * 2]
+movum5, [r2 + r4 * 2]
+movum6, [r3 + r4 * 2]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 3]
+movum4, [r1 + r6]
+movum5, [r2 + r6]
+movum6, [r3 + r6]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+add r0, FENC_STRIDE * 4
+lea r1, [r1 + r4 * 4]
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE]
+movum4, [r1 + r4]
+movum5, [r2 + r4]
+movum6, [r3 + r4]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 2]
+movum4, [r1 + r4 * 2]
+movum5, [r2 + r4 * 2]
+movum6, [r3 + r4 * 2]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 3]
+movum4, [r1 + r6]
+movum5, [r2 + r6]
+movum6, [r3 + r6]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+%endmacro
+
+%macro PIXEL_SAD_X3_END_AVX2 0
+vextracti128   xm3, m0, 1
+vextracti128   xm4, m1, 1
+vextracti128   xm5, m2, 1
+paddd   m0, m3
+paddd   m1, m4
+paddd   m2, m5
+pshufd xm3, xm0, 2
+pshufd xm4, xm1, 2
+pshufd xm5, xm2, 2
+paddd   m0, m3
+paddd   m1, m4
+paddd   m2, m5
+
+movd[r5 + 0], xm0
+movd[r5 + 4], xm1
+movd[r5 + 8], xm2
+%endmacro
+
+cglobal pixel_sad_x3_32x8, 6,7,8
+pxorm0, m0
+pxorm1, m1
+pxorm2, m2
+lea  

Re: [x265] How can I enable the AVX2 version of DCT and IDCT?

2015-09-16 Thread Dnyaneshwar Gorade
Hi Ximing,

If your machine (and OS also) supports AVX2 instruction set then you are
already compiling and using AVX2 version of DCT functions.
x265 automatically detects & sets all assembly primitives to highest/latest
available instruction sets.

When you run encoder, you can check command prompt output if cpu
capabilities info shows AVX2 instruction set or not.
You can get the source code of DCT AVX2 functions in dct8.asm file.


Regards,
Dnyaneshwar

On Wed, Sep 16, 2015 at 6:25 PM, Ximing Cheng <chengximing1...@gmail.com>
wrote:

> I read the source code of the /source/common/vec/dct-sse3.cpp and I found
> the comments said "Note: We have AVX2 assembly for these functions, but
> since AVX2 is still somewhat rare on end-user PCs we still compile and link
> these SSE3 intrinsic SIMD functions".
>
> But now both my PC and server support Intel AVX2 instruction set. If I
> want to compile these functions with AVX2 assembly, where to find the AVX2
> version of source code of these functions?
>
> Thanks!
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 3] asm: AVX2 code for pixel_var primitive, improved over 40% than SSE

2015-09-10 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1441715051 -19800
#  Tue Sep 08 17:54:11 2015 +0530
# Node ID 89c234e68523b05550b8c5197b83849544dc97d1
# Parent  365f7ed4d89628d49cd6af8d81d4edc01f73ffad
asm: AVX2 code for pixel_var primitive, improved over 40% than SSE

diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Sep 08 17:54:11 2015 +0530
@@ -2729,6 +2729,10 @@
 #if X86_64
 if (cpuMask & X265_CPU_AVX2)
 {
+p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2);
+p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2);
+p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2);
+
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
 p.planecopy_sp = PFX(downShift_16_avx2);
diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Sep 08 17:54:11 2015 +0530
@@ -6397,6 +6397,78 @@
 movd   edx, xm6
 %endif
 RET
+
+INIT_YMM avx2
+cglobal pixel_var_32x32, 2,4,7
+VAR_START 0
+mov r2d, 16
+
+.loop:
+pmovzxbwm0, [r0]
+pmovzxbwm3, [r0 + 16]
+pmovzxbwm1, [r0 + r1]
+pmovzxbwm4, [r0 + r1 + 16]
+
+lea r0, [r0 + r1 * 2]
+
+VAR_CORE
+
+dec r2d
+jg  .loop
+
+vextracti128   xm0, m5, 1
+vextracti128   xm1, m6, 1
+paddw  xm5, xm0
+paddd  xm6, xm1
+HADDW  xm5, xm2
+HADDD  xm6, xm1
+
+%if ARCH_X86_64
+punpckldq  xm5, xm6
+movq   rax, xm5
+%else
+movd   eax, xm5
+movd   edx, xm6
+%endif
+RET
+
+INIT_YMM avx2
+cglobal pixel_var_64x64, 2,4,7
+VAR_START 0
+mov r2d, 64
+
+.loop:
+pmovzxbwm0, [r0]
+pmovzxbwm3, [r0 + 16]
+pmovzxbwm1, [r0 + mmsize]
+pmovzxbwm4, [r0 + mmsize + 16]
+
+lea r0, [r0 + r1]
+
+VAR_CORE
+
+dec r2d
+jg  .loop
+
+pxorm1, m1
+punpcklwd   m0, m5, m1
+punpckhwd   m5, m1
+paddd   m5, m0
+vextracti128   xm2, m5, 1
+vextracti128   xm1, m6, 1
+paddd  xm5, xm2
+paddd  xm6, xm1
+HADDD  xm5, xm2
+HADDD  xm6, xm1
+
+%if ARCH_X86_64
+punpckldq  xm5, xm6
+movq   rax, xm5
+%else
+movd   eax, xm5
+movd   edx, xm6
+%endif
+RET
 %endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 3] asm: avx2 code for sad_x3_32xN, improved over 40% than SSE

2015-09-10 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1441885683 -19800
#  Thu Sep 10 17:18:03 2015 +0530
# Node ID 5b5d7438e90196d7974b9ceec2130b6c924e2342
# Parent  abab4304e992b7addb65ad8fbdfe309ba57732a6
asm: avx2 code for sad_x3_32xN, improved over 40% than SSE

diff -r abab4304e992 -r 5b5d7438e901 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Sep 10 11:40:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Sep 10 17:18:03 2015 +0530
@@ -3571,6 +3571,12 @@
 p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
 p.planeClipAndMax = PFX(planeClipAndMax_avx2);
 
+p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
+p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
+p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2);
+p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2);
+p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2);
+
 /* The following primitives have been disabled since performance 
compared to SSE is negligible/negative */
 #if 0
 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
diff -r abab4304e992 -r 5b5d7438e901 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Thu Sep 10 11:40:35 2015 +0530
+++ b/source/common/x86/sad-a.asm   Thu Sep 10 17:18:03 2015 +0530
@@ -3949,6 +3949,272 @@
 movd[r5 + 8], xm1
 RET
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+;-----------------------------------------------------------------------------
+; SAD_X3_32x8_AVX2: accumulate 8 rows of 32-wide SAD against 3 references.
+; In:  r0 = fenc block (stride FENC_STRIDE), r1/r2/r3 = reference pointers
+;      (stride r4), r6 = r4 * 3 — NOTE(review): r6 must be preloaded by the
+;      caller; confirm in the cglobal prologue (outside this hunk).
+; Out: m0/m1/m2 += psadbw partial sums vs r1/r2/r3 (two dwords per 128-bit
+;      lane, reduced later by PIXEL_SAD_X3_END_AVX2).
+; Clobbers m3-m7.  r0-r3 are advanced by 4 rows at the midpoint and NOT
+; restored, so they exit pointing 4 rows into the 8 just processed.
+%macro SAD_X3_32x8_AVX2 0
+; row 0 (one 32-byte ymm load per pointer covers the full width)
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m7, m3, m5
+paddd   m1, m7
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 1
+movum3, [r0 + FENC_STRIDE]
+movum4, [r1 + r4]
+movum5, [r2 + r4]
+movum6, [r3 + r4]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 2
+movum3, [r0 + FENC_STRIDE * 2]
+movum4, [r1 + r4 * 2]
+movum5, [r2 + r4 * 2]
+movum6, [r3 + r4 * 2]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 3 (r6 = 3 * ref stride; FENC_STRIDE*3 is an immediate so no fenc helper)
+movum3, [r0 + FENC_STRIDE * 3]
+movum4, [r1 + r6]
+movum5, [r2 + r6]
+movum6, [r3 + r6]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; advance all four pointers past the first 4 rows
+lea r0, [r0 + FENC_STRIDE * 4]
+lea r1, [r1 + r4 * 4]
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+
+; row 4
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 5
+movum3, [r0 + FENC_STRIDE]
+movum4, [r1 + r4]
+movum5, [r2 + r4]
+movum6, [r3 + r4]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 6
+movum3, [r0 + FENC_STRIDE * 2]
+movum4, [r1 + r4 * 2]
+movum5, [r2 + r4 * 2]
+movum6, [r3 + r4 * 2]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+; row 7
+movum3, [r0 + FENC_STRIDE * 3]
+movum4, [r1 + r6]
+movum5, [r2 + r6]
+movum6, [r3 + r6]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; PIXEL_SAD_X3_END_AVX2: horizontally reduce the three SAD accumulators
+; m0/m1/m2 (psadbw leaves one partial sum in dword 0 and one in dword 2 of
+; each 128-bit lane) and store the three 32-bit results to res[0..2] at r5.
+; Clobbers m3-m5.
+%macro PIXEL_SAD_X3_END_AVX2 0
+; fold upper 128-bit lane into the lower one
+vextracti128   xm3, m0, 1
+vextracti128   xm4, m1, 1
+vextracti128   xm5, m2, 1
+paddd   m0, m3
+paddd   m1, m4
+paddd   m2, m5
+; bring dword 2 down and add, leaving the total in dword 0
+pshufd xm3, xm0, 2
+pshufd xm4, xm1, 2
+pshufd xm5, xm2, 2
+paddd   m0, m3
+paddd   m1, m4
+paddd   m2, m5
+
+; res[0], res[1], res[2]
+movd[r5 + 0], xm0
+movd[r5 + 4], xm1
+movd[r5 + 8], xm2
+%endmacro
+
+cglobal pixel_sad_x3_32x8, 6,7,8
+pxorm0, m0
+pxorm1, m1
+pxorm2, m2
+lea  

[x265] [PATCH 3 of 3] asm: avx2 code for sad_x3_64xN, improved over 40% than SSE

2015-09-10 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1441886472 -19800
#  Thu Sep 10 17:31:12 2015 +0530
# Node ID d31b9e8bdcf4f5fac2e3f0c567f1c90c1d19a382
# Parent  5b5d7438e90196d7974b9ceec2130b6c924e2342
asm: avx2 code for sad_x3_64xN, improved over 40% than SSE

diff -r 5b5d7438e901 -r d31b9e8bdcf4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Sep 10 17:18:03 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Sep 10 17:31:12 2015 +0530
@@ -3576,6 +3576,11 @@
 p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2);
 p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2);
 p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2);
+p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx2);
+p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx2);
+p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
+p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
+p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
 
 /* The following primitives have been disabled since performance 
compared to SSE is negligible/negative */
 #if 0
diff -r 5b5d7438e901 -r d31b9e8bdcf4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm   Thu Sep 10 17:18:03 2015 +0530
+++ b/source/common/x86/sad-a.asm   Thu Sep 10 17:31:12 2015 +0530
@@ -4054,6 +4054,372 @@
 paddd   m2, m3
 %endmacro
 
+%macro SAD_X3_64x8_AVX2 0
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + mmsize]
+movum4, [r1 + mmsize]
+movum5, [r2 + mmsize]
+movum6, [r3 + mmsize]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE]
+movum4, [r1 + r4]
+movum5, [r2 + r4]
+movum6, [r3 + r4]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE + mmsize]
+movum4, [r1 + r4 + mmsize]
+movum5, [r2 + r4 + mmsize]
+movum6, [r3 + r4 + mmsize]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 2]
+movum4, [r1 + r4 * 2]
+movum5, [r2 + r4 * 2]
+movum6, [r3 + r4 * 2]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 2 + mmsize]
+movum4, [r1 + r4 * 2 + mmsize]
+movum5, [r2 + r4 * 2 + mmsize]
+movum6, [r3 + r4 * 2 + mmsize]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 3]
+movum4, [r1 + r6]
+movum5, [r2 + r6]
+movum6, [r3 + r6]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + FENC_STRIDE * 3 + mmsize]
+movum4, [r1 + r6 + mmsize]
+movum5, [r2 + r6 + mmsize]
+movum6, [r3 + r6 + mmsize]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+lea r0, [r0 + FENC_STRIDE * 4]
+lea r1, [r1 + r4 * 4]
+lea r2, [r2 + r4 * 4]
+lea r3, [r3 + r4 * 4]
+
+movum3, [r0]
+movum4, [r1]
+movum5, [r2]
+movum6, [r3]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+paddd   m1, m4
+psadbw  m3, m6
+paddd   m2, m3
+
+movum3, [r0 + mmsize]
+movum4, [r1 + mmsize]
+movum5, [r2 + mmsize]
+movum6, [r3 + mmsize]
+
+psadbw  m7, m3, m4
+paddd   m0, m7
+psadbw  m4, m3, m5
+   

[x265] [PATCH] asm: fix crash as NR buffer is not aligned to 16-byte boundary

2015-09-10 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1441865435 -19800
#  Thu Sep 10 11:40:35 2015 +0530
# Node ID abab4304e992b7addb65ad8fbdfe309ba57732a6
# Parent  89c234e68523b05550b8c5197b83849544dc97d1
asm: fix crash as NR buffer is not aligned to 16-byte boundary

diff -r 89c234e68523 -r abab4304e992 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmTue Sep 08 17:54:11 2015 +0530
+++ b/source/common/x86/dct8.asmThu Sep 10 11:40:35 2015 +0530
@@ -2115,15 +2115,15 @@
 mova m0, [r0]
 pabswm1, m0
 
-mova m2, [r1]
+movu m2, [r1]
 pmovsxwd m3, m1
 padddm2, m3
-mova [r1], m2
-mova m2, [r1 + 16]
+movu [r1], m2
+movu m2, [r1 + 16]
 psrldq   m3, m1, 8
 pmovsxwd m4, m3
 padddm2, m4
-mova [r1 + 16], m2
+movu [r1 + 16], m2
 
 movu m3, [r2]
 psubusw  m1, m3
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 2] asm: avx2 asm for intra_ang32 mode 16 & 20

2015-09-07 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1441085487 -19800
#  Tue Sep 01 11:01:27 2015 +0530
# Node ID 3238ecbdbdf551a69bcd0dfdf8391f6462db45ac
# Parent  e1adac00dce8e5641cbe9aec3d50a72261c308d9
asm: avx2 asm for intra_ang32 mode 16 & 20
improved mode16 6000c -> 2200c and mode 20 3700c -> 1400c

diff -r e1adac00dce8 -r 3238ecbdbdf5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Sep 03 14:41:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Sep 01 11:01:27 2015 +0530
@@ -3004,6 +3004,8 @@
 p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
 p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
 p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2);
+p.cu[BLOCK_32x32].intra_pred[16] = PFX(intra_pred_ang32_16_avx2);
+p.cu[BLOCK_32x32].intra_pred[20] = PFX(intra_pred_ang32_20_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r e1adac00dce8 -r 3238ecbdbdf5 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Sep 03 14:41:06 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Tue Sep 01 11:01:27 2015 +0530
@@ -448,6 +448,17 @@
 db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11, 
 9,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 12, 10,  8,  7,  5,  3, 
 1
 const ang32_shuf_mode21,db 15, 15, 13, 13, 11, 11,  9,  9,  8,  8,  6, 
 6,  4,  4,  2,  2, 14, 14, 12, 12, 10, 10,  8,  8,  7,  7,  5,  5,  3,  3,  1, 
 1
 
+const ang32_fact_mode16,db (32-11), 11, (32-22), 22, (32- 1),  1, 
(32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
+db (32- 3),  3, (32-14), 14, (32-25), 25, (32- 
4),  4, (32-15), 15, (32-26), 26, (32- 5),  5, (32-16), 16
+db (32-27), 27, (32- 6),  6, (32-17), 17, 
(32-28), 28, (32- 7),  7, (32-18), 18, (32-29), 29, (32- 8),  8
+db (32-19), 19, (32-30), 30, (32- 9),  9, 
(32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0),  0
+const ang32_shuf_mode16,db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 
12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8,  7,  8,  6,  7,  5,  6,  5,  6,  4, 
 5
+db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 
12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7,  6,  7,  5,  6,  5, 
 6
+db  0,  0,  0,  0, 15, 14, 12, 11,  9,  8,  6, 
 5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 13, 11, 10,  8,  7,  5,  4,  2, 
 1
+dd  7,  1,  2,  3,  7,  1,  2,  3
+const ang32_shuf_mode20,db 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0, 
 0,  0,  0, 14, 15,  8,  7,  5,  4,  2,  1,  0,  0, 14, 13, 13, 11, 11, 10, 10, 
 8
+db  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7, 
 7,  8,  8,  9,  9,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  1,  1,  0, 
 0
+
 const ang_table
 %assign x 0
 %rep 32
@@ -17100,6 +17111,728 @@
 movu[r0 + r4], m8
 RET
 
+cglobal intra_pred_ang32_16, 3,4,10
+movum0, [ang32_fact_mode16]
+movum1, [ang32_fact_mode16 + mmsize]
+movam2, [pw_1024]
+movam7, [ang32_shuf_mode16]
+movam8, [ang32_shuf_mode16 + mmsize]
+lea r3, [r1 * 3]
+
+; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11,  9, 
 8,  6,  5,  3,  2,  0, -1, -2...]
+
+movum6, [r2]
+pshufb  m6, [ang32_shuf_mode16 + mmsize*2]
+movam9, m6
+movam3, [ang32_shuf_mode16 + mmsize*3]
+vpermd  m6, m3, m6
+vpermq  m9, m9, q3232
+pslldq  m9, 4
+palignr m6, m9, 15
+pslldq  m9, 1
+
+vbroadcasti128  m3, [r2 + mmsize*2 + 1]
+
+palignr m4, m3, m6, 1
+palignr m5, m6, m9, 6
+pshufb  m4, m7
+pshufb  m5, m8
+pmaddubsw   m4, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+vpermq  m4, m4, q3120
+movu[r0], m4
+
+palignr m4, m3, m6, 2
+palignr m5, m6, m9, 7
+pshufb  m4, m7
+pshufb  m5, m8
+pmaddubsw   m4, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+vpermq  m4, m4, q3120
+movu[r0 + r1], m4
+
+palignr  

[x265] [PATCH] asm: fix dynamic range of input to quant primitive

2015-08-27 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1440736935 -19800
#  Fri Aug 28 10:12:15 2015 +0530
# Node ID dce85f739efeea842e490a0f555d4abdc89a5c80
# Parent  905c4f2e203ec082bd50b361865a7d4d297e45ce
asm: fix dynamic range of input to quant primitive

diff -r 905c4f2e203e -r dce85f739efe source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp  Thu Aug 27 10:13:56 2015 +0530
+++ b/source/test/mbdstharness.cpp  Fri Aug 28 10:12:15 2015 +0530
@@ -215,8 +215,14 @@
 uint32_t optReturnValue = 0;
 uint32_t refReturnValue = 0;
 
-int bits = (rand() % 24) + 8;
-int valueToAdd = rand() % (1 << bits);
+int sliceType = rand() % 2;
+int log2TrSize = rand() % 4 + 2;
+int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
+int per = qp / 6;
+int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
+
+int bits = QUANT_SHIFT + per + transformShift;
+int valueToAdd = (sliceType == 1 ? 171 : 85) << (bits - 9);
 int cmp_size = sizeof(int) * height * width;
 int cmp_size1 = sizeof(short) * height * width;
 int numCoeff = height * width;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 5 of 7] asm: avx2 asm for intra_ang32 mode 12, 4758c-1474c

2015-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1440583211 -19800
#  Wed Aug 26 15:30:11 2015 +0530
# Node ID cb3f520f9942080d05ca1b3ba2cae0c1b4bcb345
# Parent  a27ac3b998f5677570a48285d22e1b771c08ab75
asm: avx2 asm for intra_ang32 mode 12, 4758c-1474c
updated intra_ang_32 mode 25 AVX2 asm code, improved 1438c-1270c

diff -r a27ac3b998f5 -r cb3f520f9942 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 25 11:02:17 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 26 15:30:11 2015 +0530
@@ -3000,6 +3000,7 @@
 p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
 p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
+p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r a27ac3b998f5 -r cb3f520f9942 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Tue Aug 25 11:02:17 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Wed Aug 26 15:30:11 2015 +0530
@@ -262,26 +262,6 @@
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 
6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 
21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
-   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
 c_ang32_mode_23:  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
@@ -494,6 +474,15 @@
 const ang32_shuf_mode11,times 8 db 1, 2
 times 8 db 0, 1
 
+const ang32_fact_mode12,db (32-27), 27, (32-22), 22, (32-17), 17, 
(32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
+db (32-11), 11, (32- 6),  6, (32- 1),  1, 
(32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8),  8
+db (32-19), 19, (32-14), 14, (32- 9),  9, (32- 
4),  4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
+db (32- 3),  3, (32-30), 30, (32-25), 25, 
(32-20), 20, (32-15), 15

[x265] [PATCH 7 of 7] asm: avx2 asm for intra_ang32 mode 14, 5600c-1400c

2015-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1440583506 -19800
#  Wed Aug 26 15:35:06 2015 +0530
# Node ID 40ae6c49fa489dc995f78d93a35b441639e0847d
# Parent  00b26e64fd2c42bcb9652668721f6953d8f2eb0f
asm: avx2 asm for intra_ang32 mode 14, 5600c-1400c
updated intra_ang_32 mode 22 AVX2 asm code, improved 2300c-1300c

diff -r 00b26e64fd2c -r 40ae6c49fa48 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 26 15:32:14 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 26 15:35:06 2015 +0530
@@ -3002,6 +3002,7 @@
 p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
 p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
 p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
+p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 00b26e64fd2c -r 40ae6c49fa48 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Wed Aug 26 15:32:14 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Wed Aug 26 15:35:06 2015 +0530
@@ -262,27 +262,6 @@
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang32_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
- db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 
20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
- db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
- db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
- db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
- db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
- db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
- db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
- db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
- db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
- db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
- db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
- db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
- db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
- db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
- db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
- db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
- db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
 c_ang32_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
  db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
@@ -471,6 +450,15 @@
 db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 
 0, 14, 11,  7,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  9,  5, 
 2
 const ang32_shuf_mode23,db  0,  0,  0,  0,  0,  0,  0,  0, 14, 14, 11, 
11,  7,  7,  4,  4,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  5,  5,  2, 
 2
 
+const ang32_fact_mode14,db (32-19), 19

[x265] [PATCH 3 of 7] asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c

2015-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1440479904 -19800
#  Tue Aug 25 10:48:24 2015 +0530
# Node ID 630bae9a91392fdf9a327673f7c00eeedf60139f
# Parent  0409b136c208cb944fb76bfd400e76ba43e330a8
asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c

updated intra_ang_32 mode 25 AVX2 asm code, improved 1300c-1184c
removed unnecessary constants from previous asm

diff -r 0409b136c208 -r 630bae9a9139 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 25 10:53:32 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 25 10:48:24 2015 +0530
@@ -2999,6 +2999,7 @@
 p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
 p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
+p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 0409b136c208 -r 630bae9a9139 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Tue Aug 25 10:53:32 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Tue Aug 25 10:48:24 2015 +0530
@@ -262,24 +262,6 @@
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
 c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
@@ -509,6 +491,9 @@
 const ang32_shuf_mode9, times 8 db 0, 1
 times 8 db 1, 2
 
+const ang32_shuf_mode11,times 8 db 1, 2
+times 8 db 0, 1
+
 const ang_table
 %assign x 0
 %rep 32
@@ -14020,6 +14005,578 @@
 movu[r0 + r4], m3
 RET
 
+cglobal intra_pred_ang32_11, 3,4,8
+vbroadcasti128  m0, [angHor_tab_11]
+vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
+movam2, [pw_1024]
+movam7, [ang32_shuf_mode11]
+lea r3, [r1 * 3]
+
+; prepare for [16 0 -1 -2 ...]
+movu   xm3, [r2 + mmsize*2 -  1]
+vbroadcasti128  m6

[x265] [PATCH] asm: avx2 asm for intra_ang32 mode 15, 5700c-1600c

2015-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1440650636 -19800
#  Thu Aug 27 10:13:56 2015 +0530
# Node ID 905c4f2e203ec082bd50b361865a7d4d297e45ce
# Parent  40ae6c49fa489dc995f78d93a35b441639e0847d
asm: avx2 asm for intra_ang32 mode 15, 5700c-1600c
updated intra_ang_32 mode 21 AVX2 asm code, improved 2670c-1330c

diff -r 40ae6c49fa48 -r 905c4f2e203e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 26 15:35:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Aug 27 10:13:56 2015 +0530
@@ -3003,6 +3003,7 @@
 p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
 p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
 p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
+p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 40ae6c49fa48 -r 905c4f2e203e source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Wed Aug 26 15:35:06 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Thu Aug 27 10:13:56 2015 +0530
@@ -262,26 +262,6 @@
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang32_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
- db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
- db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
- db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
- db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
- db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
- db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
- db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
- db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
- db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 
20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
- db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
- db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 
24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
- db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
- db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
- db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
- db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
 intra_pred_shuff_0_4:times 4 db 0, 1, 1, 2, 2, 3, 3, 4
 intra_pred4_shuff1:  db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 
1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
 intra_pred4_shuff2:  db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 
2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
@@ -459,6 +439,15 @@
 db  0,  0,  0,  0,  0,  0,  0,  0, 15, 12, 10, 
 7,  5,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 11,  9,  6,  4, 
 1
 const ang32_shuf_mode22,db  0,  0, 15, 15, 13, 13, 10, 10,  8,  8,  5, 
 5,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  7,  7,  4,  4, 
 2
 
+const ang32_fact_mode15,db (32-15), 15, (32-30), 30, (32-13), 13, 
(32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
+db (32-31), 31, (32-14), 14, (32-29), 29, 
(32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8),  8
+db (32- 7),  7, (32-22), 22, (32- 5),  5, 
(32-20), 20, (32- 3),  3, (32-18), 18, (32- 1),  1, (32-16

Re: [x265] [PATCH] asm: disabled 10bpp AVX & AVX2 primitives having less than 10% speed up over SSE

2015-08-19 Thread Dnyaneshwar Gorade
right.. you can send it to mailing list

On Wed, Aug 19, 2015 at 3:26 PM, aasaipr...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Aasaipriya Chandran aasaipr...@multicorewareinc.com
 # Date 1439972978 -19800
 #  Wed Aug 19 13:59:38 2015 +0530
 # Node ID 8a45cff3182fa9f6e07493434711247d58f22cc4
 # Parent  2980141a744a569ad6f60dbebdece76a4eababfd
 asm: disabled 10bpp AVX & AVX2 primitives having less than 10% speed up
 over SSE

 these primitives are slower than SSE primitives

 diff -r 2980141a744a -r 8a45cff3182f source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Tue Aug 18 12:45:52 2015
 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Wed Aug 19 13:59:38 2015
 +0530
 @@ -1185,7 +1185,6 @@
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd =
 PFX(pixel_satd_8x32_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd =
 PFX(pixel_satd_8x4_avx);

 -ALL_LUMA_PU(satd, pixel_satd, avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd =
 PFX(pixel_satd_8x8_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd =
 PFX(pixel_satd_8x4_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd =
 PFX(pixel_satd_8x16_avx);
 @@ -1194,15 +1193,10 @@
  p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd =
 PFX(pixel_satd_24x32_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd =
 PFX(pixel_satd_4x16_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd =
 PFX(pixel_satd_4x8_avx);
 -#if X265_DEPTH = 10
 -ASSIGN_SA8D(avx);
 -#endif
 +
  p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d =
 PFX(pixel_sa8d_8x8_avx);
  p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d =
 PFX(pixel_sa8d_16x16_avx);
  p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d =
 PFX(pixel_sa8d_32x32_avx);
 -LUMA_VAR(avx);
 -p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
 -p.ssim_end_4 = PFX(pixel_ssim_end4_avx);

  // copy_pp primitives
  // 16 x N
 @@ -1299,6 +1293,20 @@
  p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
  p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
  p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
 +
 +/* The following primitives have been disabled since performance
 compared to SSE is negligible/negative */
 +#if 0
 +ALL_LUMA_PU(satd, pixel_satd, avx);
 +
 +p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
 +p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
 +
 +LUMA_VAR(avx);
 +
 +#if X265_DEPTH = 10
 +   ASSIGN_SA8D(avx);
 +#endif
 +#endif
  }
  if (cpuMask & X265_CPU_XOP)
  {
 @@ -1414,11 +1422,8 @@
  p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
  p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);

 -p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_avx2);
 -p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_avx2);
  p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_avx2);
  p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_avx2);
 -p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
  p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_avx2);
  p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_avx2);
  p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
 @@ -1438,12 +1443,10 @@
  p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);

  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   =
 PFX(addAvg_8x2_avx2);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   =
 PFX(addAvg_8x4_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   =
 PFX(addAvg_8x6_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   =
 PFX(addAvg_8x8_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  =
 PFX(addAvg_8x16_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  =
 PFX(addAvg_8x32_avx2);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg =
 PFX(addAvg_12x16_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  =
 PFX(addAvg_16x4_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  =
 PFX(addAvg_16x8_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg =
 PFX(addAvg_16x12_avx2);
 @@ -1457,18 +1460,15 @@
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg =
 PFX(addAvg_8x16_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg =
 PFX(addAvg_16x32_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg =
 PFX(addAvg_32x64_avx2);
 -p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg =
 PFX(addAvg_8x8_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg =
 PFX(addAvg_16x16_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg =
 PFX(addAvg_8x32_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg =
 PFX(addAvg_32x32_avx2);
 

Re: [x265] [PATCH] asm: disabled 10bpp AVX & AVX2 primitives having less than 3% speed up over SSE

2015-08-18 Thread Dnyaneshwar Gorade
right.. but a small correction - in #if 0 & #endif disable only the specific
primitives and not all sizes (expand the macro & keep only those with less than 3% speed up)

On Tue, Aug 18, 2015 at 12:05 PM, aasaipr...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Aasaipriya Chandran aasaipr...@multicorewareinc.com
 # Date 1439879745 -19800
 #  Tue Aug 18 12:05:45 2015 +0530
 # Node ID 2d0d8be0f401aa4eac554a280118376a991f5475
 # Parent  996ebce8c874fc511d495cee227d24413e99d0c1
 asm: disabled 10bpp AVX & AVX2 primitives having less than 3% speed up
 over SSE

 these primitives are slower than SSE primitives

 diff -r 996ebce8c874 -r 2d0d8be0f401 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Mon Aug 17 10:52:15 2015
 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Tue Aug 18 12:05:45 2015
 +0530
 @@ -1169,7 +1169,6 @@
  }
  if (cpuMask & X265_CPU_AVX)
  {
 -// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d =
 PFX(pixel_satd_4x4_avx); fails tests
  p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd =
 PFX(pixel_satd_16x24_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd =
 PFX(pixel_satd_32x48_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd =
 PFX(pixel_satd_24x64_avx);
 @@ -1177,32 +1176,36 @@
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd =
 PFX(pixel_satd_8x12_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd =
 PFX(pixel_satd_12x32_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd =
 PFX(pixel_satd_4x32_avx);
 -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd =
 PFX(pixel_satd_4x8_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd =
 PFX(pixel_satd_8x16_avx);
 -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd =
 PFX(pixel_satd_4x4_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd =
 PFX(pixel_satd_8x8_avx);
 -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd =
 PFX(pixel_satd_4x16_avx);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd =
 PFX(pixel_satd_8x32_avx);
 -p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd =
 PFX(pixel_satd_8x4_avx);
 -
 -ALL_LUMA_PU(satd, pixel_satd, avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd =
 PFX(pixel_satd_8x8_avx);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd =
 PFX(pixel_satd_8x4_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd =
 PFX(pixel_satd_8x16_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd =
 PFX(pixel_satd_8x32_avx);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd =
 PFX(pixel_satd_12x16_avx);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd =
 PFX(pixel_satd_24x32_avx);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd =
 PFX(pixel_satd_4x16_avx);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd =
 PFX(pixel_satd_4x8_avx);
 -#if X265_DEPTH = 10
 -ASSIGN_SA8D(avx);
 -#endif
 -p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d =
 PFX(pixel_sa8d_8x8_avx);
 -p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d =
 PFX(pixel_sa8d_16x16_avx);
 -p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d =
 PFX(pixel_sa8d_32x32_avx);
 -LUMA_VAR(avx);
 -p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
 -p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
 +
 +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx);
 +p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);
 +p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);
 +p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);
 +p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx);
 +p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx);
 +p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);
 +p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);
 +p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx);
 +p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);
 +p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);
 +p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx);
 +p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);
 +p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);
 +p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx);
 +p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx);
 +p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);
 +p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);
 +p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);
 +p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);
 +
 +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);

  // copy_pp primitives
  // 16 x N
 @@ -1299,6 +1302,33 @@
  p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
  p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
  p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
 +
 +/* The 

Re: [x265] [PATCH 1 of 5] asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm

2015-08-18 Thread Dnyaneshwar Gorade
Hi, I wrote this code before we found the new algorithm. Sure, I will compare
it with the new algorithm once I finish the remaining modes which don't have AVX2 asm.

On Tue, Aug 18, 2015 at 8:50 PM, chen chenm...@163.com wrote:

 This is an old algorithm — it needs a transpose. Could you compare it to the new
 algorithm?


 At 2015-08-18 12:11:35,dnyanesh...@multicorewareinc.com wrote:
 # HG changeset patch
 # User Dnyaneshwar G dnyanesh...@multicorewareinc.com
 # Date 1439531917 -19800
 #  Fri Aug 14 11:28:37 2015 +0530
 # Node ID 5ed23f786ea8f98e003189a537f960e4ff16201f
 # Parent  996ebce8c874fc511d495cee227d24413e99d0c1
 asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm
 
 updated intra_ang_32 mode 27 AVX2 asm code, improved over 3% than previous 
 AVX2 code
 removed unnecessary constants from previous asm


 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel


___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 5] asm: avx2 asm for intra_ang32 mode 11, 4550c -> 1326c

2015-08-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439812025 -19800
#  Mon Aug 17 17:17:05 2015 +0530
# Node ID 43c9ec65927666db1316efe63d112bd8f9cb5f35
# Parent  8752daab2f07711c556dfffa9a733b7278484479
asm: avx2 asm for intra_ang32 mode 11, 4550c -> 1326c

diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 14 18:27:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 17 17:17:05 2015 +0530
@@ -3027,6 +3027,7 @@
 p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
 p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
 p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
+p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Aug 14 18:27:44 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Mon Aug 17 17:17:05 2015 +0530
@@ -440,6 +440,9 @@
 const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, 
(32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
  db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, 
(32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
 
+const ang32_shuf_mode11,times 8 db 1, 2
+times 8 db 0, 1
+
 const ang_table
 %assign x 0
 %rep 32
@@ -13627,6 +13630,325 @@
 movu[r0 + r4], m3
 RET
 
+cglobal intra_pred_ang32_11, 3,4,8
+vbroadcasti128  m0, [angHor_tab_11]
+vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
+movam2, [pw_1024]
+movam7, [ang32_shuf_mode11]
+lea r3, [r1 * 3]
+
+; prepare for [16 0 -1 -2 ...]
+movu   xm3, [r2 + mmsize*2 -  1]
+vbroadcasti128  m6, [r2 + mmsize*2 + 15]
+
+pinsrb xm3, [r2 +  0], 1
+pinsrb xm3, [r2 + 16], 0
+vinserti128 m3, m3, xm3, 1  ; [16  0  1  2  3  4  5  6  7  
8  9 10 11 12 13 14 16  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
+
+pshufb  m5, m3, m7  ; [ 0  1  0  1  0  1  0  1  0  
1  0  1  0  1  0  1 16  0 16  0 16  0 16  0 16  0 16  0 16  0 16  0]
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0], m4
+
+palignr m5, m6, m3, 1
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1], m4
+
+palignr m5, m6, m3, 2
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1 * 2], m4
+
+palignr m5, m6, m3, 3
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r3], m4
+
+lea r0, [r0 + r1 * 4]
+
+palignr m5, m6, m3, 4
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0], m4
+
+palignr m5, m6, m3, 5
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1], m4
+
+palignr m5, m6, m3, 6
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1 * 2], m4
+
+palignr m5, m6, m3, 7
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r3], m4
+
+lea r0, [r0 + r1 * 4]
+
+palignr m5, m6, m3, 8
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswb

[x265] [PATCH 2 of 5] asm: AVX2 asm for intra_ang_32 mode 10, 816c -> 452c

2015-08-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439557064 -19800
#  Fri Aug 14 18:27:44 2015 +0530
# Node ID 8752daab2f07711c556dfffa9a733b7278484479
# Parent  5ed23f786ea8f98e003189a537f960e4ff16201f
asm: AVX2 asm for intra_ang_32 mode 10, 816c -> 452c

diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 14 11:28:37 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 14 18:27:44 2015 +0530
@@ -3026,6 +3026,7 @@
 p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
 p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
 p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
+p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Aug 14 11:28:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Aug 14 18:27:44 2015 +0530
@@ -462,6 +462,7 @@
 %endrep
 
 SECTION .text
+cextern pb_1
 cextern pw_2
 cextern pw_3
 cextern pw_4
@@ -13500,6 +13501,132 @@
 call ang32_mode_9_27_avx2
 RET
 
+cglobal intra_pred_ang32_10, 5,5,4
+pxorm0, m0
+movam1, [pb_1]
+lea r4, [r1 * 3]
+
+vbroadcasti128  m2, [r2 + mmsize*2 + 1]
+
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+pxorm0, m0
+vbroadcasti128  m2, [r2 + mmsize*2 + mmsize/2 + 1]
+
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+
+lea r0, [r0 + r1 * 4]
+
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r1 * 2], m3
+paddb   m0, m1
+pshufb  m3, m2, m0
+movu[r0 + r4], m3
+RET

[x265] [PATCH 1 of 5] asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm

2015-08-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439531917 -19800
#  Fri Aug 14 11:28:37 2015 +0530
# Node ID 5ed23f786ea8f98e003189a537f960e4ff16201f
# Parent  996ebce8c874fc511d495cee227d24413e99d0c1
asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm

updated intra_ang_32 mode 27 AVX2 asm code, improved over 3% than previous AVX2 
code
removed unnecessary constants from previous asm

diff -r 996ebce8c874 -r 5ed23f786ea8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 17 10:52:15 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Aug 14 11:28:37 2015 +0530
@@ -3025,6 +3025,7 @@
 p.cu[BLOCK_32x32].intra_pred[6]  = PFX(intra_pred_ang32_6_avx2);
 p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
 p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
+p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 996ebce8c874 -r 5ed23f786ea8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Mon Aug 17 10:52:15 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Aug 14 11:28:37 2015 +0530
@@ -259,26 +259,6 @@
  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-ALIGN 32
-c_ang32_mode_27:db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 
20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
 ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
@@ -13279,6 +13259,247 @@
 call ang32_mode_8_28_avx2
 RET
 
+cglobal ang32_mode_9_27_avx2
+testr7d,r7d
+; rows 0 to 7
+movum0, [r2 +  1]   ; [32 31 30 29 28 27 26 25 24 
23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+movum1, [r2 +  2]   ; [33 32 31 30 29 28 27 26 25 
24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
+movum3, [r2 + 17]   ; [48 47 46 45 44 43 42 41 40 
39 38

Re: [x265] [PATCH] asm: disabled 10bpp AVX AVX2 primitives having less than 3% speed up over SSE

2015-08-17 Thread Dnyaneshwar Gorade
merge earlier patch (asm: disabled 10bpp AVX) into this one and send
again to avoid confusion.

2015-08-17 17:44 GMT+05:30 aasaipr...@multicorewareinc.com:

 # HG changeset patch
 # User Aasaipriya Chandran aasaipr...@multicorewareinc.com
 # Date 1439813601 -19800
 #  Mon Aug 17 17:43:21 2015 +0530
 # Node ID 458c015656c2f66ffc696484712540e1b8e6588d
 # Parent  4a6143fe6658534aec83c9ba3db386d118550196
 asm: disabled 10bpp AVX & AVX2 primitives having less than 3% speed up
 over SSE

 these primitives are slower than SSE primitives

 diff -r 4a6143fe6658 -r 458c015656c2 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Mon Aug 17 11:56:37 2015
 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Mon Aug 17 17:43:21 2015
 +0530
 @@ -1205,10 +1205,6 @@
  p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);
  p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);

 -p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d =
 PFX(pixel_sa8d_8x8_avx);
 -p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d =
 PFX(pixel_sa8d_16x16_avx);
 -p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d =
 PFX(pixel_sa8d_32x32_avx);
 -
  p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);

  // copy_pp primitives
 @@ -1326,6 +1322,9 @@
  p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx)

  ASSIGN_SA8D(avx);
 +p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d =
 PFX(pixel_sa8d_8x8_avx);
 +p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d =
 PFX(pixel_sa8d_16x16_avx);
 +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d =
 PFX(pixel_sa8d_32x32_avx);

  p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
  p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
 @@ -1427,12 +1426,6 @@
  p.cu[BLOCK_32x32].intra_pred[34]=
 PFX(intra_pred_ang32_2_avx2);

  p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
 -p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
 -p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
 -p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
 -p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
 -p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
 -p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
  p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
  p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
  p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
 @@ -1445,11 +1438,8 @@
  p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
  p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);

 -p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_avx2);
 -p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_avx2);
  p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_avx2);
  p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_avx2);
 -p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
  p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_avx2);
  p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_avx2);
  p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
 @@ -1468,13 +1458,9 @@
  p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
  p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);

 -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   =
 PFX(addAvg_8x2_avx2);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   =
 PFX(addAvg_8x4_avx2);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   =
 PFX(addAvg_8x6_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   =
 PFX(addAvg_8x8_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  =
 PFX(addAvg_8x16_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  =
 PFX(addAvg_8x32_avx2);
 -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg =
 PFX(addAvg_12x16_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  =
 PFX(addAvg_16x4_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  =
 PFX(addAvg_16x8_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg =
 PFX(addAvg_16x12_avx2);
 @@ -1484,7 +1470,6 @@
  p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg =
 PFX(addAvg_32x16_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg =
 PFX(addAvg_32x24_avx2);
  p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg =
 PFX(addAvg_32x32_avx2);
 -
  p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg =
 PFX(addAvg_8x16_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg =
 PFX(addAvg_16x32_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg =
 PFX(addAvg_32x64_avx2);
 @@ -1494,12 +1479,10 @@
  p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg =
 PFX(addAvg_32x32_avx2);
  p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg =
 PFX(addAvg_16x64_avx2);
  

[x265] [PATCH 5 of 5] asm: optimized intra_ang16 mode 11 avx2 asm, 520c -> 370c

2015-08-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439816850 -19800
#  Mon Aug 17 18:37:30 2015 +0530
# Node ID 6ff0bcad1688f5ee1e393c648739ed2ae7e79b61
# Parent  e75f3a2f1d29f01ca2d71f1b8be970d471b5e1f6
asm: optimized intra_ang16 mode 11 avx2 asm, 520c -> 370c

diff -r e75f3a2f1d29 -r 6ff0bcad1688 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Mon Aug 17 17:24:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Mon Aug 17 18:37:30 2015 +0530
@@ -425,6 +425,9 @@
 const ang32_shuf_mode11,times 8 db 1, 2
 times 8 db 0, 1
 
+const ang16_shuf_mode11,times 8 db 0, 1
+times 8 db 1, 2
+
 const ang_table
 %assign x 0
 %rep 32
@@ -15630,130 +15633,106 @@
 INTRA_PRED_TRANS_STORE_16x16
 RET
 
-
-INIT_YMM avx2
-cglobal intra_pred_ang16_11, 3,4,5
-movam0, [angHor_tab_11]
-movam1, [pw_1024]
+INIT_YMM avx2
+cglobal intra_pred_ang16_11, 3,4,8
+vbroadcasti128  m0, [angHor_tab_11]
+vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
+movam2, [pw_1024]
+movam7, [ang16_shuf_mode11]
 lea r3, [r1 * 3]
 
 ; prepare for [0 -1 -2 ...]
-movu   xm2, [r2 + 32]
 ; TODO: input reference pixel buffer need a duplicate of pixel_lt to avoid 
reduce instruction in every mode
-pinsrb xm2, [r2], 0
-pshufb xm2, [intra_pred_shuff_0_8]  ; [0 1 1 2 2 3 3 4 4 5 
5 6 6 7 7 8]
-
-
-vpbroadcastwm3, xm2 ; word [1 0]
-psrldq xm2, 2
-vpbroadcastwm4, xm2 ; word [2 1]
-psrldq xm2, 2
-pmaddubsw   m3, m0
-pmaddubsw   m4, m0
-pmulhrswm3, m1
-pmulhrswm4, m1
-packuswbm3, m4
-vpermq  m3, m3, q3120
-movu[r0], xm3
-vextracti128[r0 + r1], m3, 1
-
-vpbroadcastwm3, xm2 ; word [3 2]
-psrldq xm2, 2
-vpbroadcastwm4, xm2 ; word [4 3]
-psrldq xm2, 2
-pmaddubsw   m3, m0
-pmaddubsw   m4, m0
-pmulhrswm3, m1
-pmulhrswm4, m1
-packuswbm3, m4
-vpermq  m3, m3, q3120
-movu[r0 + r1 * 2], xm3
-vextracti128[r0 + r3], m3, 1
+movu   xm3, [r2 + mmsize]
+pinsrb xm3, [r2], 0
+vbroadcasti128  m6, [r2 + mmsize + 16]
+vinserti128 m3, m3, xm3, 1
+
+pshufb  m5, m3, m7  ; [ 0  1  0  1  0  1  0  1  0  
1  0  1  0  1  0  1  1  2  1  2  1  2  1  2  1  2  1  2  1  2  1  2]
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0], xm4
+vextracti128[r0 + r1], m4, 1
+
+palignr m5, m6, m3, 2
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1 * 2], xm4
+vextracti128[r0 + r3], m4, 1
+
 lea r0, [r0 + r1 * 4]
 
-vpbroadcastwm3, xm2 ; word [5 4]
-psrldq xm2, 2
-vpbroadcastwm4, xm2 ; word [6 5]
-psrldq xm2, 2
-pmaddubsw   m3, m0
-pmaddubsw   m4, m0
-pmulhrswm3, m1
-pmulhrswm4, m1
-packuswbm3, m4
-vpermq  m3, m3, q3120
-movu[r0], xm3
-vextracti128[r0 + r1], m3, 1
-
-vpbroadcastwm3, xm2 ; word [7 6]
-psrldq xm2, 2
-vpbroadcastwm4, xm2 ; word [8 7]
-pmaddubsw   m3, m0
-pmaddubsw   m4, m0
-pmulhrswm3, m1
-pmulhrswm4, m1
-packuswbm3, m4
-vpermq  m3, m3, q3120
-movu[r0 + r1 * 2], xm3
-vextracti128[r0 + r3], m3, 1
+palignr m5, m6, m3, 4
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0], xm4
+vextracti128[r0 + r1], m4, 1
+
+palignr m5, m6, m3, 6
+pshufb  m5, m7
+pmaddubsw   m4, m5, m0
+pmaddubsw   m5, m1
+pmulhrswm4, m2
+pmulhrswm5, m2
+packuswbm4, m5
+movu[r0 + r1 * 2], xm4

[x265] [PATCH 4 of 5] asm: updated intra_ang_32 mode 25 AVX2 asm code, 1300c-1184c

2015-08-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439812477 -19800
#  Mon Aug 17 17:24:37 2015 +0530
# Node ID e75f3a2f1d29f01ca2d71f1b8be970d471b5e1f6
# Parent  43c9ec65927666db1316efe63d112bd8f9cb5f35
asm: updated intra_ang_32 mode 25 AVX2 asm code, 1300c-1184c

diff -r 43c9ec659276 -r e75f3a2f1d29 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Mon Aug 17 17:17:05 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Mon Aug 17 17:24:37 2015 +0530
@@ -260,24 +260,6 @@
  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
 c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
@@ -13949,6 +13931,260 @@
 movu[r0 + r3], m4
 RET
 
+cglobal intra_pred_ang32_25, 3,5,7
+lea r3, [ang_table_avx2 + 32 * 16]
+lea r4, [r1 * 3]
+movam5, [pw_1024]
+
+; rows 0 to 7
+movum0, [r2 +  0]   ; [31 30 29 28 27 26 
25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
+movum1, [r2 +  1]   ; [32 31 30 29 28 27 
26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+
+pinsrb  xm3,[r2], 15
+pinsrb  xm3,[r2 + mmsize*2 + 16], 14
+
+punpckhbw   m2, m0, m1  ; [32 31 31 30 30 29 
29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8]
+punpcklbw   m0, m1  ; [24 23 23 22 22 21 
21 20 20 19 19 18 18 17 17 16  8  7  7  6  6  5  5  4  4  3  3  2  2  1  1  0]
+vinserti128 m3, m3, xm2, 1  ; [16 15 15 14 14 13 
13 12 12 11 11 10 10  9  9  8  0 16  x  x  x  x  x  x  x  x  x  x  x  x  x  x]
+
+pmaddubsw   m4, m0, [r3 + 14 * 32]  ; [30]
+pmulhrswm4, m5
+pmaddubsw   m1, m2, [r3 + 14 * 32]
+pmulhrswm1, m5
+packuswbm4, m1
+movu[r0],   m4
+
+pmaddubsw   m4, m0, [r3 + 12 * 32]  ; [28]
+pmulhrswm4, m5
+pmaddubsw   m1, m2, [r3 + 12 * 32

Re: [x265] [PATCH 3 of 4] asm: fix bug in macro vpbroadcastd for case ymm, xmm

2015-08-13 Thread Dnyaneshwar Gorade
Hi Min,

This still generates wrong code for case ymm, xmm (as %ifidni %2,xm will be
false always).
How about the below code ?

; Workaround: yasm generates wrong code for vpbroadcastd with a register
; source.  Accepts 2 or 3 operands; the optional third operand handles the
; ymm, xmm case (invoked as: vpbroadcastd ymm, ymm, xmm), since testing
; %2 alone cannot distinguish an xmm source from a general-purpose one.
%macro vpbroadcastd 2-3 ;; increased one argument for case ymm, xmm
  %ifid %3  ; case vpbroadcastd ymm, ymm, xmm — real broadcast from xmm source
    vpbroadcastd %1, %3
  %elifid %2                ; %2 is a bare identifier -> general-purpose register
    movd %1 %+ xmm, %2  ; case vpbroadcastd ymm, rN: stage rN in the xmm view of %1
    vpbroadcastd %1, %1 %+ xmm
  %else
    vpbroadcastd %1, %2 ; case vpbroadcastd ymm, [memory addr] — native form is fine
  %endif
%endmacro


Thanks,
Dnyaneshwar G

On Thu, Aug 13, 2015 at 8:52 AM, Min Chen chenm...@163.com wrote:

 # HG changeset patch
 # User Min Chen chenm...@163.com
 # Date 1439424913 25200
 # Node ID caf9562dc947f93e8ee237574575e9b67d494fc8
 # Parent  09846d1566428a73d70d2fcf2d50324c0dfbbb7f
 asm: fix bug in macro vpbroadcastd for case ymm,xmm
 ---
  source/common/x86/x86inc.asm |6 --
  1 files changed, 4 insertions(+), 2 deletions(-)

 diff -r 09846d156642 -r caf9562dc947 source/common/x86/x86inc.asm
 --- a/source/common/x86/x86inc.asm  Wed Aug 12 16:46:57 2015 -0700
 +++ b/source/common/x86/x86inc.asm  Wed Aug 12 17:15:13 2015 -0700
 @@ -1486,10 +1486,12 @@

  ; workaround: vpbroadcastd with register, the yasm will generate wrong
 code
  %macro vpbroadcastd 2
 -  %ifid %2
 +  %ifidni %2,xm ; case ymm,xmm
 +vpbroadcastd %1, %2
 +  %elifid %2; case ymm,rN
  movd %1 %+ xmm, %2
  vpbroadcastd %1, %1 %+ xmm
%else
 -vpbroadcastd %1, %2
 +vpbroadcastd %1, %2 ; case ymm,[address]
%endif
  %endmacro

 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 4] asm: AVX2 asm for intra_ang_32 mode 6, improved over 48% than SSE asm

2015-08-13 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439366905 -19800
#  Wed Aug 12 13:38:25 2015 +0530
# Node ID 643a001494a42e65366cfa3e468cc0858955095f
# Parent  07110baa95f1d53c8100929b16eafba3b16138d6
asm: AVX2 asm for intra_ang_32 mode 6, improved over 48% than SSE asm

updated intra_ang_32 mode 30 AVX2 asm code, improved over 20% than previous 
AVX2 code

diff -r 07110baa95f1 -r 643a001494a4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 11 18:23:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 12 13:38:25 2015 +0530
@@ -3018,6 +3018,7 @@
 p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
 p.cu[BLOCK_32x32].intra_pred[4]  = PFX(intra_pred_ang32_4_avx2);
 p.cu[BLOCK_32x32].intra_pred[5]  = PFX(intra_pred_ang32_5_avx2);
+p.cu[BLOCK_32x32].intra_pred[6]  = PFX(intra_pred_ang32_6_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 07110baa95f1 -r 643a001494a4 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Tue Aug 11 18:23:48 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Wed Aug 12 13:38:25 2015 +0530
@@ -320,28 +320,6 @@
 db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-
-ALIGN 32
-c_ang32_mode_30:db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 
19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 
 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 
21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 
1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 
20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
 ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
@@ -12517,6 +12495,292 @@
 call ang32_mode_5_31_row_16_31
 RET
 
+cglobal ang32_mode_6_30_row_0_15
+testr7d,r7d
+; rows 0 to 7
+movum0, [r2 +  1]   ; [32 31 30 29 28 27 26 25 24 
23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+movum1

[x265] [PATCH 3 of 4] asm: AVX2 asm for intra_ang_32 mode 7, improved over 40% than SSE asm

2015-08-13 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439373105 -19800
#  Wed Aug 12 15:21:45 2015 +0530
# Node ID c12d411014f68affea550ee640e26ba61f51e509
# Parent  643a001494a42e65366cfa3e468cc0858955095f
asm: AVX2 asm for intra_ang_32 mode 7, improved over 40% than SSE asm

updated intra_ang_32 mode 29 AVX2 asm code, improved over 10% than previous 
AVX2 code

diff -r 643a001494a4 -r c12d411014f6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 12 13:38:25 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 12 15:21:45 2015 +0530
@@ -3019,6 +3019,7 @@
 p.cu[BLOCK_32x32].intra_pred[4]  = PFX(intra_pred_ang32_4_avx2);
 p.cu[BLOCK_32x32].intra_pred[5]  = PFX(intra_pred_ang32_5_avx2);
 p.cu[BLOCK_32x32].intra_pred[6]  = PFX(intra_pred_ang32_6_avx2);
+p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 643a001494a4 -r c12d411014f6 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Wed Aug 12 13:38:25 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Wed Aug 12 15:21:45 2015 +0530
@@ -300,27 +300,6 @@
 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
 ALIGN 32
-c_ang32_mode_29:db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 
7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
@@ -12781,6 +12760,284 @@
 call ang32_mode_6_30_row_16_31
 RET
 
+cglobal ang32_mode_7_29_row_0_15
+testr7d,r7d
+; rows 0 to 7
+movum0, [r2 +  1]   ; [32 31 30 29 28 27 26 25 24 
23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+movu

[x265] [PATCH 1 of 4] asm: AVX2 asm for intra_ang_32 mode 5, improved over 48% than SSE asm

2015-08-13 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439297628 -19800
#  Tue Aug 11 18:23:48 2015 +0530
# Node ID 07110baa95f1d53c8100929b16eafba3b16138d6
# Parent  bc5a7c2ac38b06d2a232b983f10bc0394d252ad7
asm: AVX2 asm for intra_ang_32 mode 5, improved over 48% than SSE asm

updated intra_ang_32 mode 31 AVX2 asm code, improved over 20% than previous 
AVX2 code

diff -r bc5a7c2ac38b -r 07110baa95f1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 12 15:13:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 11 18:23:48 2015 +0530
@@ -3017,6 +3017,7 @@
 p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2);
 p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
 p.cu[BLOCK_32x32].intra_pred[4]  = PFX(intra_pred_ang32_4_avx2);
+p.cu[BLOCK_32x32].intra_pred[5]  = PFX(intra_pred_ang32_5_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r bc5a7c2ac38b -r 07110baa95f1 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Wed Aug 12 15:13:51 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Tue Aug 11 18:23:48 2015 +0530
@@ -342,27 +342,6 @@
 db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-
-ALIGN 32
-c_ang32_mode_31:db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 
20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 
21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 
19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
 ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
@@ -12249,6 +12228,295 @@
 call ang32_mode_4_32_row_16_31
 RET
 
+cglobal ang32_mode_5_31_row_0_15
+testr7d,r7d
+; rows 0 to 7
+movum0, [r2 +  1]   ; [32 31 30 29 28 27 26 25 24 
23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+movum1, [r2 +  2]   ; [33 32 31 30 29 28 27 26 25 
24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8

[x265] [PATCH] asm: AVX2 asm for intra_ang_32 mode 4, improved over 45% than SSE asm

2015-08-10 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1439209099 -19800
#  Mon Aug 10 17:48:19 2015 +0530
# Branch stable
# Node ID 1ae0654c996a3ccab15e384dc8a394c029094544
# Parent  4781e6cef251006db10e107b2916741572f7760a
asm: AVX2 asm for intra_ang_32 mode 4, improved over 45% than SSE asm

updated intra_ang_32 mode 32 AVX2 asm code, improved over 32% than previous 
AVX2 code

diff -r 4781e6cef251 -r 1ae0654c996a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Aug 07 12:29:40 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Aug 10 17:48:19 2015 +0530
@@ -3016,6 +3016,7 @@
 p.cu[BLOCK_16x16].intra_pred[24] = PFX(intra_pred_ang16_24_avx2);
 p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2);
 p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
+p.cu[BLOCK_32x32].intra_pred[4]  = PFX(intra_pred_ang32_4_avx2);
 p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
 p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
 p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 4781e6cef251 -r 1ae0654c996a source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Aug 07 12:29:40 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Mon Aug 10 17:48:19 2015 +0530
@@ -363,31 +363,6 @@
 db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-
-ALIGN 32
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 
9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 
20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
 ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
4, 28, 4, 28, 4

Re: [x265] [PATCH] asm: avx2 code for intra_ang_16 modes 3 & 33

2015-08-05 Thread Dnyaneshwar Gorade
This is new algorithm for intra_ang16x16.
1075 cycles - current AVX2 asm
827 cycles - new AVX2 asm (improved 23% over current avx2 asm)

On Thu, Aug 6, 2015 at 10:41 AM, Deepthi Nandakumar 
deep...@multicorewareinc.com wrote:

 Please be sure to mention what is the baseline - for instance, what is
 1075 cycles?

 On Wed, Aug 5, 2015 at 6:06 PM, raj...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Rajesh Paulraj raj...@multicorewareinc.com
 # Date 1438766294 -19800
 #  Wed Aug 05 14:48:14 2015 +0530
 # Node ID 4a71c4261e5a7955a7ecdda61db1f20744254b0e
 # Parent  3fa7f6838098854de79d3800b2d775dabaf45705
 asm: avx2 code for intra_ang_16 modes 3 & 33

 intra_ang_16x16[ 3]  - improved 1075.09 -> 827.85
 intra_ang_16x16[ 33] - improved 796.68 -> 565.86

 diff -r 3fa7f6838098 -r 4a71c4261e5a source/common/x86/intrapred8.asm
 --- a/source/common/x86/intrapred8.asm  Mon Aug 03 14:56:21 2015 -0500
 +++ b/source/common/x86/intrapred8.asm  Wed Aug 05 14:48:14 2015 +0530
 @@ -294,32 +294,6 @@
db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24,
 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

  ALIGN 32
 -c_ang16_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26,
 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20,
 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12,
 20
 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14,
 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18,
 14
 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8,
 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2,
 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22,
 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10,
 22
 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
 16
 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10,
 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22,
 10
 - db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4,
 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
 - db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24,
 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18,
 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14,
 18
 - db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12,
 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20,
 12
 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6,
 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
 - db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0,
 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 -
 -ALIGN 32
 -c_ang16_mode_3:  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26,
 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20,
 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14,
 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8,
 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2,
 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
 - db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28,
 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22,
 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 -
 -ALIGN 32
  c_ang16_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27,
 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17,
 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20,
 12
   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7,
 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
 @@ -13534,131 +13508,226 @@
  INTRA_PRED_TRANS_STORE_16x16
  RET

 -
 -INIT_YMM avx2
 -cglobal intra_pred_ang16_3, 3, 6, 12
 -mova  

[x265] [PATCH] asm: disabled AVX & AVX2 primitives having less than 3% speed up over SSE

2015-08-05 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1438757401 -19800
#  Wed Aug 05 12:20:01 2015 +0530
# Node ID 3eb2ec5922be1cd934dec7f7ed886d03c0125ef5
# Parent  3fa7f6838098854de79d3800b2d775dabaf45705
asm: disabled AVX & AVX2 primitives having less than 3% speed up over SSE

these primitives are slower than SSE primitives

diff -r 3fa7f6838098 -r 3eb2ec5922be source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 03 14:56:21 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 05 12:20:01 2015 +0530
@@ -2568,7 +2568,6 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = 
PFX(pixel_satd_16x16_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = 
PFX(pixel_satd_32x32_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = 
PFX(pixel_satd_16x64_avx);
-p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = 
PFX(pixel_satd_16x8_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = 
PFX(pixel_satd_32x16_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = 
PFX(pixel_satd_8x16_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = 
PFX(pixel_satd_8x8_avx);
@@ -2578,7 +2577,6 @@
 p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);
 p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);
 p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);
-p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_avx);
 p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_avx);
 p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);
 p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);
@@ -2586,10 +2584,8 @@
 p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);
 p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);
 
-p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_avx);
 p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);
 p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);
-p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_avx);
 p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_avx);
 p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);
 p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);
@@ -2599,38 +2595,28 @@
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = 
PFX(pixel_satd_8x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = 
PFX(pixel_satd_16x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = 
PFX(pixel_satd_32x32_avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = 
PFX(pixel_satd_16x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = 
PFX(pixel_satd_8x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = 
PFX(pixel_satd_32x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = 
PFX(pixel_satd_16x32_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = 
PFX(pixel_satd_16x12_avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = 
PFX(pixel_satd_16x4_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = 
PFX(pixel_satd_32x24_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = 
PFX(pixel_satd_24x32_avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = 
PFX(pixel_satd_32x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = 
PFX(pixel_satd_8x32_avx);
-ASSIGN_SA8D(avx);
+
+p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_avx);
+p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = 
PFX(pixel_sa8d_8x16_avx);
+p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = 
PFX(pixel_sa8d_16x32_avx);
+p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = 
PFX(pixel_sa8d_32x64_avx);
+
 p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = 
PFX(pixel_sa8d_32x32_avx);
 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = 
PFX(pixel_sa8d_16x16_avx);
 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = 
PFX(pixel_sa8d_8x8_avx);
 
-p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx);
-p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx);
-
 p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = 
PFX(pixel_ssd_8x8_avx);
 
 p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);
 
 p.pu[LUMA_16x4].sad_x4  = PFX(pixel_sad_x4_16x4_avx);
-p.pu[LUMA_16x8].sad_x4  = PFX(pixel_sad_x4_16x8_avx);
-p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx);
-p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx);
-p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx);
-p.pu[LUMA_32x8].sad_x4  = PFX(pixel_sad_x4_32x8_avx);
-p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx);
-p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx);
-p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx);
-p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx);
 
 p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx

[x265] [PATCH] asm: updated avx2 algorithm for copy_ps 32xN & 64xN, improved over 45% than SSE asm

2015-08-05 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1438767554 -19800
#  Wed Aug 05 15:09:14 2015 +0530
# Node ID 377a996a8d74110f838ff2e3cef1c42781d6d730
# Parent  3eb2ec5922be1cd934dec7f7ed886d03c0125ef5
asm: updated avx2 algorithm for copy_ps 32xN & 64xN, improved over 45% than SSE 
asm

diff -r 3eb2ec5922be -r 377a996a8d74 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Aug 05 12:20:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 05 15:09:14 2015 +0530
@@ -3622,6 +3622,11 @@
 if (cpuMask & X265_CPU_BMI2)
 p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
 
+p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
+p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = 
PFX(blockcopy_ps_32x32_avx2);
+p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = 
PFX(blockcopy_ps_32x64_avx2);
+p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
+
 /* The following primitives have been disabled since performance 
compared to SSE is negligible/negative */
 #if 0
 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
@@ -3652,10 +3657,6 @@
 p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_avx2);
 p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = 
PFX(blockcopy_sp_16x16_avx2);
 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = 
PFX(blockcopy_sp_16x32_avx2);
-p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
-p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = 
PFX(blockcopy_ps_32x32_avx2);
-p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = 
PFX(blockcopy_ps_32x64_avx2);
-p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = 
PFX(interp_4tap_horiz_pp_4x8_avx2);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_4x16_avx2);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = 
PFX(interp_4tap_vert_pp_16x4_avx2);
diff -r 3eb2ec5922be -r 377a996a8d74 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm  Wed Aug 05 12:20:01 2015 +0530
+++ b/source/common/x86/blockcopy8.asm  Wed Aug 05 15:09:14 2015 +0530
@@ -3043,43 +3043,31 @@
 ;-
 %macro BLOCKCOPY_PS_W32_H4_avx2 2
 INIT_YMM avx2
-cglobal blockcopy_ps_%1x%2, 4, 7, 3
+cglobal blockcopy_ps_%1x%2, 4, 7, 2
 add r1, r1
 mov r4d, %2/4
 lea r5, [3 * r3]
 lea r6, [3 * r1]
-pxorm0, m0
-
 .loop:
-movu  m1, [r2]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu  [r0], m3
-movu  [r0 + 32], m2
-movu  m1, [r2 + r3]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu  [r0 + r1], m3
-movu  [r0 + r1 + 32], m2
-movu  m1, [r2 + 2 * r3]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu  [r0 + 2 * r1], m3
-movu  [r0 + 2 * r1 + 32], m2
-movu  m1, [r2 + r5]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu  [r0 + r6], m3
-movu  [r0 + r6 + 32], m2
-
+pmovzxbw  m0, [r2 +  0]
+pmovzxbw  m1, [r2 + 16]
+movu  [r0 +  0], m0
+movu  [r0 + 32], m1
+
+pmovzxbw  m0, [r2 + r3 +  0]
+pmovzxbw  m1, [r2 + r3 + 16]
+movu  [r0 + r1 +  0], m0
+movu  [r0 + r1 + 32], m1
+
+pmovzxbw  m0, [r2 + r3 * 2 +  0]
+pmovzxbw  m1, [r2 + r3 * 2 + 16]
+movu  [r0 + r1 * 2 +  0], m0
+movu  [r0 + r1 * 2 + 32], m1
+
+pmovzxbw  m0, [r2 + r5 +  0]
+pmovzxbw  m1, [r2 + r5 + 16]
+movu  [r0 + r6 +  0], m0
+movu  [r0 + r6 + 32], m1
 lea   r0, [r0 + 4 * r1]
 lea   r2, [r2 + 4 * r3]
 dec   r4d
@@ -3228,71 +3216,49 @@
 INIT_YMM avx2
 cglobal blockcopy_ps_64x64, 4, 7, 4
 add r1, r1
-mov r4d, 64/4
+mov r4d, 64/8
 lea r5, [3 * r3]
 lea r6, [3 * r1]
-pxorm0, m0
-
 .loop:
-movu  m1, [r2]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu  [r0], m3
-movu  [r0 + 32], m2
-movu  m1, [r2 + 32]
-punpcklbw m2, m1, m0
-punpckhbw m1, m1, m0
-vperm2i128m3, m2, m1, 0010b
-vperm2i128m2, m2, m1, 00110001b
-movu

[x265] [PATCH] asm: disabled AVX primitives having less than 3% speed up over SSE

2015-08-04 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1438669788 -19800
#  Tue Aug 04 11:59:48 2015 +0530
# Node ID fc84f3731e2c9eafc8164361b67422732f811008
# Parent  2b89c446b404ed20c0316efaab5b1e088289c0b4
asm: disabled AVX primitives having less than 3% speed up over SSE

these AVX primitives are slower than SSE primitives

diff -r 2b89c446b404 -r fc84f3731e2c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Aug 03 16:45:04 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Aug 04 11:59:48 2015 +0530
@@ -2556,7 +2556,7 @@
 }
 if (cpuMask & X265_CPU_AVX)
 {
-p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
+//p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = 
PFX(pixel_satd_16x24_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = 
PFX(pixel_satd_32x48_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = 
PFX(pixel_satd_24x64_avx);
@@ -2571,28 +2571,53 @@
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = 
PFX(pixel_satd_16x64_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = 
PFX(pixel_satd_16x8_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = 
PFX(pixel_satd_32x16_avx);
-p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = 
PFX(pixel_satd_8x4_avx);
+//p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = 
PFX(pixel_satd_8x4_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = 
PFX(pixel_satd_8x16_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = 
PFX(pixel_satd_8x8_avx);
 p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = 
PFX(pixel_satd_8x32_avx);
-p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = 
PFX(pixel_satd_4x8_avx);
-p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = 
PFX(pixel_satd_4x16_avx);
-p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = 
PFX(pixel_satd_4x4_avx);
-ALL_LUMA_PU(satd, pixel_satd, avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = 
PFX(pixel_satd_4x4_avx);
+//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = 
PFX(pixel_satd_4x8_avx);
+//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = 
PFX(pixel_satd_4x16_avx);
+//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = 
PFX(pixel_satd_4x4_avx);
+
+p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_avx);
+p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);
+p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);
+p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);
+//p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_avx);
+//p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_avx);
+p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_avx);
+p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_avx);
+p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);
+p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);
+p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx);
+p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);
+p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);
+//p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_avx);
+p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_avx);
+//p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_avx);
+p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);
+p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);
+p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_avx);
+p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_avx);
+p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);
+p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);
+p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);
+p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);
+
+//p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = 
PFX(pixel_satd_4x4_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = 
PFX(pixel_satd_8x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = 
PFX(pixel_satd_16x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = 
PFX(pixel_satd_32x32_avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = 
PFX(pixel_satd_8x4_avx);
-p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = 
PFX(pixel_satd_4x8_avx);
+//p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = 
PFX(pixel_satd_8x4_avx);
+//p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = 
PFX(pixel_satd_4x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = 
PFX(pixel_satd_16x8_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = 
PFX(pixel_satd_8x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = 
PFX(pixel_satd_32x16_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = 
PFX(pixel_satd_16x32_avx);
 p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12

[x265] [PATCH] asm: avx2 code for pixelavg_pp 32xN & 64xN, improved over 40% than SSE

2015-08-03 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1438596650 -19800
#  Mon Aug 03 15:40:50 2015 +0530
# Node ID 43fe4ec1c13a2514030010c2cd699382b67f65cb
# Parent  a3b72e2a25a7fc544b1b76e872eda012035bf4ac
asm: avx2 code for pixelavg_pp 32xN & 64xN, improved over 40% than SSE

diff -r a3b72e2a25a7 -r 43fe4ec1c13a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmMon Aug 03 10:28:34 2015 +0530
+++ b/source/common/x86/mc-a.asmMon Aug 03 15:40:50 2015 +0530
@@ -4300,24 +4300,12 @@
 AVGH  4,  8
 AVGH  4,  4
 AVGH  4,  2
+
 INIT_XMM avx2
 ; TODO: active AVX2 after debug
 ;AVG_FUNC 24, movdqu, movdqa
 ;AVGH 24, 32
 
-AVG_FUNC 64, movdqu, movdqa
-AVGH 64, 64
-AVGH 64, 48
-AVGH 64, 32
-AVGH 64, 16
-
-AVG_FUNC 32, movdqu, movdqa
-AVGH 32, 64
-AVGH 32, 32
-AVGH 32, 24
-AVGH 32, 16
-AVGH 32, 8
-
 AVG_FUNC 16, movdqu, movdqa
 AVGH 16, 64
 AVGH 16, 32
@@ -4328,7 +4316,109 @@
 
 %endif ;HIGH_BIT_DEPTH
 
-
+;---
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t 
sstride0, const pixel* src1, intptr_t sstride1, int)
+;---
+%if ARCH_X86_64 && BIT_DEPTH == 8
+INIT_YMM avx2
+cglobal pixel_avg_8x32
+%rep 4
+movum0, [r2]
+movum2, [r2 + r3]
+movum1, [r4]
+movum3, [r4 + r5]
+pavgb   m0, m1
+pavgb   m2, m3
+movu[r0], m0
+movu[r0 + r1], m2
+
+lea r2, [r2 + r3 * 2]
+lea r4, [r4 + r5 * 2]
+lea r0, [r0 + r1 * 2]
+%endrep
+ret
+
+cglobal pixel_avg_16x64_8bit
+%rep 8
+movum0, [r2]
+movum2, [r2 + mmsize]
+movum1, [r4]
+movum3, [r4 + mmsize]
+pavgb   m0, m1
+pavgb   m2, m3
+movu[r0], m0
+movu[r0 + mmsize], m2
+
+movum0, [r2 + r3]
+movum2, [r2 + r3 + mmsize]
+movum1, [r4 + r5]
+movum3, [r4 + r5 + mmsize]
+pavgb   m0, m1
+pavgb   m2, m3
+movu[r0 + r1], m0
+movu[r0 + r1 + mmsize], m2
+
+lea r2, [r2 + r3 * 2]
+lea r4, [r4 + r5 * 2]
+lea r0, [r0 + r1 * 2]
+%endrep
+ret
+
+cglobal pixel_avg_32x8, 6,6,4
+call pixel_avg_8x32
+RET
+
+cglobal pixel_avg_32x16, 6,6,4
+call pixel_avg_8x32
+call pixel_avg_8x32
+RET
+
+cglobal pixel_avg_32x24, 6,6,4
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+RET
+
+cglobal pixel_avg_32x32, 6,6,4
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+RET
+
+cglobal pixel_avg_32x64, 6,6,4
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+call pixel_avg_8x32
+RET
+
+cglobal pixel_avg_64x16, 6,6,4
+call pixel_avg_16x64_8bit
+RET
+
+cglobal pixel_avg_64x32, 6,6,4
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+RET
+
+cglobal pixel_avg_64x48, 6,6,4
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+RET
+
+cglobal pixel_avg_64x64, 6,6,4
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+call pixel_avg_16x64_8bit
+RET
+%endif
 
 ;=
 ; pixel avg2
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] main12: added lambda tables based on qp values

2015-07-23 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1437640145 -19800
#  Thu Jul 23 13:59:05 2015 +0530
# Node ID 0bdab1ab0e78684cbb3ecc4913e59d2b35b4e1b7
# Parent  42bc8575020b73d129d0bcef70c7cbe80a8b51df
main12: added lambda tables based on qp values

diff -r 42bc8575020b -r 0bdab1ab0e78 source/common/constants.cpp
--- a/source/common/constants.cpp   Wed Jul 22 12:56:34 2015 -0500
+++ b/source/common/constants.cpp   Thu Jul 23 13:59:05 2015 +0530
@@ -27,7 +27,48 @@
 
 namespace X265_NS {
 
-#if HIGH_BIT_DEPTH
+#if X265_DEPTH == 12
+
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (12 - 8));
+double x265_lambda_tab[QP_MAX_MAX + 1] =
+{
+4.,4.4898,5.0397,5.6569, 6.3496,
+7.1272,8.,8.9797,10.0794,11.3137,
+12.6992,   14.2544,   16.,   17.9594,20.1587,
+22.6274,   25.3984,   28.5088,   32.,35.9188,
+40.3175,   45.2548,   50.7968,   57.0175,64.,
+71.8376,   80.6349,   90.5097,   101.5937,   114.0350,
+128.,  143.6751,  161.2699,  181.0193,   203.1873,
+228.0701,  256.,  287.3503,  322.5398,   362.0387,
+406.3747,  456.1401,  512.,  574.7006,   645.0796,
+724.0773,  812.7493,  912.2803,  1024.,  1149.4011,
+1290.1592, 1448.1547, 1625.4987, 1824.5606,  2048.,
+2298.8023, 2580.3183, 2896.3094, 3250.9974,  3649.1211,
+4096., 4597.6045, 5160.6366, 5792.6188,  6501.9947,
+7298.2423, 8192., 9195.2091, 10321.2732, 11585.2375
+};
+
+// lambda2 = pow(lambda, 2) * scale (0.85);
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
+{
+13.6000,   17.1349,   21.5887,   27.2000,   34.2699,
+43.1773,   54.4000,   68.5397,   86.3546,   108.8000,
+137.0794,  172.7092,  217.6000,  274.1588,  345.4185,
+435.2000,  548.3176,  690.8369,  870.4000,  1096.6353,
+1381.6739, 1740.8000, 2193.2706, 2763.3478, 3481.6000,
+4386.5411, 5526.6955, 6963.2000, 8773.0822, 11053.3910,
+13926.4000,17546.1645,22106.7819,27852.8000,35092.3291,
+44213.5641,55705.6000,70184.6579,88427.1282,111411.2000,
+140369.3159,   176854.2563,   222822.4000,   280738.6324,   353708.5127,
+445644.8001,   561477.2648,   707417.0237,   891289.6000,   1122954.5277,
+1414834.0484,  1782579.2003,  2245909.0566,  2829668.0981,  3565158.4000,
+4491818.1146,  5659336.1938,  7130316.8013,  8983636.2264,  11318672.3923,
+14260633.6000, 17967272.4585, 22637344.7751, 28521267.1953, 35934544.9165,
+45274689.5567, 57042534.4000, 71869089.8338, 90549379.1181, 114085068.8008
+};
+
+#elif X265_DEPTH == 10
+
 // lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
 double x265_lambda_tab[QP_MAX_MAX + 1] =
 {
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: fix linux build error- cannot override register size

2015-07-13 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1436771870 -19800
#  Mon Jul 13 12:47:50 2015 +0530
# Node ID 96eaae96478a252f46736416248ec8dcba618c7d
# Parent  7cb28662875630da90d85d62b01d58f4c51f7e32
asm: fix linux build error- cannot override register size

diff -r 7cb286628756 -r 96eaae96478a source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Mon Jul 13 12:16:57 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Mon Jul 13 12:47:50 2015 +0530
@@ -2132,8 +2132,8 @@
 jnz .loopH
 
 ; sum to global buffer
-mov r1, r5m
-mov r0, r6m
+mov r1, r5mp
+mov r0, r6mp
 
 ; s_eoTable = {1, 2, 0, 3, 4}
 movzx   r5d, word [rsp + 0 * 2]
@@ -2165,9 +2165,9 @@
 
;---
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE1, 4,11,9,0-32; Stack: 5 of stats and 5 of count
+cglobal saoCuStatsE1, 4,12,9,0-32; Stack: 5 of stats and 5 of count
 mov r4d, r4m
-mov r5d, r5m
+mov r11d, r5d
 
 ; clear internal temporary buffer
 pxorm0, m0
@@ -2183,7 +2183,7 @@
 mov r6d, r4d
 mov r9, r0
 mov r10, r1
-mov r5, r3
+mov r11, r3
 
 .loopW:
 movum1, [r10]
@@ -2200,12 +2200,12 @@
 psubb   m3, m2  ; -signDown
 
 ; edgeType
-movum4, [r5]
+movum4, [r11]
 paddb   m4, m6
 paddb   m2, m4
 
 ; update upBuff1
-movu[r5], m3
+movu[r11], m3
 
 ; stats[edgeType]
 pxorm1, m0
@@ -2236,7 +2236,7 @@
 
 add r9, 16
 add r10, 16
-add r5, 16
+add r11, 16
 jmp .loopW
 
 .next:
@@ -2244,7 +2244,7 @@
 add r0, r2
 add r1, r2
 
-dec byte r5m
+dec r5d
 jg .loopH
 
 ; restore unavailable pixels
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 3] asm: sse4 code for saoCuStatsE1, improved 320369c -> 151086c

2015-07-07 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1436252372 -19800
#  Tue Jul 07 12:29:32 2015 +0530
# Node ID 25a8323b886f480347f4b0813f7ded18e579704a
# Parent  235930aae11da04863e3fb13905e2d1d95e3dc0a
asm: sse4 code for saoCuStatsE1, improved 320369c -> 151086c

diff -r 235930aae11d -r 25a8323b886f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jul 07 12:29:32 2015 +0530
@@ -2499,6 +2499,7 @@
 #if X86_64
 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
 p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
 p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
 p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Tue Jul 07 12:29:32 2015 +0530
@@ -2159,3 +2159,122 @@
 add [r1 + 4 * 4], r6d
 RET
 %endif
+
+;---
+; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t 
*upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+;---
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE1, 4,11,9,0-32; Stack: 5 of stats and 5 of count
+mov r4d, r4m
+mov r5d, r5m
+
+; clear internal temporary buffer
+pxorm0, m0
+mova[rsp], m0
+mova[rsp + mmsize], m0
+movam0, [pb_128]
+movam5, [pb_1]
+movam6, [pb_2]
+movam8, [hmul_16p + 16]
+movhm7, [r3 + r4]
+
+.loopH:
+mov r6d, r4d
+mov r9, r0
+mov r10, r1
+mov r5, r3
+
+.loopW:
+movum1, [r10]
+movum2, [r10 + r2]
+
+; signDown
+pxorm1, m0
+pxorm2, m0
+pcmpgtb m3, m1, m2
+pandm3, m5
+pcmpgtb m2, m1
+por m2, m3
+pxorm3, m3
+psubb   m3, m2  ; -signDown
+
+; edgeType
+movum4, [r5]
+paddb   m4, m6
+paddb   m2, m4
+
+; update upBuff1
+movu[r5], m3
+
+; stats[edgeType]
+pxorm1, m0
+movum3, [r9]
+punpckhbw   m4, m3, m1
+punpcklbw   m3, m1
+pmaddubsw   m3, m8
+pmaddubsw   m4, m8
+
+; 16 pixels
+%assign x 0
+%rep 16
+pextrb  r7d, m2, x
+inc word [rsp + r7 * 2]
+
+  %if (x < 8)
+pextrw  r8d, m3, (x % 8)
+  %else
+pextrw  r8d, m4, (x % 8)
+  %endif
+movsx   r8d, r8w
+add [rsp + 5 * 2 + r7 * 4], r8d
+
+dec r6d
+jz .next
+%assign x x+1
+%endrep
+
+add r9, 16
+add r10, 16
+add r5, 16
+jmp .loopW
+
+.next:
+; restore pointer upBuff1
+add r0, r2
+add r1, r2
+
+dec byte r5m
+jg .loopH
+
+; restore unavailable pixels
+movh[r3 + r4], m7
+
+; sum to global buffer
+mov r1, r6m
+mov r0, r7m
+
+; s_eoTable = {1,2,0,3,4}
+movzx   r6d, word [rsp + 0 * 2]
+add [r0 + 1 * 4], r6d
+movzx   r6d, word [rsp + 1 * 2]
+add [r0 + 2 * 4], r6d
+movzx   r6d, word [rsp + 2 * 2]
+add [r0 + 0 * 4], r6d
+movzx   r6d, word [rsp + 3 * 2]
+add [r0 + 3 * 4], r6d
+movzx   r6d, word [rsp + 4 * 2]
+add [r0 + 4 * 4], r6d
+
+mov r6d, [rsp + 5 * 2 + 0 * 4]
+add [r1 + 1 * 4], r6d
+mov r6d, [rsp + 5 * 2 + 1 * 4]
+add [r1 + 2 * 4], r6d
+mov r6d, [rsp + 5 * 2 + 2 * 4]
+add [r1 + 0 * 4], r6d
+mov r6d, [rsp + 5 * 2 + 3 * 4]
+add [r1 + 3 * 4], r6d
+mov r6d, [rsp + 5 * 2 + 4 * 4]
+add [r1 + 4 * 4], r6d
+RET
+%endif ; ARCH_X86_64
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.hTue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.hTue Jul 07 12:29:32 2015 +0530
@@ -37,6 +37,7 @@
 void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int 
ctuWidth, int ctuHeight, intptr_t stride); \
 void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
 void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t

[x265] [PATCH 2 of 3] asm: sse4 code for saoCuStatsE0, improved 250341c -> 147284c

2015-07-07 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1436251628 -19800
#  Tue Jul 07 12:17:08 2015 +0530
# Node ID 235930aae11da04863e3fb13905e2d1d95e3dc0a
# Parent  e0166f09f332af72a83eb059d878044db15f59bd
asm: sse4 code for saoCuStatsE0, improved 250341c -> 147284c

diff -r e0166f09f332 -r 235930aae11d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jul 07 12:17:08 2015 +0530
@@ -2498,6 +2498,7 @@
 
 #if X86_64
 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
 p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
 p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Tue Jul 07 12:17:08 2015 +0530
@@ -2043,3 +2043,119 @@
 jnz .loopH
 RET
 %endif
+
+;---
+; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, 
int endY, int32_t *stats, int32_t *count)
+;---
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE0, 5,8,8, 0-32
+mov r3d, r3m
+
+; clear internal temporary buffer
+pxorm0, m0
+mova[rsp], m0
+mova[rsp + mmsize], m0
+movam4, [pb_128]
+movam5, [hmul_16p + 16]
+movam6, [pb_2]
+xor r7d, r7d
+
+.loopH:
+mov r5d, r3d
+
+; calculate signLeft
+mov r7b, [r1]
+sub r7b, [r1 - 1]
+setar7b
+setbr6b
+sub r7b, r6b
+neg r7b
+pinsrb  m0, r7d, 15
+
+.loopL:
+movum7, [r1]
+movum2, [r1 + 1]
+
+pxorm1, m7, m4
+pxorm3, m2, m4
+pcmpgtb m2, m1, m3
+pcmpgtb m3, m1
+pandm2, [pb_1]
+por m2, m3  ; signRight
+
+palignr m3, m2, m0, 15
+psignb  m3, m4  ; signLeft
+
+movam0, m2
+paddb   m2, m3
+paddb   m2, m6  ; edgeType
+
+; stats[edgeType]
+movum3, [r0]; fenc[0-15]
+punpckhbw   m1, m3, m7
+punpcklbw   m3, m7
+pmaddubsw   m1, m5
+pmaddubsw   m3, m5
+
+%assign x 0
+%rep 16
+pextrb  r7d, m2, x
+
+%if (x < 8)
+pextrw  r6d, m3, (x % 8)
+%else
+pextrw  r6d, m1, (x % 8)
+%endif
+movsx   r6d, r6w
+inc word [rsp + r7 * 2] ; tmp_count[edgeType]++
+add [rsp + 5 * 2 + r7 * 4], r6d ; tmp_stats[edgeType] += 
(fenc[x] - rec[x])
+dec r5d
+jz  .next
+%assign x x+1
+%endrep
+
+add r0q, 16
+add r1q, 16
+jmp .loopL
+
+.next:
+mov r6d, r3d
+and r6d, 15
+
+sub r6, r3
+add r6, r2
+add r0, r6
+add r1, r6
+
+dec r4d
+jnz .loopH
+
+; sum to global buffer
+mov r1, r5m
+mov r0, r6m
+
+; s_eoTable = {1, 2, 0, 3, 4}
+movzx   r5d, word [rsp + 0 * 2]
+add [r0 + 1 * 4], r5d
+movzx   r6d, word [rsp + 1 * 2]
+add [r0 + 2 * 4], r6d
+movzx   r5d, word [rsp + 2 * 2]
+add [r0 + 0 * 4], r5d
+movzx   r6d, word [rsp + 3 * 2]
+add [r0 + 3 * 4], r6d
+movzx   r5d, word [rsp + 4 * 2]
+add [r0 + 4 * 4], r5d
+
+mov r6d, [rsp + 5 * 2 + 0 * 4]
+add [r1 + 1 * 4], r6d
+mov r5d, [rsp + 5 * 2 + 1 * 4]
+add [r1 + 2 * 4], r5d
+mov r6d, [rsp + 5 * 2 + 2 * 4]
+add [r1 + 0 * 4], r6d
+mov r5d, [rsp + 5 * 2 + 3 * 4]
+add [r1 + 3 * 4], r5d
+mov r6d, [rsp + 5 * 2 + 4 * 4]
+add [r1 + 4 * 4], r6d
+RET
+%endif
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.hTue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.hTue Jul 07 12:17:08 2015 +0530
@@ -36,6 +36,7 @@
 void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t 
*m_offsetEo, intptr_t stride, int startX, int endX); \
 void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int 
ctuWidth, int ctuHeight, intptr_t stride); \
 void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
 void PFX

[x265] [PATCH] asm: fix 32-bit build error- undefined symbol r7d, r8d

2015-07-06 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1436183156 -19800
#  Mon Jul 06 17:15:56 2015 +0530
# Node ID 45e56ef3de405a3f9c6451b46b876e3dc46aac38
# Parent  bf57ce5d38d5208a491bf4192e389ab1eb4a4f32
asm: fix 32-bit build error- undefined symbol r7d, r8d

diff -r bf57ce5d38d5 -r 45e56ef3de40 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jul 03 19:50:02 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jul 06 17:15:56 2015 +0530
@@ -1677,6 +1677,7 @@
 %endif
 
 
+%if ARCH_X86_64 == 1
 %if HIGH_BIT_DEPTH
 INIT_YMM avx2
 cglobal weight_sp, 6,7,9
@@ -1872,6 +1873,7 @@
 jnz .loopH
 RET
 %endif
+%endif
 
 ;-
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] Compiling 8-bit Win32 target fails: 64 registers r7/r8 used

2015-07-06 Thread Dnyaneshwar Gorade
sent a fix patch. Yes, it was caused by %ARCH_X86_64 removal.

On Mon, Jul 6, 2015 at 5:20 PM, Mario *LigH* Rohkrämer cont...@ligh.de
wrote:

 Possibly after a line with a check %if ARCH_X86_64 was removed?

 Win32 non-HBD still allows ASM.

 +
 [  8%] Building ASM_YASM object
 common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1801:
 error: undefined symbol `r7d' (first use)
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1801:
 error:  (Each undefined symbol is reported only once.)
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1812:
 error: undefined symbol `r7' (first use)
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1813:
 warning: `r8' is a register in 64-bit mode
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1813:
 error: undefined symbol `r8' (first use)
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1819:
 warning: `r8' is a register in 64-bit mode
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1869:
 warning: `r8' is a register in 64-bit mode
 h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1869:
 warning: `r8' is a register in 64-bit mode
 make[2]: *** [common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj] Error 1
 +

 --

 Fun and success!
 Mario *LigH* Rohkrämer
 mailto:cont...@ligh.de

 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 3] sao: created new primitive for saoCuStatsBO

2015-07-02 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435749680 -19800
#  Wed Jul 01 16:51:20 2015 +0530
# Node ID 9fd6c4bca7695f847ff9a28a065122b840ecae5a
# Parent  915d02816797d3c70004e652a13b3804571c251b
sao: created new primitive for saoCuStatsBO

diff -r 915d02816797 -r 9fd6c4bca769 source/common/primitives.h
--- a/source/common/primitives.hWed Jul 01 16:50:32 2015 +0530
+++ b/source/common/primitives.hWed Jul 01 16:51:20 2015 +0530
@@ -174,6 +174,7 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, 
intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, 
int ctuHeight, intptr_t stride);
 
+typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, 
int32_t *count);
@@ -299,6 +300,7 @@
 saoCuOrgE3_t  saoCuOrgE3[2];
 saoCuOrgB0_t  saoCuOrgB0;
 
+saoCuStatsBO_tsaoCuStatsBO;
 saoCuStatsE0_tsaoCuStatsE0;
 saoCuStatsE1_tsaoCuStatsE1;
 saoCuStatsE2_tsaoCuStatsE2;
diff -r 915d02816797 -r 9fd6c4bca769 source/encoder/sao.cpp
--- a/source/encoder/sao.cppWed Jul 01 16:50:32 2015 +0530
+++ b/source/encoder/sao.cppWed Jul 01 16:51:20 2015 +0530
@@ -666,7 +666,6 @@
 /* Calculate SAO statistics for current CTU without non-crossing slice */
 void SAO::calcSaoStatsCu(int addr, int plane)
 {
-int x, y;
 const CUData* cu = m_frame->m_encData->getPicCTU(addr);
 const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
 const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
@@ -697,8 +696,6 @@
 int startY;
 int endX;
 int endY;
-int32_t* stats;
-int32_t* count;
 
 int skipB = plane ? 2 : 4;
 int skipR = plane ? 3 : 5;
@@ -708,34 +705,16 @@
 
 // SAO_BO:
 {
-const int boShift = X265_DEPTH - SAO_BO_BITS;
-
 if (m_param-bSaoNonDeblocked)
 {
 skipB = plane ? 1 : 3;
 skipR = plane ? 2 : 4;
 }
-stats = m_offsetOrg[plane][SAO_BO];
-count = m_count[plane][SAO_BO];
-
-fenc = fenc0;
-rec  = rec0;
 
 endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
 endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
 
-for (y = 0; y < endY; y++)
-{
-for (x = 0; x < endX; x++)
-{
-int classIdx = 1 + (rec[x] >> boShift);
-stats[classIdx] += (fenc[x] - rec[x]);
-count[classIdx]++;
-}
-
-fenc += stride;
-rec += stride;
-}
+primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, 
m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
 }
 
 {
@@ -785,8 +764,6 @@
 skipB = plane ? 2 : 4;
 skipR = plane ? 3 : 5;
 }
-stats = m_offsetOrg[plane][SAO_EO_2];
-count = m_count[plane][SAO_EO_2];
 
 fenc = fenc0;
 rec  = rec0;
@@ -814,8 +791,6 @@
 skipB = plane ? 2 : 4;
 skipR = plane ? 3 : 5;
 }
-stats = m_offsetOrg[plane][SAO_EO_3];
-count = m_count[plane][SAO_EO_3];
 
 fenc = fenc0;
 rec  = rec0;
@@ -1552,6 +1527,25 @@
 }
 
 // NOTE: must put in namespace X265_NS since we need class SAO
+void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int 
endX, int endY, int32_t *stats, int32_t *count)
+{
+int x, y;
+const int boShift = X265_DEPTH - SAO_BO_BITS;
+
+for (y = 0; y < endY; y++)
+{
+for (x = 0; x < endX; x++)
+{
+int classIdx = 1 + (rec[x] >> boShift);
+stats[classIdx] += (fenc[x] - rec[x]);
+count[classIdx]++;
+}
+
+fenc += stride;
+rec += stride;
+}
+}
+
 void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int 
endX, int endY, int32_t *stats, int32_t *count)
 {
 int x, y;
@@ -1702,6 +1696,7 @@
 void setupSaoPrimitives_c(EncoderPrimitives &p)
 {
 // TODO: move other sao functions to here
+p.saoCuStatsBO = saoCuStatsBO_c;
 p.saoCuStatsE0 = saoCuStatsE0_c;
 p.saoCuStatsE1 = saoCuStatsE1_c;
 p.saoCuStatsE2 = saoCuStatsE2_c;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 2 of 3] sao: created new primitive for saoCuStatsE0

2015-07-02 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435749632 -19800
#  Wed Jul 01 16:50:32 2015 +0530
# Node ID 915d02816797d3c70004e652a13b3804571c251b
# Parent  18151ada638dd19843551e2a6d5d8b2cc9bd28be
sao: created new primitive for saoCuStatsE0

diff -r 18151ada638d -r 915d02816797 source/common/primitives.h
--- a/source/common/primitives.hWed Jul 01 16:49:24 2015 +0530
+++ b/source/common/primitives.hWed Jul 01 16:50:32 2015 +0530
@@ -174,6 +174,7 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, 
intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, 
int ctuHeight, intptr_t stride);
 
+typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, 
int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
@@ -298,6 +299,7 @@
 saoCuOrgE3_t  saoCuOrgE3[2];
 saoCuOrgB0_t  saoCuOrgB0;
 
+saoCuStatsE0_t        saoCuStatsE0;
 saoCuStatsE1_t        saoCuStatsE1;
 saoCuStatsE2_t        saoCuStatsE2;
 saoCuStatsE3_t        saoCuStatsE3;
diff -r 18151ada638d -r 915d02816797 source/encoder/sao.cpp
--- a/source/encoder/sao.cppWed Jul 01 16:49:24 2015 +0530
+++ b/source/encoder/sao.cppWed Jul 01 16:50:32 2015 +0530
@@ -706,11 +706,6 @@
 int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
 int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
-// Dynamic Range: 64x64x14bpp = 24bits
-int32_t tmp_stats[NUM_EDGETYPE];
-// TODO: improve by uint64_t, but need Haswell SHLX
-uint16_t tmp_count[NUM_EDGETYPE];
-
 // SAO_BO:
 {
 const int boShift = X265_DEPTH - SAO_BO_BITS;
@@ -752,41 +747,10 @@
 skipR = plane ? 3 : 5;
 }
 
-fenc = fenc0;
-rec  = rec0;
-
 startX = !lpelx;
 endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
 
-memset(tmp_stats, 0, sizeof(tmp_stats));
-memset(tmp_count, 0, sizeof(tmp_count));
-
-for (y = 0; y < ctuHeight - skipB; y++)
-{
-int signLeft = signOf(rec[startX] - rec[startX - 1]);
-for (x = startX; x < endX; x++)
-{
-int signRight = signOf2(rec[x], rec[x + 1]);
-X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), 
"signDown check failure\n");
-uint32_t edgeType = signRight + signLeft + 2;
-signLeft = -signRight;
-
-X265_CHECK(edgeType <= 4, "edgeType check failure\n");
-tmp_stats[edgeType] += (fenc[x] - rec[x]);
-tmp_count[edgeType]++;
-}
-
-fenc += stride;
-rec += stride;
-}
-
-stats = m_offsetOrg[plane][SAO_EO_0];
-count = m_count[plane][SAO_EO_0];
-for (x = 0; x < NUM_EDGETYPE; x++)
-{
-stats[s_eoTable[x]] += tmp_stats[x];
-count[s_eoTable[x]] += tmp_count[x];
-}
+primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, 
endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], 
m_count[plane][SAO_EO_0]);
 }
 
 // SAO_EO_1: // dir: |
@@ -1588,6 +1552,41 @@
 }
 
 // NOTE: must put in namespace X265_NS since we need class SAO
+void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int 
endX, int endY, int32_t *stats, int32_t *count)
+{
+int x, y;
+int32_t tmp_stats[SAO::NUM_EDGETYPE];
+int32_t tmp_count[SAO::NUM_EDGETYPE];
+
+memset(tmp_stats, 0, sizeof(tmp_stats));
+memset(tmp_count, 0, sizeof(tmp_count));
+
+for (y = 0; y < endY; y++)
+{
+int signLeft = signOf(rec[0] - rec[-1]);
+for (x = 0; x < endX; x++)
+{
+int signRight = signOf2(rec[x], rec[x + 1]);
+X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown 
check failure\n");
+uint32_t edgeType = signRight + signLeft + 2;
+signLeft = -signRight;
+
+X265_CHECK(edgeType <= 4, "edgeType check failure\n");
+tmp_stats[edgeType] += (fenc[x] - rec[x]);
+tmp_count[edgeType]++;
+}
+
+fenc += stride;
+rec += stride;
+}
+
+for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+{
+stats[SAO::s_eoTable[x]] += tmp_stats[x

[x265] [PATCH 1 of 3] sao: created new primitive for saoCuStatsE1

2015-07-02 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435749564 -19800
#  Wed Jul 01 16:49:24 2015 +0530
# Node ID 18151ada638dd19843551e2a6d5d8b2cc9bd28be
# Parent  76a314f91799c2dce6878c389503d2fe9007dbe8
sao: created new primitive for saoCuStatsE1

diff -r 76a314f91799 -r 18151ada638d source/common/primitives.h
--- a/source/common/primitives.hWed Jul 01 17:05:52 2015 -0700
+++ b/source/common/primitives.hWed Jul 01 16:49:24 2015 +0530
@@ -174,6 +174,7 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, 
intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, 
int ctuHeight, intptr_t stride);
 
+typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, 
int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t 
stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
@@ -297,6 +298,7 @@
 saoCuOrgE3_t  saoCuOrgE3[2];
 saoCuOrgB0_t  saoCuOrgB0;
 
+saoCuStatsE1_t        saoCuStatsE1;
 saoCuStatsE2_t        saoCuStatsE2;
 saoCuStatsE3_t        saoCuStatsE3;
 
diff -r 76a314f91799 -r 18151ada638d source/encoder/sao.cpp
--- a/source/encoder/sao.cppWed Jul 01 17:05:52 2015 -0700
+++ b/source/encoder/sao.cppWed Jul 01 16:49:24 2015 +0530
@@ -811,33 +811,7 @@
 
 primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
 
-memset(tmp_stats, 0, sizeof(tmp_stats));
-memset(tmp_count, 0, sizeof(tmp_count));
-
-for (y = startY; y < endY; y++)
-{
-for (x = 0; x < endX; x++)
-{
-int signDown = signOf2(rec[x], rec[x + stride]);
-X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), 
"signDown check failure\n");
-uint32_t edgeType = signDown + upBuff1[x] + 2;
-upBuff1[x] = (int8_t)(-signDown);
-
-tmp_stats[edgeType] += (fenc[x] - rec[x]);
-tmp_count[edgeType]++;
-}
-
-fenc += stride;
-rec += stride;
-}
-
-stats = m_offsetOrg[plane][SAO_EO_1];
-count = m_count[plane][SAO_EO_1];
-for (x = 0; x < NUM_EDGETYPE; x++)
-{
-stats[s_eoTable[x]] += tmp_stats[x];
-count[s_eoTable[x]] += tmp_count[x];
-}
+primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * 
stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], 
m_count[plane][SAO_EO_1]);
 }
 
 // SAO_EO_2: // dir: 135
@@ -1614,6 +1588,41 @@
 }
 
 // NOTE: must put in namespace X265_NS since we need class SAO
+void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, 
int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+{
+X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
+X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
+
+int x, y;
+int32_t tmp_stats[SAO::NUM_EDGETYPE];
+int32_t tmp_count[SAO::NUM_EDGETYPE];
+
+memset(tmp_stats, 0, sizeof(tmp_stats));
+memset(tmp_count, 0, sizeof(tmp_count));
+
+for (y = 0; y < endY; y++)
+{
+for (x = 0; x < endX; x++)
+{
+int signDown = signOf2(rec[x], rec[x + stride]);
+X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown 
check failure\n");
+uint32_t edgeType = signDown + upBuff1[x] + 2;
+upBuff1[x] = (int8_t)(-signDown);
+
+tmp_stats[edgeType] += (fenc[x] - rec[x]);
+tmp_count[edgeType]++;
+}
+fenc += stride;
+rec += stride;
+}
+
+for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+{
+stats[SAO::s_eoTable[x]] += tmp_stats[x];
+count[SAO::s_eoTable[x]] += tmp_count[x];
+}
+}
+
 void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, 
int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t 
*count)
 {
 X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
@@ -1694,6 +1703,7 @@
 void setupSaoPrimitives_c(EncoderPrimitives p)
 {
 // TODO: move other sao functions to here
+p.saoCuStatsE1 = saoCuStatsE1_c;
 p.saoCuStatsE2 = saoCuStatsE2_c;
 p.saoCuStatsE3 = saoCuStatsE3_c;
 }
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c

2015-06-30 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435663360 -19800
#  Tue Jun 30 16:52:40 2015 +0530
# Node ID 9340454d3b551f57ba9ce6a3f77fade041975e62
# Parent  b1301944894051b9641006797e4d6253b277f3e4
asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c

diff -r b13019448940 -r 9340454d3b55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Jun 30 16:52:40 2015 +0530
@@ -1290,6 +1290,8 @@
 }
 if (cpuMask & X265_CPU_AVX2)
 {
+p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
 p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
 p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
@@ -2619,6 +2621,8 @@
 #if X86_64
 if (cpuMask & X265_CPU_AVX2)
 {
+p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
 p.planecopy_sp = PFX(downShift_16_avx2);
 
 p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jun 30 16:52:40 2015 +0530
@@ -77,6 +77,7 @@
 
 intra_filter4_shuf0:db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7, 
 8,  9, 10 ,11, 12, 13
 intra_filter4_shuf1:db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7, 
 8,  9, 10 ,11, 12, 13
+intra_filter4_shuf2:times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 
10, 11, 12, 13, 14, 15
 
 ;; (blkSize - 1 - x)
 pw_planar4_0:   dw  3,  2,  1,  0,  3,  2,  1,  0
@@ -22047,3 +22048,29 @@
 mov [r1 + 128], r2w ; topLast
 mov [r1 + 256], r3w ; LeftLast
 RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4
+mov r2w, word [r0 + 16] ; topLast
+mov r3w, word [r0 + 32] ; LeftLast
+
+; filtering top
+movum0, [r0]
+vpbroadcastwm2, xm0
+movum1, [r0 + 16]
+
+palignr m3, m0, m2, 14  ; [6 5 4 3 2 1 0 0] [14 13 12 
11 10 9 8 0]
+pshufb  m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 
11 10 9 0 9] samples[i - 1]
+palignr m1, m0, 4   ; [9 8 7 6 5 4 3 2]
+palignr m1, m1, 14  ; [9 8 7 6 5 4 3 2]
+
+psllw   m0, 1
+paddw   m3, m1
+paddw   m0, m3
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+movu[r1], m0
+mov [r1 + 16], r2w  ; topLast
+mov [r1 + 32], r3w  ; LeftLast
+RET
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Tue Jun 30 16:52:40 2015 +0530
@@ -30,8 +30,9 @@
 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 
7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0
 
-intra_filter4_shuf0:  db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
-intra_filter4_shuf1:  db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  
9, 10, 11, 12, 13
+intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  
9, 10, 11, 12, 13
+intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 
11, 12, 13, 14, 15
 
 pb_0_8times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
@@ -18690,3 +18691,32 @@
 mov [r1 +  64], r2b ; topLast
 mov [r1 + 128], r3b ; LeftLast
 RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4
+mov r2b, byte [r0 +  8] ; topLast
+mov r3b, byte [r0 + 16] ; LeftLast
+
+; filtering top
+pmovzxbwm0, [r0]
+vpbroadcastwm2, xm0
+pmovzxbwm1, [r0 + 8]
+
+palignr m3, m0, m2, 14  ; [6 5 4 3 2 1 0 0] [14 13 12 
11 10 9 8 0]
+pshufb  m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 
11 10 9 0 9] samples[i - 1]
+palignr m1, m0, 4   ; [9 8 7 6 5 4 3 2]
+palignr m1, m1, 14  ; [9 8 7 6 5 4 3 2]
+
+psllw   m0, 1
+paddw   m3, m1
+paddw   m0, m3
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+packuswbm0, m0
+vpermq  m0, m0, 10001000b
+
+movu[r1], xm0
+mov [r1 +  8], r2b  ; topLast
+mov [r1 + 16], r3b  ; LeftLast
+RET

[x265] [PATCH] asm: intra_filter 10bpp sse4 code

2015-06-29 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435578547 -19800
#  Mon Jun 29 17:19:07 2015 +0530
# Node ID 60832369ebb4e1014b4080b27a0401f97af93958
# Parent  9feee64efa440c25f016d15ae982789e5393a77e
asm: intra_filter 10bpp sse4 code

Performance improved over C code:
intra_filter_32x32 7.46x525.64  3922.56
intra_filter_16x16 6.53x289.11  1886.86
intra_filter_8x8   5.60x170.75  956.81
intra_filter_4x4   3.05x121.20  369.74

diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jun 29 17:19:07 2015 +0530
@@ -1120,6 +1120,11 @@
 ALL_LUMA_PU(satd, pixel_satd, sse4);
 ASSIGN_SA8D(sse4);
 
+p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+
 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
 INTRA_ANG_SSE4_COMMON(sse4);
diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530
@@ -75,6 +75,9 @@
 const pw_ang16_13,  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0, 
 0,  0,  0,  0,  0,  0
 const pw_ang16_16,  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9, 
 6,  7,  2,  3,  0,  1
 
+intra_filter4_shuf0:db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7, 
 8,  9, 10 ,11, 12, 13
+intra_filter4_shuf1:db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7, 
 8,  9, 10 ,11, 12, 13
+
 ;; (blkSize - 1 - x)
 pw_planar4_0:   dw  3,  2,  1,  0,  3,  2,  1,  0
 
@@ -21634,3 +21637,413 @@
 decr4
 jnz.loop
 RET
+
+;---
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;---
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5
+mov r2w, word [r0 + 16] ; topLast
+mov r3w, word [r0 + 32] ; LeftLast
+
+; filtering top
+movum0, [r0 +  0]
+movum1, [r0 + 16]
+movum2, [r0 + 32]
+
+pshufb  m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] 
samples[i - 1]
+palignr m3, m1, m0, 4
+pshufb  m3, [intra_filter4_shuf1]   ; [8 7 6 5 4 3 2 9] 
samples[i + 1]
+
+psllw   m0, 1
+paddw   m4, m3
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+; filtering left
+palignr m4, m1, m1, 14
+pinsrw  m4, [r0], 1
+palignr m3, m2, m1, 4
+pshufb  m3, [intra_filter4_shuf1]
+
+psllw   m1, 1
+paddw   m4, m3
+paddw   m1, m4
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+movu[r1], m0
+movu[r1 + 16], m1
+mov [r1 + 16], r2w  ; topLast
+mov [r1 + 32], r3w  ; LeftLast
+RET
+
+INIT_XMM sse4
+cglobal intra_filter_8x8, 2,4,6
+mov r2w, word [r0 + 32] ; topLast
+mov r3w, word [r0 + 64] ; LeftLast
+
+; filtering top
+movum0, [r0]
+movum1, [r0 + 16]
+movum2, [r0 + 32]
+
+pshufb  m4, m0, [intra_filter4_shuf0]
+palignr m5, m1, m0, 2
+pinsrw  m5, [r0 + 34], 0
+
+palignr m3, m1, m0, 14
+psllw   m0, 1
+paddw   m4, m5
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+palignr m4, m2, m1, 2
+psllw   m1, 1
+paddw   m4, m3
+paddw   m1, m4
+paddw   m1, [pw_2]
+psrlw   m1, 2
+movu[r1], m0
+movu[r1 + 16], m1
+
+; filtering left
+movum1, [r0 + 48]
+movum0, [r0 + 64]
+
+palignr m4, m2, m2, 14
+pinsrw  m4, [r0], 1
+palignr m5, m1, m2, 2
+
+palignr m3, m1, m2, 14
+palignr m0, m1, 2
+
+psllw   m2, 1
+paddw   m4, m5
+paddw   m2, m4
+paddw   m2, [pw_2]
+psrlw   m2, 2
+
+psllw   m1, 1
+paddw   m0, m3
+paddw   m1, m0
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+movu[r1 + 32], m2
+movu

[x265] [PATCH] asm: fix gcc build error, invalid size for operand 1

2015-06-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435307390 -19800
#  Fri Jun 26 13:59:50 2015 +0530
# Node ID 504a42904fab2a43e4d8b5b65513db7a7dd30ee1
# Parent  1e5c4d155ab85e8e8dd199bb3515801766ea9e88
asm: fix gcc build error, invalid size for operand 1

diff -r 1e5c4d155ab8 -r 504a42904fab source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Thu Jun 25 13:42:29 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Fri Jun 26 13:59:50 2015 +0530
@@ -246,7 +246,7 @@
 movdxm1, r1d
 vinserti128 m0, m0, xm1, 1
 movam5, [pw_1023]
-mov r1, r4m
+mov r1d, r4m
 add r1d, r1d
 shr r2d, 4
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 4] asm: intra_filter8x8 sse4 code, improved 990c->201c over C code

2015-06-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435323520 -19800
#  Fri Jun 26 18:28:40 2015 +0530
# Node ID 93c31f8b404708cd39d00b85a07b2418794fc103
# Parent  44b574b61b29a3cfba99e8f0d06622e44a86df17
asm: intra_filter8x8 sse4 code, improved 990c->201c over C code

diff -r 44b574b61b29 -r 93c31f8b4047 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:21:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:28:40 2015 +0530
@@ -2454,6 +2454,7 @@
 p.weight_sp = PFX(weight_sp_sse4);
 
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
 
 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 44b574b61b29 -r 93c31f8b4047 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Jun 26 18:21:07 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Jun 26 18:28:40 2015 +0530
@@ -18320,3 +18320,63 @@
 mov [r1 +  8], r2b  ; topLast
 mov [r1 + 16], r3b  ; LeftLast
 RET
+
+INIT_XMM sse4
+cglobal intra_filter_8x8, 2,4,6
+mov r2b, byte [r0 + 16] ; topLast
+mov r3b, byte [r0 + 32] ; LeftLast
+
+; filtering top
+pmovzxbwm0, [r0 +  0]
+pmovzxbwm1, [r0 +  8]
+pmovzxbwm2, [r0 + 16]
+
+pshufb  m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] 
samples[i - 1]
+palignr m5, m1, m0, 2
+pinsrb  m5, [r0 + 17], 0; [8 7 6 5 4 3 2 9] 
samples[i + 1]
+
+palignr m3, m1, m0, 14
+psllw   m0, 1
+paddw   m4, m5
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+palignr m4, m2, m1, 2
+psllw   m1, 1
+paddw   m4, m3
+paddw   m1, m4
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+packuswbm0, m1
+movu[r1], m0
+
+; filtering left
+pmovzxbwm1, [r0 + 24]
+pmovzxbwm0, [r0 + 32]
+
+palignr m4, m2, m2, 14
+pinsrb  m4, [r0], 2
+palignr m5, m1, m2, 2
+
+palignr m3, m1, m2, 14
+palignr m0, m1, 2
+
+psllw   m2, 1
+paddw   m4, m5
+paddw   m2, m4
+paddw   m2, [pw_2]
+psrlw   m2, 2
+
+psllw   m1, 1
+paddw   m0, m3
+paddw   m1, m0
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+packuswbm2, m1
+movu[r1 + 16], m2
+mov [r1 + 16], r2b  ; topLast
+mov [r1 + 32], r3b  ; LeftLast
+RET
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 4] asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code

2015-06-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435323067 -19800
#  Fri Jun 26 18:21:07 2015 +0530
# Node ID 44b574b61b29a3cfba99e8f0d06622e44a86df17
# Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c 
over C code

diff -r d64227e54233 -r 44b574b61b29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:21:07 2015 +0530
@@ -2453,6 +2453,8 @@
 p.weight_pp = PFX(weight_pp_sse4);
 p.weight_sp = PFX(weight_sp_sse4);
 
+p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+
 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
 ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Jun 26 18:21:07 2015 +0530
@@ -66,6 +66,7 @@
 
 #define DECL_ALL(cpu) \
 FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel 
*filtPix, int bLuma); \
+FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel 
*filtered); \
 DECL_ANGS(4, cpu); \
 DECL_ANGS(8, cpu); \
 DECL_ANGS(16, cpu); \
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Jun 26 18:21:07 2015 +0530
@@ -30,6 +30,9 @@
 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 
7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0
 
+intra_filter4_shuf0:  db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf1:  db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+
 pb_0_8times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
 pb_swap8: times 2 db  7,  6,  5,  4,  3,  2,  1,  0
@@ -18276,3 +18279,44 @@
 
 INTRA_PRED_STORE_4x4
 RET
+
+;---
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;---
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5
+mov r2b, byte [r0 +  8] ; topLast
+mov r3b, byte [r0 + 16] ; LeftLast
+
+; filtering top
+pmovzxbwm0, [r0 +  0]
+pmovzxbwm1, [r0 +  8]
+pmovzxbwm2, [r0 + 16]
+
+pshufb  m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] 
samples[i - 1]
+palignr m3, m1, m0, 4
+pshufb  m3, [intra_filter4_shuf1]   ; [8 7 6 5 4 3 2 9] 
samples[i + 1]
+
+psllw   m0, 1
+paddw   m4, m3
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+; filtering left
+palignr m4, m1, m1, 14  ; [14 13 12 11 10 9 8 15] 
samples[i - 1]
+pinsrb  m4, [r0], 2 ; [14 13 12 11 10 9 0 15] 
samples[i + 1]
+palignr m3, m2, m1, 4
+pshufb  m3, [intra_filter4_shuf1]
+
+psllw   m1, 1
+paddw   m4, m3
+paddw   m1, m4
+paddw   m1, [pw_2]
+psrlw   m1, 2
+packuswbm0, m1
+
+movu[r1], m0
+mov [r1 +  8], r2b  ; topLast
+mov [r1 + 16], r3b  ; LeftLast
+RET
diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp  Thu Jun 25 16:25:51 2015 +0530
+++ b/source/test/intrapredharness.cpp  Fri Jun 26 18:21:07 2015 +0530
@@ -31,6 +31,16 @@
 {
 for (int i = 0; i  INPUT_SIZE; i++)
 pixel_buff[i] = rand() % PIXEL_MAX;
+
+/* [0] --- Random values
+ * [1] --- Minimum
+ * [2] --- Maximum */
+for (int i = 0; i  BUFFSIZE; i++)
+{
+pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
+pixel_test_buff[1][i]   = PIXEL_MIN;
+pixel_test_buff[2][i]   = PIXEL_MAX;
+}
 }
 
 bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, 
int width)
@@ -177,6 +187,27 @@
 return true;
 }
 
+bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, 
const intra_filter_t opt)
+{
+memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel));
+memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel));
+int j = 0;
+
+for (int i = 0; i  100; i++)
+{
+int index = rand() % TEST_CASES;
+
+ref(pixel_test_buff[index] + j, pixel_out_c);
+checked(opt, pixel_test_buff[index] + j, pixel_out_vec);
+
+if (memcmp(pixel_out_c

[x265] [PATCH 4 of 4] asm: intra_filter32x32 sse4 code, improved 4050c->652c over C code

2015-06-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435323958 -19800
#  Fri Jun 26 18:35:58 2015 +0530
# Node ID e04bde60af516f6f016e3e6f37d5d64e97e589f3
# Parent  1995a55f1320a029fb423f23cbfd24555c258d09
asm: intra_filter32x32 sse4 code, improved 4050c->652c over C code

diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:32:00 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:35:58 2015 +0530
@@ -2456,6 +2456,7 @@
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
 p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
 p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
 
 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Jun 26 18:32:00 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Jun 26 18:35:58 2015 +0530
@@ -18485,3 +18485,208 @@
 mov [r1 + 32], r2b  ; topLast
 mov [r1 + 64], r3b  ; LeftLast
 RET
+
+INIT_XMM sse4
+cglobal intra_filter_32x32, 2,4,6
+mov r2b, byte [r0 +  64]; topLast
+mov r3b, byte [r0 + 128]; LeftLast
+
+; filtering top
+; 0 to 15
+pmovzxbwm0, [r0 +  0]
+pmovzxbwm1, [r0 +  8]
+pmovzxbwm2, [r0 + 16]
+
+pshufb  m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] 
samples[i - 1]
+palignr m5, m1, m0, 2
+pinsrb  m5, [r0 + 65], 0; [8 7 6 5 4 3 2 9] 
samples[i + 1]
+
+palignr m3, m1, m0, 14
+psllw   m0, 1
+paddw   m4, m5
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+palignr m4, m2, m1, 2
+psllw   m5, m1, 1
+paddw   m4, m3
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm0, m5
+movu[r1], m0
+
+; 16 to 31
+pmovzxbwm0, [r0 + 24]
+pmovzxbwm5, [r0 + 32]
+
+palignr m3, m2, m1, 14
+palignr m4, m0, m2, 2
+
+psllw   m1, m2, 1
+paddw   m3, m4
+paddw   m1, m3
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+palignr m3, m0, m2, 14
+palignr m4, m5, m0, 2
+
+psllw   m2, m0, 1
+paddw   m4, m3
+paddw   m2, m4
+paddw   m2, [pw_2]
+psrlw   m2, 2
+packuswbm1, m2
+movu[r1 + 16], m1
+
+; 32 to 47
+pmovzxbwm1, [r0 + 40]
+pmovzxbwm2, [r0 + 48]
+
+palignr m3, m5, m0, 14
+palignr m4, m1, m5, 2
+
+psllw   m0, m5, 1
+paddw   m3, m4
+paddw   m0, m3
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+palignr m3, m1, m5, 14
+palignr m4, m2, m1, 2
+
+psllw   m5, m1, 1
+paddw   m4, m3
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm0, m5
+movu[r1 + 32], m0
+
+; 48 to 63
+pmovzxbwm0, [r0 + 56]
+pmovzxbwm5, [r0 + 64]
+
+palignr m3, m2, m1, 14
+palignr m4, m0, m2, 2
+
+psllw   m1, m2, 1
+paddw   m3, m4
+paddw   m1, m3
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+palignr m3, m0, m2, 14
+palignr m4, m5, m0, 2
+
+psllw   m0, 1
+paddw   m4, m3
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+packuswbm1, m0
+movu[r1 + 48], m1
+
+; filtering left
+; 64 to 79
+pmovzxbwm1, [r0 + 72]
+pmovzxbwm2, [r0 + 80]
+
+palignr m4, m5, m5, 14
+pinsrb  m4, [r0], 2
+palignr m0, m1, m5, 2
+
+psllw   m3, m5, 1
+paddw   m4, m0
+paddw   m3, m4
+paddw   m3, [pw_2]
+psrlw   m3, 2
+
+palignr m0, m1, m5, 14
+palignr m4, m2, m1, 2
+
+psllw   m5, m1, 1
+paddw   m4, m0
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm3, m5
+movu[r1 + 64], m3
+
+; 80 to 95
+pmovzxbwm5, [r0 + 88]
+pmovzxbwm0, [r0 + 96]
+
+palignr m3, m2, m1, 14
+palignr m4, m5, m2, 2
+
+psllw   m1, m2, 1
+paddw   m3, m4
+paddw   m1, m3
+paddw

[x265] [PATCH 0 of 4 ] asm code and testbench support for intra_filter primitive

2015-06-26 Thread dnyaneshwar
intra_filter_4x4 2.52x141.82  357.20
intra_filter_8x8 4.79x198.79  951.41
intra_filter_16x16   5.56x351.03  1952.17
intra_filter_32x32   6.20x652.82  4050.76
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 4] asm: intra_filter16x16 sse4 code, improved 1952c->351c over C code

2015-06-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435323720 -19800
#  Fri Jun 26 18:32:00 2015 +0530
# Node ID 1995a55f1320a029fb423f23cbfd24555c258d09
# Parent  93c31f8b404708cd39d00b85a07b2418794fc103
asm: intra_filter16x16 sse4 code, improved 1952c->351c over C code

diff -r 93c31f8b4047 -r 1995a55f1320 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:28:40 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 18:32:00 2015 +0530
@@ -2455,6 +2455,7 @@
 
 p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
 p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
 
 ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
 ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 93c31f8b4047 -r 1995a55f1320 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Jun 26 18:28:40 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Fri Jun 26 18:32:00 2015 +0530
@@ -18380,3 +18380,108 @@
 mov [r1 + 16], r2b  ; topLast
 mov [r1 + 32], r3b  ; LeftLast
 RET
+
+INIT_XMM sse4
+cglobal intra_filter_16x16, 2,4,6
+mov r2b, byte [r0 + 32] ; topLast
+mov r3b, byte [r0 + 64] ; LeftLast
+
+; filtering top
+pmovzxbwm0, [r0 +  0]
+pmovzxbwm1, [r0 +  8]
+pmovzxbwm2, [r0 + 16]
+
+pshufb  m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] 
samples[i - 1]
+palignr m5, m1, m0, 2
+pinsrb  m5, [r0 + 33], 0; [8 7 6 5 4 3 2 9] 
samples[i + 1]
+
+palignr m3, m1, m0, 14
+psllw   m0, 1
+paddw   m4, m5
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+
+palignr m4, m2, m1, 2
+psllw   m5, m1, 1
+paddw   m4, m3
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm0, m5
+movu[r1], m0
+
+pmovzxbwm0, [r0 + 24]
+pmovzxbwm5, [r0 + 32]
+
+palignr m3, m2, m1, 14
+palignr m4, m0, m2, 2
+
+psllw   m1, m2, 1
+paddw   m3, m4
+paddw   m1, m3
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+palignr m3, m0, m2, 14
+palignr m4, m5, m0, 2
+
+psllw   m0, 1
+paddw   m4, m3
+paddw   m0, m4
+paddw   m0, [pw_2]
+psrlw   m0, 2
+packuswbm1, m0
+movu[r1 + 16], m1
+
+; filtering left
+pmovzxbwm1, [r0 + 40]
+pmovzxbwm2, [r0 + 48]
+
+palignr m4, m5, m5, 14
+pinsrb  m4, [r0], 2
+palignr m0, m1, m5, 2
+
+psllw   m3, m5, 1
+paddw   m4, m0
+paddw   m3, m4
+paddw   m3, [pw_2]
+psrlw   m3, 2
+
+palignr m0, m1, m5, 14
+palignr m4, m2, m1, 2
+
+psllw   m5, m1, 1
+paddw   m4, m0
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm3, m5
+movu[r1 + 32], m3
+
+pmovzxbwm5, [r0 + 56]
+pmovzxbwm0, [r0 + 64]
+
+palignr m3, m2, m1, 14
+palignr m4, m5, m2, 2
+
+psllw   m1, m2, 1
+paddw   m3, m4
+paddw   m1, m3
+paddw   m1, [pw_2]
+psrlw   m1, 2
+
+palignr m3, m5, m2, 14
+palignr m4, m0, m5, 2
+
+psllw   m5, 1
+paddw   m4, m3
+paddw   m5, m4
+paddw   m5, [pw_2]
+psrlw   m5, 2
+packuswbm1, m5
+movu[r1 + 48], m1
+
+mov [r1 + 32], r2b  ; topLast
+mov [r1 + 64], r3b  ; LeftLast
+RET
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 6] asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE

2015-06-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435212794 -19800
#  Thu Jun 25 11:43:14 2015 +0530
# Node ID faec09e1ab60531924f2d919d4f283fa91bfec81
# Parent  b1af4c36f48a4500a4912373ebcda9a5540b5c15
asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE

diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp  Thu Jun 25 11:43:14 2015 +0530
@@ -1284,6 +1284,8 @@
 }
 if (cpuMask & X265_CPU_AVX2)
 {
+p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
+
 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
 p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/const-a.asm Thu Jun 25 11:43:14 2015 +0530
@@ -41,7 +41,7 @@
 const pb_16,times 32 db 16
 const pb_32,times 32 db 32
 const pb_64,times 32 db 64
-const pb_128,   times 16 db 128
+const pb_128,   times 32 db 128
 const pb_a1,times 16 db 0xa1
 
 const pb_01,times  8 db   0,   1
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/loopfilter.asm  Thu Jun 25 11:43:14 2015 +0530
@@ -235,6 +235,67 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE0(pixel *rec, int8_t *offsetEo, int lcuWidth,
+;                 int8_t *signLeft, intptr_t stride)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path: processes two rows at once,
+; 16 pixels per row per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE0, 4,4,9
+    vbroadcasti128  m6, [r1]                ; m6 = offsetEo table in both lanes
+    movzx           r1d, byte [r3]
+    neg             r1b                     ; seed for row 0: -signLeft[0]
+    movd            xm0, r1d
+    movzx           r1d, byte [r3 + 1]
+    neg             r1b                     ; seed for row 1: -signLeft[1]
+    movd            xm1, r1d
+    vinserti128     m0, m0, xm1, 1          ; lane 0 = row-0 seed, lane 1 = row-1 seed
+    mova            m5, [pw_1023]           ; 10-bit clamp ceiling
+    mov             r1, r4m
+    add             r1d, r1d                ; stride in bytes (16-bit pixels)
+    shr             r2d, 4                  ; iterations = lcuWidth / 16
+
+.loop:
+    movu            m7, [r0]                ; row 0: rec[x]
+    movu            m8, [r0 + r1]           ; row 1: rec[x]
+    movu            m2, [r0 + 2]            ; row 0: rec[x + 1]
+    movu            m1, [r0 + r1 + 2]       ; row 1: rec[x + 1]
+
+    pcmpgtw         m3, m7, m2
+    pcmpgtw         m2, m7
+    pcmpgtw         m4, m8, m1
+    pcmpgtw         m1, m8
+
+    packsswb        m3, m4                  ; rec[x] > rec[x+1], both rows
+    packsswb        m2, m1                  ; rec[x+1] > rec[x], both rows
+    vpermq          m3, m3, 11011000b       ; undo packsswb's 128-bit lane interleave
+    vpermq          m2, m2, 11011000b
+
+    pand            m3, [pb_1]
+    por             m3, m2                  ; m3 = signRight in {-1, 0, 1}
+
+    pslldq          m2, m3, 1
+    por             m2, m0                  ; shift in previous iteration's carry sign
+
+    psignb          m2, [pb_128]            ; m2 = signLeft
+    pxor            m0, m0
+    palignr         m0, m3, 15              ; carry the last sign byte to next iteration
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; m3 = uiEdgeType
+    pshufb          m2, m6, m3
+    pmovsxbw        m3, xm2                 ; offsetEo, low 16 pixels
+    vextracti128    xm2, m2, 1
+    pmovsxbw        m2, xm2                 ; offsetEo, high 16 pixels
+    pxor            m4, m4
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m4                  ; clamp result to [0, 1023]
+    pmaxsw          m8, m4
+    pminsw          m7, m5
+    pminsw          m8, m5
+    movu            [r0], m7
+    movu            [r0 + r1], m8
+
+    add             r0q, 32
+    dec             r2d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
 
 mov r4d,r4m
@@ -287,6 +348,7 @@
 sub r2d,16
 jnz .loop
 RET
+%endif
 
 
;==
 ; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int 
iStride, Int iLcuWidth)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 4 of 6] asm: 10bpp AVX2 code for saoCuOrgE2

2015-06-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435213857 -19800
#  Thu Jun 25 12:00:57 2015 +0530
# Node ID 8b680fd502e08ec2cab4fff7f5833791bb5bfeef
# Parent  f43aa44673dcd8e96581c938cf22ad4bbb7657e3
asm: 10bpp AVX2 code for saoCuOrgE2

SAO_EO_2[0] 207c -> 166c
SAO_EO_2[1] 555c -> 422c

diff -r f43aa44673dc -r 8b680fd502e0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 11:54:22 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jun 25 12:00:57 2015 +0530
@@ -1287,6 +1287,8 @@
 p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
 p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
+p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2);
+p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
 
 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r f43aa44673dc -r 8b680fd502e0 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Thu Jun 25 11:54:22 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Jun 25 12:00:57 2015 +0530
@@ -948,6 +948,55 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE2(pixel *rec, int8_t *bufft, int8_t *buff1,
+;                 int8_t *offsetEo, int lcuWidth, intptr_t stride)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path, 16 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE2, 6,6,7
+    mov             r4d, r4m
+    add             r5d, r5d                ; stride in bytes (16-bit pixels)
+    inc             r1
+    movq            xm4, [r0 + r4 * 2]      ; save trailing pixels; restored after loop
+    movhps          xm4, [r1 + r4]
+    vbroadcasti128  m5, [r3]                ; offsetEo table in both lanes
+    mova            m6, [pw_1023]           ; 10-bit clamp ceiling
+.loop:
+    movu            m1, [r0]                ; rec[x]
+    movu            m3, [r0 + r5 + 2]       ; rec[x + stride + 1]
+
+    pcmpgtw         m2, m1, m3
+    pcmpgtw         m3, m1
+
+    packsswb        m2, m3
+    vpermq          m3, m2, 11011101b
+    vpermq          m2, m2, 10001000b
+
+    pand            xm2, [pb_1]
+    por             xm2, xm3                ; xm2 = signDown in {-1, 0, 1}
+
+    movu            xm3, [r2]               ; buff1
+
+    paddb           xm3, xm2
+    paddb           xm3, [pb_2]             ; edge-type index
+    pshufb          xm0, xm5, xm3
+    pmovsxbw        m3, xm0                 ; offsetEo per pixel
+
+    pxor            m0, m0
+    paddw           m1, m3
+    pmaxsw          m1, m0
+    pminsw          m1, m6                  ; clamp to [0, 1023]
+    movu            [r0], m1
+
+    psubb           xm0, xm2                ; -signDown for the next row's buffer
+    movu            [r1], xm0
+
+    add             r0, 32
+    add             r1, 16
+    add             r2, 16
+    sub             r4, 16
+    jg              .loop
+
+    movq            [r0 + r4 * 2], xm4      ; restore saved trailing pixels
+    movhps          [r1 + r4], xm4
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth
 movr4d,   r4m
 movr5d,   r5m
@@ -987,8 +1036,70 @@
 movq   [r0 + r4], xm6
 movhps [r1 + r4], xm6
 RET
+%endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE2(pixel *rec, int8_t *bufft, int8_t *buff1,
+;                 int8_t *offsetEo, int lcuWidth, intptr_t stride)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path, 32 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE2_32, 6,6,8
+    mov             r4d, r4m
+    add             r5d, r5d                ; stride in bytes (16-bit pixels)
+    inc             r1
+    movq            xm4, [r0 + r4 * 2]      ; save trailing pixels; restored after loop
+    movhps          xm4, [r1 + r4]
+    vbroadcasti128  m5, [r3]                ; offsetEo table in both lanes
+
+.loop:
+    movu            m1, [r0]                ; rec[x],  low 16 pixels
+    movu            m7, [r0 + 32]           ; rec[x],  high 16 pixels
+    movu            m3, [r0 + r5 + 2]       ; rec[x + stride + 1], low
+    movu            m6, [r0 + r5 + 34]      ; rec[x + stride + 1], high
+
+    pcmpgtw         m2, m1, m3
+    pcmpgtw         m0, m7, m6
+    pcmpgtw         m3, m1
+    pcmpgtw         m6, m7
+
+    packsswb        m2, m0
+    packsswb        m3, m6
+    vpermq          m3, m3, 11011000b       ; undo packsswb's lane interleave
+    vpermq          m2, m2, 11011000b
+
+    pand            m2, [pb_1]
+    por             m2, m3                  ; m2 = signDown in {-1, 0, 1}
+
+    movu            m3, [r2]                ; buff1
+
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; edge-type index
+    pshufb          m0, m5, m3
+
+    pmovsxbw        m3, xm0                 ; offsetEo, low 16 pixels
+    vextracti128    xm0, m0, 1
+    pmovsxbw        m6, xm0                 ; offsetEo, high 16 pixels
+
+    pxor            m0, m0
+    paddw           m1, m3
+    paddw           m7, m6
+    pmaxsw          m1, m0                  ; clamp to [0, 1023]
+    pmaxsw          m7, m0
+    pminsw          m1, [pw_1023]
+    pminsw          m7, [pw_1023]
+    movu            [r0], m1
+    movu            [r0 + 32], m7
+
+    psubb           m0, m2                  ; -signDown for the next row's buffer
+    movu            [r1], m0
+
+    add             r0, 64
+    add             r1, 32
+    add             r2, 32
+    sub             r4, 32
+    jg              .loop
+
+    movq            [r0 + r4 * 2], xm4      ; restore saved trailing pixels
+    movhps          [r1 + r4], xm4
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
 mov r4d,   r4m
 mov r5d,   r5m
@@ -1040,6 +1151,7 @@
 movq[r0 + r4], xm6
 movhps  [r1 + r4], xm6
 RET
+%endif
 
 
;===
 ;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t 
stride, int startX, int endX)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 3 of 6] asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c -> 614c over SSE

2015-06-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435213462 -19800
#  Thu Jun 25 11:54:22 2015 +0530
# Node ID f43aa44673dcd8e96581c938cf22ad4bbb7657e3
# Parent  31da07b7198ca730bae37577d5053a3337477f7b
asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c -> 614c over SSE

diff -r 31da07b7198c -r f43aa44673dc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jun 25 11:54:22 2015 +0530
@@ -1286,6 +1286,7 @@
 {
 p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
 p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
+p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
 
 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r 31da07b7198c -r f43aa44673dc source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Jun 25 11:54:22 2015 +0530
@@ -728,6 +728,62 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo,
+;                       intptr_t iStride, int iLcuWidth)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path: two rows per pass, 16 pixels/iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE1_2Rows, 4,5,8
+    add             r3d, r3d                ; stride in bytes (16-bit pixels)
+    mov             r4d, r4m
+    mova            m4, [pw_1023]           ; 10-bit clamp ceiling
+    vbroadcasti128  m6, [r2]                ; m6 = m_iOffsetEo
+    shr             r4d, 4                  ; iterations = iLcuWidth / 16
+.loop:
+    movu            m7, [r0]                ; row 0
+    movu            m5, [r0 + r3]           ; row 1
+    movu            m1, [r0 + r3 * 2]       ; row 2 (below row 1)
+
+    pcmpgtw         m2, m7, m5
+    pcmpgtw         m3, m5, m7
+    pcmpgtw         m0, m5, m1
+    pcmpgtw         m1, m5
+
+    packsswb        m2, m0
+    packsswb        m3, m1
+    vpermq          m2, m2, 11011000b       ; undo packsswb's lane interleave
+    vpermq          m3, m3, 11011000b
+
+    pand            m2, [pb_1]
+    por             m2, m3                  ; signDown for both rows
+
+    movu            xm3, [r1]               ; m3 = m_iUpBuff1
+    pxor            m0, m0
+    psubb           m1, m0, m2              ; -signDown
+    vinserti128     m3, m3, xm1, 1          ; row 1 uses row 0's -signDown as up-sign
+    vextracti128    [r1], m1, 1             ; store up-buffer for the next row pair
+
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; edge-type index
+
+    pshufb          m1, m6, m3
+    pmovsxbw        m3, xm1                 ; offsetEo, row 0
+    vextracti128    xm1, m1, 1
+    pmovsxbw        m1, xm1                 ; offsetEo, row 1
+
+    paddw           m7, m3
+    paddw           m5, m1
+
+    pmaxsw          m7, m0                  ; clamp to [0, 1023]
+    pmaxsw          m5, m0
+    pminsw          m7, m4
+    pminsw          m5, m4
+
+    movu            [r0], m7
+    movu            [r0 + r3],  m5
+
+    add             r0, 32
+    add             r1, 16
+    dec             r4d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, 
iLcuWidth
 mov r3d,r3m
 mov r4d,r4m
@@ -775,6 +831,7 @@
 dec r4d
 jnz .loop
 RET
+%endif
 
 
;==
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * 
offsetEo, int lcuWidth, intptr_t stride)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 6 of 6] asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c -> 15595c over SSE

2015-06-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435219949 -19800
#  Thu Jun 25 13:42:29 2015 +0530
# Node ID f1ff5636cba3e2b714ceed86261362a53e8c6aca
# Parent  85d5582eedd40e4227131bff366235e6dc2b361a
asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c -> 15595c over SSE

diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 12:11:45 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jun 25 13:42:29 2015 +0530
@@ -1291,6 +1291,7 @@
 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
 p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2);
 p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2);
+p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2);
 
 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Thu Jun 25 12:11:45 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Jun 25 13:42:29 2015 +0530
@@ -1643,6 +1643,89 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; saoCuOrgB0 — band-offset SAO, HIGH_BIT_DEPTH (10bpp) AVX2 path.
+; r0 = rec, r1 = offset table (32 bytes), r2 = width, r3 = height,
+; r4m = stride — presumably matching the 8bpp prototype below; verify.
+; Two rows per iteration; a trailing odd row is handled in .loopW1.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgB0, 5,7,8
+    vbroadcasti128  m3, [r1]                ; offset table, entries 0-15
+    vbroadcasti128  m4, [r1 + 16]           ; offset table, entries 16-31
+    add             r4d, r4d                ; stride in bytes (16-bit pixels)
+    lea             r1, [r4 * 2]
+    sub             r1d, r2d
+    sub             r1d, r2d                ; r1 = 2*stride - 2*width (row-pair advance)
+    shr             r2d, 4                  ; width iterations = width / 16
+    mova            m7, [pw_1023]           ; 10-bit clamp ceiling
+
+    mov             r6d, r3d                ; keep original height for odd-row test
+    shr             r3d, 1                  ; row-pair count
+
+.loopH:
+    mov             r5d, r2d
+.loopW:
+    movu            m2, [r0]                ; row 0
+    movu            m5, [r0 + r4]           ; row 1
+    psrlw           m0, m2, 5               ; band index = pixel >> 5
+    psrlw           m6, m5, 5
+    packuswb        m0, m6
+    vpermq          m0, m0, 11011000b       ; undo packuswb's lane interleave
+    pand            m0, [pb_31]             ; m0 = [index]
+
+    pshufb          m6, m3, m0              ; offsets from the low 16 table entries
+    pshufb          m1, m4, m0              ; offsets from the high 16 table entries
+    pcmpgtb         m0, [pb_15]             ; m0 = [mask]: index >= 16 selects m1
+
+    pblendvb        m6, m6, m1, m0          ; NOTE: don't use 3-parameter style, the x264 macro has a bug!
+
+    pmovsxbw        m0, xm6                 ; offsets for row 0
+    vextracti128    xm6, m6, 1
+    pmovsxbw        m6, xm6                 ; offsets for row 1
+
+    paddw           m2, m0
+    paddw           m5, m6
+    pxor            m1, m1
+    pmaxsw          m2, m1                  ; clamp to [0, 1023]
+    pmaxsw          m5, m1
+    pminsw          m2, m7
+    pminsw          m5, m7
+
+    movu            [r0], m2
+    movu            [r0 + r4], m5
+
+    add             r0, 32
+    dec             r5d
+    jnz             .loopW
+
+    add             r0, r1                  ; advance to the next row pair
+    dec             r3d
+    jnz             .loopH
+
+    test            r6b, 1                  ; odd height? process the last row alone
+    jz              .end
+    xor             r1, r1
+.loopW1:
+    movu            m2, [r0 + r1]
+    psrlw           m0, m2, 5
+    packuswb        m0, m0
+    vpermq          m0, m0, 10001000b
+    pand            m0, [pb_31]             ; m0 = [index]
+
+    pshufb          m6, m3, m0
+    pshufb          m1, m4, m0
+    pcmpgtb         m0, [pb_15]             ; m0 = [mask]
+
+    pblendvb        m6, m6, m1, m0          ; NOTE: don't use 3-parameter style, the x264 macro has a bug!
+    pmovsxbw        m0, xm6                 ; offset
+
+    paddw           m2, m0
+    pxor            m0, m0
+    pmaxsw          m2, m0
+    pminsw          m2, m7
+
+    movu            [r0 + r1], m2
+    add             r1d, 32
+    dec             r2d
+    jnz             .loopW1
+.end:
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgB0, 4, 7, 8
 
 mov r3d,r3m
@@ -1717,6 +1800,7 @@
 jnz .loopW1
 .end
 RET
+%endif
 
 
;
 ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int 
width)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 5 of 6] asm: 10bpp AVX2 code for saoCuOrgE3

2015-06-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1435214505 -19800
#  Thu Jun 25 12:11:45 2015 +0530
# Node ID 85d5582eedd40e4227131bff366235e6dc2b361a
# Parent  8b680fd502e08ec2cab4fff7f5833791bb5bfeef
asm: 10bpp AVX2 code for saoCuOrgE3

SAO_EO_3[0] 236c -> 195c
SAO_EO_3[1] 570c -> 490c

diff -r 8b680fd502e0 -r 85d5582eedd4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 12:00:57 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Jun 25 12:11:45 2015 +0530
@@ -1289,6 +1289,8 @@
 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
 p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2);
 p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
+p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2);
+p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2);
 
 p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
 p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r 8b680fd502e0 -r 85d5582eedd4 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Thu Jun 25 12:00:57 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Thu Jun 25 12:11:45 2015 +0530
@@ -1290,6 +1290,61 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo,
+;                 intptr_t stride, int startX, int endX)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path, 16 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE3, 4,6,6
+    add             r3d, r3d                ; stride in bytes (16-bit pixels)
+    mov             r4d, r4m
+    mov             r5d, r5m
+
+    ; save latest 2 pixels for case startX=1 or left_endX=15
+    movq            xm5, [r0 + r5 * 2]
+    movhps          xm5, [r1 + r5 - 1]
+
+    ; move to startX+1
+    inc             r4d
+    lea             r0, [r0 + r4 * 2]       ; x = startX + 1
+    add             r1, r4
+    sub             r5d, r4d
+    movu            xm4, [r2]               ; offsetEo table
+
+.loop:
+    movu            m1, [r0]                ; rec[x]
+    movu            m0, [r0 + r3]           ; rec[x + stride]
+
+    pcmpgtw         m2, m1, m0
+    pcmpgtw         m0, m1
+    packsswb        m2, m0
+    vpermq          m0, m2, 11011101b
+    vpermq          m2, m2, 10001000b
+    pand            m2, [pb_1]
+    por             m2, m0                  ; m2 = signDown in {-1, 0, 1}
+
+    movu            xm0, [r1]               ; upBuff1
+    paddb           xm0, xm2
+    paddb           xm0, [pb_2]             ; edge-type index
+
+    pshufb          xm3, xm4, xm0
+    pmovsxbw        m3, xm3                 ; offsetEo per pixel
+
+    paddw           m1, m3
+    pxor            m0, m0
+    pmaxsw          m1, m0                  ; clamp to [0, 1023]
+    pminsw          m1, [pw_1023]
+    movu            [r0], m1
+
+    psubb           xm0, xm2                ; -signDown becomes next row's up-sign
+    movu            [r1 - 1], xm0
+
+    add             r0, 32
+    add             r1, 16
+    sub             r5, 16
+    jg              .loop
+
+    ; restore last pixels (up to 2)
+    movq            [r0 + r5 * 2], xm5
+    movhps          [r1 + r5 - 1], xm5
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE3, 3, 6, 8
 mov r3d,  r3m
 mov r4d,  r4m
@@ -1350,8 +1405,76 @@
 movq[r0 + r5], xm7
 movhps  [r1 + r5 - 1], xm7
 RET
+%endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo,
+;                 intptr_t stride, int startX, int endX)
+; HIGH_BIT_DEPTH (10bpp) AVX2 path, 32 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE3_32, 3,6,8
+    add             r3d, r3d                ; stride in bytes (16-bit pixels)
+    mov             r4d, r4m
+    mov             r5d, r5m
+
+    ; save latest 2 pixels for case startX=1 or left_endX=15
+    movq            xm5, [r0 + r5 * 2]
+    movhps          xm5, [r1 + r5 - 1]
+
+    ; move to startX+1
+    inc             r4d
+    lea             r0, [r0 + r4 * 2]       ; x = startX + 1
+    add             r1, r4
+    sub             r5d, r4d
+    vbroadcasti128  m4, [r2]                ; offsetEo table in both lanes
+
+.loop:
+    movu            m1, [r0]                ; rec[x],  low 16 pixels
+    movu            m7, [r0 + 32]           ; rec[x],  high 16 pixels
+    movu            m0, [r0 + r3]           ; rec[x + stride], low
+    movu            m6, [r0 + r3 + 32]      ; rec[x + stride], high
+
+    pcmpgtw         m2, m1, m0
+    pcmpgtw         m3, m7, m6
+    pcmpgtw         m0, m1
+    pcmpgtw         m6, m7
+
+    packsswb        m2, m3
+    packsswb        m0, m6
+    vpermq          m2, m2, 11011000b       ; undo packsswb's lane interleave
+    vpermq          m0, m0, 11011000b
+    pand            m2, [pb_1]
+    por             m2, m0                  ; m2 = signDown in {-1, 0, 1}
+
+    movu            m0, [r1]                ; upBuff1
+    paddb           m0, m2
+    paddb           m0, [pb_2]              ; edge-type index
+
+    pshufb          m3, m4, m0
+    vextracti128    xm6, m3, 1
+    pmovsxbw        m3, xm3                 ; offsetEo, low 16 pixels
+    pmovsxbw        m6, xm6                 ; offsetEo, high 16 pixels
+
+    paddw           m1, m3
+    paddw           m7, m6
+    pxor            m0, m0
+    pmaxsw          m1, m0                  ; clamp to [0, 1023]
+    pmaxsw          m7, m0
+    pminsw          m1, [pw_1023]
+    pminsw          m7, [pw_1023]
+    movu            [r0], m1
+    movu            [r0 + 32], m7
+
+    psubb           m0, m2                  ; -signDown becomes next row's up-sign
+    movu            [r1 - 1], m0
+
+    add             r0, 64
+    add             r1, 32
+    sub             r5, 32
+    jg              .loop
+
+    ; restore last pixels (up to 2)
+    movq            [r0 + r5 * 2], xm5
+    movhps          [r1 + r5 - 1], xm5
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE3_32, 3, 6, 8
 mov r3d,  r3m
 mov r4d,  r4m
@@ -1416,6 +1539,7 @@
 movq[r0 + r5], xm7
 movhps  [r1 + r5 - 1], xm7
 RET
+%endif

Re: [x265] [PATCH 0 of 6 ] SAO SSE4 asm code for HIGH_BIT_DEPTH

2015-06-22 Thread Dnyaneshwar Gorade
Okay. Will check IACA report and try pxor for m0 and buffer 1023.

On Mon, Jun 22, 2015 at 8:24 PM, chen chenm...@163.com wrote:

 right

 some comment:
 'psignb X, [pb_128]' equal to 'psubb X, 0, X', in AVX2, second type
 faster, in SSE4, choice depends on IACA report

 in PMINSW, you buffer ZERO into M0, and use pw_1023 directly, could you
 try buffer  pw_1023 and use PXOR to get ZERO?


 At 2015-06-22 20:50:32,dnyanesh...@multicorewareinc.com wrote:
 SAO_EO_0        8.97x    974.03      8740.81
 SAO_EO_1        10.18x   492.67      5017.42
 SAO_EO_1_2Rows  11.21x   900.82      10095.86
 SAO_EO_2[0]     6.27x    207.22      1298.92
 SAO_EO_2[1]     8.92x    555.20      4949.69
 SAO_EO_3[0]     4.97x    236.72      1177.29
 SAO_EO_3[1]     8.67x    551.14      4778.67
 SAO_BO_0        7.50x    23127.89    173346.66
 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel


 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel


___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 6] asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c -> 974c, over C code

2015-06-22 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1434712676 -19800
#  Fri Jun 19 16:47:56 2015 +0530
# Node ID a94e9a1f0fde08e060a9b52e3353ce2f242d9257
# Parent  83a7d824442455ba5e0a6b53ea68e6b7043845de
asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c -> 974c, over C code

diff -r 83a7d8244424 -r a94e9a1f0fde source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/CMakeLists.txt  Fri Jun 19 16:47:56 2015 +0530
@@ -46,7 +46,7 @@
mc-a2.asm pixel-util8.asm blockcopy8.asm
pixeladd8.asm dct8.asm)
 if(HIGH_BIT_DEPTH)
-set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
+set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm 
loopfilter.asm)
 else()
 set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm 
ipfilter8.asm loopfilter.asm)
 endif()
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 19 16:47:56 2015 +0530
@@ -1089,6 +1089,8 @@
 }
 if (cpuMask & X265_CPU_SSE4)
 {
+p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
+
 LUMA_ADDAVG(sse4);
 CHROMA_420_ADDAVG(sse4);
 CHROMA_422_ADDAVG(sse4);
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Fri Jun 19 16:47:56 2015 +0530
@@ -38,6 +38,7 @@
 cextern pb_128
 cextern pb_2
 cextern pw_2
+cextern pw_1023
 cextern pb_movemask
 
 
@@ -45,6 +46,107 @@
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* 
signLeft, intptr_t stride)
 
;
 INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE0(pixel *rec, int8_t *offsetEo, int lcuWidth,
+;                 int8_t *signLeft, intptr_t stride)
+; HIGH_BIT_DEPTH (10bpp) SSE4 path: first loop filters row 0,
+; second loop (.loopH) filters row 1; 16 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE0, 4,5,9
+    mov             r4d, r4m
+    movh            m6,  [r1]               ; m6 = offsetEo table
+    movzx           r1d, byte [r3]
+    pxor            m5, m5
+    neg             r1b                     ; seed: -signLeft[0]
+    movd            m0, r1d
+    lea             r1, [r0 + r4 * 2]       ; r1 = start of row 1
+    mov             r4d, r2d                ; keep width for the second loop
+
+.loop:
+    movu            m7, [r0]                ; rec[x],  pixels 0-7
+    movu            m8, [r0 + 16]           ; rec[x],  pixels 8-15
+    movu            m2, [r0 + 2]            ; rec[x + 1]
+    movu            m1, [r0 + 18]
+
+    pcmpgtw         m3, m7, m2
+    pcmpgtw         m2, m7
+    pcmpgtw         m4, m8, m1
+    pcmpgtw         m1, m8
+
+    packsswb        m3, m4
+    packsswb        m2, m1
+
+    pand            m3, [pb_1]
+    por             m3, m2                  ; m3 = signRight in {-1, 0, 1}
+
+    palignr         m2, m3, m5, 15
+    por             m2, m0                  ; shift in previous carry sign
+
+    mova            m4, [pw_1023]           ; 10-bit clamp ceiling
+    psignb          m2, [pb_128]            ; m2 = signLeft
+    pxor            m0, m0
+    palignr         m0, m3, 15              ; carry last sign byte to next iteration
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; m3 = uiEdgeType
+    pshufb          m2, m6, m3
+    pmovsxbw        m3, m2                  ; offsetEo, pixels 0-7
+    punpckhbw       m2, m2
+    psraw           m2, 8                   ; offsetEo, pixels 8-15 (sign-extended)
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m5                  ; clamp to [0, 1023]
+    pmaxsw          m8, m5
+    pminsw          m7, m4
+    pminsw          m8, m4
+    movu            [r0], m7
+    movu            [r0 + 16], m8
+
+    add             r0q, 32
+    sub             r2d, 16
+    jnz             .loop
+
+    ; second row: reseed the running sign with -signLeft[1]
+    movzx           r3d, byte [r3 + 1]
+    neg             r3b
+    movd            m0, r3d
+.loopH:
+    movu            m7, [r1]
+    movu            m8, [r1 + 16]
+    movu            m2, [r1 + 2]
+    movu            m1, [r1 + 18]
+
+    pcmpgtw         m3, m7, m2
+    pcmpgtw         m2, m7
+    pcmpgtw         m4, m8, m1
+    pcmpgtw         m1, m8
+
+    packsswb        m3, m4
+    packsswb        m2, m1
+
+    pand            m3, [pb_1]
+    por             m3, m2
+
+    palignr         m2, m3, m5, 15
+    por             m2, m0
+
+    mova            m4, [pw_1023]
+    psignb          m2, [pb_128]            ; m2 = signLeft
+    pxor            m0, m0
+    palignr         m0, m3, 15
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; m3 = uiEdgeType
+    pshufb          m2, m6, m3
+    pmovsxbw        m3, m2                  ; offsetEo
+    punpckhbw       m2, m2
+    psraw           m2, 8
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m5
+    pmaxsw          m8, m5
+    pminsw          m7, m4
+    pminsw          m8, m4
+    movu            [r1], m7
+    movu            [r1 + 16], m8
+
+    add             r1q, 32
+    sub             r4d, 16
+    jnz             .loopH
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
 mov r4d, r4m
@@ -130,6 +232,7 @@
 sub r4d, 16
 jnz.loopH
 RET
+%endif
 
 INIT_YMM avx2
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
diff -r 83a7d8244424 -r a94e9a1f0fde source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Mon Jun 22 15:15:33 2015 +0530
+++ b/source/test/pixelharness.cpp  Fri Jun 19 16:47:56 2015 +0530
@@ -901,8 +901,8 @@
 ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
 ALIGN_VAR_16(pixel, opt_dest

[x265] [PATCH 4 of 6] asm: 10bpp sse4 code for saoCuOrgE2

2015-06-22 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1434963191 -19800
#  Mon Jun 22 14:23:11 2015 +0530
# Node ID f85c15cc0e1d70e63182b03e294c2778f598143d
# Parent  558ffdc4e832061d99f1ec688fe1ae64db48642f
asm: 10bpp sse4 code for saoCuOrgE2

Performance improvement over C:
SAO_EO_2[0]  6.27x  207.22  1298.92
SAO_EO_2[1]  8.92x  555.20  4949.69

diff -r 558ffdc4e832 -r f85c15cc0e1d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Jun 22 18:15:40 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jun 22 14:23:11 2015 +0530
@@ -1092,6 +1092,8 @@
 p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
 p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
 p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
+p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
+p.saoCuOrgE2[1] = PFX(saoCuOrgE2_sse4);
 
 LUMA_ADDAVG(sse4);
 CHROMA_420_ADDAVG(sse4);
diff -r 558ffdc4e832 -r f85c15cc0e1d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm  Mon Jun 22 18:15:40 2015 +0530
+++ b/source/common/x86/loopfilter.asm  Mon Jun 22 14:23:11 2015 +0530
@@ -672,6 +672,64 @@
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * 
offsetEo, int lcuWidth, intptr_t stride)
 
;==
 INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void saoCuOrgE2(pixel *rec, int8_t *bufft, int8_t *buff1,
+;                 int8_t *offsetEo, int lcuWidth, intptr_t stride)
+; HIGH_BIT_DEPTH (10bpp) SSE4 path, 16 pixels per iteration.
+;-----------------------------------------------------------------------------
+cglobal saoCuOrgE2, 6,6,8
+    mov             r4d, r4m
+    add             r5d, r5d                ; stride in bytes (16-bit pixels)
+    pxor            m0, m0
+    inc             r1
+    movh            m6, [r0 + r4 * 2]       ; save trailing pixels; restored after loop
+    movhps          m6, [r1 + r4]
+
+.loop:
+    movu            m7, [r0]                ; rec[x],  pixels 0-7
+    movu            m5, [r0 + 16]           ; rec[x],  pixels 8-15
+    movu            m3, [r0 + r5 + 2]       ; rec[x + stride + 1]
+    movu            m1, [r0 + r5 + 18]
+
+    pcmpgtw         m2, m7, m3
+    pcmpgtw         m3, m7
+    pcmpgtw         m4, m5, m1
+    pcmpgtw         m1, m5
+    packsswb        m2, m4
+    packsswb        m3, m1
+    pand            m2, [pb_1]
+    por             m2, m3                  ; m2 = signDown in {-1, 0, 1}
+
+    movu            m3, [r2]                ; buff1
+
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; edge-type index
+
+    movu            m4, [r3]
+    pshufb          m4, m3                  ; offsetEo bytes per pixel
+
+    psubb           m3, m0, m2              ; -signDown for the next row's buffer
+    movu            [r1], m3
+
+    pmovsxbw        m3, m4                  ; offsets, pixels 0-7
+    punpckhbw       m4, m4
+    psraw           m4, 8                   ; offsets, pixels 8-15 (sign-extended)
+
+    paddw           m7, m3
+    paddw           m5, m4
+    pmaxsw          m7, m0                  ; clamp to [0, 1023]
+    pmaxsw          m5, m0
+    pminsw          m7, [pw_1023]
+    pminsw          m5, [pw_1023]
+    movu            [r0], m7
+    movu            [r0 + 16], m5
+
+    add             r0, 32
+    add             r1, 16
+    add             r2, 16
+    sub             r4, 16
+    jg              .loop
+
+    movh            [r0 + r4 * 2], m6       ; restore saved trailing pixels
+    movhps          [r1 + r4], m6
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
 mov r4d,   r4m
 mov r5d,   r5m
@@ -722,6 +780,7 @@
 movh[r0 + r4], m5
 movhps  [r1 + r4], m5
 RET
+%endif
 
 INIT_YMM avx2
 cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth
diff -r 558ffdc4e832 -r f85c15cc0e1d source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Mon Jun 22 18:15:40 2015 +0530
+++ b/source/test/pixelharness.cpp  Mon Jun 22 14:23:11 2015 +0530
@@ -957,8 +957,8 @@
 ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
 ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-memset(ref_dest, 0xCD, sizeof(ref_dest));
-memset(opt_dest, 0xCD, sizeof(opt_dest));
+for (int i = 0; i  64 * 64; i++)
+ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
 for (int id = 0; id  2; id++)
 {
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


  1   2   3   4   >