[x265] [PATCH] cleanup the temporary function pointer initialization

2013-11-25 Thread praveen
# HG changeset patch
# User Praveen Tiwari
# Date 1385370359 -19800
# Node ID e9c2faf1e31ab1a1318c484493704405996dcfa8
# Parent  10f605bd053009c8c981c7529322fecd1e54af7b
cleanup the temporary function pointer initialization

diff -r 10f605bd0530 -r e9c2faf1e31a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 14:35:59 2013 +0530
@@ -612,48 +612,10 @@
 p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = 
x265_blockcopy_sp_2x8_sse4;
 p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = 
x265_blockcopy_sp_6x8_sse4;
 
-// This function pointer initialization is temporary will be removed
-// later with macro definitions.  It is used to avoid linker errors
-// until all partitions are coded and commit smaller patches, easier to
-// review.
-
-p.chroma[X265_CSP_I420].add_ps[CHROMA_2x8] = 
x265_pixel_add_ps_2x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_2x4] = 
x265_pixel_add_ps_2x4_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_4x2] = 
x265_pixel_add_ps_4x2_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_4x4] = 
x265_pixel_add_ps_4x4_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_4x8] = 
x265_pixel_add_ps_4x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_4x16] = 
x265_pixel_add_ps_4x16_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_6x8] = 
x265_pixel_add_ps_6x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x2] = 
x265_pixel_add_ps_8x2_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x4] = 
x265_pixel_add_ps_8x4_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x6] = 
x265_pixel_add_ps_8x6_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x8] = 
x265_pixel_add_ps_8x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x16] = 
x265_pixel_add_ps_8x16_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_8x32] = 
x265_pixel_add_ps_8x32_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_12x16] = 
x265_pixel_add_ps_12x16_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x4] = 
x265_pixel_add_ps_16x4_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x8] = 
x265_pixel_add_ps_16x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x12] = 
x265_pixel_add_ps_16x12_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x16] = 
x265_pixel_add_ps_16x16_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x32] = 
x265_pixel_add_ps_16x32_sse4;
-p.luma_add_ps[LUMA_16x64] = x265_pixel_add_ps_16x64_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_24x32] = 
x265_pixel_add_ps_24x32_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_32x8] = 
x265_pixel_add_ps_32x8_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_32x16] = 
x265_pixel_add_ps_32x16_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_32x24] = 
x265_pixel_add_ps_32x24_sse4;
-p.chroma[X265_CSP_I420].add_ps[CHROMA_32x32] = 
x265_pixel_add_ps_32x32_sse4;
-p.luma_add_ps[LUMA_32x64] = x265_pixel_add_ps_32x64_sse4;
-
 p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = 
x265_interp_4tap_vert_sp_2x4_sse4;
 p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = 
x265_interp_4tap_vert_sp_2x8_sse4;
 p.chroma[X265_CSP_I420].filter_vsp[CHROMA_6x8] = 
x265_interp_4tap_vert_sp_6x8_sse4;
 
-p.luma_add_ps[LUMA_48x64] = x265_pixel_add_ps_48x64_sse4;
-p.luma_add_ps[LUMA_64x16] = x265_pixel_add_ps_64x16_sse4;
-p.luma_add_ps[LUMA_64x32] = x265_pixel_add_ps_64x32_sse4;
-p.luma_add_ps[LUMA_64x48] = x265_pixel_add_ps_64x48_sse4;
-p.luma_add_ps[LUMA_64x64] = x265_pixel_add_ps_64x64_sse4;
-
 p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
 p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
 p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 0 of 3 ] Adding asm routine , function declaration and function pointer initialization for weight_pp() function.

2013-11-25 Thread nabajit

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 2 of 3] Test bench modifications for weight_pp() asm routine

2013-11-25 Thread nabajit
# HG changeset patch
# User Nabajit Deka
# Date 1385374525 -19800
#  Mon Nov 25 15:45:25 2013 +0530
# Node ID f7422dfb7eef017344b4d974dac641cb00f7f5b7
# Parent  365f90b3b78cd3c91d6f0985b0d467da4a91d95a
Test bench modifications for weight_pp() asm routine.

diff -r 365f90b3b78c -r f7422dfb7eef source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Mon Nov 25 15:31:55 2013 +0530
+++ b/source/test/pixelharness.cpp  Mon Nov 25 15:45:25 2013 +0530
@@ -315,10 +315,10 @@
 memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
 memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
 int j = 0;
-int width = (2 * rand()) % 64;
+int width = 16 * (rand() % 4 + 1);
 int height = 8;
-int w0 = rand() % 256;
-int shift = rand() % 12;
+int w0 = rand() % 128;
+int shift = rand() % 15;
 int round = shift ? (1 << (shift - 1)) : 0;
 int offset = (rand() % 256) - 128;
 for (int i = 0; i < ITERS; i++)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 0 of 3 ] Adding asm routine, function declaration and function pointer initialization for weight_sp() function.

2013-11-25 Thread nabajit

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 3] asm : routine for weight_sp()

2013-11-25 Thread nabajit
# HG changeset patch
# User Nabajit Deka
# Date 1385375693 -19800
#  Mon Nov 25 16:04:53 2013 +0530
# Node ID 4a5ad44661863551a57ab5a2d38f9e91e4297b7c
# Parent  92969306ae85ed2c506d53d709e02f3d98b895f7
asm : routine for weight_sp().

diff -r 92969306ae85 -r 4a5ad4466186 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm  Mon Nov 25 15:46:49 2013 +0530
+++ b/source/common/x86/pixel-util.asm  Mon Nov 25 16:04:53 2013 +0530
@@ -31,6 +31,7 @@
 c_d_1234:   dd 1, 2, 3, 4
 
 tab_c_1:times 8 dw 1
+tab_c_8192: times 8 dw 8192
 
 
 SECTION .text
@@ -751,3 +752,87 @@
 jnz .loopH
 
 RET
+
+;-
+;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t 
dstStride, int width, int height, int w0, int round, int shift, int offset)
+;-
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 6
+%define tmp_r0  r7
+%define tmp_r1  r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 6, 0-(2*4)
+%define tmp_r0  [(rsp + 0 * 4)]
+%define tmp_r1  [(rsp + 1 * 4)]
+%endif ; ARCH_X86_64
+
+    movd        m0, r6m     ; m0 = [w0]
+
+    movd        m1, r7m     ; m1 = [round]
+    punpcklwd   m0, m1
+    pshufd      m0, m0, 0   ; m0 = [w0 round]
+
+    movd        m1, r8m     ; m1 = [shift]
+
+    movd        m2, r9m
+    pshufd      m2, m2, 0   ; m2 =[offset]
+
+    mova        m3, [tab_c_1]
+    mova        m4, [tab_c_8192]
+
+    add         r2d, r2d
+
+.loopH
+    mov         r6d, r4d
+
+    ; save old src and dst
+    mov         tmp_r0, r0
+    mov         tmp_r1, r1
+.loopW:
+    movu        m5, [r0]
+    paddw       m5, m4
+
+    punpcklwd   m6,m5, m3
+    pmaddwd     m6, m0
+    psrad       m6, m1
+    paddd       m6, m2
+
+    punpckhwd   m5, m3
+    pmaddwd     m5, m0
+    psrad       m5, m1
+    paddd       m5, m2
+
+    packssdw    m6, m5
+    packuswb    m6, m6
+
+    sub         r6d, 8
+    jl          .width4
+    movh        [r1], m6
+    je          .nextH
+    add         r0, 16
+    add         r1, 8
+
+    jmp         .loopW
+
+.width4
+    cmp         r6d, -4
+    jl          .width2
+    movd        [r1], m6
+    je          .nextH
+    add         r1, 4
+    pshufd      m6, m6, 1
+
+.width2
+    pextrw      [r1], m6, 0
+
+.nextH
+    mov         r0, tmp_r0
+    mov         r1, tmp_r1
+    lea         r0, [r0 + r2]
+    lea         r1, [r1 + r3]
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH] cleanup: removed unused code in pixel-a.asm

2013-11-25 Thread Deepthi Nandakumar
Does not apply at the tip.


On Mon, Nov 25, 2013 at 11:40 AM, yuva...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Yuvaraj Venkatesh yuva...@multicorewareinc.com
 # Date 1385359751 -19800
 #  Mon Nov 25 11:39:11 2013 +0530
 # Node ID 90a80def0f1aabdf29e1f08dd0f2263d8e6af805
 # Parent  c0c862dc71fbd021efd3922de99da4f2f93e81f4
 cleanup: removed unused code in pixel-a.asm

 diff -r c0c862dc71fb -r 90a80def0f1a source/common/x86/pixel-a.asm
 --- a/source/common/x86/pixel-a.asm Sun Nov 24 17:34:12 2013 +0800
 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 11:39:11 2013 +0530
 @@ -7157,173 +7157,6 @@
  %endif ; !ARCH_X86_64
  %endmacro ; SA8D


 -;=
 -; SA8D_SATD

 -;=
 -
 -; %1: vertical/horizontal mode
 -; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
 -; m10: satd result
 -; m6, m11-15: tmp regs
 -%macro SA8D_SATD_8x4 5
 -%if %1
 -LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
 -HADAMARD   0, sumsub, %2, %3, 6
 -HADAMARD   0, sumsub, %4, %5, 6
 -SBUTTERFLYwd, %2, %3, 6
 -SBUTTERFLYwd, %4, %5, 6
 -HADAMARD2_2D  %2, %4, %3, %5, 6, dq
 -
 -mova   m12, m%2
 -mova   m13, m%3
 -mova   m14, m%4
 -mova   m15, m%5
 -HADAMARD 0, sumsub, %2, %3, 6
 -HADAMARD 0, sumsub, %4, %5, 6
 -SBUTTERFLY qdq, 12, 13, 6
 -HADAMARD   0, amax, 12, 13, 6
 -SBUTTERFLY qdq, 14, 15, 6
 -paddw m10, m12
 -HADAMARD   0, amax, 14, 15, 6
 -paddw m10, m14
 -%else
 -LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
 -HADAMARD4_V %2, %3, %4, %5, 6
 -
 -pabswm12, m%2 ; doing the abs first is a slight advantage
 -pabswm14, m%4
 -pabswm13, m%3
 -pabswm15, m%5
 -HADAMARD 1, max, 12, 14, 6, 11
 -paddwm10, m12
 -HADAMARD 1, max, 13, 15, 6, 11
 -paddwm10, m13
 -%endif
 -%endmacro ; SA8D_SATD_8x4
 -
 -; %1: add spilled regs?
 -; %2: spill regs?
 -%macro SA8D_SATD_ACCUM 2
 -%if HIGH_BIT_DEPTH
 -pmaddwd m10, [pw_1]
 -HADDUWD  m0, m1
 -%if %1
 -paddd   m10, temp1
 -padddm0, temp0
 -%endif
 -%if %2
 -mova  temp1, m10
 -pxorm10, m10
 -%endif
 -%elif %1
 -paddwm0, temp0
 -%endif
 -%if %2
 -mova  temp0, m0
 -%endif
 -%endmacro
 -
 -%macro SA8D_SATD 0
 -%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 -cglobal pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_8x4 vertical, 0, 1, 2, 3
 -SA8D_SATD_8x4 vertical, 4, 5, 8, 9
 -
 -%if vertical ; sse2-style
 -HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
 -HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
 -%else; complete sa8d
 -SUMSUB_BADC w, 0, 4, 1, 5, 12
 -HADAMARD 2, sumsub, 0, 4, 12, 11
 -HADAMARD 2, sumsub, 1, 5, 12, 11
 -SUMSUB_BADC w, 2, 8, 3, 9, 12
 -HADAMARD 2, sumsub, 2, 8, 12, 11
 -HADAMARD 2, sumsub, 3, 9, 12, 11
 -HADAMARD 1, amax, 0, 4, 12, 11
 -HADAMARD 1, amax, 1, 5, 12, 4
 -HADAMARD 1, amax, 2, 8, 12, 4
 -HADAMARD 1, amax, 3, 9, 12, 4
 -%endif
 -
 -; create sa8d sub results
 -paddwm1, m2
 -paddwm0, m3
 -paddwm0, m1
 -
 -SAVE_MM_PERMUTATION
 -ret
 -

 -;---
 -; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )

 -;---
 -cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
 -%define temp0 [rsp+0*mmsize]
 -%define temp1 [rsp+1*mmsize]
 -FIX_STRIDES r1, r3
 -%if vertical==0
 -mova m7, [hmul_8p]
 -%endif
 -lea  r4, [3*r1]
 -lea  r5, [3*r3]
 -pxorm10, m10
 -
 -%if mmsize==32
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 0, 1
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 1, 0
 -vextracti128 xm1, m0, 1
 -vextracti128 xm2, m10, 1
 -paddw   xm0, xm1
 -paddw  xm10, xm2
 -%else
 -lea  r6, [r2+8*SIZEOF_PIXEL]
 -lea  r7, [r0+8*SIZEOF_PIXEL]
 -
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 0, 1
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 1, 1
 -
 -mov  r0, r7
 -mov  r2, r6
 -
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 1, 1
 -call pixel_sa8d_satd_8x8_internal
 -SA8D_SATD_ACCUM 1, 0
 -%endif
 -
 -; xop already has fast horizontal sums
 -%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
 -pmaddwd xm10, [pw_1]
 -HADDUWD xm0, xm1
 -phaddd  xm0, xm10   ;  sa8d1  sa8d2  satd1  satd2
 -pshufd  xm1, xm0, q2301 ;  sa8d2  sa8d1  satd2  satd1
 -paddd   xm0, xm1;   sa8d   sa8d   satd   satd
 -movdr0d, xm0
 -pextrd  eax, xm0, 2
 -%else
 -%if HIGH_BIT_DEPTH
 -HADDD   xm0, xm1
 -HADDD  xm10, xm2
 -%else
 -HADDUW  xm0, xm1

[x265] [PATCH] Test bench modifications for weight_sp() asm routine

2013-11-25 Thread nabajit
# HG changeset patch
# User Nabajit Deka
# Date 1385378388 -19800
#  Mon Nov 25 16:49:48 2013 +0530
# Node ID d2d31d26493438d3b4ee22802bdab085460359a4
# Parent  4a5ad44661863551a57ab5a2d38f9e91e4297b7c
 Test bench modifications for weight_sp() asm routine

diff -r 4a5ad4466186 -r d2d31d264934 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Mon Nov 25 16:04:53 2013 +0530
+++ b/source/test/pixelharness.cpp  Mon Nov 25 16:49:48 2013 +0530
@@ -287,10 +287,10 @@
 memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
 memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
 int j = 0;
-int width = (2 * rand()) % 64;
+int width = 2 * (rand() % 32 + 1);
 int height = 8;
-int w0 = rand() % 256;
-int shift = rand() % 12;
+int w0 = rand() % 128;
+int shift = rand() % 15;
 int round = shift ? (1 << (shift - 1)) : 0;
 int offset = (rand() % 256) - 128;
 for (int i = 0; i < ITERS; i++)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] Test bench: code for pixel_var

2013-11-25 Thread murugan
# HG changeset patch
# User Murugan Vairavel muru...@multicorewareinc.com
# Date 1385385388 -19800
#  Mon Nov 25 18:46:28 2013 +0530
# Node ID 43da6ca15a61e18d033931ca58940d6794f6f8f8
# Parent  10f605bd053009c8c981c7529322fecd1e54af7b
Test bench: code for pixel_var

diff -r 10f605bd0530 -r 43da6ca15a61 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Fri Nov 22 14:59:34 2013 -0600
+++ b/source/test/pixelharness.cpp  Mon Nov 25 18:46:28 2013 +0530
@@ -632,6 +632,23 @@
 return true;
 }
 
+bool PixelHarness::check_pixel_var(var_t ref, var_t opt)
+{
+int j = 0;
+
+for (int i = 0; i < ITERS; i++)
+{
+uint64_t vres = opt(pbuf1, STRIDE);
+uint64_t cres = ref(pbuf1, STRIDE);
+if (vres != cres)
+return false;
+
+j += INCR;
+}
+
+return true;
+}
+
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const 
EncoderPrimitives& opt)
 {
 if (opt.satd[part])
@@ -759,6 +776,16 @@
 return false;
 }
 }
+
+if (opt.var[part])
+{
+if (!check_pixel_var(ref.var[part], opt.var[part]))
+{
+printf("var[%s]: failed!\n", lumaPartStr[part]);
+return false;
+}
+}
+
 for(int i = 0; i < X265_CSP_COUNT; i++)
 {
 if (opt.chroma[i].copy_pp[part])
@@ -1053,6 +1080,12 @@
 REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, 
FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
 }
 
+if (opt.var[part])
+{
+HEADER("var[%s]", lumaPartStr[part]);
+REPORT_SPEEDUP(opt.var[part], ref.var[part], pbuf1, STRIDE);
+}
+
 for (int i = 0; i < X265_CSP_COUNT; i++)
 {
 if (opt.chroma[i].copy_pp[part])
diff -r 10f605bd0530 -r 43da6ca15a61 source/test/pixelharness.h
--- a/source/test/pixelharness.h  Fri Nov 22 14:59:34 2013 -0600
+++ b/source/test/pixelharness.h  Mon Nov 25 18:46:28 2013 +0530
@@ -60,6 +60,7 @@
 bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
 bool check_downscale_t(downscale_t ref, downscale_t opt);
 bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
+bool check_pixel_var(var_t ref, var_t opt);
 
 public:
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: assembly code for pixel_sse_ss_12x16

2013-11-25 Thread yuvaraj
# HG changeset patch
# User Yuvaraj Venkatesh yuva...@multicorewareinc.com
# Date 1385385872 -19800
#  Mon Nov 25 18:54:32 2013 +0530
# Node ID fea660d227b842c411240ff17297ddfbb738b540
# Parent  a69a8392ffeb32d5b136bd315b456b2067cceb29
asm: assembly code for pixel_sse_ss_12x16

diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 18:30:49 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 18:54:32 2013 +0530
@@ -95,6 +95,7 @@
 p.sse_ss[LUMA_8x8]   = x265_pixel_ssd_ss_8x8_ ## cpu; \
 p.sse_ss[LUMA_8x16]   = x265_pixel_ssd_ss_8x16_ ## cpu; \
 p.sse_ss[LUMA_8x32]   = x265_pixel_ssd_ss_8x32_ ## cpu; \
+p.sse_ss[LUMA_12x16]   = x265_pixel_ssd_ss_12x16_ ## cpu; \
 p.sse_ss[LUMA_16x4]   = x265_pixel_ssd_ss_16x4_ ## cpu; \
 p.sse_ss[LUMA_16x8]   = x265_pixel_ssd_ss_16x8_ ## cpu; \
 p.sse_ss[LUMA_16x12]   = x265_pixel_ssd_ss_16x12_ ## cpu; \
diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 18:30:49 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 18:54:32 2013 +0530
@@ -378,12 +378,63 @@
 SSD_SS16, 64
 %endmacro
 
+%macro SSD_SS_12x16 0
+cglobal pixel_ssd_ss_12x16, 4,7,6
+FIX_STRIDES r1, r3
+movr4d, 8
+pxorm0, m0
+.loop
+pmovsxwd  m1, [r0]
+pmovsxwd  m2, [r2]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 8]
+pmovsxwd  m2, [r2 + 8]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 16]
+pmovsxwd  m2, [r2 + 16]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+lea   r0, [r0 + 2*r1]
+lea   r2, [r2 + 2*r3]
+pmovsxwd  m1, [r0]
+pmovsxwd  m2, [r2]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 8]
+pmovsxwd  m2, [r2 + 8]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 16]
+pmovsxwd  m2, [r2 + 16]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+lea   r0, [r0 + 2*r1]
+lea   r2, [r2 + 2*r3]
+dec  r4d
+jnz .loop
+phadddm0, m0
+phadddm0, m0
+movd eax, m0
+RET
+%endmacro
+
 INIT_XMM sse2
 SSD_SS_ONE
+SSD_SS_12x16
 INIT_XMM sse4
 SSD_SS_ONE
+SSD_SS_12x16
 INIT_XMM avx
 SSD_SS_ONE
+SSD_SS_12x16
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH Review only] asm: code for pixel_var_8xN

2013-11-25 Thread murugan
# HG changeset patch
# User Murugan Vairavel muru...@multicorewareinc.com
# Date 1385386658 -19800
#  Mon Nov 25 19:07:38 2013 +0530
# Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
# Parent  43da6ca15a61e18d033931ca58940d6794f6f8f8
asm: code for pixel_var_8xN

diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/pixel.cpp   Mon Nov 25 19:07:38 2013 +0530
@@ -968,8 +968,11 @@
 p.ssim_4x4x2_core = ssim_4x4x2_core;
 p.ssim_end_4 = ssim_end_4;
 
-p.var[LUMA_16x16] = pixel_var<16, 16>;
+p.var[LUMA_8x4] = pixel_var<8, 4>;
 p.var[LUMA_8x8] = pixel_var<8, 8>;
+p.var[LUMA_8x16] = pixel_var<8, 16>;
+p.var[LUMA_8x32] = pixel_var<8, 32>;
+
 p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:07:38 2013 +0530
@@ -412,6 +412,15 @@
 SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
 SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
 
+#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
+p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
+
+#define LUMA_VAR(cpu) \
+SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
+SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
+SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
+SETUP_PIXEL_VAR_DEF(8,  32, cpu);
+
 namespace x265 {
 // private x265 namespace
 
@@ -442,6 +451,8 @@
 PIXEL_AVG(sse2);
 PIXEL_AVG_W4(mmx2);
 
+LUMA_VAR(_sse2);
+
 p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_sse2;
 p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_sse2;
 p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530
@@ -1301,6 +1301,106 @@
 
 %if HIGH_BIT_DEPTH == 0
 %macro VAR 0
+cglobal pixel_var_8x4, 2,3,8
+VAR_START 1
+lea   r2,[r1 * 3]
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+VAR_END 8, 4
+
+cglobal pixel_var_8x8, 2,3,8
+VAR_START 1
+lea   r2,[r1 * 3]
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 8, 8
+
+
+cglobal pixel_var_8x16, 2,4,8
+VAR_START 1
+lea   r2,[r1 * 3]
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 8, 16
+
+cglobal pixel_var_8x32, 2,4,8
+VAR_START 1
+mov   r2d,   2
+lea   r3,[r1 * 3]
+.loop:
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r3]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r3]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r3]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+movh  m0,[r0]
+movh  m3,[r0 + r1]
+movhpsm0,[r0 + r1 * 2]
+movhpsm3,[r0 + r3]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+decr2d
+jnz.loop
+VAR_END 8, 32
+
 cglobal pixel_var_16x16, 2,3,8
 VAR_START 1
 mov  r2d, 8
@@ -1313,38 +1413,6 @@
 dec r2d
 jg .loop
 VAR_END 16, 16
-
-cglobal pixel_var_8x8, 2,4,8
-VAR_START 1
-mov  r2d, 2
-lea   r3, [r1*3]
-.loop:
-movh  

[x265] [PATCH] asm: assembly code for intra_pred_planar[4x4]

2013-11-25 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1385387273 -19800
#  Mon Nov 25 19:17:53 2013 +0530
# Node ID c070e25af31107c7c5a5a6cb5c5e049871c56e22
# Parent  10f605bd053009c8c981c7529322fecd1e54af7b
asm: assembly code for intra_pred_planar[4x4]

diff -r 10f605bd0530 -r c070e25af311 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:17:53 2013 +0530
@@ -663,6 +663,8 @@
 p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4;
 p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4;
 p.intra_pred_dc[BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+
+p.intra_pred_planar[BLOCK_4x4] = x265_intra_pred_planar4_sse4;
 }
 if (cpuMask & X265_CPU_AVX)
 {
diff -r 10f605bd0530 -r c070e25af311 source/common/x86/intrapred.asm
--- a/source/common/x86/intrapred.asm   Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/intrapred.asm   Mon Nov 25 19:17:53 2013 +0530
@@ -26,7 +26,7 @@
 
 SECTION_RODATA 32
 
-
+multi_2Row: dw 1, 2, 3, 4, 1, 2, 3, 4
 
 SECTION .text
 
@@ -362,3 +362,63 @@
 %endrep
 
 RET
+
+INIT_XMM sse4
+cglobal intra_pred_planar4, 4,7,5, above, left, dst, dstStride
+
+pmovzxbwm0, [r0]  ; topRow[i] = above[i];
+punpcklqdq  m0, m0
+
+pxorm1, m1
+movdm2, [r1 + 4]  ; bottomLeft = left[4]
+movzx   r6d, byte   [r0 + 4]  ; topRight   = above[4];
+pshufb  m2, m1
+punpcklbw   m2, m1
+psubw   m2, m0; bottomRow[i] = bottomLeft - 
topRow[i]
+psllw   m0, 2
+punpcklqdq  m3, m2, m1
+psubw   m0, m3
+paddw   m2, m2
+
+%macro COMP_PRED_PLANAR_2ROW 1
+movzx   r4d, byte   [r1 + %1]
+lea r4d,[r4d * 4 + 4]
+movdm3, r4d
+pshuflw m3, m3, 0
+
+movzx   r4d, byte   [r1 + %1 + 1]
+lea r4d,[r4d * 4 + 4]
+movdm4, r4d
+pshuflw m4, m4, 0
+punpcklqdq  m3, m4; horPred
+
+movzx   r4d, byte   [r1 + %1]
+mov r5d,r6d
+sub r5d,r4d
+movdm4, r5d
+pshuflw m4, m4, 0
+
+movzx   r4d, byte   [r1 + %1 + 1]
+mov r5d,r6d
+sub r5d,r4d
+movdm1, r5d
+pshuflw m1, m1, 0
+punpcklqdq  m4, m1; rightColumnN
+
+pmullw  m4, [multi_2Row]
+paddw   m3, m4
+paddw   m0, m2
+paddw   m3, m0
+psraw   m3, 3
+packuswbm3, m3
+
+movd[r2],   m3
+pshufd  m3, m3, 0x55
+movd[r2 + r3],  m3
+lea r2, [r2 + 2 * r3]
+%endmacro
+
+COMP_PRED_PLANAR_2ROW 0
+COMP_PRED_PLANAR_2ROW 2
+
+RET
diff -r 10f605bd0530 -r c070e25af311 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/intrapred.h Mon Nov 25 19:17:53 2013 +0530
@@ -31,4 +31,6 @@
 void x265_intra_pred_dc16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t 
dstStride, int filter);
 void x265_intra_pred_dc32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t 
dstStride, int filter);
 
+void x265_intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, 
intptr_t dstStride);
+
 #endif // ifndef X265_INTRAPRED_H
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: assembly code for pixel_sse_ss_32xN

2013-11-25 Thread yuvaraj
# HG changeset patch
# User Yuvaraj Venkatesh yuva...@multicorewareinc.com
# Date 1385387530 -19800
#  Mon Nov 25 19:22:10 2013 +0530
# Node ID 2ba2e95b57963f8c23412faaf7b73c4671fb8a10
# Parent  fea660d227b842c411240ff17297ddfbb738b540
asm: assembly code for pixel_sse_ss_32xN

diff -r fea660d227b8 -r 2ba2e95b5796 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 18:54:32 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:22:10 2013 +0530
@@ -102,6 +102,11 @@
 p.sse_ss[LUMA_16x16]   = x265_pixel_ssd_ss_16x16_ ## cpu; \
 p.sse_ss[LUMA_16x32]   = x265_pixel_ssd_ss_16x32_ ## cpu; \
 p.sse_ss[LUMA_16x64]   = x265_pixel_ssd_ss_16x64_ ## cpu; \
+p.sse_ss[LUMA_32x8]   = x265_pixel_ssd_ss_32x8_ ## cpu; \
+p.sse_ss[LUMA_32x16]   = x265_pixel_ssd_ss_32x16_ ## cpu; \
+p.sse_ss[LUMA_32x24]   = x265_pixel_ssd_ss_32x24_ ## cpu; \
+p.sse_ss[LUMA_32x32]   = x265_pixel_ssd_ss_32x32_ ## cpu; \
+p.sse_ss[LUMA_32x64]   = x265_pixel_ssd_ss_32x64_ ## cpu;
 
 #define SA8D_INTER_FROM_BLOCK(cpu) \
 p.sa8d_inter[LUMA_4x8]  = x265_pixel_satd_4x8_ ## cpu; \
diff -r fea660d227b8 -r 2ba2e95b5796 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 18:54:32 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:22:10 2013 +0530
@@ -426,15 +426,124 @@
 RET
 %endmacro
 
+%macro SSD_SS_32 1
+cglobal pixel_ssd_ss_32x%1, 4,7,6
+FIX_STRIDES r1, r3
+movr4d, %1/2
+pxorm0, m0
+.loop
+pmovsxwd  m1, [r0]
+pmovsxwd  m2, [r2]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 8]
+pmovsxwd  m2, [r2 + 8]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 16]
+pmovsxwd  m2, [r2 + 16]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 24]
+pmovsxwd  m2, [r2 + 24]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 32]
+pmovsxwd  m2, [r2 + 32]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 40]
+pmovsxwd  m2, [r2 + 40]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 48]
+pmovsxwd  m2, [r2 + 48]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 56]
+pmovsxwd  m2, [r2 + 56]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+lea   r0, [r0 + 2*r1]
+lea   r2, [r2 + 2*r3]
+pmovsxwd  m1, [r0]
+pmovsxwd  m2, [r2]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 8]
+pmovsxwd  m2, [r2 + 8]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 16]
+pmovsxwd  m2, [r2 + 16]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 24]
+pmovsxwd  m2, [r2 + 24]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 32]
+pmovsxwd  m2, [r2 + 32]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 40]
+pmovsxwd  m2, [r2 + 40]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 48]
+pmovsxwd  m2, [r2 + 48]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+pmovsxwd  m1, [r0 + 56]
+pmovsxwd  m2, [r2 + 56]
+psubd m1, m2
+pmulldm1, m1
+paddd m0, m1
+lea   r0, [r0 + 2*r1]
+lea   r2, [r2 + 2*r3]
+dec  r4d
+jnz .loop
+phadddm0, m0
+phadddm0, m0
+movd eax, m0
+RET
+%endmacro
+
+%macro SSD_SS_32xN 0
+SSD_SS_32 8
+SSD_SS_32 16
+SSD_SS_32 24
+SSD_SS_32 32
+SSD_SS_32 64
+%endmacro
+
 INIT_XMM sse2
 SSD_SS_ONE
 SSD_SS_12x16
+SSD_SS_32xN
 INIT_XMM sse4
 SSD_SS_ONE
 SSD_SS_12x16
+SSD_SS_32xN
 INIT_XMM avx
 SSD_SS_ONE
 SSD_SS_12x16
+SSD_SS_32xN
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH Review only] asm: code for pixel_var_16xN

2013-11-25 Thread murugan
# HG changeset patch
# User Murugan Vairavel muru...@multicorewareinc.com
# Date 1385387913 -19800
#  Mon Nov 25 19:28:33 2013 +0530
# Node ID 9e9767a887e3a91c0953b9bfa17c2f34f03ecf11
# Parent  deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
asm: code for pixel_var_16xN

diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/pixel.cpp   Mon Nov 25 19:28:33 2013 +0530
@@ -972,6 +972,12 @@
 p.var[LUMA_8x8] = pixel_var<8, 8>;
 p.var[LUMA_8x16] = pixel_var<8, 16>;
 p.var[LUMA_8x32] = pixel_var<8, 32>;
+p.var[LUMA_16x4] = pixel_var<16, 4>;
+p.var[LUMA_16x8] = pixel_var<16, 8>;
+p.var[LUMA_16x12] = pixel_var<16, 12>;
+p.var[LUMA_16x16] = pixel_var<16, 16>;
+p.var[LUMA_16x32] = pixel_var<16, 32>;
+p.var[LUMA_16x64] = pixel_var<16, 64>;
 
 p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:28:33 2013 +0530
@@ -419,7 +419,13 @@
 SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
 SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
 SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
-SETUP_PIXEL_VAR_DEF(8,  32, cpu);
+SETUP_PIXEL_VAR_DEF(8,  32, cpu); \
+SETUP_PIXEL_VAR_DEF(16,  4, cpu); \
+SETUP_PIXEL_VAR_DEF(16,  8, cpu); \
+SETUP_PIXEL_VAR_DEF(16, 12, cpu); \
+SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
+SETUP_PIXEL_VAR_DEF(16, 32, cpu); \
+SETUP_PIXEL_VAR_DEF(16, 64, cpu);
 
 namespace x265 {
 // private x265 namespace
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530
@@ -1401,18 +1401,201 @@
 jnz .loop
 VAR_END 8, 32
 
+cglobal pixel_var_16x4, 2,3,8
+VAR_START 1
+lea   r2,[r1 * 3]
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 16, 4
+
+cglobal pixel_var_16x8, 2,3,8
+VAR_START 1
+lea   r2,[r1 * 3]
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 16, 8
+
+cglobal pixel_var_16x12, 2,3,8
+VAR_START 1
+lea   r2,[r1 * 3]
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 16, 12
+
 cglobal pixel_var_16x16, 2,3,8
 VAR_START 1
-mov  r2d, 8
+lea   r2,[r1 * 3]
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+lea   r0,[r0 + r1 * 4]
+VAR_CORE
+mova  m0,[r0]
+mova  m3,[r0 + r1]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+mova  m0,[r0 + 2 * r1]
+mova  m3,[r0 + r2]
+DEINTB1, 0, 4, 3, 7
+VAR_CORE
+VAR_END 16, 16
+
+cglobal pixel_var_16x32, 2,4,8
+VAR_START 1
+mov   r2d,   2
+lea   r3,[r1 * 3]
 .loop:
-mova  m0, [r0]
-mova  m3, [r0+r1]
+mova  m0,[r0]
+mova  m3,[r0 + r1]
 DEINTB1, 0, 4, 3, 7
-lea   r0, [r0+r1*2]
 

Re: [x265] [PATCH 1 of 3] asm : routine for weight_pp(), for input width in multiples of 16

2013-11-25 Thread chen
+tab_c_1: times 8 dw 1
There is already a pw_1 constant like this, so I modified it.
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH Review only] asm: code for pixel_var_8xN

2013-11-25 Thread Steve Borho

On Nov 25, 2013, at 7:38 AM, muru...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Murugan Vairavel muru...@multicorewareinc.com
 # Date 1385386658 -19800
 #  Mon Nov 25 19:07:38 2013 +0530
 # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
 # Parent  43da6ca15a61e18d033931ca58940d6794f6f8f8
 asm: code for pixel_var_8xN

I'm not sure the encoder uses any variance block measurements other than 8x8.

 
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
 --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530
 @@ -968,8 +968,11 @@
 p.ssim_4x4x2_core = ssim_4x4x2_core;
 p.ssim_end_4 = ssim_end_4;
 
 -p.var[LUMA_16x16] = pixel_var16, 16;
 +p.var[LUMA_8x4] = pixel_var8, 4;
 p.var[LUMA_8x8] = pixel_var8, 8;
 +p.var[LUMA_8x16] = pixel_var8, 16;
 +p.var[LUMA_8x32] = pixel_var8, 32;
 +
 p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530
 @@ -412,6 +412,15 @@
 SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
 SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
 
 +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
 +p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
 +
 +#define LUMA_VAR(cpu) \
 +SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,  32, cpu);
 +
 namespace x265 {
 // private x265 namespace
 
 @@ -442,6 +451,8 @@
 PIXEL_AVG(sse2);
 PIXEL_AVG_W4(mmx2);
 
 +LUMA_VAR(_sse2);
 +
 p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_sse2;
 p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_sse2;
 p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
 --- a/source/common/x86/pixel-a.asm   Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/x86/pixel-a.asm   Mon Nov 25 19:07:38 2013 +0530
 @@ -1301,6 +1301,106 @@
 
 %if HIGH_BIT_DEPTH == 0
 %macro VAR 0
 +cglobal pixel_var_8x4, 2,3,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +VAR_END 8, 4
 +
 +cglobal pixel_var_8x8, 2,3,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +VAR_CORE
 +VAR_END 8, 8
 +
 +
 +cglobal pixel_var_8x16, 2,4,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +VAR_CORE
 +VAR_END 8, 16
 +
 +cglobal pixel_var_8x32, 2,4,8
 +VAR_START 1
 +mov   r2d,   2
 +lea   r3,[r1 * 3]
 +.loop:
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +dec r2d
 +jnz .loop
 +VAR_END 

Re: [x265] [PATCH Review only] asm: code for pixel_var_8xN

2013-11-25 Thread Steve Borho
I just checked and ratecontrol.cpp uses var for block sizes 8x8 and 16x16.  All 
the other block sizes are unused.

We should probably define only square block sizes for this primitive.

On Nov 25, 2013, at 2:07 PM, Steve Borho st...@borho.org wrote:

 
 On Nov 25, 2013, at 7:38 AM, muru...@multicorewareinc.com wrote:
 
 # HG changeset patch
 # User Murugan Vairavel muru...@multicorewareinc.com
 # Date 1385386658 -19800
 #  Mon Nov 25 19:07:38 2013 +0530
 # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
 # Parent  43da6ca15a61e18d033931ca58940d6794f6f8f8
 asm: code for pixel_var_8xN
 
 I'm not sure the encoder uses any variance block measurements other than 8x8
 
 
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
 --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530
 @@ -968,8 +968,11 @@
p.ssim_4x4x2_core = ssim_4x4x2_core;
p.ssim_end_4 = ssim_end_4;
 
 -p.var[LUMA_16x16] = pixel_var16, 16;
 +p.var[LUMA_8x4] = pixel_var8, 4;
p.var[LUMA_8x8] = pixel_var8, 8;
 +p.var[LUMA_8x16] = pixel_var8, 16;
 +p.var[LUMA_8x32] = pixel_var8, 32;
 +
p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp   Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/x86/asm-primitives.cpp   Mon Nov 25 19:07:38 2013 +0530
 @@ -412,6 +412,15 @@
SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
 
 +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
 +p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
 +
 +#define LUMA_VAR(cpu) \
 +SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
 +SETUP_PIXEL_VAR_DEF(8,  32, cpu);
 +
 namespace x265 {
 // private x265 namespace
 
 @@ -442,6 +451,8 @@
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
 
 +LUMA_VAR(_sse2);
 +
p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_sse2;
p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_sse2;
p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
 --- a/source/common/x86/pixel-a.asm  Mon Nov 25 18:46:28 2013 +0530
 +++ b/source/common/x86/pixel-a.asm  Mon Nov 25 19:07:38 2013 +0530
 @@ -1301,6 +1301,106 @@
 
 %if HIGH_BIT_DEPTH == 0
 %macro VAR 0
 +cglobal pixel_var_8x4, 2,3,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +VAR_END 8, 4
 +
 +cglobal pixel_var_8x8, 2,3,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +VAR_CORE
 +VAR_END 8, 8
 +
 +
 +cglobal pixel_var_8x16, 2,4,8
 +VAR_START 1
 +lea   r2,[r1 * 3]
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r2]
 +DEINTB1, 0, 4, 3, 7
 +VAR_CORE
 +VAR_END 8, 16
 +
 +cglobal pixel_var_8x32, 2,4,8
 +VAR_START 1
 +mov   r2d,   2
 +lea   r3,[r1 * 3]
 +.loop:
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +movh  m0,[r0]
 +movh  m3,[r0 + r1]
 +movhpsm0,[r0 + r1 * 2]
 +movhpsm3,[r0 + r3]
 +DEINTB1, 0, 4, 3, 7
 +lea   r0,[r0 + r1 * 4]
 +VAR_CORE
 +

[x265] [PATCH] asm: removed unused code in pixel_var module

2013-11-25 Thread murugan
# HG changeset patch
# User Murugan Vairavel muru...@multicorewareinc.com
# Date 1385450061 -19800
#  Tue Nov 26 12:44:21 2013 +0530
# Node ID e866b2f9fcd2d4004e968243f18be1fa2a6c87a9
# Parent  9e9767a887e3a91c0953b9bfa17c2f34f03ecf11
asm: removed unused code in pixel_var module

diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Mon Nov 25 19:28:33 2013 +0530
+++ b/source/common/pixel.cpp   Tue Nov 26 12:44:21 2013 +0530
@@ -968,17 +968,8 @@
 p.ssim_4x4x2_core = ssim_4x4x2_core;
 p.ssim_end_4 = ssim_end_4;
 
-p.var[LUMA_8x4] = pixel_var8, 4;
 p.var[LUMA_8x8] = pixel_var8, 8;
-p.var[LUMA_8x16] = pixel_var8, 16;
-p.var[LUMA_8x32] = pixel_var8, 32;
-p.var[LUMA_16x4] = pixel_var16, 4;
-p.var[LUMA_16x8] = pixel_var16, 8;
-p.var[LUMA_16x12] = pixel_var16, 12;
 p.var[LUMA_16x16] = pixel_var16, 16;
-p.var[LUMA_16x32] = pixel_var16, 32;
-p.var[LUMA_16x64] = pixel_var16, 64;
-
 p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/primitives.h
--- a/source/common/primitives.h Mon Nov 25 19:28:33 2013 +0530
+++ b/source/common/primitives.h Tue Nov 26 12:44:21 2013 +0530
@@ -268,7 +268,7 @@
 calcrecon_t calcrecon[NUM_SQUARE_BLOCKS];
 transpose_t transpose[NUM_SQUARE_BLOCKS];
 
-var_t   var[NUM_LUMA_PARTITIONS];
+var_t   var[NUM_SQUARE_BLOCKS];
 ssim_4x4x2_core_t ssim_4x4x2_core;
 ssim_end4_t ssim_end_4;
 
diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:28:33 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Nov 26 12:44:21 2013 +0530
@@ -416,16 +416,8 @@
 p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
 
 #define LUMA_VAR(cpu) \
-SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
 SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
-SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
-SETUP_PIXEL_VAR_DEF(8,  32, cpu); \
-SETUP_PIXEL_VAR_DEF(16,  4, cpu); \
-SETUP_PIXEL_VAR_DEF(16,  8, cpu); \
-SETUP_PIXEL_VAR_DEF(16, 12, cpu); \
-SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
-SETUP_PIXEL_VAR_DEF(16, 32, cpu); \
-SETUP_PIXEL_VAR_DEF(16, 64, cpu);
+SETUP_PIXEL_VAR_DEF(16, 16, cpu);
 
 namespace x265 {
 // private x265 namespace
diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Nov 26 12:44:21 2013 +0530
@@ -1254,12 +1254,6 @@
 VAR_2ROW 8*SIZEOF_PIXEL, 16
 VAR_END 16, 16
 
-cglobal pixel_var_8x16, 2,3
-FIX_STRIDES r1
-VAR_START 0
-VAR_2ROW r1, 8
-VAR_END 8, 16
-
 cglobal pixel_var_8x8, 2,3
 FIX_STRIDES r1
 VAR_START 0
@@ -1301,18 +1295,6 @@
 
 %if HIGH_BIT_DEPTH == 0
 %macro VAR 0
-cglobal pixel_var_8x4, 2,3,8
-VAR_START 1
-lea   r2,[r1 * 3]
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r2]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-VAR_END 8, 4
-
 cglobal pixel_var_8x8, 2,3,8
 VAR_START 1
 lea   r2,[r1 * 3]
@@ -1331,142 +1313,6 @@
 VAR_CORE
 VAR_END 8, 8
 
-
-cglobal pixel_var_8x16, 2,4,8
-VAR_START 1
-lea   r2,[r1 * 3]
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r2]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r2]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r2]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r2]
-DEINTB1, 0, 4, 3, 7
-VAR_CORE
-VAR_END 8, 16
-
-cglobal pixel_var_8x32, 2,4,8
-VAR_START 1
-mov   r2d,   2
-lea   r3,[r1 * 3]
-.loop:
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r3]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r3]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,[r0]
-movh  m3,[r0 + r1]
-movhpsm0,[r0 + r1 * 2]
-movhpsm3,[r0 + r3]
-DEINTB1, 0, 4, 3, 7
-lea   r0,[r0 + r1 * 4]
-VAR_CORE
-movh  m0,

Re: [x265] [PATCH] asm: removed unused code in pixel_var module

2013-11-25 Thread Murugan Vairavel
Please ignore this patch; it needs some modifications in the C code.



On Tue, Nov 26, 2013 at 12:45 PM, muru...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Murugan Vairavel muru...@multicorewareinc.com
 # Date 1385450061 -19800
 #  Tue Nov 26 12:44:21 2013 +0530
 # Node ID e866b2f9fcd2d4004e968243f18be1fa2a6c87a9
 # Parent  9e9767a887e3a91c0953b9bfa17c2f34f03ecf11
 asm: removed unused code in pixel_var module

 diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/pixel.cpp
 --- a/source/common/pixel.cpp   Mon Nov 25 19:28:33 2013 +0530
 +++ b/source/common/pixel.cpp   Tue Nov 26 12:44:21 2013 +0530
 @@ -968,17 +968,8 @@
  p.ssim_4x4x2_core = ssim_4x4x2_core;
  p.ssim_end_4 = ssim_end_4;

 -p.var[LUMA_8x4] = pixel_var8, 4;
  p.var[LUMA_8x8] = pixel_var8, 8;
 -p.var[LUMA_8x16] = pixel_var8, 16;
 -p.var[LUMA_8x32] = pixel_var8, 32;
 -p.var[LUMA_16x4] = pixel_var16, 4;
 -p.var[LUMA_16x8] = pixel_var16, 8;
 -p.var[LUMA_16x12] = pixel_var16, 12;
  p.var[LUMA_16x16] = pixel_var16, 16;
 -p.var[LUMA_16x32] = pixel_var16, 32;
 -p.var[LUMA_16x64] = pixel_var16, 64;
 -
  p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
  }
  }
 diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/primitives.h
 --- a/source/common/primitives.h Mon Nov 25 19:28:33 2013 +0530
 +++ b/source/common/primitives.h Tue Nov 26 12:44:21 2013 +0530
 @@ -268,7 +268,7 @@
  calcrecon_t calcrecon[NUM_SQUARE_BLOCKS];
  transpose_t transpose[NUM_SQUARE_BLOCKS];

 -var_t   var[NUM_LUMA_PARTITIONS];
 +var_t   var[NUM_SQUARE_BLOCKS];
  ssim_4x4x2_core_t ssim_4x4x2_core;
  ssim_end4_t ssim_end_4;

 diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Mon Nov 25 19:28:33 2013 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Tue Nov 26 12:44:21 2013 +0530
 @@ -416,16 +416,8 @@
  p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;

  #define LUMA_VAR(cpu) \
 -SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
  SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
 -SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
 -SETUP_PIXEL_VAR_DEF(8,  32, cpu); \
 -SETUP_PIXEL_VAR_DEF(16,  4, cpu); \
 -SETUP_PIXEL_VAR_DEF(16,  8, cpu); \
 -SETUP_PIXEL_VAR_DEF(16, 12, cpu); \
 -SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
 -SETUP_PIXEL_VAR_DEF(16, 32, cpu); \
 -SETUP_PIXEL_VAR_DEF(16, 64, cpu);
 +SETUP_PIXEL_VAR_DEF(16, 16, cpu);

  namespace x265 {
  // private x265 namespace
 diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/pixel-a.asm
 --- a/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530
 +++ b/source/common/x86/pixel-a.asm Tue Nov 26 12:44:21 2013 +0530
 @@ -1254,12 +1254,6 @@
  VAR_2ROW 8*SIZEOF_PIXEL, 16
  VAR_END 16, 16

 -cglobal pixel_var_8x16, 2,3
 -FIX_STRIDES r1
 -VAR_START 0
 -VAR_2ROW r1, 8
 -VAR_END 8, 16
 -
  cglobal pixel_var_8x8, 2,3
  FIX_STRIDES r1
  VAR_START 0
 @@ -1301,18 +1295,6 @@

  %if HIGH_BIT_DEPTH == 0
  %macro VAR 0
 -cglobal pixel_var_8x4, 2,3,8
 -VAR_START 1
 -lea   r2,[r1 * 3]
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r2]
 -DEINTB1, 0, 4, 3, 7
 -lea   r0,[r0 + r1 * 4]
 -VAR_CORE
 -VAR_END 8, 4
 -
  cglobal pixel_var_8x8, 2,3,8
  VAR_START 1
  lea   r2,[r1 * 3]
 @@ -1331,142 +1313,6 @@
  VAR_CORE
  VAR_END 8, 8

 -
 -cglobal pixel_var_8x16, 2,4,8
 -VAR_START 1
 -lea   r2,[r1 * 3]
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r2]
 -DEINTB1, 0, 4, 3, 7
 -lea   r0,[r0 + r1 * 4]
 -VAR_CORE
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r2]
 -DEINTB1, 0, 4, 3, 7
 -lea   r0,[r0 + r1 * 4]
 -VAR_CORE
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r2]
 -DEINTB1, 0, 4, 3, 7
 -lea   r0,[r0 + r1 * 4]
 -VAR_CORE
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r2]
 -DEINTB1, 0, 4, 3, 7
 -VAR_CORE
 -VAR_END 8, 16
 -
 -cglobal pixel_var_8x32, 2,4,8
 -VAR_START 1
 -mov   r2d,   2
 -lea   r3,[r1 * 3]
 -.loop:
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r3]
 -DEINTB1, 0, 4, 3, 7
 -lea   r0,[r0 + r1 * 4]
 -VAR_CORE
 -movh  m0,[r0]
 -movh  m3,[r0 + r1]
 -movhpsm0,[r0 + r1 * 2]
 -movhpsm3,[r0 + r3]
 -DEINTB1, 0, 4, 3, 7
 -lea