[x265] [PATCH] cleanup the temporary function pointer initialization
# HG changeset patch # User Praveen Tiwari # Date 1385370359 -19800 # Node ID e9c2faf1e31ab1a1318c484493704405996dcfa8 # Parent 10f605bd053009c8c981c7529322fecd1e54af7b cleanup the temporary function pointer initialization diff -r 10f605bd0530 -r e9c2faf1e31a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Nov 22 14:59:34 2013 -0600 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 14:35:59 2013 +0530 @@ -612,48 +612,10 @@ p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4; p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4; -// This function pointer initialization is temporary will be removed -// later with macro definitions. It is used to avoid linker errors -// until all partitions are coded and commit smaller patches, easier to -// review. - -p.chroma[X265_CSP_I420].add_ps[CHROMA_2x8] = x265_pixel_add_ps_2x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_2x4] = x265_pixel_add_ps_2x4_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_4x2] = x265_pixel_add_ps_4x2_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_4x4] = x265_pixel_add_ps_4x4_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_4x8] = x265_pixel_add_ps_4x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_4x16] = x265_pixel_add_ps_4x16_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_6x8] = x265_pixel_add_ps_6x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x2] = x265_pixel_add_ps_8x2_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x4] = x265_pixel_add_ps_8x4_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x6] = x265_pixel_add_ps_8x6_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x8] = x265_pixel_add_ps_8x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x16] = x265_pixel_add_ps_8x16_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_8x32] = x265_pixel_add_ps_8x32_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_12x16] = x265_pixel_add_ps_12x16_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_16x4] = x265_pixel_add_ps_16x4_sse4; 
-p.chroma[X265_CSP_I420].add_ps[CHROMA_16x8] = x265_pixel_add_ps_16x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_16x12] = x265_pixel_add_ps_16x12_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_16x16] = x265_pixel_add_ps_16x16_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_16x32] = x265_pixel_add_ps_16x32_sse4; -p.luma_add_ps[LUMA_16x64] = x265_pixel_add_ps_16x64_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_24x32] = x265_pixel_add_ps_24x32_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_32x8] = x265_pixel_add_ps_32x8_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_32x16] = x265_pixel_add_ps_32x16_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_32x24] = x265_pixel_add_ps_32x24_sse4; -p.chroma[X265_CSP_I420].add_ps[CHROMA_32x32] = x265_pixel_add_ps_32x32_sse4; -p.luma_add_ps[LUMA_32x64] = x265_pixel_add_ps_32x64_sse4; - p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4; p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4; p.chroma[X265_CSP_I420].filter_vsp[CHROMA_6x8] = x265_interp_4tap_vert_sp_6x8_sse4; -p.luma_add_ps[LUMA_48x64] = x265_pixel_add_ps_48x64_sse4; -p.luma_add_ps[LUMA_64x16] = x265_pixel_add_ps_64x16_sse4; -p.luma_add_ps[LUMA_64x32] = x265_pixel_add_ps_64x32_sse4; -p.luma_add_ps[LUMA_64x48] = x265_pixel_add_ps_64x48_sse4; -p.luma_add_ps[LUMA_64x64] = x265_pixel_add_ps_64x64_sse4; - p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4; p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4; p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 0 of 3] Adding asm routine, function declaration and function pointer initialization for weight_pp() function.
___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 3] Test bench modifications for weight_pp() asm routine
# HG changeset patch # User Nabajit Deka # Date 1385374525 -19800 # Mon Nov 25 15:45:25 2013 +0530 # Node ID f7422dfb7eef017344b4d974dac641cb00f7f5b7 # Parent 365f90b3b78cd3c91d6f0985b0d467da4a91d95a Test bench modifications for weight_pp() asm routine. diff -r 365f90b3b78c -r f7422dfb7eef source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Mon Nov 25 15:31:55 2013 +0530 +++ b/source/test/pixelharness.cpp Mon Nov 25 15:45:25 2013 +0530 @@ -315,10 +315,10 @@ memset(ref_dest, 0, 64 * 64 * sizeof(pixel)); memset(opt_dest, 0, 64 * 64 * sizeof(pixel)); int j = 0; -int width = (2 * rand()) % 64; +int width = 16 * (rand() % 4 + 1); int height = 8; -int w0 = rand() % 256; -int shift = rand() % 12; +int w0 = rand() % 128; +int shift = rand() % 15; int round = shift ? (1 << (shift - 1)) : 0; int offset = (rand() % 256) - 128; for (int i = 0; i < ITERS; i++) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 0 of 3] Adding asm routine, function declaration and function pointer initialization for weight_sp() function.
___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 3] asm : routine for weight_sp()
# HG changeset patch # User Nabajit Deka # Date 1385375693 -19800 # Mon Nov 25 16:04:53 2013 +0530 # Node ID 4a5ad44661863551a57ab5a2d38f9e91e4297b7c # Parent 92969306ae85ed2c506d53d709e02f3d98b895f7 asm : routine for weight_sp(). diff -r 92969306ae85 -r 4a5ad4466186 source/common/x86/pixel-util.asm --- a/source/common/x86/pixel-util.asm Mon Nov 25 15:46:49 2013 +0530 +++ b/source/common/x86/pixel-util.asm Mon Nov 25 16:04:53 2013 +0530 @@ -31,6 +31,7 @@ c_d_1234: dd 1, 2, 3, 4 tab_c_1:times 8 dw 1 +tab_c_8192: times 8 dw 8192 SECTION .text @@ -751,3 +752,87 @@ jnz .loopH RET + +;- +;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) +;- +INIT_XMM sse4 +%if ARCH_X86_64 +cglobal weight_sp, 6, 7+2, 6 +%define tmp_r0 r7 +%define tmp_r1 r8 +%else ; ARCH_X86_64 = 0 +cglobal weight_sp, 6, 7, 6, 0-(2*4) +%define tmp_r0 [(rsp + 0 * 4)] +%define tmp_r1 [(rsp + 1 * 4)] +%endif ; ARCH_X86_64 + +movdm0, r6m ; m0 = [w0] + +movdm1, r7m ; m1 = [round] +punpcklwd m0, m1 +pshufd m0, m0, 0 ; m0 = [w0 round] + +movdm1, r8m ; m1 = [shift] + +movdm2, r9m +pshufd m2, m2, 0 ; m2 =[offset] + +movam3, [tab_c_1] +movam4, [tab_c_8192] + +add r2d, r2d + +.loopH +mov r6d, r4d + +; save old src and dst +mov tmp_r0, r0 +mov tmp_r1, r1 +.loopW: +movum5, [r0] +paddw m5, m4 + +punpcklwd m6,m5, m3 +pmaddwd m6, m0 +psrad m6, m1 +paddd m6, m2 + +punpckhwd m5, m3 +pmaddwd m5, m0 +psrad m5, m1 +paddd m5, m2 + +packssdwm6, m5 +packuswbm6, m6 + +sub r6d, 8 +jl .width4 +movh[r1], m6 +je .nextH +add r0, 16 +add r1, 8 + +jmp .loopW + +.width4 +cmp r6d, -4 +jl .width2 +movd[r1], m6 +je .nextH +add r1, 4 +pshufd m6, m6, 1 + +.width2 +pextrw [r1], m6, 0 + +.nextH +mov r0, tmp_r0 +mov r1, tmp_r1 +lea r0, [r0 + r2] +lea r1, [r1 + r3] + +dec r5d +jnz .loopH + +RET ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] cleanup: removed unused code in pixel-a.asm
Does not apply at the tip. On Mon, Nov 25, 2013 at 11:40 AM, yuva...@multicorewareinc.com wrote: # HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1385359751 -19800 # Mon Nov 25 11:39:11 2013 +0530 # Node ID 90a80def0f1aabdf29e1f08dd0f2263d8e6af805 # Parent c0c862dc71fbd021efd3922de99da4f2f93e81f4 cleanup: removed unused code in pixel-a.asm diff -r c0c862dc71fb -r 90a80def0f1a source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Sun Nov 24 17:34:12 2013 +0800 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 11:39:11 2013 +0530 @@ -7157,173 +7157,6 @@ %endif ; !ARCH_X86_64 %endmacro ; SA8D -;= -; SA8D_SATD -;= - -; %1: vertical/horizontal mode -; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) -; m10: satd result -; m6, m11-15: tmp regs -%macro SA8D_SATD_8x4 5 -%if %1 -LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 -HADAMARD 0, sumsub, %2, %3, 6 -HADAMARD 0, sumsub, %4, %5, 6 -SBUTTERFLYwd, %2, %3, 6 -SBUTTERFLYwd, %4, %5, 6 -HADAMARD2_2D %2, %4, %3, %5, 6, dq - -mova m12, m%2 -mova m13, m%3 -mova m14, m%4 -mova m15, m%5 -HADAMARD 0, sumsub, %2, %3, 6 -HADAMARD 0, sumsub, %4, %5, 6 -SBUTTERFLY qdq, 12, 13, 6 -HADAMARD 0, amax, 12, 13, 6 -SBUTTERFLY qdq, 14, 15, 6 -paddw m10, m12 -HADAMARD 0, amax, 14, 15, 6 -paddw m10, m14 -%else -LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 -HADAMARD4_V %2, %3, %4, %5, 6 - -pabswm12, m%2 ; doing the abs first is a slight advantage -pabswm14, m%4 -pabswm13, m%3 -pabswm15, m%5 -HADAMARD 1, max, 12, 14, 6, 11 -paddwm10, m12 -HADAMARD 1, max, 13, 15, 6, 11 -paddwm10, m13 -%endif -%endmacro ; SA8D_SATD_8x4 - -; %1: add spilled regs? -; %2: spill regs? 
-%macro SA8D_SATD_ACCUM 2 -%if HIGH_BIT_DEPTH -pmaddwd m10, [pw_1] -HADDUWD m0, m1 -%if %1 -paddd m10, temp1 -padddm0, temp0 -%endif -%if %2 -mova temp1, m10 -pxorm10, m10 -%endif -%elif %1 -paddwm0, temp0 -%endif -%if %2 -mova temp0, m0 -%endif -%endmacro - -%macro SA8D_SATD 0 -%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) -cglobal pixel_sa8d_satd_8x8_internal -SA8D_SATD_8x4 vertical, 0, 1, 2, 3 -SA8D_SATD_8x4 vertical, 4, 5, 8, 9 - -%if vertical ; sse2-style -HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax -HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax -%else; complete sa8d -SUMSUB_BADC w, 0, 4, 1, 5, 12 -HADAMARD 2, sumsub, 0, 4, 12, 11 -HADAMARD 2, sumsub, 1, 5, 12, 11 -SUMSUB_BADC w, 2, 8, 3, 9, 12 -HADAMARD 2, sumsub, 2, 8, 12, 11 -HADAMARD 2, sumsub, 3, 9, 12, 11 -HADAMARD 1, amax, 0, 4, 12, 11 -HADAMARD 1, amax, 1, 5, 12, 4 -HADAMARD 1, amax, 2, 8, 12, 4 -HADAMARD 1, amax, 3, 9, 12, 4 -%endif - -; create sa8d sub results -paddwm1, m2 -paddwm0, m3 -paddwm0, m1 - -SAVE_MM_PERMUTATION -ret - -;--- -; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) -;--- -cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize -%define temp0 [rsp+0*mmsize] -%define temp1 [rsp+1*mmsize] -FIX_STRIDES r1, r3 -%if vertical==0 -mova m7, [hmul_8p] -%endif -lea r4, [3*r1] -lea r5, [3*r3] -pxorm10, m10 - -%if mmsize==32 -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 0, 1 -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 1, 0 -vextracti128 xm1, m0, 1 -vextracti128 xm2, m10, 1 -paddw xm0, xm1 -paddw xm10, xm2 -%else -lea r6, [r2+8*SIZEOF_PIXEL] -lea r7, [r0+8*SIZEOF_PIXEL] - -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 0, 1 -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 1, 1 - -mov r0, r7 -mov r2, r6 - -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 1, 1 -call pixel_sa8d_satd_8x8_internal -SA8D_SATD_ACCUM 1, 0 -%endif - -; xop already has fast horizontal sums -%if cpuflag(sse4) notcpuflag(xop) HIGH_BIT_DEPTH==0 
-pmaddwd xm10, [pw_1] -HADDUWD xm0, xm1 -phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 -pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 -paddd xm0, xm1; sa8d sa8d satd satd -movdr0d, xm0 -pextrd eax, xm0, 2 -%else -%if HIGH_BIT_DEPTH -HADDD xm0, xm1 -HADDD xm10, xm2 -%else -HADDUW xm0, xm1
[x265] [PATCH] Test bench modifications for weight_sp() asm routine
# HG changeset patch # User Nabajit Deka # Date 1385378388 -19800 # Mon Nov 25 16:49:48 2013 +0530 # Node ID d2d31d26493438d3b4ee22802bdab085460359a4 # Parent 4a5ad44661863551a57ab5a2d38f9e91e4297b7c Test bench modifications for weight_sp() asm routine diff -r 4a5ad4466186 -r d2d31d264934 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Mon Nov 25 16:04:53 2013 +0530 +++ b/source/test/pixelharness.cpp Mon Nov 25 16:49:48 2013 +0530 @@ -287,10 +287,10 @@ memset(ref_dest, 0, 64 * 64 * sizeof(pixel)); memset(opt_dest, 0, 64 * 64 * sizeof(pixel)); int j = 0; -int width = (2 * rand()) % 64; +int width = 2 * (rand() % 32 + 1); int height = 8; -int w0 = rand() % 256; -int shift = rand() % 12; +int w0 = rand() % 128; +int shift = rand() % 15; int round = shift ? (1 << (shift - 1)) : 0; int offset = (rand() % 256) - 128; for (int i = 0; i < ITERS; i++) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Test bench: code for pixel_var
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385385388 -19800 # Mon Nov 25 18:46:28 2013 +0530 # Node ID 43da6ca15a61e18d033931ca58940d6794f6f8f8 # Parent 10f605bd053009c8c981c7529322fecd1e54af7b Test bench: code for pixel_var diff -r 10f605bd0530 -r 43da6ca15a61 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Fri Nov 22 14:59:34 2013 -0600 +++ b/source/test/pixelharness.cpp Mon Nov 25 18:46:28 2013 +0530 @@ -632,6 +632,23 @@ return true; } +bool PixelHarness::check_pixel_var(var_t ref, var_t opt) +{ +int j = 0; + +for (int i = 0; i < ITERS; i++) +{ +uint64_t vres = opt(pbuf1, STRIDE); +uint64_t cres = ref(pbuf1, STRIDE); +if (vres != cres) +return false; + +j += INCR; +} + +return true; +} + bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt) { if (opt.satd[part]) @@ -759,6 +776,16 @@ return false; } } + +if (opt.var[part]) +{ +if (!check_pixel_var(ref.var[part], opt.var[part])) +{ +printf("var[%s]: failed!\n", lumaPartStr[part]); +return false; +} +} + for(int i = 0; i < X265_CSP_COUNT; i++) { if (opt.chroma[i].copy_pp[part]) @@ -1053,6 +1080,12 @@ REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE); } +if (opt.var[part]) +{ +HEADER("var[%s]", lumaPartStr[part]); +REPORT_SPEEDUP(opt.var[part], ref.var[part], pbuf1, STRIDE); +} + for (int i = 0; i < X265_CSP_COUNT; i++) { if (opt.chroma[i].copy_pp[part]) diff -r 10f605bd0530 -r 43da6ca15a61 source/test/pixelharness.h --- a/source/test/pixelharness.h Fri Nov 22 14:59:34 2013 -0600 +++ b/source/test/pixelharness.h Mon Nov 25 18:46:28 2013 +0530 @@ -60,6 +60,7 @@ bool check_weightp(weightp_sp_t ref, weightp_sp_t opt); bool check_downscale_t(downscale_t ref, downscale_t opt); bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt); +bool check_pixel_var(var_t ref, var_t opt); public: ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: assembly code for pixel_sse_ss_12x16
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1385385872 -19800 # Mon Nov 25 18:54:32 2013 +0530 # Node ID fea660d227b842c411240ff17297ddfbb738b540 # Parent a69a8392ffeb32d5b136bd315b456b2067cceb29 asm: assembly code for pixel_sse_ss_12x16 diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:30:49 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 18:54:32 2013 +0530 @@ -95,6 +95,7 @@ p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \ p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \ p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_ ## cpu; \ +p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_ ## cpu; \ p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \ p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \ p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \ diff -r a69a8392ffeb -r fea660d227b8 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:30:49 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 18:54:32 2013 +0530 @@ -378,12 +378,63 @@ SSD_SS16, 64 %endmacro +%macro SSD_SS_12x16 0 +cglobal pixel_ssd_ss_12x16, 4,7,6 +FIX_STRIDES r1, r3 +movr4d, 8 +pxorm0, m0 +.loop +pmovsxwd m1, [r0] +pmovsxwd m2, [r2] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 8] +pmovsxwd m2, [r2 + 8] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 16] +pmovsxwd m2, [r2 + 16] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +lea r0, [r0 + 2*r1] +lea r2, [r2 + 2*r3] +pmovsxwd m1, [r0] +pmovsxwd m2, [r2] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 8] +pmovsxwd m2, [r2 + 8] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 16] +pmovsxwd m2, [r2 + 16] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +lea r0, [r0 + 2*r1] +lea r2, [r2 + 2*r3] +dec r4d +jnz .loop +phadddm0, m0 +phadddm0, m0 +movd eax, m0 +RET +%endmacro + INIT_XMM sse2 SSD_SS_ONE +SSD_SS_12x16 INIT_XMM sse4 
SSD_SS_ONE +SSD_SS_12x16 INIT_XMM avx SSD_SS_ONE +SSD_SS_12x16 %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH Review only] asm: code for pixel_var_8xN
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385386658 -19800 # Mon Nov 25 19:07:38 2013 +0530 # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f # Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8 asm: code for pixel_var_8xN diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530 @@ -968,8 +968,11 @@ p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; -p.var[LUMA_16x16] = pixel_var16, 16; +p.var[LUMA_8x4] = pixel_var8, 4; p.var[LUMA_8x8] = pixel_var8, 8; +p.var[LUMA_8x16] = pixel_var8, 16; +p.var[LUMA_8x32] = pixel_var8, 32; + p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } } diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530 @@ -412,6 +412,15 @@ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu); +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \ +p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; + +#define LUMA_VAR(cpu) \ +SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 32, cpu); + namespace x265 { // private x265 namespace @@ -442,6 +451,8 @@ PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); +LUMA_VAR(_sse2); + p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2; p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2; p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2; diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530 @@ -1301,6 +1301,106 @@ %if HIGH_BIT_DEPTH == 0 %macro VAR 0 +cglobal pixel_var_8x4, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] 
+movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +VAR_END 8, 4 + +cglobal pixel_var_8x8, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 8 + + +cglobal pixel_var_8x16, 2,4,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 16 + +cglobal pixel_var_8x32, 2,4,8 +VAR_START 1 +mov r2d, 2 +lea r3,[r1 * 3] +.loop: +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +decr2d +jnz.loop +VAR_END 8, 32 + cglobal pixel_var_16x16, 2,3,8 VAR_START 1 mov r2d, 8 @@ -1313,38 +1413,6 @@ dec r2d jg .loop VAR_END 16, 16 - -cglobal pixel_var_8x8, 2,4,8 -VAR_START 1 -mov r2d, 2 -lea r3, [r1*3] -.loop: -movh
[x265] [PATCH] asm: assembly code for intra_pred_planar[4x4]
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1385387273 -19800 # Mon Nov 25 19:17:53 2013 +0530 # Node ID c070e25af31107c7c5a5a6cb5c5e049871c56e22 # Parent 10f605bd053009c8c981c7529322fecd1e54af7b asm: assembly code for intra_pred_planar[4x4] diff -r 10f605bd0530 -r c070e25af311 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Nov 22 14:59:34 2013 -0600 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:17:53 2013 +0530 @@ -663,6 +663,8 @@ p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4; p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4; p.intra_pred_dc[BLOCK_32x32] = x265_intra_pred_dc32_sse4; + +p.intra_pred_planar[BLOCK_4x4] = x265_intra_pred_planar4_sse4; } if (cpuMask X265_CPU_AVX) { diff -r 10f605bd0530 -r c070e25af311 source/common/x86/intrapred.asm --- a/source/common/x86/intrapred.asm Fri Nov 22 14:59:34 2013 -0600 +++ b/source/common/x86/intrapred.asm Mon Nov 25 19:17:53 2013 +0530 @@ -26,7 +26,7 @@ SECTION_RODATA 32 - +multi_2Row: dw 1, 2, 3, 4, 1, 2, 3, 4 SECTION .text @@ -362,3 +362,63 @@ %endrep RET + +INIT_XMM sse4 +cglobal intra_pred_planar4, 4,7,5, above, left, dst, dstStride + +pmovzxbwm0, [r0] ; topRow[i] = above[i]; +punpcklqdq m0, m0 + +pxorm1, m1 +movdm2, [r1 + 4] ; bottomLeft = left[4] +movzx r6d, byte [r0 + 4] ; topRight = above[4]; +pshufb m2, m1 +punpcklbw m2, m1 +psubw m2, m0; bottomRow[i] = bottomLeft - topRow[i] +psllw m0, 2 +punpcklqdq m3, m2, m1 +psubw m0, m3 +paddw m2, m2 + +%macro COMP_PRED_PLANAR_2ROW 1 +movzx r4d, byte [r1 + %1] +lea r4d,[r4d * 4 + 4] +movdm3, r4d +pshuflw m3, m3, 0 + +movzx r4d, byte [r1 + %1 + 1] +lea r4d,[r4d * 4 + 4] +movdm4, r4d +pshuflw m4, m4, 0 +punpcklqdq m3, m4; horPred + +movzx r4d, byte [r1 + %1] +mov r5d,r6d +sub r5d,r4d +movdm4, r5d +pshuflw m4, m4, 0 + +movzx r4d, byte [r1 + %1 + 1] +mov r5d,r6d +sub r5d,r4d +movdm1, r5d +pshuflw m1, m1, 0 +punpcklqdq m4, m1; rightColumnN + +pmullw m4, [multi_2Row] +paddw m3, 
m4 +paddw m0, m2 +paddw m3, m0 +psraw m3, 3 +packuswbm3, m3 + +movd[r2], m3 +pshufd m3, m3, 0x55 +movd[r2 + r3], m3 +lea r2, [r2 + 2 * r3] +%endmacro + +COMP_PRED_PLANAR_2ROW 0 +COMP_PRED_PLANAR_2ROW 2 + +RET diff -r 10f605bd0530 -r c070e25af311 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Fri Nov 22 14:59:34 2013 -0600 +++ b/source/common/x86/intrapred.h Mon Nov 25 19:17:53 2013 +0530 @@ -31,4 +31,6 @@ void x265_intra_pred_dc16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter); void x265_intra_pred_dc32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter); +void x265_intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride); + #endif // ifndef X265_INTRAPRED_H ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: assembly code for pixel_sse_ss_32xN
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1385387530 -19800 # Mon Nov 25 19:22:10 2013 +0530 # Node ID 2ba2e95b57963f8c23412faaf7b73c4671fb8a10 # Parent fea660d227b842c411240ff17297ddfbb738b540 asm: assembly code for pixel_sse_ss_32xN diff -r fea660d227b8 -r 2ba2e95b5796 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:54:32 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:22:10 2013 +0530 @@ -102,6 +102,11 @@ p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \ p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \ p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \ +p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_ ## cpu; \ +p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_ ## cpu; \ +p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_ ## cpu; \ +p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_ ## cpu; \ +p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_ ## cpu; #define SA8D_INTER_FROM_BLOCK(cpu) \ p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \ diff -r fea660d227b8 -r 2ba2e95b5796 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:54:32 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:22:10 2013 +0530 @@ -426,15 +426,124 @@ RET %endmacro +%macro SSD_SS_32 1 +cglobal pixel_ssd_ss_32x%1, 4,7,6 +FIX_STRIDES r1, r3 +movr4d, %1/2 +pxorm0, m0 +.loop +pmovsxwd m1, [r0] +pmovsxwd m2, [r2] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 8] +pmovsxwd m2, [r2 + 8] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 16] +pmovsxwd m2, [r2 + 16] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 24] +pmovsxwd m2, [r2 + 24] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 32] +pmovsxwd m2, [r2 + 32] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 40] +pmovsxwd m2, [r2 + 40] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 48] +pmovsxwd m2, [r2 + 48] 
+psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 56] +pmovsxwd m2, [r2 + 56] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +lea r0, [r0 + 2*r1] +lea r2, [r2 + 2*r3] +pmovsxwd m1, [r0] +pmovsxwd m2, [r2] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 8] +pmovsxwd m2, [r2 + 8] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 16] +pmovsxwd m2, [r2 + 16] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 24] +pmovsxwd m2, [r2 + 24] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 32] +pmovsxwd m2, [r2 + 32] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 40] +pmovsxwd m2, [r2 + 40] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 48] +pmovsxwd m2, [r2 + 48] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +pmovsxwd m1, [r0 + 56] +pmovsxwd m2, [r2 + 56] +psubd m1, m2 +pmulldm1, m1 +paddd m0, m1 +lea r0, [r0 + 2*r1] +lea r2, [r2 + 2*r3] +dec r4d +jnz .loop +phadddm0, m0 +phadddm0, m0 +movd eax, m0 +RET +%endmacro + +%macro SSD_SS_32xN 0 +SSD_SS_32 8 +SSD_SS_32 16 +SSD_SS_32 24 +SSD_SS_32 32 +SSD_SS_32 64 +%endmacro + INIT_XMM sse2 SSD_SS_ONE SSD_SS_12x16 +SSD_SS_32xN INIT_XMM sse4 SSD_SS_ONE SSD_SS_12x16 +SSD_SS_32xN INIT_XMM avx SSD_SS_ONE SSD_SS_12x16 +SSD_SS_32xN %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH Review only] asm: code for pixel_var_16xN
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385387913 -19800 # Mon Nov 25 19:28:33 2013 +0530 # Node ID 9e9767a887e3a91c0953b9bfa17c2f34f03ecf11 # Parent deb2fc2dcaf24a86132ebfe0fbaac4859611c92f asm: code for pixel_var_16xN diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530 +++ b/source/common/pixel.cpp Mon Nov 25 19:28:33 2013 +0530 @@ -972,6 +972,12 @@ p.var[LUMA_8x8] = pixel_var8, 8; p.var[LUMA_8x16] = pixel_var8, 16; p.var[LUMA_8x32] = pixel_var8, 32; +p.var[LUMA_16x4] = pixel_var16, 4; +p.var[LUMA_16x8] = pixel_var16, 8; +p.var[LUMA_16x12] = pixel_var16, 12; +p.var[LUMA_16x16] = pixel_var16, 16; +p.var[LUMA_16x32] = pixel_var16, 32; +p.var[LUMA_16x64] = pixel_var16, 64; p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:28:33 2013 +0530 @@ -419,7 +419,13 @@ SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ -SETUP_PIXEL_VAR_DEF(8, 32, cpu); +SETUP_PIXEL_VAR_DEF(8, 32, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 4, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 8, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 12, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 16, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 32, cpu); \ +SETUP_PIXEL_VAR_DEF(16, 64, cpu); namespace x265 { // private x265 namespace diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530 @@ -1401,18 +1401,201 @@ jnz.loop VAR_END 8, 32 +cglobal pixel_var_16x4, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 16, 4 + 
+cglobal pixel_var_16x8, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 16, 8 + +cglobal pixel_var_16x12, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 16, 12 + cglobal pixel_var_16x16, 2,3,8 VAR_START 1 -mov r2d, 8 +lea r2,[r1 * 3] +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +mova m0,[r0] +mova m3,[r0 + r1] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +mova m0,[r0 + 2 * r1] +mova m3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 16, 16 + +cglobal pixel_var_16x32, 2,4,8 +VAR_START 1 +mov r2d, 2 +lea r3,[r1 * 3] .loop: -mova m0, [r0] -mova m3, [r0+r1] +mova m0,[r0] +mova m3,[r0 + r1] DEINTB1, 0, 4, 3, 7 -lea r0, [r0+r1*2]
Re: [x265] [PATCH 1 of 3] asm : routine for weight_pp(), for input width in multiples of 16
+tab_c_1:times 8 dw 1 There is already a pw_1 constant like this; I have modified the patch accordingly. ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH Review only] asm: code for pixel_var_8xN
On Nov 25, 2013, at 7:38 AM, muru...@multicorewareinc.com wrote: # HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385386658 -19800 # Mon Nov 25 19:07:38 2013 +0530 # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f # Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8 asm: code for pixel_var_8xN I'm not sure the encoder uses any variance block measurements other than 8x8 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530 @@ -968,8 +968,11 @@ p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; -p.var[LUMA_16x16] = pixel_var16, 16; +p.var[LUMA_8x4] = pixel_var8, 4; p.var[LUMA_8x8] = pixel_var8, 8; +p.var[LUMA_8x16] = pixel_var8, 16; +p.var[LUMA_8x32] = pixel_var8, 32; + p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } } diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cppMon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/asm-primitives.cppMon Nov 25 19:07:38 2013 +0530 @@ -412,6 +412,15 @@ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu); +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \ +p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; + +#define LUMA_VAR(cpu) \ +SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 32, cpu); + namespace x265 { // private x265 namespace @@ -442,6 +451,8 @@ PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); +LUMA_VAR(_sse2); + p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2; p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2; p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2; diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530 @@ -1301,6 
+1301,106 @@ %if HIGH_BIT_DEPTH == 0 %macro VAR 0 +cglobal pixel_var_8x4, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +VAR_END 8, 4 + +cglobal pixel_var_8x8, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 8 + + +cglobal pixel_var_8x16, 2,4,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 16 + +cglobal pixel_var_8x32, 2,4,8 +VAR_START 1 +mov r2d, 2 +lea r3,[r1 * 3] +.loop: +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +decr2d +jnz.loop +VAR_END
Re: [x265] [PATCH Review only] asm: code for pixel_var_8xN
I just checked and ratecontrol.cpp uses var for block sizes 8x8 and 16x16. All the other block sizes are unused. We should probably define only square block sizes for this primitive. On Nov 25, 2013, at 2:07 PM, Steve Borho st...@borho.org wrote: On Nov 25, 2013, at 7:38 AM, muru...@multicorewareinc.com wrote: # HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385386658 -19800 # Mon Nov 25 19:07:38 2013 +0530 # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f # Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8 asm: code for pixel_var_8xN I'm not sure the encoder uses any variance block measurements other than 8x8 diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp --- a/source/common/pixel.cppMon Nov 25 18:46:28 2013 +0530 +++ b/source/common/pixel.cppMon Nov 25 19:07:38 2013 +0530 @@ -968,8 +968,11 @@ p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; -p.var[LUMA_16x16] = pixel_var16, 16; +p.var[LUMA_8x4] = pixel_var8, 4; p.var[LUMA_8x8] = pixel_var8, 8; +p.var[LUMA_8x16] = pixel_var8, 16; +p.var[LUMA_8x32] = pixel_var8, 32; + p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } } diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530 @@ -412,6 +412,15 @@ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu); +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \ +p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; + +#define LUMA_VAR(cpu) \ +SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ +SETUP_PIXEL_VAR_DEF(8, 32, cpu); + namespace x265 { // private x265 namespace @@ -442,6 +451,8 @@ PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); +LUMA_VAR(_sse2); + p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2; p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2; 
p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2; diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530 +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530 @@ -1301,6 +1301,106 @@ %if HIGH_BIT_DEPTH == 0 %macro VAR 0 +cglobal pixel_var_8x4, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +VAR_END 8, 4 + +cglobal pixel_var_8x8, 2,3,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 8 + + +cglobal pixel_var_8x16, 2,4,8 +VAR_START 1 +lea r2,[r1 * 3] +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r2] +DEINTB1, 0, 4, 3, 7 +VAR_CORE +VAR_END 8, 16 + +cglobal pixel_var_8x32, 2,4,8 +VAR_START 1 +mov r2d, 2 +lea r3,[r1 * 3] +.loop: +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +movh m0,[r0] +movh m3,[r0 + r1] +movhpsm0,[r0 + r1 * 2] +movhpsm3,[r0 + r3] +DEINTB1, 0, 4, 3, 7 +lea r0,[r0 + r1 * 4] +VAR_CORE +
[x265] [PATCH] asm: removed unused code in pixel_var module
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385450061 -19800 # Tue Nov 26 12:44:21 2013 +0530 # Node ID e866b2f9fcd2d4004e968243f18be1fa2a6c87a9 # Parent 9e9767a887e3a91c0953b9bfa17c2f34f03ecf11 asm: removed unused code in pixel_var module diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/pixel.cpp Tue Nov 26 12:44:21 2013 +0530 @@ -968,17 +968,8 @@ p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; -p.var[LUMA_8x4] = pixel_var8, 4; p.var[LUMA_8x8] = pixel_var8, 8; -p.var[LUMA_8x16] = pixel_var8, 16; -p.var[LUMA_8x32] = pixel_var8, 32; -p.var[LUMA_16x4] = pixel_var16, 4; -p.var[LUMA_16x8] = pixel_var16, 8; -p.var[LUMA_16x12] = pixel_var16, 12; p.var[LUMA_16x16] = pixel_var16, 16; -p.var[LUMA_16x32] = pixel_var16, 32; -p.var[LUMA_16x64] = pixel_var16, 64; - p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } } diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/primitives.h --- a/source/common/primitives.hMon Nov 25 19:28:33 2013 +0530 +++ b/source/common/primitives.hTue Nov 26 12:44:21 2013 +0530 @@ -268,7 +268,7 @@ calcrecon_t calcrecon[NUM_SQUARE_BLOCKS]; transpose_t transpose[NUM_SQUARE_BLOCKS]; -var_t var[NUM_LUMA_PARTITIONS]; +var_t var[NUM_SQUARE_BLOCKS]; ssim_4x4x2_core_t ssim_4x4x2_core; ssim_end4_t ssim_end_4; diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 26 12:44:21 2013 +0530 @@ -416,16 +416,8 @@ p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; #define LUMA_VAR(cpu) \ -SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ -SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ -SETUP_PIXEL_VAR_DEF(8, 32, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 4, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 8, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 12, cpu); \ 
-SETUP_PIXEL_VAR_DEF(16, 16, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 32, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 64, cpu); +SETUP_PIXEL_VAR_DEF(16, 16, cpu); namespace x265 { // private x265 namespace diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/x86/pixel-a.asm Tue Nov 26 12:44:21 2013 +0530 @@ -1254,12 +1254,6 @@ VAR_2ROW 8*SIZEOF_PIXEL, 16 VAR_END 16, 16 -cglobal pixel_var_8x16, 2,3 -FIX_STRIDES r1 -VAR_START 0 -VAR_2ROW r1, 8 -VAR_END 8, 16 - cglobal pixel_var_8x8, 2,3 FIX_STRIDES r1 VAR_START 0 @@ -1301,18 +1295,6 @@ %if HIGH_BIT_DEPTH == 0 %macro VAR 0 -cglobal pixel_var_8x4, 2,3,8 -VAR_START 1 -lea r2,[r1 * 3] -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -VAR_END 8, 4 - cglobal pixel_var_8x8, 2,3,8 VAR_START 1 lea r2,[r1 * 3] @@ -1331,142 +1313,6 @@ VAR_CORE VAR_END 8, 8 - -cglobal pixel_var_8x16, 2,4,8 -VAR_START 1 -lea r2,[r1 * 3] -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -VAR_CORE -VAR_END 8, 16 - -cglobal pixel_var_8x32, 2,4,8 -VAR_START 1 -mov r2d, 2 -lea r3,[r1 * 3] -.loop: -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r3] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r3] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r3] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + 
r1 * 4] -VAR_CORE -movh m0,
Re: [x265] [PATCH] asm: removed unused code in pixel_var module
Please ignore this patch; it needs some modifications in the C code. On Tue, Nov 26, 2013 at 12:45 PM, muru...@multicorewareinc.com wrote: # HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1385450061 -19800 # Tue Nov 26 12:44:21 2013 +0530 # Node ID e866b2f9fcd2d4004e968243f18be1fa2a6c87a9 # Parent 9e9767a887e3a91c0953b9bfa17c2f34f03ecf11 asm: removed unused code in pixel_var module diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/pixel.cpp Tue Nov 26 12:44:21 2013 +0530 @@ -968,17 +968,8 @@ p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; -p.var[LUMA_8x4] = pixel_var8, 4; p.var[LUMA_8x8] = pixel_var8, 8; -p.var[LUMA_8x16] = pixel_var8, 16; -p.var[LUMA_8x32] = pixel_var8, 32; -p.var[LUMA_16x4] = pixel_var16, 4; -p.var[LUMA_16x8] = pixel_var16, 8; -p.var[LUMA_16x12] = pixel_var16, 12; p.var[LUMA_16x16] = pixel_var16, 16; -p.var[LUMA_16x32] = pixel_var16, 32; -p.var[LUMA_16x64] = pixel_var16, 64; - p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; } } diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/primitives.h --- a/source/common/primitives.h Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/primitives.h Tue Nov 26 12:44:21 2013 +0530 @@ -268,7 +268,7 @@ calcrecon_t calcrecon[NUM_SQUARE_BLOCKS]; transpose_t transpose[NUM_SQUARE_BLOCKS]; -var_t var[NUM_LUMA_PARTITIONS]; +var_t var[NUM_SQUARE_BLOCKS]; ssim_4x4x2_core_t ssim_4x4x2_core; ssim_end4_t ssim_end_4; diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 26 12:44:21 2013 +0530 @@ -416,16 +416,8 @@ p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; #define LUMA_VAR(cpu) \ -SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ -SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ -SETUP_PIXEL_VAR_DEF(8, 32, cpu); \ 
-SETUP_PIXEL_VAR_DEF(16, 4, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 8, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 12, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 16, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 32, cpu); \ -SETUP_PIXEL_VAR_DEF(16, 64, cpu); +SETUP_PIXEL_VAR_DEF(16, 16, cpu); namespace x265 { // private x265 namespace diff -r 9e9767a887e3 -r e866b2f9fcd2 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530 +++ b/source/common/x86/pixel-a.asm Tue Nov 26 12:44:21 2013 +0530 @@ -1254,12 +1254,6 @@ VAR_2ROW 8*SIZEOF_PIXEL, 16 VAR_END 16, 16 -cglobal pixel_var_8x16, 2,3 -FIX_STRIDES r1 -VAR_START 0 -VAR_2ROW r1, 8 -VAR_END 8, 16 - cglobal pixel_var_8x8, 2,3 FIX_STRIDES r1 VAR_START 0 @@ -1301,18 +1295,6 @@ %if HIGH_BIT_DEPTH == 0 %macro VAR 0 -cglobal pixel_var_8x4, 2,3,8 -VAR_START 1 -lea r2,[r1 * 3] -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -VAR_END 8, 4 - cglobal pixel_var_8x8, 2,3,8 VAR_START 1 lea r2,[r1 * 3] @@ -1331,142 +1313,6 @@ VAR_CORE VAR_END 8, 8 - -cglobal pixel_var_8x16, 2,4,8 -VAR_START 1 -lea r2,[r1 * 3] -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r2] -DEINTB1, 0, 4, 3, 7 -VAR_CORE -VAR_END 8, 16 - -cglobal pixel_var_8x32, 2,4,8 -VAR_START 1 -mov r2d, 2 -lea r3,[r1 * 3] -.loop: -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r3] -DEINTB1, 0, 4, 3, 7 -lea r0,[r0 + r1 * 4] -VAR_CORE -movh m0,[r0] -movh m3,[r0 + r1] -movhpsm0,[r0 + r1 * 2] -movhpsm3,[r0 + r3] -DEINTB1, 0, 4, 3, 7 -lea