[x265] [PATCH] asm code for all_angs_pred_4x4, all modes
# HG changeset patch # User Praveen Tiwari # Date 1386062469 -19800 # Node ID d18c574e0ce928adcbeb2438b9d291058bffb928 # Parent ca7bd538e052d104b1b333691836db37739cfdf0 asm code for all_angs_pred_4x4, all modes diff -r ca7bd538e052 -r d18c574e0ce9 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Mon Dec 02 20:26:19 2013 -0600 +++ b/source/common/CMakeLists.txt Tue Dec 03 14:51:09 2013 +0530 @@ -118,10 +118,10 @@ endif(ENABLE_PRIMITIVES_VEC) if(ENABLE_PRIMITIVES_ASM) -set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h) +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h allangs-pred.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm ssd-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm intrapred8.asm - pixeladd8.asm dct8.asm) + pixeladd8.asm dct8.asm allangs-pred8.asm) if (NOT X64) set(A_SRCS ${A_SRCS} pixel-32.asm) endif() diff -r ca7bd538e052 -r d18c574e0ce9 source/common/x86/allangs-pred.h --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/allangs-pred.h Tue Dec 03 14:51:09 2013 +0530 @@ -0,0 +1,31 @@ +/* + * allangspred.h: Intra Prediction metrics + * + * Copyright (C) 2003-2013 x264 project + * + * Authors: Praveen Kumar Tiwariprav...@multicorewareinc.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licens...@x264.com. + */ + +#ifndef X265_ALLANGSPRED_H +#define X265_ALLANGSPRED_H + +void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma); + +#endif diff -r ca7bd538e052 -r d18c574e0ce9 source/common/x86/allangs-pred8.asm --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/allangs-pred8.asm Tue Dec 03 14:51:09 2013 +0530 @@ -0,0 +1,920 @@ +;* +;* Copyright (C) 2013 x265 project +;* +;* Authors: Praveen Kumar Tiwari prav...@multicorewareinc.com +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licens...@multicorewareinc.com. 
+;*/ + +%include x86inc.asm + +SECTION_RODATA 32 + +tab_6_26: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 0, 0 +tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 +tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 +tab_24_8: db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 +tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 +tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 +tab_1_31: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 +tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 +tab_30_2: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 +tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 +tab_28_4: db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 +tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 +tab_25_7: db 25,
[x265] [PATCH] asm: pixel_satd - 12x16, 24x32, 48x64 for 16bpp
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1386063278 -19800 # Tue Dec 03 15:04:38 2013 +0530 # Node ID 70be1456ef76e3289d91842e0de59cfa0bf06817 # Parent 21adddaee4606b718fe96f4bb2f5aebcbdf80c2a asm: pixel_satd - 12x16, 24x32, 48x64 for 16bpp diff -r 21adddaee460 -r 70be1456ef76 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 11:53:32 2013 +0800 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 15:04:38 2013 +0530 @@ -497,6 +497,9 @@ p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2; p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2; p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2; +p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2; +p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2; +p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2; p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; diff -r 21adddaee460 -r 70be1456ef76 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 03 11:53:32 2013 +0800 +++ b/source/common/x86/pixel-a.asm Tue Dec 03 15:04:38 2013 +0530 @@ -1502,48 +1502,48 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 -lea r0, [r6 + 8] -lea r2, [r7 + 8] -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -lea r0, [r6 + 16] -lea r2, [r7 + 16] -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -lea r0, [r6 + 24] -lea r2, [r7 + 24] -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call 
pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -lea r0, [r6 + 32] -lea r2, [r7 + 32] -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -lea r0, [r6 + 40] -lea r2, [r7 + 40] +lea r0, [r6 + 8*SIZEOF_PIXEL] +lea r2, [r7 + 8*SIZEOF_PIXEL] +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +lea r0, [r6 + 16*SIZEOF_PIXEL] +lea r2, [r7 + 16*SIZEOF_PIXEL] +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +lea r0, [r6 + 24*SIZEOF_PIXEL] +lea r2, [r7 + 24*SIZEOF_PIXEL] +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +lea r0, [r6 + 32*SIZEOF_PIXEL] +lea r2, [r7 + 32*SIZEOF_PIXEL] +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +lea r0, [r6 + 40*SIZEOF_PIXEL] +lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 @@ -1572,53 +1572,53 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 -lea r0, [r6 + 8] +lea r0, [r6 + 
8*SIZEOF_PIXEL] mov r2, [rsp] -add r2,8 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -call pixel_satd_8x8_internal2 -lea r0, [r6 + 16] +add r2,8*SIZEOF_PIXEL +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call pixel_satd_8x8_internal2 +call
[x265] [PATCH] asm: pixel_satd_64xN for 16bpp
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1386063805 -19800 # Tue Dec 03 15:13:25 2013 +0530 # Node ID a616349e2a19c18369a9cf4524202fa6ebe5b6be # Parent 70be1456ef76e3289d91842e0de59cfa0bf06817 asm: pixel_satd_64xN for 16bpp diff -r 70be1456ef76 -r a616349e2a19 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 15:04:38 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 15:13:25 2013 +0530 @@ -500,6 +500,10 @@ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2; p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2; p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2; +p.satd[LUMA_64x16] = x265_pixel_satd_64x16_sse2; +p.satd[LUMA_64x32] = x265_pixel_satd_64x32_sse2; +p.satd[LUMA_64x48] = x265_pixel_satd_64x48_sse2; +p.satd[LUMA_64x64] = x265_pixel_satd_64x64_sse2; p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; diff -r 70be1456ef76 -r a616349e2a19 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 03 15:04:38 2013 +0530 +++ b/source/common/x86/pixel-a.asm Tue Dec 03 15:13:25 2013 +0530 @@ -1644,35 +1644,42 @@ mov r7, r2 call pixel_satd_8x8_internal call pixel_satd_8x8_internal -lea r0, [r6 + 8] -lea r2, [r7 + 8] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 16] -lea r2, [r7 + 16] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 24] -lea r2, [r7 + 24] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 32] -lea r2, [r7 + 32] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 40] -lea r2, [r7 + 40] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 48] -lea r2, [r7 + 48] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 56] -lea r2, [r7 + 56] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -SATD_END_SSE2 m6 +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 
8*SIZEOF_PIXEL] +lea r2, [r7 + 8*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 16*SIZEOF_PIXEL] +lea r2, [r7 + 16*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 24*SIZEOF_PIXEL] +lea r2, [r7 + 24*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 32*SIZEOF_PIXEL] +lea r2, [r7 + 32*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 40*SIZEOF_PIXEL] +lea r2, [r7 + 40*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 48*SIZEOF_PIXEL] +lea r2, [r7 + 48*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 56*SIZEOF_PIXEL] +lea r2, [r7 + 56*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_END_SSE2 m6, m7 %else cglobal pixel_satd_64x16, 4,7,8,0-4;if !WIN64 SATD_START_SSE2 m6, m7 @@ -1680,42 +1687,52 @@ mov [rsp], r2 call pixel_satd_8x8_internal call pixel_satd_8x8_internal -lea r0, [r6 + 8] +%if HIGH_BIT_DEPTH +pxor m7, m7 +%endif +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] -add r2,8 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 16] +add r2,8*SIZEOF_PIXEL +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] -add r2,16 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 24] +add r2,16*SIZEOF_PIXEL +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] -add r2,24 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 32] +add r2,24*SIZEOF_PIXEL +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, 
[r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] -add r2,32 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 40] +add r2,32*SIZEOF_PIXEL +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +SATD_ACCUM m6, m0, m7 +lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] -add r2,40 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 48] +add
[x265] [PATCH] asm: pixel_satd_32xN for 16bpp
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1386064572 -19800 # Tue Dec 03 15:26:12 2013 +0530 # Node ID 31c21157620ce37d18c8d11132caf8c79a7e449a # Parent a616349e2a19c18369a9cf4524202fa6ebe5b6be asm: pixel_satd_32xN for 16bpp diff -r a616349e2a19 -r 31c21157620c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 15:13:25 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 15:26:12 2013 +0530 @@ -491,6 +491,8 @@ if (cpuMask X265_CPU_SSE2) { INIT6(satd, _sse2); +HEVC_SATD(sse2); +p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse2; p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2; p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2; @@ -498,12 +500,9 @@ p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2; p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2; p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2; -p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2; -p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2; -p.satd[LUMA_64x16] = x265_pixel_satd_64x16_sse2; -p.satd[LUMA_64x32] = x265_pixel_satd_64x32_sse2; -p.satd[LUMA_64x48] = x265_pixel_satd_64x48_sse2; -p.satd[LUMA_64x64] = x265_pixel_satd_64x64_sse2; +p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2; +p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2; +p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2; p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; diff -r a616349e2a19 -r 31c21157620c source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 03 15:13:25 2013 +0530 +++ b/source/common/x86/pixel-a.asm Tue Dec 03 15:26:12 2013 +0530 @@ -1190,14 +1190,14 @@ mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal -lea r0, [r6 + 8] -lea r2, [r7 + 8] -call pixel_satd_8x8_internal -lea r0, [r6 + 16] -lea r2, [r7 + 16] -call pixel_satd_8x8_internal -lea r0, [r6 + 24] -lea r2, [r7 + 24] +lea r0, [r6 + 8*SIZEOF_PIXEL] +lea r2, [r7 + 8*SIZEOF_PIXEL] +call 
pixel_satd_8x8_internal +lea r0, [r6 + 16*SIZEOF_PIXEL] +lea r2, [r7 + 16*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +lea r0, [r6 + 24*SIZEOF_PIXEL] +lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal SATD_END_SSE2 m6 %else @@ -1206,17 +1206,17 @@ mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal -lea r0, [r6 + 8] +lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 8 -call pixel_satd_8x8_internal -lea r0, [r6 + 16] +add r2, 8*SIZEOF_PIXEL +call pixel_satd_8x8_internal +lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 16 -call pixel_satd_8x8_internal -lea r0, [r6 + 24] +add r2, 16*SIZEOF_PIXEL +call pixel_satd_8x8_internal +lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 24 +add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal SATD_END_SSE2 m6 %endif @@ -1228,16 +1228,16 @@ mov r7, r2 call pixel_satd_8x8_internal call pixel_satd_8x8_internal -lea r0, [r6 + 8] -lea r2, [r7 + 8] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 16] -lea r2, [r7 + 16] -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 24] -lea r2, [r7 + 24] +lea r0, [r6 + 8*SIZEOF_PIXEL] +lea r2, [r7 + 8*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +lea r0, [r6 + 16*SIZEOF_PIXEL] +lea r2, [r7 + 16*SIZEOF_PIXEL] +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +lea r0, [r6 + 24*SIZEOF_PIXEL] +lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal call pixel_satd_8x8_internal SATD_END_SSE2 m6 @@ -1248,19 +1248,19 @@ mov [rsp], r2 call pixel_satd_8x8_internal call pixel_satd_8x8_internal -lea r0, [r6 + 8] +lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 8 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 16] +add r2, 8*SIZEOF_PIXEL +call pixel_satd_8x8_internal +call pixel_satd_8x8_internal +lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 16 -call pixel_satd_8x8_internal -call pixel_satd_8x8_internal -lea r0, [r6 + 24] +add r2, 16*SIZEOF_PIXEL +call 
pixel_satd_8x8_internal +call pixel_satd_8x8_internal +lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] -add r2, 24 +add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal call pixel_satd_8x8_internal SATD_END_SSE2 m6 @@ -1274,22 +1274,25 @@ call pixel_satd_8x8_internal call pixel_satd_8x8_internal call pixel_satd_8x8_internal -lea r0, [r6 + 8] -
[x265] [PATCH] aq: bug fix for hash mismatch between recon and decoded output
# HG changeset patch # User Aarthi Thirumalai # Date 1386068495 -19800 # Tue Dec 03 16:31:35 2013 +0530 # Node ID 660ec2c027982db73366560ca8f600e5d86cc2e3 # Parent 86d23688b0174e06f3949c81ac182ba3e83908d1 aq: bug fix for hash mismatch between recon with decoded output diff -r 86d23688b017 -r 660ec2c02798 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Tue Dec 03 11:24:15 2013 +0530 +++ b/source/encoder/compress.cpp Tue Dec 03 16:31:35 2013 +0530 @@ -74,6 +74,7 @@ cu-m_totalBits = m_entropyCoder-getNumberOfWrittenBits(); cu-m_totalCost = m_rdCost-calcRdCost(cu-m_totalDistortion, cu-m_totalBits); +xCheckDQP(cu); } void TEncCu::xComputeCostIntraInInter(TComDataCU* cu, PartSize partSize) @@ -302,6 +303,7 @@ //No-residue mode m_search-encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); +xCheckDQP(outTempCU); tmp = outTempCU; outTempCU = outBestCU; @@ -313,6 +315,7 @@ //Encode with residue m_search-estimateRDInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); +xCheckDQP(outTempCU); if (outTempCU-m_totalCost outBestCU-m_totalCost)//Choose best from no-residue mode and residue mode { @@ -485,6 +488,7 @@ m_search-estimateRDInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_bestRecoYuv[depth], false); +xCheckDQP(outBestCU); if (m_bestMergeCU[depth]-m_totalCost outBestCU-m_totalCost) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386075793 -19800 # Tue Dec 03 18:33:13 2013 +0530 # Node ID 126f3aefc79dad37e7985953c404ccff370d2729 # Parent 5c2fcf4dfc981de6ede28e6b205e0d27c6d4608d asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN diff -r 5c2fcf4dfc98 -r 126f3aefc79d source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Tue Dec 03 12:21:16 2013 +0530 +++ b/source/common/x86/ssd-a.asm Tue Dec 03 18:33:13 2013 +0530 @@ -45,7 +45,7 @@ ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;- %macro SSD_ONE 2 -cglobal pixel_ssd_ss_%1x%2, 4,7,6 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 FIX_STRIDES r1, r3 %if mmsize == %1*2 %define offset0_1 r1 @@ -81,10 +81,14 @@ movum2, [r0+offset0_1] movum3, [r0+offset0_2] movum4, [r0+offset0_3] -psubw m1, [r2] -psubw m2, [r2+offset1_1] -psubw m3, [r2+offset1_2] -psubw m4, [r2+offset1_3] +movum6, [r2] +movum7, [r2+offset1_1] +psubw m1, m6 +psubw m2, m7 +movum6, [r2+offset1_2] +movum7, [r2+offset1_3] +psubw m3, m6 +psubw m4, m7 %if %%n 1 lea r0, [r0+r1*(%2/%%n)] lea r2, [r2+r3*(%2/%%n)] @@ -109,6 +113,205 @@ RET %endmacro +%macro SSD_TWO 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 +FIX_STRIDES r1, r3 +pxorm0, m0 +mov r4d, %2/2 +lea r5, [r1 * 2] +lea r6, [r3 * 2] +.loop +movum1, [r0] +movum2, [r0 + 16] +movum3, [r0 + 32] +movum4, [r0 + 48] +movum6, [r2] +movum7, [r2 + 16] +psubw m1, m6 +psubw m2, m7 +movum6, [r2 + 32] +movum7, [r2 + 48] +psubw m3, m6 +psubw m4, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m0, m1 +paddd m0, m3 +movum1, [r0 + 64] +movum2, [r0 + 80] +movum6, [r2 + 64] +movum7, [r2 + 80] +psubw m1, m6 +psubw m2, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +paddd m1, m2 +paddd m0, m1 +%if %1 == 64 +movum3, [r0 + 96] +movum4, [r0 + 112] +movum6, [r2 + 96] +movum7, [r2 + 112] +psubw m3, m6 +psubw m4, m7 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m3, m4 +paddd m0, m3 +%endif +movum1, [r0 + r1] +movum2, [r0 + r1 + 16] 
+movum3, [r0 + r1 + 32] +movum4, [r0 + r1 + 48] +movum6, [r2 + r3] +movum7, [r2 + r3 + 16] +psubw m1, m6 +psubw m2, m7 +movum6, [r2 + r3 + 32] +movum7, [r2 + r3 + 48] +psubw m3, m6 +psubw m4, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m0, m1 +paddd m0, m3 +movum1, [r0 + r1 + 64] +movum2, [r0 + r1 + 80] +movum6, [r2 + r3 + 64] +movum7, [r2 + r3 + 80] +psubw m1, m6 +psubw m2, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +paddd m1, m2 +paddd m0, m1 +%if %1 == 64 +movum3, [r0 + r1 + 96] +movum4, [r0 + r1 + 112] +movum6, [r2 + r3 + 96] +movum7, [r2 + r3 + 112] +psubw m3, m6 +psubw m4, m7 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m3, m4 +paddd m0, m3 +%endif +lea r0, [r0 + r5] +lea r2, [r2 + r6] +dec r4d +jnz .loop +HADDD m0, m5 +movd eax, xm0 +RET +%endmacro +%macro SSD_24 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 +FIX_STRIDES r1, r3 +pxorm0, m0 +mov r4d, %2/2 +lea r5, [r1 * 2] +lea r6, [r3 * 2] +.loop +movum1, [r0] +movum2, [r0 + 16] +movum3, [r0 + 32] +movum5, [r2] +movum6, [r2 + 16] +movum7, [r2 + 32] +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m1, m2 +paddd m0, m1 +movum1, [r0 + r1] +movum2, [r0 + r1 + 16] +movum4, [r0 + r1 + 32] +movum5, [r2 + r3] +movum6, [r2 + r3 + 16] +movum7, [r2 + r3 + 32] +psubw m1, m5 +psubw m2, m6 +psubw m4, m7 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m0, m1 +paddd m0, m3 +lea r0, [r0 + r5] +lea r2, [r2 + r6] +dec r4d +jnz .loop +HADDD m0, m5 +movd eax, xm0 +RET +%endmacro +%macro SSD_12 2 +cglobal pixel_ssd_ss_%1x%2, 4,7,8 +FIX_STRIDES r1, r3 +pxorm0, m0 +mov r4d, %2/4 +lea r5, [r1 * 2] +lea r6, [r3 * 2] +.loop +movum1, [r0] +
Re: [x265] [PATCH Review only] asm: 10bpp code for transpose 4x4 and 8x8
Ignore this patch. Need modifications for 16x16. On Tue, Dec 3, 2013 at 7:08 PM, muru...@multicorewareinc.com wrote: # HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386077908 -19800 # Tue Dec 03 19:08:28 2013 +0530 # Node ID 1ae4e8ae04d0792db6590a62272990d83f49a265 # Parent 126f3aefc79dad37e7985953c404ccff370d2729 asm: 10bpp code for transpose 4x4 and 8x8 diff -r 126f3aefc79d -r 1ae4e8ae04d0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 19:08:28 2013 +0530 @@ -520,6 +520,9 @@ p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2; p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2; +p.transpose[BLOCK_4x4] = x265_transpose4_sse2; +p.transpose[BLOCK_8x8] = x265_transpose8_sse2; + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); diff -r 126f3aefc79d -r 1ae4e8ae04d0 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/pixel-util8.asm Tue Dec 03 19:08:28 2013 +0530 @@ -830,7 +830,20 @@ ;- INIT_XMM sse2 cglobal transpose4, 3, 3, 4, dest, src, stride - +%if HIGH_BIT_DEPTH +add r2,r2 +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + 2 * r2] +lea r1,[r1 + 2 * r2] +movh m3,[r1 + r2] +punpcklwdm0,m1 +punpcklwdm2,m3 +punpckhdqm1,m0,m2 +punpckldqm0,m2 +movu [r0], m0 +movu [r0 + 16], m1 +%else movd m0,[r1] movd m1,[r1 + r2] movd m2,[r1 + 2 * r2] @@ -841,26 +854,61 @@ punpcklbwm2,m3 punpcklwdm0,m2 movu [r0],m0 - +%endif RET ;- ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 -cglobal transpose8, 3, 3, 8, dest, src, stride - +%if HIGH_BIT_DEPTH +%macro TRANSPOSE_4x4 1 movh m0,[r1] movh m1,[r1 + r2] movh m2,[r1 + 2 * r2] lea r1,[r1 + 2 * r2] movh m3,[r1 + r2] -movh m4,[r1 + 2 * r2] -lea r1,[r1 + 2 * r2] +punpcklwdm0,m1 +punpcklwdm2,m3 +punpckhdqm1,m0,m2 +punpckldqm0,m2 +movlps [r0], m0 
+movhps [r0 + %1],m0 +movlps [r0 + 2 * %1],m1 +lear0, [r0 + 2 * %1] +movhps [r0 + %1],m1 +%endmacro +cglobal transpose8_internal +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8] +TRANSPOSE_4x4 r5 +lear1,[r4 + 8] +lear0,[r3 + 4 * r5] +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8 + 4 * r5] +TRANSPOSE_4x4 r5 +ret +cglobal transpose8, 3, 6, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,16 +call transpose8_internal +%else +cglobal transpose8, 3, 5, 8, dest, src, stride +lea r3,[2 * r2] +lea r4,[3 * r2] +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + r3] +movh m3,[r1 + r4] +movh m4,[r1 + 4 * r2] +lea r1,[r1 + 4 * r2] movh m5,[r1 + r2] -movh m6,[r1 + 2 * r2] -lea r1,[r1 + 2 * r2] -movh m7,[r1 + r2] +movh m6,[r1 + r3] +movh m7,[r1 + r4] punpcklbwm0,m1 punpcklbwm2,m3 @@ -880,7 +928,7 @@ movu [r0 + 16],m2 movu [r0 + 32],m1 movu [r0 + 48],m3 - +%endif RET %macro TRANSPOSE_8x8 1 -- With Regards, Murugan. V +919659287478 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH Review only] asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386082906 -19800 # Tue Dec 03 20:31:46 2013 +0530 # Node ID 99134096118bff621f56949e3922cd3f53afdf10 # Parent 126f3aefc79dad37e7985953c404ccff370d2729 asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks diff -r 126f3aefc79d -r 99134096118b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 20:31:46 2013 +0530 @@ -520,6 +520,10 @@ p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2; p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2; +p.transpose[BLOCK_4x4] = x265_transpose4_sse2; +p.transpose[BLOCK_8x8] = x265_transpose8_sse2; +p.transpose[BLOCK_16x16] = x265_transpose16_sse2; + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); diff -r 126f3aefc79d -r 99134096118b source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/pixel-util8.asm Tue Dec 03 20:31:46 2013 +0530 @@ -830,7 +830,20 @@ ;- INIT_XMM sse2 cglobal transpose4, 3, 3, 4, dest, src, stride - +%if HIGH_BIT_DEPTH +add r2,r2 +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + 2 * r2] +lea r1,[r1 + 2 * r2] +movh m3,[r1 + r2] +punpcklwdm0,m1 +punpcklwdm2,m3 +punpckhdqm1,m0,m2 +punpckldqm0,m2 +movu [r0], m0 +movu [r0 + 16], m1 +%else movd m0,[r1] movd m1,[r1 + r2] movd m2,[r1 + 2 * r2] @@ -841,15 +854,88 @@ punpcklbwm2,m3 punpcklwdm0,m2 movu [r0],m0 - +%endif RET ;- ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 -cglobal transpose8, 3, 3, 8, dest, src, stride - +%if HIGH_BIT_DEPTH +%macro TRANSPOSE_4x4 1 +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + 2 * r2] +lea r1,[r1 + 2 * r2] +movh m3,[r1 + r2] +punpcklwdm0,m1 +punpcklwdm2,m3 +punpckhdqm1,m0,m2 +punpckldqm0,m2 +movlps [r0], m0 +movhps [r0 + %1],m0 +movlps [r0 + 2 * %1],m1 +lear0, [r0 + 2 * %1] +movhps [r0 + %1],m1 
+%endmacro +cglobal transpose8_internal +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8] +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +negr2 +lear1,[r1 + r2 * 8 + 8] +negr2 +lear0,[r3 + 4 * r5] +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8 + 4 * r5] +TRANSPOSE_4x4 r5 +ret +cglobal transpose8, 3, 6, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,16 +call transpose8_internal +ret +%else +cglobal transpose8, 3, 5, 8, dest, src, stride +lea r3,[2 * r2] +lea r4,[3 * r2] +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + r3] +movh m3,[r1 + r4] +movh m4,[r1 + 4 * r2] +lea r1,[r1 + 4 * r2] +movh m5,[r1 + r2] +movh m6,[r1 + r3] +movh m7,[r1 + r4] + +punpcklbwm0,m1 +punpcklbwm2,m3 +punpcklbwm4,m5 +punpcklbwm6,m7 + +punpckhwdm1,m0,m2 +punpcklwdm0,m2 +punpckhwdm5,m4,m6 +punpcklwdm4,m6 +punpckhdqm2,m0,m4 +punpckldqm0,m4 +punpckhdqm3,m1,m5 +punpckldqm1,m5 + +movu [r0], m0 +movu [r0 + 16],m2 +movu [r0 + 32],m1 +movu [r0 + 48],m3 +%endif +RET + +%macro TRANSPOSE_8x8 1 movh m0,[r1] movh m1,[r1 + r2] movh m2,[r1 + 2 * r2] @@ -866,42 +952,6 @@ punpcklbwm2,m3 punpcklbwm4,m5 punpcklbwm6,m7 - -punpckhwdm1,m0,m2 -punpcklwdm0,m2 -punpckhwdm5,m4,m6 -punpcklwdm4,m6 -punpckhdqm2,m0,m4 -punpckldqm0,m4 -punpckhdqm3,m1,m5 -punpckldqm1,m5 - -movu [r0], m0 -movu [r0 + 16],m2 -movu [r0 + 32],m1 -movu [r0 + 48],m3 - -RET -
Re: [x265] [PATCH Review only] asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
I verify your 4x4 and 8x8 code, it is right, I pushed it, may you make a new 16x16 patch? At 2013-12-03 23:02:09,muru...@multicorewareinc.com wrote: # HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386082906 -19800 # Tue Dec 03 20:31:46 2013 +0530 # Node ID 99134096118bff621f56949e3922cd3f53afdf10 # Parent 126f3aefc79dad37e7985953c404ccff370d2729 asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks diff -r 126f3aefc79d -r 99134096118b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 20:31:46 2013 +0530 @@ -520,6 +520,10 @@ p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2; p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2; +p.transpose[BLOCK_4x4] = x265_transpose4_sse2; +p.transpose[BLOCK_8x8] = x265_transpose8_sse2; +p.transpose[BLOCK_16x16] = x265_transpose16_sse2; + p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); diff -r 126f3aefc79d -r 99134096118b source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asmTue Dec 03 18:33:13 2013 +0530 +++ b/source/common/x86/pixel-util8.asmTue Dec 03 20:31:46 2013 +0530 @@ -830,7 +830,20 @@ ;- INIT_XMM sse2 cglobal transpose4, 3, 3, 4, dest, src, stride - +%if HIGH_BIT_DEPTH +add r2,r2 +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + 2 * r2] +lea r1,[r1 + 2 * r2] +movh m3,[r1 + r2] +punpcklwdm0,m1 +punpcklwdm2,m3 +punpckhdqm1,m0,m2 +punpckldqm0,m2 +movu [r0], m0 +movu [r0 + 16], m1 +%else movd m0,[r1] movd m1,[r1 + r2] movd m2,[r1 + 2 * r2] @@ -841,15 +854,88 @@ punpcklbwm2,m3 punpcklwdm0,m2 movu [r0],m0 - +%endif RET ;- ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 -cglobal transpose8, 3, 3, 8, dest, src, stride - +%if HIGH_BIT_DEPTH +%macro TRANSPOSE_4x4 1 +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + 2 * r2] +lea r1,[r1 + 2 * r2] +movh m3,[r1 + r2] +punpcklwdm0,m1 +punpcklwdm2,m3 
+punpckhdqm1,m0,m2 +punpckldqm0,m2 +movlps [r0], m0 +movhps [r0 + %1],m0 +movlps [r0 + 2 * %1],m1 +lear0, [r0 + 2 * %1] +movhps [r0 + %1],m1 +%endmacro +cglobal transpose8_internal +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8] +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +negr2 +lear1,[r1 + r2 * 8 + 8] +negr2 +lear0,[r3 + 4 * r5] +TRANSPOSE_4x4 r5 +lear1,[r1 + 2 * r2] +lear0,[r3 + 8 + 4 * r5] +TRANSPOSE_4x4 r5 +ret +cglobal transpose8, 3, 6, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,16 +call transpose8_internal +ret +%else +cglobal transpose8, 3, 5, 8, dest, src, stride +lea r3,[2 * r2] +lea r4,[3 * r2] +movh m0,[r1] +movh m1,[r1 + r2] +movh m2,[r1 + r3] +movh m3,[r1 + r4] +movh m4,[r1 + 4 * r2] +lea r1,[r1 + 4 * r2] +movh m5,[r1 + r2] +movh m6,[r1 + r3] +movh m7,[r1 + r4] + +punpcklbwm0,m1 +punpcklbwm2,m3 +punpcklbwm4,m5 +punpcklbwm6,m7 + +punpckhwdm1,m0,m2 +punpcklwdm0,m2 +punpckhwdm5,m4,m6 +punpcklwdm4,m6 +punpckhdqm2,m0,m4 +punpckldqm0,m4 +punpckhdqm3,m1,m5 +punpckldqm1,m5 + +movu [r0], m0 +movu [r0 + 16],m2 +movu [r0 + 32],m1 +movu [r0 + 48],m3 +%endif +RET + +%macro TRANSPOSE_8x8 1 movh m0,[r1] movh m1,[r1 + r2] movh m2,[r1 + 2 * r2] @@ -866,42 +952,6 @@ punpcklbwm2,m3 punpcklbwm4,m5 punpcklbwm6,m7 - -punpckhwdm1,m0,m2 -punpcklwdm0,m2 -punpckhwdm5,m4,m6 -punpcklwdm4,m6 -punpckhdqm2,m0,m4 -punpckldqm0,m4 -punpckhdqm3,m1,m5 -punpckldqm1,
Re: [x265] [PATCH] aq: bug fix for hash mismatch between recon and decoded output
Pushed. So what are the latest results for different sequences on enabling aq-mode? On Tue, Dec 3, 2013 at 4:31 PM, Aarthi Thirumalai aar...@multicorewareinc.com wrote: # HG changeset patch # User Aarthi Thirumalai # Date 1386068495 -19800 # Tue Dec 03 16:31:35 2013 +0530 # Node ID 660ec2c027982db73366560ca8f600e5d86cc2e3 # Parent 86d23688b0174e06f3949c81ac182ba3e83908d1 aq: bug fix for hash mismatch between recon with decoded output diff -r 86d23688b017 -r 660ec2c02798 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Tue Dec 03 11:24:15 2013 +0530 +++ b/source/encoder/compress.cpp Tue Dec 03 16:31:35 2013 +0530 @@ -74,6 +74,7 @@ cu-m_totalBits = m_entropyCoder-getNumberOfWrittenBits(); cu-m_totalCost = m_rdCost-calcRdCost(cu-m_totalDistortion, cu-m_totalBits); +xCheckDQP(cu); } void TEncCu::xComputeCostIntraInInter(TComDataCU* cu, PartSize partSize) @@ -302,6 +303,7 @@ //No-residue mode m_search-encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); +xCheckDQP(outTempCU); tmp = outTempCU; outTempCU = outBestCU; @@ -313,6 +315,7 @@ //Encode with residue m_search-estimateRDInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); +xCheckDQP(outTempCU); if (outTempCU-m_totalCost outBestCU-m_totalCost)//Choose best from no-residue mode and residue mode { @@ -485,6 +488,7 @@ m_search-estimateRDInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_bestRecoYuv[depth], false); +xCheckDQP(outBestCU); if (m_bestMergeCU[depth]-m_totalCost outBestCU-m_totalCost) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [ANN] x265 0.6 is released
x265 0.6 is a regularly scheduled release There were large improvements in compression efficiency since 0.5, mostly a result of the completion of weightp and b-pyramid. There is also a large amount of new assembly code; replacing most of the compiler intrinsic functions and adding coverage for some new primitives. = New Features = * CLI reads input video from stdin * Main10 profile is enabled, requires a HIGH_BIT_DEPTH build * weightp is now complete enough to be enabled by default * performance presets have been defined, matching x264 preset names * b-pyramid (hierarchical B frames) now supported * Constant Rate Factor rate control is considered stable * Adaptive Quantization introduced, still experimental Adaptive Quantization is still considered experimental. We are not always seeing the expected improvements to SSIM when it is enabled, and thus it is still not enabled by default. = API Changes = * x265_nal data members renamed * x265_picture now has colorSpace member * --weightp enabled by default * default parameters now match our medium preset * new x265_param_default_preset() method for assigning preset and tune * new x265_param_alloc() and x265_param_free() methods for version safety * new x265_picture_alloc() and x265_picture_free() methods for version safety The public data structures have changed enough that apps compiled against previous versions of x265 must be recompiled to use x265 0.6. We are taking steps to add version safety to the public interface. If you use the new alloc/free methods for the param and picture structures, and use x265_param_parse() to set param values by name, you will likely not have to recompile your application to dynamically link against later releases of x265. 
= New CLI Options = * --y4m overrides detection of Y4M input stream, ex: x265 --y4m - out.hevc vid.y4m * --version long option alias for -V * -p/--preset sets performance preset * -t/--tune sets parameter tuning * --[no-]b-pyramid enabled by default * --input-csp color space parameter, only i420 is supported in this release * --crf constant rate factor rate control * --aq-mode and --aq-strength See x265 --help for more details = Upcoming improvements = * motion compensated weightp analysis (using lookahead data) * CU-tree (MBtree adapted from x264) * VBV rate control * assembly for HIGH_BIT_DEPTH builds -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] 16bpp: assembly code for sad_NxN functions
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1386137638 -19800 # Wed Dec 04 11:43:58 2013 +0530 # Node ID 3e6159dfd6a1b59beedf430894b997e848e9e3b1 # Parent 55c0bf9d99661073a7acdb5749e2625379d8393a 16bpp: assembly code for sad_NxN functions diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/CMakeLists.txt Wed Dec 04 11:43:58 2013 +0530 @@ -118,9 +118,15 @@ if(ENABLE_PRIMITIVES_ASM) set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h) -set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm ssd-a.asm mc-a.asm +set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm intrapred8.asm pixeladd8.asm dct8.asm) +if(HIGH_BIT_DEPTH) +set(A_SRCS ${A_SRCS} sad16-a.asm) +else() +set(A_SRCS ${A_SRCS} sad-a.asm) +endif() + if (NOT X64) set(A_SRCS ${A_SRCS} pixel-32.asm) endif() diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 11:43:58 2013 +0530 @@ -533,6 +533,27 @@ PIXEL_AVG(sse2); PIXEL_AVG_W4(mmx2); LUMA_VAR(_sse2); + +INIT8(sad, _mmx2); +p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2; +p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2; +p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2; +p.sad[LUMA_16x32] = x265_pixel_sad_16x32_sse2; + +p.sad[LUMA_32x8] = x265_pixel_sad_32x8_sse2; +p.sad[LUMA_32x16] = x265_pixel_sad_32x16_sse2; +p.sad[LUMA_32x24] = x265_pixel_sad_32x24_sse2; +p.sad[LUMA_32x32] = x265_pixel_sad_32x32_sse2; +p.sad[LUMA_32x64] = x265_pixel_sad_32x64_sse2; + +p.sad[LUMA_64x16] = x265_pixel_sad_64x16_sse2; +p.sad[LUMA_64x32] = x265_pixel_sad_64x32_sse2; +p.sad[LUMA_64x48] = x265_pixel_sad_64x48_sse2; +p.sad[LUMA_64x64] = x265_pixel_sad_64x64_sse2; + +p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2; 
+p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2; +p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2; } if (cpuMask X265_CPU_SSSE3) { diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/x86/sad16-a.asm --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/sad16-a.asm Wed Dec 04 11:43:58 2013 +0530 @@ -0,0 +1,772 @@ +;* +;* sad16-a.asm: x86 high depth sad functions +;* +;* Copyright (C) 2010-2013 x264 project +;* +;* Authors: Oskar Arvidsson os...@irock.se +;* Henrik Gramner hen...@gramner.com +;* Dnyaneshwar Gorade dnyanesh...@multicorewareinc.com +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licens...@x264.com. 
+;* + +%include x86inc.asm +%include x86util.asm + +SECTION .text + +cextern pw_1 + +;= +; SAD MMX +;= + +%macro SAD_INC_1x16P_MMX 0 +movum1, [r0+ 0] +movum2, [r0+ 8] +movum3, [r0+16] +movum4, [r0+24] +psubw m1, [r2+ 0] +psubw m2, [r2+ 8] +psubw m3, [r2+16] +psubw m4, [r2+24] +ABSW2 m1, m2, m1, m2, m5, m6 +ABSW2 m3, m4, m3, m4, m7, m5 +lea r0, [r0+2*r1] +lea r2, [r2+2*r3] +paddw m1, m2 +paddw m3, m4 +paddw m0, m1 +paddw m0, m3 +%endmacro + +%macro SAD_INC_2x8P_MMX 0 +movum1, [r0+0] +movum2, [r0+8] +movum3, [r0+2*r1+0] +movum4, [r0+2*r1+8] +psubw m1, [r2+0] +psubw m2, [r2+8] +psubw m3, [r2+2*r3+0] +psubw m4, [r2+2*r3+8] +ABSW2 m1, m2, m1, m2, m5, m6 +ABSW2 m3, m4, m3, m4, m7, m5 +lea
[x265] [PATCH] asm: 10bpp code for transpose 16x16
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386138979 -19800 # Wed Dec 04 12:06:19 2013 +0530 # Node ID 8b73b22d90e1a0d70495e8b5f009a9c4fc37f258 # Parent 55c0bf9d99661073a7acdb5749e2625379d8393a asm: 10bpp code for transpose 16x16 diff -r 55c0bf9d9966 -r 8b73b22d90e1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:06:19 2013 +0530 @@ -528,6 +528,7 @@ p.transpose[BLOCK_4x4] = x265_transpose4_sse2; p.transpose[BLOCK_8x8] = x265_transpose8_sse2; +p.transpose[BLOCK_16x16] = x265_transpose16_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); diff -r 55c0bf9d9966 -r 8b73b22d90e1 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/x86/pixel-util8.asm Wed Dec 04 12:06:19 2013 +0530 @@ -883,7 +883,10 @@ lear1,[r1 + 2 * r2] lear0,[r3 + 8] TRANSPOSE_4x4 r5 -lear1,[r4 + 8] +lear1,[r1 + 2 * r2] +negr2 +lear1,[r1 + r2 * 8 + 8] +negr2 lear0,[r3 + 4 * r5] TRANSPOSE_4x4 r5 lear1,[r1 + 2 * r2] @@ -893,7 +896,6 @@ cglobal transpose8, 3, 6, 4, dest, src, stride addr2,r2 movr3,r0 -movr4,r1 movr5,16 call transpose8_internal %else @@ -978,8 +980,29 @@ ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 +%if HIGH_BIT_DEPTH +cglobal transpose16, 3, 7, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,32 +movr6,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16] +movr3,r0 +call transpose8_internal +lear1,[r4 + 16] +lear0,[r6 + 8 * r5] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * r5 + 16] +movr3,r0 +call transpose8_internal + +%else cglobal transpose16, 3, 5, 8, dest, src, stride - movr3,r0 movr4,r1 TRANSPOSE_8x8 16 @@ -992,7 +1015,7 @@ lear1,[r1 + 2 * r2] lear0,[r3 + 8 * 16 + 8] TRANSPOSE_8x8 16 - +%endif RET cglobal transpose16_internal ___ x265-devel 
mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] 16bpp: enabled cvt32to16_shr and cvt16to32_shl assembly code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1386139398 -19800 # Wed Dec 04 12:13:18 2013 +0530 # Node ID 5b95aefb3aaec8e63e6cb54998b5add7e585841f # Parent 3e6159dfd6a1b59beedf430894b997e848e9e3b1 16bpp: enabled avt32to16_shr and cvt16to32_shl assembly code diff -r 3e6159dfd6a1 -r 5b95aefb3aae source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 04 11:43:58 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:13:18 2013 +0530 @@ -554,6 +554,9 @@ p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2; p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2; p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2; + +p.cvt32to16_shr = x265_cvt32to16_shr_sse2; +p.cvt16to32_shl = x265_cvt16to32_shl_sse2; } if (cpuMask X265_CPU_SSSE3) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: 10bpp code for transpose 32x32
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386140597 -19800 # Wed Dec 04 12:33:17 2013 +0530 # Node ID ee1221fac033355129128ba5f847910e3ed49047 # Parent 8b73b22d90e1a0d70495e8b5f009a9c4fc37f258 asm: 10bpp code for transpose 32x32 diff -r 8b73b22d90e1 -r ee1221fac033 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 04 12:06:19 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:33:17 2013 +0530 @@ -529,6 +529,7 @@ p.transpose[BLOCK_4x4] = x265_transpose4_sse2; p.transpose[BLOCK_8x8] = x265_transpose8_sse2; p.transpose[BLOCK_16x16] = x265_transpose16_sse2; +p.transpose[BLOCK_32x32] = x265_transpose32_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); diff -r 8b73b22d90e1 -r ee1221fac033 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Wed Dec 04 12:06:19 2013 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Dec 04 12:33:17 2013 +0530 @@ -1039,8 +1039,76 @@ ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 +%if HIGH_BIT_DEPTH +cglobal transpose32, 3, 7, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,64 +movr6,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 48] +movr3,r0 +call transpose8_internal +lear1,[r4 + 16] +lear0,[r6 + 8 * 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 64 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 64 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 64 + 48] +movr3,r0 +call transpose8_internal +lear1,[r4 + 32] +lear0,[r6 + 16 * 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 64 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] 
+lear0,[r6 + 16 * 64 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 64 + 48] +movr3,r0 +call transpose8_internal +lear1,[r4 + 48] +lear0,[r6 + 24 * 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 64 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 64 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 64 + 48] +movr3,r0 +call transpose8_internal +%else cglobal transpose32, 3, 7, 8, dest, src, stride - movr3,r0 movr4,r1 movr5,r0 @@ -1058,7 +1126,7 @@ lear0,[r3 + 16 * 32 + 16] movr5,r0 call transpose16_internal - +%endif RET ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: 16bpp asm code for pixel_sa8d_16xN
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1386140982 -19800 # Wed Dec 04 12:39:42 2013 +0530 # Node ID 6a41cb559feb98056d30482651f5a83f5e326300 # Parent 55c0bf9d99661073a7acdb5749e2625379d8393a asm: 16bpp asm code for pixel_sa8d_16xN diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:39:42 2013 +0530 @@ -504,6 +504,18 @@ p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2; p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2; +p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2; +p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_sse2; +p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_sse2; +p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_sse2; +p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_sse2; +p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_sse2; +p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_sse2; +p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_sse2; +p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_sse2; +p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_sse2; +p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_sse2; +p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2; p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 03 14:14:44 2013 -0600 +++ b/source/common/x86/pixel-a.asm Wed Dec 04 12:39:42 2013 +0530 @@ -2501,8 +2501,10 @@ %endmacro %macro AVG_16x16 0 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 @@ -2630,8 +2632,8 @@ mova m7, [hmul_8p] %endif SA8D_8x8 -add r0, 8 -add r2, 8 +add r0, 8*SIZEOF_PIXEL +add r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET @@ -3601,6 +3603,9 @@ lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 +%if 
HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3614,8 +3619,10 @@ SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 @@ -3629,6 +3636,9 @@ lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3646,8 +3656,10 @@ SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 @@ -3665,6 +3677,9 @@ lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3678,8 +3693,10 @@ SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 @@ -3696,6 +3713,9 @@ lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3722,6 +3742,9 @@ lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3748,6 +3771,9 @@ lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -3761,8 +3787,10 @@ SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: 10bpp code for transpose 64x64
# HG changeset patch # User Murugan Vairavel muru...@multicorewareinc.com # Date 1386141799 -19800 # Wed Dec 04 12:53:19 2013 +0530 # Node ID e1e18d9cd5b0fa7d14c655819bd347a5c8accbde # Parent ee1221fac033355129128ba5f847910e3ed49047 asm: 10bpp code for transpose 64x64 diff -r ee1221fac033 -r e1e18d9cd5b0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 04 12:33:17 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:53:19 2013 +0530 @@ -530,6 +530,7 @@ p.transpose[BLOCK_8x8] = x265_transpose8_sse2; p.transpose[BLOCK_16x16] = x265_transpose16_sse2; p.transpose[BLOCK_32x32] = x265_transpose32_sse2; +p.transpose[BLOCK_64x64] = x265_transpose64_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; PIXEL_AVG(sse2); diff -r ee1221fac033 -r e1e18d9cd5b0 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Wed Dec 04 12:33:17 2013 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Dec 04 12:53:19 2013 +0530 @@ -1133,8 +1133,275 @@ ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride) ;- INIT_XMM sse2 +%if HIGH_BIT_DEPTH +cglobal transpose64, 3, 7, 4, dest, src, stride +addr2,r2 +movr3,r0 +movr4,r1 +movr5,128 +movr6,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 48] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 80] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 96] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 112] +movr3,r0 +call transpose8_internal + +lear1,[r4 + 16] +lear0,[r6 + 8 * 128] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 
128 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 48] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 80] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 96] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 8 * 128 + 112] +movr3,r0 +call transpose8_internal + +lear1,[r4 + 32] +lear0,[r6 + 16 * 128] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 48] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 64] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 80] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 96] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 16 * 128 + 112] +movr3,r0 +call transpose8_internal + +lear1,[r4 + 48] +lear0,[r6 + 24 * 128] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 128 + 16] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 128 + 32] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 128 + 48] +movr3,r0 +call transpose8_internal +lear1,[r1 - 8 + 2 * r2] +lear0,[r6 + 24 * 128 + 64] +movr3,r0 +call
[x265] [PATCH] asm: move constant to const-a.asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1386140099 -19800 # Wed Dec 04 12:24:59 2013 +0530 # Node ID 759a5eee46d38ffa0205fc1551291653086208c9 # Parent 5b95aefb3aaec8e63e6cb54998b5add7e585841f asm: move constant to const-a.asm diff -r 5b95aefb3aae -r 759a5eee46d3 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Dec 04 12:13:18 2013 +0530 +++ b/source/common/x86/const-a.asm Wed Dec 04 12:24:59 2013 +0530 @@ -74,6 +74,12 @@ const pd_, times 4 dd 0x const pw_ff00, times 8 dw 0xff00 +const multi_2Row, dw 1, 2, 3, 4, 1, 2, 3, 4 +const multiL, dw 1, 2, 3, 4, 5, 6, 7, 8 +const multiH, dw 9, 10, 11, 12, 13, 14, 15, 16 +const multiH2, dw 17, 18, 19, 20, 21, 22, 23, 24 +const multiH3, dw 25, 26, 27, 28, 29, 30, 31, 32 + const popcnt_table %assign x 0 %rep 256 diff -r 5b95aefb3aae -r 759a5eee46d3 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Dec 04 12:13:18 2013 +0530 +++ b/source/common/x86/intrapred8.asm Wed Dec 04 12:24:59 2013 +0530 @@ -26,12 +26,6 @@ SECTION_RODATA 32 -multi_2Row: dw 1, 2, 3, 4, 1, 2, 3, 4 -multiL: dw 1, 2, 3, 4, 5, 6, 7, 8 -multiH: dw 9, 10, 11, 12, 13, 14, 15, 16 -multiH2:dw 17, 18, 19, 20, 21, 22, 23, 24 -multiH3:dw 25, 26, 27, 28, 29, 30, 31, 32 - c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 const ang_table @@ -44,7 +38,12 @@ SECTION .text cextern pw_8 +cextern multiL +cextern multiH +cextern multiH2 +cextern multiH3 cextern pw_1024 +cextern multi_2Row ;- ; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 4] rdlevel: Add code for rdlevel 2
# HG changeset patch # User Deepthi Devaki deepthidev...@multicorewareinc.com # Date 1386142554 -19800 # Node ID 71d40e349b4062df2ac12b54cd6eded9cdfa70a2 # Parent 57a407b0a15d388b8c0cf26f752918fd3f74 rdlevel: Add code for rdlevel 2 Use signalling bits + sa8d cost to choose best among inter/merge/intra. Encode only best mode at each depth. diff -r 57a407b0 -r 71d40e349b40 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Wed Dec 04 13:04:39 2013 +0530 +++ b/source/encoder/compress.cpp Wed Dec 04 13:05:54 2013 +0530 @@ -286,39 +286,27 @@ } } -//calculate the motion compensation for chroma for the best mode selected -int numPart = outBestCU-getNumPartInter(); -for (int partIdx = 0; partIdx numPart; partIdx++) +if (m_cfg-param.rdLevel 2) { -m_search-motionCompensation(outBestCU, bestPredYuv, REF_PIC_LIST_X, partIdx, false, true); -} +//calculate the motion compensation for chroma for the best mode selected +int numPart = outBestCU-getNumPartInter(); +for (int partIdx = 0; partIdx numPart; partIdx++) +{ +m_search-motionCompensation(outBestCU, bestPredYuv, REF_PIC_LIST_X, partIdx, false, true); +} -TComDataCU* tmp; -TComYuv *yuv; +TComDataCU* tmp; +TComYuv *yuv; -outTempCU-setMergeIndexSubParts(bestMergeCand, 0, 0, depth); -outTempCU-setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth); -outTempCU-getCUMvField(REF_PIC_LIST_0)-setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); -outTempCU-getCUMvField(REF_PIC_LIST_1)-setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); +outTempCU-setMergeIndexSubParts(bestMergeCand, 0, 0, depth); +outTempCU-setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth); + outTempCU-getCUMvField(REF_PIC_LIST_0)-setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); + outTempCU-getCUMvField(REF_PIC_LIST_1)-setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); -//No-residue mode -m_search-encodeResAndCalcRdInterCU(outTempCU, 
m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); -xCheckDQP(outTempCU); +//No-residue mode +m_search-encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); +xCheckDQP(outTempCU); -tmp = outTempCU; -outTempCU = outBestCU; -outBestCU = tmp; - -yuv = yuvReconBest; -yuvReconBest = m_tmpRecoYuv[depth]; -m_tmpRecoYuv[depth] = yuv; - -//Encode with residue -m_search-encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); -xCheckDQP(outTempCU); - -if (outTempCU-m_totalCost outBestCU-m_totalCost)//Choose best from no-residue mode and residue mode -{ tmp = outTempCU; outTempCU = outBestCU; outBestCU = tmp; @@ -326,8 +314,22 @@ yuv = yuvReconBest; yuvReconBest = m_tmpRecoYuv[depth]; m_tmpRecoYuv[depth] = yuv; + +//Encode with residue +m_search-encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); +xCheckDQP(outTempCU); + +if (outTempCU-m_totalCost outBestCU-m_totalCost)//Choose best from no-residue mode and residue mode +{ +tmp = outTempCU; +outTempCU = outBestCU; +outBestCU = tmp; + +yuv = yuvReconBest; +yuvReconBest = m_tmpRecoYuv[depth]; +m_tmpRecoYuv[depth] = yuv; +} } -m_tmpResiYuv[depth]-clear(); x265_emms(); } @@ -479,17 +481,19 @@ m_modePredYuv[2][depth] = m_bestPredYuv[depth]; m_bestPredYuv[depth] = tempYuv; } -//calculate the motion compensation for chroma for the best mode selected -int numPart = outBestCU-getNumPartInter(); -for (int partIdx = 0; partIdx numPart; partIdx++) +if (m_cfg-param.rdLevel 2) { -m_search-motionCompensation(outBestCU, m_bestPredYuv[depth], REF_PIC_LIST_X, partIdx, false, true); +//calculate the motion compensation for chroma for the best mode selected +int numPart = outBestCU-getNumPartInter(); +for (int partIdx = 0; partIdx numPart; 
partIdx++) +{ +m_search-motionCompensation(outBestCU, m_bestPredYuv[depth], REF_PIC_LIST_X, partIdx, false, true); +} + +m_search-encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth],
[x265] [PATCH] asm: 16bpp support for sa8d - 24x32 and 48x64
# HG changeset patch # User Yuvaraj Venkatesh yuva...@multicorewareinc.com # Date 1386142683 -19800 # Wed Dec 04 13:08:03 2013 +0530 # Node ID 546523046d990119dc910b87ebe3f4c8ab25f236 # Parent 6a41cb559feb98056d30482651f5a83f5e326300 asm: 16bpp support for sa8d - 24x32 and 48x64 diff -r 6a41cb559feb -r 546523046d99 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 04 12:39:42 2013 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 13:08:03 2013 +0530 @@ -516,6 +516,8 @@ p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_sse2; p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_sse2; p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2; +p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_sse2; +p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_sse2; p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2; p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2; diff -r 6a41cb559feb -r 546523046d99 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Dec 04 12:39:42 2013 +0530 +++ b/source/common/x86/pixel-a.asm Wed Dec 04 13:08:03 2013 +0530 @@ -2683,38 +2683,38 @@ mova m7, [hmul_8p] %endif SA8D_8x8 -add r0, 8 -add r2, 8 +add r0, 8*SIZEOF_PIXEL +add r2, 8*SIZEOF_PIXEL SA8D_8x8 -add r0, 8 -add r2, 8 +add r0, 8*SIZEOF_PIXEL +add r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 -sub r0, 8 -sub r2, 8 +sub r0, 8*SIZEOF_PIXEL +sub r2, 8*SIZEOF_PIXEL SA8D_8x8 -sub r0, 8 -sub r2, 8 +sub r0, 8*SIZEOF_PIXEL +sub r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 -add r0, 8 -add r2, 8 +add r0, 8*SIZEOF_PIXEL +add r2, 8*SIZEOF_PIXEL SA8D_8x8 -add r0, 8 -add r2, 8 +add r0, 8*SIZEOF_PIXEL +add r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 -sub r0, 8 -sub r2, 8 +sub r0, 8*SIZEOF_PIXEL +sub r2, 8*SIZEOF_PIXEL SA8D_8x8 -sub r0, 8 -sub r2, 8 +sub r0, 8*SIZEOF_PIXEL +sub r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET @@ -2909,8 +2909,8 @@ lea r5, [8*r3] 
sub r2, r4 sub r0, r5 -add r2, 16 -add r0, 16 +add r2, 16*SIZEOF_PIXEL +add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2918,8 +2918,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -add r2, 16 -add r0, 16 +add r2, 16*SIZEOF_PIXEL +add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2930,8 +2930,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -sub r2, 16 -sub r0, 16 +sub r2, 16*SIZEOF_PIXEL +sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2939,8 +2939,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -sub r2, 16 -sub r0, 16 +sub r2, 16*SIZEOF_PIXEL +sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2951,8 +2951,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -add r2, 16 -add r0, 16 +add r2, 16*SIZEOF_PIXEL +add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2960,8 +2960,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -add r2, 16 -add r0, 16 +add r2, 16*SIZEOF_PIXEL +add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2972,8 +2972,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -sub r2, 16 -sub r0, 16 +sub r2, 16*SIZEOF_PIXEL +sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -2981,8 +2981,8 @@ lea r5, [8*r3] sub r2, r4 sub r0, r5 -sub r2, 16 -sub r0, 16 +sub r2, 16*SIZEOF_PIXEL +sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 @@ -4577,6 +4577,9 @@ lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -4590,8 +4593,10 @@ SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 -paddusw m0, [esp+48] +SA8D_INTER +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd r4d, m0 add r4d, 1 shr r4d, 1 @@ -4603,6 +4608,9 @@ add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER @@ -4624,6 +4632,9 @@ add r2, 32*SIZEOF_PIXEL lea r4, 
[r1 + 2*r1] call pixel_sa8d_8x8_internal2 +%if HIGH_BIT_DEPTH +HADDUW m0, m1 +%endif mova
[x265] [PATCH 4 of 4] rdlevel: skip Intra if inter/merge sa8d less than a threshold
# HG changeset patch # User Deepthi Devaki deepthidev...@multicorewareinc.com # Date 1386142598 -19800 # Node ID cf9d0fbba6e9fe30cede78e40cc418198c52f2b5 # Parent bd58942f9dd2f717f51081183cdf301d20dc1d56 rdlevel: skip Intra if inter/merge sa8d less than a threshold In higher rdlevels Intra is skipped if inter/merge cu cbf is 0. A threshold of sa8d expects that cu cbf will be 0. Thresholds have to be refined further. diff -r bd58942f9dd2 -r cf9d0fbba6e9 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Wed Dec 04 13:06:17 2013 +0530 +++ b/source/encoder/compress.cpp Wed Dec 04 13:06:38 2013 +0530 @@ -516,6 +516,11 @@ bdoIntra = (outBestCU->getCbf(0, TEXT_LUMA) || outBestCU->getCbf(0, TEXT_CHROMA_U) || outBestCU->getCbf(0, TEXT_CHROMA_V)); } +else +{ +uint32_t threshold[4] = { 2, 6000, 1600, 500 }; +bdoIntra = (outBestCU->m_totalDistortion < threshold[depth]); +} if (bdoIntra) { xComputeCostIntraInInter(m_intraInInterCU[depth], SIZE_2Nx2N); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 0 of 4 ] Implement rd level 2
___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 4] rdlevel: compare Merge-skip(merge2Nx2N with no residue) to best among inter/intra/merge in rdlevel 2
# HG changeset patch # User Deepthi Devaki deepthidev...@multicorewareinc.com # Date 1386142577 -19800 # Node ID bd58942f9dd2f717f51081183cdf301d20dc1d56 # Parent 71d40e349b4062df2ac12b54cd6eded9cdfa70a2 rdlevel: compare Merge-skip(merge2Nx2N with no residue) to best among inter/intra/merge in rdlevel 2 diff -r 71d40e349b40 -r bd58942f9dd2 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Wed Dec 04 13:05:54 2013 +0530 +++ b/source/encoder/compress.cpp Wed Dec 04 13:06:17 2013 +0530 @@ -286,6 +286,10 @@ } } +outTempCU->setMergeIndexSubParts(bestMergeCand, 0, 0, depth); +outTempCU->setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth); +outTempCU->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); +outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); if (m_cfg->param.rdLevel > 2) { //calculate the motion compensation for chroma for the best mode selected @@ -298,11 +302,6 @@ TComDataCU* tmp; TComYuv *yuv; -outTempCU->setMergeIndexSubParts(bestMergeCand, 0, 0, depth); -outTempCU->setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth); - outTempCU->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); - outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0); - //No-residue mode m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); xCheckDQP(outTempCU); @@ -437,7 +436,8 @@ /* Compute Merge Cost */ xComputeCostMerge2Nx2N(m_bestMergeCU[depth], m_mergeCU[depth], m_modePredYuv[3][depth], m_bestMergeRecoYuv[depth]); - +TComYuv* bestMergePred; +bestMergePred = m_modePredYuv[3][depth]; if (!(m_cfg->param.bEnableEarlySkip && m_bestMergeCU[depth]->isSkipped(0))) { /*Compute 2Nx2N mode costs*/ @@ -567,6 +567,30 @@ { 
xEncodeIntraInInter(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestRecoYuv[depth]); } +//Check Merge-skip +if (!(outBestCU->getPredictionMode(0) == MODE_INTER && outBestCU->getPartitionSize(0) == SIZE_2Nx2N && outBestCU->getMergeFlag(0))) +{ +int numPart = m_mergeCU[depth]->getNumPartInter(); +for (int partIdx = 0; partIdx < numPart; partIdx++) +{ +m_search->motionCompensation(m_mergeCU[depth], bestMergePred, REF_PIC_LIST_X, partIdx, false, true); +} +} + +m_search->encodeResAndCalcRdInterCU(m_mergeCU[depth], m_origYuv[depth], bestMergePred, m_tmpResiYuv[depth], +m_bestResiYuv[depth], m_tmpRecoYuv[depth], true); + +if (m_mergeCU[depth]->m_totalCost < outBestCU->m_totalCost) +{ +outBestCU = m_mergeCU[depth]; +tempYuv = m_bestRecoYuv[depth]; +m_bestRecoYuv[depth] = m_tmpRecoYuv[depth]; +m_tmpRecoYuv[depth] = tempYuv; +if (bestMergePred != m_bestPredYuv[depth]) +{ +bestMergePred->copyPartToPartYuv(m_bestPredYuv[depth], 0, outBestCU->getWidth(0), outBestCU->getHeight(0)); +} +} } /* Disable recursive analysis for whole CUs temporarily */ if ((outBestCU != 0) && (outBestCU->isSkipped(0))) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 4] Enable topskip and earlyexit for all rd levels <= 4 (output changes for presets faster than slow)
# HG changeset patch # User Deepthi Devaki deepthidev...@multicorewareinc.com # Date 1386142479 -19800 # Node ID 57a407b0a15d388b8c0cf26f752918fd3f74 # Parent 1d2d60f4eb81882fa0f3ba6c4e7aa9a220968f7a Enable topskip and earlyexit for all rd levels <= 4 (output changes for presets faster than slow) Also use the encodeResandCalcRDInter instead of the refactored estimate function. diff -r 1d2d60f4eb81 -r 57a407b0 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Tue Dec 03 23:56:22 2013 -0600 +++ b/source/encoder/compress.cpp Wed Dec 04 13:04:39 2013 +0530 @@ -26,8 +26,8 @@ /* Lambda Partition Select adjusts the threshold value for Early Exit in No-RDO flow */ #define LAMBDA_PARTITION_SELECT 0.9 -#define EARLY_EXIT 0 -#define TOPSKIP 0 +#define EARLY_EXIT 1 +#define TOPSKIP 1 using namespace x265; @@ -314,7 +314,7 @@ m_tmpRecoYuv[depth] = yuv; //Encode with residue -m_search->estimateRDInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); +m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false); xCheckDQP(outTempCU); if (outTempCU->m_totalCost < outBestCU->m_totalCost)//Choose best from no-residue mode and residue mode @@ -486,7 +486,7 @@ m_search->motionCompensation(outBestCU, m_bestPredYuv[depth], REF_PIC_LIST_X, partIdx, false, true); } -m_search->estimateRDInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], +m_search->encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], m_bestPredYuv[depth], m_tmpResiYuv[depth], m_bestResiYuv[depth], m_bestRecoYuv[depth], false); xCheckDQP(outBestCU); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel