[x265] [PATCH] arm: Implement filterPixelToShort ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456831820 -19800 # Tue Mar 01 17:00:20 2016 +0530 # Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106 # Parent 79c00b9bc2b81afef2e41526fc3c390528f3174c arm: Implement filterPixelToShort ARM NEON asm diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Tue Mar 01 12:18:18 2016 +0530 +++ b/source/common/CMakeLists.txt Tue Mar 01 17:00:20 2016 +0530 @@ -89,7 +89,7 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S) +set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Tue Mar 01 12:18:18 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Tue Mar 01 17:00:20 2016 +0530 @@ -33,6 +33,7 @@ #include "blockcopy8.h" #include "pixel.h" #include "pixel-util.h" +#include "ipfilter8.h" } namespace X265_NS { @@ -42,6 +43,33 @@ { if (cpuMask & X265_CPU_NEON) { +// filterPixelToShort +p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon); +p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon); +p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon); +p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon); +p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon); +p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon); +p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon); +p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon); +p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon); +p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon); +p.pu[LUMA_16x12].convert_p2s = 
PFX(filterPixelToShort_16x12_neon); +p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon); +p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon); +p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon); +p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon); +p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon); +p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon); +p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon); +p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon); +p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon); +p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon); +p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon); +p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon); +p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon); +p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon); + // Block_fill p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon); p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon); diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/ipfilter8.S Tue Mar 01 17:00:20 2016 +0530 @@ -0,0 +1,694 @@ +/***** + * Copyright (C) 2016 x265 project + * + * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + */ + +#include "asm.S" + +.section .rodata + +.align 4 + +.text + +// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* ds
[x265] [PATCH 0 of 3 ] Patch series for new primitive pelFilterChroma and ASM code
Speed up = pelFilterChroma_Vertical : 600c -> 300c pelFilterChroma_Horizontal : 585c -> 160c ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 3] asm: separated pelFilterChroma function into horizontal & vertical primitives for asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456466613 -19800 # Fri Feb 26 11:33:33 2016 +0530 # Node ID 5ff8ee940ad7f4d34b106ae4999b996245c87919 # Parent 01782e7f0a8cb93efbe4ff1534602ff9055c8565 asm: separated pelFilterChroma function into horizontal & vertical primitives for asm diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/deblock.cpp --- a/source/common/deblock.cpp Thu Feb 25 12:17:57 2016 +0530 +++ b/source/common/deblock.cpp Fri Feb 26 11:33:33 2016 +0530 @@ -319,27 +319,6 @@ } } -/* Deblocking of one line/column for the chrominance component - * \param src pointer to picture data - * \param offset offset value for picture data - * \param tc tc value - * \param maskP indicator to disable filtering on partP - * \param maskQ indicator to disable filtering on partQ */ -static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ) -{ -for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) -{ -int16_t m4 = (int16_t)src[0]; -int16_t m3 = (int16_t)src[-offset]; -int16_t m5 = (int16_t)src[offset]; -int16_t m2 = (int16_t)src[-offset * 2]; - -int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3)); -src[-offset] = x265_clip(m3 + (delta & maskP)); -src[0] = x265_clip(m4 - (delta & maskQ)); -} -} - void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) { PicYuv* reconPic = cuQ->m_encData->m_reconPic; @@ -517,7 +496,7 @@ int32_t tc = s_tcTable[indexTC] << bitdepthShift; pixel* srcC = srcChroma[chromaIdx]; -pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ); +primitives.pelFilterChroma[dir](srcC + unitOffset, srcStep, offset, tc, maskP, maskQ); } } } diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/loopfilter.cpp --- a/source/common/loopfilter.cpp Thu Feb 25 12:17:57 2016 +0530 +++ b/source/common/loopfilter.cpp Fri Feb 26 11:33:33 2016 +0530 
@@ -158,6 +158,27 @@ src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); } } + +/* Deblocking of one line/column for the chrominance component +* \param src pointer to picture data +* \param offset offset value for picture data +* \param tc tc value +* \param maskP indicator to disable filtering on partP +* \param maskQ indicator to disable filtering on partQ */ +static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ) +{ +for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) +{ +int16_t m4 = (int16_t)src[0]; +int16_t m3 = (int16_t)src[-offset]; +int16_t m5 = (int16_t)src[offset]; +int16_t m2 = (int16_t)src[-offset * 2]; + +int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3)); +src[-offset] = x265_clip(m3 + (delta & maskP)); +src[0] = x265_clip(m4 - (delta & maskQ)); +} +} } namespace X265_NS { @@ -176,5 +197,7 @@ // C code is same for EDGE_VER and EDGE_HOR only asm code is different p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c; p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c; +p.pelFilterChroma[0] = pelFilterChroma_c; +p.pelFilterChroma[1] = pelFilterChroma_c; } } diff -r 01782e7f0a8c -r 5ff8ee940ad7 source/common/primitives.h --- a/source/common/primitives.h Thu Feb 25 12:17:57 2016 +0530 +++ b/source/common/primitives.h Fri Feb 26 11:33:33 2016 +0530 @@ -197,6 +197,7 @@ typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset); typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); +typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); /* Function pointers to optimized encoder primitives. 
Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ @@ -332,6 +333,7 @@ costC1C2Flag_tcostC1C2Flag; pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1 +pelFilterChroma_t pelFilterChroma[2]; // EDGE_VER = 0, EDGE_HOR = 1 /* There is one set of chroma primitives per color space. An encoder will * have just a single color space and thus it will only ever use one entry ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 3] asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456466696 -19800 # Fri Feb 26 11:34:56 2016 +0530 # Node ID d7d0c03b5e6e7fd0258d609ad5e9f4d7c0a40390 # Parent 59d9eca3d144e71f11d509a5dd40b634bb9ab500 asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12 diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Feb 26 11:34:39 2016 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Feb 26 11:34:56 2016 +0530 @@ -1101,6 +1101,11 @@ } if (cpuMask & X265_CPU_SSE4) { +p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4); +p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4); +p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4); +p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4); + p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4); p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4); p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4); diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Fri Feb 26 11:34:39 2016 +0530 +++ b/source/common/x86/const-a.asm Fri Feb 26 11:34:56 2016 +0530 @@ -69,6 +69,7 @@ const pb_000F, db 0xff times 15 db 0x00 const pb_shuf_off4, times 2 db 0, 4, 1, 5, 2, 6, 3, 7 +const pw_shuf_off4, times 1 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ;; 16-bit constants diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Fri Feb 26 11:34:39 2016 +0530 +++ b/source/common/x86/loopfilter.asm Fri Feb 26 11:34:56 2016 +0530 @@ -51,6 +51,8 @@ cextern hmul_16p cextern pw_1_ cextern pb_shuf_off4 +cextern pw_shuf_off4 + ; ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride) ; @@ -3758,6 +3760,9 @@ INIT_XMM sse4 cglobal pelFilterLumaStrong_H, 5,7,10 +%if HIGH_BIT_DEPTH +add r2d, r2d +%endif mov r1, r2 neg r3d neg r4d @@ -3766,6 +3771,16 @@ lea r5, [r2 * 3] lea r6, [r1 * 3] +%if HIGH_BIT_DEPTH 
+movum4, [r0]; src[0] +movum3, [r0 + r1] ; src[-offset] +movum2, [r0 + r1 * 2] ; src[-offset * 2] +movum1, [r0 + r6] ; src[-offset * 3] +movum0, [r0 + r1 * 4] ; src[-offset * 4] +movum5, [r0 + r2] ; src[offset] +movum6, [r0 + r2 * 2] ; src[offset * 2] +movum7, [r0 + r5] ; src[offset * 3] +%else pmovzxbwm4, [r0]; src[0] pmovzxbwm3, [r0 + r1] ; src[-offset] pmovzxbwm2, [r0 + r1 * 2] ; src[-offset * 2] @@ -3774,6 +3789,7 @@ pmovzxbwm5, [r0 + r2] ; src[offset] pmovzxbwm6, [r0 + r2 * 2] ; src[offset * 2] pmovzxbwm7, [r0 + r5] ; src[offset * 3] +%endif paddw m0, m0 ; m0*2 movam8, m2 @@ -3841,6 +3857,15 @@ paddw m0, m1 paddw m3, m4 paddw m9, m5 + +%if HIGH_BIT_DEPTH +movh[r0 + r6], m0 +movhps [r0 + r1], m0 +movh[r0], m3 +movhps [r0 + r2 * 2], m3, +movh[r0 + r2 * 1], m9 +movhps [r0 + r1 * 2], m9 +%else packuswbm0, m0 packuswbm3, m9 @@ -3850,14 +3875,41 @@ pextrd [r0 + r2 * 2], m3, 1 pextrd [r0 + r2 * 1], m3, 2 pextrd [r0 + r1 * 2], m3, 3 +%endif RET INIT_XMM sse4 cglobal pelFilterLumaStrong_V, 5,5,10 +%if HIGH_BIT_DEPTH +add r1d, r1d +%endif neg r3d neg r4d lea r2, [r1 * 3] +%if HIGH_BIT_DEPTH +movum0, [r0 - 8]; src[-offset * 4] row 0 +movum1, [r0 + r1 * 1 - 8] ; src[-offset * 4] row 1 +movum2, [r0 + r1 * 2 - 8] ; src[-offset * 4] row 2 +movum3, [r0 + r2 * 1 - 8] ; src[-offset * 4] row 3 + +punpckhwd m4, m0, m1 ; [m4 m4 m5 m5 m6 m6 m7 m7] +punpcklwd m0, m1 ; [m0 m0 m1 m1 m2 m2 m3 m3] + +punpckhwd m5, m2, m3 ; [m4 m4 m5 m5 m6 m6 m7 m7] +punpcklwd m2, m3 ; [m0 m0 m1 m1 m2 m2 m3 m3] + +punpckhdq m3, m0, m2 ; [m2 m2 m2 m2 m3 m3 m3
[x265] [PATCH] arm: Implement pixel_ssd_s ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456136894 -19800 # Mon Feb 22 15:58:14 2016 +0530 # Node ID ed3dd1a26cb5801e306db8f1d4a52cd1f4d6620b # Parent 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa arm: Implement pixel_ssd_s ARM NEON asm diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Mon Feb 22 15:58:14 2016 +0530 @@ -42,6 +42,12 @@ { if (cpuMask & X265_CPU_NEON) { +// ssd_s +p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4x4_neon); +p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8x8_neon); +p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon); +p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon); + // sse_ss p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon); p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon); diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/pixel.h --- a/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530 +++ b/source/common/arm/pixel.h Mon Feb 22 15:58:14 2016 +0530 @@ -123,6 +123,12 @@ sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride); +sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride); +sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride); +sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride); +sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride); + void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); void 
x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/ssd-a.S --- a/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530 +++ b/source/common/arm/ssd-a.S Mon Feb 22 15:58:14 2016 +0530 @@ -371,4 +371,99 @@ bx lr endfunc +function x265_pixel_ssd_s_4x4_neon +add r1, r1 +vld1.s16{d4}, [r0], r1 +vld1.s16{d5}, [r0], r1 +vld1.s16{d6}, [r0], r1 +vld1.s16{d7}, [r0] +vmull.s16 q0, d4, d4 +vmull.s16 q1, d5, d5 +vmlal.s16 q0, d6, d6 +vmlal.s16 q1, d7, d7 +vadd.s32q0, q1 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc +function x265_pixel_ssd_s_8x8_neon +add r1, r1 +vld1.s16{q8}, [r0], r1 +vld1.s16{q9}, [r0], r1 +vmull.s16 q0, d16, d16 +vmull.s16 q1, d17, d17 +vmlal.s16 q0, d18, d18 +vmlal.s16 q1, d19, d19 +.rept 3 +vld1.s16{q8}, [r0], r1 +vld1.s16{q9}, [r0], r1 +vmlal.s16 q0, d16, d16 +vmlal.s16 q1, d17, d17 +vmlal.s16 q0, d18, d18 +vmlal.s16 q1, d19, d19 +.endr +vadd.s32q0, q1 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_ssd_s_16x16_neon +add r1, r1 +mov r12, #4 +veor.u8 q0, q0 +veor.u8 q1, q1 + +.loop_ssd_s_16: +subsr12, #1 +.rept 2 +vld1.s16{q8-q9}, [r0], r1 +vld1.s16{q10-q11}, [r0], r1 +vmlal.s16 q0, d16, d16 +vmlal.s16 q1, d17, d17 +vmlal.s16 q0, d18, d18 +vmlal.s16 q1, d19, d19 +vmlal.s16 q0, d20, d20 +vmlal.s16 q1, d21, d21 +vmlal.s16 q0, d22, d22 +vmlal.s16 q1, d23, d23 +.endr +bne .loop_ssd_s_16 +vadd.s32q0, q1 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_ssd_s_32x32_neon +add r1, r1 +sub r1, #32 +mov r12, #8 +veor.u8 q0, q0 +veor.u8 q1, q1 + +.loop_ssd_s_32: +subsr12, #1 +.rept 4 +vld1.s16{q8-q9}, [r0]! 
+vld1.s16{q10-q11}, [r0], r1 +vmlal.s16 q0, d16, d16 +vmlal.s16 q1, d17, d17 +vmlal.s16 q0, d18, d18 +vmlal.s16 q1, d19, d19 +vmlal.s16 q0, d20, d20 +vmlal.s16 q1, d21, d21 +vmlal.s16 q0, d22, d22 +vmlal.s16 q1, d23, d23 +.endr +bne .loop_ssd_s_32 +vadd.s32q0, q1 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/lis
[x265] [PATCH] arm: Implement pixel_sse_ss ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456382751 -19800 # Thu Feb 25 12:15:51 2016 +0530 # Node ID 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa # Parent 45c0dbd43dec24608199362a86bfba6ef91cacca arm: Implement pixel_sse_ss ARM NEON asm diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530 @@ -42,6 +42,13 @@ { if (cpuMask & X265_CPU_NEON) { +// sse_ss +p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon); +p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon); +p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon); +p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon); +p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon); + // pixel_sub_ps p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon); p.cu[BLOCK_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon); diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/pixel.h --- a/source/common/arm/pixel.h Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530 @@ -117,6 +117,12 @@ sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* 
pix2, intptr_t stride_pix2); + void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/ssd-a.S --- a/source/common/arm/ssd-a.S Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530 @@ -194,3 +194,181 @@ vmov.32 r0, d0[0] bx lr endfunc + +function x265_pixel_sse_ss_4x4_neon +add r1, r1 +add r3, r3 + +vld1.s16{d16}, [r0], r1 +vld1.s16{d18}, [r2], r3 +vsub.s16q2, q8, q9 +vld1.s16{d16}, [r0], r1 +vmull.s16 q0, d4, d4 +vld1.s16{d18}, [r2], r3 + +vsub.s16q2, q8, q9 +vld1.s16{d16}, [r0], r1 +vmlal.s16 q0, d4, d4 +vld1.s16{d18}, [r2], r3 + +vsub.s16q2, q8, q9 +vld1.s16{d16}, [r0], r1 +vmlal.s16 q0, d4, d4 +vld1.s16{d18}, [r2], r3 + +vsub.s16q2, q8, q9 +vmlal.s16 q0, d4, d4 + +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_sse_ss_8x8_neon +add r1, r1 +add r3, r3 + +vld1.s16{q8}, [r0], r1 +vld1.s16{q9}, [r2], r3 +vsub.s16q8, q9 +vmull.s16 q0, d16, d16 +vmull.s16 q1, d17, d17 + +.rept 7 +vld1.s16{q8}, [r0], r1 +vld1.s16{q9}, [r2], r3 +vsub.s16q8, q9 +vmlal.s16 q0, d16, d16 +vmlal.s16 q1, d17, d17 +.endr +vadd.s32q0, q1 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_sse_ss_16x16_neon +add r1, r1 +add r3, r3 + +mov r12, #4 +veor.u8 q0, q0 +veor.u8 q1, q1 + +.loop_sse_ss_16: +subsr12, #1 +.rept 4 +vld1.s16{q8-q9}, [r0], r1 +vld1.s16{q10-q11}, [r2], r3 +vsub.s16q8, q10 +vsub.s16q9, q11 +vmlal.s16 q0, d16, d16 +vmlal.s16 q1, d17, d17 +vmlal.s16 q0, d18, d18 +vmlal.s16 q1, d19, d19 +.endr +bne .loop_sse_ss_16 +vadd.s32q0, q1 +vadd.s32d0, d0, 
d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_sse_ss_32x32_neon +add r1, r1 +add r3, r3 +sub r1, #32 +sub r3, #32 +mov r12, #8 +veor.u8 q0, q0 +veor.u8 q1, q1 + +.loop_sse_ss_32: +subsr12, #1 +.rept 4 +vld1.s16{q
[x265] [PATCH] arm: Implement pixel_sse_pp ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1455794242 -19800 # Thu Feb 18 16:47:22 2016 +0530 # Node ID 5e4593ef30cc4bccc5eec2a0109b8dff397e5c93 # Parent b31fa1a4ef43697e163d17dda0f4650de45d6ff9 arm: Implement pixel_sse_pp ARM NEON asm diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Thu Feb 18 16:37:01 2016 +0530 +++ b/source/common/CMakeLists.txt Thu Feb 18 16:47:22 2016 +0530 @@ -89,7 +89,7 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S) +set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Thu Feb 18 16:37:01 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Feb 18 16:47:22 2016 +0530 @@ -42,6 +42,13 @@ { if (cpuMask & X265_CPU_NEON) { +// sse_pp +p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon); +p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon); +p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon); +p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon); +p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon); + // pixel_var p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon); diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/pixel.h --- a/source/common/arm/pixel.h Thu Feb 18 16:37:01 2016 +0530 +++ b/source/common/arm/pixel.h Thu Feb 18 16:47:22 2016 +0530 @@ -111,4 +111,10 @@ void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res); void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* 
fref3, intptr_t frefstride, int32_t* res); +sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); + #endif // ifndef X265_I386_PIXEL_ARM_H diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/ssd-a.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/ssd-a.S Thu Feb 18 16:47:22 2016 +0530 @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2016 x265 project + * + * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ */ + +#include "asm.S" + +.section .rodata + +.align 4 + + +.text + + +function x265_pixel_sse_pp_4x4_neon +vld1.32 {d16[]}, [r0], r1 +vld1.32 {d17[]}, [r2], r3 +vsubl.u8q2, d16, d17 +vld1.32 {d16[]}, [r0], r1 +vmull.s16 q0, d4, d4 +vld1.32 {d17[]}, [r2], r3 + +vsubl.u8q2, d16, d17 +vld1.32 {d16[]}, [r0], r1 +vmlal.s16 q0, d4, d4 +vld1.32 {d17[]}, [r2], r3 + +vsubl.u8q2, d16, d17 +vld1.32 {d16[]}, [r0], r1 +vmlal.s16 q0, d4, d4 +vld1.32 {d17[]}, [r2], r3 + +vsubl.u8q2, d16, d17 +vmlal.s16 q0, d4, d4 +vadd.s32d0, d0, d1 +vpadd.s32 d0, d0, d0 +vmov.32 r0, d0[0] +bx lr +endfunc + +function x265_pixel_sse_pp_8x8_neon +vld1.64
[x265] [PATCH] arm: Implement pixel_var ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1455793621 -19800 # Thu Feb 18 16:37:01 2016 +0530 # Node ID b31fa1a4ef43697e163d17dda0f4650de45d6ff9 # Parent cb8769b5ea70304d658173e02deb254fb8572bd6 arm: Implement pixel_var ARM NEON asm diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Thu Feb 18 10:23:24 2016 +0530 +++ b/source/common/CMakeLists.txt Thu Feb 18 16:37:01 2016 +0530 @@ -89,7 +89,7 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S) +set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Thu Feb 18 10:23:24 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Feb 18 16:37:01 2016 +0530 @@ -32,6 +32,7 @@ extern "C" { #include "blockcopy8.h" #include "pixel.h" +#include "pixel-util.h" } namespace X265_NS { @@ -41,6 +42,12 @@ { if (cpuMask & X265_CPU_NEON) { +// pixel_var +p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon); +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon); +p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon); +p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon); + // blockcopy p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon); p.pu[LUMA_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon); diff -r cb8769b5ea70 -r b31fa1a4ef43 source/common/arm/pixel-util.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/pixel-util.SThu Feb 18 16:37:01 2016 +0530 @@ -0,0 +1,243 @@ +/***** + * Copyright (C) 2016 x265 project + * + * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + */ + +#include "asm.S" + +.section .rodata + +.align 4 + + +.text + +.macro VAR_SQR_SUM qsqr_sum, qsqr_last, qsqr_temp, dsrc, num=0, vpadal=vpadal.u16 +vmull.u8\qsqr_temp, \dsrc, \dsrc +vaddw.u8q\num, q\num, \dsrc +\vpadal \qsqr_sum, \qsqr_last +.endm + +function x265_pixel_var_8x8_neon +vld1.u8 {d16}, [r0], r1 +vmull.u8q1, d16, d16 +vmovl.u8q0, d16 +vld1.u8 {d18}, [r0], r1 +vmull.u8q2, d18, d18 +vaddw.u8q0, q0, d18 + +vld1.u8 {d20}, [r0], r1 +VAR_SQR_SUM q1, q1, q3, d20, 0, vpaddl.u16 +vld1.u8 {d22}, [r0], r1 +VAR_SQR_SUM q2, q2, q8, d22, 0, vpaddl.u16 + +vld1.u8 {d24}, [r0], r1 +VAR_SQR_SUM q1, q3, q9, d24 +vld1.u8 {d26}, [r0], r1 +VAR_SQR_SUM q2, q8, q10, d26 +vld1.u8 {d24}, [r0], r1 +VAR_SQR_SUM q1, q9, q14, d24 +vld1.u8 {d26}, [r0], r1 +VAR_SQR_SUM q2, q10, q15, d26 + +vpaddl.u16 q8, q14 +vpaddl.u16 q9, q15 +vadd.u32q1, q1, q8 +vadd.u16d0, d0, d1 +vadd.u32q1, q1, q9 +vadd.u32q1, q1, q2 +vpaddl.u16 d0, d0 +vadd.u32d2, d2, d3 +vpadd.u32 d0, d0, d2 + +vmovr0, r1, d0 +bx lr +endfunc + +function x265_pixel_var_16x16_neon +veor.u8 q0, q0 +veor.u8 q1, q1 +veor.u8 q2, q2 +veor.u8 q14, q14 +veor.u8 q15, q15 +mov ip, #4 + +.var16_loop: +subsip, ip, #1 +vld1.u8 {q8}, [r0], r1 +VAR_SQR_SUM q1, q14, q12, d16 +VAR_SQR_SUM q2, q15, q13, d17 + +vld1.u8 {q9}, [r0], r1 
+VAR_SQR_SUM q1, q12, q14, d18 +VAR_SQR_SUM q2, q13, q15, d19 + +vld1.u8 {q8}, [r0], r1 +V
[x265] [PATCH] arm: Implement sad_x3 and sad_x4 ARM NEON asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1455598958 -19800 # Tue Feb 16 10:32:38 2016 +0530 # Node ID ac6c535109a43e9cdb69f30db1143c06400a19f4 # Parent e3902c96c3c268ec4ab1a4976ee2feae7348b36f arm: Implement sad_x3 and sad_x4 ARM NEON asm diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Thu Feb 11 15:00:20 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Tue Feb 16 10:32:38 2016 +0530 @@ -41,6 +41,7 @@ { if (cpuMask & X265_CPU_NEON) { +// blockcopy p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon); p.pu[LUMA_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon); p.pu[LUMA_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon); @@ -66,11 +67,65 @@ p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_neon); p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon); p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon); + +// sad_x3 +p.pu[LUMA_4x4].sad_x3 = PFX(sad_x3_4x4_neon); +p.pu[LUMA_4x8].sad_x3 = PFX(sad_x3_4x8_neon); +p.pu[LUMA_4x16].sad_x3 = PFX(sad_x3_4x16_neon); +p.pu[LUMA_8x4].sad_x3 = PFX(sad_x3_8x4_neon); +p.pu[LUMA_8x8].sad_x3 = PFX(sad_x3_8x8_neon); +p.pu[LUMA_8x16].sad_x3 = PFX(sad_x3_8x16_neon); +p.pu[LUMA_8x32].sad_x3 = PFX(sad_x3_8x32_neon); +p.pu[LUMA_12x16].sad_x3 = PFX(sad_x3_12x16_neon); +p.pu[LUMA_16x4].sad_x3 = PFX(sad_x3_16x4_neon); +p.pu[LUMA_16x8].sad_x3 = PFX(sad_x3_16x8_neon); +p.pu[LUMA_16x12].sad_x3 = PFX(sad_x3_16x12_neon); +p.pu[LUMA_16x16].sad_x3 = PFX(sad_x3_16x16_neon); +p.pu[LUMA_16x32].sad_x3 = PFX(sad_x3_16x32_neon); +p.pu[LUMA_16x64].sad_x3 = PFX(sad_x3_16x64_neon); +p.pu[LUMA_24x32].sad_x3 = PFX(sad_x3_24x32_neon); +p.pu[LUMA_32x8].sad_x3 = PFX(sad_x3_32x8_neon); +p.pu[LUMA_32x16].sad_x3 = PFX(sad_x3_32x16_neon); +p.pu[LUMA_32x24].sad_x3 = PFX(sad_x3_32x24_neon); +p.pu[LUMA_32x32].sad_x3 = PFX(sad_x3_32x32_neon); +p.pu[LUMA_32x64].sad_x3 = PFX(sad_x3_32x64_neon); +p.pu[LUMA_48x64].sad_x3 = PFX(sad_x3_48x64_neon); 
+p.pu[LUMA_64x16].sad_x3 = PFX(sad_x3_64x16_neon); +p.pu[LUMA_64x32].sad_x3 = PFX(sad_x3_64x32_neon); +p.pu[LUMA_64x48].sad_x3 = PFX(sad_x3_64x48_neon); +p.pu[LUMA_64x64].sad_x3 = PFX(sad_x3_64x64_neon); + +// sad_x4 +p.pu[LUMA_4x4].sad_x4 = PFX(sad_x4_4x4_neon); +p.pu[LUMA_4x8].sad_x4 = PFX(sad_x4_4x8_neon); +p.pu[LUMA_4x16].sad_x4 = PFX(sad_x4_4x16_neon); +p.pu[LUMA_8x4].sad_x4 = PFX(sad_x4_8x4_neon); +p.pu[LUMA_8x8].sad_x4 = PFX(sad_x4_8x8_neon); +p.pu[LUMA_8x16].sad_x4 = PFX(sad_x4_8x16_neon); +p.pu[LUMA_8x32].sad_x4 = PFX(sad_x4_8x32_neon); +p.pu[LUMA_12x16].sad_x4 = PFX(sad_x4_12x16_neon); +p.pu[LUMA_16x4].sad_x4 = PFX(sad_x4_16x4_neon); +p.pu[LUMA_16x8].sad_x4 = PFX(sad_x4_16x8_neon); +p.pu[LUMA_16x12].sad_x4 = PFX(sad_x4_16x12_neon); +p.pu[LUMA_16x16].sad_x4 = PFX(sad_x4_16x16_neon); +p.pu[LUMA_16x32].sad_x4 = PFX(sad_x4_16x32_neon); +p.pu[LUMA_16x64].sad_x4 = PFX(sad_x4_16x64_neon); +p.pu[LUMA_24x32].sad_x4 = PFX(sad_x4_24x32_neon); +p.pu[LUMA_32x8].sad_x4 = PFX(sad_x4_32x8_neon); +p.pu[LUMA_32x16].sad_x4 = PFX(sad_x4_32x16_neon); +p.pu[LUMA_32x24].sad_x4 = PFX(sad_x4_32x24_neon); +p.pu[LUMA_32x32].sad_x4 = PFX(sad_x4_32x32_neon); +p.pu[LUMA_32x64].sad_x4 = PFX(sad_x4_32x64_neon); +p.pu[LUMA_48x64].sad_x4 = PFX(sad_x4_48x64_neon); +p.pu[LUMA_64x16].sad_x4 = PFX(sad_x4_64x16_neon); +p.pu[LUMA_64x32].sad_x4 = PFX(sad_x4_64x32_neon); +p.pu[LUMA_64x48].sad_x4 = PFX(sad_x4_64x48_neon); +p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon); } if (cpuMask & X265_CPU_ARMV6) { -p.pu[LUMA_4x4].sad=PFX(pixel_sad_4x4_armv6); - p.pu[LUMA_4x8].sad=PFX(pixel_sad_4x8_armv6); +p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_armv6); +p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_armv6); } } } // namespace X265_NS diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/asm.S --- a/source/common/arm/asm.S Thu Feb 11 15:00:20 2016 +0530 +++ b/source/common/arm/asm.S Tue Feb 16 10:32:38 2016 +0530 @@ -108,7 +108,7 @@ #define JOIN(a, b) GLUE(a, b) #define X(s) JOIN(EXTERN_ASM, s) -#define 
FENC_STRIDE 16 +#define FENC_STRIDE 64 #define FDEC_STRIDE 32 .macro HORIZ_ADD dest, a, b diff -r e3902c96c3c2 -r ac6c535109a4 source/common/arm/mc-a.S --- a/source/common/arm/mc-a.S Thu Feb 11 15:00:20 2016 +0530 +++ b/source/common/arm/mc-a.S Tue Feb 16 10:32:38 2016 +0530 @@ -34,7 +34,7 @@ * r0 - dst * r1 - dstStride * r2 - src - * d3 - srcStride */ + * r3 - srcStrid
Re: [x265] [PATCH] arm: Implement blockcopy_pp_NxN_neon
On Thu, Feb 11, 2016 at 5:30 PM, chen wrote: > > At 2016-02-11 17:54:45,radhakrish...@multicorewareinc.com wrote: > ># HG changeset patch > ># User radhakrish...@multicorewareinc.com > ># Date 1455183020 -19800 > ># Thu Feb 11 15:00:20 2016 +0530 > ># Node ID 4f5720ccaf1aa04868054636f14dce8ea65390ad > ># Parent a2ff6747eaf7b25102f27f808cf5526f441df488 > >arm: Implement blockcopy_pp_NxN_neon > > > >+function x265_blockcopy_pp_48x64_neon > >+push {r4, r5} > >+mov r4, #8 > >+mov r5, #32 > >+sub r3, r5 > >+sub r1, r5 > >+loop_48x64: > >+.rept 8 > >+vld1.8 {q0, q1}, [r2]! > >+vld1.8 {q2}, [r2], r3 > the ARM support format "vld1.8 {q0, q1, q1}, Rn, Rm" > > > Load support maximum upto 4 double word vld1.8 {d0,d1,d2,d3}, [Rn], Rm > OR 2 quadwords vld1.8 {q0,q1}, [Rn], Rm > > > >+vst1.8 {q0, q1}, [r0]! > >+vst1.8 {q2}, [r0], r1 > >+.endr > >+subs r4, r4, #1 > >+bne loop_48x64 > >+pop {r4, r5} > >+bx lr > >+endfunc > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] threadpool: utilize all processors on embedded ARM platforms
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1455010589 -19800 # Tue Feb 09 15:06:29 2016 +0530 # Node ID 18b83aaee1b56e2048a425c25a452aa62c39da89 # Parent 023e6051c4c63ab1633b2de0e8f37e6158796288 threadpool: utilize all processors on embedded ARM platforms diff -r 023e6051c4c6 -r 18b83aaee1b5 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri Feb 05 15:13:57 2016 +0530 +++ b/source/common/threadpool.cpp Tue Feb 09 15:06:29 2016 +0530 @@ -528,6 +528,10 @@ SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; +#elif __unix__ && X265_ARCH_ARM +/* Return the number of processors configured by OS. Because, most embedded linux distributions + * uses only one processor as the scheduler doesn't have enough work to utilize all processors */ +return sysconf(_SC_NPROCESSORS_CONF); #elif __unix__ return sysconf(_SC_NPROCESSORS_ONLN); #elif MACOS ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1454410744 -19800 # Tue Feb 02 16:29:04 2016 +0530 # Node ID 5463e2b9f37e4952bb16e94673c6fd2991243145 # Parent dc62b47dd0d98f732165345883edac55320baec1 arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix. diff -r dc62b47dd0d9 -r 5463e2b9f37e source/CMakeLists.txt --- a/source/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530 +++ b/source/CMakeLists.txt Tue Feb 02 16:29:04 2016 +0530 @@ -182,9 +182,11 @@ add_definitions(-march=i686) endif() if(ARM AND CROSS_COMPILE_ARM) -add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp) +set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp) +add_definitions(${ARM_ARGS}) elseif(ARM) -add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) +set(ARM_ARGS -march=armv6 -mfloat-abi=hard -mfpu=vfp) +add_definitions(${ARM_ARGS}) endif() if(FPROFILE_GENERATE) if(INTEL_CXX) @@ -418,7 +420,7 @@ add_subdirectory(encoder) add_subdirectory(common) -if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY) +if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) # this is required because of this cmake bug # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170 if(WIN32) @@ -429,23 +431,33 @@ if(ARM OR CROSS_COMPILE_ARM) # compile ARM arch asm files here - +enable_language(ASM) +foreach(ASM ${ARM_ASMS}) +set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM}) +list(APPEND ASM_SRCS ${ASM_SRC}) +list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) +add_custom_command( +OUTPUT ${ASM}.${SUFFIX} +COMMAND ${CMAKE_CXX_COMPILER} +ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} +DEPENDS ${ASM_SRC}) +endforeach() elseif(X86) # compile X86 arch asm files here foreach(ASM ${MSVC_ASMS}) -set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) -list(APPEND YASM_SRCS ${YASM_SRC}) -list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) +set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) +list(APPEND ASM_SRCS ${ASM_SRC}) +list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) 
add_custom_command( OUTPUT ${ASM}.${SUFFIX} -COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} -DEPENDS ${YASM_SRC}) +COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX} +DEPENDS ${ASM_SRC}) endforeach() endif() endif() -source_group(ASM FILES ${YASM_SRCS}) -add_library(x265-static STATIC $ $ ${YASM_OBJS} ${YASM_SRCS}) +source_group(ASM FILES ${ASM_SRCS}) +add_library(x265-static STATIC $ $ ${ASM_OBJS} ${ASM_SRCS}) if(NOT MSVC) set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265) endif() @@ -479,7 +491,7 @@ option(ENABLE_SHARED "Build shared library" ON) if(ENABLE_SHARED) -add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS} +add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS} ${X265_RC_FILE} $ $) if(EXTRA_LIB) target_link_libraries(x265-shared ${EXTRA_LIB}) @@ -575,7 +587,7 @@ # Xcode seems unable to link the CLI with libs, so link as one targget add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp - $ $ ${YASM_OBJS} ${YASM_SRCS}) + $ $ ${ASM_OBJS} ${ASM_SRCS}) else() add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE} ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp) diff -r dc62b47dd0d9 -r 5463e2b9f37e source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/CMakeLists.txt Tue Feb 02 16:29:04 2016 +0530 @@ -89,9 +89,10 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS) +set(A_SRCS asm.S cpu-a.S mc-a.S) set(VEC_PRIMITIVES) +set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") foreach(SRC ${C_SRCS}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) endforeach() diff -r dc62b47dd0d9 -r 5463e2b9f37e source/common/arm/asm-primitives.cpp --- 
a/source/common/arm/asm-primitives.cpp Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Tue Feb 02 16:29:04 2016 +0530 @@ -29,12 +29,18 @@ #include "x265.h" #include "cpu.h" +extern "C" {
[x265] [PATCH] arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1454327470 -19800 # Mon Feb 01 17:21:10 2016 +0530 # Node ID 894e0fce5d14844d3c85cdb2a287f302fc8cffca # Parent dc62b47dd0d98f732165345883edac55320baec1 arm: Implement blockcopy_pp_16x16_neon. Modified include guards with ARM suffix. diff -r dc62b47dd0d9 -r 894e0fce5d14 source/CMakeLists.txt --- a/source/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530 +++ b/source/CMakeLists.txt Mon Feb 01 17:21:10 2016 +0530 @@ -182,9 +182,11 @@ add_definitions(-march=i686) endif() if(ARM AND CROSS_COMPILE_ARM) -add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp) +set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp) +add_definitions(${ARM_ARGS}) elseif(ARM) -add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) +set(ARM_ARGS -march=armv6 -mfloat-abi=hard -mfpu=vfp) +add_definitions(${ARM_ARGS}) endif() if(FPROFILE_GENERATE) if(INTEL_CXX) @@ -418,7 +420,7 @@ add_subdirectory(encoder) add_subdirectory(common) -if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY) +if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) # this is required because of this cmake bug # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170 if(WIN32) @@ -429,7 +431,17 @@ if(ARM OR CROSS_COMPILE_ARM) # compile ARM arch asm files here - +enable_language(ASM) +foreach(ASM ${ARM_ASMS}) +set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM}) +list(APPEND YASM_SRCS ${YASM_SRC}) +list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) +add_custom_command( +OUTPUT ${ASM}.${SUFFIX} +COMMAND ${CMAKE_CXX_COMPILER} +ARGS ${ARM_ARGS} -c ${YASM_SRC} -o ${ASM}.${SUFFIX} +DEPENDS ${YASM_SRC}) +endforeach() elseif(X86) # compile X86 arch asm files here foreach(ASM ${MSVC_ASMS}) diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/CMakeLists.txt Mon Feb 01 17:21:10 2016 +0530 @@ -89,9 +89,10 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h 
dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS) +set(ARM_SRCS asm.S cpu-a.S mc-a.S) set(VEC_PRIMITIVES) +set(ARM_ASMS "${ARM_SRCS}" CACHE INTERNAL "ARM Assembly Sources") foreach(SRC ${C_SRCS}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) endforeach() diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Mon Feb 01 17:21:10 2016 +0530 @@ -29,12 +29,18 @@ #include "x265.h" #include "cpu.h" +extern "C" { +#include "blockcopy8.h" +} namespace X265_NS { // private x265 namespace void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) { - +if (cpuMask & X265_CPU_NEON) +{ +p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon); +} } } // namespace X265_NS diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/asm.S --- a/source/common/arm/asm.S Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/arm/asm.S Mon Feb 01 17:21:10 2016 +0530 @@ -25,8 +25,6 @@ * For more information, contact us at license @ x265.com. */ -#include "x265_config.h" - .syntax unified #if HAVE_NEON diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/blockcopy8.h --- a/source/common/arm/blockcopy8.h Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/arm/blockcopy8.h Mon Feb 01 17:21:10 2016 +0530 @@ -23,7 +23,9 @@ * For more information, contact us at license @ x265.com. */ -#ifndef X265_BLOCKCOPY8_H -#define X265_BLOCKCOPY8_H +#ifndef X265_BLOCKCOPY8_ARM_H +#define X265_BLOCKCOPY8_ARM_H -#endif // ifndef X265_I386_PIXEL_H +void x265_blockcopy_pp_16x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); + +#endif // ifndef X265_I386_PIXEL_ARM_H diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/dct8.h --- a/source/common/arm/dct8.h Mon Jan 25 14:59:50 2016 +0530 +++ b/source/common/arm/dct8.h Mon Feb 01 17:21:10 2016 +0530 @@ -22,7 +22,7 @@ * For more information, contact us at license @ x265.com. 
*/ -#ifndef X265_DCT8_H -#define X265_DCT8_H +#ifndef X265_DCT8_ARM_H +#define X265_DCT8_ARM_H -#endif // ifndef X265_DCT8_H +#endif // ifndef X265_DCT8_ARM_H diff -r dc62b47dd0d9 -r 894e0fce5d14 source/common/arm/intrapred.h --- a/source/common/arm/in
[x265] [PATCH] testbench: port x264 stack & register check code for ARM arch
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1453891819 -19800 # Wed Jan 27 16:20:19 2016 +0530 # Node ID 14c4806a24eb277d31fa77c1c906838ffcb62395 # Parent f548abe8eae8fb75513a85d1b09233e706c7b5ba testbench: port x264 stack & register check code for ARM arch diff -r f548abe8eae8 -r 14c4806a24eb source/common/arm/asm.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/asm.S Wed Jan 27 16:20:19 2016 +0530 @@ -0,0 +1,184 @@ +/* + * asm.S: arm utility macros + * + * Copyright (C) 2016 x265 project + * + * Authors: Mans Rullgard <m...@mansr.com> + * David Conrad <lesse...@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ */ + +#include "x265_config.h" + +.syntax unified + +#if HAVE_NEON +.arch armv7-a +#elif HAVE_ARMV6T2 +.arch armv6t2 +#elif HAVE_ARMV6 +.arch armv6 +#endif + +.fpu neon + +#ifdef PREFIX +# define EXTERN_ASM _ +#else +# define EXTERN_ASM +#endif + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +.macro require8, val=1 +ELF .eabi_attribute 24, \val +.endm + +.macro preserve8, val=1 +ELF .eabi_attribute 25, \val +.endm + +.macro function name, export=1 +.macro endfunc +ELF .size \name, . - \name +FUNC.endfunc +.purgem endfunc +.endm +.align 2 +.if \export == 1 +.global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC.func EXTERN_ASM\name +EXTERN_ASM\name: +.else +ELF .hidden \name +ELF .type \name, %function +FUNC.func \name +\name: +.endif +.endm + +.macro movrel rd, val +#if HAVE_ARMV6T2 && !defined(PIC) +movw\rd, #:lower16:\val +movt\rd, #:upper16:\val +#else +ldr \rd, =\val +#endif +.endm + +.macro movconst rd, val +#if HAVE_ARMV6T2 +movw\rd, #:lower16:\val +.if \val >> 16 +movt\rd, #:upper16:\val +.endif +#else +ldr \rd, =\val +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define FENC_STRIDE 16 +#define FDEC_STRIDE 32 + +.macro HORIZ_ADD dest, a, b +.ifnb \b +vadd.u16\a, \a, \b +.endif +vpaddl.u16 \a, \a +vpaddl.u32 \dest, \a +.endm + +.macro SUMSUB_AB sum, diff, a, b +vadd.s16\sum, \a, \b +vsub.s16\diff, \a, \b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d +SUMSUB_AB \s1, \d1, \a, \b +SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro ABS2 a b +vabs.s16 \a, \a +vabs.s16 \b, \b +.endm + +// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes) +// op = sumsub/amax (sum and diff / maximum of absolutes) +// d1/2 = destination registers +// s1/2 = source registers +.macro HADAMARD dist, op, d1, d2, s1, s2 +.if \dist == 1 +vtrn.16 \s1, \s2 +.else +vtrn.32 
\s1, \s2 +.endif +.ifc \op, sumsub +SUMSUB_AB \d1, \d2, \s1, \s2 +.else +vabs.s16\s1, \s1 +vabs.s16\s2, \s2 +vmax.s16\d1, \s1, \s2 +.endif +.endm + +.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7 +vtrn.32 \r0, \r4 +vtrn.32 \r1, \r5 +vtrn.32 \r2, \r6 +vtrn.32 \r3, \r7 +vtrn.16 \r0, \r2 +vtrn.16 \r1, \r3 +vtrn.16 \r4, \r6 +vtrn.16 \r5, \r7 +vtrn.8 \r0, \r1 +vtrn.8 \r2, \r3 +vtrn.8 \r4, \r5 +vtrn.8 \r6, \r7 +.endm + +.macro TRANSPOSE4x4 r0 r1 r2 r3 +vtrn.16 \r0, \r2 +vtrn.16 \r1, \r3 +vtrn.8 \r0, \r1 +vtrn.8 \r2, \r3 +.endm + +.macro TRANSPOSE4x4_16 d0 d1 d2 d3 +vtrn.32 \d0, \d2 +vtrn.32 \d1, \d3 +vtrn.16 \d0
[x265] [PATCH] testbench: port x264 stack & register check code for ARM arch
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1453872887 -19800 # Wed Jan 27 11:04:47 2016 +0530 # Node ID f98483674435cdb5cbd7acb655ee217feffdf976 # Parent f548abe8eae8fb75513a85d1b09233e706c7b5ba testbench: port x264 stack & register check code for ARM arch diff -r f548abe8eae8 -r f98483674435 source/common/arm/asm.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/asm.S Wed Jan 27 11:04:47 2016 +0530 @@ -0,0 +1,184 @@ +/* + * asm.S: arm utility macros + * + * Copyright (C) 2008-2015 x264 project + * + * Authors: Mans Rullgard <m...@mansr.com> + * David Conrad <lesse...@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licens...@x264.com. 
+ */ + +#include "x265_config.h" + +.syntax unified + +#if HAVE_NEON +.arch armv7-a +#elif HAVE_ARMV6T2 +.arch armv6t2 +#elif HAVE_ARMV6 +.arch armv6 +#endif + +.fpu neon + +#ifdef PREFIX +# define EXTERN_ASM _ +#else +# define EXTERN_ASM +#endif + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +.macro require8, val=1 +ELF .eabi_attribute 24, \val +.endm + +.macro preserve8, val=1 +ELF .eabi_attribute 25, \val +.endm + +.macro function name, export=1 +.macro endfunc +ELF .size \name, . - \name +FUNC.endfunc +.purgem endfunc +.endm +.align 2 +.if \export == 1 +.global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC.func EXTERN_ASM\name +EXTERN_ASM\name: +.else +ELF .hidden \name +ELF .type \name, %function +FUNC.func \name +\name: +.endif +.endm + +.macro movrel rd, val +#if HAVE_ARMV6T2 && !defined(PIC) +movw\rd, #:lower16:\val +movt\rd, #:upper16:\val +#else +ldr \rd, =\val +#endif +.endm + +.macro movconst rd, val +#if HAVE_ARMV6T2 +movw\rd, #:lower16:\val +.if \val >> 16 +movt\rd, #:upper16:\val +.endif +#else +ldr \rd, =\val +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define FENC_STRIDE 16 +#define FDEC_STRIDE 32 + +.macro HORIZ_ADD dest, a, b +.ifnb \b +vadd.u16\a, \a, \b +.endif +vpaddl.u16 \a, \a +vpaddl.u32 \dest, \a +.endm + +.macro SUMSUB_AB sum, diff, a, b +vadd.s16\sum, \a, \b +vsub.s16\diff, \a, \b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d +SUMSUB_AB \s1, \d1, \a, \b +SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro ABS2 a b +vabs.s16 \a, \a +vabs.s16 \b, \b +.endm + +// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes) +// op = sumsub/amax (sum and diff / maximum of absolutes) +// d1/2 = destination registers +// s1/2 = source registers +.macro HADAMARD dist, op, d1, d2, s1, s2 +.if \dist == 1 +vtrn.16 \s1, \s2 +.else +vtrn.32 
\s1, \s2 +.endif +.ifc \op, sumsub +SUMSUB_AB \d1, \d2, \s1, \s2 +.else +vabs.s16\s1, \s1 +vabs.s16\s2, \s2 +vmax.s16\d1, \s1, \s2 +.endif +.endm + +.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7 +vtrn.32 \r0, \r4 +vtrn.32 \r1, \r5 +vtrn.32 \r2, \r6 +vtrn.32 \r3, \r7 +vtrn.16 \r0, \r2 +vtrn.16 \r1, \r3 +vtrn.16 \r4, \r6 +vtrn.16 \r5, \r7 +vtrn.8 \r0, \r1 +vtrn.8 \r2, \r3 +vtrn.8 \r4, \r5 +vtrn.8 \r6, \r7 +.endm + +.macro TRANSPOSE4x4 r0 r1 r2 r3 +vtrn.16 \r0, \r2 +vtrn.16 \r1, \r3 +vtrn.8 \r0, \r1 +vtrn.8 \r2, \r3 +.endm + +.macro TRANSPOSE4x4_16 d0 d1 d2 d3 +vtrn.32 \d0, \d2 +vtrn.32 \d1, \d3 +vtrn.16 \d0
[x265] [PATCH 2 of 2] asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm over 20% than previous AVX2 asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1449813841 -19800 # Fri Dec 11 11:34:01 2015 +0530 # Node ID ee47dd944e08ebb49fd54114979c65dadabfe0df # Parent 593a1907e915c9bad7bd3ff608a30770289c249a asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm over 20% than previous AVX2 asm diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sat Dec 12 09:56:10 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Dec 11 11:34:01 2015 +0530 @@ -2932,6 +2932,7 @@ p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2); p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2); p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2); +p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2); p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2); p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2); p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2); diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Sat Dec 12 09:56:10 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Dec 11 11:34:01 2015 +0530 @@ -355,55 +355,55 @@ times 8 db (32-22), 22 times 8 db (32-11), 11 -const ang16_shuf_mode9,times 8 db 0, 1 - times 8 db 1, 2 - -const angHor_tab_9, db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16 - db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32 - -const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16 - db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0 - -const ang16_shuf_mode12, db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3 - db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 
2, 3, 1, 2, 1, 2, 1, 2, 1, 2 - -const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24 - db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16 - -const ang16_shuf_mode13, db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4 - db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2 - db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0 - -const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24 - db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16 - -const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5 - db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2 - db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0 - -const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24 - db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16 - -const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6 - db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2 - db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0 - -const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24 - db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16 - -const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 
8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7 - db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2 - db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0 - -const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-2
[x265] [PATCH 1 of 2] asm: move common constants into const-a.asm, remove unused constants
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1449894370 -19800 # Sat Dec 12 09:56:10 2015 +0530 # Node ID 593a1907e915c9bad7bd3ff608a30770289c249a # Parent a5309338d1352978e79da6210a0d64eb88d60c8f asm: move common constants into const-a.asm, remove unused constants diff -r a5309338d135 -r 593a1907e915 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Sat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/blockcopy8.asm Sat Dec 12 09:56:10 2015 +0530 @@ -28,8 +28,6 @@ SECTION_RODATA 32 -tab_Vm:db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 - cextern pb_4 cextern pb_1 cextern pb_16 diff -r a5309338d135 -r 593a1907e915 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Sat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/const-a.asm Sat Dec 12 09:56:10 2015 +0530 @@ -40,8 +40,10 @@ const pb_8, times 32 db 8 const pb_15,times 32 db 15 const pb_16,times 32 db 16 +const pb_31,times 32 db 31 const pb_32,times 32 db 32 const pb_64,times 32 db 64 +const pb_124, times 32 db 124 const pb_128, times 32 db 128 const pb_a1,times 16 db 0xa1 @@ -146,10 +148,6 @@ const pd_planar16_mul2, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8,7, 6, 5, 4, 3, 2, 1, 0 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 -const popcnt_table -%assign x 0 -%rep 256 -; population count -db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) -%assign x x+1 -%endrep +;; 64-bit constants + +const pq_1, times 1 dq 1 diff -r a5309338d135 -r 593a1907e915 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Sat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/loopfilter.asm Sat Dec 12 09:56:10 2015 +0530 @@ -29,9 +29,6 @@ %include "x86util.asm" SECTION_RODATA 32 -pb_31: times 32 db 31 -pb_124: times 32 db 124 -pb_15: times 32 db 15 SECTION .text cextern pb_1 @@ -39,6 +36,9 @@ cextern pb_3 cextern pb_4 cextern pb_01 +cextern pb_15 +cextern pb_31 +cextern pb_124 cextern 
pb_128 cextern pw_1 cextern pw_n1 diff -r a5309338d135 -r 593a1907e915 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmSat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/mc-a.asmSat Dec 12 09:56:10 2015 +0530 @@ -53,7 +53,6 @@ times 8 db 2 times 8 db 4 times 8 db 6 -sq_1: times 1 dq 1 SECTION .text @@ -74,6 +73,7 @@ cextern pw_pixel_max cextern pd_32 cextern pd_64 +cextern pq_1 ; ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) @@ -3638,7 +3638,7 @@ movam3, [r4+16] movdm2, [r4+32] ; denom movam4, [pw_pixel_max] -paddw m2, [sq_1] ; denom+1 +paddw m2, [pq_1] ; denom+1 %endmacro ; src1, src2 diff -r a5309338d135 -r 593a1907e915 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Sat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/mc-a2.asm Sat Dec 12 09:56:10 2015 +0530 @@ -43,11 +43,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif -pw_1024: times 16 dw 1024 -pd_16: times 4 dd 16 -pd_0f: times 4 dd 0x -pf_inv256: times 8 dd 0.00390625 const pd_inv256,times 4 dq 0.00390625 const pd_0_5, times 4 dq 0.5 @@ -59,9 +55,11 @@ cextern pw_32 cextern pw_512 cextern pw_00ff +cextern pw_1024 cextern pw_3fff cextern pw_pixel_max cextern pd_ +cextern pd_16 ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. 
diff -r a5309338d135 -r 593a1907e915 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Sat Jan 09 13:45:00 2016 +0530 +++ b/source/common/x86/pixel-a.asm Sat Dec 12 09:56:10 2015 +0530 @@ -50,9 +50,6 @@ transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 -sw_f0: dq 0xfff0, 0 -pd_f0: times 4 dd 0x - SECTION .text cextern pb_0 @@ -67,7 +64,6 @@ cextern pw_pmpmpmpm cextern pw_pmmp cextern pd_1 -cextern popcnt_table cextern pd_2 cextern hmul_16p cextern pb_movemask ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] testbench: setup testbench for ARM assembly
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1452327300 -19800 # Sat Jan 09 13:45:00 2016 +0530 # Node ID a5309338d1352978e79da6210a0d64eb88d60c8f # Parent d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2 testbench: setup testbench for ARM assembly X86 intrinsics has been commented from the ARM testbench. This ARM testbench is for Linux and ARMv6 arch and above diff -r d94f6c2b45f8 -r a5309338d135 source/CMakeLists.txt --- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 +++ b/source/CMakeLists.txt Sat Jan 09 13:45:00 2016 +0530 @@ -275,7 +275,9 @@ endif(GCC) find_package(Yasm) -if(YASM_FOUND AND X86) +if(ARM OR CROSS_COMPILE_ARM) +option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) +elseif(YASM_FOUND AND X86) if (YASM_VERSION_STRING VERSION_LESS "1.2.0") message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required") option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) @@ -423,15 +425,22 @@ else() set(SUFFIX o) endif() -foreach(ASM ${MSVC_ASMS}) -set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) -list(APPEND YASM_SRCS ${YASM_SRC}) -list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) -add_custom_command( -OUTPUT ${ASM}.${SUFFIX} -COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} -DEPENDS ${YASM_SRC}) -endforeach() + +if(ARM OR CROSS_COMPILE_ARM) +# compile ARM arch asm files here + +elseif(X86) +# compile X86 arch asm files here +foreach(ASM ${MSVC_ASMS}) +set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) +list(APPEND YASM_SRCS ${YASM_SRC}) +list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) +add_custom_command( +OUTPUT ${ASM}.${SUFFIX} +COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} +DEPENDS ${YASM_SRC}) +endforeach() +endif() endif() source_group(ASM FILES ${YASM_SRCS}) diff -r d94f6c2b45f8 -r a5309338d135 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 +++ 
b/source/common/CMakeLists.txt Sat Jan 09 13:45:00 2016 +0530 @@ -16,12 +16,14 @@ if(ENABLE_ASSEMBLY) set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1") +endif(ENABLE_ASSEMBLY) +if(ENABLE_ASSEMBLY AND X86) set(SSE3 vec/dct-sse3.cpp) set(SSSE3 vec/dct-ssse3.cpp) set(SSE41 vec/dct-sse41.cpp) -if(MSVC AND X86) +if(MSVC) set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) set(WARNDISABLE "/wd4100") # unreferenced formal parameter if(INTEL_CXX) @@ -38,7 +40,7 @@ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") endif() endif() -if(GCC AND X86) +if(GCC) if(CLANG) # llvm intrinsic headers cause shadow warnings set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") @@ -81,7 +83,20 @@ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC}) endforeach() source_group(Assembly FILES ${ASM_PRIMITIVES}) -endif(ENABLE_ASSEMBLY) +endif(ENABLE_ASSEMBLY AND X86) + +if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) + +# add ARM assembly/intrinsic files here +set(A_SRCS) +set(VEC_PRIMITIVES) + +foreach(SRC ${C_SRCS}) +set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) +endforeach() +source_group(Assembly FILES ${ASM_PRIMITIVES}) +endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) # set_target_properties can't do list expansion string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}") diff -r d94f6c2b45f8 -r a5309338d135 source/common/arm/asm-primitives.cpp --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/asm-primitives.cpp Sat Jan 09 13:45:00 2016 +0530 @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2016 x265 project + * + * Authors: Steve Borho <st...@borho.org> + * Praveen Kumar Tiwari <prav...@multicorewareinc.com> + * Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com> + * Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com> + * + * This program is 
free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is
[x265] [PATCH] testbench: setup testbench for ARM assembly
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1452321658 -19800 # Sat Jan 09 12:10:58 2016 +0530 # Node ID cd9318b1671bb24212321fcd005381e50642af4c # Parent d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2 testbench: setup testbench for ARM assembly X86 intrinsics has been commented from the ARM testbench. This ARM testbench is for Linux and ARMv6 arch and above diff -r d94f6c2b45f8 -r cd9318b1671b source/CMakeLists.txt --- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 +++ b/source/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530 @@ -275,7 +275,9 @@ endif(GCC) find_package(Yasm) -if(YASM_FOUND AND X86) +if(ARM OR CROSS_COMPILE_ARM) +option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) +elseif(YASM_FOUND AND X86) if (YASM_VERSION_STRING VERSION_LESS "1.2.0") message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required") option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) @@ -423,15 +425,22 @@ else() set(SUFFIX o) endif() -foreach(ASM ${MSVC_ASMS}) -set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) -list(APPEND YASM_SRCS ${YASM_SRC}) -list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) -add_custom_command( -OUTPUT ${ASM}.${SUFFIX} -COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} -DEPENDS ${YASM_SRC}) -endforeach() + +if(ARM OR CROSS_COMPILE_ARM) +# compile ARM arch asm files here + +elseif(X86) +# compile X86 arch asm files here +foreach(ASM ${MSVC_ASMS}) +set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) +list(APPEND YASM_SRCS ${YASM_SRC}) +list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) +add_custom_command( +OUTPUT ${ASM}.${SUFFIX} +COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX} +DEPENDS ${YASM_SRC}) +endforeach() +endif() endif() source_group(ASM FILES ${YASM_SRCS}) diff -r d94f6c2b45f8 -r cd9318b1671b source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 +++ 
b/source/common/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530 @@ -16,12 +16,14 @@ if(ENABLE_ASSEMBLY) set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1") +endif(ENABLE_ASSEMBLY) +if(ENABLE_ASSEMBLY AND X86) set(SSE3 vec/dct-sse3.cpp) set(SSSE3 vec/dct-ssse3.cpp) set(SSE41 vec/dct-sse41.cpp) -if(MSVC AND X86) +if(MSVC) set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) set(WARNDISABLE "/wd4100") # unreferenced formal parameter if(INTEL_CXX) @@ -38,7 +40,7 @@ set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") endif() endif() -if(GCC AND X86) +if(GCC) if(CLANG) # llvm intrinsic headers cause shadow warnings set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") @@ -81,7 +83,20 @@ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC}) endforeach() source_group(Assembly FILES ${ASM_PRIMITIVES}) -endif(ENABLE_ASSEMBLY) +endif(ENABLE_ASSEMBLY AND X86) + +if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) + +# add ARM assembly/intrinsic files here +set(A_SRCS) +set(VEC_PRIMITIVES) + +foreach(SRC ${C_SRCS}) +set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) +endforeach() +source_group(Assembly FILES ${ASM_PRIMITIVES}) +endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) # set_target_properties can't do list expansion string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}") diff -r d94f6c2b45f8 -r cd9318b1671b source/common/arm/asm-primitives.cpp --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/asm-primitives.cpp Sat Jan 09 12:10:58 2016 +0530 @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2016 x265 project + * + * Authors: Steve Borho <st...@borho.org> + * Praveen Kumar Tiwari <prav...@multicorewareinc.com> + * Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com> + * Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com> + * + * This program is 
free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is
Re: [x265] [PATCH] testbench: setup testbench for ARM assembly
Please ignore this patch. need little modifications. On Sat, Jan 9, 2016 at 12:12 PM, <dnyanesh...@multicorewareinc.com> wrote: > # HG changeset patch > # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> > # Date 1452321658 -19800 > # Sat Jan 09 12:10:58 2016 +0530 > # Node ID cd9318b1671bb24212321fcd005381e50642af4c > # Parent d94f6c2b45f87f5b4b10b4fa70f8a9bd03d3d1c2 > testbench: setup testbench for ARM assembly > > X86 intrinsics has been commented from the ARM testbench. > This ARM testbench is for Linux and ARMv6 arch and above > > diff -r d94f6c2b45f8 -r cd9318b1671b source/CMakeLists.txt > --- a/source/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 > +++ b/source/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530 > @@ -275,7 +275,9 @@ > endif(GCC) > > find_package(Yasm) > -if(YASM_FOUND AND X86) > +if(ARM OR CROSS_COMPILE_ARM) > +option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) > +elseif(YASM_FOUND AND X86) > if (YASM_VERSION_STRING VERSION_LESS "1.2.0") > message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 
> 1.2.0 or later required") > option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" > OFF) > @@ -423,15 +425,22 @@ > else() > set(SUFFIX o) > endif() > -foreach(ASM ${MSVC_ASMS}) > -set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) > -list(APPEND YASM_SRCS ${YASM_SRC}) > -list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) > -add_custom_command( > -OUTPUT ${ASM}.${SUFFIX} > -COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o > ${ASM}.${SUFFIX} > -DEPENDS ${YASM_SRC}) > -endforeach() > + > +if(ARM OR CROSS_COMPILE_ARM) > +# compile ARM arch asm files here > + > +elseif(X86) > +# compile X86 arch asm files here > +foreach(ASM ${MSVC_ASMS}) > +set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM}) > +list(APPEND YASM_SRCS ${YASM_SRC}) > +list(APPEND YASM_OBJS ${ASM}.${SUFFIX}) > +add_custom_command( > +OUTPUT ${ASM}.${SUFFIX} > +COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} > -o ${ASM}.${SUFFIX} > +DEPENDS ${YASM_SRC}) > +endforeach() > +endif() > endif() > > source_group(ASM FILES ${YASM_SRCS}) > diff -r d94f6c2b45f8 -r cd9318b1671b source/common/CMakeLists.txt > --- a/source/common/CMakeLists.txt Sat Jan 09 11:32:33 2016 +0530 > +++ b/source/common/CMakeLists.txt Sat Jan 09 12:10:58 2016 +0530 > @@ -16,12 +16,14 @@ > if(ENABLE_ASSEMBLY) > set_source_files_properties(threading.cpp primitives.cpp PROPERTIES > COMPILE_FLAGS -DENABLE_ASSEMBLY=1) > list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1") > +endif(ENABLE_ASSEMBLY) > > +if(ENABLE_ASSEMBLY AND X86) > set(SSE3 vec/dct-sse3.cpp) > set(SSSE3 vec/dct-ssse3.cpp) > set(SSE41 vec/dct-sse41.cpp) > > -if(MSVC AND X86) > +if(MSVC) > set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41}) > set(WARNDISABLE "/wd4100") # unreferenced formal parameter > if(INTEL_CXX) > @@ -38,7 +40,7 @@ > set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} > PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2") > endif() > endif() > -if(GCC AND X86) > +if(GCC) > if(CLANG) > # llvm intrinsic headers cause shadow warnings > 
set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter") > @@ -81,7 +83,20 @@ > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC}) > endforeach() > source_group(Assembly FILES ${ASM_PRIMITIVES}) > -endif(ENABLE_ASSEMBLY) > +endif(ENABLE_ASSEMBLY AND X86) > + > +if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) > +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h > dct8.h loopfilter.h) > + > +# add ARM assembly/intrinsic files here > +set(A_SRCS) > +set(VEC_PRIMITIVES) > + > +foreach(SRC ${C_SRCS}) > +set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) > +endforeach() > +source_group(Assembly FILES ${ASM_PRIMITIVES}) > +endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) > > # set_target_properties can't do list expansion > string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}") > diff -r d94f6c2b45f8 -r cd9318b1671b source/common/arm/asm-primitives.cpp > --- /dev/null Thu Jan 01 00:00:00 1970 + > +++ b/source/common/arm/asm-primitives.cpp Sat Jan 09 12:10:58 2016 &
[x265] [PATCH] enable arm-linux cross compile build
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1450516372 -19800 # Sat Dec 19 14:42:52 2015 +0530 # Node ID d4de155912366fb831021c9f6a0fde6757a168d7 # Parent 25f78ff3d8efaa1e9d85bc3e718c887ec9afa557 enable arm-linux cross compile build diff -r 25f78ff3d8ef -r d4de15591236 build/arm-linux/make-Makefiles.bash --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-linux/make-Makefiles.bash Sat Dec 19 14:42:52 2015 +0530 @@ -0,0 +1,4 @@ +#!/bin/bash +# Run this from within a bash shell + +cmake -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake -G "Unix Makefiles" ../../source && ccmake ../../source diff -r 25f78ff3d8ef -r d4de15591236 build/arm-linux/toolchain.cmake --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-linux/toolchain.cmake Sat Dec 19 14:42:52 2015 +0530 @@ -0,0 +1,12 @@ +# CMake toolchain file for cross compiling x265 for ARM arch + +set(CROSS_COMPILE_ARM 1) +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR armv6l) + +# specify the cross compiler +set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++) + +# specify the target environment +SET(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi) diff -r 25f78ff3d8ef -r d4de15591236 source/CMakeLists.txt --- a/source/CMakeLists.txt Tue Dec 22 18:13:28 2015 +0530 +++ b/source/CMakeLists.txt Sat Dec 19 14:42:52 2015 +0530 @@ -59,6 +59,11 @@ set(POWER 1) add_definitions(-DX265_ARCH_POWER=1) elseif(${SYSPROC} STREQUAL "armv6l") +if(CROSS_COMPILE_ARM) +message(STATUS "Cross compiling for ARM arch") +else() +set(CROSS_COMPILE_ARM 0) +endif() message(STATUS "Detected ARM target processor") set(ARM 1) add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) @@ -175,7 +180,9 @@ elseif(X86 AND NOT X64) add_definitions(-march=i686) endif() -if(ARM) +if(ARM AND CROSS_COMPILE_ARM) +add_definitions(-march=armv6 -mfloat-abi=soft -mfpu=vfp) +elseif(ARM) add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) endif() if(FPROFILE_GENERATE) ___ x265-devel mailing 
list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 3] asm: psyCost_pp avx2 asm code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1448963172 -19800 # Tue Dec 01 15:16:12 2015 +0530 # Node ID 9357c1f448a7b987cebfd3cc5542cc6c65e63fe2 # Parent e2b07541670331ab0cd94b5f312f8f7cac893f92 asm: psyCost_pp avx2 asm code for main12 psy_cost_pp[8x8]6.55x1254.76 8224.62 psy_cost_pp[16x16] 6.51x5087.56 33111.62 psy_cost_pp[32x32] 6.50x20230.92131523.63 psy_cost_pp[64x64] 6.57x80351.48528226.25 diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 09 13:13:57 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:16:12 2015 +0530 @@ -1479,12 +1479,11 @@ p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2); p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2); -#if X265_DEPTH <= 10 + p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2); p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2); p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2); p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2); -#endif p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2); p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2); diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Dec 09 13:13:57 2015 +0530 +++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:16:12 2015 +0530 @@ -10090,16 +10090,272 @@ pabsd xm1, xm1 %endmacro +%macro PSY_COST_PP_8x8_MAIN12 0 +; load source pixels +lea r4, [r1 * 3] +pmovzxwdm0, [r0] +pmovzxwdm1, [r0 + r1] +pmovzxwdm2, [r0 + r1 * 2] +pmovzxwdm3, [r0 + r4] +lea r5, [r0 + r1 * 4] +pmovzxwdm4, [r5] +pmovzxwdm5, [r5 + r1] +pmovzxwdm6, [r5 + r1 * 2] +pmovzxwdm7, [r5 + r4] + +; source SAD +paddd m8, m0, m1 +paddd m8, m2 +paddd m8, m3 +paddd m8, m4 +paddd m8, m5 +paddd m8, m6 +paddd m8, m7 + +vextracti128xm9, m8, 1 +paddd m8, m9 ; sad_8x8 +movhlps 
xm9, xm8 +paddd xm8, xm9 +pshuflw xm9, xm8, 0Eh +paddd xm8, xm9 +psrld m8, 2 + +; source SA8D +psubd m9, m1, m0 +paddd m0, m1 +psubd m1, m3, m2 +paddd m2, m3 +punpckhdq m3, m0, m9 +punpckldq m0, m9 +psubd m9, m3, m0 +paddd m0, m3 +punpckhdq m3, m2, m1 +punpckldq m2, m1 +psubd m10, m3, m2 +paddd m2, m3 +psubd m3, m5, m4 +paddd m4, m5 +psubd m5, m7, m6 +paddd m6, m7 +punpckhdq m1, m4, m3 +punpckldq m4, m3 +psubd m7, m1, m4 +paddd m4, m1 +punpckhdq m3, m6, m5 +punpckldq m6, m5 +psubd m1, m3, m6 +paddd m6, m3 +psubd m3, m2, m0 +paddd m0, m2 +psubd m2, m10, m9 +paddd m9, m10 +punpckhqdq m5, m0, m3 +punpcklqdq m0, m3 +psubd m10, m5, m0 +paddd m0, m5 +punpckhqdq m3, m9, m2 +punpcklqdq m9, m2 +psubd m5, m3, m9 +paddd m9, m3 +psubd m3, m6, m4 +paddd m4, m6 +psubd m6, m1, m7 +paddd m7, m1 +punpckhqdq m2, m4, m3 +punpcklqdq m4, m3 +psubd m1, m2, m4 +paddd m4, m2 +punpckhqdq m3, m7, m6 +punpcklqdq m7, m6 +psubd m2, m3, m7 +paddd m7, m3 +psubd m3, m4, m0 +paddd m0, m4 +psubd m4, m1, m10 +paddd m10, m1 +vinserti128 m6, m0, xm3, 1 +vperm2i128 m0, m0, m3, 00110001b +pabsd m0, m0 +pabsd m6, m6 +pmaxsd m0, m6 +vinserti128 m3, m10, xm4, 1 +vperm2i128 m10, m10, m4, 00110001b +pabsd m10, m10 +pabsd m3, m3 +pmaxsd m10, m3 +psubd m3, m7, m9 +paddd m9, m7 +psubd m7, m2, m5 +paddd m5, m2 +vinserti128 m4, m9, xm3, 1 +vperm2i128 m9, m9, m3, 00110001b +pabsd m9, m9 +pabsd m4, m4 +pmaxsd m9, m4 +vinserti128 m3, m5, xm7, 1 +vperm2i128 m5, m5, m7, 00110001b +pabsd m5, m5 +pabsd m3, m3 +pmaxsd m5, m3 +paddd m0, m9 +
[x265] [PATCH 1 of 3] asm: SA8D avx2 asm code for main12
# HG changeset patch # User Dnyaneshwar Gorade <gorad...@gmail.com> # Date 1449647037 -19800 # Wed Dec 09 13:13:57 2015 +0530 # Node ID e2b07541670331ab0cd94b5f312f8f7cac893f92 # Parent b80087c9bf25697c3d354d732323fc895a2ca11f asm: SA8D avx2 asm code for main12 sa8d[ 8x8] 4.70x564.58 2652.82 sa8d[ 8x16] 4.00x1358.06 5429.52 sa8d[16x16] 5.57x2013.70 11212.47 sa8d[16x32] 3.90x5610.47 21883.35 sa8d[32x32] 5.36x8274.18 44361.61 sa8d[32x64] 3.86x23024.0488901.80 sa8d[64x64] 4.35x45509.79198165.11 diff -r b80087c9bf25 -r e2b075416703 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 08 15:52:21 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 13:13:57 2015 +0530 @@ -1313,6 +1313,9 @@ } if (cpuMask & X265_CPU_AVX2) { +#if X265_DEPTH == 12 +ASSIGN_SA8D(avx2); +#endif p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); // TODO: the planecopy_sp is really planecopy_SC now, must be fix it diff -r b80087c9bf25 -r e2b075416703 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 08 15:52:21 2015 +0530 +++ b/source/common/x86/pixel-a.asm Wed Dec 09 13:13:57 2015 +0530 @@ -6499,6 +6499,1357 @@ %endif ; !ARCH_X86_64 %endmacro ; SA8D + +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +INIT_YMM avx2 +cglobal sa8d_8x8_12bit +pmovzxwdm0, [r0] +pmovzxwdm9, [r2] +psubd m0, m9 + +pmovzxwdm1, [r0 + r1] +pmovzxwdm9, [r2 + r3] +psubd m1, m9 + +pmovzxwdm2, [r0 + r1 * 2] +pmovzxwdm9, [r2 + r3 * 2] +psubd m2, m9 + +pmovzxwdm8, [r0 + r4] +pmovzxwdm9, [r2 + r5] +psubd m8, m9 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxwdm4, [r0] +pmovzxwdm9, [r2] +psubd m4, m9 + +pmovzxwdm5, [r0 + r1] +pmovzxwdm9, [r2 + r3] +psubd m5, m9 + +pmovzxwdm3, [r0 + r1 * 2] +pmovzxwdm9, [r2 + r3 * 2] +psubd m3, m9 + +pmovzxwdm7, [r0 + r4] +pmovzxwdm9, [r2 + r5] +psubd m7, m9 + +movam6, m0 +paddd m0, m1 +psubd m1, m6 +movam6, m2 +paddd m2, m8 +psubd m8, m6 +movam6, m0 + +punpckldq m0, m1 +punpckhdq m6, m1 + +movam1, m0 +paddd m0, m6 
+psubd m6, m1 +movam1, m2 + +punpckldq m2, m8 +punpckhdq m1, m8 + +movam8, m2 +paddd m2, m1 +psubd m1, m8 +movam8, m4 +paddd m4, m5 +psubd m5, m8 +movam8, m3 +paddd m3, m7 +psubd m7, m8 +movam8, m4 + +punpckldq m4, m5 +punpckhdq m8, m5 + +movam5, m4 +paddd m4, m8 +psubd m8, m5 +movam5, m3 +punpckldq m3, m7 +punpckhdq m5, m7 + +movam7, m3 +paddd m3, m5 +psubd m5, m7 +movam7, m0 +paddd m0, m2 +psubd m2, m7 +movam7, m6 +paddd m6, m1 +psubd m1, m7 +movam7, m0 + +punpcklqdq m0, m2 +punpckhqdq m7, m2 + +movam2, m0 +paddd m0, m7 +psubd m7, m2 +movam2, m6 + +punpcklqdq m6, m1 +punpckhqdq m2, m1 + +movam1, m6 +paddd m6, m2 +psubd m2, m1 +movam1, m4 +paddd m4, m3 +psubd m3, m1 +movam1, m8 +paddd m8, m5 +psubd m5, m1 +movam1, m4 + +punpcklqdq m4, m3 +punpckhqdq m1, m3 + +movam3, m4 +paddd m4, m1 +psubd m1, m3 +movam3, m8 + +punpcklqdq m8, m5 +punpckhqdq m3, m5 + +movam5, m8 +paddd m8, m3 +psubd m3, m5 +movam5, m0 +paddd m0, m4 +psubd m4, m5 +movam5, m7 +paddd m7, m1 +psubd m1, m5 +movam5, m0 + +vinserti128 m0, m0, xm4, 1 +vperm2i128 m5, m5, m4, 00110001b + +pxorm4, m4 +psubd m4, m0 +pmaxsd m0, m4 +pxorm4, m4 +psubd m4, m5 +pmaxsd m5, m4 +pmaxsd m0, m5 +movam4, m7 + +vinserti128 m7, m7, xm1, 1 +vperm2i128 m4, m4, m1, 00110001b + +
[x265] [PATCH 3 of 3] asm: fix dct[8x8] AVX2 asm for main12
# HG changeset patch # User Aasaipriya Chandran# Date 1449648215 -19800 # Wed Dec 09 13:33:35 2015 +0530 # Node ID 9e3f71d784e59527a14702e83de474bc3f12fd15 # Parent 9357c1f448a7b987cebfd3cc5542cc6c65e63fe2 asm: fix dct[8x8] AVX2 asm for main12 diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 01 15:16:12 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 13:33:35 2015 +0530 @@ -1573,9 +1573,8 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2); ALL_LUMA_TU_S(idct, idct, avx2); -#if X265_DEPTH <= 10 ALL_LUMA_TU_S(dct, dct, avx2); -#endif + ALL_LUMA_CU_S(transpose, transpose, avx2); ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2); diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmTue Dec 01 15:16:12 2015 +0530 +++ b/source/common/x86/dct8.asmWed Dec 09 13:33:35 2015 +0530 @@ -2174,7 +2174,7 @@ pmaddwd m0, m%4 phaddd m2, m0 paddd m2, m5 -psrad m2, DCT_SHIFT +psrad m2, DCT8_SHIFT1 packssdwm2, m2 vpermq m2, m2, 0x08 mova[r5 + %2], xm2 @@ -2190,7 +2190,7 @@ phaddd m8, m9 phaddd m6, m8 paddd m6, m5 -psrad m6, DCT_SHIFT2 +psrad m6, DCT8_SHIFT2 vbroadcasti128 m4, [r6 + %2] pmaddwd m10,m0, m4 @@ -2201,7 +2201,7 @@ phaddd m8, m9 phaddd m10,m8 paddd m10,m5 -psrad m10,DCT_SHIFT2 +psrad m10,DCT8_SHIFT2 packssdwm6, m10 vpermq m10,m6, 0xD8 @@ -2210,18 +2210,7 @@ INIT_YMM avx2 cglobal dct8, 3, 7, 11, 0-8*16 -%if BIT_DEPTH == 12 -%define DCT_SHIFT 6 -vbroadcasti128 m5,[pd_16] -%elif BIT_DEPTH == 10 -%define DCT_SHIFT 4 -vbroadcasti128 m5,[pd_8] -%elif BIT_DEPTH == 8 -%define DCT_SHIFT 2 -vbroadcasti128 m5,[pd_2] -%else -%error Unsupported BIT_DEPTH! 
-%endif +vbroadcasti128 m5,[pd_ %+ DCT8_ROUND1] %define DCT_SHIFT2 9 add r2d, r2d @@ -2265,7 +2254,7 @@ DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 ;pass2 -vbroadcasti128 m5,[pd_256] +vbroadcasti128 m5,[pd_ %+ DCT8_ROUND2] movam0,[r5] movam1,[r5 + 32] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: move common constants into const-a.asm, remove unused constants
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1449723720 -19800 # Thu Dec 10 10:32:00 2015 +0530 # Node ID ff08c87f20a7f3f36bfb0849bd2d10fc1f8da465 # Parent 33d04da2f68830ac51151cfbda8f38fb9a7e8bb9 asm: move common constants into const-a.asm, remove unused constants diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Dec 10 10:32:00 2015 +0530 @@ -28,8 +28,6 @@ SECTION_RODATA 32 -tab_Vm:db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 - cextern pb_4 cextern pb_1 cextern pb_16 diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/const-a.asm Thu Dec 10 10:32:00 2015 +0530 @@ -40,8 +40,10 @@ const pb_8, times 32 db 8 const pb_15,times 32 db 15 const pb_16,times 32 db 16 +const pb_31,times 32 db 31 const pb_32,times 32 db 32 const pb_64,times 32 db 64 +const pb_124, times 32 db 124 const pb_128, times 32 db 128 const pb_a1,times 16 db 0xa1 @@ -146,10 +148,6 @@ const pd_planar16_mul2, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8,7, 6, 5, 4, 3, 2, 1, 0 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 -const popcnt_table -%assign x 0 -%rep 256 -; population count -db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) -%assign x x+1 -%endrep +;; 64-bit constants + +const pq_1, times 1 dq 1 diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Dec 10 10:32:00 2015 +0530 @@ -29,15 +29,15 @@ %include "x86util.asm" SECTION_RODATA 32 -pb_31: times 32 db 31 -pb_124: times 32 db 124 -pb_15: times 32 db 15 pb_movemask_32: times 32 db 0x00 times 32 db 0xFF SECTION .text cextern pb_1 cextern pb_01 +cextern pb_15 +cextern pb_31 +cextern pb_124 
cextern pb_128 cextern pb_2 cextern pw_2 diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmWed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/mc-a.asmThu Dec 10 10:32:00 2015 +0530 @@ -53,7 +53,6 @@ times 8 db 2 times 8 db 4 times 8 db 6 -sq_1: times 1 dq 1 SECTION .text @@ -74,6 +73,7 @@ cextern pw_pixel_max cextern pd_32 cextern pd_64 +cextern pq_1 ; ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) @@ -3638,7 +3638,7 @@ movam3, [r4+16] movdm2, [r4+32] ; denom movam4, [pw_pixel_max] -paddw m2, [sq_1] ; denom+1 +paddw m2, [pq_1] ; denom+1 %endmacro ; src1, src2 diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Wed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/mc-a2.asm Thu Dec 10 10:32:00 2015 +0530 @@ -43,11 +43,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif -pw_1024: times 16 dw 1024 -pd_16: times 4 dd 16 -pd_0f: times 4 dd 0x -pf_inv256: times 8 dd 0.00390625 const pd_inv256,times 4 dq 0.00390625 const pd_0_5, times 4 dq 0.5 @@ -59,9 +55,11 @@ cextern pw_32 cextern pw_512 cextern pw_00ff +cextern pw_1024 cextern pw_3fff cextern pw_pixel_max cextern pd_ +cextern pd_16 ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. 
diff -r 33d04da2f688 -r ff08c87f20a7 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Dec 09 22:24:25 2015 +0530 +++ b/source/common/x86/pixel-a.asm Thu Dec 10 10:32:00 2015 +0530 @@ -50,9 +50,6 @@ transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 -sw_f0: dq 0xfff0, 0 -pd_f0: times 4 dd 0x - SECTION .text cextern pb_0 @@ -67,7 +64,6 @@ cextern pw_pmpmpmpm cextern pw_pmmp cextern pd_1 -cextern popcnt_table cextern pd_2 cextern hmul_16p cextern pb_movemask ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12
Thanks, Min. I am re-sending these two patches with the above modifications. On Wed, Dec 2, 2015 at 8:57 PM, chen <chenm...@163.com> wrote: > I suggest just keep one name of sa8d_avx2 > > At 2015-12-02 12:31:59,"Dnyaneshwar Gorade" < > dnyanesh...@multicorewareinc.com> wrote: > > the real function name is sa8d_8x8_avx2 whereas the common function name > is sa8d_8x8_avx2_avx2, that's why we got proper call. both are different. > > On Tue, Dec 1, 2015 at 9:08 PM, chen <chenm...@163.com> wrote: > >> >> >> At 2015-12-01 18:58:43,dnyanesh...@multicorewareinc.com wrote: >> ># HG changeset patch >> ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> >> ># Date 1448962785 -19800 >> ># Tue Dec 01 15:09:45 2015 +0530 >> ># Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e >> ># Parent e2e507ffe752d6c193a219b242c433bdc55f39f7 >> >asm: SA8D avx2 asm code for main12 >> > >> >sa8d[ 8x8] 4.70x564.58 2652.82 >> >sa8d[ 8x16] 4.00x1358.06 5429.52 >> >sa8d[16x16] 5.57x2013.70 11212.47 >> >sa8d[16x32] 3.90x5610.47 21883.35 >> >sa8d[32x32] 5.36x8274.18 44361.61 >> >sa8d[32x64] 3.86x23024.0488901.80 >> >sa8d[64x64] 4.35x45509.79198165.11 >> > >> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp >> >--- a/source/common/x86/asm-primitives.cpp Mon Nov 30 11:23:38 2015 +0530 >> >+++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:09:45 2015 +0530 >> >@@ -1313,6 +1313,9 @@ >> > } >> > if (cpuMask & X265_CPU_AVX2) >> > { >> >+#if X265_DEPTH == 12 >> >+ASSIGN_SA8D(avx2); >> >+#endif >> > p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); >> > >> > // TODO: the planecopy_sp is really planecopy_SC now, must be fix >> > it >> >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm >> >--- a/source/common/x86/pixel-a.asm Mon Nov 30 11:23:38 2015 +0530 >> >+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530 >> >@@ -6499,6 +6499,1357 @@ >> > %endif ; !ARCH_X86_64 >> > %endmacro ; SA8D >> > >> >+ >> >+%if ARCH_X86_64 == 1 && 
BIT_DEPTH == 12 >> >+INIT_YMM avx2 >> >+cglobal sa8d_8x8_avx2 >> the really function name is sa8d_8x8_avx2_avx2, we are lucky, below call use >> correct name >> >> >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> >> > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12
the real function name is sa8d_8x8_avx2 whereas the common function name is sa8d_8x8_avx2_avx2, that's why we got proper call. both are different. On Tue, Dec 1, 2015 at 9:08 PM, chen <chenm...@163.com> wrote: > > > At 2015-12-01 18:58:43,dnyanesh...@multicorewareinc.com wrote: > ># HG changeset patch > ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> > ># Date 1448962785 -19800 > ># Tue Dec 01 15:09:45 2015 +0530 > ># Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e > ># Parent e2e507ffe752d6c193a219b242c433bdc55f39f7 > >asm: SA8D avx2 asm code for main12 > > > >sa8d[ 8x8] 4.70x564.58 2652.82 > >sa8d[ 8x16] 4.00x1358.06 5429.52 > >sa8d[16x16] 5.57x2013.70 11212.47 > >sa8d[16x32] 3.90x5610.47 21883.35 > >sa8d[32x32] 5.36x8274.18 44361.61 > >sa8d[32x64] 3.86x23024.0488901.80 > >sa8d[64x64] 4.35x45509.79198165.11 > > > >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp > >--- a/source/common/x86/asm-primitives.cpp Mon Nov 30 11:23:38 2015 +0530 > >+++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:09:45 2015 +0530 > >@@ -1313,6 +1313,9 @@ > > } > > if (cpuMask & X265_CPU_AVX2) > > { > >+#if X265_DEPTH == 12 > >+ASSIGN_SA8D(avx2); > >+#endif > > p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); > > > > // TODO: the planecopy_sp is really planecopy_SC now, must be fix it > >diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm > >--- a/source/common/x86/pixel-a.asm Mon Nov 30 11:23:38 2015 +0530 > >+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530 > >@@ -6499,6 +6499,1357 @@ > > %endif ; !ARCH_X86_64 > > %endmacro ; SA8D > > > >+ > >+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 > >+INIT_YMM avx2 > >+cglobal sa8d_8x8_avx2 > the really function name is sa8d_8x8_avx2_avx2, we are lucky, below call use > correct name > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm: psyCost_pp avx2 asm code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1448963172 -19800 # Tue Dec 01 15:16:12 2015 +0530 # Node ID dbc004801f4734ba048a451d779c1c9c82f1b6ac # Parent f8b0ce4e9f4092a38d8095961825e734a34f112e asm: psyCost_pp avx2 asm code for main12 psy_cost_pp[8x8]6.55x1254.76 8224.62 psy_cost_pp[16x16] 6.51x5087.56 33111.62 psy_cost_pp[32x32] 6.50x20230.92131523.63 psy_cost_pp[64x64] 6.57x80351.48528226.25 diff -r f8b0ce4e9f40 -r dbc004801f47 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 01 15:09:45 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:16:12 2015 +0530 @@ -1479,12 +1479,11 @@ p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2); p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2); -#if X265_DEPTH <= 10 + p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2); p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2); p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2); p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2); -#endif p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2); p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2); diff -r f8b0ce4e9f40 -r dbc004801f47 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530 +++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:16:12 2015 +0530 @@ -10090,16 +10090,272 @@ pabsd xm1, xm1 %endmacro +%macro PSY_COST_PP_8x8_MAIN12 0 +; load source pixels +lea r4, [r1 * 3] +pmovzxwdm0, [r0] +pmovzxwdm1, [r0 + r1] +pmovzxwdm2, [r0 + r1 * 2] +pmovzxwdm3, [r0 + r4] +lea r5, [r0 + r1 * 4] +pmovzxwdm4, [r5] +pmovzxwdm5, [r5 + r1] +pmovzxwdm6, [r5 + r1 * 2] +pmovzxwdm7, [r5 + r4] + +; source SAD +paddd m8, m0, m1 +paddd m8, m2 +paddd m8, m3 +paddd m8, m4 +paddd m8, m5 +paddd m8, m6 +paddd m8, m7 + +vextracti128xm9, m8, 1 +paddd m8, m9 ; sad_8x8 +movhlps 
xm9, xm8 +paddd xm8, xm9 +pshuflw xm9, xm8, 0Eh +paddd xm8, xm9 +psrld m8, 2 + +; source SA8D +psubd m9, m1, m0 +paddd m0, m1 +psubd m1, m3, m2 +paddd m2, m3 +punpckhdq m3, m0, m9 +punpckldq m0, m9 +psubd m9, m3, m0 +paddd m0, m3 +punpckhdq m3, m2, m1 +punpckldq m2, m1 +psubd m10, m3, m2 +paddd m2, m3 +psubd m3, m5, m4 +paddd m4, m5 +psubd m5, m7, m6 +paddd m6, m7 +punpckhdq m1, m4, m3 +punpckldq m4, m3 +psubd m7, m1, m4 +paddd m4, m1 +punpckhdq m3, m6, m5 +punpckldq m6, m5 +psubd m1, m3, m6 +paddd m6, m3 +psubd m3, m2, m0 +paddd m0, m2 +psubd m2, m10, m9 +paddd m9, m10 +punpckhqdq m5, m0, m3 +punpcklqdq m0, m3 +psubd m10, m5, m0 +paddd m0, m5 +punpckhqdq m3, m9, m2 +punpcklqdq m9, m2 +psubd m5, m3, m9 +paddd m9, m3 +psubd m3, m6, m4 +paddd m4, m6 +psubd m6, m1, m7 +paddd m7, m1 +punpckhqdq m2, m4, m3 +punpcklqdq m4, m3 +psubd m1, m2, m4 +paddd m4, m2 +punpckhqdq m3, m7, m6 +punpcklqdq m7, m6 +psubd m2, m3, m7 +paddd m7, m3 +psubd m3, m4, m0 +paddd m0, m4 +psubd m4, m1, m10 +paddd m10, m1 +vinserti128 m6, m0, xm3, 1 +vperm2i128 m0, m0, m3, 00110001b +pabsd m0, m0 +pabsd m6, m6 +pmaxsd m0, m6 +vinserti128 m3, m10, xm4, 1 +vperm2i128 m10, m10, m4, 00110001b +pabsd m10, m10 +pabsd m3, m3 +pmaxsd m10, m3 +psubd m3, m7, m9 +paddd m9, m7 +psubd m7, m2, m5 +paddd m5, m2 +vinserti128 m4, m9, xm3, 1 +vperm2i128 m9, m9, m3, 00110001b +pabsd m9, m9 +pabsd m4, m4 +pmaxsd m9, m4 +vinserti128 m3, m5, xm7, 1 +vperm2i128 m5, m5, m7, 00110001b +pabsd m5, m5 +pabsd m3, m3 +pmaxsd m5, m3 +paddd m0, m9 +
[x265] [PATCH 1 of 2] asm: SA8D avx2 asm code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1448962785 -19800 # Tue Dec 01 15:09:45 2015 +0530 # Node ID f8b0ce4e9f4092a38d8095961825e734a34f112e # Parent e2e507ffe752d6c193a219b242c433bdc55f39f7 asm: SA8D avx2 asm code for main12 sa8d[ 8x8] 4.70x564.58 2652.82 sa8d[ 8x16] 4.00x1358.06 5429.52 sa8d[16x16] 5.57x2013.70 11212.47 sa8d[16x32] 3.90x5610.47 21883.35 sa8d[32x32] 5.36x8274.18 44361.61 sa8d[32x64] 3.86x23024.0488901.80 sa8d[64x64] 4.35x45509.79198165.11 diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 30 11:23:38 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:09:45 2015 +0530 @@ -1313,6 +1313,9 @@ } if (cpuMask & X265_CPU_AVX2) { +#if X265_DEPTH == 12 +ASSIGN_SA8D(avx2); +#endif p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); // TODO: the planecopy_sp is really planecopy_SC now, must be fix it diff -r e2e507ffe752 -r f8b0ce4e9f40 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Nov 30 11:23:38 2015 +0530 +++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:09:45 2015 +0530 @@ -6499,6 +6499,1357 @@ %endif ; !ARCH_X86_64 %endmacro ; SA8D + +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +INIT_YMM avx2 +cglobal sa8d_8x8_avx2 +pmovzxwdm0, [r0] +pmovzxwdm9, [r2] +psubd m0, m9 + +pmovzxwdm1, [r0 + r1] +pmovzxwdm9, [r2 + r3] +psubd m1, m9 + +pmovzxwdm2, [r0 + r1 * 2] +pmovzxwdm9, [r2 + r3 * 2] +psubd m2, m9 + +pmovzxwdm8, [r0 + r4] +pmovzxwdm9, [r2 + r5] +psubd m8, m9 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxwdm4, [r0] +pmovzxwdm9, [r2] +psubd m4, m9 + +pmovzxwdm5, [r0 + r1] +pmovzxwdm9, [r2 + r3] +psubd m5, m9 + +pmovzxwdm3, [r0 + r1 * 2] +pmovzxwdm9, [r2 + r3 * 2] +psubd m3, m9 + +pmovzxwdm7, [r0 + r4] +pmovzxwdm9, [r2 + r5] +psubd m7, m9 + +movam6, m0 +paddd m0, m1 +psubd m1, m6 +movam6, m2 +paddd m2, m8 +psubd m8, m6 +movam6, m0 + +punpckldq m0, m1 +punpckhdq m6, m1 + +movam1, m0 +paddd 
m0, m6 +psubd m6, m1 +movam1, m2 + +punpckldq m2, m8 +punpckhdq m1, m8 + +movam8, m2 +paddd m2, m1 +psubd m1, m8 +movam8, m4 +paddd m4, m5 +psubd m5, m8 +movam8, m3 +paddd m3, m7 +psubd m7, m8 +movam8, m4 + +punpckldq m4, m5 +punpckhdq m8, m5 + +movam5, m4 +paddd m4, m8 +psubd m8, m5 +movam5, m3 +punpckldq m3, m7 +punpckhdq m5, m7 + +movam7, m3 +paddd m3, m5 +psubd m5, m7 +movam7, m0 +paddd m0, m2 +psubd m2, m7 +movam7, m6 +paddd m6, m1 +psubd m1, m7 +movam7, m0 + +punpcklqdq m0, m2 +punpckhqdq m7, m2 + +movam2, m0 +paddd m0, m7 +psubd m7, m2 +movam2, m6 + +punpcklqdq m6, m1 +punpckhqdq m2, m1 + +movam1, m6 +paddd m6, m2 +psubd m2, m1 +movam1, m4 +paddd m4, m3 +psubd m3, m1 +movam1, m8 +paddd m8, m5 +psubd m5, m1 +movam1, m4 + +punpcklqdq m4, m3 +punpckhqdq m1, m3 + +movam3, m4 +paddd m4, m1 +psubd m1, m3 +movam3, m8 + +punpcklqdq m8, m5 +punpckhqdq m3, m5 + +movam5, m8 +paddd m8, m3 +psubd m3, m5 +movam5, m0 +paddd m0, m4 +psubd m4, m5 +movam5, m7 +paddd m7, m1 +psubd m1, m5 +movam5, m0 + +vinserti128 m0, m0, xm4, 1 +vperm2i128 m5, m5, m4, 00110001b + +pxorm4, m4 +psubd m4, m0 +pmaxsd m0, m4 +pxorm4, m4 +psubd m4, m5 +pmaxsd m5, m4 +pmaxsd m0, m5 +movam4, m7 + +vinserti128 m7, m7, xm1, 1 +vperm2i128 m4, m4, m1, 001
[x265] [PATCH] use 32-bits multiply in mbtree_propagate_cost to avoid intraCost overflow
# HG changeset patch # User Min Chen# Date 1447865933 21600 # Wed Nov 18 10:58:53 2015 -0600 # Node ID d4e8af415c2ea939f1c82cf2dc1561fee20847de # Parent ad15f3756ad888b99a4ba868b857e09909dae226 use 32-bits multiply in mbtree_propagate_cost to avoid intraCost overflow diff -r ad15f3756ad8 -r d4e8af415c2e source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Fri Nov 06 12:33:51 2015 +0530 +++ b/source/common/x86/mc-a2.asm Wed Nov 18 10:58:53 2015 -0600 @@ -1019,15 +1019,11 @@ por m3, m1 movdm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 8) -pmaddwd m0, m2 -%else punpckldq m2, m2 punpckldq m0, m0 pmuludq m0, m2 pshufd m2, m2, q3120 pshufd m0, m0, q3120 -%endif punpcklwd m1, m4 cvtdq2pdm0, m0 @@ -1072,15 +1068,11 @@ por m3, m1 movdm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 8) -pmaddwd m0, m2 -%else -punpckldq m2, m2 ; DWORD [- 1 - 0] +punpckldq m2, m2 ; DWORD [_ 1 _ 0] punpckldq m0, m0 pmuludq m0, m2 ; QWORD [m1 m0] pshufd m2, m2, q3120 pshufd m0, m0, q3120 -%endif punpcklwd m1, m4 cvtdq2pdm0, m0 mulpd m0, m6 ; intra*invq*fps_factor>>8 @@ -1120,11 +1112,7 @@ pminsd xm3, xm2 pmovzxwdxm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 8) -pmaddwd xm0, xm2 -%else pmulld xm0, xm2 -%endif cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop %if cpuflag(avx2) @@ -1166,11 +1154,7 @@ movdxm1, [r1+r5*2] ; prop pmovzxwdxm1, xm1 -%if (BIT_DEPTH <= 8) -pmaddwd xm0, xm2 -%else pmulld xm0, xm2 -%endif cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop %if cpuflag(avx2) @@ -1204,11 +1188,7 @@ movzx r6d, word [r1+r5*2] ; prop movdxm1, r6d -%if (BIT_DEPTH <= 8) -pmaddwd xm0, xm2 -%else pmulld xm0, xm2 -%endif cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop %if cpuflag(avx2) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix inconsistent crash due to unaligned NR buffer in denoiseDct SSE4 asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1447829883 -19800 # Wed Nov 18 12:28:03 2015 +0530 # Node ID 653430a3de3f9ba342922ee6ea46d4cf52c1eb39 # Parent e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46 asm: fix inconsistent crash due to unaligned NR buffer in denoiseDct SSE4 asm Also, fixes warning C4316: object allocated on the heap may not be aligned 16 diff -r e8f9a60d4cd9 -r 653430a3de3f source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmMon Nov 16 16:44:33 2015 +0530 +++ b/source/common/x86/dct8.asmWed Nov 18 12:28:03 2015 +0530 @@ -2115,15 +2115,15 @@ mova m0, [r0] pabswm1, m0 -mova m2, [r1] +movu m2, [r1] pmovsxwd m3, m1 padddm2, m3 -mova [r1], m2 -mova m2, [r1 + 16] +movu [r1], m2 +movu m2, [r1 + 16] psrldq m3, m1, 8 pmovsxwd m4, m3 padddm2, m4 -mova [r1 + 16], m2 +movu [r1 + 16], m2 movu m3, [r2] psubusw m1, m3 diff -r e8f9a60d4cd9 -r 653430a3de3f source/encoder/encoder.h --- a/source/encoder/encoder.h Mon Nov 16 16:44:33 2015 +0530 +++ b/source/encoder/encoder.h Wed Nov 18 12:28:03 2015 +0530 @@ -79,7 +79,7 @@ { public: -ALIGN_VAR_16(uint32_t, m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]); +uint32_t m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; uint32_t m_countEmergency[MAX_NUM_TR_CATEGORIES]; uint16_t (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix output change due to overflow in mbtree_propagate_cost 10bit asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1447828315 -19800 # Wed Nov 18 12:01:55 2015 +0530 # Node ID 58c177d2e182e5b633670024c567b535eb49614f # Parent e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46 asm: fix output change due to overflow in mbtree_propagate_cost 10bit asm diff -r e8f9a60d4cd9 -r 58c177d2e182 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Mon Nov 16 16:44:33 2015 +0530 +++ b/source/common/x86/mc-a2.asm Wed Nov 18 12:01:55 2015 +0530 @@ -1019,7 +1019,7 @@ por m3, m1 movdm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 10) +%if (BIT_DEPTH <= 8) pmaddwd m0, m2 %else punpckldq m2, m2 @@ -1072,7 +1072,7 @@ por m3, m1 movdm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 10) +%if (BIT_DEPTH <= 8) pmaddwd m0, m2 %else punpckldq m2, m2 ; DWORD [- 1 - 0] @@ -1120,7 +1120,7 @@ pminsd xm3, xm2 pmovzxwdxm1, [r1+r5*2] ; prop -%if (BIT_DEPTH <= 10) +%if (BIT_DEPTH <= 8) pmaddwd xm0, xm2 %else pmulld xm0, xm2 @@ -1166,7 +1166,7 @@ movdxm1, [r1+r5*2] ; prop pmovzxwdxm1, xm1 -%if (BIT_DEPTH <= 10) +%if (BIT_DEPTH <= 8) pmaddwd xm0, xm2 %else pmulld xm0, xm2 @@ -1204,7 +1204,7 @@ movzx r6d, word [r1+r5*2] ; prop movdxm1, r6d -%if (BIT_DEPTH <= 10) +%if (BIT_DEPTH <= 8) pmaddwd xm0, xm2 %else pmulld xm0, xm2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix intrapred_planar16x16 SSE4 code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1446700839 -19800 # Thu Nov 05 10:50:39 2015 +0530 # Node ID 69bd13c0047d2c1a3b232bea40b72e436baa618e # Parent 3103afbd31fa9b26533f06202516a511ee221439 asm: fix intrapred_planar16x16 SSE4 code for main12 diff -r 3103afbd31fa -r 69bd13c0047d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 05 06:13:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 05 10:50:39 2015 +0530 @@ -1144,9 +1144,9 @@ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4); p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4); +p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); #if X265_DEPTH <= 10 -p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4); #endif ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r 3103afbd31fa -r 69bd13c0047d source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Thu Nov 05 06:13:51 2015 +0530 +++ b/source/common/x86/intrapred16.asm Thu Nov 05 10:50:39 2015 +0530 @@ -2427,6 +2427,118 @@ ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--- INIT_XMM sse4 +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +cglobal intra_pred_planar16, 3,5,12 +add r1d, r1d + +pmovzxwdm2, [r2 + 2] +pmovzxwdm7, [r2 + 10] +pmovzxwdm10, [r2 + 18] +pmovzxwdm0, [r2 + 26] + +movzx r3d, word [r2 + 34] ; topRight = above[16] +lea r4, [pd_planar16_mul1] + +movdm3, r3d +pshufd m3, m3, 0 ; topRight + +pslld m8, m3, 2 +pmulld m3, m3, [r4 + 0*mmsize] ; (x + 1) * topRight +paddd m9, m3, m8 +paddd m4, m9, m8 +paddd m8, m4 + +pslld m1, m2, 4 +pslld m6, m7, 4 +pslld m5, m10, 4 +pslld m11, m0, 4 +psubd m1, m2 +psubd m6, m7 +psubd m5, m10 +psubd m11, m0 + +paddd m4, m5 +paddd m3, m1 +paddd m8, m11 +paddd m9, m6 + +movam5, [pd_16] +paddd m3, m5 +paddd m9, m5 
+paddd m4, m5 +paddd m8, m5 + +movzx r4d, word [r2 + 98] ; bottomLeft = left[16] +movdm6, r4d +pshufd m6, m6, 0 ; bottomLeft + +paddd m4, m6 +paddd m3, m6 +paddd m8, m6 +paddd m9, m6 + +psubd m1, m6, m0 ; column 12-15 +psubd m11, m6, m10; column 8-11 +psubd m10, m6, m7 ; column 4-7 +psubd m6, m2 ; column 0-3 + +add r2, 66 +lea r4, [pd_planar16_mul0] + +%macro INTRA_PRED_PLANAR16 1 +movzx r3d, word [r2] +movdm5, r3d +pshufd m5, m5, 0 + +pmulld m0, m5, [r4 + 3*mmsize] ; column 12-15 +pmulld m2, m5, [r4 + 2*mmsize] ; column 8-11 +pmulld m7, m5, [r4 + 1*mmsize] ; column 4-7 +pmulld m5, m5, [r4 + 0*mmsize] ; column 0-3 + +paddd m0, m8 +paddd m2, m4 +paddd m7, m9 +paddd m5, m3 + +paddd m8, m1 +paddd m4, m11 +paddd m9, m10 +paddd m3, m6 + +psrad m0, 5 +psrad m2, 5 +psrad m7, 5 +psrad m5, 5 + +packusdwm2, m0 +packusdwm5, m7 +movu[r0], m5 +movu[r0 + mmsize], m2 + +add r2, 2 +lea r0, [r0 + r1] +%endmacro + +INTRA_PRED_PLANAR16 0 +INTRA_PRED_PLANAR16 1 +INTRA_PRED_PLANAR16 2 +INTRA_PRED_PLANAR16 3 +INTRA_PRED_PLANAR16 4 +INTRA_PRED_PLANAR16 5 +INTRA_PRED_PLANAR16 6 +INTRA_PRED_PLANAR16 7 +INTRA_PRED_PLANAR16 8 +INTRA_PRED_PLANAR16 9 +INTRA_PRED_PLANAR16 10 +INTRA_PRED_PLANAR16 11 +INTRA_PRED_PLANAR16 12 +INTRA_PRED_PLANAR16 13 +INTRA_PRED_PLANAR16 14 +INTRA_PRED_PLANAR16 15 +RET + +%else +; code for BIT_DEPTH == 10 cglobal intra_pred_planar16, 3,3,8 add r1, r1 movum2, [r2 + 2] @@ -2504,6 +2616,7 @@ INTRA
[x265] [PATCH] asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1446645042 -19800 # Wed Nov 04 19:20:42 2015 +0530 # Node ID 25bada1bb5494fc12d62e87d1b7b788307dd963f # Parent c11dd97a8b999414c60dceef8620d3d9055cf4c1 asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD The SSE2 asm code reads and write extra 4 bytes if loop counter is not multiple of 2 as SSE2 asm code process 2 int values in single iteration The AVX asm code reads and write extra 4,8 or 12 bytes if loop counter is not multiple of 4 as AVX asm code process 4 int values in single iteration diff -r c11dd97a8b99 -r 25bada1bb549 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Wed Nov 04 17:06:33 2015 +0530 +++ b/source/common/x86/mc-a2.asm Wed Nov 04 19:20:42 2015 +0530 @@ -995,7 +995,8 @@ ; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len ) ;- INIT_XMM sse2 -cglobal mbtree_propagate_cost, 6,6,7 +cglobal mbtree_propagate_cost, 7,7,7 +dec r6d movsd m6, [r5] mulpd m6, [pd_inv256] xor r5d, r5d @@ -1044,8 +1045,40 @@ movh[r0+r5*4], m0 add r5d, 2 -cmp r5d, r6m +cmp r5d, r6d jl .loop + +xor r6d, r5d +jnz .even +movdm2, [r2+r5*4] ; intra +movdm0, [r4+r5*4] ; invq +movdm3, [r3+r5*2] ; inter +pandm3, m5 +punpcklwd m3, m4 + +; PMINSD +pcmpgtd m1, m2, m3 +pandm3, m1 +pandn m1, m2 +por m3, m1 + +movdm1, [r1+r5*2] ; prop +pmaddwd m0, m2 +punpcklwd m1, m4 +cvtdq2pdm0, m0 +mulpd m0, m6 ; intra*invq*fps_factor>>8 +cvtdq2pdm1, m1 ; prop +addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) +cvtdq2pdm1, m2 ; intra +psubd m2, m3 ; intra - inter +cvtdq2pdm2, m2 ; intra - inter +mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + +divpd m0, m1 +addpd m0, [pd_0_5] +cvttpd2dqm0, m0 +movd[r0+r5*4], m0 +.even: RET @@ -1055,7 +1088,8 @@ ;- ; FIXME: align loads/stores to 16 bytes %macro MBTREE_AVX 0 -cglobal mbtree_propagate_cost, 6,6,7 +cglobal mbtree_propagate_cost, 7,7,7 +sub r6d, 3 vbroadcastsdm6, [r5] mulpd m6, [pd_inv256] xor r5d, r5d @@ 
-1089,9 +1123,81 @@ cvttpd2dq xm0, m0 movu[r0+r5*4], xm0 -add r5d, 4 -cmp r5d, r6m +add r5d, 4 ; process 4 values in one iteration +cmp r5d, r6d jl .loop + +add r6d, 3 +xor r6d, r5d +jz .even ; if loop counter is multiple of 4, all values are processed + +and r6d, 3 ; otherwise, remaining unprocessed values must be 1, 2 or 3 +cmp r6d, 1 +je .process1 ; if only 1 value is unprocessed + +; process 2 values here +movqxm2, [r2+r5*4] ; intra +movqxm0, [r4+r5*4] ; invq +movdxm3, [r3+r5*2] ; inter +pmovzxwdxm3, xm3 +pandxm3, xm5 +pminsd xm3, xm2 + +movdxm1, [r1+r5*2] ; prop +pmovzxwdxm1, xm1 +pmaddwd xm0, xm2 +cvtdq2pdm0, xm0 +cvtdq2pdm1, xm1 ; prop +%if cpuflag(avx2) +fmaddpd m0, m0, m6, m1 +%else +mulpd m0, m6 ; intra*invq*fps_factor>>8 +addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) +%endif +cvtdq2pdm1, xm2 ; intra +psubd xm2, xm3; intra - inter +cvtdq2pdm2, xm2 ; intra - inter +mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + +divpd m0, m1 +addpd m0, [pd_0_5] +cvttpd2dq xm0, m0 +movq[r0+r5*4], xm0 + +xor r6d, 2 +jz .even +add r5d, 2 + +; process 1 value here +.process1: +movdxm2, [r2+r5*4] ; intra +movdxm0, [r4+r5*4] ; invq +movzx r6d, word [r3+r5*2] ; inter +movdxm3, r6d +pandxm3, xm5 +pminsd xm3, xm2 + +movzx r6d, word [r1+r5*2] ; prop +movdxm1, r6d +pmaddwd xm0, xm2 +cvtdq
Re: [x265] [PATCH] fix invalid Instruction Set provided in CLI if CPU doesn't support it
Ok. I will check if we can use AND mask and provide more information to user. On Tue, Nov 3, 2015 at 10:36 AM, Deepthi Nandakumar < deep...@multicorewareinc.com> wrote: > Since the idea here is to correctly log a user-generated error (user-cpuid > > detected cpuid), the patch is headed in the right direction. > > Min's suggestion on using an AND mask sounds good, and can you also make > the warning more informative (print user-cpuid, and the cpuid we're > defaulting to) ? > > On Thu, Oct 29, 2015 at 11:16 PM, Steve Borho <st...@borho.org> wrote: > >> On 10/28, dnyanesh...@multicorewareinc.com wrote: >> > # HG changeset patch >> > # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> >> > # Date 1446021877 -19800 >> > # Wed Oct 28 14:14:37 2015 +0530 >> > # Node ID 975087370d14e90cd63edecb34fb4bf2feda2468 >> > # Parent 6563218ce342c30bfd4f9bc172a1dab510e6e55b >> > fix invalid Instruction Set provided in CLI if CPU doesn't support it >> > >> > This patch avoids crash/invalid instructions when we provide >> instruction sets to >> > be used are higher than the cpu capabilities. >> > >> > For example, if our cpu supports instruction sets upto AVX and we >> provide >> > --asm "avx2" (AVX2 is higher than AVX) then it will show warning and >> use default >> > x265 detected intruction sets. >> >> The whole point of having this override is in case our CPU detection is >> somehow wrong. The user needs to be able to override the detection mask. >> >> That said.. if the user provided mask has bits set that were not >> detected, it's ok to log a serious warning that says you think the >> encoder is about to break and it is the user's fault. >> >> BTW: this feature is often used for benchmarking, to disable certain >> optimizations piecemeal, but that is not the primary reason why it >> exists. 
>> >> > diff -r 6563218ce342 -r 975087370d14 source/common/primitives.cpp >> > --- a/source/common/primitives.cppMon Oct 26 12:13:53 2015 +0530 >> > +++ b/source/common/primitives.cppWed Oct 28 14:14:37 2015 +0530 >> > @@ -238,6 +238,15 @@ >> > primitives.cu[i].intra_pred_allangs = NULL; >> > >> > #if ENABLE_ASSEMBLY >> > + >> > +if ((uint32_t)param->cpuid > X265_NS::cpu_detect()) >> > +{ >> > +if (param->logLevel >= X265_LOG_INFO) >> > +x265_log(param, X265_LOG_WARNING, "Unsupported CPUID >> provided in CLI, so choosing x265 detected CPUID!\n"); >> > + >> > +param->cpuid = X265_NS::cpu_detect(); >> > +} >> > + >> > setupInstrinsicPrimitives(primitives, param->cpuid); >> > setupAssemblyPrimitives(primitives, param->cpuid); >> > #endif >> > ___ >> > x265-devel mailing list >> > x265-devel@videolan.org >> > https://mailman.videolan.org/listinfo/x265-devel >> >> -- >> Steve Borho >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > > -- > Deepthi Nandakumar > Engineering Manager, x265 > Multicoreware, Inc > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] fix invalid Instruction Set provided in CLI if CPU doesn't support it
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1446021877 -19800 # Wed Oct 28 14:14:37 2015 +0530 # Node ID 975087370d14e90cd63edecb34fb4bf2feda2468 # Parent 6563218ce342c30bfd4f9bc172a1dab510e6e55b fix invalid Instruction Set provided in CLI if CPU doesn't support it This patch avoids crash/invalid instructions when the instruction sets provided to be used are higher than the cpu capabilities. For example, if our cpu supports instruction sets up to AVX and we provide --asm "avx2" (AVX2 is higher than AVX) then it will show a warning and use the default x265 detected instruction sets. diff -r 6563218ce342 -r 975087370d14 source/common/primitives.cpp --- a/source/common/primitives.cpp Mon Oct 26 12:13:53 2015 +0530 +++ b/source/common/primitives.cpp Wed Oct 28 14:14:37 2015 +0530 @@ -238,6 +238,15 @@ primitives.cu[i].intra_pred_allangs = NULL; #if ENABLE_ASSEMBLY + +if ((uint32_t)param->cpuid > X265_NS::cpu_detect()) +{ +if (param->logLevel >= X265_LOG_INFO) +x265_log(param, X265_LOG_WARNING, "Unsupported CPUID provided in CLI, so choosing x265 detected CPUID!\n"); + +param->cpuid = X265_NS::cpu_detect(); +} + setupInstrinsicPrimitives(primitives, param->cpuid); setupAssemblyPrimitives(primitives, param->cpuid); #endif ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1445588852 -19800 # Fri Oct 23 13:57:32 2015 +0530 # Node ID 0fb5a67c2f5ea4f3fe1a7e0dcbc0c5c117dd6dfc # Parent a7251c3e0ef810b95bb25be5371035208e36996d asm: fix intrapred_planar16x16 sse4 code for main12 diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Oct 22 09:12:28 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Oct 23 13:57:32 2015 +0530 @@ -1145,8 +1145,9 @@ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4); p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4); +p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); + #if X265_DEPTH <= 10 -p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4); #endif ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Thu Oct 22 09:12:28 2015 +0530 +++ b/source/common/x86/const-a.asm Fri Oct 23 13:57:32 2015 +0530 @@ -122,6 +122,7 @@ const pd_2, times 8 dd 2 const pd_4, times 4 dd 4 const pd_8, times 4 dd 8 +const pd_15,times 8 dd 15 const pd_16,times 8 dd 16 const pd_31,times 4 dd 31 const pd_32,times 8 dd 32 @@ -136,7 +137,8 @@ const pd_524416,times 4 dd 524416 const pd_n32768,times 8 dd 0x8000 const pd_n131072, times 4 dd 0xfffe - +const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 const popcnt_table diff -r a7251c3e0ef8 -r 0fb5a67c2f5e source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Thu Oct 22 09:12:28 2015 +0530 +++ b/source/common/x86/intrapred16.asm Fri Oct 23 13:57:32 2015 +0530 @@ -109,6 +109,7 @@ 
cextern pw_16 cextern pw_31 cextern pw_32 +cextern pd_15 cextern pd_16 cextern pd_31 cextern pd_32 @@ -123,6 +124,8 @@ cextern pb_unpackwq1 cextern pb_unpackwq2 cextern pw_planar16_mul +cextern pd_planar16_mul0 +cextern pd_planar16_mul1 cextern pw_planar32_mul ;--- @@ -2216,6 +2219,114 @@ ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--- INIT_XMM sse4 +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +cglobal intra_pred_planar16, 3,5,12 +add r1d, r1d + +pmovzxwdm2, [r2 + 2] +pmovzxwdm7, [r2 + 10] +pmovzxwdm10, [r2 + 18] +pmovzxwdm0, [r2 + 26] + +movzx r3d, word [r2 + 34] ; topRight = above[16] +lea r4, [pd_planar16_mul1] + +movdm3, r3d +pshufd m3, m3, 0 ; topRight + +pmulld m8, m3, [r4 + 3*mmsize] ; (x + 1) * topRight +pmulld m4, m3, [r4 + 2*mmsize] ; (x + 1) * topRight +pmulld m9, m3, [r4 + 1*mmsize] ; (x + 1) * topRight +pmulld m3, m3, [r4 + 0*mmsize] ; (x + 1) * topRight + +movam11, [pd_15] +pmulld m1, m2, m11; (blkSize - 1 - y) * above[x] +pmulld m6, m7, m11; (blkSize - 1 - y) * above[x] +pmulld m5, m10, m11; (blkSize - 1 - y) * above[x] +pmulld m11, m0 ; (blkSize - 1 - y) * above[x] + +paddd m4, m5 +paddd m3, m1 +paddd m8, m11 +paddd m9, m6 + +movam5, [pd_16] +paddd m3, m5 +paddd m9, m5 +paddd m4, m5 +paddd m8, m5 + +movzx r4d, word [r2 + 98] ; bottomLeft = left[16] +movdm6, r4d +pshufd m6, m6, 0 ; bottomLeft + +paddd m4, m6 +paddd m3, m6 +paddd m8, m6 +paddd m9, m6 + +psubd m1, m6, m0 ; column 12-15 +psubd m11, m6, m10; column 8-11 +psubd m10, m6, m7 ; column 4-7 +psubd
Re: [x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12
On Wed, Oct 21, 2015 at 7:58 AM, chen <chenm...@163.com> wrote: > > > At 2015-10-20 18:38:56,dnyanesh...@multicorewareinc.com wrote: > ># HG changeset patch > ># User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> > ># Date 1445337446 -19800 > ># Tue Oct 20 16:07:26 2015 +0530 > ># Node ID 987b5f8c2c447dc5b0e410d37f6212470feecd1c > ># Parent f335a9a7b9083dcb2fc7a1cadc2dbeffdd6388f2 > >asm: fix intrapred_planar16x16 sse4 code for main12 > > > >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/asm-primitives.cpp > >--- a/source/common/x86/asm-primitives.cpp Mon Oct 19 12:42:52 2015 +0530 > >+++ b/source/common/x86/asm-primitives.cpp Tue Oct 20 16:07:26 2015 +0530 > >@@ -1145,8 +1145,9 @@ > > p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = > > PFX(intra_pred_planar4_sse4); > > p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = > > PFX(intra_pred_planar8_sse4); > > > >+p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = > >PFX(intra_pred_planar16_sse4); > >+ > > #if X265_DEPTH <= 10 > >-p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = > >PFX(intra_pred_planar16_sse4); > > p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = > > PFX(intra_pred_planar32_sse4); > > #endif > > ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); > >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/const-a.asm > >--- a/source/common/x86/const-a.asm Mon Oct 19 12:42:52 2015 +0530 > >+++ b/source/common/x86/const-a.asm Tue Oct 20 16:07:26 2015 +0530 > >@@ -122,6 +122,7 @@ > > const pd_2, times 8 dd 2 > > const pd_4, times 4 dd 4 > > const pd_8, times 4 dd 8 > >+const pd_15,times 8 dd 15 > > const pd_16,times 8 dd 16 > > const pd_31,times 4 dd 31 > > const pd_32,times 8 dd 32 > >@@ -136,7 +137,8 @@ > > const pd_524416,times 4 dd 524416 > > const pd_n32768,times 8 dd 0x8000 > > const pd_n131072, times 4 dd 0xfffe > >- > >+const pd_planar16_mul, times 1 dd 15, 14, 13, 12, 11, 10, 9, > > 8, 7, 6, 5, 4, 3, 2, 1, 0 > >+const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, > > 8, 9, 10, 11, 12, 13, 14, 15, 16 > > const 
trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, > > 7 > > > > const popcnt_table > >diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/intrapred16.asm > >--- a/source/common/x86/intrapred16.asm Mon Oct 19 12:42:52 2015 +0530 > >+++ b/source/common/x86/intrapred16.asm Tue Oct 20 16:07:26 2015 +0530 > >@@ -109,6 +109,7 @@ > > cextern pw_16 > > cextern pw_31 > > cextern pw_32 > >+cextern pd_15 > > cextern pd_16 > > cextern pd_31 > > cextern pd_32 > >@@ -123,6 +124,8 @@ > > cextern pb_unpackwq1 > > cextern pb_unpackwq2 > > cextern pw_planar16_mul > >+cextern pd_planar16_mul > >+cextern pd_planar16_mul1 > > cextern pw_planar32_mul > > > > ;--- > >@@ -2216,6 +2219,114 @@ > > ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, > > int filter) > > ;--- > > INIT_XMM sse4 > >+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 > >+cglobal intra_pred_planar16, 3,5,12 > >+add r1d, r1d > >+ > >+pmovzxwdm2, [r2 + 2] > >+pmovzxwdm7, [r2 + 10] > >+pmovzxwdm10, [r2 + 18] > >+pmovzxwdm0, [r2 + 26] > >+ > >+movzx r3d, word [r2 + 34] ; topRight = > >above[16] > >+lea r4, [pd_planar16_mul1] > >+ > >+movdm3, r3d > >+pshufd m3, m3, 0 ; topRight > >+ > >+pmulld m8, m3, [r4 + 3*mmsize] ; (x + 1) * > >topRight > >+pmulld m4, m3, [r4 + 2*mmsize] ; (x + 1) * > >topRight > >+pmulld m9, m3, [r4 + 1*mmsize] ; (x + 1) * > >topRight > >+pmulld m3, m3, [r4 + 0*mmsize] ; (x + 1) * > >topRight > >+ > >+movam11, [pd_15] > >+pmulld m1, m2, m11; (blkSize - 1 > >- y) * above[x] > >+pmu
[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1445337446 -19800 # Tue Oct 20 16:07:26 2015 +0530 # Node ID 987b5f8c2c447dc5b0e410d37f6212470feecd1c # Parent f335a9a7b9083dcb2fc7a1cadc2dbeffdd6388f2 asm: fix intrapred_planar16x16 sse4 code for main12 diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Oct 19 12:42:52 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Oct 20 16:07:26 2015 +0530 @@ -1145,8 +1145,9 @@ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4); p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4); +p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); + #if X265_DEPTH <= 10 -p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4); #endif ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Mon Oct 19 12:42:52 2015 +0530 +++ b/source/common/x86/const-a.asm Tue Oct 20 16:07:26 2015 +0530 @@ -122,6 +122,7 @@ const pd_2, times 8 dd 2 const pd_4, times 4 dd 4 const pd_8, times 4 dd 8 +const pd_15,times 8 dd 15 const pd_16,times 8 dd 16 const pd_31,times 4 dd 31 const pd_32,times 8 dd 32 @@ -136,7 +137,8 @@ const pd_524416,times 4 dd 524416 const pd_n32768,times 8 dd 0x8000 const pd_n131072, times 4 dd 0xfffe - +const pd_planar16_mul, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 const popcnt_table diff -r f335a9a7b908 -r 987b5f8c2c44 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Mon Oct 19 12:42:52 2015 +0530 +++ b/source/common/x86/intrapred16.asm Tue Oct 20 16:07:26 2015 +0530 @@ -109,6 +109,7 @@ 
cextern pw_16 cextern pw_31 cextern pw_32 +cextern pd_15 cextern pd_16 cextern pd_31 cextern pd_32 @@ -123,6 +124,8 @@ cextern pb_unpackwq1 cextern pb_unpackwq2 cextern pw_planar16_mul +cextern pd_planar16_mul +cextern pd_planar16_mul1 cextern pw_planar32_mul ;--- @@ -2216,6 +2219,114 @@ ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--- INIT_XMM sse4 +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +cglobal intra_pred_planar16, 3,5,12 +add r1d, r1d + +pmovzxwdm2, [r2 + 2] +pmovzxwdm7, [r2 + 10] +pmovzxwdm10, [r2 + 18] +pmovzxwdm0, [r2 + 26] + +movzx r3d, word [r2 + 34] ; topRight = above[16] +lea r4, [pd_planar16_mul1] + +movdm3, r3d +pshufd m3, m3, 0 ; topRight + +pmulld m8, m3, [r4 + 3*mmsize] ; (x + 1) * topRight +pmulld m4, m3, [r4 + 2*mmsize] ; (x + 1) * topRight +pmulld m9, m3, [r4 + 1*mmsize] ; (x + 1) * topRight +pmulld m3, m3, [r4 + 0*mmsize] ; (x + 1) * topRight + +movam11, [pd_15] +pmulld m1, m2, m11; (blkSize - 1 - y) * above[x] +pmulld m6, m7, m11; (blkSize - 1 - y) * above[x] +pmulld m5, m10, m11; (blkSize - 1 - y) * above[x] +pmulld m11, m0 ; (blkSize - 1 - y) * above[x] + +paddd m4, m5 +paddd m3, m1 +paddd m8, m11 +paddd m9, m6 + +movam5, [pd_16] +paddd m3, m5 +paddd m9, m5 +paddd m4, m5 +paddd m8, m5 + +movzx r4d, word [r2 + 98] ; bottomLeft = left[16] +movdm6, r4d +pshufd m6, m6, 0 ; bottomLeft + +paddd m4, m6 +paddd m3, m6 +paddd m8, m6 +paddd m9, m6 + +psubd m1, m6, m0 ; column 12-15 +psubd m11, m6, m10; column 8-11 +psubd m10, m6, m7 ; column 4-7 +psubd
[x265] [PATCH] asm: fix intrapred_planar16x16 sse4 code for main12
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1445245458 -19800 # Mon Oct 19 14:34:18 2015 +0530 # Node ID 76d4fc7264a0d22218db30f65bb58095c294db1b # Parent 04575a459a160162391fcf1a12e8e6f2e81e95b4 asm: fix intrapred_planar16x16 sse4 code for main12 diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Sep 30 11:22:16 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Oct 19 14:34:18 2015 +0530 @@ -1145,8 +1145,9 @@ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4); p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4); +p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); + #if X265_DEPTH <= 10 -p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4); p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4); #endif ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Sep 30 11:22:16 2015 +0530 +++ b/source/common/x86/const-a.asm Mon Oct 19 14:34:18 2015 +0530 @@ -122,6 +122,7 @@ const pd_2, times 8 dd 2 const pd_4, times 4 dd 4 const pd_8, times 4 dd 8 +const pd_15,times 8 dd 15 const pd_16,times 8 dd 16 const pd_31,times 4 dd 31 const pd_32,times 8 dd 32 @@ -136,7 +137,8 @@ const pd_524416,times 4 dd 524416 const pd_n32768,times 8 dd 0x8000 const pd_n131072, times 4 dd 0xfffe - +const pd_planar16_mul, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 const popcnt_table diff -r 04575a459a16 -r 76d4fc7264a0 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Wed Sep 30 11:22:16 2015 +0530 +++ b/source/common/x86/intrapred16.asm Mon Oct 19 14:34:18 2015 +0530 @@ -109,6 +109,7 @@ 
cextern pw_16 cextern pw_31 cextern pw_32 +cextern pd_15 cextern pd_16 cextern pd_31 cextern pd_32 @@ -123,6 +124,8 @@ cextern pb_unpackwq1 cextern pb_unpackwq2 cextern pw_planar16_mul +cextern pd_planar16_mul +cextern pd_planar16_mul1 cextern pw_planar32_mul ;--- @@ -2216,6 +2219,110 @@ ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--- INIT_XMM sse4 +%if ARCH_X86_64 == 1 && BIT_DEPTH == 12 +cglobal intra_pred_planar16, 3,5,12 +add r1d, r1d + +pmovzxwdm2, [r2 + 2] +pmovzxwdm7, [r2 + 10] +pmovzxwdm10, [r2 + 18] +pmovzxwdm0, [r2 + 26] + +movzx r3d, word [r2 + 34] ; topRight = above[16] +movzx r4d, word [r2 + 98] ; bottomLeft = left[16] + +movdm3, r3d +pshufd m3, m3, 0 ; topRight + +pmulld m8, m3, [pd_planar16_mul1 + 3*mmsize] ; (x + 1) * topRight +pmulld m4, m3, [pd_planar16_mul1 + 2*mmsize] ; (x + 1) * topRight +pmulld m9, m3, [pd_planar16_mul1 + 1*mmsize] ; (x + 1) * topRight +pmulld m3, m3, [pd_planar16_mul1 + 0*mmsize] ; (x + 1) * topRight + +pmulld m1, m2, [pd_15]; (blkSize - 1 - y) * above[x] +pmulld m6, m7, [pd_15]; (blkSize - 1 - y) * above[x] +pmulld m5, m10, [pd_15]; (blkSize - 1 - y) * above[x] +pmulld m11, m0, [pd_15]; (blkSize - 1 - y) * above[x] + +paddd m3, [pd_16] +paddd m9, [pd_16] +paddd m4, [pd_16] +paddd m8, [pd_16] + +paddd m4, m5 +paddd m3, m1 +paddd m8, m11 +paddd m9, m6 + +movdm6, r4d +pshufd m6, m6, 0 ; bottomLeft + +paddd m4, m6 +paddd m3, m6 +paddd m8, m6 +paddd m9, m6 + +psubd m1, m6, m0 ; column 12-15 +psubd m11, m6, m10; column 8-11 +psubd m10, m6, m7 ; column 4-7 +psubd m6, m2 ; column 0-3 + +add r2, 66 + +%macro INTR
[x265] [PATCH] multilib: fix multiple definition of pelFilterLumaStrong_c
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1444972708 -19800 # Fri Oct 16 10:48:28 2015 +0530 # Node ID 76a36eabd4be405fc4880d882499a754c3f190fa # Parent fe65544b6c40d7cd62c2b86275bf98b264b6edb0 multilib: fix multiple definition of pelFilterLumaStrong_c diff -r fe65544b6c40 -r 76a36eabd4be source/common/loopfilter.cpp --- a/source/common/loopfilter.cpp Wed Oct 07 13:42:41 2015 +0530 +++ b/source/common/loopfilter.cpp Fri Oct 16 10:48:28 2015 +0530 @@ -138,7 +138,7 @@ } } -void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) +static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) { for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm: asm code for deblocking filter horizontal and vertical
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1444286180 -19800 # Thu Oct 08 12:06:20 2015 +0530 # Node ID 86627e458e6e2e357fe1746067392c6984b8915f # Parent 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc asm: asm code for deblocking filter horizontal and vertical diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Oct 06 14:19:56 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Oct 08 12:06:20 2015 +0530 @@ -2541,6 +2541,9 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4); #if X86_64 +p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4); +p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4); + p.saoCuStatsBO = PFX(saoCuStatsBO_sse4); p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4); p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4); diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Tue Oct 06 14:19:56 2015 +0530 +++ b/source/common/x86/const-a.asm Thu Oct 08 12:06:20 2015 +0530 @@ -67,6 +67,7 @@ ;; 16-bit constants +const pw_n1,times 16 dw -1 const pw_1, times 16 dw 1 const pw_2, times 16 dw 2 const pw_3, times 16 dw 3 diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Tue Oct 06 14:19:56 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Oct 08 12:06:20 2015 +0530 @@ -37,6 +37,7 @@ SECTION .text cextern pb_1 +cextern pb_01 cextern pb_128 cextern pb_2 cextern pw_2 @@ -45,6 +46,8 @@ cextern pw_1 cextern hmul_16p cextern pb_4 +cextern pw_4 +cextern pw_n1 ; @@ -2231,6 +2234,248 @@ RET %endif ; ARCH_X86_64 +%if ARCH_X86_64 +;; argument registers used - +; r0- src +; r1- srcStep +; r2- offset +; r3- tcP +; r4- tcQ + +INIT_XMM sse4 +cglobal pelFilterLumaStrong_H, 5,7,10 +mov r1, r2 +neg r3d +neg r4d +neg r1 + +lea r5, [r2 * 3] +lea r6, [r1 * 3] + +pmovzxbwm4, [r0]; src[0] +pmovzxbwm3, [r0 + r1] ; src[-offset] +pmovzxbwm2, 
[r0 + r1 * 2] ; src[-offset * 2] +pmovzxbwm1, [r0 + r6] ; src[-offset * 3] +pmovzxbwm0, [r0 + r1 * 4] ; src[-offset * 4] +pmovzxbwm5, [r0 + r2] ; src[offset] +pmovzxbwm6, [r0 + r2 * 2] ; src[offset * 2] +pmovzxbwm7, [r0 + r5] ; src[offset * 3] + +paddw m0, m0 ; m0*2 +movam8, m2 +paddw m8, m3 ; m2 + m3 +paddw m8, m4 ; m2 + m3 + m4 +movam9, m8 +paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4 +paddw m8, m1 ; m2 + m3 + m4 + m1 +paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + m1 +paddw m9, m1 +paddw m0, m1 +paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5 +paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4 + +punpcklqdq m0, m9 +punpcklqdq m1, m3 + +paddw m3, m4 +movam9, m5 +paddw m9, m6 +paddw m7, m7 ; 2*m7 +paddw m9, m3 ; m3 + m4 + m5 + m6 +movam3, m9 +paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6 +paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6 +paddw m7, m6 +psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6 +paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7 +paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6 + +punpcklqdq m9, m8 +punpcklqdq m3, m7 +punpcklqdq m5, m2 +punpcklqdq m4, m6 + +movdm7, r3d ; -tcP +movdm2, r4d ; -tcQ +pshufb m7, [pb_01] +pshufb m2, [pb_01] +movam6, m2 +punpcklqdq m6, m7 + +paddw m0, [pw_4] +paddw m3, [pw_4] +paddw m9, [pw_2] + +psraw m0, 3 +psraw m3, 3 +psraw m9, 2 + +psubw m0, m1 +psubw m3, m4 +psubw m9, m5 + +pmaxsw m0, m7 +pmaxsw m3, m2 +pmaxsw m9, m6 +psignw m7, [pw_n1] +psignw m2, [pw_n1] +psignw m6, [pw_n1] +pminsw m0, m7 +pminsw m3, m2 +
[x265] [PATCH 1 of 2] asm: separated deblocking filter into horizontal & vertical primitives for asm
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1444121396 -19800 # Tue Oct 06 14:19:56 2015 +0530 # Node ID 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc # Parent f8ad1ff7074aab85a6cf376886014c88f46b7275 asm: separated deblocking filter into horizontal & vertical primitives for asm diff -r f8ad1ff7074a -r 38e4b94377fa source/common/deblock.cpp --- a/source/common/deblock.cpp Thu Oct 08 15:27:34 2015 -0500 +++ b/source/common/deblock.cpp Tue Oct 06 14:19:56 2015 +0530 @@ -280,31 +280,6 @@ * \param maskQ indicator to enable filtering on partQ * \param maskP1 decision weak filter/no filter for partP * \param maskQ1 decision weak filter/no filter for partQ */ -static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ) -{ -int32_t tc2 = 2 * tc; -int32_t tcP = (tc2 & maskP); -int32_t tcQ = (tc2 & maskQ); -for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) -{ -int16_t m4 = (int16_t)src[0]; -int16_t m3 = (int16_t)src[-offset]; -int16_t m5 = (int16_t)src[offset]; -int16_t m2 = (int16_t)src[-offset * 2]; -int16_t m6 = (int16_t)src[offset * 2]; -int16_t m1 = (int16_t)src[-offset * 3]; -int16_t m7 = (int16_t)src[offset * 3]; -int16_t m0 = (int16_t)src[-offset * 4]; -src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); -src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); -src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3); -src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); -src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5); -src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); -} -} - -/* Weak filter */ static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t 
tc, int32_t maskP, int32_t maskQ, int32_t maskP1, int32_t maskQ1) { @@ -446,7 +421,12 @@ useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3)); if (sw) -pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ); +{ +int32_t tc2 = 2 * tc; +int32_t tcP = (tc2 & maskP); +int32_t tcQ = (tc2 & maskQ); +primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ); +} else { int32_t sideThreshold = (beta + (beta >> 1)) >> 3; diff -r f8ad1ff7074a -r 38e4b94377fa source/common/loopfilter.cpp --- a/source/common/loopfilter.cpp Thu Oct 08 15:27:34 2015 -0500 +++ b/source/common/loopfilter.cpp Tue Oct 06 14:19:56 2015 +0530 @@ -137,6 +137,27 @@ rec += stride; } } + +void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) +{ +for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) +{ +int16_t m4 = (int16_t)src[0]; +int16_t m3 = (int16_t)src[-offset]; +int16_t m5 = (int16_t)src[offset]; +int16_t m2 = (int16_t)src[-offset * 2]; +int16_t m6 = (int16_t)src[offset * 2]; +int16_t m1 = (int16_t)src[-offset * 3]; +int16_t m7 = (int16_t)src[offset * 3]; +int16_t m0 = (int16_t)src[-offset * 4]; +src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); +src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); +src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3); +src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); +src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5); +src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); +} +} } namespace X265_NS { @@ -151,5 +172,9 @@ p.saoCuOrgE3[1] = processSaoCUE3; p.saoCuOrgB0 = processSaoCUB0; p.sign = calSign; + +// C code is same for EDGE_VER and EDGE_HOR only asm code is different 
+p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c; +p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c; } } diff -r f8ad1ff7074a -r 38e4b94377fa source/common/primitives.h --- a/source/common/primitives.hThu Oct 08 15:27:34 2015 -0500 +++ b/source/common/primitives.hTue Oct 06 14:19:5
[x265] [PATCH] add 64-byte alignment macro, align NR buffer & Encoder class to cache line of 64-byte
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1444107449 -19800 # Tue Oct 06 10:27:29 2015 +0530 # Node ID 93525c471023575d500c912284a3853ee8df8991 # Parent f8b8ebdc54578e6735216d8b9abce5ba80c05bd8 add 64-byte alignment macro, align NR buffer & Encoder class to cache line of 64-byte diff -r f8b8ebdc5457 -r 93525c471023 source/common/common.h --- a/source/common/common.hMon Sep 28 14:34:41 2015 +0530 +++ b/source/common/common.hTue Oct 06 10:27:29 2015 +0530 @@ -74,6 +74,7 @@ #define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8))) #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16))) #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) +#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64))) #if defined(__MINGW32__) #define fseeko fseeko64 @@ -84,6 +85,7 @@ #define ALIGN_VAR_8(T, var) __declspec(align(8)) T var #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var +#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var #define fseeko _fseeki64 #endif // if defined(__GNUC__) diff -r f8b8ebdc5457 -r 93525c471023 source/encoder/encoder.h --- a/source/encoder/encoder.h Mon Sep 28 14:34:41 2015 +0530 +++ b/source/encoder/encoder.h Tue Oct 06 10:27:29 2015 +0530 @@ -79,7 +79,7 @@ { public: -ALIGN_VAR_16(uint32_t, m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]); +ALIGN_VAR_64(uint32_t, m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]); uint32_t m_countEmergency[MAX_NUM_TR_CATEGORIES]; uint16_t (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: avx2 code for sad_x3_32xN, improved over 40% than SSE
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1443156551 -19800 # Fri Sep 25 10:19:11 2015 +0530 # Node ID 310d35ed0ba85174676d0b0bb91e6b8b5f475726 # Parent 975352b2c0223b9139aad233b43eaf2113ac8167 asm: avx2 code for sad_x3_32xN, improved over 40% than SSE diff -r 975352b2c022 -r 310d35ed0ba8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Sep 23 16:19:48 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 25 10:19:11 2015 +0530 @@ -3587,6 +3587,12 @@ p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2); p.planeClipAndMax = PFX(planeClipAndMax_avx2); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2); + /* The following primitives have been disabled since performance compared to SSE is negligible/negative */ #if 0 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2); diff -r 975352b2c022 -r 310d35ed0ba8 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Wed Sep 23 16:19:48 2015 +0530 +++ b/source/common/x86/sad-a.asm Fri Sep 25 10:19:11 2015 +0530 @@ -4674,6 +4674,272 @@ movd[r5 + 8], xm1 RET +%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 +INIT_YMM avx2 +%macro SAD_X3_32x8_AVX2 0 +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m7, m3, m5 +paddd m1, m7 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, 
[r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 +%endmacro + +%macro PIXEL_SAD_X3_END_AVX2 0 +vextracti128 xm3, m0, 1 +vextracti128 xm4, m1, 1 +vextracti128 xm5, m2, 1 +paddd m0, m3 +paddd m1, m4 +paddd m2, m5 +pshufd xm3, xm0, 2 +pshufd xm4, xm1, 2 +pshufd xm5, xm2, 2 +paddd m0, m3 +paddd m1, m4 +paddd m2, m5 + +movd[r5 + 0], xm0 +movd[r5 + 4], xm1 +movd[r5 + 8], xm2 +%endmacro + +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +lea
Re: [x265] How can I enable the AVX2 version of DCT and IDCT?
Hi Ximing, If your machine (and OS also) supports AVX2 instruction set then you are already compiling and using AVX2 version of DCT functions. x265 automatically detects & sets all assembly primitives to highest/latest available instruction sets. When you run encoder, you can check command prompt output if cpu capabilities info shows AVX2 instruction set or not. You can get the source code of DCT AVX2 functions in dct8.asm file. Regards, Dnyaneshwar On Wed, Sep 16, 2015 at 6:25 PM, Ximing Cheng <chengximing1...@gmail.com> wrote: > I read the source code of the /source/common/vec/dct-sse3.cpp and I found > the comments said "Note: We have AVX2 assembly for these functions, but > since AVX2 is still somewhat rare on end-user PCs we still compile and link > these SSE3 intrinsic SIMD functions". > > But now both my PC and server support Intel AVX2 instruction set. If I > want to compile these functions with AVX2 assembly, where to find the AVX2 > version of source code of these functions? > > Thanks! > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 3] asm: AVX2 code for pixel_var primitive, improved over 40% than SSE
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1441715051 -19800 # Tue Sep 08 17:54:11 2015 +0530 # Node ID 89c234e68523b05550b8c5197b83849544dc97d1 # Parent 365f7ed4d89628d49cd6af8d81d4edc01f73ffad asm: AVX2 code for pixel_var primitive, improved over 40% than SSE diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Sep 08 16:38:01 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Sep 08 17:54:11 2015 +0530 @@ -2729,6 +2729,10 @@ #if X86_64 if (cpuMask & X265_CPU_AVX2) { +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2); +p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2); +p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2); + p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); p.planecopy_sp = PFX(downShift_16_avx2); diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Sep 08 16:38:01 2015 +0530 +++ b/source/common/x86/pixel-util8.asm Tue Sep 08 17:54:11 2015 +0530 @@ -6397,6 +6397,78 @@ movd edx, xm6 %endif RET + +INIT_YMM avx2 +cglobal pixel_var_32x32, 2,4,7 +VAR_START 0 +mov r2d, 16 + +.loop: +pmovzxbwm0, [r0] +pmovzxbwm3, [r0 + 16] +pmovzxbwm1, [r0 + r1] +pmovzxbwm4, [r0 + r1 + 16] + +lea r0, [r0 + r1 * 2] + +VAR_CORE + +dec r2d +jg .loop + +vextracti128 xm0, m5, 1 +vextracti128 xm1, m6, 1 +paddw xm5, xm0 +paddd xm6, xm1 +HADDW xm5, xm2 +HADDD xm6, xm1 + +%if ARCH_X86_64 +punpckldq xm5, xm6 +movq rax, xm5 +%else +movd eax, xm5 +movd edx, xm6 +%endif +RET + +INIT_YMM avx2 +cglobal pixel_var_64x64, 2,4,7 +VAR_START 0 +mov r2d, 64 + +.loop: +pmovzxbwm0, [r0] +pmovzxbwm3, [r0 + 16] +pmovzxbwm1, [r0 + mmsize] +pmovzxbwm4, [r0 + mmsize + 16] + +lea r0, [r0 + r1] + +VAR_CORE + +dec r2d +jg .loop + +pxorm1, m1 +punpcklwd m0, m5, m1 +punpckhwd m5, m1 +paddd m5, m0 +vextracti128 xm2, m5, 1 +vextracti128 xm1, m6, 1 +paddd xm5, xm2 +paddd xm6, xm1 +HADDD xm5, xm2 +HADDD xm6, xm1 + +%if 
ARCH_X86_64 +punpckldq xm5, xm6 +movq rax, xm5 +%else +movd eax, xm5 +movd edx, xm6 +%endif +RET %endif ; !HIGH_BIT_DEPTH %macro VAR2_END 3 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 3] asm: avx2 code for sad_x3_32xN, improved over 40% than SSE
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1441885683 -19800 # Thu Sep 10 17:18:03 2015 +0530 # Node ID 5b5d7438e90196d7974b9ceec2130b6c924e2342 # Parent abab4304e992b7addb65ad8fbdfe309ba57732a6 asm: avx2 code for sad_x3_32xN, improved over 40% than SSE diff -r abab4304e992 -r 5b5d7438e901 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Sep 10 11:40:35 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Sep 10 17:18:03 2015 +0530 @@ -3571,6 +3571,12 @@ p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2); p.planeClipAndMax = PFX(planeClipAndMax_avx2); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2); + /* The following primitives have been disabled since performance compared to SSE is negligible/negative */ #if 0 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2); diff -r abab4304e992 -r 5b5d7438e901 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Thu Sep 10 11:40:35 2015 +0530 +++ b/source/common/x86/sad-a.asm Thu Sep 10 17:18:03 2015 +0530 @@ -3949,6 +3949,272 @@ movd[r5 + 8], xm1 RET +%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 +INIT_YMM avx2 +%macro SAD_X3_32x8_AVX2 0 +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m7, m3, m5 +paddd m1, m7 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, 
[r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +lea r0, [r0 + FENC_STRIDE * 4] +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 +%endmacro + +%macro PIXEL_SAD_X3_END_AVX2 0 +vextracti128 xm3, m0, 1 +vextracti128 xm4, m1, 1 +vextracti128 xm5, m2, 1 +paddd m0, m3 +paddd m1, m4 +paddd m2, m5 +pshufd xm3, xm0, 2 +pshufd xm4, xm1, 2 +pshufd xm5, xm2, 2 +paddd m0, m3 +paddd m1, m4 +paddd m2, m5 + +movd[r5 + 0], xm0 +movd[r5 + 4], xm1 +movd[r5 + 8], xm2 +%endmacro + +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +lea
[x265] [PATCH 3 of 3] asm: avx2 code for sad_x3_64xN, improved over 40% than SSE
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1441886472 -19800 # Thu Sep 10 17:31:12 2015 +0530 # Node ID d31b9e8bdcf4f5fac2e3f0c567f1c90c1d19a382 # Parent 5b5d7438e90196d7974b9ceec2130b6c924e2342 asm: avx2 code for sad_x3_64xN, improved over 40% than SSE diff -r 5b5d7438e901 -r d31b9e8bdcf4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Sep 10 17:18:03 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Sep 10 17:31:12 2015 +0530 @@ -3576,6 +3576,11 @@ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2); p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2); +p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx2); +p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx2); +p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2); +p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2); +p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2); /* The following primitives have been disabled since performance compared to SSE is negligible/negative */ #if 0 diff -r 5b5d7438e901 -r d31b9e8bdcf4 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Thu Sep 10 17:18:03 2015 +0530 +++ b/source/common/x86/sad-a.asm Thu Sep 10 17:31:12 2015 +0530 @@ -4054,6 +4054,372 @@ paddd m2, m3 %endmacro +%macro SAD_X3_64x8_AVX2 0 +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + mmsize] +movum4, [r1 + mmsize] +movum5, [r2 + mmsize] +movum6, [r3 + mmsize] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE + mmsize] +movum4, [r1 + r4 + mmsize] +movum5, [r2 + r4 + mmsize] 
+movum6, [r3 + r4 + mmsize] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2 + mmsize] +movum4, [r1 + r4 * 2 + mmsize] +movum5, [r2 + r4 * 2 + mmsize] +movum6, [r3 + r4 * 2 + mmsize] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3 + mmsize] +movum4, [r1 + r6 + mmsize] +movum5, [r2 + r6 + mmsize] +movum6, [r3 + r6 + mmsize] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +lea r0, [r0 + FENC_STRIDE * 4] +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + mmsize] +movum4, [r1 + mmsize] +movum5, [r2 + mmsize] +movum6, [r3 + mmsize] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +
[x265] [PATCH] asm: fix crash as NR buffer is not aligned to 16-byte boundary
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1441865435 -19800 # Thu Sep 10 11:40:35 2015 +0530 # Node ID abab4304e992b7addb65ad8fbdfe309ba57732a6 # Parent 89c234e68523b05550b8c5197b83849544dc97d1 asm: fix crash as NR buffer is not aligned to 16-byte boundry diff -r 89c234e68523 -r abab4304e992 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmTue Sep 08 17:54:11 2015 +0530 +++ b/source/common/x86/dct8.asmThu Sep 10 11:40:35 2015 +0530 @@ -2115,15 +2115,15 @@ mova m0, [r0] pabswm1, m0 -mova m2, [r1] +movu m2, [r1] pmovsxwd m3, m1 padddm2, m3 -mova [r1], m2 -mova m2, [r1 + 16] +movu [r1], m2 +movu m2, [r1 + 16] psrldq m3, m1, 8 pmovsxwd m4, m3 padddm2, m4 -mova [r1 + 16], m2 +movu [r1 + 16], m2 movu m3, [r2] psubusw m1, m3 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] asm: avx2 asm for intra_ang32 mode 16 & 20
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1441085487 -19800 # Tue Sep 01 11:01:27 2015 +0530 # Node ID 3238ecbdbdf551a69bcd0dfdf8391f6462db45ac # Parent e1adac00dce8e5641cbe9aec3d50a72261c308d9 asm: avx2 asm for intra_ang32 mode 16 & 20 improved mode16 6000c -> 2200 and mode 20 3700c -> 1400c diff -r e1adac00dce8 -r 3238ecbdbdf5 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Sep 03 14:41:06 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Sep 01 11:01:27 2015 +0530 @@ -3004,6 +3004,8 @@ p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2); p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2); p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2); +p.cu[BLOCK_32x32].intra_pred[16] = PFX(intra_pred_ang32_16_avx2); +p.cu[BLOCK_32x32].intra_pred[20] = PFX(intra_pred_ang32_20_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r e1adac00dce8 -r 3238ecbdbdf5 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Sep 03 14:41:06 2015 +0530 +++ b/source/common/x86/intrapred8.asm Tue Sep 01 11:01:27 2015 +0530 @@ -448,6 +448,17 @@ db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1 const ang32_shuf_mode21,db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1 +const ang32_fact_mode16,db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24 +db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16 +db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8 +db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, 
(32-21), 21, (32- 0), 0 +const ang32_shuf_mode16,db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5 +db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6 +db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1 +dd 7, 1, 2, 3, 7, 1, 2, 3 +const ang32_shuf_mode20,db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8 +db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0 + const ang_table %assign x 0 %rep 32 @@ -17100,6 +17111,728 @@ movu[r0 + r4], m8 RET +cglobal intra_pred_ang32_16, 3,4,10 +movum0, [ang32_fact_mode16] +movum1, [ang32_fact_mode16 + mmsize] +movam2, [pw_1024] +movam7, [ang32_shuf_mode16] +movam8, [ang32_shuf_mode16 + mmsize] +lea r3, [r1 * 3] + +; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...] + +movum6, [r2] +pshufb m6, [ang32_shuf_mode16 + mmsize*2] +movam9, m6 +movam3, [ang32_shuf_mode16 + mmsize*3] +vpermd m6, m3, m6 +vpermq m9, m9, q3232 +pslldq m9, 4 +palignr m6, m9, 15 +pslldq m9, 1 + +vbroadcasti128 m3, [r2 + mmsize*2 + 1] + +palignr m4, m3, m6, 1 +palignr m5, m6, m9, 6 +pshufb m4, m7 +pshufb m5, m8 +pmaddubsw m4, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +vpermq m4, m4, q3120 +movu[r0], m4 + +palignr m4, m3, m6, 2 +palignr m5, m6, m9, 7 +pshufb m4, m7 +pshufb m5, m8 +pmaddubsw m4, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +vpermq m4, m4, q3120 +movu[r0 + r1], m4 + +palignr
[x265] [PATCH] asm: fix dynamic range of input to quant primitive
# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1440736935 -19800 # Fri Aug 28 10:12:15 2015 +0530 # Node ID dce85f739efeea842e490a0f555d4abdc89a5c80 # Parent 905c4f2e203ec082bd50b361865a7d4d297e45ce asm: fix dynamic range of input to quant primitive diff -r 905c4f2e203e -r dce85f739efe source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Thu Aug 27 10:13:56 2015 +0530 +++ b/source/test/mbdstharness.cpp Fri Aug 28 10:12:15 2015 +0530 @@ -215,8 +215,14 @@ uint32_t optReturnValue = 0; uint32_t refReturnValue = 0; -int bits = (rand() % 24) + 8; -int valueToAdd = rand() % (1 << bits); +int sliceType = rand() % 2; +int log2TrSize = rand() % 4 + 2; +int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1); +int per = qp / 6; +int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; + +int bits = QUANT_SHIFT + per + transformShift; +int valueToAdd = (sliceType == 1 ? 171 : 85) << (bits - 9); int cmp_size = sizeof(int) * height * width; int cmp_size1 = sizeof(short) * height * width; int numCoeff = height * width; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 5 of 7] asm: avx2 asm for intra_ang32 mode 12, 4758c-1474c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1440583211 -19800 # Wed Aug 26 15:30:11 2015 +0530 # Node ID cb3f520f9942080d05ca1b3ba2cae0c1b4bcb345 # Parent a27ac3b998f5677570a48285d22e1b771c08ab75 asm: avx2 asm for intra_ang32 mode 12, 4758c-1474c updated intra_ang_32 mode 25 AVX2 asm code, improved 1438c-1270c diff -r a27ac3b998f5 -r cb3f520f9942 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 25 11:02:17 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 26 15:30:11 2015 +0530 @@ -3000,6 +3000,7 @@ p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2); p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2); p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2); +p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r a27ac3b998f5 -r cb3f520f9942 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Tue Aug 25 11:02:17 2015 +0530 +++ b/source/common/x86/intrapred8.asm Wed Aug 26 15:30:11 2015 +0530 @@ -262,26 +262,6 @@ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang32_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 - db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 - db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 - db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 - db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 - db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1 - db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3 - db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 - db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - - -ALIGN 32 c_ang32_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 @@ -494,6 +474,15 @@ const ang32_shuf_mode11,times 8 db 1, 2 times 8 db 0, 1 +const 
ang32_fact_mode12,db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24 +db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8 +db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16 +db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15
[x265] [PATCH 7 of 7] asm: avx2 asm for intra_ang32 mode 14, 5600c-1400c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1440583506 -19800 # Wed Aug 26 15:35:06 2015 +0530 # Node ID 40ae6c49fa489dc995f78d93a35b441639e0847d # Parent 00b26e64fd2c42bcb9652668721f6953d8f2eb0f asm: avx2 asm for intra_ang32 mode 14, 5600c-1400c updated intra_ang_32 mode 22 AVX2 asm code, improved 2300c-1300c diff -r 00b26e64fd2c -r 40ae6c49fa48 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 26 15:32:14 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 26 15:35:06 2015 +0530 @@ -3002,6 +3002,7 @@ p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2); p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2); p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2); +p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 00b26e64fd2c -r 40ae6c49fa48 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Aug 26 15:32:14 2015 +0530 +++ b/source/common/x86/intrapred8.asm Wed Aug 26 15:35:06 2015 +0530 @@ -262,27 +262,6 @@ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang32_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 - db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 - db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 - db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 
21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 - db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 - db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 - db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9 - db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 - db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 - db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 - db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 c_ang32_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 13, 19, 13, 19, 13 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 @@ -471,6 +450,15 @@ db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2 const ang32_shuf_mode23,db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2 +const ang32_fact_mode14,db (32-19), 19
[x265] [PATCH 3 of 7] asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1440479904 -19800 # Tue Aug 25 10:48:24 2015 +0530 # Node ID 630bae9a91392fdf9a327673f7c00eeedf60139f # Parent 0409b136c208cb944fb76bfd400e76ba43e330a8 asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c updated intra_ang_32 mode 25 AVX2 asm code, improved 1300c-1184c removed unnecessary constants from previous asm diff -r 0409b136c208 -r 630bae9a9139 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 25 10:53:32 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 25 10:48:24 2015 +0530 @@ -2999,6 +2999,7 @@ p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2); p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2); p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2); +p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 0409b136c208 -r 630bae9a9139 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Tue Aug 25 10:53:32 2015 +0530 +++ b/source/common/x86/intrapred8.asm Tue Aug 25 10:48:24 2015 +0530 @@ -262,24 +262,6 @@ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 18, 14, 18, 14, 18, 
14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 c_ang32_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 @@ -509,6 +491,9 @@ const ang32_shuf_mode9, times 8 db 0, 1 times 8 db 1, 2 +const ang32_shuf_mode11,times 8 db 1, 2 +times 8 db 0, 1 + const ang_table 
%assign x 0 %rep 32 @@ -14020,6 +14005,578 @@ movu[r0 + r4], m3 RET +cglobal intra_pred_ang32_11, 3,4,8 +vbroadcasti128 m0, [angHor_tab_11] +vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] +movam2, [pw_1024] +movam7, [ang32_shuf_mode11] +lea r3, [r1 * 3] + +; prepare for [16 0 -1 -2 ...] +movu xm3, [r2 + mmsize*2 - 1] +vbroadcasti128 m6
[x265] [PATCH] asm: avx2 asm for intra_ang32 mode 15, 5700c-1600c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1440650636 -19800 # Thu Aug 27 10:13:56 2015 +0530 # Node ID 905c4f2e203ec082bd50b361865a7d4d297e45ce # Parent 40ae6c49fa489dc995f78d93a35b441639e0847d asm: avx2 asm for intra_ang32 mode 15, 5700c-1600c updated intra_ang_32 mode 21 AVX2 asm code, improved 2670c-1330c diff -r 40ae6c49fa48 -r 905c4f2e203e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 26 15:35:06 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 27 10:13:56 2015 +0530 @@ -3003,6 +3003,7 @@ p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2); p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2); p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2); +p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 40ae6c49fa48 -r 905c4f2e203e source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Aug 26 15:35:06 2015 +0530 +++ b/source/common/x86/intrapred8.asm Thu Aug 27 10:13:56 2015 +0530 @@ -262,26 +262,6 @@ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang32_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 - db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 - db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9 - db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 25, 7, 
25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 - db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 - db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 - db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 - db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - - -ALIGN 32 intra_pred_shuff_0_4:times 4 db 0, 1, 1, 2, 2, 3, 3, 4 intra_pred4_shuff1: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5 intra_pred4_shuff2: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 @@ -459,6 +439,15 @@ db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1 const ang32_shuf_mode22,db 0, 0, 15, 15, 
13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2 +const ang32_fact_mode15,db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24 +db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8 +db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16
Re: [x265] [PATCH] asm: disabled 10bpp AVX AVX2 primitives having less than 10% speed up over SSE
right.. you can send it to mailing list On Wed, Aug 19, 2015 at 3:26 PM, aasaipr...@multicorewareinc.com wrote: # HG changeset patch # User Aasaipriya Chandran aasaipr...@multicorewareinc.com # Date 1439972978 -19800 # Wed Aug 19 13:59:38 2015 +0530 # Node ID 8a45cff3182fa9f6e07493434711247d58f22cc4 # Parent 2980141a744a569ad6f60dbebdece76a4eababfd asm: disabled 10bpp AVX AVX2 primitives having less than 10% speed up over SSE these primitives are slower than SSE primitives diff -r 2980141a744a -r 8a45cff3182f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 18 12:45:52 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 19 13:59:38 2015 +0530 @@ -1185,7 +1185,6 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx); -ALL_LUMA_PU(satd, pixel_satd, avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx); @@ -1194,15 +1193,10 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx); -#if X265_DEPTH = 10 -ASSIGN_SA8D(avx); -#endif + p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx); p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx); p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx); -LUMA_VAR(avx); -p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx); -p.ssim_end_4 = PFX(pixel_ssim_end4_avx); // copy_pp primitives // 16 x N @@ -1299,6 +1293,20 @@ p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx); p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx); p.pu[LUMA_64x64].copy_pp = 
(copy_pp_t)PFX(blockcopy_ss_64x64_avx); + +/* The following primitives have been disabled since performance compared to SSE is negligible/negative */ +#if 0 +ALL_LUMA_PU(satd, pixel_satd, avx); + +p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx); +p.ssim_end_4 = PFX(pixel_ssim_end4_avx); + +LUMA_VAR(avx); + +#if X265_DEPTH = 10 + ASSIGN_SA8D(avx); +#endif +#endif } if (cpuMask X265_CPU_XOP) { @@ -1414,11 +1422,8 @@ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2); p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2); -p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2); -p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2); p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2); p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2); -p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2); p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2); p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2); p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2); @@ -1438,12 +1443,10 @@ p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2); @@ -1457,18 +1460,15 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = 
PFX(addAvg_32x64_avx2); -p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
Re: [x265] [PATCH] asm: disabled 10bpp AVX AVX2 primitives having less than 3% speed up over SSE
right.. but small correction - in #if 0 #endif disable only specific primitives and not all sizes (expand the macro keep only less than 3%) On Tue, Aug 18, 2015 at 12:05 PM, aasaipr...@multicorewareinc.com wrote: # HG changeset patch # User Aasaipriya Chandran aasaipr...@multicorewareinc.com # Date 1439879745 -19800 # Tue Aug 18 12:05:45 2015 +0530 # Node ID 2d0d8be0f401aa4eac554a280118376a991f5475 # Parent 996ebce8c874fc511d495cee227d24413e99d0c1 asm: disabled 10bpp AVX AVX2 primitives having less than 3% speed up over SSE these primitives are slower than SSE primitives diff -r 996ebce8c874 -r 2d0d8be0f401 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 17 10:52:15 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 18 12:05:45 2015 +0530 @@ -1169,7 +1169,6 @@ } if (cpuMask X265_CPU_AVX) { -// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx); @@ -1177,32 +1176,36 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx); - -ALL_LUMA_PU(satd, pixel_satd, 
avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx); -#if X265_DEPTH = 10 -ASSIGN_SA8D(avx); -#endif -p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx); -p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx); -p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx); -LUMA_VAR(avx); -p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx); -p.ssim_end_4 = PFX(pixel_ssim_end4_avx); + +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx); +p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx); +p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx); +p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx); +p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx); +p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx); +p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx); +p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx); +p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx); +p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx); +p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx); +p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx); +p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx); +p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx); +p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx); +p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx); +p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx); +p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx); +p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx); 
+p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx); + +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx); // copy_pp primitives // 16 x N @@ -1299,6 +1302,33 @@ p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx); p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx); p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx); + +/* The
Re: [x265] [PATCH 1 of 5] asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm
Hi, I wrote this code before we found new algorithm. Sure, I will compare with new algorithm once I finish remaining modes which don't have AVX2 asm. On Tue, Aug 18, 2015 at 8:50 PM, chen chenm...@163.com wrote: This is old algorithm, it need transpose, could you compare to new algorithm? At 2015-08-18 12:11:35,dnyanesh...@multicorewareinc.com wrote: # HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439531917 -19800 # Fri Aug 14 11:28:37 2015 +0530 # Node ID 5ed23f786ea8f98e003189a537f960e4ff16201f # Parent 996ebce8c874fc511d495cee227d24413e99d0c1 asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm updated intra_ang_32 mode 27 AVX2 asm code, improved over 3% than previous AVX2 code removed unnecessary constants from previous asm ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 5] asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439812025 -19800 # Mon Aug 17 17:17:05 2015 +0530 # Node ID 43c9ec65927666db1316efe63d112bd8f9cb5f35 # Parent 8752daab2f07711c556dfffa9a733b7278484479 asm: avx2 asm for intra_ang32 mode 11, 4550c-1326c diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 14 18:27:44 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 17 17:17:05 2015 +0530 @@ -3027,6 +3027,7 @@ p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2); p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2); p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2); +p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Aug 14 18:27:44 2015 +0530 +++ b/source/common/x86/intrapred8.asm Mon Aug 17 17:17:05 2015 +0530 @@ -440,6 +440,9 @@ const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16 db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0 +const ang32_shuf_mode11,times 8 db 1, 2 +times 8 db 0, 1 + const ang_table %assign x 0 %rep 32 @@ -13627,6 +13630,325 @@ movu[r0 + r4], m3 RET +cglobal intra_pred_ang32_11, 3,4,8 +vbroadcasti128 m0, [angHor_tab_11] +vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] +movam2, [pw_1024] +movam7, [ang32_shuf_mode11] +lea r3, [r1 * 3] + +; prepare for [16 0 -1 -2 ...] 
+movu xm3, [r2 + mmsize*2 - 1] +vbroadcasti128 m6, [r2 + mmsize*2 + 15] + +pinsrb xm3, [r2 + 0], 1 +pinsrb xm3, [r2 + 16], 0 +vinserti128 m3, m3, xm3, 1 ; [16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] + +pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 16 0 16 0 16 0 16 0 16 0 16 0 16 0 16 0] +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0], m4 + +palignr m5, m6, m3, 1 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1], m4 + +palignr m5, m6, m3, 2 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1 * 2], m4 + +palignr m5, m6, m3, 3 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r3], m4 + +lea r0, [r0 + r1 * 4] + +palignr m5, m6, m3, 4 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0], m4 + +palignr m5, m6, m3, 5 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1], m4 + +palignr m5, m6, m3, 6 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1 * 2], m4 + +palignr m5, m6, m3, 7 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r3], m4 + +lea r0, [r0 + r1 * 4] + +palignr m5, m6, m3, 8 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswb
[x265] [PATCH 2 of 5] asm: AVX2 asm for intra_ang_32 mode 10, 816c-452c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439557064 -19800 # Fri Aug 14 18:27:44 2015 +0530 # Node ID 8752daab2f07711c556dfffa9a733b7278484479 # Parent 5ed23f786ea8f98e003189a537f960e4ff16201f asm: AVX2 asm for intra_ang_32 mode 10, 816c-452c diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 14 11:28:37 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 14 18:27:44 2015 +0530 @@ -3026,6 +3026,7 @@ p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx2); p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2); p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2); +p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Aug 14 11:28:37 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Aug 14 18:27:44 2015 +0530 @@ -462,6 +462,7 @@ %endrep SECTION .text +cextern pb_1 cextern pw_2 cextern pw_3 cextern pw_4 @@ -13500,6 +13501,132 @@ call ang32_mode_9_27_avx2 RET +cglobal intra_pred_ang32_10, 5,5,4 +pxorm0, m0 +movam1, [pb_1] +lea r4, [r1 * 3] + +vbroadcasti128 m2, [r2 + mmsize*2 + 1] + +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, 
m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] +pxorm0, m0 +vbroadcasti128 m2, [r2 + mmsize*2 + mmsize/2 + 1] + +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 + +lea r0, [r0 + r1 * 4] + +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r1 * 2], m3 +paddb m0, m1 +pshufb m3, m2, m0 +movu[r0 + r4], m3 +RET
[x265] [PATCH 1 of 5] asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439531917 -19800 # Fri Aug 14 11:28:37 2015 +0530 # Node ID 5ed23f786ea8f98e003189a537f960e4ff16201f # Parent 996ebce8c874fc511d495cee227d24413e99d0c1 asm: AVX2 asm for intra_ang_32 mode 9, improved over 40% than SSE asm updated intra_ang_32 mode 27 AVX2 asm code, improved over 3% than previous AVX2 code removed unnecessary constants from previous asm diff -r 996ebce8c874 -r 5ed23f786ea8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 17 10:52:15 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 14 11:28:37 2015 +0530 @@ -3025,6 +3025,7 @@ p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx2); p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx2); p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2); +p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 996ebce8c874 -r 5ed23f786ea8 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Mon Aug 17 10:52:15 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Aug 14 11:28:37 2015 +0530 @@ -259,26 +259,6 @@ db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - -ALIGN 32 -c_ang32_mode_27:db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 -db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 -db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 
-db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 -db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 -db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 -db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 -db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 -db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 -db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 -db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 -db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 -db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 -db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 -db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 -db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 -db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - ALIGN 32 c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 @@ -13279,6 +13259,247 @@ call 
ang32_mode_8_28_avx2 RET +cglobal ang32_mode_9_27_avx2 +testr7d,r7d +; rows 0 to 7 +movum0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +movum1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] +movum3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38
Re: [x265] [PATCH] asm: disabled 10bpp AVX AVX2 primitives having less than 3% speed up over SSE
merge earlier patch (asm: disabled 10bpp AVX) into this one and send again to avoid confusion. 2015-08-17 17:44 GMT+05:30 aasaipr...@multicorewareinc.com: # HG changeset patch # User Aasaipriya Chandran aasaipr...@multicorewareinc.com # Date 1439813601 -19800 # Mon Aug 17 17:43:21 2015 +0530 # Node ID 458c015656c2f66ffc696484712540e1b8e6588d # Parent 4a6143fe6658534aec83c9ba3db386d118550196 asm: disabled 10bpp AVX AVX2 primitives having less than 3% speed up over SSE these primitives are slower than SSE primitives diff -r 4a6143fe6658 -r 458c015656c2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 17 11:56:37 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 17 17:43:21 2015 +0530 @@ -1205,10 +1205,6 @@ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx); p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx); -p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx); -p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx); -p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx); - p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx); // copy_pp primitives @@ -1326,6 +1322,9 @@ p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx) ASSIGN_SA8D(avx); +p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx); +p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx); p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx); p.ssim_end_4 = PFX(pixel_ssim_end4_avx); @@ -1427,12 +1426,6 @@ p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2); p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2); -p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2); -p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2); -p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2); -p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2); -p.pu[LUMA_16x32].pixelavg_pp = 
PFX(pixel_avg_16x32_avx2); -p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2); p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2); p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2); p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2); @@ -1445,11 +1438,8 @@ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2); p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2); -p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2); -p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2); p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2); p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2); -p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2); p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2); p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2); p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2); @@ -1468,13 +1458,9 @@ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2); -p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2); @@ -1484,7 +1470,6 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2); - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = 
PFX(addAvg_8x16_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2); @@ -1494,12 +1479,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
[x265] [PATCH 5 of 5] asm: optimized intra_ang16 mode 11 avx2 asm, 520c-370c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439816850 -19800 # Mon Aug 17 18:37:30 2015 +0530 # Node ID 6ff0bcad1688f5ee1e393c648739ed2ae7e79b61 # Parent e75f3a2f1d29f01ca2d71f1b8be970d471b5e1f6 asm: optimized intra_ang16 mode 11 avx2 asm, 520c-370c diff -r e75f3a2f1d29 -r 6ff0bcad1688 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Mon Aug 17 17:24:37 2015 +0530 +++ b/source/common/x86/intrapred8.asm Mon Aug 17 18:37:30 2015 +0530 @@ -425,6 +425,9 @@ const ang32_shuf_mode11,times 8 db 1, 2 times 8 db 0, 1 +const ang16_shuf_mode11,times 8 db 0, 1 +times 8 db 1, 2 + const ang_table %assign x 0 %rep 32 @@ -15630,130 +15633,106 @@ INTRA_PRED_TRANS_STORE_16x16 RET - -INIT_YMM avx2 -cglobal intra_pred_ang16_11, 3,4,5 -movam0, [angHor_tab_11] -movam1, [pw_1024] +INIT_YMM avx2 +cglobal intra_pred_ang16_11, 3,4,8 +vbroadcasti128 m0, [angHor_tab_11] +vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] +movam2, [pw_1024] +movam7, [ang16_shuf_mode11] lea r3, [r1 * 3] ; prepare for [0 -1 -2 ...] 
-movu xm2, [r2 + 32] ; TODO: input reference pixel buffer need a duplicate of pixel_lt to avoid reduce instruction in every mode -pinsrb xm2, [r2], 0 -pshufb xm2, [intra_pred_shuff_0_8] ; [0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8] - - -vpbroadcastwm3, xm2 ; word [1 0] -psrldq xm2, 2 -vpbroadcastwm4, xm2 ; word [2 1] -psrldq xm2, 2 -pmaddubsw m3, m0 -pmaddubsw m4, m0 -pmulhrswm3, m1 -pmulhrswm4, m1 -packuswbm3, m4 -vpermq m3, m3, q3120 -movu[r0], xm3 -vextracti128[r0 + r1], m3, 1 - -vpbroadcastwm3, xm2 ; word [3 2] -psrldq xm2, 2 -vpbroadcastwm4, xm2 ; word [4 3] -psrldq xm2, 2 -pmaddubsw m3, m0 -pmaddubsw m4, m0 -pmulhrswm3, m1 -pmulhrswm4, m1 -packuswbm3, m4 -vpermq m3, m3, q3120 -movu[r0 + r1 * 2], xm3 -vextracti128[r0 + r3], m3, 1 +movu xm3, [r2 + mmsize] +pinsrb xm3, [r2], 0 +vbroadcasti128 m6, [r2 + mmsize + 16] +vinserti128 m3, m3, xm3, 1 + +pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2] +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0], xm4 +vextracti128[r0 + r1], m4, 1 + +palignr m5, m6, m3, 2 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1 * 2], xm4 +vextracti128[r0 + r3], m4, 1 + lea r0, [r0 + r1 * 4] -vpbroadcastwm3, xm2 ; word [5 4] -psrldq xm2, 2 -vpbroadcastwm4, xm2 ; word [6 5] -psrldq xm2, 2 -pmaddubsw m3, m0 -pmaddubsw m4, m0 -pmulhrswm3, m1 -pmulhrswm4, m1 -packuswbm3, m4 -vpermq m3, m3, q3120 -movu[r0], xm3 -vextracti128[r0 + r1], m3, 1 - -vpbroadcastwm3, xm2 ; word [7 6] -psrldq xm2, 2 -vpbroadcastwm4, xm2 ; word [8 7] -pmaddubsw m3, m0 -pmaddubsw m4, m0 -pmulhrswm3, m1 -pmulhrswm4, m1 -packuswbm3, m4 -vpermq m3, m3, q3120 -movu[r0 + r1 * 2], xm3 -vextracti128[r0 + r3], m3, 1 +palignr m5, m6, m3, 4 +pshufb m5, m7 +pmaddubsw m4, m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0], xm4 +vextracti128[r0 + r1], m4, 1 + +palignr m5, m6, m3, 6 +pshufb m5, m7 +pmaddubsw m4, 
m5, m0 +pmaddubsw m5, m1 +pmulhrswm4, m2 +pmulhrswm5, m2 +packuswbm4, m5 +movu[r0 + r1 * 2], xm4
[x265] [PATCH 4 of 5] asm: updated intra_ang_32 mode 25 AVX2 asm code, 1300c-1184c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439812477 -19800 # Mon Aug 17 17:24:37 2015 +0530 # Node ID e75f3a2f1d29f01ca2d71f1b8be970d471b5e1f6 # Parent 43c9ec65927666db1316efe63d112bd8f9cb5f35 asm: updated intra_ang_32 mode 25 AVX2 asm code, 1300c-1184c diff -r 43c9ec659276 -r e75f3a2f1d29 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Mon Aug 17 17:17:05 2015 +0530 +++ b/source/common/x86/intrapred8.asm Mon Aug 17 17:24:37 2015 +0530 @@ -260,24 +260,6 @@ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 12, 20, 12, 20 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 c_ang32_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 @@ -13949,6 +13931,260 @@ movu[r0 + r3], m4 RET +cglobal intra_pred_ang32_25, 3,5,7 +lea r3, [ang_table_avx2 + 32 * 16] +lea r4, [r1 * 3] +movam5, [pw_1024] + +; rows 0 to 7 +movum0, [r2 + 0] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] +movum1, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + +pinsrb xm3,[r2], 15 +pinsrb xm3,[r2 + mmsize*2 + 16], 14 + +punpckhbw m2, m0, m1 ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] +punpcklbw m0, m1 ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] +vinserti128 m3, m3, xm2, 1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 0 16 x x x x x x x x x x x x x x] + +pmaddubsw m4, m0, [r3 + 14 * 32] ; [30] +pmulhrswm4, m5 +pmaddubsw m1, m2, [r3 + 14 * 32] +pmulhrswm1, m5 +packuswbm4, m1 +movu[r0], m4 + +pmaddubsw m4, m0, [r3 + 12 * 32] ; 
[28] +pmulhrswm4, m5 +pmaddubsw m1, m2, [r3 + 12 * 32
Re: [x265] [PATCH 3 of 4] asm: fix bug in macro vpbroadcastd for case ymm, xmm
Hi Min, This still generates wrong code for case ymm, xmm (as %ifidni %2,xm will always be false). How about the code below? %macro vpbroadcastd 2-3 ;; increased one argument for case ymm, xmm %ifid %3 ; case vpbroadcastd ymm, ymm, xmm vpbroadcastd %1, %3 %elifid %2 movd %1 %+ xmm, %2 ; case vpbroadcastd ymm, rN vpbroadcastd %1, %1 %+ xmm %else vpbroadcastd %1, %2 ; case vpbroadcastd ymm, [memory addr] %endif %endmacro Thanks, Dnyaneshwar G On Thu, Aug 13, 2015 at 8:52 AM, Min Chen chenm...@163.com wrote: # HG changeset patch # User Min Chen chenm...@163.com # Date 1439424913 25200 # Node ID caf9562dc947f93e8ee237574575e9b67d494fc8 # Parent 09846d1566428a73d70d2fcf2d50324c0dfbbb7f asm: fix bug in macro vpbroadcastd for case ymm,xmm --- source/common/x86/x86inc.asm |6 -- 1 files changed, 4 insertions(+), 2 deletions(-) diff -r 09846d156642 -r caf9562dc947 source/common/x86/x86inc.asm --- a/source/common/x86/x86inc.asm Wed Aug 12 16:46:57 2015 -0700 +++ b/source/common/x86/x86inc.asm Wed Aug 12 17:15:13 2015 -0700 @@ -1486,10 +1486,12 @@ ; workaround: vpbroadcastd with register, the yasm will generate wrong code %macro vpbroadcastd 2 - %ifid %2 + %ifidni %2,xm ; case ymm,xmm +vpbroadcastd %1, %2 + %elifid %2; case ymm,rN movd %1 %+ xmm, %2 vpbroadcastd %1, %1 %+ xmm %else -vpbroadcastd %1, %2 +vpbroadcastd %1, %2 ; case ymm,[address] %endif %endmacro ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 4] asm: AVX2 asm for intra_ang_32 mode 6, improved over 48% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439366905 -19800 # Wed Aug 12 13:38:25 2015 +0530 # Node ID 643a001494a42e65366cfa3e468cc0858955095f # Parent 07110baa95f1d53c8100929b16eafba3b16138d6 asm: AVX2 asm for intra_ang_32 mode 6, improved over 48% than SSE asm updated intra_ang_32 mode 30 AVX2 asm code, improved over 20% than previous AVX2 code diff -r 07110baa95f1 -r 643a001494a4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 11 18:23:48 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 12 13:38:25 2015 +0530 @@ -3018,6 +3018,7 @@ p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2); p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2); p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx2); +p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 07110baa95f1 -r 643a001494a4 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Tue Aug 11 18:23:48 2015 +0530 +++ b/source/common/x86/intrapred8.asm Wed Aug 12 13:38:25 2015 +0530 @@ -320,28 +320,6 @@ db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 -c_ang32_mode_30:db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 -db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 -db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 -db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 -db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 -db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 -db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 -db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 -db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 -db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 -db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 -db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 -db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 -db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 -db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 -db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 -db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 -db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 -db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - ALIGN 32 c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 
28, 4, 28 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 @@ -12517,6 +12495,292 @@ call ang32_mode_5_31_row_16_31 RET +cglobal ang32_mode_6_30_row_0_15 +testr7d,r7d +; rows 0 to 7 +movum0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +movum1
[x265] [PATCH 3 of 4] asm: AVX2 asm for intra_ang_32 mode 7, improved over 40% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439373105 -19800 # Wed Aug 12 15:21:45 2015 +0530 # Node ID c12d411014f68affea550ee640e26ba61f51e509 # Parent 643a001494a42e65366cfa3e468cc0858955095f asm: AVX2 asm for intra_ang_32 mode 7, improved over 40% than SSE asm updated intra_ang_32 mode 29 AVX2 asm code, improved over 10% than previous AVX2 code diff -r 643a001494a4 -r c12d411014f6 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 12 13:38:25 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 12 15:21:45 2015 +0530 @@ -3019,6 +3019,7 @@ p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2); p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx2); p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx2); +p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 643a001494a4 -r c12d411014f6 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Aug 12 13:38:25 2015 +0530 +++ b/source/common/x86/intrapred8.asm Wed Aug 12 15:21:45 2015 +0530 @@ -300,27 +300,6 @@ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 ALIGN 32 -c_ang32_mode_29:db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 -db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 -db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 -db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 -db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 -db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 -db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 -db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 -db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 -db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 -db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 -db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 -db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 -db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 -db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 -db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 -db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 -db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 -db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 
8, 24 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 @@ -12781,6 +12760,284 @@ call ang32_mode_6_30_row_16_31 RET +cglobal ang32_mode_7_29_row_0_15 +testr7d,r7d +; rows 0 to 7 +movum0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +movu
[x265] [PATCH 1 of 4] asm: AVX2 asm for intra_ang_32 mode 5, improved over 48% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439297628 -19800 # Tue Aug 11 18:23:48 2015 +0530 # Node ID 07110baa95f1d53c8100929b16eafba3b16138d6 # Parent bc5a7c2ac38b06d2a232b983f10bc0394d252ad7 asm: AVX2 asm for intra_ang_32 mode 5, improved over 48% than SSE asm updated intra_ang_32 mode 31 AVX2 asm code, improved over 20% than previous AVX2 code diff -r bc5a7c2ac38b -r 07110baa95f1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 12 15:13:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 11 18:23:48 2015 +0530 @@ -3017,6 +3017,7 @@ p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2); p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2); p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2); +p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r bc5a7c2ac38b -r 07110baa95f1 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Aug 12 15:13:51 2015 +0530 +++ b/source/common/x86/intrapred8.asm Tue Aug 11 18:23:48 2015 +0530 @@ -342,27 +342,6 @@ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 -c_ang32_mode_31:db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 -db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 -db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 -db 26, 6, 26, 6, 26, 6, 26, 6, 26, 
6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 -db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 -db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 -db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 -db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 -db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 -db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 -db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 -db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 -db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 -db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 -db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 -db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 -db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 -db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - ALIGN 32 c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 
8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 @@ -12249,6 +12228,295 @@ call ang32_mode_4_32_row_16_31 RET +cglobal ang32_mode_5_31_row_0_15 +testr7d,r7d +; rows 0 to 7 +movum0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +movum1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8
[x265] [PATCH] asm: AVX2 asm for intra_ang_32 mode 4, improved over 45% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1439209099 -19800 # Mon Aug 10 17:48:19 2015 +0530 # Branch stable # Node ID 1ae0654c996a3ccab15e384dc8a394c029094544 # Parent 4781e6cef251006db10e107b2916741572f7760a asm: AVX2 asm for intra_ang_32 mode 4, improved over 45% than SSE asm updated intra_ang_32 mode 32 AVX2 asm code, improved over 32% than previous AVX2 code diff -r 4781e6cef251 -r 1ae0654c996a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 07 12:29:40 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 10 17:48:19 2015 +0530 @@ -3016,6 +3016,7 @@ p.cu[BLOCK_16x16].intra_pred[24] = PFX(intra_pred_ang16_24_avx2); p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2); p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2); +p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx2); p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2); p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2); p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2); diff -r 4781e6cef251 -r 1ae0654c996a source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Aug 07 12:29:40 2015 +0530 +++ b/source/common/x86/intrapred8.asm Mon Aug 10 17:48:19 2015 +0530 @@ -363,31 +363,6 @@ db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 -c_ang32_mode_32: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 - db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 
- db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 - db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 - db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 - db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 - db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 - db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 - db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 31, 1, 31, 
1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 - db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 - db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - ALIGN 32 c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4
Re: [x265] [PATCH] asm: avx2 code for intra_ang_16 modes 3 33
This is new algorithm for intra_ang16x16. 1075 cycles - current AVX2 asm 827 cycles - new AVX2 asm (improved 23% over current avx2 asm) On Thu, Aug 6, 2015 at 10:41 AM, Deepthi Nandakumar deep...@multicorewareinc.com wrote: Please be sure to mention what is the baseline - for instance, what is 1075 cycles? On Wed, Aug 5, 2015 at 6:06 PM, raj...@multicorewareinc.com wrote: # HG changeset patch # User Rajesh Paulrajraj...@multicorewareinc.com # Date 1438766294 -19800 # Wed Aug 05 14:48:14 2015 +0530 # Node ID 4a71c4261e5a7955a7ecdda61db1f20744254b0e # Parent 3fa7f6838098854de79d3800b2d775dabaf45705 asm: avx2 code for intra_ang_16 modes 3 33 intra_ang_16x16[ 3] - improved 1075.09-827.85 intra_ang_16x16[ 33] - improved 796.68-565.86 diff -r 3fa7f6838098 -r 4a71c4261e5a source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Mon Aug 03 14:56:21 2015 -0500 +++ b/source/common/x86/intrapred8.asm Wed Aug 05 14:48:14 2015 +0530 @@ -294,32 +294,6 @@ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 32 -c_ang16_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - db 
22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 - db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 - db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 - db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 - db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 -c_ang16_mode_3: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 - db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 - db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 - db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 - db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 - db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 - db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 - db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 - -ALIGN 32 c_ang16_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 22, 10, 22 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 @@ -13534,131 +13508,226 @@ INTRA_PRED_TRANS_STORE_16x16 RET - -INIT_YMM avx2 -cglobal intra_pred_ang16_3, 3, 6, 12 -mova
[x265] [PATCH] asm: disabled AVX AVX2 primitives having less than 3% speed up over SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1438757401 -19800 # Wed Aug 05 12:20:01 2015 +0530 # Node ID 3eb2ec5922be1cd934dec7f7ed886d03c0125ef5 # Parent 3fa7f6838098854de79d3800b2d775dabaf45705 asm: disabled AVX AVX2 primitives having less than 3% speed up over SSE these primitives are slower than SSE primitives diff -r 3fa7f6838098 -r 3eb2ec5922be source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 03 14:56:21 2015 -0500 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 05 12:20:01 2015 +0530 @@ -2568,7 +2568,6 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx); @@ -2578,7 +2577,6 @@ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx); p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx); p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx); -p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx); p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx); p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx); p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx); @@ -2586,10 +2584,8 @@ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx); p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx); -p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx); p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx); p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx); -p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx); p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx); p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx); p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx); 
@@ -2599,38 +2595,28 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_avx); -ASSIGN_SA8D(avx); + +p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_avx); +p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_avx); +p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_avx); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_avx); + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx); p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx); p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx); -p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx); -p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx); - p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = PFX(pixel_ssd_8x8_avx); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx); p.pu[LUMA_16x4].sad_x4 = PFX(pixel_sad_x4_16x4_avx); -p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx); -p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx); 
-p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx); -p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx); -p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx); -p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx); -p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx); -p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx); -p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx); p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx
[x265] [PATCH] asm: updated avx2 algorithm for copy_ps 32xN 64xN, improved over 45% than SSE asm
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1438767554 -19800 # Wed Aug 05 15:09:14 2015 +0530 # Node ID 377a996a8d74110f838ff2e3cef1c42781d6d730 # Parent 3eb2ec5922be1cd934dec7f7ed886d03c0125ef5 asm: updated avx2 algorithm for copy_ps 32xN 64xN, improved over 45% than SSE asm diff -r 3eb2ec5922be -r 377a996a8d74 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 05 12:20:01 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 05 15:09:14 2015 +0530 @@ -3622,6 +3622,11 @@ if (cpuMask X265_CPU_BMI2) p.scanPosLast = PFX(scanPosLast_avx2_bmi2); +p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2); +p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2); +p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2); +p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2); + /* The following primitives have been disabled since performance compared to SSE is negligible/negative */ #if 0 p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2); @@ -3652,10 +3657,6 @@ p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_avx2); p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_avx2); p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_avx2); -p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2); -p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2); -p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2); -p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = PFX(interp_4tap_horiz_pp_4x8_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = PFX(interp_4tap_horiz_pp_4x16_avx2); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx2); diff -r 3eb2ec5922be -r 377a996a8d74 source/common/x86/blockcopy8.asm --- 
a/source/common/x86/blockcopy8.asm Wed Aug 05 12:20:01 2015 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Aug 05 15:09:14 2015 +0530 @@ -3043,43 +3043,31 @@ ;- %macro BLOCKCOPY_PS_W32_H4_avx2 2 INIT_YMM avx2 -cglobal blockcopy_ps_%1x%2, 4, 7, 3 +cglobal blockcopy_ps_%1x%2, 4, 7, 2 add r1, r1 mov r4d, %2/4 lea r5, [3 * r3] lea r6, [3 * r1] -pxorm0, m0 - .loop: -movu m1, [r2] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu [r0], m3 -movu [r0 + 32], m2 -movu m1, [r2 + r3] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu [r0 + r1], m3 -movu [r0 + r1 + 32], m2 -movu m1, [r2 + 2 * r3] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu [r0 + 2 * r1], m3 -movu [r0 + 2 * r1 + 32], m2 -movu m1, [r2 + r5] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu [r0 + r6], m3 -movu [r0 + r6 + 32], m2 - +pmovzxbw m0, [r2 + 0] +pmovzxbw m1, [r2 + 16] +movu [r0 + 0], m0 +movu [r0 + 32], m1 + +pmovzxbw m0, [r2 + r3 + 0] +pmovzxbw m1, [r2 + r3 + 16] +movu [r0 + r1 + 0], m0 +movu [r0 + r1 + 32], m1 + +pmovzxbw m0, [r2 + r3 * 2 + 0] +pmovzxbw m1, [r2 + r3 * 2 + 16] +movu [r0 + r1 * 2 + 0], m0 +movu [r0 + r1 * 2 + 32], m1 + +pmovzxbw m0, [r2 + r5 + 0] +pmovzxbw m1, [r2 + r5 + 16] +movu [r0 + r6 + 0], m0 +movu [r0 + r6 + 32], m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d @@ -3228,71 +3216,49 @@ INIT_YMM avx2 cglobal blockcopy_ps_64x64, 4, 7, 4 add r1, r1 -mov r4d, 64/4 +mov r4d, 64/8 lea r5, [3 * r3] lea r6, [3 * r1] -pxorm0, m0 - .loop: -movu m1, [r2] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu [r0], m3 -movu [r0 + 32], m2 -movu m1, [r2 + 32] -punpcklbw m2, m1, m0 -punpckhbw m1, m1, m0 -vperm2i128m3, m2, m1, 0010b -vperm2i128m2, m2, m1, 00110001b -movu
[x265] [PATCH] asm: disabled AVX primitives having less than 3% speed up over SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1438669788 -19800 # Tue Aug 04 11:59:48 2015 +0530 # Node ID fc84f3731e2c9eafc8164361b67422732f811008 # Parent 2b89c446b404ed20c0316efaab5b1e088289c0b4 asm: disabled AVX primitives having less than 3% speed up over SSE these AVX primitives are slower than SSE primitives diff -r 2b89c446b404 -r fc84f3731e2c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 03 16:45:04 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 04 11:59:48 2015 +0530 @@ -2556,7 +2556,7 @@ } if (cpuMask X265_CPU_AVX) { -p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); +//p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx); @@ -2571,28 +2571,53 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx); +//p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx); -p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx); -ALL_LUMA_PU(satd, pixel_satd, avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx); 
+//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx); +//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx); +//p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx); + +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx); +p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx); +p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx); +p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx); +//p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_avx); +//p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx); +p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx); +p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx); +p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx); +p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx); +p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx); +p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx); +p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx); +//p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_avx); +p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx); +//p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx); +p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx); +p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx); +p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx); +p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx); +p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx); +p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx); +p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx); +p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx); + +//p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx); -p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx); +//p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = 
PFX(pixel_satd_8x4_avx); +//p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12
[x265] [PATCH] asm: avx2 code for pixelavg_pp 32xN 64xN, improved over 40% than SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1438596650 -19800 # Mon Aug 03 15:40:50 2015 +0530 # Node ID 43fe4ec1c13a2514030010c2cd699382b67f65cb # Parent a3b72e2a25a7fc544b1b76e872eda012035bf4ac asm: avx2 code for pixelavg_pp 32xN 64xN, improved over 40% than SSE diff -r a3b72e2a25a7 -r 43fe4ec1c13a source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Aug 03 10:28:34 2015 +0530 +++ b/source/common/x86/mc-a.asmMon Aug 03 15:40:50 2015 +0530 @@ -4300,24 +4300,12 @@ AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 + INIT_XMM avx2 ; TODO: active AVX2 after debug ;AVG_FUNC 24, movdqu, movdqa ;AVGH 24, 32 -AVG_FUNC 64, movdqu, movdqa -AVGH 64, 64 -AVGH 64, 48 -AVGH 64, 32 -AVGH 64, 16 - -AVG_FUNC 32, movdqu, movdqa -AVGH 32, 64 -AVGH 32, 32 -AVGH 32, 24 -AVGH 32, 16 -AVGH 32, 8 - AVG_FUNC 16, movdqu, movdqa AVGH 16, 64 AVGH 16, 32 @@ -4328,7 +4316,109 @@ %endif ;HIGH_BIT_DEPTH - +;--- +;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 BIT_DEPTH == 8 +INIT_YMM avx2 +cglobal pixel_avg_8x32 +%rep 4 +movum0, [r2] +movum2, [r2 + r3] +movum1, [r4] +movum3, [r4 + r5] +pavgb m0, m1 +pavgb m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +lea r2, [r2 + r3 * 2] +lea r4, [r4 + r5 * 2] +lea r0, [r0 + r1 * 2] +%endrep +ret + +cglobal pixel_avg_16x64_8bit +%rep 8 +movum0, [r2] +movum2, [r2 + mmsize] +movum1, [r4] +movum3, [r4 + mmsize] +pavgb m0, m1 +pavgb m2, m3 +movu[r0], m0 +movu[r0 + mmsize], m2 + +movum0, [r2 + r3] +movum2, [r2 + r3 + mmsize] +movum1, [r4 + r5] +movum3, [r4 + r5 + mmsize] +pavgb m0, m1 +pavgb m2, m3 +movu[r0 + r1], m0 +movu[r0 + r1 + mmsize], m2 + +lea r2, [r2 + r3 * 2] +lea r4, [r4 + r5 * 2] +lea r0, [r0 + r1 * 2] +%endrep +ret + +cglobal pixel_avg_32x8, 6,6,4 +call pixel_avg_8x32 +RET + +cglobal pixel_avg_32x16, 6,6,4 +call pixel_avg_8x32 +call pixel_avg_8x32 +RET + +cglobal pixel_avg_32x24, 6,6,4 +call pixel_avg_8x32 +call pixel_avg_8x32 
+call pixel_avg_8x32 +RET + +cglobal pixel_avg_32x32, 6,6,4 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +RET + +cglobal pixel_avg_32x64, 6,6,4 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +call pixel_avg_8x32 +RET + +cglobal pixel_avg_64x16, 6,6,4 +call pixel_avg_16x64_8bit +RET + +cglobal pixel_avg_64x32, 6,6,4 +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +RET + +cglobal pixel_avg_64x48, 6,6,4 +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +RET + +cglobal pixel_avg_64x64, 6,6,4 +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +call pixel_avg_16x64_8bit +RET +%endif ;= ; pixel avg2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] main12: added lambda tables based based on qp values
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1437640145 -19800 # Thu Jul 23 13:59:05 2015 +0530 # Node ID 0bdab1ab0e78684cbb3ecc4913e59d2b35b4e1b7 # Parent 42bc8575020b73d129d0bcef70c7cbe80a8b51df main12: added lambda tables based based on qp values diff -r 42bc8575020b -r 0bdab1ab0e78 source/common/constants.cpp --- a/source/common/constants.cpp Wed Jul 22 12:56:34 2015 -0500 +++ b/source/common/constants.cpp Thu Jul 23 13:59:05 2015 +0530 @@ -27,7 +27,48 @@ namespace X265_NS { -#if HIGH_BIT_DEPTH +#if X265_DEPTH == 12 + +// lambda = pow(2, (double)q / 6 - 2) * (1 (12 - 8)); +double x265_lambda_tab[QP_MAX_MAX + 1] = +{ +4.,4.4898,5.0397,5.6569, 6.3496, +7.1272,8.,8.9797,10.0794,11.3137, +12.6992, 14.2544, 16., 17.9594,20.1587, +22.6274, 25.3984, 28.5088, 32.,35.9188, +40.3175, 45.2548, 50.7968, 57.0175,64., +71.8376, 80.6349, 90.5097, 101.5937, 114.0350, +128., 143.6751, 161.2699, 181.0193, 203.1873, +228.0701, 256., 287.3503, 322.5398, 362.0387, +406.3747, 456.1401, 512., 574.7006, 645.0796, +724.0773, 812.7493, 912.2803, 1024., 1149.4011, +1290.1592, 1448.1547, 1625.4987, 1824.5606, 2048., +2298.8023, 2580.3183, 2896.3094, 3250.9974, 3649.1211, +4096., 4597.6045, 5160.6366, 5792.6188, 6501.9947, +7298.2423, 8192., 9195.2091, 10321.2732, 11585.2375 +}; + +// lambda2 = pow(lambda, 2) * scale (0.85); +double x265_lambda2_tab[QP_MAX_MAX + 1] = +{ +13.6000, 17.1349, 21.5887, 27.2000, 34.2699, +43.1773, 54.4000, 68.5397, 86.3546, 108.8000, +137.0794, 172.7092, 217.6000, 274.1588, 345.4185, +435.2000, 548.3176, 690.8369, 870.4000, 1096.6353, +1381.6739, 1740.8000, 2193.2706, 2763.3478, 3481.6000, +4386.5411, 5526.6955, 6963.2000, 8773.0822, 11053.3910, +13926.4000,17546.1645,22106.7819,27852.8000,35092.3291, +44213.5641,55705.6000,70184.6579,88427.1282,111411.2000, +140369.3159, 176854.2563, 222822.4000, 280738.6324, 353708.5127, +445644.8001, 561477.2648, 707417.0237, 891289.6000, 1122954.5277, +1414834.0484, 1782579.2003, 
2245909.0566, 2829668.0981, 3565158.4000, +4491818.1146, 5659336.1938, 7130316.8013, 8983636.2264, 11318672.3923, +14260633.6000, 17967272.4585, 22637344.7751, 28521267.1953, 35934544.9165, +45274689.5567, 57042534.4000, 71869089.8338, 90549379.1181, 114085068.8008 +}; + +#elif X265_DEPTH == 10 + // lambda = pow(2, (double)q / 6 - 2) * (1 (X265_DEPTH - 8)); double x265_lambda_tab[QP_MAX_MAX + 1] = { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix linux build error- cannot override register size
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1436771870 -19800 # Mon Jul 13 12:47:50 2015 +0530 # Node ID 96eaae96478a252f46736416248ec8dcba618c7d # Parent 7cb28662875630da90d85d62b01d58f4c51f7e32 asm: fix linux build error- cannot override register size diff -r 7cb286628756 -r 96eaae96478a source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Mon Jul 13 12:16:57 2015 +0530 +++ b/source/common/x86/loopfilter.asm Mon Jul 13 12:47:50 2015 +0530 @@ -2132,8 +2132,8 @@ jnz .loopH ; sum to global buffer -mov r1, r5m -mov r0, r6m +mov r1, r5mp +mov r0, r6mp ; s_eoTable = {1, 2, 0, 3, 4} movzx r5d, word [rsp + 0 * 2] @@ -2165,9 +2165,9 @@ ;--- %if ARCH_X86_64 INIT_XMM sse4 -cglobal saoCuStatsE1, 4,11,9,0-32; Stack: 5 of stats and 5 of count +cglobal saoCuStatsE1, 4,12,9,0-32; Stack: 5 of stats and 5 of count mov r4d, r4m -mov r5d, r5m +mov r11d, r5d ; clear internal temporary buffer pxorm0, m0 @@ -2183,7 +2183,7 @@ mov r6d, r4d mov r9, r0 mov r10, r1 -mov r5, r3 +mov r11, r3 .loopW: movum1, [r10] @@ -2200,12 +2200,12 @@ psubb m3, m2 ; -signDown ; edgeType -movum4, [r5] +movum4, [r11] paddb m4, m6 paddb m2, m4 ; update upBuff1 -movu[r5], m3 +movu[r11], m3 ; stats[edgeType] pxorm1, m0 @@ -2236,7 +2236,7 @@ add r9, 16 add r10, 16 -add r5, 16 +add r11, 16 jmp .loopW .next: @@ -2244,7 +2244,7 @@ add r0, r2 add r1, r2 -dec byte r5m +dec r5d jg .loopH ; restore unavailable pixels ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 3] asm: sse4 code for saoCuStatsE1, improved 320369c-151086c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1436252372 -19800 # Tue Jul 07 12:29:32 2015 +0530 # Node ID 25a8323b886f480347f4b0813f7ded18e579704a # Parent 235930aae11da04863e3fb13905e2d1d95e3dc0a asm: sse4 code for saoCuStatsE1, improved 320369c-151086c diff -r 235930aae11d -r 25a8323b886f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:29:32 2015 +0530 @@ -2499,6 +2499,7 @@ #if X86_64 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4); p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4); +p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4); p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4); p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4); diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530 +++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:29:32 2015 +0530 @@ -2159,3 +2159,122 @@ add [r1 + 4 * 4], r6d RET %endif + +;--- +; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count) +;--- +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal saoCuStatsE1, 4,11,9,0-32; Stack: 5 of stats and 5 of count +mov r4d, r4m +mov r5d, r5m + +; clear internal temporary buffer +pxorm0, m0 +mova[rsp], m0 +mova[rsp + mmsize], m0 +movam0, [pb_128] +movam5, [pb_1] +movam6, [pb_2] +movam8, [hmul_16p + 16] +movhm7, [r3 + r4] + +.loopH: +mov r6d, r4d +mov r9, r0 +mov r10, r1 +mov r5, r3 + +.loopW: +movum1, [r10] +movum2, [r10 + r2] + +; signDown +pxorm1, m0 +pxorm2, m0 +pcmpgtb m3, m1, m2 +pandm3, m5 +pcmpgtb m2, m1 +por m2, m3 +pxorm3, m3 +psubb m3, m2 ; -signDown + +; edgeType +movum4, [r5] +paddb m4, m6 +paddb m2, m4 + +; update upBuff1 +movu[r5], m3 + +; stats[edgeType] +pxorm1, m0 +movum3, [r9] +punpckhbw m4, m3, m1 +punpcklbw m3, m1 +pmaddubsw m3, m8 +pmaddubsw m4, m8 + +; 16 pixels +%assign x 0 +%rep 16 +pextrb r7d, m2, x 
+inc word [rsp + r7 * 2] + + %if (x 8) +pextrw r8d, m3, (x % 8) + %else +pextrw r8d, m4, (x % 8) + %endif +movsx r8d, r8w +add [rsp + 5 * 2 + r7 * 4], r8d + +dec r6d +jz .next +%assign x x+1 +%endrep + +add r9, 16 +add r10, 16 +add r5, 16 +jmp .loopW + +.next: +; restore pointer upBuff1 +add r0, r2 +add r1, r2 + +dec byte r5m +jg .loopH + +; restore unavailable pixels +movh[r3 + r4], m7 + +; sum to global buffer +mov r1, r6m +mov r0, r7m + +; s_eoTable = {1,2,0,3,4} +movzx r6d, word [rsp + 0 * 2] +add [r0 + 1 * 4], r6d +movzx r6d, word [rsp + 1 * 2] +add [r0 + 2 * 4], r6d +movzx r6d, word [rsp + 2 * 2] +add [r0 + 0 * 4], r6d +movzx r6d, word [rsp + 3 * 2] +add [r0 + 3 * 4], r6d +movzx r6d, word [rsp + 4 * 2] +add [r0 + 4 * 4], r6d + +mov r6d, [rsp + 5 * 2 + 0 * 4] +add [r1 + 1 * 4], r6d +mov r6d, [rsp + 5 * 2 + 1 * 4] +add [r1 + 2 * 4], r6d +mov r6d, [rsp + 5 * 2 + 2 * 4] +add [r1 + 0 * 4], r6d +mov r6d, [rsp + 5 * 2 + 3 * 4] +add [r1 + 3 * 4], r6d +mov r6d, [rsp + 5 * 2 + 4 * 4] +add [r1 + 4 * 4], r6d +RET +%endif ; ARCH_X86_64 diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.h --- a/source/common/x86/loopfilter.hTue Jul 07 12:17:08 2015 +0530 +++ b/source/common/x86/loopfilter.hTue Jul 07 12:29:32 2015 +0530 @@ -37,6 +37,7 @@ void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \ void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \ void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \ +void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t
[x265] [PATCH 2 of 3] asm: sse4 code for saoCuStatsE0, improved 250341c->147284c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1436251628 -19800 # Tue Jul 07 12:17:08 2015 +0530 # Node ID 235930aae11da04863e3fb13905e2d1d95e3dc0a # Parent e0166f09f332af72a83eb059d878044db15f59bd asm: sse4 code for saoCuStatsE0, improved 250341c-147284c diff -r e0166f09f332 -r 235930aae11d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 07 11:14:35 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530 @@ -2498,6 +2498,7 @@ #if X86_64 p.saoCuStatsBO = PFX(saoCuStatsBO_sse4); +p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4); p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4); p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4); diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Tue Jul 07 11:14:35 2015 +0530 +++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530 @@ -2043,3 +2043,119 @@ jnz .loopH RET %endif + +;--- +; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +;--- +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal saoCuStatsE0, 5,8,8, 0-32 +mov r3d, r3m + +; clear internal temporary buffer +pxorm0, m0 +mova[rsp], m0 +mova[rsp + mmsize], m0 +movam4, [pb_128] +movam5, [hmul_16p + 16] +movam6, [pb_2] +xor r7d, r7d + +.loopH: +mov r5d, r3d + +; calculate signLeft +mov r7b, [r1] +sub r7b, [r1 - 1] +setar7b +setbr6b +sub r7b, r6b +neg r7b +pinsrb m0, r7d, 15 + +.loopL: +movum7, [r1] +movum2, [r1 + 1] + +pxorm1, m7, m4 +pxorm3, m2, m4 +pcmpgtb m2, m1, m3 +pcmpgtb m3, m1 +pandm2, [pb_1] +por m2, m3 ; signRight + +palignr m3, m2, m0, 15 +psignb m3, m4 ; signLeft + +movam0, m2 +paddb m2, m3 +paddb m2, m6 ; edgeType + +; stats[edgeType] +movum3, [r0]; fenc[0-15] +punpckhbw m1, m3, m7 +punpcklbw m3, m7 +pmaddubsw m1, m5 +pmaddubsw m3, m5 + +%assign x 0 +%rep 16 +pextrb r7d, m2, x + +%if (x 8) +pextrw r6d, m3, (x % 8) +%else +pextrw r6d, m1, (x % 8) +%endif +movsx 
r6d, r6w +inc word [rsp + r7 * 2] ; tmp_count[edgeType]++ +add [rsp + 5 * 2 + r7 * 4], r6d ; tmp_stats[edgeType] += (fenc[x] - rec[x]) +dec r5d +jz .next +%assign x x+1 +%endrep + +add r0q, 16 +add r1q, 16 +jmp .loopL + +.next: +mov r6d, r3d +and r6d, 15 + +sub r6, r3 +add r6, r2 +add r0, r6 +add r1, r6 + +dec r4d +jnz .loopH + +; sum to global buffer +mov r1, r5m +mov r0, r6m + +; s_eoTable = {1, 2, 0, 3, 4} +movzx r5d, word [rsp + 0 * 2] +add [r0 + 1 * 4], r5d +movzx r6d, word [rsp + 1 * 2] +add [r0 + 2 * 4], r6d +movzx r5d, word [rsp + 2 * 2] +add [r0 + 0 * 4], r5d +movzx r6d, word [rsp + 3 * 2] +add [r0 + 3 * 4], r6d +movzx r5d, word [rsp + 4 * 2] +add [r0 + 4 * 4], r5d + +mov r6d, [rsp + 5 * 2 + 0 * 4] +add [r1 + 1 * 4], r6d +mov r5d, [rsp + 5 * 2 + 1 * 4] +add [r1 + 2 * 4], r5d +mov r6d, [rsp + 5 * 2 + 2 * 4] +add [r1 + 0 * 4], r6d +mov r5d, [rsp + 5 * 2 + 3 * 4] +add [r1 + 3 * 4], r5d +mov r6d, [rsp + 5 * 2 + 4 * 4] +add [r1 + 4 * 4], r6d +RET +%endif diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.h --- a/source/common/x86/loopfilter.hTue Jul 07 11:14:35 2015 +0530 +++ b/source/common/x86/loopfilter.hTue Jul 07 12:17:08 2015 +0530 @@ -36,6 +36,7 @@ void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \ void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \ void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \ +void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \ void PFX
[x265] [PATCH] asm: fix 32-bit build error- undefined symbol r7d, r8d
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1436183156 -19800 # Mon Jul 06 17:15:56 2015 +0530 # Node ID 45e56ef3de405a3f9c6451b46b876e3dc46aac38 # Parent bf57ce5d38d5208a491bf4192e389ab1eb4a4f32 asm: fix 32-bit build error- undefined symbol r7d, r8d diff -r bf57ce5d38d5 -r 45e56ef3de40 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Jul 03 19:50:02 2015 +0530 +++ b/source/common/x86/pixel-util8.asm Mon Jul 06 17:15:56 2015 +0530 @@ -1677,6 +1677,7 @@ %endif +%if ARCH_X86_64 == 1 %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal weight_sp, 6,7,9 @@ -1872,6 +1873,7 @@ jnz .loopH RET %endif +%endif ;- ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Compiling 8-bit Win32 target fails: 64 registers r7/r8 used
sent a fix patch. Yes, it was caused by %ARCH_X86_64 removal. On Mon, Jul 6, 2015 at 5:20 PM, Mario *LigH* Rohkrämer cont...@ligh.de wrote: Possibly after a line with a check %if ARCH_X86_64 was removed? Win32 non-HBD still allows ASM. + [ 8%] Building ASM_YASM object common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1801: error: undefined symbol `r7d' (first use) h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1801: error: (Each undefined symbol is reported only once.) h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1812: error: undefined symbol `r7' (first use) h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1813: warning: `r8' is a register in 64-bit mode h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1813: error: undefined symbol `r8' (first use) h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1819: warning: `r8' is a register in 64-bit mode h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1869: warning: `r8' is a register in 64-bit mode h:/MSYS-GCC482/home/Entwicklung/x265/source/common/x86/pixel-util8.asm:1869: warning: `r8' is a register in 64-bit mode make[2]: *** [common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj] Error 1 + -- Fun and success! Mario *LigH* Rohkrämer mailto:cont...@ligh.de ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 3] sao: created new primitive for saoCuStatsBO
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435749680 -19800 # Wed Jul 01 16:51:20 2015 +0530 # Node ID 9fd6c4bca7695f847ff9a28a065122b840ecae5a # Parent 915d02816797d3c70004e652a13b3804571c251b sao: created new primitive for saoCuStatsBO diff -r 915d02816797 -r 9fd6c4bca769 source/common/primitives.h --- a/source/common/primitives.hWed Jul 01 16:50:32 2015 +0530 +++ b/source/common/primitives.hWed Jul 01 16:51:20 2015 +0530 @@ -174,6 +174,7 @@ typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX); typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); +typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count); @@ -299,6 +300,7 @@ saoCuOrgE3_t saoCuOrgE3[2]; saoCuOrgB0_t saoCuOrgB0; +saoCuStatsBO_tsaoCuStatsBO; saoCuStatsE0_tsaoCuStatsE0; saoCuStatsE1_tsaoCuStatsE1; saoCuStatsE2_tsaoCuStatsE2; diff -r 915d02816797 -r 9fd6c4bca769 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Jul 01 16:50:32 2015 +0530 +++ b/source/encoder/sao.cppWed Jul 01 16:51:20 2015 +0530 @@ -666,7 +666,6 @@ /* Calculate SAO statistics for current CTU without non-crossing slice */ void SAO::calcSaoStatsCu(int addr, int plane) { -int x, y; const CUData* cu = m_frame-m_encData-getPicCTU(addr); const pixel* fenc0 = m_frame-m_fencPic-getPlaneAddr(plane, addr); const pixel* rec0 = 
m_frame-m_reconPic-getPlaneAddr(plane, addr); @@ -697,8 +696,6 @@ int startY; int endX; int endY; -int32_t* stats; -int32_t* count; int skipB = plane ? 2 : 4; int skipR = plane ? 3 : 5; @@ -708,34 +705,16 @@ // SAO_BO: { -const int boShift = X265_DEPTH - SAO_BO_BITS; - if (m_param-bSaoNonDeblocked) { skipB = plane ? 1 : 3; skipR = plane ? 2 : 4; } -stats = m_offsetOrg[plane][SAO_BO]; -count = m_count[plane][SAO_BO]; - -fenc = fenc0; -rec = rec0; endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR; endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB; -for (y = 0; y endY; y++) -{ -for (x = 0; x endX; x++) -{ -int classIdx = 1 + (rec[x] boShift); -stats[classIdx] += (fenc[x] - rec[x]); -count[classIdx]++; -} - -fenc += stride; -rec += stride; -} +primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]); } { @@ -785,8 +764,6 @@ skipB = plane ? 2 : 4; skipR = plane ? 3 : 5; } -stats = m_offsetOrg[plane][SAO_EO_2]; -count = m_count[plane][SAO_EO_2]; fenc = fenc0; rec = rec0; @@ -814,8 +791,6 @@ skipB = plane ? 2 : 4; skipR = plane ? 
3 : 5; } -stats = m_offsetOrg[plane][SAO_EO_3]; -count = m_count[plane][SAO_EO_3]; fenc = fenc0; rec = rec0; @@ -1552,6 +1527,25 @@ } // NOTE: must put in namespace X265_NS since we need class SAO +void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +{ +int x, y; +const int boShift = X265_DEPTH - SAO_BO_BITS; + +for (y = 0; y endY; y++) +{ +for (x = 0; x endX; x++) +{ +int classIdx = 1 + (rec[x] boShift); +stats[classIdx] += (fenc[x] - rec[x]); +count[classIdx]++; +} + +fenc += stride; +rec += stride; +} +} + void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) { int x, y; @@ -1702,6 +1696,7 @@ void setupSaoPrimitives_c(EncoderPrimitives p) { // TODO: move other sao functions to here +p.saoCuStatsBO = saoCuStatsBO_c; p.saoCuStatsE0 = saoCuStatsE0_c; p.saoCuStatsE1 = saoCuStatsE1_c; p.saoCuStatsE2 = saoCuStatsE2_c; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo
[x265] [PATCH 2 of 3] sao: created new primitive for saoCuStatsE0
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435749632 -19800 # Wed Jul 01 16:50:32 2015 +0530 # Node ID 915d02816797d3c70004e652a13b3804571c251b # Parent 18151ada638dd19843551e2a6d5d8b2cc9bd28be sao: created new primitive for saoCuStatsE0 diff -r 18151ada638d -r 915d02816797 source/common/primitives.h --- a/source/common/primitives.hWed Jul 01 16:49:24 2015 +0530 +++ b/source/common/primitives.hWed Jul 01 16:50:32 2015 +0530 @@ -174,6 +174,7 @@ typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX); typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); +typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); @@ -298,6 +299,7 @@ saoCuOrgE3_t saoCuOrgE3[2]; saoCuOrgB0_t saoCuOrgB0; +saoCuStatsE0_tsaoCuStatsE0; saoCuStatsE1_tsaoCuStatsE1; saoCuStatsE2_tsaoCuStatsE2; saoCuStatsE3_tsaoCuStatsE3; diff -r 18151ada638d -r 915d02816797 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Jul 01 16:49:24 2015 +0530 +++ b/source/encoder/sao.cppWed Jul 01 16:50:32 2015 +0530 @@ -706,11 +706,6 @@ int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; -// Dynamic Range: 64x64x14bpp = 24bits -int32_t tmp_stats[NUM_EDGETYPE]; -// TODO: improve by uint64_t, but need Haswell SHLX -uint16_t 
tmp_count[NUM_EDGETYPE]; - // SAO_BO: { const int boShift = X265_DEPTH - SAO_BO_BITS; @@ -752,41 +747,10 @@ skipR = plane ? 3 : 5; } -fenc = fenc0; -rec = rec0; - startX = !lpelx; endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR; -memset(tmp_stats, 0, sizeof(tmp_stats)); -memset(tmp_count, 0, sizeof(tmp_count)); - -for (y = 0; y ctuHeight - skipB; y++) -{ -int signLeft = signOf(rec[startX] - rec[startX - 1]); -for (x = startX; x endX; x++) -{ -int signRight = signOf2(rec[x], rec[x + 1]); -X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), signDown check failure\n); -uint32_t edgeType = signRight + signLeft + 2; -signLeft = -signRight; - -X265_CHECK(edgeType = 4, edgeType check failure\n); -tmp_stats[edgeType] += (fenc[x] - rec[x]); -tmp_count[edgeType]++; -} - -fenc += stride; -rec += stride; -} - -stats = m_offsetOrg[plane][SAO_EO_0]; -count = m_count[plane][SAO_EO_0]; -for (x = 0; x NUM_EDGETYPE; x++) -{ -stats[s_eoTable[x]] += tmp_stats[x]; -count[s_eoTable[x]] += tmp_count[x]; -} +primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]); } // SAO_EO_1: // dir: | @@ -1588,6 +1552,41 @@ } // NOTE: must put in namespace X265_NS since we need class SAO +void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +{ +int x, y; +int32_t tmp_stats[SAO::NUM_EDGETYPE]; +int32_t tmp_count[SAO::NUM_EDGETYPE]; + +memset(tmp_stats, 0, sizeof(tmp_stats)); +memset(tmp_count, 0, sizeof(tmp_count)); + +for (y = 0; y endY; y++) +{ +int signLeft = signOf(rec[0] - rec[-1]); +for (x = 0; x endX; x++) +{ +int signRight = signOf2(rec[x], rec[x + 1]); +X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), signDown check failure\n); +uint32_t edgeType = signRight + signLeft + 2; +signLeft = -signRight; + +X265_CHECK(edgeType = 4, edgeType check failure\n); +tmp_stats[edgeType] += (fenc[x] - rec[x]); 
+tmp_count[edgeType]++; +} + +fenc += stride; +rec += stride; +} + +for (x = 0; x SAO::NUM_EDGETYPE; x++) +{ +stats[SAO::s_eoTable[x]] += tmp_stats[x
[x265] [PATCH 1 of 3] sao: created new primitive for saoCuStatsE1
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435749564 -19800 # Wed Jul 01 16:49:24 2015 +0530 # Node ID 18151ada638dd19843551e2a6d5d8b2cc9bd28be # Parent 76a314f91799c2dce6878c389503d2fe9007dbe8 sao: created new primitive for saoCuStatsE1 diff -r 76a314f91799 -r 18151ada638d source/common/primitives.h --- a/source/common/primitives.hWed Jul 01 17:05:52 2015 -0700 +++ b/source/common/primitives.hWed Jul 01 16:49:24 2015 +0530 @@ -174,6 +174,7 @@ typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX); typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); +typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count); typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); @@ -297,6 +298,7 @@ saoCuOrgE3_t saoCuOrgE3[2]; saoCuOrgB0_t saoCuOrgB0; +saoCuStatsE1_tsaoCuStatsE1; saoCuStatsE2_tsaoCuStatsE2; saoCuStatsE3_tsaoCuStatsE3; diff -r 76a314f91799 -r 18151ada638d source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Jul 01 17:05:52 2015 -0700 +++ b/source/encoder/sao.cppWed Jul 01 16:49:24 2015 +0530 @@ -811,33 +811,7 @@ primitives.sign(upBuff1, rec, rec[- stride], ctuWidth); -memset(tmp_stats, 0, sizeof(tmp_stats)); -memset(tmp_count, 0, sizeof(tmp_count)); - -for (y = startY; y endY; y++) -{ -for (x = 0; x endX; x++) -{ -int signDown = signOf2(rec[x], rec[x + stride]); -X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), signDown check failure\n); -uint32_t edgeType = signDown + upBuff1[x] + 2; -upBuff1[x] = (int8_t)(-signDown); - 
-tmp_stats[edgeType] += (fenc[x] - rec[x]); -tmp_count[edgeType]++; -} - -fenc += stride; -rec += stride; -} - -stats = m_offsetOrg[plane][SAO_EO_1]; -count = m_count[plane][SAO_EO_1]; -for (x = 0; x NUM_EDGETYPE; x++) -{ -stats[s_eoTable[x]] += tmp_stats[x]; -count[s_eoTable[x]] += tmp_count[x]; -} +primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]); } // SAO_EO_2: // dir: 135 @@ -1614,6 +1588,41 @@ } // NOTE: must put in namespace X265_NS since we need class SAO +void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count) +{ +X265_CHECK(endX = MAX_CU_SIZE, endX check failure\n); +X265_CHECK(endY = MAX_CU_SIZE, endY check failure\n); + +int x, y; +int32_t tmp_stats[SAO::NUM_EDGETYPE]; +int32_t tmp_count[SAO::NUM_EDGETYPE]; + +memset(tmp_stats, 0, sizeof(tmp_stats)); +memset(tmp_count, 0, sizeof(tmp_count)); + +for (y = 0; y endY; y++) +{ +for (x = 0; x endX; x++) +{ +int signDown = signOf2(rec[x], rec[x + stride]); +X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), signDown check failure\n); +uint32_t edgeType = signDown + upBuff1[x] + 2; +upBuff1[x] = (int8_t)(-signDown); + +tmp_stats[edgeType] += (fenc[x] - rec[x]); +tmp_count[edgeType]++; +} +fenc += stride; +rec += stride; +} + +for (x = 0; x SAO::NUM_EDGETYPE; x++) +{ +stats[SAO::s_eoTable[x]] += tmp_stats[x]; +count[SAO::s_eoTable[x]] += tmp_count[x]; +} +} + void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count) { X265_CHECK(endX MAX_CU_SIZE, endX check failure\n); @@ -1694,6 +1703,7 @@ void setupSaoPrimitives_c(EncoderPrimitives p) { // TODO: move other sao functions to here +p.saoCuStatsE1 = saoCuStatsE1_c; p.saoCuStatsE2 = saoCuStatsE2_c; p.saoCuStatsE3 = saoCuStatsE3_c; } ___ x265-devel mailing 
list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435663360 -19800 # Tue Jun 30 16:52:40 2015 +0530 # Node ID 9340454d3b551f57ba9ce6a3f77fade041975e62 # Parent b1301944894051b9641006797e4d6253b277f3e4 asm: intra_filter4x4 avx2 code, improved 8bit: 141c-118c, 10bit: 121c-88c diff -r b13019448940 -r 9340454d3b55 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 29 17:19:07 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jun 30 16:52:40 2015 +0530 @@ -1290,6 +1290,8 @@ } if (cpuMask X265_CPU_AVX2) { +p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); + p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2); p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2); p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2); @@ -2619,6 +2621,8 @@ #if X86_64 if (cpuMask X265_CPU_AVX2) { +p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2); + p.planecopy_sp = PFX(downShift_16_avx2); p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2); diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530 +++ b/source/common/x86/intrapred16.asm Tue Jun 30 16:52:40 2015 +0530 @@ -77,6 +77,7 @@ intra_filter4_shuf0:db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 intra_filter4_shuf1:db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf2:times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ;; (blkSize - 1 - x) pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0 @@ -22047,3 +22048,29 @@ mov [r1 + 128], r2w ; topLast mov [r1 + 256], r3w ; LeftLast RET + +INIT_YMM avx2 +cglobal intra_filter_4x4, 2,4,4 +mov r2w, word [r0 + 16] ; topLast +mov r3w, word [r0 + 32] ; LeftLast + +; filtering top +movum0, [r0] +vpbroadcastwm2, xm0 +movum1, [r0 + 16] + +palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] +pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1] +palignr m1, m0, 4 ; [9 8 7 
6 5 4 3 2] +palignr m1, m1, 14 ; [9 8 7 6 5 4 3 2] + +psllw m0, 1 +paddw m3, m1 +paddw m0, m3 +paddw m0, [pw_2] +psrlw m0, 2 + +movu[r1], m0 +mov [r1 + 16], r2w ; topLast +mov [r1 + 32], r3w ; LeftLast +RET diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Mon Jun 29 17:19:07 2015 +0530 +++ b/source/common/x86/intrapred8.asm Tue Jun 30 16:52:40 2015 +0530 @@ -30,8 +30,9 @@ intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 -intra_filter4_shuf1: db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 +intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 +intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_0_8times 8 db 0, 8 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 @@ -18690,3 +18691,32 @@ mov [r1 + 64], r2b ; topLast mov [r1 + 128], r3b ; LeftLast RET + +INIT_YMM avx2 +cglobal intra_filter_4x4, 2,4,4 +mov r2b, byte [r0 + 8] ; topLast +mov r3b, byte [r0 + 16] ; LeftLast + +; filtering top +pmovzxbwm0, [r0] +vpbroadcastwm2, xm0 +pmovzxbwm1, [r0 + 8] + +palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] +pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1] +palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] +palignr m1, m1, 14 ; [9 8 7 6 5 4 3 2] + +psllw m0, 1 +paddw m3, m1 +paddw m0, m3 +paddw m0, [pw_2] +psrlw m0, 2 + +packuswbm0, m0 +vpermq m0, m0, 10001000b + +movu[r1], xm0 +mov [r1 + 8], r2b ; topLast +mov [r1 + 16], r3b ; LeftLast +RET
[x265] [PATCH] asm: intra_filter 10bpp sse4 code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435578547 -19800 # Mon Jun 29 17:19:07 2015 +0530 # Node ID 60832369ebb4e1014b4080b27a0401f97af93958 # Parent 9feee64efa440c25f016d15ae982789e5393a77e asm: intra_filter 10bpp sse4 code Performance improved over C code: intra_filter_32x32 7.46x525.64 3922.56 intra_filter_16x16 6.53x289.11 1886.86 intra_filter_8x8 5.60x170.75 956.81 intra_filter_4x4 3.05x121.20 369.74 diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 15:29:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 29 17:19:07 2015 +0530 @@ -1120,6 +1120,11 @@ ALL_LUMA_PU(satd, pixel_satd, sse4); ASSIGN_SA8D(sse4); +p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4); +p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4); +p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4); +p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4); + ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); INTRA_ANG_SSE4_COMMON(sse4); diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Fri Jun 26 15:29:51 2015 +0530 +++ b/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530 @@ -75,6 +75,9 @@ const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1 +intra_filter4_shuf0:db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf1:db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 + ;; (blkSize - 1 - x) pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0 @@ -21634,3 +21637,413 @@ decr4 jnz.loop RET + +;--- +; void intra_filter_NxN(const pixel* references, pixel* filtered) +;--- +INIT_XMM sse4 +cglobal intra_filter_4x4, 2,4,5 +mov r2w, word [r0 + 16] ; topLast +mov r3w, word [r0 + 32] ; LeftLast + +; filtering 
top +movum0, [r0 + 0] +movum1, [r0 + 16] +movum2, [r0 + 32] + +pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] +palignr m3, m1, m0, 4 +pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1] + +psllw m0, 1 +paddw m4, m3 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +; filtering left +palignr m4, m1, m1, 14 +pinsrw m4, [r0], 1 +palignr m3, m2, m1, 4 +pshufb m3, [intra_filter4_shuf1] + +psllw m1, 1 +paddw m4, m3 +paddw m1, m4 +paddw m1, [pw_2] +psrlw m1, 2 + +movu[r1], m0 +movu[r1 + 16], m1 +mov [r1 + 16], r2w ; topLast +mov [r1 + 32], r3w ; LeftLast +RET + +INIT_XMM sse4 +cglobal intra_filter_8x8, 2,4,6 +mov r2w, word [r0 + 32] ; topLast +mov r3w, word [r0 + 64] ; LeftLast + +; filtering top +movum0, [r0] +movum1, [r0 + 16] +movum2, [r0 + 32] + +pshufb m4, m0, [intra_filter4_shuf0] +palignr m5, m1, m0, 2 +pinsrw m5, [r0 + 34], 0 + +palignr m3, m1, m0, 14 +psllw m0, 1 +paddw m4, m5 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +palignr m4, m2, m1, 2 +psllw m1, 1 +paddw m4, m3 +paddw m1, m4 +paddw m1, [pw_2] +psrlw m1, 2 +movu[r1], m0 +movu[r1 + 16], m1 + +; filtering left +movum1, [r0 + 48] +movum0, [r0 + 64] + +palignr m4, m2, m2, 14 +pinsrw m4, [r0], 1 +palignr m5, m1, m2, 2 + +palignr m3, m1, m2, 14 +palignr m0, m1, 2 + +psllw m2, 1 +paddw m4, m5 +paddw m2, m4 +paddw m2, [pw_2] +psrlw m2, 2 + +psllw m1, 1 +paddw m0, m3 +paddw m1, m0 +paddw m1, [pw_2] +psrlw m1, 2 + +movu[r1 + 32], m2 +movu
[x265] [PATCH] asm: fix gcc build error, invalid size for operand 1
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435307390 -19800 # Fri Jun 26 13:59:50 2015 +0530 # Node ID 504a42904fab2a43e4d8b5b65513db7a7dd30ee1 # Parent 1e5c4d155ab85e8e8dd199bb3515801766ea9e88 asm: fix gcc build error, invalid size for operand 1 diff -r 1e5c4d155ab8 -r 504a42904fab source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Jun 25 13:42:29 2015 +0530 +++ b/source/common/x86/loopfilter.asm Fri Jun 26 13:59:50 2015 +0530 @@ -246,7 +246,7 @@ movd xm1, r1d vinserti128 m0, m0, xm1, 1 mova m5, [pw_1023] -mov r1, r4m +mov r1d, r4m add r1d, r1d shr r2d, 4 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 4] asm: intra_filter8x8 sse4 code, improved 990c->201c over C code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435323520 -19800 # Fri Jun 26 18:28:40 2015 +0530 # Node ID 93c31f8b404708cd39d00b85a07b2418794fc103 # Parent 44b574b61b29a3cfba99e8f0d06622e44a86df17 asm: intra_filter8x8 sse4 code, improved 990c-201c over C code diff -r 44b574b61b29 -r 93c31f8b4047 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 18:21:07 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 18:28:40 2015 +0530 @@ -2454,6 +2454,7 @@ p.weight_sp = PFX(weight_sp_sse4); p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4); +p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4); ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r 44b574b61b29 -r 93c31f8b4047 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Jun 26 18:21:07 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Jun 26 18:28:40 2015 +0530 @@ -18320,3 +18320,63 @@ mov [r1 + 8], r2b ; topLast mov [r1 + 16], r3b ; LeftLast RET + +INIT_XMM sse4 +cglobal intra_filter_8x8, 2,4,6 +mov r2b, byte [r0 + 16] ; topLast +mov r3b, byte [r0 + 32] ; LeftLast + +; filtering top +pmovzxbwm0, [r0 + 0] +pmovzxbwm1, [r0 + 8] +pmovzxbwm2, [r0 + 16] + +pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] +palignr m5, m1, m0, 2 +pinsrb m5, [r0 + 17], 0; [8 7 6 5 4 3 2 9] samples[i + 1] + +palignr m3, m1, m0, 14 +psllw m0, 1 +paddw m4, m5 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +palignr m4, m2, m1, 2 +psllw m1, 1 +paddw m4, m3 +paddw m1, m4 +paddw m1, [pw_2] +psrlw m1, 2 + +packuswbm0, m1 +movu[r1], m0 + +; filtering left +pmovzxbwm1, [r0 + 24] +pmovzxbwm0, [r0 + 32] + +palignr m4, m2, m2, 14 +pinsrb m4, [r0], 2 +palignr m5, m1, m2, 2 + +palignr m3, m1, m2, 14 +palignr m0, m1, 2 + +psllw m2, 1 +paddw m4, m5 +paddw m2, m4 +paddw m2, [pw_2] +psrlw m2, 2 + +psllw m1, 1 +paddw m0, m3 +paddw m1, m0 
+paddw m1, [pw_2] +psrlw m1, 2 + +packuswbm2, m1 +movu[r1 + 16], m2 +mov [r1 + 16], r2b ; topLast +mov [r1 + 32], r3b ; LeftLast +RET ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 4] asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435323067 -19800 # Fri Jun 26 18:21:07 2015 +0530 # Node ID 44b574b61b29a3cfba99e8f0d06622e44a86df17 # Parent d64227e54233d1646c55bcb4b0b831e5340009ed asm: intra_filter4x4 sse4 code and added testbench support, improved 357c-141c over C code diff -r d64227e54233 -r 44b574b61b29 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 18:21:07 2015 +0530 @@ -2453,6 +2453,8 @@ p.weight_pp = PFX(weight_pp_sse4); p.weight_sp = PFX(weight_sp_sse4); +p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4); + ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4); diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/intrapred.h Fri Jun 26 18:21:07 2015 +0530 @@ -66,6 +66,7 @@ #define DECL_ALL(cpu) \ FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \ +FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \ DECL_ANGS(4, cpu); \ DECL_ANGS(8, cpu); \ DECL_ANGS(16, cpu); \ diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Jun 26 18:21:07 2015 +0530 @@ -30,6 +30,9 @@ intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 +intra_filter4_shuf1: db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 + pb_0_8times 8 db 0, 8 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 pb_swap8: times 2 db 7, 6, 
5, 4, 3, 2, 1, 0 @@ -18276,3 +18279,44 @@ INTRA_PRED_STORE_4x4 RET + +;--- +; void intra_filter_NxN(const pixel* references, pixel* filtered) +;--- +INIT_XMM sse4 +cglobal intra_filter_4x4, 2,4,5 +mov r2b, byte [r0 + 8] ; topLast +mov r3b, byte [r0 + 16] ; LeftLast + +; filtering top +pmovzxbwm0, [r0 + 0] +pmovzxbwm1, [r0 + 8] +pmovzxbwm2, [r0 + 16] + +pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] +palignr m3, m1, m0, 4 +pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1] + +psllw m0, 1 +paddw m4, m3 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +; filtering left +palignr m4, m1, m1, 14 ; [14 13 12 11 10 9 8 15] samples[i - 1] +pinsrb m4, [r0], 2 ; [14 13 12 11 10 9 0 15] samples[i + 1] +palignr m3, m2, m1, 4 +pshufb m3, [intra_filter4_shuf1] + +psllw m1, 1 +paddw m4, m3 +paddw m1, m4 +paddw m1, [pw_2] +psrlw m1, 2 +packuswbm0, m1 + +movu[r1], m0 +mov [r1 + 8], r2b ; topLast +mov [r1 + 16], r3b ; LeftLast +RET diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.cpp --- a/source/test/intrapredharness.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/test/intrapredharness.cpp Fri Jun 26 18:21:07 2015 +0530 @@ -31,6 +31,16 @@ { for (int i = 0; i INPUT_SIZE; i++) pixel_buff[i] = rand() % PIXEL_MAX; + +/* [0] --- Random values + * [1] --- Minimum + * [2] --- Maximum */ +for (int i = 0; i BUFFSIZE; i++) +{ +pixel_test_buff[0][i] = rand() % PIXEL_MAX; +pixel_test_buff[1][i] = PIXEL_MIN; +pixel_test_buff[2][i] = PIXEL_MAX; +} } bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width) @@ -177,6 +187,27 @@ return true; } +bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt) +{ +memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel)); +memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel)); +int j = 0; + +for (int i = 0; i 100; i++) +{ +int index = rand() % TEST_CASES; + +ref(pixel_test_buff[index] + j, pixel_out_c); +checked(opt, pixel_test_buff[index] 
+ j, pixel_out_vec); + +if (memcmp(pixel_out_c
[x265] [PATCH 4 of 4] asm: intra_filter32x32 sse4 code, improved 4050c->652c over C code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435323958 -19800 # Fri Jun 26 18:35:58 2015 +0530 # Node ID e04bde60af516f6f016e3e6f37d5d64e97e589f3 # Parent 1995a55f1320a029fb423f23cbfd24555c258d09 asm: intra_filter32x32 sse4 code, improved 4050c-652c over C code diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 18:32:00 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 18:35:58 2015 +0530 @@ -2456,6 +2456,7 @@ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4); p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4); p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4); +p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4); ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Jun 26 18:32:00 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Jun 26 18:35:58 2015 +0530 @@ -18485,3 +18485,208 @@ mov [r1 + 32], r2b ; topLast mov [r1 + 64], r3b ; LeftLast RET + +INIT_XMM sse4 +cglobal intra_filter_32x32, 2,4,6 +mov r2b, byte [r0 + 64]; topLast +mov r3b, byte [r0 + 128]; LeftLast + +; filtering top +; 0 to 15 +pmovzxbwm0, [r0 + 0] +pmovzxbwm1, [r0 + 8] +pmovzxbwm2, [r0 + 16] + +pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] +palignr m5, m1, m0, 2 +pinsrb m5, [r0 + 65], 0; [8 7 6 5 4 3 2 9] samples[i + 1] + +palignr m3, m1, m0, 14 +psllw m0, 1 +paddw m4, m5 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +palignr m4, m2, m1, 2 +psllw m5, m1, 1 +paddw m4, m3 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm0, m5 +movu[r1], m0 + +; 16 to 31 +pmovzxbwm0, [r0 + 24] +pmovzxbwm5, [r0 + 32] + +palignr m3, m2, m1, 14 +palignr m4, m0, m2, 2 + +psllw m1, m2, 1 +paddw m3, m4 +paddw m1, m3 +paddw m1, [pw_2] +psrlw m1, 2 
+ +palignr m3, m0, m2, 14 +palignr m4, m5, m0, 2 + +psllw m2, m0, 1 +paddw m4, m3 +paddw m2, m4 +paddw m2, [pw_2] +psrlw m2, 2 +packuswbm1, m2 +movu[r1 + 16], m1 + +; 32 to 47 +pmovzxbwm1, [r0 + 40] +pmovzxbwm2, [r0 + 48] + +palignr m3, m5, m0, 14 +palignr m4, m1, m5, 2 + +psllw m0, m5, 1 +paddw m3, m4 +paddw m0, m3 +paddw m0, [pw_2] +psrlw m0, 2 + +palignr m3, m1, m5, 14 +palignr m4, m2, m1, 2 + +psllw m5, m1, 1 +paddw m4, m3 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm0, m5 +movu[r1 + 32], m0 + +; 48 to 63 +pmovzxbwm0, [r0 + 56] +pmovzxbwm5, [r0 + 64] + +palignr m3, m2, m1, 14 +palignr m4, m0, m2, 2 + +psllw m1, m2, 1 +paddw m3, m4 +paddw m1, m3 +paddw m1, [pw_2] +psrlw m1, 2 + +palignr m3, m0, m2, 14 +palignr m4, m5, m0, 2 + +psllw m0, 1 +paddw m4, m3 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 +packuswbm1, m0 +movu[r1 + 48], m1 + +; filtering left +; 64 to 79 +pmovzxbwm1, [r0 + 72] +pmovzxbwm2, [r0 + 80] + +palignr m4, m5, m5, 14 +pinsrb m4, [r0], 2 +palignr m0, m1, m5, 2 + +psllw m3, m5, 1 +paddw m4, m0 +paddw m3, m4 +paddw m3, [pw_2] +psrlw m3, 2 + +palignr m0, m1, m5, 14 +palignr m4, m2, m1, 2 + +psllw m5, m1, 1 +paddw m4, m0 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm3, m5 +movu[r1 + 64], m3 + +; 80 to 95 +pmovzxbwm5, [r0 + 88] +pmovzxbwm0, [r0 + 96] + +palignr m3, m2, m1, 14 +palignr m4, m5, m2, 2 + +psllw m1, m2, 1 +paddw m3, m4 +paddw m1, m3 +paddw
[x265] [PATCH 0 of 4 ] asm code and testbench support for intra_filter primitive
intra_filter_4x4    2.52x    141.82    357.20
intra_filter_8x8    4.79x    198.79    951.41
intra_filter_16x16  5.56x    351.03    1952.17
intra_filter_32x32  6.20x    652.82    4050.76
___
x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 4] asm: intra_filter16x16 sse4 code, improved 1952c->351c over C code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435323720 -19800 # Fri Jun 26 18:32:00 2015 +0530 # Node ID 1995a55f1320a029fb423f23cbfd24555c258d09 # Parent 93c31f8b404708cd39d00b85a07b2418794fc103 asm: intra_filter16x16 sse4 code, improved 1952c-351c over C code diff -r 93c31f8b4047 -r 1995a55f1320 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 18:28:40 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 18:32:00 2015 +0530 @@ -2455,6 +2455,7 @@ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4); p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4); +p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4); ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4); diff -r 93c31f8b4047 -r 1995a55f1320 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Fri Jun 26 18:28:40 2015 +0530 +++ b/source/common/x86/intrapred8.asm Fri Jun 26 18:32:00 2015 +0530 @@ -18380,3 +18380,108 @@ mov [r1 + 16], r2b ; topLast mov [r1 + 32], r3b ; LeftLast RET + +INIT_XMM sse4 +cglobal intra_filter_16x16, 2,4,6 +mov r2b, byte [r0 + 32] ; topLast +mov r3b, byte [r0 + 64] ; LeftLast + +; filtering top +pmovzxbwm0, [r0 + 0] +pmovzxbwm1, [r0 + 8] +pmovzxbwm2, [r0 + 16] + +pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1] +palignr m5, m1, m0, 2 +pinsrb m5, [r0 + 33], 0; [8 7 6 5 4 3 2 9] samples[i + 1] + +palignr m3, m1, m0, 14 +psllw m0, 1 +paddw m4, m5 +paddw m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 + +palignr m4, m2, m1, 2 +psllw m5, m1, 1 +paddw m4, m3 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm0, m5 +movu[r1], m0 + +pmovzxbwm0, [r0 + 24] +pmovzxbwm5, [r0 + 32] + +palignr m3, m2, m1, 14 +palignr m4, m0, m2, 2 + +psllw m1, m2, 1 +paddw m3, m4 +paddw m1, m3 +paddw m1, [pw_2] +psrlw m1, 2 + +palignr m3, m0, m2, 14 +palignr m4, m5, m0, 2 + +psllw m0, 1 +paddw m4, m3 +paddw 
m0, m4 +paddw m0, [pw_2] +psrlw m0, 2 +packuswbm1, m0 +movu[r1 + 16], m1 + +; filtering left +pmovzxbwm1, [r0 + 40] +pmovzxbwm2, [r0 + 48] + +palignr m4, m5, m5, 14 +pinsrb m4, [r0], 2 +palignr m0, m1, m5, 2 + +psllw m3, m5, 1 +paddw m4, m0 +paddw m3, m4 +paddw m3, [pw_2] +psrlw m3, 2 + +palignr m0, m1, m5, 14 +palignr m4, m2, m1, 2 + +psllw m5, m1, 1 +paddw m4, m0 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm3, m5 +movu[r1 + 32], m3 + +pmovzxbwm5, [r0 + 56] +pmovzxbwm0, [r0 + 64] + +palignr m3, m2, m1, 14 +palignr m4, m5, m2, 2 + +psllw m1, m2, 1 +paddw m3, m4 +paddw m1, m3 +paddw m1, [pw_2] +psrlw m1, 2 + +palignr m3, m5, m2, 14 +palignr m4, m0, m5, 2 + +psllw m5, 1 +paddw m4, m3 +paddw m5, m4 +paddw m5, [pw_2] +psrlw m5, 2 +packuswbm1, m5 +movu[r1 + 48], m1 + +mov [r1 + 32], r2b ; topLast +mov [r1 + 64], r3b ; LeftLast +RET ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 6] asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435212794 -19800 # Thu Jun 25 11:43:14 2015 +0530 # Node ID faec09e1ab60531924f2d919d4f283fa91bfec81 # Parent b1af4c36f48a4500a4912373ebcda9a5540b5c15 asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c-690c over SSE diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jun 24 10:36:15 2015 -0500 +++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:43:14 2015 +0530 @@ -1284,6 +1284,8 @@ } if (cpuMask X265_CPU_AVX2) { +p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2); + p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2); p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2); p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2); diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Jun 24 10:36:15 2015 -0500 +++ b/source/common/x86/const-a.asm Thu Jun 25 11:43:14 2015 +0530 @@ -41,7 +41,7 @@ const pb_16,times 32 db 16 const pb_32,times 32 db 32 const pb_64,times 32 db 64 -const pb_128, times 16 db 128 +const pb_128, times 32 db 128 const pb_a1,times 16 db 0xa1 const pb_01,times 8 db 0, 1 diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Jun 24 10:36:15 2015 -0500 +++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:43:14 2015 +0530 @@ -235,6 +235,67 @@ %endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE0, 4,4,9 +vbroadcasti128 m6, [r1] +movzx r1d, byte [r3] +neg r1b +movdxm0, r1d +movzx r1d, byte [r3 + 1] +neg r1b +movdxm1, r1d +vinserti128 m0, m0, xm1, 1 +movam5, [pw_1023] +mov r1, r4m +add r1d, r1d +shr r2d, 4 + +.loop: +movum7, [r0] +movum8, [r0 + r1] +movum2, [r0 + 2] +movum1, [r0 + r1 + 2] + +pcmpgtw m3, m7, m2 +pcmpgtw m2, m7 +pcmpgtw m4, m8, m1 +pcmpgtw m1, m8 + +packsswbm3, m4 +packsswbm2, m1 +vpermq m3, m3, 11011000b +vpermq m2, m2, 11011000b + +pandm3, [pb_1] +por m3, 
m2 + +pslldq m2, m3, 1 +por m2, m0 + +psignb m2, [pb_128]; m2 = signLeft +pxorm0, m0 +palignr m0, m3, 15 +paddb m3, m2 +paddb m3, [pb_2] ; m3 = uiEdgeType +pshufb m2, m6, m3 +pmovsxbwm3, xm2 ; offsetEo +vextracti128xm2, m2, 1 +pmovsxbwm2, xm2 +pxorm4, m4 +paddw m7, m3 +paddw m8, m2 +pmaxsw m7, m4 +pmaxsw m8, m4 +pminsw m7, m5 +pminsw m8, m5 +movu[r0], m7 +movu[r0 + r1], m8 + +add r0q, 32 +dec r2d +jnz .loop +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride mov r4d,r4m @@ -287,6 +348,7 @@ sub r2d,16 jnz .loop RET +%endif ;== ; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 4 of 6] asm: 10bpp AVX2 code for saoCuOrgE2
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435213857 -19800 # Thu Jun 25 12:00:57 2015 +0530 # Node ID 8b680fd502e08ec2cab4fff7f5833791bb5bfeef # Parent f43aa44673dcd8e96581c938cf22ad4bbb7657e3 asm: 10bpp AVX2 code for saoCuOrgE2 SAO_EO_2[0] 207c-166 SAO_EO_2[1] 555c-422c diff -r f43aa44673dc -r 8b680fd502e0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 11:54:22 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 12:00:57 2015 +0530 @@ -1287,6 +1287,8 @@ p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2); p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2); p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2); +p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2); +p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2); p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2); p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2); diff -r f43aa44673dc -r 8b680fd502e0 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Jun 25 11:54:22 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Jun 25 12:00:57 2015 +0530 @@ -948,6 +948,55 @@ %endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE2, 6,6,7 +mov r4d, r4m +add r5d, r5d +inc r1 +movqxm4, [r0 + r4 * 2] +movhps xm4, [r1 + r4] +vbroadcasti128 m5, [r3] +movam6, [pw_1023] +.loop +movum1, [r0] +movum3, [r0 + r5 + 2] + +pcmpgtw m2, m1, m3 +pcmpgtw m3, m1 + +packsswbm2, m3 +vpermq m3, m2, 11011101b +vpermq m2, m2, 10001000b + +pandxm2, [pb_1] +por xm2, xm3 + +movuxm3, [r2] + +paddb xm3, xm2 +paddb xm3, [pb_2] +pshufb xm0, xm5, xm3 +pmovsxbwm3, xm0 + +pxorm0, m0 +paddw m1, m3 +pmaxsw m1, m0 +pminsw m1, m6 +movu[r0], m1 + +psubb xm0, xm2 +movu[r1], xm0 + +add r0, 32 +add r1, 16 +add r2, 16 +sub r4, 16 +jg .loop + +movq[r0 + r4 * 2], xm4 +movhps [r1 + r4], xm4 +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth movr4d, r4m movr5d, r5m @@ -987,8 +1036,70 @@ movq [r0 + r4], xm6 movhps [r1 + r4], xm6 RET 
+%endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE2_32, 6,6,8 +mov r4d, r4m +add r5d, r5d +inc r1 +movqxm4, [r0 + r4 * 2] +movhps xm4, [r1 + r4] +vbroadcasti128 m5, [r3] + +.loop +movum1, [r0] +movum7, [r0 + 32] +movum3, [r0 + r5 + 2] +movum6, [r0 + r5 + 34] + +pcmpgtw m2, m1, m3 +pcmpgtw m0, m7, m6 +pcmpgtw m3, m1 +pcmpgtw m6, m7 + +packsswbm2, m0 +packsswbm3, m6 +vpermq m3, m3, 11011000b +vpermq m2, m2, 11011000b + +pandm2, [pb_1] +por m2, m3 + +movum3, [r2] + +paddb m3, m2 +paddb m3, [pb_2] +pshufb m0, m5, m3 + +pmovsxbwm3, xm0 +vextracti128xm0, m0, 1 +pmovsxbwm6, xm0 + +pxorm0, m0 +paddw m1, m3 +paddw m7, m6 +pmaxsw m1, m0 +pmaxsw m7, m0 +pminsw m1, [pw_1023] +pminsw m7, [pw_1023] +movu[r0], m1 +movu[r0 + 32], m7 + +psubb m0, m2 +movu[r1], m0 + +add r0, 64 +add r1, 32 +add r2, 32 +sub r4, 32 +jg .loop + +movq[r0 + r4 * 2], xm4 +movhps [r1 + r4], xm4 +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth mov r4d, r4m mov r5d, r5m @@ -1040,6 +1151,7 @@ movq[r0 + r4], xm6 movhps [r1 + r4], xm6 RET +%endif ;=== ;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 6] asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435213462 -19800 # Thu Jun 25 11:54:22 2015 +0530 # Node ID f43aa44673dcd8e96581c938cf22ad4bbb7657e3 # Parent 31da07b7198ca730bae37577d5053a3337477f7b asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c-614c over SSE diff -r 31da07b7198c -r f43aa44673dc source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 11:49:07 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:54:22 2015 +0530 @@ -1286,6 +1286,7 @@ { p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2); p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2); +p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2); p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2); p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2); diff -r 31da07b7198c -r f43aa44673dc source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Jun 25 11:49:07 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:54:22 2015 +0530 @@ -728,6 +728,62 @@ %endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE1_2Rows, 4,5,8 +add r3d, r3d +mov r4d, r4m +movam4, [pw_1023] +vbroadcasti128 m6, [r2]; m6 = m_iOffsetEo +shr r4d, 4 +.loop +movum7, [r0] +movum5, [r0 + r3] +movum1, [r0 + r3 * 2] + +pcmpgtw m2, m7, m5 +pcmpgtw m3, m5, m7 +pcmpgtw m0, m5, m1 +pcmpgtw m1, m5 + +packsswbm2, m0 +packsswbm3, m1 +vpermq m2, m2, 11011000b +vpermq m3, m3, 11011000b + +pandm2, [pb_1] +por m2, m3 + +movuxm3, [r1] ; m3 = m_iUpBuff1 +pxorm0, m0 +psubb m1, m0, m2 +vinserti128 m3, m3, xm1, 1 +vextracti128[r1], m1, 1 + +paddb m3, m2 +paddb m3, [pb_2] + +pshufb m1, m6, m3 +pmovsxbwm3, xm1 +vextracti128xm1, m1, 1 +pmovsxbwm1, xm1 + +paddw m7, m3 +paddw m5, m1 + +pmaxsw m7, m0 +pmaxsw m5, m0 +pminsw m7, m4 +pminsw m5, m4 + +movu[r0], m7 +movu[r0 + r3], m5 + +add r0, 32 +add r1, 16 +dec r4d +jnz .loop +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth mov r3d,r3m 
mov r4d,r4m @@ -775,6 +831,7 @@ dec r4d jnz .loop RET +%endif ;== ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 6 of 6] asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c->15595c over SSE
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435219949 -19800 # Thu Jun 25 13:42:29 2015 +0530 # Node ID f1ff5636cba3e2b714ceed86261362a53e8c6aca # Parent 85d5582eedd40e4227131bff366235e6dc2b361a asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c-15595c over SSE diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 12:11:45 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 13:42:29 2015 +0530 @@ -1291,6 +1291,7 @@ p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2); p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2); p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2); +p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2); p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2); p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2); diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Jun 25 12:11:45 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Jun 25 13:42:29 2015 +0530 @@ -1643,6 +1643,89 @@ %endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgB0, 5,7,8 +vbroadcasti128 m3, [r1] +vbroadcasti128 m4, [r1 + 16] +add r4d, r4d +lea r1, [r4 * 2] +sub r1d, r2d +sub r1d, r2d +shr r2d, 4 +movam7, [pw_1023] + +mov r6d, r3d +shr r3d, 1 + +.loopH +mov r5d, r2d +.loopW +movum2, [r0] +movum5, [r0 + r4] +psrlw m0, m2, 5 +psrlw m6, m5, 5 +packuswbm0, m6 +vpermq m0, m0, 11011000b +pandm0, [pb_31] ; m0 = [index] + +pshufb m6, m3, m0 +pshufb m1, m4, m0 +pcmpgtb m0, [pb_15] ; m0 = [mask] + +pblendvbm6, m6, m1, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug! 
+ +pmovsxbwm0, xm6 +vextracti128xm6, m6, 1 +pmovsxbwm6, xm6 + +paddw m2, m0 +paddw m5, m6 +pxorm1, m1 +pmaxsw m2, m1 +pmaxsw m5, m1 +pminsw m2, m7 +pminsw m5, m7 + +movu[r0], m2 +movu[r0 + r4], m5 + +add r0, 32 +dec r5d +jnz .loopW + +add r0, r1 +dec r3d +jnz .loopH + +testr6b, 1 +jz .end +xor r1, r1 +.loopW1: +movum2, [r0 + r1] +psrlw m0, m2, 5 +packuswbm0, m0 +vpermq m0, m0, 10001000b +pandm0, [pb_31] ; m0 = [index] + +pshufb m6, m3, m0 +pshufb m1, m4, m0 +pcmpgtb m0, [pb_15] ; m0 = [mask] + +pblendvbm6, m6, m1, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug! +pmovsxbwm0, xm6 ; offset + +paddw m2, m0 +pxorm0, m0 +pmaxsw m2, m0 +pminsw m2, m7 + +movu[r0 + r1], m2 +add r1d, 32 +dec r2d +jnz .loopW1 +.end: +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgB0, 4, 7, 8 mov r3d,r3m @@ -1717,6 +1800,7 @@ jnz .loopW1 .end RET +%endif ; ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 5 of 6] asm: 10bpp AVX2 code for saoCuOrgE3
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1435214505 -19800 # Thu Jun 25 12:11:45 2015 +0530 # Node ID 85d5582eedd40e4227131bff366235e6dc2b361a # Parent 8b680fd502e08ec2cab4fff7f5833791bb5bfeef asm: 10bpp AVX2 code for saoCuOrgE3 SAO_EO_3[0] 236c-195 SAO_EO_3[1] 570c-490c diff -r 8b680fd502e0 -r 85d5582eedd4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 12:00:57 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 12:11:45 2015 +0530 @@ -1289,6 +1289,8 @@ p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2); p.saoCuOrgE2[0] = PFX(saoCuOrgE2_avx2); p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2); +p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2); +p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2); p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2); p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2); diff -r 8b680fd502e0 -r 85d5582eedd4 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Jun 25 12:00:57 2015 +0530 +++ b/source/common/x86/loopfilter.asm Thu Jun 25 12:11:45 2015 +0530 @@ -1290,6 +1290,61 @@ %endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE3, 4,6,6 +add r3d, r3d +mov r4d, r4m +mov r5d, r5m + +; save latest 2 pixels for case startX=1 or left_endX=15 +movqxm5, [r0 + r5 * 2] +movhps xm5, [r1 + r5 - 1] + +; move to startX+1 +inc r4d +lea r0, [r0 + r4 * 2] ; x = startX + 1 +add r1, r4 +sub r5d, r4d +movuxm4, [r2] + +.loop: +movum1, [r0] +movum0, [r0 + r3] + +pcmpgtw m2, m1, m0 +pcmpgtw m0, m1 +packsswbm2, m0 +vpermq m0, m2, 11011101b +vpermq m2, m2, 10001000b +pandm2, [pb_1] +por m2, m0 + +movuxm0, [r1] +paddb xm0, xm2 +paddb xm0, [pb_2] + +pshufb xm3, xm4, xm0 +pmovsxbwm3, xm3 + +paddw m1, m3 +pxorm0, m0 +pmaxsw m1, m0 +pminsw m1, [pw_1023] +movu[r0], m1 + +psubb xm0, xm2 +movu[r1 - 1], xm0 + +add r0, 32 +add r1, 16 +sub r5, 16 +jg .loop + +; restore last pixels (up to 2) +movq[r0 + r5 * 2], xm5 +movhps [r1 + r5 - 1], xm5 +RET +%else 
; HIGH_BIT_DEPTH cglobal saoCuOrgE3, 3, 6, 8 mov r3d, r3m mov r4d, r4m @@ -1350,8 +1405,76 @@ movq[r0 + r5], xm7 movhps [r1 + r5 - 1], xm7 RET +%endif INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE3_32, 3,6,8 +add r3d, r3d +mov r4d, r4m +mov r5d, r5m + +; save latest 2 pixels for case startX=1 or left_endX=15 +movqxm5, [r0 + r5 * 2] +movhps xm5, [r1 + r5 - 1] + +; move to startX+1 +inc r4d +lea r0, [r0 + r4 * 2] ; x = startX + 1 +add r1, r4 +sub r5d, r4d +vbroadcasti128 m4, [r2] + +.loop: +movum1, [r0] +movum7, [r0 + 32] +movum0, [r0 + r3] +movum6, [r0 + r3 + 32] + +pcmpgtw m2, m1, m0 +pcmpgtw m3, m7, m6 +pcmpgtw m0, m1 +pcmpgtw m6, m7 + +packsswbm2, m3 +packsswbm0, m6 +vpermq m2, m2, 11011000b +vpermq m0, m0, 11011000b +pandm2, [pb_1] +por m2, m0 + +movum0, [r1] +paddb m0, m2 +paddb m0, [pb_2] + +pshufb m3, m4, m0 +vextracti128xm6, m3, 1 +pmovsxbwm3, xm3 +pmovsxbwm6, xm6 + +paddw m1, m3 +paddw m7, m6 +pxorm0, m0 +pmaxsw m1, m0 +pmaxsw m7, m0 +pminsw m1, [pw_1023] +pminsw m7, [pw_1023] +movu[r0], m1 +movu[r0 + 32], m7 + +psubb m0, m2 +movu[r1 - 1], m0 + +add r0, 64 +add r1, 32 +sub r5, 32 +jg .loop + +; restore last pixels (up to 2) +movq[r0 + r5 * 2], xm5 +movhps [r1 + r5 - 1], xm5 +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE3_32, 3, 6, 8 mov r3d, r3m mov r4d, r4m @@ -1416,6 +1539,7 @@ movq[r0 + r5], xm7 movhps [r1 + r5 - 1], xm7 RET +%endif
Re: [x265] [PATCH 0 of 6 ] SAO SSE4 asm code for HIGH_BIT_DEPTH
Okay. Will check IACA report and try pxor for m0 and buffer 1023. On Mon, Jun 22, 2015 at 8:24 PM, chen chenm...@163.com wrote: right some comment: 'psignb X, [pb_128]' equal to 'psubb X, 0, X', in AVX2, second type faster, in SSE4, choice depends on IACA report in PMINSW, you buffer ZERO into M0, and use pw_1023 directly, could you try buffer pw_1023 and use PXOR to get ZERO? At 2015-06-22 20:50:32,dnyanesh...@multicorewareinc.com wrote: SAO_EO_08.97x974.03 8740.81 SAO_EO_110.18x 492.67 5017.42 SAO_EO_1_2Rows 11.21x 900.82 10095.86 SAO_EO_2[0] 6.27x207.22 1298.92 SAO_EO_2[1] 8.92x555.20 4949.69 SAO_EO_3[0] 4.97x236.72 1177.29 SAO_EO_3[1] 8.67x551.14 4778.67 SAO_BO_07.50x 23127.89173346.66 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 6] asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c->974c, over C code
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1434712676 -19800 # Fri Jun 19 16:47:56 2015 +0530 # Node ID a94e9a1f0fde08e060a9b52e3353ce2f242d9257 # Parent 83a7d824442455ba5e0a6b53ea68e6b7043845de asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c-974c, over C code diff -r 83a7d8244424 -r a94e9a1f0fde source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Mon Jun 22 15:15:33 2015 +0530 +++ b/source/common/CMakeLists.txt Fri Jun 19 16:47:56 2015 +0530 @@ -46,7 +46,7 @@ mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm) if(HIGH_BIT_DEPTH) -set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm) +set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm) endif() diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 22 15:15:33 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 19 16:47:56 2015 +0530 @@ -1089,6 +1089,8 @@ } if (cpuMask X265_CPU_SSE4) { +p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4); + LUMA_ADDAVG(sse4); CHROMA_420_ADDAVG(sse4); CHROMA_422_ADDAVG(sse4); diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Mon Jun 22 15:15:33 2015 +0530 +++ b/source/common/x86/loopfilter.asm Fri Jun 19 16:47:56 2015 +0530 @@ -38,6 +38,7 @@ cextern pb_128 cextern pb_2 cextern pw_2 +cextern pw_1023 cextern pb_movemask @@ -45,6 +46,107 @@ ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride) ; INIT_XMM sse4 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE0, 4,5,9 +mov r4d, r4m +movhm6, [r1] +movzx r1d, byte [r3] +pxorm5, m5 +neg r1b +movdm0, r1d +lea r1, [r0 + r4 * 2] +mov r4d, r2d + +.loop: +movum7, [r0] +movum8, [r0 + 16] +movum2, [r0 + 2] +movum1, [r0 + 18] + +pcmpgtw m3, m7, m2 +pcmpgtw m2, m7 +pcmpgtw m4, m8, m1 
+pcmpgtw m1, m8 + +packsswbm3, m4 +packsswbm2, m1 + +pandm3, [pb_1] +por m3, m2 + +palignr m2, m3, m5, 15 +por m2, m0 + +movam4, [pw_1023] +psignb m2, [pb_128]; m2 = signLeft +pxorm0, m0 +palignr m0, m3, 15 +paddb m3, m2 +paddb m3, [pb_2] ; m2 = uiEdgeType +pshufb m2, m6, m3 +pmovsxbwm3, m2 ; offsetEo +punpckhbw m2, m2 +psraw m2, 8 +paddw m7, m3 +paddw m8, m2 +pmaxsw m7, m5 +pmaxsw m8, m5 +pminsw m7, m4 +pminsw m8, m4 +movu[r0], m7 +movu[r0 + 16], m8 + +add r0q, 32 +sub r2d, 16 +jnz.loop + +movzx r3d, byte [r3 + 1] +neg r3b +movdm0, r3d +.loopH: +movum7, [r1] +movum8, [r1 + 16] +movum2, [r1 + 2] +movum1, [r1 + 18] + +pcmpgtw m3, m7, m2 +pcmpgtw m2, m7 +pcmpgtw m4, m8, m1 +pcmpgtw m1, m8 + +packsswbm3, m4 +packsswbm2, m1 + +pandm3, [pb_1] +por m3, m2 + +palignr m2, m3, m5, 15 +por m2, m0 + +movam4, [pw_1023] +psignb m2, [pb_128]; m2 = signLeft +pxorm0, m0 +palignr m0, m3, 15 +paddb m3, m2 +paddb m3, [pb_2] ; m2 = uiEdgeType +pshufb m2, m6, m3 +pmovsxbwm3, m2 ; offsetEo +punpckhbw m2, m2 +psraw m2, 8 +paddw m7, m3 +paddw m8, m2 +pmaxsw m7, m5 +pmaxsw m8, m5 +pminsw m7, m4 +pminsw m8, m4 +movu[r1], m7 +movu[r1 + 16], m8 + +add r1q, 32 +sub r4d, 16 +jnz.loopH +RET +%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride mov r4d, r4m @@ -130,6 +232,7 @@ sub r4d, 16 jnz.loopH RET +%endif INIT_YMM avx2 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride diff -r 83a7d8244424 -r a94e9a1f0fde source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Mon Jun 22 15:15:33 2015 +0530 +++ b/source/test/pixelharness.cpp Fri Jun 19 16:47:56 2015 +0530 @@ -901,8 +901,8 @@ ALIGN_VAR_16(pixel, ref_dest[64 * 64]); ALIGN_VAR_16(pixel, opt_dest
[x265] [PATCH 4 of 6] asm: 10bpp sse4 code for saoCuOrgE2
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1434963191 -19800 # Mon Jun 22 14:23:11 2015 +0530 # Node ID f85c15cc0e1d70e63182b03e294c2778f598143d # Parent 558ffdc4e832061d99f1ec688fe1ae64db48642f asm: 10bpp sse4 code for saoCuOrgE2 Performance improvement over C: SAO_EO_2[0] 6.27x207.22 1298.92 SAO_EO_2[1] 8.92x555.20 4949.69 diff -r 558ffdc4e832 -r f85c15cc0e1d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 22 18:15:40 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 22 14:23:11 2015 +0530 @@ -1092,6 +1092,8 @@ p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4); p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4); p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4); +p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4); +p.saoCuOrgE2[1] = PFX(saoCuOrgE2_sse4); LUMA_ADDAVG(sse4); CHROMA_420_ADDAVG(sse4); diff -r 558ffdc4e832 -r f85c15cc0e1d source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Mon Jun 22 18:15:40 2015 +0530 +++ b/source/common/x86/loopfilter.asm Mon Jun 22 14:23:11 2015 +0530 @@ -672,6 +672,64 @@ ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride) ;== INIT_XMM sse4 +%if HIGH_BIT_DEPTH +cglobal saoCuOrgE2, 6,6,8 +mov r4d, r4m +add r5d, r5d +pxorm0, m0 +inc r1 +movhm6, [r0 + r4 * 2] +movhps m6, [r1 + r4] + +.loop +movum7, [r0] +movum5, [r0 + 16] +movum3, [r0 + r5 + 2] +movum1, [r0 + r5 + 18] + +pcmpgtw m2, m7, m3 +pcmpgtw m3, m7 +pcmpgtw m4, m5, m1 +pcmpgtw m1, m5 +packsswbm2, m4 +packsswbm3, m1 +pandm2, [pb_1] +por m2, m3 + +movum3, [r2] + +paddb m3, m2 +paddb m3, [pb_2] + +movum4, [r3] +pshufb m4, m3 + +psubb m3, m0, m2 +movu[r1], m3 + +pmovsxbwm3, m4 +punpckhbw m4, m4 +psraw m4, 8 + +paddw m7, m3 +paddw m5, m4 +pmaxsw m7, m0 +pmaxsw m5, m0 +pminsw m7, [pw_1023] +pminsw m5, [pw_1023] +movu[r0], m7 +movu[r0 + 16], m5 + +add r0, 32 +add r1, 16 +add r2, 16 +sub r4, 16 +jg .loop + +movh[r0 + r4 * 2], m6 +movhps [r1 + r4], m6 +RET 
+%else ; HIGH_BIT_DEPTH cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth mov r4d, r4m mov r5d, r5m @@ -722,6 +780,7 @@ movh[r0 + r4], m5 movhps [r1 + r4], m5 RET +%endif INIT_YMM avx2 cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth diff -r 558ffdc4e832 -r f85c15cc0e1d source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Mon Jun 22 18:15:40 2015 +0530 +++ b/source/test/pixelharness.cpp Mon Jun 22 14:23:11 2015 +0530 @@ -957,8 +957,8 @@ ALIGN_VAR_16(pixel, ref_dest[64 * 64]); ALIGN_VAR_16(pixel, opt_dest[64 * 64]); -memset(ref_dest, 0xCD, sizeof(ref_dest)); -memset(opt_dest, 0xCD, sizeof(opt_dest)); +for (int i = 0; i 64 * 64; i++) +ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX); for (int id = 0; id 2; id++) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel