# HG changeset patch
# User Praveen Tiwari
# Date 1386062469 -19800
# Node ID d18c574e0ce928adcbeb2438b9d291058bffb928
# Parent ca7bd538e052d104b1b333691836db37739cfdf0
asm code for all_angs_pred_4x4, all modes

diff -r ca7bd538e052 -r d18c574e0ce9 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Mon Dec 02 20:26:19 2013 -0600
+++ b/source/common/CMakeLists.txt Tue Dec 03 14:51:09 2013 +0530
@@ -118,10 +118,10 @@
 endif(ENABLE_PRIMITIVES_VEC)
 
 if(ENABLE_PRIMITIVES_ASM)
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h allangs-pred.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm ssd-a.asm mc-a.asm
                mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm intrapred8.asm
-               pixeladd8.asm dct8.asm)
+               pixeladd8.asm dct8.asm allangs-pred8.asm)
     if (NOT X64)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
diff -r ca7bd538e052 -r d18c574e0ce9 source/common/x86/allangs-pred.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/allangs-pred.h Tue Dec 03 14:51:09 2013 +0530
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * allangs-pred.h: prototype of the SSE4 all-angles 4x4 intra prediction primitive
+ *****************************************************************************
+ * Copyright (C) 2003-2013 x264 project
+ *
+ * Authors: Praveen Kumar Tiwari<prav...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licens...@x264.com.
+ *****************************************************************************/
+
+#ifndef X265_ALLANGSPRED_H
+#define X265_ALLANGSPRED_H
+
+void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
+
+#endif
diff -r ca7bd538e052 -r d18c574e0ce9 source/common/x86/allangs-pred8.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/allangs-pred8.asm Tue Dec 03 14:51:09 2013 +0530
@@ -0,0 +1,920 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Praveen Kumar Tiwari <prav...@multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licens...@multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+
+tab_6_26:   times 8 db 6, 26    ; tab_A_B: pmaddubsw byte weights, A for p[i] and B for p[i + 1]; A + B == 32
+tab_12_20:  times 8 db 12, 20
+tab_18_14:  times 8 db 18, 14
+tab_24_8:   times 8 db 24, 8
+tab_11_21:  times 8 db 11, 21
+tab_22_10:  times 8 db 22, 10
+tab_1_31:   times 8 db 1, 31
+tab_15_17:  times 8 db 15, 17
+tab_30_2:   times 8 db 30, 2
+tab_13_19:  times 8 db 13, 19
+tab_28_4:   times 8 db 28, 4
+tab_19_13:  times 8 db 19, 13
+tab_25_7:   times 8 db 25, 7
+tab_23_9:   times 8 db 23, 9
+tab_14_18:  times 8 db 14, 18
+tab_5_27:   times 8 db 5, 27
+tab_27_5:   times 8 db 27, 5
+tab_17_15:  times 8 db 17, 15
+tab_26_6:   times 8 db 26, 6
+tab_2_30:   times 8 db 2, 30
+tab_4_28:   times 8 db 4, 28
+tab_8_24:   times 8 db 8, 24
+tab_10_22:  times 8 db 10, 22
+tab_20_12:  times 8 db 20, 12
+tab_9_23:   times 8 db 9, 23
+tab_7_25:   times 8 db 7, 25
+tab_21_11:  times 8 db 21, 11
+tab_31_1:   times 8 db 31, 1
+
+pw_1024:    times 8 dw 1024     ; pmulhrsw by 1024 computes (x + 16) >> 5
+
+tab_Si0:    times 4 db 4, 2, 1, 0                                       ; pshufb mask: source bytes 4, 2, 1, 0, repeated
+tab_Si1:    db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0           ; pshufb mask: source bytes 3, 2, 1, 0, then byte 0
+tab_Zero:   times 16 db 0                                               ; pshufb mask broadcasting byte 0; also used as a zero register
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal all_angs_pred_4x4, 6, 6, 8, dest, above0, left0, above1, left1, bLuma ; only r0 (dest), r1 (above0), r2 (left0) are read below
+
+; mode 2: mode N writes 16 bytes at dest + (N - 2) * 16; here four 4-byte copies from left0 + 2 .. left0 + 5
+
+    movd      m0, [r2 + 2]
+    movd      m1, [r2 + 3]
+    movd      m2, [r2 + 4]
+    movd      m3, [r2 + 5]
+
+    movd      [r0], m0
+    movd      [r0 + 4], m1
+    movd      [r0 + 8], m2
+    movd      [r0 + 12], m3
+
+; mode 3: each 4-byte group is (A * p[i] + B * p[i + 1] + 16) >> 5 with weights from tab_A_B
+
+    mova      m0, [pw_1024]             ; rounding multiplier, kept in m0 until mode 34
+
+    movu      m1, [r2 + 1]
+
+    palignr   m2, m1, 1                 ; m2 = m1 shifted down one byte (only the low bytes are used)
+    punpcklbw m1, m2                    ; m1 = byte pairs (p[i], p[i + 1]) starting at left0 + 1
+
+    pmaddubsw m7, m1, [tab_6_26]
+    pmulhrsw  m7, m0
+    packuswb  m7, m7
+    movd      [r0 + 16], m7             ; m7 is stored again in mode 6
+
+    movu      m2, [r2 + 2]
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3                    ; m2 = byte pairs starting at left0 + 2
+
+    pmaddubsw m6, m2, [tab_12_20]
+    pmulhrsw  m6, m0
+    packuswb  m6, m6
+    movd      [r0 + 20], m6             ; m6 is stored again in mode 6
+
+    movu      m3, [r2 + 3]
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4                    ; m3 = byte pairs starting at left0 + 3
+
+    pmaddubsw m4, m3, [tab_18_14]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 24], m4
+
+    movu      m4, [r2 + 4]
+
+    palignr   m5, m4, 1
+    punpcklbw m4, m5                    ; m4 = byte pairs starting at left0 + 4
+
+    pmaddubsw m4, [tab_24_8]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 28], m4
+
+; mode 4
+
+    pmaddubsw m4, m1, [tab_11_21]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 32], m4
+
+    pmaddubsw m4, m2, [tab_22_10]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 36], m4
+
+    pmaddubsw m4, m2, [tab_1_31]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 40], m4
+
+    pmaddubsw m4, m3, [tab_12_20]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 44], m4
+
+; mode 5
+
+    pmaddubsw m4, m1, [tab_15_17]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 48], m4
+
+    pmaddubsw m4, m2, [tab_30_2]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 52], m4
+
+    pmaddubsw m4, m2, [tab_13_19]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 56], m4
+
+    pmaddubsw m3, [tab_28_4]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 60], m3
+
+; mode 6
+
+    pmaddubsw m3, m1, [tab_19_13]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 64], m3
+
+    movd      [r0 + 68], m7             ; reuse of the mode 3 result (tab_6_26 on left0 + 1)
+
+    pmaddubsw m3, m2, [tab_25_7]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 72], m3
+
+    movd      [r0 + 76], m6             ; reuse of the mode 3 result (tab_12_20 on left0 + 2)
+
+; mode 7
+
+    pmaddubsw m3, m1, [tab_23_9]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 80], m3
+
+    pmaddubsw m3, m1, [tab_14_18]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 84], m3
+
+    pmaddubsw m3, m1, [tab_5_27]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 88], m3
+
+    pmaddubsw m2, [tab_28_4]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 92], m2
+
+; mode 8
+
+    pmaddubsw m2, m1, [tab_27_5]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 96], m2
+
+    pmaddubsw m2, m1, [tab_22_10]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 100], m2
+
+    pmaddubsw m2, m1, [tab_17_15]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 104], m2
+
+    pmaddubsw m2, m1, [tab_12_20]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 108], m2
+
+; mode 9
+
+    pmaddubsw m2, m1, [tab_30_2]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 112], m2
+
+    pmaddubsw m2, m1, [tab_28_4]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 116], m2
+
+    pmaddubsw m2, m1, [tab_26_6]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 120], m2
+
+    pmaddubsw m1, [tab_24_8]
+    pmulhrsw  m1, m0
+    packuswb  m1, m1
+    movd      [r0 + 124], m1
+
+; mode 10: left0[1..4] copied to all four groups, then byte 0 of group i = clip(left0[1] + ((above0[1 + i] - above0[0]) >> 1)). NOTE(review): bLuma is never read, so this edge filter is unconditional - confirm against the C reference
+
+    movd      m1, [r2 + 1]
+    pshufd    m2, m1, 0
+    movu      [r0 + 128], m2
+
+    mova      m2, [tab_Zero]
+
+    pshufb    m3, m1, m2                ; broadcast left0[1]
+    punpcklbw m3, m2                    ; widen to words
+
+    movd      m1, [r1]
+
+    pshufb    m1, m2                    ; broadcast above0[0]
+    punpcklbw m1, m2
+
+    movd      m4, [r1 + 1]
+    punpcklbw m4, m2                    ; above0[1..4] as words
+
+    psubw     m4, m1
+    psraw     m4, 1
+
+    paddw     m3, m4
+
+    packuswb  m3, m2                    ; saturate back to unsigned bytes
+
+    pextrb    [r0 + 128], m3, 0
+    pextrb    [r0 + 132], m3, 1
+    pextrb    [r0 + 136], m3, 2
+    pextrb    [r0 + 140], m3, 3
+
+; mode 11
+
+    movu      m1, [r2]
+
+    palignr   m2, m1, 1
+    punpcklbw m1, m2                    ; m1 = byte pairs starting at left0 + 0, used through mode 16
+
+    pmaddubsw m2, m1, [tab_2_30]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 144], m2
+
+    pmaddubsw m2, m1, [tab_4_28]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 148], m2
+
+    pmaddubsw m2, m1, [tab_6_26]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 152], m2
+
+    pmaddubsw m2, m1, [tab_8_24]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 156], m2
+
+; mode 12
+
+    pmaddubsw m2, m1, [tab_5_27]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 160], m2
+
+    pmaddubsw m2, m1, [tab_10_22]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 164], m2
+
+    pmaddubsw m2, m1, [tab_15_17]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 168], m2
+
+    pmaddubsw m2, m1, [tab_20_12]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 172], m2
+
+; mode 13
+
+    pmaddubsw m2, m1, [tab_9_23]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 176], m2
+
+    pmaddubsw m2, m1, [tab_18_14]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 180], m2
+
+    pmaddubsw m2, m1, [tab_27_5]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 184], m2
+
+    movh      m2, [r2 - 1]
+    pinsrb    m2, [r1 + 4], 0           ; replace the byte before left0[0] with above0[4]
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m2, [tab_4_28]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 188], m2
+
+; mode 14
+
+    pmaddubsw m2, m1, [tab_13_19]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 192], m2
+
+    pmaddubsw m5, m1, [tab_26_6]
+    pmulhrsw  m5, m0
+    packuswb  m5, m5
+    movd      [r0 + 196], m5            ; m5 is stored again in mode 17
+
+    movh      m2, [r2 - 1]
+    pinsrb    m2, [r1 + 2], 0
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m3, m2, [tab_7_25]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 200], m3
+
+    pmaddubsw m3, m2, [tab_20_12]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 204], m3
+
+; mode 15
+
+    pmaddubsw m3, m1, [tab_17_15]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 208], m3
+
+    pmaddubsw m3, m2, [tab_2_30]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 212], m3
+
+    pmaddubsw m3, m2, [tab_19_13]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 216], m3
+
+    movh      m3, [r2 - 2]
+    pinsrb    m3, [r1 + 4], 0
+    pinsrb    m3, [r1 + 2], 1
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_4_28]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 220], m3
+
+; mode 16
+
+    pmaddubsw m3, m1, [tab_21_11]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 224], m3
+
+    pmaddubsw m3, m2, [tab_10_22]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 228], m3
+
+    pmaddubsw m3, m2, [tab_31_1]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 232], m3
+
+    movh      m3, [r2 - 2]
+    pinsrb    m3, [r1 + 3], 0
+    pinsrb    m3, [r1 + 2], 1
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_20_12]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 236], m3
+
+; mode 17
+
+    movd      [r0 + 240], m5            ; reuse of the mode 14 result (tab_26_6 on left0 + 0)
+
+    movh      m3, [r2 - 1]
+    pinsrb    m3, [r1 + 1], 0
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_20_12]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 244], m3
+
+    movh      m3, [r2 - 2]
+    pinsrb    m3, [r1 + 2], 0
+    pinsrb    m3, [r1 + 1], 1
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_14_18]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 248], m3
+
+    movh      m3, [r1]
+    pshufb    m3, [tab_Si0]             ; above0[4], above0[2], above0[1], above0[0]
+    pinsrb    m3, [r2 + 1], 4
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_8_24]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 252], m3
+
+; mode 18: plain byte copies, no weighting
+
+    movd      m3, [r1]
+    movd      [r0 + 256], m3
+
+    movh      m3, [r1 - 1]
+    pinsrb    m3, [r2 + 1], 0
+    movd      [r0 + 260], m3
+
+    movh      m3, [r1 - 2]
+    pinsrb    m3, [r2 + 2], 0
+    pinsrb    m3, [r2 + 1], 1
+    movd      [r0 + 264], m3
+
+    movh      m3, [r2]
+    pshufb    m3, [tab_Si1]             ; left0[3], left0[2], left0[1], left0[0]
+    pinsrb    m3, [r1], 3
+    movd      [r0 + 268], m3
+
+; mode 19
+
+    movh      m1, [r1]
+
+    palignr   m2, m1, 1
+    punpcklbw m1, m2                    ; m1 = byte pairs starting at above0 + 0, used through mode 25
+
+    pmaddubsw m5, m1, [tab_26_6]
+    pmulhrsw  m5, m0
+    packuswb  m5, m5
+    movd      [r0 + 272], m5            ; m5 is stored again in mode 22
+
+    movh      m2, [r1 - 1]
+    pinsrb    m2, [r2 + 1], 0
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m2, [tab_20_12]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 276], m2
+
+    movh      m2, [r1 - 2]
+    pinsrb    m2, [r2 + 2], 0
+    pinsrb    m2, [r2 + 1], 1
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m2, [tab_14_18]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 280], m2
+
+    movh      m2, [r2]
+    pshufb    m2, [tab_Si0]             ; left0[4], left0[2], left0[1], left0[0]
+    pinsrb    m2, [r1 + 1], 4
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m2, [tab_8_24]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 284], m2
+
+; mode 20
+
+    pmaddubsw m2, m1, [tab_21_11]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 288], m2
+
+    movh      m2, [r1 - 1]
+    pinsrb    m2, [r2 + 2], 0
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m3, m2, [tab_10_22]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 292], m3
+
+    pmaddubsw m3, m2, [tab_31_1]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 296], m3
+
+    movh      m3, [r1 - 2]
+    pinsrb    m3, [r2 + 3], 0
+    pinsrb    m3, [r2 + 2], 1
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_20_12]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 300], m3
+
+; mode 21
+
+    pmaddubsw m3, m1, [tab_17_15]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 304], m3
+
+    pmaddubsw m3, m2, [tab_2_30]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 308], m3
+
+    pmaddubsw m3, m2, [tab_19_13]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 312], m3
+
+    movh      m3, [r1 - 2]
+    pinsrb    m3, [r2 + 4], 0
+    pinsrb    m3, [r2 + 2], 1
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4
+
+    pmaddubsw m3, [tab_4_28]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 316], m3
+
+; mode 22
+
+    pmaddubsw m3, m1, [tab_13_19]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 320], m3
+
+    movd      [r0 + 324], m5            ; reuse of the mode 19 result (tab_26_6 on above0 + 0)
+
+    pmaddubsw m3, m2, [tab_7_25]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 328], m3
+
+    pmaddubsw m3, m2, [tab_20_12]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 332], m3
+
+; mode 23
+
+    pmaddubsw m2, m1, [tab_9_23]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 336], m2
+
+    pmaddubsw m2, m1, [tab_18_14]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 340], m2
+
+    pmaddubsw m2, m1, [tab_27_5]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 344], m2
+
+    movh      m2, [r1 - 1]
+    pinsrb    m2, [r2 + 4], 0
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3
+
+    pmaddubsw m2, [tab_4_28]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 348], m2
+
+; mode 24
+
+    pmaddubsw m2, m1, [tab_5_27]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 352], m2
+
+    pmaddubsw m2, m1, [tab_10_22]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 356], m2
+
+    pmaddubsw m2, m1, [tab_15_17]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 360], m2
+
+    pmaddubsw m2, m1, [tab_20_12]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 364], m2
+
+; mode 25
+
+    pmaddubsw m2, m1, [tab_2_30]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 368], m2
+
+    pmaddubsw m2, m1, [tab_4_28]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 372], m2
+
+    pmaddubsw m2, m1, [tab_6_26]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 376], m2
+
+    pmaddubsw m2, m1, [tab_8_24]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 380], m2
+
+; mode 26: above0[1..4] copied to all four groups, then byte 0 of group i = clip(above0[1] + ((left0[1 + i] - left0[0]) >> 1)). NOTE(review): unconditional, bLuma is never read - confirm against the C reference
+
+    movd      m1, [r1 + 1]
+    pshufd    m2, m1, 0
+    movu      [r0 + 384], m2
+
+    mova      m2, [tab_Zero]
+
+    pshufb    m3, m1, m2                ; broadcast above0[1]
+    punpcklbw m3, m2
+
+    movd      m1, [r2]
+
+    pshufb    m1, m2                    ; broadcast left0[0]
+    punpcklbw m1, m2
+
+    movd      m4, [r2 + 1]
+    punpcklbw m4, m2                    ; left0[1..4] as words
+
+    psubw     m4, m1
+    psraw     m4, 1
+
+    paddw     m3, m4
+
+    packuswb  m3, m2                    ; saturate back to unsigned bytes
+
+    pextrb    [r0 + 384], m3, 0
+    pextrb    [r0 + 388], m3, 1
+    pextrb    [r0 + 392], m3, 2
+    pextrb    [r0 + 396], m3, 3
+
+; mode 27
+
+    movh      m1, [r1 + 1]
+
+    palignr   m2, m1, 1
+    punpcklbw m1, m2                    ; m1 = byte pairs starting at above0 + 1, used through mode 32
+
+    pmaddubsw m2, m1, [tab_30_2]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 400], m2
+
+    pmaddubsw m2, m1, [tab_28_4]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 404], m2
+
+    pmaddubsw m2, m1, [tab_26_6]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 408], m2
+
+    pmaddubsw m2, m1, [tab_24_8]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 412], m2
+
+; mode 28
+
+    pmaddubsw m2, m1, [tab_27_5]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 416], m2
+
+    pmaddubsw m2, m1, [tab_22_10]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 420], m2
+
+    pmaddubsw m2, m1, [tab_17_15]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 424], m2
+
+    pmaddubsw m2, m1, [tab_12_20]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 428], m2
+
+; mode 29
+
+    pmaddubsw m2, m1, [tab_23_9]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 432], m2
+
+    pmaddubsw m2, m1, [tab_14_18]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 436], m2
+
+    pmaddubsw m2, m1, [tab_5_27]
+    pmulhrsw  m2, m0
+    packuswb  m2, m2
+    movd      [r0 + 440], m2
+
+    movh      m2, [r1 + 2]
+
+    palignr   m3, m2, 1
+    punpcklbw m2, m3                    ; m2 = byte pairs starting at above0 + 2, used through mode 32
+
+    pmaddubsw m3, m2, [tab_28_4]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 444], m3
+
+; mode 30
+
+    pmaddubsw m3, m1, [tab_19_13]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 448], m3
+
+    pmaddubsw m6, m1, [tab_6_26]
+    pmulhrsw  m6, m0
+    packuswb  m6, m6
+    movd      [r0 + 452], m6            ; m6 is stored again in mode 33
+
+    pmaddubsw m3, m2, [tab_25_7]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 456], m3
+
+    pmaddubsw m5, m2, [tab_12_20]
+    pmulhrsw  m5, m0
+    packuswb  m5, m5
+    movd      [r0 + 460], m5            ; m5 is stored again in mode 33
+
+; mode 31
+
+    pmaddubsw m3, m1, [tab_15_17]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 464], m3
+
+    pmaddubsw m3, m2, [tab_30_2]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 468], m3
+
+    pmaddubsw m3, m2, [tab_13_19]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 472], m3
+
+    movh      m3, [r1 + 3]
+
+    palignr   m4, m3, 1
+    punpcklbw m3, m4                    ; m3 = byte pairs starting at above0 + 3, used through mode 33
+
+    pmaddubsw m4, m3, [tab_28_4]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 476], m4
+
+; mode 32
+
+    pmaddubsw m4, m1, [tab_11_21]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 480], m4
+
+    pmaddubsw m4, m2, [tab_22_10]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 484], m4
+
+    pmaddubsw m4, m2, [tab_1_31]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 488], m4
+
+    pmaddubsw m4, m3, [tab_12_20]
+    pmulhrsw  m4, m0
+    packuswb  m4, m4
+    movd      [r0 + 492], m4
+
+; mode 33
+
+    movd      [r0 + 496], m6            ; reuse of the mode 30 result (tab_6_26 on above0 + 1)
+
+    movd      [r0 + 500], m5            ; reuse of the mode 30 result (tab_12_20 on above0 + 2)
+
+    pmaddubsw m3, [tab_18_14]
+    pmulhrsw  m3, m0
+    packuswb  m3, m3
+    movd      [r0 + 504], m3
+
+    movh      m1, [r1 + 4]
+
+    palignr   m2, m1, 1
+    punpcklbw m1, m2
+
+    pmaddubsw m1, [tab_24_8]
+    pmulhrsw  m1, m0
+    packuswb  m1, m1
+    movd      [r0 + 508], m1
+
+; mode 34: four 4-byte copies from above0 + 2 .. above0 + 5 (m0 no longer holds pw_1024 from here on)
+
+    movd      m0, [r1 + 2]
+    movd      [r0 + 512], m0
+
+    movd      m0, [r1 + 3]
+    movd      [r0 + 516], m0
+
+    movd      m0, [r1 + 4]
+    movd      [r0 + 520], m0
+
+    movd      m0, [r1 + 5]
+    movd      [r0 + 524], m0
+
+    RET
diff -r ca7bd538e052 -r d18c574e0ce9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 02 20:26:19 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 14:51:09 2013 +0530
@@ -35,6 +35,7 @@
 #include "blockcopy8.h"
 #include "intrapred.h"
 #include "dct8.h"
+#include "allangs-pred.h"
 }
 
 #define INIT2_NAME(name1, name2, cpu) \
@@ -764,6 +765,8 @@
         p.intra_pred_planar[BLOCK_8x8] = x265_intra_pred_planar8_sse4;
         p.intra_pred_planar[BLOCK_16x16] = x265_intra_pred_planar16_sse4;
         p.intra_pred_planar[BLOCK_32x32] = x265_intra_pred_planar32_sse4;
+
+        p.intra_pred_allangs[BLOCK_4x4] = x265_all_angs_pred_4x4_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r ca7bd538e052 -r d18c574e0ce9 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Mon Dec 02 20:26:19 2013 -0600
+++ b/source/test/intrapredharness.cpp Tue Dec 03 14:51:09 2013 +0530
@@ -208,7 +208,7 @@
 
         for (int i = 0; i <= 100; i++)
         {
-            isLuma = (width <= 16) && (rand() % 2);
+            isLuma = (width <= 16); // bFilter is true for 4x4, 8x8, 16x16 and false for 32x32. NOTE(review): bLuma == false is no longer exercised for width <= 16
 
             pixel * refAbove0 = pixel_buff + j;
             pixel * refLeft0 = refAbove0 + 3 * width;
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel