# HG changeset patch # User Murugan Vairavel <muru...@multicorewareinc.com> # Date 1392125003 -19800 # Tue Feb 11 18:53:23 2014 +0530 # Node ID 7eccc042e269ead4ff5d32f4d853287e30c59044 # Parent 07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552 asm: Optimizations and cleanups on ipfilter functions
diff -r 07b5d6b82f5f -r 7eccc042e269 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Feb 10 15:05:04 2014 -0600 +++ b/source/common/x86/ipfilter8.asm Tue Feb 11 18:53:23 2014 +0530 @@ -814,7 +814,7 @@ pmaddwd %8, [r5 + %10 * 16] paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 paddd %6, %8 ; R1 = H[1+2+3+4] -%endmacro ; FILTER_HV8_START +%endmacro ; FILTER_HV8_MID ; Round and Saturate %macro FILTER_HV8_END 4 ; output in [1, 3] @@ -830,8 +830,7 @@ packssdw %3, %4 ; TODO: is merge better? I think this way is short dependency link - packuswb %1, %1 - packuswb %3, %3 + packuswb %1, %3 %endmacro ; FILTER_HV8_END ;----------------------------------------------------------------------------- @@ -899,8 +898,8 @@ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 FILTER_HV8_END m3, m0, m4, m1 - movq [r2], m3 - movq [r2 + r3], m4 + movh [r2], m3 + movhps [r2 + r3], m3 lea r0, [r0 + 16 * 2] lea r2, [r2 + r3 * 2] @@ -915,7 +914,7 @@ ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x4, 4, 7, 8 +cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -926,16 +925,15 @@ %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif - +lea r4, [r1 * 3] +lea r5, [r0 + 4 * r1] pshufb m0, [tab_Cm] - mova m1, [tab_c_512] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movd m5, [r5 + r1] +movd m5, [r0 + r4] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -943,7 +941,7 @@ pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +movd m6, [r5] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -954,16 +952,11 @@ phaddw m2, m3 pmulhrsw m2, m1 -packuswb m2, m2 - -pextrw [r2], m2, 0 -pextrw [r2 + r3], m2, 2 - -lea r5, [r0 + 4 * r1] -movd m2, [r5 + r1] + +movd m7, [r5 + r1] punpcklbw m4, m5 -punpcklbw m3, m6, m2 +punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 @@ -971,19 +964,21 @@ movd m3, 
[r5 + 2 * r1] punpcklbw m5, m6 -punpcklbw m2, m3 -punpcklbw m5, m2 +punpcklbw m7, m3 +punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 -packuswb m4, m4 - -pextrw [r2 + 2 * r3], m4, 0 -lea r6, [r2 + 2 * r3] -pextrw [r6 + r3], m4, 2 +packuswb m2, m4 + +pextrw [r2], m2, 0 +pextrw [r2 + r3], m2, 2 +lea r2, [r2 + 2 * r3] +pextrw [r2], m2, 4 +pextrw [r2 + r3], m2, 6 RET @@ -992,7 +987,7 @@ ;----------------------------------------------------------------------------- %macro FILTER_V4_W2_H4 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x8, 4, 7, 8 +cglobal interp_4tap_vert_pp_2x8, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -1009,13 +1004,13 @@ mova m1, [tab_c_512] mov r4d, %2 +lea r5, [3 * r1] .loop movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movd m5, [r5 + r1] +movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -1023,7 +1018,8 @@ pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +lea r0, [r0 + 4 * r1] +movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -1034,39 +1030,35 @@ phaddw m2, m3 pmulhrsw m2, m1 -packuswb m2, m2 - -pextrw [r2], m2, 0 -pextrw [r2 + r3], m2, 2 - -lea r5, [r0 + 4 * r1] -movd m2, [r5 + r1] + +movd m7, [r0 + r1] punpcklbw m4, m5 -punpcklbw m3, m6, m2 +punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 -movd m3, [r5 + 2 * r1] +movd m3, [r0 + 2 * r1] punpcklbw m5, m6 -punpcklbw m2, m3 -punpcklbw m5, m2 +punpcklbw m7, m3 +punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 -packuswb m4, m4 - -pextrw [r2 + 2 * r3], m4, 0 -lea r6, [r2 + 2 * r3] -pextrw [r6 + r3], m4, 2 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] +packuswb m2, m4 + +pextrw [r2], m2, 0 +pextrw [r2 + r3], m2, 2 +lea r2, [r2 + 2 * r3] +pextrw [r2], m2, 4 +pextrw [r2 + r3], m2, 6 + +lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop @@ -1079,7 +1071,7 @@ ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- 
INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x2, 4, 6, 8 +cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 mov r4d, r4m sub r0, r1 @@ -1092,32 +1084,30 @@ %endif pshufb m0, [tab_Cm] - -mova m1, [tab_c_512] +lea r5, [r0 + 2 * r1] movd m2, [r0] movd m3, [r0 + r1] -movd m4, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] +movd m4, [r5] movd m5, [r5 + r1] punpcklbw m2, m3 -punpcklbw m6, m4, m5 -punpcklbw m2, m6 +punpcklbw m1, m4, m5 +punpcklbw m2, m1 pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +movd m1, [r0 + 4 * r1] punpcklbw m3, m4 -punpcklbw m5, m6 +punpcklbw m5, m1 punpcklbw m3, m5 pmaddubsw m3, m0 phaddw m2, m3 -pmulhrsw m2, m1 +pmulhrsw m2, [tab_c_512] packuswb m2, m2 movd [r2], m2 pextrd [r2 + r3], m2, 1 @@ -1128,7 +1118,7 @@ ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x4, 4, 7, 8 +cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -1141,14 +1131,14 @@ %endif pshufb m0, [tab_Cm] - mova m1, [tab_c_512] +lea r5, [r0 + 4 * r1] +lea r4, [r1 * 3] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movd m5, [r5 + r1] +movd m5, [r0 + r4] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -1156,7 +1146,7 @@ pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +movd m6, [r5] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -1167,34 +1157,33 @@ phaddw m2, m3 pmulhrsw m2, m1 -packuswb m2, m2 + +movd m7, [r5 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r5 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + +pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 + +packuswb m2, m4 movd [r2], m2 -pextrd [r2 + r3], m2, 1 - -lea r5, [r0 + 4 * r1] -movd m2, [r5 + r1] - -punpcklbw m4, m5 -punpcklbw m3, m6, m2 -punpcklbw m4, m3 - -pmaddubsw m4, m0 - -movd m3, [r5 + 2 * r1] - -punpcklbw m5, m6 -punpcklbw m2, m3 -punpcklbw m5, m2 - 
-pmaddubsw m5, m0 - -phaddw m4, m5 - -pmulhrsw m4, m1 -packuswb m4, m4 -movd [r2 + 2 * r3], m4 -lea r6, [r2 + 2 * r3] -pextrd [r6 + r3], m4, 1 +pextrd [r2 + r3], m2, 1 +lea r2, [r2 + 2 * r3] +pextrd [r2], m2, 2 +pextrd [r2 + r3], m2, 3 RET @@ -1203,7 +1192,7 @@ ;----------------------------------------------------------------------------- %macro FILTER_V4_W4_H4 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -1221,12 +1210,13 @@ mov r4d, %2 +lea r5, [3 * r1] + .loop movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movd m5, [r5 + r1] +movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -1234,7 +1224,8 @@ pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +lea r0, [r0 + 4 * r1] +movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -1245,37 +1236,34 @@ phaddw m2, m3 pmulhrsw m2, m1 -packuswb m2, m2 + +movd m7, [r0 + r1] + +punpcklbw m4, m5 +punpcklbw m3, m6, m7 +punpcklbw m4, m3 + +pmaddubsw m4, m0 + +movd m3, [r0 + 2 * r1] + +punpcklbw m5, m6 +punpcklbw m7, m3 +punpcklbw m5, m7 + +pmaddubsw m5, m0 + +phaddw m4, m5 + +pmulhrsw m4, m1 +packuswb m2, m4 movd [r2], m2 pextrd [r2 + r3], m2, 1 - -lea r5, [r0 + 4 * r1] -movd m2, [r5 + r1] - -punpcklbw m4, m5 -punpcklbw m3, m6, m2 -punpcklbw m4, m3 - -pmaddubsw m4, m0 - -movd m3, [r5 + 2 * r1] - -punpcklbw m5, m6 -punpcklbw m2, m3 -punpcklbw m5, m2 - -pmaddubsw m5, m0 - -phaddw m4, m5 - -pmulhrsw m4, m1 -packuswb m4, m4 -movd [r2 + 2 * r3], m4 -lea r6, [r2 + 2 * r3] -pextrd [r6 + r3], m4, 1 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] +lea r2, [r2 + 2 * r3] +pextrd [r2], m2, 2 +pextrd [r2 + r3], m2, 3 + +lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop @@ -1450,7 +1438,7 @@ ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 -cglobal 
interp_4tap_vert_ps_4x2, 4, 6, 8 +cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 mov r4d, r4m sub r0, r1 @@ -1465,32 +1453,30 @@ pshufb m0, [tab_Cm] -mova m1, [pw_2000] - movd m2, [r0] movd m3, [r0 + r1] -movd m4, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] +movd m4, [r5] movd m5, [r5 + r1] punpcklbw m2, m3 -punpcklbw m6, m4, m5 -punpcklbw m2, m6 +punpcklbw m1, m4, m5 +punpcklbw m2, m1 pmaddubsw m2, m0 -movd m6, [r0 + 4 * r1] +movd m1, [r0 + 4 * r1] punpcklbw m3, m4 -punpcklbw m5, m6 +punpcklbw m5, m1 punpcklbw m3, m5 pmaddubsw m3, m0 phaddw m2, m3 -psubw m2, m1 -movlps [r2], m2 +psubw m2, [pw_2000] +movh [r2], m2 movhps [r2 + r3], m2 RET @@ -1499,7 +1485,7 @@ ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 -cglobal interp_4tap_vert_ps_4x4, 4, 7, 8 +cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 mov r4d, r4m sub r0, r1 @@ -1514,13 +1500,13 @@ pshufb m0, [tab_Cm] - mova m1, [pw_2000] + lea r4, [r1 * 3] + lea r5, [r0 + 4 * r1] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movd m5, [r5 + r1] + movd m5, [r0 + r4] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -1528,21 +1514,22 @@ pmaddubsw m2, m0 - movd m6, [r0 + 4 * r1] + movd m6, [r5] punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 pmaddubsw m3, m0 phaddw m2, m3 + mova m1, [pw_2000] + psubw m2, m1 - movlps [r2], m2 + movh [r2], m2 movhps [r2 + r3], m2 - lea r5, [r0 + 4 * r1] movd m2, [r5 + r1] punpcklbw m4, m5 @@ -1562,9 +1549,9 @@ phaddw m4, m5 psubw m4, m1 - movlps [r2 + 2 * r3], m4 - lea r6, [r2 + 2 * r3] - movhps [r6 + r3], m4 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 RET @@ -1573,7 +1560,7 @@ ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W4_H4 2 INIT_XMM sse4 
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -1591,13 +1578,13 @@ mova m1, [pw_2000] mov r4d, %2/4 + lea r5, [3 * r1] .loop movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movd m5, [r5 + r1] + movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -1605,7 +1592,8 @@ pmaddubsw m2, m0 - movd m6, [r0 + 4 * r1] + lea r0, [r0 + 4 * r1] + movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -1616,11 +1604,10 @@ phaddw m2, m3 psubw m2, m1 - movlps [r2], m2 + movh [r2], m2 movhps [r2 + r3], m2 - lea r5, [r0 + 4 * r1] - movd m2, [r5 + r1] + movd m2, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m2 @@ -1628,7 +1615,7 @@ pmaddubsw m4, m0 - movd m3, [r5 + 2 * r1] + movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m2, m3 @@ -1639,12 +1626,11 @@ phaddw m4, m5 psubw m4, m1 - movlps [r2 + 2 * r3], m4 - lea r6, [r2 + 2 * r3] - movhps [r6 + r3], m4 - - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] dec r4d jnz .loop @@ -1659,15 +1645,15 @@ ;-------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W8_H8_H16_H2 2 INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd m5, [r6 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif @@ -1677,34 +1663,34 @@ mova m4, [pw_2000] mov r4d, %2/2 + lea r5, [3 * r1] .loopH movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] + movq m3, [r0 + r5] punpcklbw m0, m1 - punpcklbw m7, m2, m3 + punpcklbw m1, m2 + punpcklbw m2, m3 pmaddubsw m0, m6 - pmaddubsw m7, m5 - - paddw m0, m7 + pmaddubsw m2, m5 + + paddw m0, m2 psubw m0, m4 movu [r2], m0 movq m0, [r0 + 
4 * r1] - punpcklbw m1, m2 - punpcklbw m7, m3, m0 + punpcklbw m3, m0 pmaddubsw m1, m6 - pmaddubsw m7, m5 - - paddw m1, m7 + pmaddubsw m3, m5 + + paddw m1, m3 psubw m1, m4 movu [r2 + r3], m1 @@ -1727,15 +1713,15 @@ ;-------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W8_H8_H16_H32 2 INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd m5, [r6 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif @@ -1745,55 +1731,54 @@ mova m4, [pw_2000] mov r4d, %2/4 + lea r5, [3 * r1] .loop movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] + movq m3, [r0 + r5] punpcklbw m0, m1 - punpcklbw m7, m2, m3 + punpcklbw m1, m2 + punpcklbw m2, m3 pmaddubsw m0, m6 - pmaddubsw m7, m5 + pmaddubsw m7, m2, m5 paddw m0, m7 psubw m0, m4 movu [r2], m0 - movq m0, [r0 + 4 * r1] - - punpcklbw m1, m2 - punpcklbw m7, m3, m0 + lea r0, [r0 + 4 * r1] + movq m0, [r0] + + punpcklbw m3, m0 pmaddubsw m1, m6 - pmaddubsw m7, m5 + pmaddubsw m7, m3, m5 paddw m1, m7 psubw m1, m4 movu [r2 + r3], m1 - lea r6, [r0 + 4 * r1] - movq m1, [r6 + r1] - - punpcklbw m2, m3 - punpcklbw m7, m0, m1 + movq m1, [r0 + r1] + + punpcklbw m0, m1 pmaddubsw m2, m6 - pmaddubsw m7, m5 - - paddw m2, m7 + pmaddubsw m0, m5 + + paddw m2, m0 psubw m2, m4 - movu [r2 + 2 * r3], m2 - - movq m2, [r6 + 2 * r1] - - punpcklbw m3, m0 + lea r2, [r2 + 2 * r3] + movu [r2], m2 + + movq m2, [r0 + 2 * r1] + punpcklbw m1, m2 pmaddubsw m3, m6 @@ -1802,11 +1787,9 @@ paddw m3, m1 psubw m3, m4 - lea r5, [r2 + 2 * r3] - movu [r5 + r3], m3 - - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] + movu [r2 + r3], m3 + + lea r2, [r2 + 2 * r3] dec r4d jnz .loop @@ -1821,15 +1804,15 @@ ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, 
intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------ INIT_XMM sse4 -cglobal interp_4tap_vert_ps_6x8, 4, 7, 8 +cglobal interp_4tap_vert_ps_6x8, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd m5, [r6 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif @@ -1837,36 +1820,35 @@ pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_2000] - + lea r5, [3 * r1] mov r4d, 2 .loop movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] + movq m3, [r0 + r5] punpcklbw m0, m1 - punpcklbw m7, m2, m3 + punpcklbw m1, m2 + punpcklbw m2, m3 pmaddubsw m0, m6 - pmaddubsw m7, m5 + pmaddubsw m7, m2, m5 paddw m0, m7 - psubw m0, m4 + movh [r2], m0 pshufd m0, m0, 2 movd [r2 + 8], m0 - movq m0, [r0 + 4 * r1] - - punpcklbw m1, m2 - punpcklbw m7, m3, m0 + lea r0, [r0 + 4 * r1] + movq m0, [r0] + punpcklbw m3, m0 pmaddubsw m1, m6 - pmaddubsw m7, m5 + pmaddubsw m7, m3, m5 paddw m1, m7 psubw m1, m4 @@ -1875,25 +1857,21 @@ pshufd m1, m1, 2 movd [r2 + r3 + 8], m1 - lea r6, [r0 + 4 * r1] - movq m1, [r6 + r1] - - punpcklbw m2, m3 - punpcklbw m7, m0, m1 + movq m1, [r0 + r1] + punpcklbw m0, m1 pmaddubsw m2, m6 - pmaddubsw m7, m5 - - paddw m2, m7 + pmaddubsw m0, m5 + + paddw m2, m0 psubw m2, m4 - movh [r2 + 2 * r3], m2 + lea r2,[r2 + 2 * r3] + movh [r2], m2 pshufd m2, m2, 2 - movd [r2 + 2 * r3 + 8], m2 - - movq m2,[r6 + 2 * r1] - - punpcklbw m3, m0 + movd [r2 + 8], m2 + + movq m2,[r0 + 2 * r1] punpcklbw m1, m2 pmaddubsw m3, m6 @@ -1902,13 +1880,11 @@ paddw m3, m1 psubw m3, m4 - lea r5,[r2 + 2 * r3] - movh [r5 + r3], m3 + movh [r2 + r3], m3 pshufd m3, m3, 2 - movd [r5 + r3 + 8], m3 - - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] + movd [r2 + r3 + 8], m3 + + lea r2, [r2 + 2 * r3] dec r4d jnz .loop @@ -1934,68 +1910,61 @@ pshufb m1, m0, [tab_Vm] pshufb m0, 
[tab_Vm + 16] - mova m7, [pw_2000] - mov r4d, 16/2 .loop movu m2, [r0] movu m3, [r0 + r1] - punpcklbw m4, m2, m3, - punpckhbw m2, m3, + punpcklbw m4, m2, m3 + punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 - movu m5, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movu m3, [r5 + r1] - - punpcklbw m6, m5, m3, - punpckhbw m5, m3, - + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 pmaddubsw m6, m0 - pmaddubsw m5, m0 - paddw m4, m6 - paddw m2, m5 - - psubw m4, m7 - psubw m2, m7 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 movu [r2], m4 movh [r2 + 16], m2 - movu m2, [r0 + r1] - movu m3, [r0 + 2 * r1] - - punpcklbw m4, m2, m3, - punpckhbw m2, m3, + punpcklbw m4, m3, m5 + punpckhbw m3, m5 pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5 + r1] - movu m3, [r5 + 2 * r1] - - punpcklbw m6, m5, m3, - punpckhbw m5, m3, - - pmaddubsw m6, m0 + pmaddubsw m3, m1 + + movu m2, [r0 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + pmaddubsw m5, m0 - - paddw m4, m6 - paddw m2, m5 - - psubw m4, m7 - psubw m2, m7 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 movu [r2 + r3], m4 - movh [r2 + r3 + 16], m2 - - lea r0, [r0 + 2 * r1] + movh [r2 + r3 + 16], m3 + lea r2, [r2 + 2 * r3] dec r4d @@ -2022,7 +1991,6 @@ pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 .loop @@ -2030,63 +1998,54 @@ movu m3, [r0 + r1] punpcklbw m4, m2, m3 - punpckhbw m5, m2, m3 + punpckhbw m2, m3 pmaddubsw m4, m1 - pmaddubsw m5, m1 - - movu m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movu m3, [r5 + r1] - - punpcklbw m6, m2, m3 - punpckhbw m7, m2, m3 - + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], 
m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 pmaddubsw m7, m0 - paddw m4, m6 - paddw m5, m7 - - mova m6, [pw_2000] + paddw m4, m2 + paddw m3, m7 psubw m4, m6 - psubw m5, m6 - - movu [r2], m4 - movu [r2 + 16], m5 - - movu m2, [r0 + r1] - movu m3, [r0 + 2 * r1] - - punpcklbw m4, m2, m3 - punpckhbw m5, m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m5, m1 - - lea r5, [r0 + 2 * r1] - movu m2, [r5 + r1] - movu m3, [r5 + 2 * r1] - - punpcklbw m6, m2, m3, - punpckhbw m7, m2, m3, - - pmaddubsw m6, m0 - pmaddubsw m7, m0 - - paddw m4, m6 - paddw m5, m7 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m5, m6 + psubw m3, m6 movu [r2 + r3], m4 - movu [r2 + r3 + 16], m5 - - lea r0, [r0 + 2 * r1] + movu [r2 + r3 + 16], m3 + lea r2, [r2 + 2 * r3] dec r4d @@ -2120,100 +2079,92 @@ pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] - mova m7, [pw_2000] - mov r4d, 32/2 .loop movu m2, [r0] movu m3, [r0 + r1] - punpcklbw m4, m2, m3, - punpckhbw m2, m3, + punpcklbw m4, m2, m3 + punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 - movu m5, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] - movu m3, [r5 + r1] - - punpcklbw m6, m5, m3, - punpckhbw m5, m3 - + + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 pmaddubsw m6, m0 - pmaddubsw m5, m0 - paddw m4, m6 - paddw m2, m5 - - psubw m4, m7 - psubw m2, m7 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + movq m2, [r0 + 16] movq m3, [r0 + r1 + 16] - movq m4, [r0 + 2 * r1 + 16] + movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] 
punpcklbw m2, m3 - punpcklbw m4, m5 + punpcklbw m7, m4, m5 pmaddubsw m2, m1 - pmaddubsw m4, m0 - - paddw m2, m4 - psubw m2, m7 + pmaddubsw m7, m0 + + paddw m2, m7 + psubw m2, m6 movu [r2 + 32], m2 - movu m2, [r0 + r1] - movu m3, [r0 + 2 * r1] - - punpcklbw m4, m2, m3, - punpckhbw m2, m3, - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5 + r1] - movu m3, [r5 + 2 * r1] - - punpcklbw m6, m5, m3, - punpckhbw m5, m3 - - pmaddubsw m6, m0 + movq m2, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m2 + + pmaddubsw m3, m1 pmaddubsw m5, m0 - paddw m4, m6 - paddw m2, m5 - - psubw m4, m7 - psubw m2, m7 - - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m2 - - movq m2, [r0 + r1 + 16] - movq m3, [r0 + 2 * r1 + 16] - movq m4, [r5 + r1 + 16] - movq m5, [r5 + 2 * r1 + 16] - - punpcklbw m2, m3 - punpcklbw m4, m5 - - pmaddubsw m2, m1 - pmaddubsw m4, m0 - - paddw m2, m4 - - psubw m2, m7 - movu [r2 + r3 + 32], m2 - - lea r0, [r0 + 2 * r1] + paddw m3, m5 + psubw m3, m6 + + movu [r2 + r3 + 32], m3 + + mov r0, r5 lea r2, [r2 + 2 * r3] dec r4d @@ -2249,18 +2200,18 @@ movu m2, [r0] movu m3, [r0 + r1] - punpcklbw m4, m2, m3, - punpckhbw m2, m3, + punpcklbw m4, m2, m3 + punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 - movu m3, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] + movu m3, [r5] movu m5, [r5 + r1] punpcklbw m6, m3, m5 - punpckhbw m3, m5, + punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 @@ -2277,17 +2228,17 @@ movu m2, [r0 + 16] movu m3, [r0 + r1 + 16] - punpcklbw m4, m2, m3, - punpckhbw m2, m3, + punpcklbw m4, m2, m3 + punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 - movu m3, [r0 + 2 * r1 + 16] + movu m3, [r5 + 16] movu m5, [r5 + r1 + 16] punpcklbw m6, m3, m5 - punpckhbw m3, m5, + punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 @@ -2319,14 +2270,14 @@ ;----------------------------------------------------------------------------- %macro FILTER_V4_W8_H8_H16_H32 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 +cglobal 
interp_4tap_vert_pp_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC -lea r6, [tab_ChromaCoeff] -movd m5, [r6 + r4 * 4] +lea r5, [tab_ChromaCoeff] +movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif @@ -2334,6 +2285,7 @@ pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [tab_c_512] +lea r5, [r1 * 3] mov r4d, %2 @@ -2341,14 +2293,14 @@ movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movq m3, [r5 + r1] +movq m3, [r0 + r5] punpcklbw m0, m1 -punpcklbw m7, m2, m3 +punpcklbw m1, m2 +punpcklbw m2, m3 pmaddubsw m0, m6 -pmaddubsw m7, m5 +pmaddubsw m7, m2, m5 paddw m0, m7 @@ -2356,13 +2308,13 @@ packuswb m0, m0 movh [r2], m0 -movq m0, [r0 + 4 * r1] - -punpcklbw m1, m2 -punpcklbw m7, m3, m0 +lea r0, [r0 + 4 * r1] +movq m0, [r0] + +punpcklbw m3, m0 pmaddubsw m1, m6 -pmaddubsw m7, m5 +pmaddubsw m7, m3, m5 paddw m1, m7 @@ -2370,25 +2322,19 @@ packuswb m1, m1 movh [r2 + r3], m1 -lea r6, [r0 + 4 * r1] -movq m1, [r6 + r1] - -punpcklbw m2, m3 -punpcklbw m7, m0, m1 +movq m1, [r0 + r1] + +punpcklbw m0, m1 pmaddubsw m2, m6 -pmaddubsw m7, m5 - -paddw m2, m7 +pmaddubsw m0, m5 + +paddw m2, m0 pmulhrsw m2, m4 -packuswb m2, m2 -movh [r2 + 2 * r3], m2 - -movq m2, [r6 + 2 * r1] - -punpcklbw m3, m0 -punpcklbw m1, m2 + +movq m7, [r0 + 2 * r1] +punpcklbw m1, m7 pmaddubsw m3, m6 pmaddubsw m1, m5 @@ -2396,13 +2342,13 @@ paddw m3, m1 pmulhrsw m3, m4 -packuswb m3, m3 - -lea r5, [r2 + 2 * r3] -movh [r5 + r3], m3 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] +packuswb m2, m3 + +lea r2, [r2 + 2 * r3] +movh [r2], m2 +movhps [r2 + r3], m2 + +lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop @@ -2418,14 +2364,14 @@ ;----------------------------------------------------------------------------- %macro FILTER_V4_W6_H4 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_6x8, 4, 7, 8 +cglobal interp_4tap_vert_pp_6x8, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC -lea r6, [tab_ChromaCoeff] -movd m5, [r6 + r4 * 4] +lea r5, [tab_ChromaCoeff] +movd m5, [r5 + r4 * 4] 
%else movd m5, [tab_ChromaCoeff + r4 * 4] %endif @@ -2435,19 +2381,20 @@ mova m4, [tab_c_512] mov r4d, %2 +lea r5, [3 * r1] .loop movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movq m3, [r5 + r1] +movq m3, [r0 + r5] punpcklbw m0, m1 -punpcklbw m7, m2, m3 +punpcklbw m1, m2 +punpcklbw m2, m3 pmaddubsw m0, m6 -pmaddubsw m7, m5 +pmaddubsw m7, m2, m5 paddw m0, m7 @@ -2456,13 +2403,13 @@ movd [r2], m0 pextrw [r2 + 4], m0, 2 -movq m0, [r0 + 4 * r1] - -punpcklbw m1, m2 -punpcklbw m7, m3, m0 +lea r0, [r0 + 4 * r1] + +movq m0, [r0] +punpcklbw m3, m0 pmaddubsw m1, m6 -pmaddubsw m7, m5 +pmaddubsw m7, m3, m5 paddw m1, m7 @@ -2471,10 +2418,7 @@ movd [r2 + r3], m1 pextrw [r2 + r3 + 4], m1, 2 -lea r6, [r0 + 4 * r1] -movq m1, [r6 + r1] - -punpcklbw m2, m3 +movq m1, [r0 + r1] punpcklbw m7, m0, m1 pmaddubsw m2, m6 @@ -2484,12 +2428,11 @@ pmulhrsw m2, m4 packuswb m2, m2 -movd [r2 + 2 * r3], m2 -pextrw [r2 + 2 * r3 + 4], m2, 2 - -movq m2, [r6 + 2 * r1] - -punpcklbw m3, m0 +lea r2, [r2 + 2 * r3] +movd [r2], m2 +pextrw [r2 + 4], m2, 2 + +movq m2, [r0 + 2 * r1] punpcklbw m1, m2 pmaddubsw m3, m6 @@ -2500,12 +2443,10 @@ pmulhrsw m3, m4 packuswb m3, m3 -lea r5, [r2 + 2 * r3] -movd [r5 + r3], m3 -pextrw [r5 + r3 + 4], m3, 2 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] +movd [r2 + r3], m3 +pextrw [r2 + r3 + 4], m3, 2 + +lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop @@ -2534,72 +2475,65 @@ pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] -mova m7, [tab_c_512] - -mov r4d, %2 +mov r4d, %2 .loop movu m2, [r0] movu m3, [r0 + r1] -punpcklbw m4, m2, m3, -punpckhbw m2, m3, +punpcklbw m4, m2, m3 +punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 -movu m5, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movu m3, [r5 + r1] - -punpcklbw m6, m5, m3, -punpckhbw m5, m3, - +lea r0, [r0 + 2 * r1] +movu m5, [r0] +movu m7, [r0 + r1] + +punpcklbw m6, m5, m7 pmaddubsw m6, m0 -pmaddubsw m5, m0 - paddw m4, m6 -paddw m2, m5 - -pmulhrsw m4, m7 -pmulhrsw m2, m7 + +punpckhbw m6, m5, m7 +pmaddubsw 
m6, m0 +paddw m2, m6 + +mova m6, [tab_c_512] + +pmulhrsw m4, m6 +pmulhrsw m2, m6 packuswb m4, m2 movh [r2], m4 pextrd [r2 + 8], m4, 2 -movu m2, [r0 + r1] -movu m3, [r0 + 2 * r1] - -punpcklbw m4, m2, m3, -punpckhbw m2, m3, +punpcklbw m4, m3, m5 +punpckhbw m3, m5 pmaddubsw m4, m1 -pmaddubsw m2, m1 - -lea r5, [r0 + 2 * r1] -movu m5, [r5 + r1] -movu m3, [r5 + 2 * r1] - -punpcklbw m6, m5, m3, -punpckhbw m5, m3, - -pmaddubsw m6, m0 -pmaddubsw m5, m0 - -paddw m4, m6 -paddw m2, m5 - -pmulhrsw m4, m7 -pmulhrsw m2, m7 - -packuswb m4, m2 +pmaddubsw m3, m1 + +movu m5, [r0 + 2 * r1] + +punpcklbw m2, m7, m5 +punpckhbw m7, m5 + +pmaddubsw m2, m0 +pmaddubsw m7, m0 + +paddw m4, m2 +paddw m3, m7 + +pmulhrsw m4, m6 +pmulhrsw m3, m6 + +packuswb m4, m3 movh [r2 + r3], m4 pextrd [r2 + r3 + 8], m4, 2 -lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] sub r4, 2 @@ -2635,66 +2569,57 @@ movu m2, [r0] movu m3, [r0 + r1] -punpcklbw m4, m2, m3, -punpckhbw m5, m2, m3, +punpcklbw m4, m2, m3 +punpckhbw m2, m3 pmaddubsw m4, m1 -pmaddubsw m5, m1 - -movu m2, [r0 + 2 * r1] -lea r5, [r0 + 2 * r1] -movu m3, [r5 + r1] - -punpcklbw m6, m2, m3, -punpckhbw m7, m2, m3, - +pmaddubsw m2, m1 + +lea r0, [r0 + 2 * r1] +movu m5, [r0] +movu m6, [r0 + r1] + +punpckhbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m2, m7 + +punpcklbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m4, m7 + +mova m7, [tab_c_512] + +pmulhrsw m4, m7 +pmulhrsw m2, m7 + +packuswb m4, m2 + +movu [r2], m4 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m5, [r0 + 2 * r1] + +punpcklbw m2, m6, m5 +punpckhbw m6, m5 + +pmaddubsw m2, m0 pmaddubsw m6, m0 -pmaddubsw m7, m0 - -paddw m4, m6; -paddw m5, m7; - -mova m6, [tab_c_512] - -pmulhrsw m4, m6 -pmulhrsw m5, m6 - -packuswb m4, m5 - -movu [r2], m4 - -movu m2, [r0 + r1] -movu m3, [r0 + 2 * r1] - -punpcklbw m4, m2, m3, -punpckhbw m5, m2, m3, - -pmaddubsw m4, m1 -pmaddubsw m5, m1 - -lea r5, [r0 + 2 * r1] -movu m2, [r5 + r1] -movu m3, [r5 + 2 * r1] - -punpcklbw m6, m2, m3, -punpckhbw m7, 
m2, m3, - -pmaddubsw m6, m0 -pmaddubsw m7, m0 - -paddw m4, m6 -paddw m5, m7 - -mova m6, [tab_c_512] - -pmulhrsw m4, m6 -pmulhrsw m5, m6 - -packuswb m4, m5 + +paddw m4, m2 +paddw m3, m6 + +pmulhrsw m4, m7 +pmulhrsw m3, m7 + +packuswb m4, m3 movu [r2 + r3], m4 -lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] sub r4, 2 @@ -2728,43 +2653,66 @@ pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] -mova m7, [tab_c_512] - mov r4d, %2 .loop movu m2, [r0] movu m3, [r0 + r1] -punpcklbw m4, m2, m3, -punpckhbw m2, m3, +punpcklbw m4, m2, m3 +punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 -movu m5, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] -movu m3, [r5 + r1] - -punpcklbw m6, m5, m3, -punpckhbw m5, m3 - +movu m5, [r5] +movu m7, [r5 + r1] + +punpcklbw m6, m5, m7 pmaddubsw m6, m0 +paddw m4, m6 + +punpckhbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m2, m6 + +mova m6, [tab_c_512] + +pmulhrsw m4, m6 +pmulhrsw m2, m6 + +packuswb m4, m2 + +movu [r2], m4 + +punpcklbw m4, m3, m5 +punpckhbw m3, m5 + +pmaddubsw m4, m1 +pmaddubsw m3, m1 + +movu m2, [r5 + 2 * r1] + +punpcklbw m5, m7, m2 +punpckhbw m7, m2 + pmaddubsw m5, m0 - -paddw m4, m6 -paddw m2, m5 - -pmulhrsw m4, m7 -pmulhrsw m2, m7 - -packuswb m4, m2 - -movu [r2], m4 +pmaddubsw m7, m0 + +paddw m4, m5 +paddw m3, m7 + +pmulhrsw m4, m6 +pmulhrsw m3, m6 + +packuswb m4, m3 + +movu [r2 + r3], m4 movq m2, [r0 + 16] movq m3, [r0 + r1 + 16] -movq m4, [r0 + 2 * r1 + 16] +movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] punpcklbw m2, m3 @@ -2775,57 +2723,28 @@ paddw m2, m4 -pmulhrsw m2, m7 -packuswb m2, m2 +pmulhrsw m2, m6 + +movq m3, [r0 + r1 + 16] +movq m4, [r5 + 16] +movq m5, [r5 + r1 + 16] +movq m7, [r5 + 2 * r1 + 16] + +punpcklbw m3, m4 +punpcklbw m5, m7 + +pmaddubsw m3, m1 +pmaddubsw m5, m0 + +paddw m3, m5 + +pmulhrsw m3, m6 +packuswb m2, m3 + movh [r2 + 16], m2 - -movu m2, [r0 + r1] -movu m3, [r0 + 2 * r1] - -punpcklbw m4, m2, m3, -punpckhbw m2, m3, - -pmaddubsw m4, m1 -pmaddubsw m2, m1 - -lea r5, [r0 + 2 * r1] -movu m5, [r5 + r1] -movu m3, [r5 + 2 * r1] - 
-punpcklbw m6, m5, m3, -punpckhbw m5, m3 - -pmaddubsw m6, m0 -pmaddubsw m5, m0 - -paddw m4, m6 -paddw m2, m5 - -pmulhrsw m4, m7 -pmulhrsw m2, m7 - -packuswb m4, m2 - -movu [r2 + r3], m4 - -movq m2, [r0 + r1 + 16] -movq m3, [r0 + 2 * r1 + 16] -movq m4, [r5 + r1 + 16] -movq m5, [r5 + 2 * r1 + 16] - -punpcklbw m2, m3 -punpcklbw m4, m5 - -pmaddubsw m2, m1 -pmaddubsw m4, m0 - -paddw m2, m4 - -pmulhrsw m2, m7 -packuswb m2, m2 -movh [r2 + r3 + 16], m2 - -lea r0, [r0 + 2 * r1] +movhps [r2 + r3 + 16], m2 + +mov r0, r5 lea r2, [r2 + 2 * r3] sub r4, 2 @@ -2863,18 +2782,18 @@ movu m2, [r0] movu m3, [r0 + r1] -punpcklbw m4, m2, m3, -punpckhbw m2, m3, +punpcklbw m4, m2, m3 +punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 -movu m3, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] +movu m3, [r5] movu m5, [r5 + r1] punpcklbw m6, m3, m5 -punpckhbw m3, m5, +punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 @@ -2892,17 +2811,17 @@ movu m2, [r0 + 16] movu m3, [r0 + r1 + 16] -punpcklbw m4, m2, m3, -punpckhbw m2, m3, +punpcklbw m4, m2, m3 +punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 -movu m3, [r0 + 2 * r1 + 16] +movu m3, [r5 + 16] movu m5, [r5 + r1 + 16] punpcklbw m6, m3, m5 -punpckhbw m3, m5, +punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 diff -r 07b5d6b82f5f -r 7eccc042e269 source/test/ipfilterharness.cpp --- a/source/test/ipfilterharness.cpp Mon Feb 10 15:05:04 2014 -0600 +++ b/source/test/ipfilterharness.cpp Tue Feb 11 18:53:23 2014 +0530 @@ -158,7 +158,7 @@ rand_coeffIdx = rand() % 8; // Random coeffIdex in the filter rand_srcStride = rand() % 100; // Randomly generated srcStride - rand_dstStride = rand() % 100; // Randomly generated dstStride + rand_dstStride = rand() % 100 + 32; // Randomly generated dstStride opt(pixel_buff + 3 * rand_srcStride, rand_srcStride, @@ -187,7 +187,7 @@ rand_coeffIdx = rand() % 8; // Random coeffIdex in the filter rand_srcStride = rand() % 100; // Randomly generated srcStride - rand_dstStride = rand() % 100; // Randomly generated dstStride + 
rand_dstStride = rand() % 100 + 32; // Randomly generated dstStride ref(pixel_buff + 3 * rand_srcStride, rand_srcStride, _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel