I think the x86 scale factor can be at most 8 (only 1, 2, 4, or 8 are encodable), so your instruction [tab_coeff + height * 16] is invalid. We can replace it with the following: movq x, [tab_coeff + height * 8] punpcklqdq x, x (or pshufd x, x, 01000100b, which is faster on some CPUs)
At 2013-10-12 01:51:57, "Praveen Tiwari" <[email protected]> wrote: I forgot to change the line mova coef2, [tab_coeff + 16] (I was only testing with coeffIdx 1). I will make it work for an arbitrary index, e.g. mova coef2, [tab_coeff + height * 16]. Please ignore this. Regards, Praveen On Fri, Oct 11, 2013 at 10:20 PM, <[email protected]> wrote: # HG changeset patch # User Praveen Tiwari # Date 1381510220 -19800 # Node ID 5a9160e8b0bdc3117c2417bc29453077488efd8e # Parent c6d89dc62e191f56f63dbcb1781a6494da50a70d chroma 4XN block, coeffIdex insted of coeff pointer diff -r c6d89dc62e19 -r 5a9160e8b0bd source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Oct 11 01:47:53 2013 -0500 +++ b/source/common/x86/ipfilter8.asm Fri Oct 11 22:20:20 2013 +0530 @@ -26,107 +26,58 @@ %include "x86inc.asm" %include "x86util.asm" -%if ARCH_X86_64 == 0 - SECTION_RODATA 32 -tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 - tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 tab_c_512: times 8 dw 512 +tab_coeff: db 0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0 + db -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 + db -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 + db -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 + db -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 + db -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 + db -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 + db -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 + SECTION .text -%macro FILTER_H4 3 - movu %1, [src + col - 1] - pshufb %2, %1, Tm4 +%macro FILTER_H4_w4 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 pmaddubsw %2, coef2 - pshufb %1, %1, Tm5 - pmaddubsw %1, coef2 phaddw %2, %1 pmulhrsw %2, %3 packuswb %2, %2 %endmacro +%macro FILTER_H4_w4_CALL 0 + FILTER_H4_w4 x0, x1, x2 + + movd [dstq], x1 + + add srcq, srcstrideq + 
add dstq, dststrideq +%endmacro + ;----------------------------------------------------------------------------- -; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff) +; void interp_4tap_horiz_pp_w4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal filterHorizontal_p_p_4, 0, 7, 8 -%define src r0 -%define dst r1 -%define row r2 -%define col r3 -%define width r4 -%define widthleft r5 -%define mask_offset r6 -%define coef2 m7 -%define x3 m6 -%define Tm5 m5 -%define Tm4 m4 -%define x2 m3 -%define x1 m2 -%define x0 m1 -%define leftmask m0 -%define tmp r0 -%define tmp1 r1 - - mov tmp, r6m - movu coef2, [tmp] - packsswb coef2, coef2 - pshufd coef2, coef2, 0 +cglobal interp_4tap_horiz_pp_w4, 6, 6, 5, src, srcstride, dst, dststride, height, coeffIdx +%define coef2 m4 +%define Tm0 m3 +%define x2 m2 +%define x1 m1 +%define x0 m0 - mova x3, [tab_c_512] + mova coef2, [tab_coeff + 16] + mova x2, [tab_c_512] + mova Tm0, [tab_Tm] - mov width, r4m - mov widthleft, width - and width, ~7 - and widthleft, 7 - mov mask_offset, widthleft - neg mask_offset +.loop +FILTER_H4_w4_CALL +dec r4d +jnz .loop +RET - movq leftmask, [tab_leftmask + (7 + mask_offset)] - mova Tm4, [tab_Tm] - mova Tm5, [tab_Tm + 16] - - mov src, r0m - mov dst, r2m - mov row, r5m - -_loop_row: - xor col, col - -_loop_col: - FILTER_H4 x0, x1, x3 - movh [dst + col], x1 - - add col, 8 - - cmp col, width - jl _loop_col - -_end_col: - test widthleft, widthleft - jz _next_row - - movq x2, [dst + col] - FILTER_H4 x0, x1, x3 - pblendvb x2, x2, x1, leftmask - movh [dst + col], x2 - -_next_row: - add src, r1m - add dst, r3m - dec row - - test row, row - jz _end_row - - jmp _loop_row - -_end_row: - - RET - -%endif ; ARCH_X86_64 == 0
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
