# HG changeset patch
# User Min Chen <chenm...@163.com>
# Date 1385275951 -28800
# Node ID 464af047f7b12a0a0e105d7550d454f30cf16eea
# Parent 10f605bd053009c8c981c7529322fecd1e54af7b
cleanup: remove unused code in mc-a2.asm
diff -r 10f605bd0530 -r 464af047f7b1 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Fri Nov 22 14:59:34 2013 -0600 +++ b/source/common/x86/mc-a2.asm Sun Nov 24 14:52:31 2013 +0800 @@ -32,10 +32,6 @@ SECTION_RODATA 32 -filt_mul20: times 32 db 20 -filt_mul15: times 16 db 1, -5 -filt_mul51: times 16 db -5, 1 -hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH @@ -51,15 +47,6 @@ pd_0f: times 4 dd 0xffff pf_inv256: times 8 dd 0.00390625 -pad10: times 8 dw 10*PIXEL_MAX -pad20: times 8 dw 20*PIXEL_MAX -pad30: times 8 dw 30*PIXEL_MAX -depad: times 4 dd 32*20*PIXEL_MAX + 512 - -tap1: times 4 dw 1, -5 -tap2: times 4 dw 20, 20 -tap3: times 4 dw -5, 1 - SECTION .text cextern pb_0 @@ -72,86 +59,6 @@ cextern pw_pixel_max cextern pd_ffff -%macro LOAD_ADD 4 - movh %4, %3 - movh %1, %2 - punpcklbw %4, m0 - punpcklbw %1, m0 - paddw %1, %4 -%endmacro - -%macro LOAD_ADD_2 6 - mova %5, %3 - mova %1, %4 - punpckhbw %6, %5, m0 - punpcklbw %5, m0 - punpckhbw %2, %1, m0 - punpcklbw %1, m0 - paddw %1, %5 - paddw %2, %6 -%endmacro - -%macro FILT_V2 6 - psubw %1, %2 ; a-b - psubw %4, %5 - psubw %2, %3 ; b-c - psubw %5, %6 - psllw %2, 2 - psllw %5, 2 - psubw %1, %2 ; a-5*b+4*c - psllw %3, 4 - psubw %4, %5 - psllw %6, 4 - paddw %1, %3 ; a-5*b+20*c - paddw %4, %6 -%endmacro - -%macro FILT_H 3 - psubw %1, %2 ; a-b - psraw %1, 2 ; (a-b)/4 - psubw %1, %2 ; (a-b)/4-b - paddw %1, %3 ; (a-b)/4-b+c - psraw %1, 2 ; ((a-b)/4-b+c)/4 - paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 -%endmacro - -%macro FILT_H2 6 - psubw %1, %2 - psubw %4, %5 - psraw %1, 2 - psraw %4, 2 - psubw %1, %2 - psubw %4, %5 - paddw %1, %3 - paddw %4, %6 - psraw %1, 2 - psraw %4, 2 - paddw %1, %3 - paddw %4, %6 -%endmacro - -%macro FILT_PACK 3-5 -%if cpuflag(ssse3) - pmulhrsw %1, %3 - pmulhrsw %2, %3 -%else - paddw %1, %3 - paddw %2, %3 -%if %0 == 5 - psubusw %1, %5 - psubusw %2, %5 - psrlw %1, %4 - psrlw %2, %4 -%else 
- psraw %1, %4 - psraw %2, %4 -%endif -%endif -%if HIGH_BIT_DEPTH == 0 - packuswb %1, %2 -%endif -%endmacro - ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. ;Doing the hpel_filter temporal may be a win if the last level cache @@ -161,738 +68,7 @@ ;%define movntps movaps ;%define sfence -%if HIGH_BIT_DEPTH -;----------------------------------------------------------------------------- -; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width ); -;----------------------------------------------------------------------------- -%macro HPEL_FILTER 0 -cglobal hpel_filter_v, 5,6,11 - FIX_STRIDES r3, r4 - lea r5, [r1+r3] - sub r1, r3 - sub r1, r3 -%if num_mmregs > 8 - mova m8, [pad10] - mova m9, [pad20] - mova m10, [pad30] - %define s10 m8 - %define s20 m9 - %define s30 m10 -%else - %define s10 [pad10] - %define s20 [pad20] - %define s30 [pad30] -%endif - add r0, r4 - add r2, r4 - neg r4 - mova m7, [pw_pixel_max] - pxor m0, m0 -.loop: - mova m1, [r1] - mova m2, [r1+r3] - mova m3, [r1+r3*2] - mova m4, [r1+mmsize] - mova m5, [r1+r3+mmsize] - mova m6, [r1+r3*2+mmsize] - paddw m1, [r5+r3*2] - paddw m2, [r5+r3] - paddw m3, [r5] - paddw m4, [r5+r3*2+mmsize] - paddw m5, [r5+r3+mmsize] - paddw m6, [r5+mmsize] - add r1, 2*mmsize - add r5, 2*mmsize - FILT_V2 m1, m2, m3, m4, m5, m6 - mova m6, [pw_16] - psubw m1, s20 - psubw m4, s20 - mova [r2+r4], m1 - mova [r2+r4+mmsize], m4 - paddw m1, s30 - paddw m4, s30 - FILT_PACK m1, m4, m6, 5, s10 - CLIPW m1, m0, m7 - CLIPW m4, m0, m7 - mova [r0+r4], m1 - mova [r0+r4+mmsize], m4 - add r4, 2*mmsize - jl .loop - RET - -;----------------------------------------------------------------------------- -; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width ); -;----------------------------------------------------------------------------- -cglobal hpel_filter_c, 3,3,10 - add r2, r2 - add r0, r2 - add r1, r2 - neg r2 - mova m0, 
[tap1] - mova m7, [tap3] -%if num_mmregs > 8 - mova m8, [tap2] - mova m9, [depad] - %define s1 m8 - %define s2 m9 -%else - %define s1 [tap2] - %define s2 [depad] -%endif -.loop: - movu m1, [r1+r2-4] - movu m2, [r1+r2-2] - mova m3, [r1+r2+0] - movu m4, [r1+r2+2] - movu m5, [r1+r2+4] - movu m6, [r1+r2+6] - pmaddwd m1, m0 - pmaddwd m2, m0 - pmaddwd m3, s1 - pmaddwd m4, s1 - pmaddwd m5, m7 - pmaddwd m6, m7 - paddd m1, s2 - paddd m2, s2 - paddd m3, m5 - paddd m4, m6 - paddd m1, m3 - paddd m2, m4 - psrad m1, 10 - psrad m2, 10 - pslld m2, 16 - pand m1, [pd_0f] - por m1, m2 - CLIPW m1, [pb_0], [pw_pixel_max] - mova [r0+r2], m1 - add r2, mmsize - jl .loop - RET - -;----------------------------------------------------------------------------- -; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width ); -;----------------------------------------------------------------------------- -cglobal hpel_filter_h, 3,4,8 - %define src r1+r2 - add r2, r2 - add r0, r2 - add r1, r2 - neg r2 - mova m0, [pw_pixel_max] -.loop: - movu m1, [src-4] - movu m2, [src-2] - mova m3, [src+0] - movu m6, [src+2] - movu m4, [src+4] - movu m5, [src+6] - paddw m3, m6 ; c0 - paddw m2, m4 ; b0 - paddw m1, m5 ; a0 -%if mmsize == 16 - movu m4, [src-4+mmsize] - movu m5, [src-2+mmsize] -%endif - movu m7, [src+4+mmsize] - movu m6, [src+6+mmsize] - paddw m5, m7 ; b1 - paddw m4, m6 ; a1 - movu m7, [src+2+mmsize] - mova m6, [src+0+mmsize] - paddw m6, m7 ; c1 - FILT_H2 m1, m2, m3, m4, m5, m6 - mova m7, [pw_1] - pxor m2, m2 - FILT_PACK m1, m4, m7, 1 - CLIPW m1, m2, m0 - CLIPW m4, m2, m0 - mova [r0+r2], m1 - mova [r0+r2+mmsize], m4 - add r2, mmsize*2 - jl .loop - RET -%endmacro ; HPEL_FILTER - -INIT_MMX mmx2 -HPEL_FILTER -INIT_XMM sse2 -HPEL_FILTER -%endif ; HIGH_BIT_DEPTH - %if HIGH_BIT_DEPTH == 0 -%macro HPEL_V 1 -;----------------------------------------------------------------------------- -; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width ); 
-;----------------------------------------------------------------------------- -cglobal hpel_filter_v, 5,6,%1 - lea r5, [r1+r3] - sub r1, r3 - sub r1, r3 - add r0, r4 - lea r2, [r2+r4*2] - neg r4 -%if cpuflag(ssse3) - mova m0, [filt_mul15] -%else - pxor m0, m0 -%endif -.loop: -%if cpuflag(ssse3) - mova m1, [r1] - mova m4, [r1+r3] - mova m2, [r5+r3*2] - mova m5, [r5+r3] - mova m3, [r1+r3*2] - mova m6, [r5] - SBUTTERFLY bw, 1, 4, 7 - SBUTTERFLY bw, 2, 5, 7 - SBUTTERFLY bw, 3, 6, 7 - pmaddubsw m1, m0 - pmaddubsw m4, m0 - pmaddubsw m2, m0 - pmaddubsw m5, m0 - pmaddubsw m3, [filt_mul20] - pmaddubsw m6, [filt_mul20] - paddw m1, m2 - paddw m4, m5 - paddw m1, m3 - paddw m4, m6 - mova m7, [pw_1024] -%else - LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 - LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 - LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 - LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 - FILT_V2 m1, m2, m3, m4, m5, m6 - mova m7, [pw_16] -%endif -%if mmsize==32 - mova [r2+r4*2], xm1 - mova [r2+r4*2+mmsize/2], xm4 - vextracti128 [r2+r4*2+mmsize], m1, 1 - vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 -%else - mova [r2+r4*2], m1 - mova [r2+r4*2+mmsize], m4 -%endif - FILT_PACK m1, m4, m7, 5 - movnta [r0+r4], m1 - add r1, mmsize - add r5, mmsize - add r4, mmsize - jl .loop - RET -%endmacro - -;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); -;----------------------------------------------------------------------------- -INIT_MMX mmx2 -cglobal hpel_filter_c, 3,3 - add r0, r2 - lea r1, [r1+r2*2] - neg r2 - %define src r1+r2*2 - movq m7, [pw_32] -.loop: - movq m1, [src-4] - movq m2, [src-2] - movq m3, [src ] - movq m4, [src+4] - movq m5, [src+6] - paddw m3, [src+2] ; c0 - paddw m2, m4 ; b0 - paddw m1, m5 ; a0 - movq m6, [src+8] - paddw m4, [src+14] ; a1 - paddw m5, [src+12] ; b1 - paddw m6, [src+10] ; c1 - FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, m7, 
6 - movntq [r0+r2], m1 - add r2, 8 - jl .loop - RET - -;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); -;----------------------------------------------------------------------------- -INIT_MMX mmx2 -cglobal hpel_filter_h, 3,3 - add r0, r2 - add r1, r2 - neg r2 - %define src r1+r2 - pxor m0, m0 -.loop: - movd m1, [src-2] - movd m2, [src-1] - movd m3, [src ] - movd m6, [src+1] - movd m4, [src+2] - movd m5, [src+3] - punpcklbw m1, m0 - punpcklbw m2, m0 - punpcklbw m3, m0 - punpcklbw m6, m0 - punpcklbw m4, m0 - punpcklbw m5, m0 - paddw m3, m6 ; c0 - paddw m2, m4 ; b0 - paddw m1, m5 ; a0 - movd m7, [src+7] - movd m6, [src+6] - punpcklbw m7, m0 - punpcklbw m6, m0 - paddw m4, m7 ; c1 - paddw m5, m6 ; b1 - movd m7, [src+5] - movd m6, [src+4] - punpcklbw m7, m0 - punpcklbw m6, m0 - paddw m6, m7 ; a1 - movq m7, [pw_1] - FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, m7, 1 - movntq [r0+r2], m1 - add r2, 8 - jl .loop - RET - -%macro HPEL_C 0 -;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); -;----------------------------------------------------------------------------- -cglobal hpel_filter_c, 3,3,9 - add r0, r2 - lea r1, [r1+r2*2] - neg r2 - %define src r1+r2*2 -%ifnidn cpuname, sse2 -%if cpuflag(ssse3) - mova m7, [pw_512] -%else - mova m7, [pw_32] -%endif - %define pw_rnd m7 -%elif ARCH_X86_64 - mova m8, [pw_32] - %define pw_rnd m8 -%else - %define pw_rnd [pw_32] -%endif -; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... 
-%if mmsize==32 -.loop: - movu m4, [src-4] - movu m5, [src-2] - mova m6, [src+0] - movu m3, [src-4+mmsize] - movu m2, [src-2+mmsize] - mova m1, [src+0+mmsize] - paddw m4, [src+6] - paddw m5, [src+4] - paddw m6, [src+2] - paddw m3, [src+6+mmsize] - paddw m2, [src+4+mmsize] - paddw m1, [src+2+mmsize] - FILT_H2 m4, m5, m6, m3, m2, m1 -%else - mova m0, [src-16] - mova m1, [src] -.loop: - mova m2, [src+16] - PALIGNR m4, m1, m0, 12, m7 - PALIGNR m5, m1, m0, 14, m0 - PALIGNR m0, m2, m1, 6, m7 - paddw m4, m0 - PALIGNR m0, m2, m1, 4, m7 - paddw m5, m0 - PALIGNR m6, m2, m1, 2, m7 - paddw m6, m1 - FILT_H m4, m5, m6 - - mova m0, m2 - mova m5, m2 - PALIGNR m2, m1, 12, m7 - PALIGNR m5, m1, 14, m1 - mova m1, [src+32] - PALIGNR m3, m1, m0, 6, m7 - paddw m3, m2 - PALIGNR m6, m1, m0, 4, m7 - paddw m5, m6 - PALIGNR m6, m1, m0, 2, m7 - paddw m6, m0 - FILT_H m3, m5, m6 -%endif - FILT_PACK m4, m3, pw_rnd, 6 -%if mmsize==32 - vpermq m4, m4, q3120 -%endif - movnta [r0+r2], m4 - add r2, mmsize - jl .loop - RET -%endmacro - -;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal hpel_filter_h, 3,3,8 - add r0, r2 - add r1, r2 - neg r2 - %define src r1+r2 - pxor m0, m0 -.loop: - movh m1, [src-2] - movh m2, [src-1] - movh m3, [src ] - movh m4, [src+1] - movh m5, [src+2] - movh m6, [src+3] - punpcklbw m1, m0 - punpcklbw m2, m0 - punpcklbw m3, m0 - punpcklbw m4, m0 - punpcklbw m5, m0 - punpcklbw m6, m0 - paddw m3, m4 ; c0 - paddw m2, m5 ; b0 - paddw m1, m6 ; a0 - movh m4, [src+6] - movh m5, [src+7] - movh m6, [src+10] - movh m7, [src+11] - punpcklbw m4, m0 - punpcklbw m5, m0 - punpcklbw m6, m0 - punpcklbw m7, m0 - paddw m5, m6 ; b1 - paddw m4, m7 ; a1 - movh m6, [src+8] - movh m7, [src+9] - punpcklbw m6, m0 - punpcklbw m7, m0 - paddw m6, m7 ; c1 - mova m7, [pw_1] ; FIXME xmm8 - FILT_H2 m1, m2, m3, m4, 
m5, m6 - FILT_PACK m1, m4, m7, 1 - movntps [r0+r2], m1 - add r2, 16 - jl .loop - RET - -;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); -;----------------------------------------------------------------------------- -%macro HPEL_H 0 -cglobal hpel_filter_h, 3,3 - add r0, r2 - add r1, r2 - neg r2 - %define src r1+r2 - mova m0, [src-16] - mova m1, [src] - mova m7, [pw_1024] -.loop: - mova m2, [src+16] - ; Using unaligned loads instead of palignr is marginally slower on SB and significantly - ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid - ; the repeated loads of constants for pmaddubsw. - palignr m3, m1, m0, 14 - palignr m4, m1, m0, 15 - palignr m0, m2, m1, 2 - pmaddubsw m3, [filt_mul15] - pmaddubsw m4, [filt_mul15] - pmaddubsw m0, [filt_mul51] - palignr m5, m2, m1, 1 - palignr m6, m2, m1, 3 - paddw m3, m0 - mova m0, m1 - pmaddubsw m1, [filt_mul20] - pmaddubsw m5, [filt_mul20] - pmaddubsw m6, [filt_mul51] - paddw m3, m1 - paddw m4, m5 - paddw m4, m6 - FILT_PACK m3, m4, m7, 5 - pshufb m3, [hpel_shuf] - mova m1, m2 - movntps [r0+r2], m3 - add r2, 16 - jl .loop - RET -%endmacro - -INIT_MMX mmx2 -HPEL_V 0 -INIT_XMM sse2 -HPEL_V 8 -%if ARCH_X86_64 == 0 -INIT_XMM sse2 -HPEL_C -INIT_XMM ssse3 -HPEL_C -HPEL_V 0 -HPEL_H -INIT_XMM avx -HPEL_C -HPEL_V 0 -HPEL_H -INIT_YMM avx2 -HPEL_V 8 -HPEL_C - -INIT_YMM avx2 -cglobal hpel_filter_h, 3,3,8 - add r0, r2 - add r1, r2 - neg r2 - %define src r1+r2 - mova m5, [filt_mul15] - mova m6, [filt_mul20] - mova m7, [filt_mul51] -.loop: - movu m0, [src-2] - movu m1, [src-1] - movu m2, [src+2] - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m7 - paddw m0, m2 - - mova m2, [src+0] - movu m3, [src+1] - movu m4, [src+3] - pmaddubsw m2, m6 - pmaddubsw m3, m6 - pmaddubsw m4, m7 - paddw m0, m2 - paddw m1, m3 - paddw m1, m4 - - mova m2, [pw_1024] - FILT_PACK m0, m1, m2, 5 - pshufb m0, [hpel_shuf] - movnta 
[r0+r2], m0 - add r2, mmsize - jl .loop - RET -%endif - -%if ARCH_X86_64 -%macro DO_FILT_V 5 - ;The optimum prefetch distance is difficult to determine in checkasm: - ;any prefetch seems slower than not prefetching. - ;In real use, the prefetch seems to be a slight win. - ;+mmsize is picked somewhat arbitrarily here based on the fact that even one - ;loop iteration is going to take longer than the prefetch. - prefetcht0 [r1+r2*2+mmsize] -%if cpuflag(ssse3) - mova m1, [r3] - mova m2, [r3+r2] - mova %3, [r3+r2*2] - mova m3, [r1] - mova %1, [r1+r2] - mova %2, [r1+r2*2] - punpckhbw m4, m1, m2 - punpcklbw m1, m2 - punpckhbw m2, %1, %2 - punpcklbw %1, %2 - punpckhbw %2, m3, %3 - punpcklbw m3, %3 - - pmaddubsw m1, m12 - pmaddubsw m4, m12 - pmaddubsw %1, m0 - pmaddubsw m2, m0 - pmaddubsw m3, m14 - pmaddubsw %2, m14 - - paddw m1, %1 - paddw m4, m2 - paddw m1, m3 - paddw m4, %2 -%else - LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1 - LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1 - LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1 - packuswb %3, %4 - FILT_V2 m1, m2, m3, m4, m5, m6 -%endif - add r3, mmsize - add r1, mmsize -%if mmsize==32 - vinserti128 %1, m1, xm4, 1 - vperm2i128 %2, m1, m4, q0301 -%else - mova %1, m1 - mova %2, m4 -%endif - FILT_PACK m1, m4, m15, 5 - movntps [r8+r4+%5], m1 -%endmacro - -%macro FILT_C 3 -%if mmsize==32 - vperm2i128 m3, %2, %1, q0003 -%endif - PALIGNR m1, %2, %1, (mmsize-4), m3 - PALIGNR m2, %2, %1, (mmsize-2), m3 -%if mmsize==32 - vperm2i128 %1, %3, %2, q0003 -%endif - PALIGNR m3, %3, %2, 4, %1 - PALIGNR m4, %3, %2, 2, %1 - paddw m3, m2 -%if mmsize==32 - mova m2, %1 -%endif - mova %1, %3 - PALIGNR %3, %3, %2, 6, m2 - paddw m4, %2 - paddw %3, m1 - FILT_H %3, m3, m4 -%endmacro - -%macro DO_FILT_C 4 - FILT_C %1, %2, %3 - FILT_C %2, %1, %4 - FILT_PACK %3, %4, m15, 6 -%if mmsize==32 - vpermq %3, %3, q3120 -%endif - movntps [r5+r4], %3 -%endmacro - -%macro ADD8TO16 5 - punpckhbw %3, %1, %5 - punpcklbw %1, %5 - punpcklbw %4, %2, 
%5 - punpckhbw %2, %5 - paddw %2, %3 - paddw %1, %4 -%endmacro - -%macro DO_FILT_H 3 -%if mmsize==32 - vperm2i128 m3, %2, %1, q0003 -%endif - PALIGNR m1, %2, %1, (mmsize-2), m3 - PALIGNR m2, %2, %1, (mmsize-1), m3 -%if mmsize==32 - vperm2i128 m3, %3, %2, q0003 -%endif - PALIGNR m4, %3, %2, 1 , m3 - PALIGNR m5, %3, %2, 2 , m3 - PALIGNR m6, %3, %2, 3 , m3 - mova %1, %2 -%if cpuflag(ssse3) - pmaddubsw m1, m12 - pmaddubsw m2, m12 - pmaddubsw %2, m14 - pmaddubsw m4, m14 - pmaddubsw m5, m0 - pmaddubsw m6, m0 - paddw m1, %2 - paddw m2, m4 - paddw m1, m5 - paddw m2, m6 - FILT_PACK m1, m2, m15, 5 - pshufb m1, [hpel_shuf] -%else ; ssse3, avx - ADD8TO16 m1, m6, m12, m3, m0 ; a - ADD8TO16 m2, m5, m12, m3, m0 ; b - ADD8TO16 %2, m4, m12, m3, m0 ; c - FILT_V2 m1, m2, %2, m6, m5, m4 - FILT_PACK m1, m6, m15, 5 -%endif - movntps [r0+r4], m1 - mova %2, %3 -%endmacro - -%macro HPEL 0 -;----------------------------------------------------------------------------- -; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; uint8_t *src, intptr_t stride, int width, int height ) -;----------------------------------------------------------------------------- -cglobal hpel_filter, 7,9,16 - mov r7, r3 - sub r5d, mmsize - mov r8, r1 - and r7, mmsize-1 - sub r3, r7 - add r0, r5 - add r8, r5 - add r7, r5 - add r5, r2 - mov r2, r4 - neg r7 - lea r1, [r3+r2] - sub r3, r2 - sub r3, r2 - mov r4, r7 -%if cpuflag(ssse3) - mova m0, [filt_mul51] - mova m12, [filt_mul15] - mova m14, [filt_mul20] - mova m15, [pw_1024] -%else - pxor m0, m0 - mova m15, [pw_16] -%endif -;ALIGN 16 -.loopy: -; first filter_v - DO_FILT_V m8, m7, m13, m12, 0 -;ALIGN 16 -.loopx: - DO_FILT_V m6, m5, m11, m12, mmsize -.lastx: -%if cpuflag(ssse3) - psrlw m15, 1 ; pw_512 -%else - paddw m15, m15 ; pw_32 -%endif - DO_FILT_C m9, m8, m7, m6 -%if cpuflag(ssse3) - paddw m15, m15 ; pw_1024 -%else - psrlw m15, 1 ; pw_16 -%endif - mova m7, m5 - DO_FILT_H m10, m13, m11 - add r4, mmsize - jl .loopx - cmp r4, mmsize - jl .lastx -; 
setup regs for next y - sub r4, r7 - sub r4, r2 - sub r1, r4 - sub r3, r4 - add r0, r2 - add r8, r2 - add r5, r2 - mov r4, r7 - sub r6d, 1 - jg .loopy - sfence - RET -%endmacro - -INIT_XMM sse2 -HPEL -INIT_XMM ssse3 -HPEL -INIT_XMM avx -HPEL -INIT_YMM avx2 -HPEL -%endif ; ARCH_X86_64 - %undef movntq %undef movntps %undef sfence _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel