> MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o     \
>                          x86/deinterlace.o      \
>                          x86/fmtconvert.o       \
> +                        x86/h264_qpel.o        \
>                          x86/h264_qpel_10bit.o  \
>                          $(YASM-OBJS-yes)
Is it just my email client, or is there an alignment issue here?

> +h_filt_shuf1: db 2, 3
> +              db 3, 4
> +              db 4, 5
> +              db 5, 6
> +              db 6, 7
> +              db 7, 8
> +              db 8, 9
> +              db 9, 10
> +h_filt_shuf2: db 0, 5
> +              db 1, 6
> +              db 2, 7
> +              db 3, 8
> +              db 4, 9
> +              db 5, 10
> +              db 6, 11
> +              db 7, 12
> +h_filt_shuf3: db 1, 4
> +              db 2, 5
> +              db 3, 6
> +              db 4, 7
> +              db 5, 8
> +              db 6, 9
> +              db 7, 10
> +              db 8, 11

Can you make these one line each?  This is needlessly line-breaky.

> +SECTION .text
> +
> +%macro AVG_MOV 2
> +; In the internal 16x16 calls, the pointer is incremented by 8 bytes.
> +; This breaks 16-byte alignment, so movement to mmx is required.
> +; Is there a way around this?
> +%if mmsize == 16
> +    movdq2q mm0, %2
> +    pavgb   mm0, %1
> +    movq    %1, mm0
> +%else
> +    pavgb   %2, %1
> +    MOV_OP  %1, %2
> +%endif
> +%endmacro

What in the world is this about?  You can use movq with SSE, silly.

> +%macro MC 2
> +%define OP_MOV movh
> +INIT_MMX mmx2
> +%1 mmx2, put, 4
> +INIT_XMM sse2
> +%1 sse2, put, 8
> +%if %2 ; ssse3, also implies avx

Misleading comment, I'm pretty sure ssse3 does not imply avx.

> +    mov r0, r0m
> +    mov r1, r1m

Why do you need to reload these pointers from the stack?  You know
/exactly/ how the stub has changed them, so you can take that into
account and do the proper math.

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro COPY4 1
> +    %1     m0, [r1      ]
> +    OP_MOV [r0      ], m0
> +    %1     m0, [r1+r2  ]
> +    OP_MOV [r0+r2  ], m0
> +    %1     m0, [r1+r2*2]
> +    OP_MOV [r0+r2*2], m0
> +    %1     m0, [r1+r3  ]
> +    OP_MOV [r0+r3  ], m0
> +%endmacro
> +
> +%macro MC00 1
> +INIT_MMX mmx2
> +%define MOV_OP movh
> +cglobal %1_h264_qpel4_mc00,3,4
> +    lea    r3, [r2*3   ]
> +    COPY4  movh
> +    RET
> +
> +%define MOV_OP mova
> +cglobal %1_h264_qpel8_mc00,3,4
> +    lea    r3, [r2*3   ]
> +    COPY4  movu
> +    lea    r0, [r0+r2*4]
> +    lea    r1, [r1+r2*4]
> +    COPY4  movu
> +    RET
> +
> +INIT_XMM sse2
> +cglobal %1_h264_qpel16_mc00,3,5
> +    lea    r3, [r2*3   ]
> +    mov    r4d, 4
> +.loop:
> +    COPY4  movu
> +    lea    r0, [r0+r2*4]
> +    lea    r1, [r1+r2*4]
> +    dec    r4d
> +    jg     .loop
> +    REP_RET
> +%endmacro
> +
> +%macro AVG_MOV_MC00 2
> +; See AVG_MOV above why this is necessary -- any way around it?
> +    pavgb  %2, %1
> +    MOV_OP %1, %2
> +%endmacro
> +
> +INIT_MMX
> +%define OP_MOV MOV_OP
> +MC00 put
> +
> +INIT_MMX
> +%define OP_MOV AVG_MOV_MC00
> +MC00 avg
> +
> +%define MOV_OP movh ; After mc00, it should be movh

Don't we already have yasm functions for these somewhere?

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro H_FILT_XMM 7 ; tmp1-5, zero/one, mem
> +%if cpuflag(ssse3) || cpuflag(avx)
> +    movu   %2, [%7-2]
> +    pshufb %3, %2, [h_filt_shuf1]
> +    pshufb %1, %2, [h_filt_shuf2]
> +    pshufb %2, [h_filt_shuf3]

You can do this:

+    pshufb %2, %2, [h_filt_shuf3]

Given how you aligned it, it looks cleaner.

Make sure these versions of the functions aren't enabled on Atom --
they will be dog slow because pshufb is completely unpipelined (6/6)
there, and pmaddubsw is obnoxiously slow (5/2) too.

> +    pmaddubsw %3, %6
> +    pmaddubsw %1, %6
> +    pmaddubsw %2, %6

%6 is 1?

Nonononononononono, you're missing the entire point here of SSSE3.  Do
the multiplication here!!!  It will be way faster.
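To sketch what I mean (untested, and the filt_* constant names are
invented -- they'd go in SECTION_RODATA next to your shuffle tables):
your shuffles already leave %3 holding the (c,d) pairs, %2 the (b,e)
pairs and %1 the (a,f) pairs, so feed the actual tap values to
pmaddubsw instead of pb_1:

filt_cd: times 8 db 20, 20     ; taps for the centre pair (c,d)
filt_be: times 8 db -5, -5     ; taps for (b,e)

    pmaddubsw %3, [filt_cd]    ; 20*(c+d) in each word
    pmaddubsw %2, [filt_be]    ; -5*(b+e) in each word
    pmaddubsw %1, %6           ;     a+f  (coefficient 1, so pb_1 is fine here)
    paddw     %3, %2
    paddw     %3, %1
    paddw     %3, [pw_16]
    psraw     %3, 5

then packuswb and store as usual.  The whole filter falls out of three
pmaddubsw plus adds, and the shift/subtract/pmullw fixup disappears on
SSSE3.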
This comment applies to all similar functions.

> +%macro MC20 3
> +cglobal_mc %1, %2, mc20, %3, 3,4,8, 0
> +    mov    r3d, %3
> +.skip_prologue:
> +%if cpuflag(ssse3) || cpuflag(avx)
> +    mova   m7, [pb_1]
> +%else

Doesn't AVX imply SSSE3?

> +    psllw  m2, 2
> +    psubw  m2, m1
> +    pmullw m2, m6

Per the above, these three instructions should not be there on SSSE3.

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro MC30 3
> +cglobal_mc %1, %2, mc30, %3, 3,5,8, 0
> +    mov    r3d, %3
> +.skip_prologue:
> +    lea    r4, [r1+1]

Please align your commas.

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro V_FILT 3-5
> +%if %0 >= 4
> +v_filt%1_%2_%4:
> +v_filt%1_%2_%5:
> +%endif
> +v_filt%1_%2_%3:
> +    add       r4, r2
> +.no_addr4:
> +    mova      m6, m2
> +    movh      m5, [r1]
> +    paddw     m6, m3
> +    psllw     m6, 2
> +    psubw     m6, m1
> +    psubw     m6, m4
> +    punpcklbw m5, m7
> +    pmullw    m6, [pw_5]
> +    paddw     m0, [pw_16]
> +    paddw     m0, m5
> +    paddw     m0, m6
> +    psraw     m0, 5
> +    packuswb  m0, m0
> +    add       r1, r2
> +    add       r0, r2
> +    ret
> +%endmacro
> +
> +INIT_MMX
> +RESET_MM_PERMUTATION
> +%assign i 0
> +%rep 4
> +V_FILT 4, i, mmx2
> +SWAP 0,1,2,3,4,5
> +%assign i i+1
> +%endrep
> +
> +INIT_XMM
> +RESET_MM_PERMUTATION
> +%assign i 0
> +%rep 6
> +V_FILT 8, i, sse2, ssse3, avx
> +SWAP 0,1,2,3,4,5
> +%assign i i+1
> +%endrep
> +
> +%macro PRELOAD_V 0
> +    pxor      m7, m7
> +    lea       r3, [r2*3]
> +    sub       r1, r3
> +    movh      m0, [r1+r2]
> +    movh      m1, [r1+r2*2]
> +    add       r1, r3
> +    movu      m2, [r1]
> +    movu      m3, [r1+r2]
> +    movu      m4, [r1+r2*2]
> +    add       r1, r3
> +    punpcklbw m0, m7
> +    punpcklbw m1, m7
> +    punpcklbw m2, m7
> +    punpcklbw m3, m7
> +    punpcklbw m4, m7
> +%endmacro
> +
> +%macro MC02 3
> +cglobal_mc %1, %2, mc02, %3, 3,4,8, 1
> +.skip_prologue:
> +    PRELOAD_V
> +
> +    sub    r0, r2
> +%assign j 0
> +%rep %3
> +    %assign i (j % 6)
> +    call v_filt%3_ %+ i %+ _%1.no_addr4
> +    OP_MOV [r0], m0
> +    SWAP 0,1,2,3,4,5
> +    %assign j j+1
> +%endrep
> +    ret
> +%endmacro
> +
> +MC MC02, 0

What in the world is going on here?  Why are you calling instead of
looping?  This is confusing and weird and not explained in comments.

Just commenting on this first part for now.

Jason
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
