>  MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
>                                           x86/deinterlace.o             \
>                                           x86/fmtconvert.o              \
> +                                          x86/h264_qpel.o               \
>                                           x86/h264_qpel_10bit.o         \
>                                           $(YASM-OBJS-yes)

Is it just my email client, or is there an alignment issue here?

> +h_filt_shuf1: db  2,  3
> +              db  3,  4
> +              db  4,  5
> +              db  5,  6
> +              db  6,  7
> +              db  7,  8
> +              db  8,  9
> +              db  9, 10
> +h_filt_shuf2: db  0,  5
> +              db  1,  6
> +              db  2,  7
> +              db  3,  8
> +              db  4,  9
> +              db  5, 10
> +              db  6, 11
> +              db  7, 12
> +h_filt_shuf3: db  1,  4
> +              db  2,  5
> +              db  3,  6
> +              db  4,  7
> +              db  5,  8
> +              db  6,  9
> +              db  7, 10
> +              db  8, 11

Can you make these one line each?  This is needlessly line-breaky.
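
E.g. the first table becomes:
 +h_filt_shuf1: db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10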

> +SECTION .text
> +
> +%macro AVG_MOV 2
> +; In the internal 16x16 calls, the pointer is incremented by 8 bytes.
> +; This breaks 16-byte alignment, so movement to mmx is required.
> +; Is there a way around this?
> +%if mmsize == 16
> +    movdq2q mm0, %2
> +    pavgb   mm0, %1
> +    movq     %1, mm0
> +%else
> +    pavgb    %2, %1
> +    MOV_OP   %1, %2
> +%endif
> +%endmacro

What in the world is this about?  You can use movq with SSE, silly.
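
You can keep the whole thing in xmm registers; something like this, assuming
a scratch register is free (m4 here is just a placeholder):
 +%if mmsize == 16
 +    movq    m4, %1   ; movq on an xmm register does an unaligned 8-byte load
 +    pavgb   m4, %2
 +    movq    %1, m4
 +%else
 +    pavgb   %2, %1
 +    MOV_OP  %1, %2
 +%endif
No mmx, no movdq2q, no alignment worries.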

> +%macro MC 2
> +%define OP_MOV movh
> +INIT_MMX mmx2
> +%1 mmx2, put, 4
> +INIT_XMM sse2
> +%1 sse2, put, 8
> +%if %2 ; ssse3, also implies avx

Misleading comment; I'm pretty sure SSSE3 does not imply AVX as a CPU feature.
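If what you mean is "%2 also enables the ssse3/avx builds", just say that:
 +%if %2 ; whether to also build the ssse3 and avx versions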

> +    mov  r0, r0m
> +    mov  r1, r1m

Why do you need to reload these pointers from the stack?  You know
/exactly/ how the stub has changed them, so you can take that into
account and do the proper math.
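For example, if the stub leaves both pointers advanced by 8 (that offset is
my guess, use whatever the stub actually does):
 +    sub  r0, 8
 +    sub  r1, 8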

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro COPY4 1
> +    %1            m0, [r1     ]
> +    OP_MOV [r0     ], m0
> +    %1            m0, [r1+r2  ]
> +    OP_MOV [r0+r2  ], m0
> +    %1            m0, [r1+r2*2]
> +    OP_MOV [r0+r2*2], m0
> +    %1            m0, [r1+r3  ]
> +    OP_MOV [r0+r3  ], m0
> +%endmacro
> +
> +%macro MC00 1
> +INIT_MMX mmx2
> +%define MOV_OP movh
> +cglobal %1_h264_qpel4_mc00,3,4
> +    lea   r3, [r2*3   ]
> +    COPY4 movh
> +    RET
> +
> +%define MOV_OP mova
> +cglobal %1_h264_qpel8_mc00,3,4
> +    lea   r3, [r2*3   ]
> +    COPY4 movu
> +    lea   r0, [r0+r2*4]
> +    lea   r1, [r1+r2*4]
> +    COPY4 movu
> +    RET
> +
> +INIT_XMM sse2
> +cglobal %1_h264_qpel16_mc00,3,5
> +    lea   r3, [r2*3   ]
> +    mov  r4d, 4
> +.loop:
> +    COPY4 movu
> +    lea   r0, [r0+r2*4]
> +    lea   r1, [r1+r2*4]
> +    dec  r4d
> +    jg .loop
> +    REP_RET
> +%endmacro
> +
> +%macro AVG_MOV_MC00 2
> +; See AVG_MOV above why this is necessary -- any way around it?
> +    pavgb    %2, %1
> +    MOV_OP   %1, %2
> +%endmacro
> +
> +INIT_MMX
> +%define OP_MOV MOV_OP
> +MC00 put
> +
> +INIT_MMX
> +%define OP_MOV AVG_MOV_MC00
> +MC00 avg
> +
> +%define MOV_OP movh ; After mc00, it should be movh

Don't we already have yasm functions for these somewhere?

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro H_FILT_XMM 7 ; tmp1-5, zero/one, mem
> +%if cpuflag(ssse3) || cpuflag(avx)
> +    movu      %2, [%7-2]
> +    pshufb    %3, %2, [h_filt_shuf1]
> +    pshufb    %1, %2, [h_filt_shuf2]
> +    pshufb    %2,     [h_filt_shuf3]

You can do this:
 +    pshufb    %2, %2, [h_filt_shuf3]
Given how you've aligned the operand columns, it reads cleaner.

Make sure these versions of the functions aren't enabled on Atom --
they will be dog slow because pshufb is completely unpipelined (6/6)
there, and pmaddubsw is obnoxiously slow (5/2) too.

> +    pmaddubsw %3, %6
> +    pmaddubsw %1, %6
> +    pmaddubsw %2, %6

%6 is 1?  Nonononononononono, you're missing the entire point of SSSE3.
Do the multiplication here, in the pmaddubsw!!!  It will be way faster.
This comment applies to all the similar functions below.
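
Your shuffles already pair the taps up as (c,d), (a,f) and (b,e), so the
(1, -5, 20, 20, -5, 1) filter coefficients can go straight into the
pmaddubsw.  Roughly (pb_20_20 / pb_1_1 / pb_m5_m5 are hypothetical
constants, each byte pair repeated across the register):
 +    pmaddubsw %3, [pb_20_20] ; 20*c + 20*d
 +    pmaddubsw %1, [pb_1_1]   ;  1*a +  1*f
 +    pmaddubsw %2, [pb_m5_m5] ; -5*b + -5*e
 +    paddw     %3, %1
 +    paddw     %3, %2         ; full 6-tap sum, ready for the +16 and >>5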

> +%macro MC20 3
> +cglobal_mc %1, %2, mc20, %3, 3,4,8, 0
> +    mov      r3d, %3
> +.skip_prologue:
> +%if cpuflag(ssse3) || cpuflag(avx)
> +    mova      m7, [pb_1]
> +%else

Doesn't AVX imply SSSE3?
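If it does (x86inc's cpuflags are cumulative, so the avx builds should have
the ssse3 bit set too), the check can just be:
 +%if cpuflag(ssse3)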

> +    psllw     m2, 2
> +    psubw     m2, m1
> +    pmullw    m2, m6

Per the above, these three instructions should not be there on SSSE3.
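Guarding them is easy enough, e.g. (assuming x86inc's notcpuflag is
available in this tree):
 +%if notcpuflag(ssse3)
 +    psllw     m2, 2
 +    psubw     m2, m1
 +    pmullw    m2, m6
 +%endif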

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro MC30 3
> +cglobal_mc %1, %2, mc30, %3, 3,5,8, 0
> +    mov      r3d, %3
> +.skip_prologue:
> +    lea r4, [r1+1]

Please align your commas.
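i.e. match the columns used in the rest of the function:
 +    lea      r4, [r1+1]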

> +;-----------------------------------------------------------------------------
> +; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
> +;-----------------------------------------------------------------------------
> +%macro V_FILT 3-5
> +%if %0 >= 4
> +v_filt%1_%2_%4:
> +v_filt%1_%2_%5:
> +%endif
> +v_filt%1_%2_%3:
> +    add        r4, r2
> +.no_addr4:
> +    mova       m6, m2
> +    movh       m5, [r1]
> +    paddw      m6, m3
> +    psllw      m6, 2
> +    psubw      m6, m1
> +    psubw      m6, m4
> +    punpcklbw  m5, m7
> +    pmullw     m6, [pw_5]
> +    paddw      m0, [pw_16]
> +    paddw      m0, m5
> +    paddw      m0, m6
> +    psraw      m0, 5
> +    packuswb   m0, m0
> +    add    r1, r2
> +    add    r0, r2
> +    ret
> +%endmacro
> +
> +INIT_MMX
> +RESET_MM_PERMUTATION
> +%assign i 0
> +%rep 4
> +V_FILT 4, i, mmx2
> +SWAP 0,1,2,3,4,5
> +%assign i i+1
> +%endrep
> +
> +INIT_XMM
> +RESET_MM_PERMUTATION
> +%assign i 0
> +%rep 6
> +V_FILT 8, i, sse2, ssse3, avx
> +SWAP 0,1,2,3,4,5
> +%assign i i+1
> +%endrep
> +
> +%macro PRELOAD_V 0
> +    pxor      m7, m7
> +    lea       r3, [r2*3]
> +    sub       r1, r3
> +    movh      m0, [r1+r2]
> +    movh      m1, [r1+r2*2]
> +    add       r1, r3
> +    movu      m2, [r1]
> +    movu      m3, [r1+r2]
> +    movu      m4, [r1+r2*2]
> +    add       r1, r3
> +    punpcklbw m0, m7
> +    punpcklbw m1, m7
> +    punpcklbw m2, m7
> +    punpcklbw m3, m7
> +    punpcklbw m4, m7
> +%endmacro
> +
> +%macro MC02 3
> +cglobal_mc %1, %2, mc02, %3, 3,4,8, 1
> +.skip_prologue:
> +    PRELOAD_V
> +
> +    sub      r0, r2
> +%assign j 0
> +%rep %3
> +    %assign i (j % 6)
> +    call v_filt%3_ %+ i %+ _%1.no_addr4
> +    OP_MOV [r0], m0
> +    SWAP 0,1,2,3,4,5
> +    %assign j j+1
> +%endrep
> +    ret
> +%endmacro
> +
> +MC MC02, 0

What in the world is going on here?  Why are you calling instead of
looping?  This is confusing and weird, and it isn't explained anywhere in
the comments.
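
The six-deep register rotation doesn't force a call: put the filter body in
a macro and expand it inline under the %rep instead, roughly (V_FILT_CORE
being a hypothetical macro holding the body of v_filt):
 +%rep %3
 +    V_FILT_CORE          ; filter body expanded inline, no call/ret per row
 +    OP_MOV [r0], m0
 +    SWAP 0,1,2,3,4,5
 +%endrep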

Just commenting on this first part for now.

Jason