> +%macro rgbToY_fn 2

Capitalized macro names, please, unless libav has decided to drop this
standard convention for some reason.

> +cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
> +%ifdef ARCH_X86_64
> +    movsxd         wq, wd
> +%endif
> +    add          dstq, wq
> +    neg            wq
> +    pxor           m7, m7
> +    mova           m4, [rgb_Yrnd]
> +%if mmsize == 8
> +    mova           m5, [%2_Ycoeff_12x4]
> +    mova           m6, [%2_Ycoeff_3x56]
> +%define coeff1 m5
> +%define coeff2 m6
> +%else ; mmsize == 16
> +%ifdef ARCH_X86_64
> +    mova          m8, [%2_Ycoeff_12x4]
> +    mova          m9, [%2_Ycoeff_3x56]

Comma alignment.

> +%define coeff1 m8
> +%define coeff2 m9
> +%else ; x86-32
> +%define coeff1 [%2_Ycoeff_12x4]
> +%define coeff2 [%2_Ycoeff_3x56]
> +%endif ; x86-32/64
> +%endif ; mmsize = 8/16
> +.loop:
> +    movd           m0, [srcq+0]           ; (byte) { B0, G0, R0, B1 }
> +    movd           m1, [srcq+2]           ; (byte) { R0, B1, G1, R1 }
> +    movd           m2, [srcq+6]           ; (byte) { B2, G2, R2, B3 }
> +    movd           m3, [srcq+8]           ; (byte) { R2, B3, G3, R3 }
> +%if mmsize == 16
> +    punpckldq      m0, m2
> +    punpckldq      m1, m3
> +    movd           m2, [srcq+12]          ; (byte) { B4, G4, R4, B5 }
> +    movd           m3, [srcq+14]          ; (byte) { R4, B5, G5, R5 }
> +    movd           m5, [srcq+18]          ; (byte) { B6, G6, R6, B7 }
> +    movd           m6, [srcq+20]          ; (byte) { R6, B7, G7, R7 }
> +    punpckldq      m2, m5
> +    punpckldq      m3, m6
> +%endif ; mmsize == 16

When the pigs can't be beaten, you need to call the Mighty
Eagle^H^H^H^H^Hpshufb.  Seriously, this code looks kinda awful.
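
Untested sketch of what I mean -- shuf_bgrb/shuf_rbgr are made-up mask
constants (the 0x80/-1 lanes produce zeros, so each pshufb also
replaces the punpcklbw byte-to-word unpack further down):

    ; shuf_bgrb: db 0,-1, 1,-1, 2,-1, 3,-1, 6,-1, 7,-1,  8,-1,  9,-1
    ; shuf_rbgr: db 2,-1, 3,-1, 4,-1, 5,-1, 8,-1, 9,-1, 10,-1, 11,-1
    movu           m0, [srcq]              ; (byte) { B0, G0, R0, ..., B5 }
    movu           m2, [srcq+12]           ; (byte) { B4, G4, R4, ..., R7 }
    mova           m1, m0
    mova           m3, m2
    pshufb         m0, [shuf_bgrb]         ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    pshufb         m1, [shuf_rbgr]         ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    pshufb         m2, [shuf_bgrb]         ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
    pshufb         m3, [shuf_rbgr]         ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }

Two loads and four shuffles instead of eight movd plus four punpckldq
plus four punpcklbw.  Note the second load reads 4 bytes past the 24
this iteration consumes, so it needs over-read slack or a scalar tail.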

> +    add          srcq, 3 * mmsize / 2
> +    punpcklbw      m0, m7                 ; (word) { B0, G0, R0, B1 }
> +    punpcklbw      m1, m7                 ; (word) { R0, B1, G1, R1 }
> +    punpcklbw      m2, m7                 ; (word) { B2, G2, R2, B3 }
> +    punpcklbw      m3, m7                 ; (word) { R2, B3, G3, R3 }
> +    pmaddwd        m0, coeff1             ; (dword) { B0*BY + G0*GY, B1*BY }
> +    pmaddwd        m1, coeff2             ; (dword) { R0*RY, G1*GY + R1*RY }
> +    pmaddwd        m2, coeff1             ; (dword) { B2*BY + G2*GY, B3*BY }
> +    pmaddwd        m3, coeff2             ; (dword) { R2*RY, G3*GY + R3*RY }

A lower-precision SSSE3 pmaddubsw version might be applicable here as
later work?
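
i.e. drop the punpcklbw and run byte coefficients on m0/m1 as loaded --
untested, and the Ycoeff8_* names are hypothetical constants with
BY/GY/RY rescaled so that BY + GY + RY <= 127, which is where the
precision goes:

    ; Ycoeff8_12x4: times 4 db BY, GY,  0, BY   (7-bit scale)
    ; Ycoeff8_3x56: times 4 db RY,  0, GY, RY
    pmaddubsw      m0, [Ycoeff8_12x4]      ; (word) { B0*BY + G0*GY, B1*BY, ... }
    pmaddubsw      m1, [Ycoeff8_3x56]      ; (word) { R0*RY, G1*GY + R1*RY, ... }
    paddw          m0, m1                  ; (word) { Y0, Y1, Y2, Y3, ... }

255 * 127 still fits in a signed word, so neither the pmaddubsw pairs
nor the paddw can overflow at that scale; you just trade the word
coefficients' precision for roughly 7 bits.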

Would this be faster with pmulhw and a different byte ordering?
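
(pmulhw as such can't work -- a pixel shifted into the high byte of a
word reads as negative for values >= 128 under a signed multiply -- so
it would have to be pmulhuw.  Untested sketch; Ycoeff_bgrb is a
hypothetical { BY, GY, RY, BY } word constant, and this clobbers the
zero reg:

    punpcklbw      m7, m0                  ; (word) { B0 << 8, G0 << 8, R0 << 8, B1 << 8 }
    pmulhuw        m7, [Ycoeff_bgrb]       ; (word) { B0*BY >> 8, G0*GY >> 8, ... }

You'd still need a phaddw or pmaddwd-by-ones pass to sum the three
channels, and each product drops its low 8 bits before that sum, so I
doubt it comes out ahead.)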

Jason