Hi, On Tue, Jan 17, 2012 at 9:55 PM, Ronald S. Bultje <[email protected]> wrote: > Also implement sse2/ssse3/avx versions. [..] > +; %1 = nr. of XMM registers > +; %2 = rgb or bgr > +%macro RGB_TO_Y_FN 2 > +cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w > +%ifdef ARCH_X86_64 > + movsxd wq, wd > +%endif > + add dstq, wq > + neg wq > + pxor m7, m7 > + mova m4, [rgb_Yrnd] > +%if mmsize == 8 > + mova m5, [%2_Ycoeff_12x4] > + mova m6, [%2_Ycoeff_3x56] > +%define coeff1 m5 > +%define coeff2 m6 > +%else ; mmsize == 16 > +%ifdef ARCH_X86_64 > + mova m8, [%2_Ycoeff_12x4] > + mova m9, [%2_Ycoeff_3x56] > +%define coeff1 m8 > +%define coeff2 m9 > +%if cpuflag(ssse3) > + mova m10, [shuf_rgb_mem] > +%define shuf_rgb m10 > +%endif ; cpuflag(ssse3) > +%else ; x86-32 > +%define coeff1 [%2_Ycoeff_12x4] > +%define coeff2 [%2_Ycoeff_3x56] > +%if cpuflag(ssse3) > +%define shuf_rgb [shuf_rgb_mem] > +%endif ; cpuflag(ssse3) > +%endif ; x86-32/64 > +%endif ; mmsize = 8/16 > +.loop: > +%if cpuflag(ssse3) > + movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] > + movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] > + pshufb m0, shuf_rgb ; (byte) { B0, G0, R0, B1, B2, G2, > R2, B3, > + ; R0, B1, G1, R1, R2, B3, > G3, R3 } > + pshufb m2, shuf_rgb ; (byte) { B4, G4, R4, B5, B6, G6, > R6, B7, > + ; R4, B5, G5, R5, R6, B7, > G7, R7 } > + punpckhbw m1, m0, m7 ; (word) { R0, B1, G1, R1, R2, B3, > G3, R3 } > + punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, > R2, B3 } > + punpckhbw m3, m2, m7 ; (word) { R4, B5, G5, R5, R6, B7, > G7, R7 } > + punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, > R6, B7 } > +%else ; !cpuflag(ssse3) > + movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } > + movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } > + movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 } > + movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 } > +%if mmsize == 16 ; i.e. 
sse2 > + punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, > R2, B3 } > + punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, > G3, R3 } > + movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 } > + movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 } > + movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 } > + movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 } > + punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, > R6, B7 } > + punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, > G7, R7 } > +%endif ; mmsize == 16 > + punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, > R2, B3 } > + punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, > G3, R3 } > + punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, > R6, B7 } > + punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, > G7, R7 } > +%endif ; cpuflag(ssse3) > + add srcq, 3 * mmsize / 2 > + pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, > B2*BY + G2*GY, B3*BY } > + pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, > R2*RY, G3*GY + R3*RY } > + pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, > B6*BY + G6*GY, B7*BY } > + pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, > R6*RY, G7*GY + R7*RY } > + paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY > }[0-3] > + paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY > }[4-7] > + paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { > Y[0-3] } > + paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { > Y[4-7] } > + psrad m0, 15 > + psrad m2, 15 > + packssdw m0, m2 ; (word) { Y[0-7] } > + packuswb m0, m0 ; (byte) { Y[0-7] } > + movh [dstq+wq], m0 > + add wq, mmsize / 2 > + jl .loop > + REP_RET > +%endmacro
Since this macro is instantiated twice (bgr24 and rgb24) for each optimization level (sse2, ssse3, avx), I'm wondering about the following. On x86-64 the loop code is identical for both instantiations, because the multiplier constants are loaded into registers outside the loop (on x86-32 they are memory operands that name the specific coefficient table, so the loops differ there). Do people agree that, on x86-64, it makes sense to replace the loop of the second instantiation of this function (e.g. rgb24) with a jmp into the loop of the first instantiation (bgr24)? I.e.:

%if x86-32 || bgr24
.loop:
    .. code ..
    jl .loop
    REP_RET
%else
    jmp bgr24function.loop
%endif

The same principle could be used later for a function that handles RGBA/BGRA/ARGB/ABGR, without wasting tons of space on functions that do pretty much the same thing.

Ronald
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
