Matthieu Herrb:
Does someone have some benchmarks or numbers from real life X usage showing the benefits of using the SSE2 code in pixman 0.12.0 over non SSE2 on x86 and/or x86_64 cpus?

I'm asking because OpenBSD is stuck with an old gcc version which can't compile the SSE2 code, and I wonder if it's worth hacking something together to be able to ship a version compiled by gcc 4.2 or later with SSE2 support in the OpenBSD binary distributions.


I cannot answer the question you posed, but... it certainly isn't a 1.x version of gcc or something? :-)

Do you have a problem with inline assembler SSE2? Can you compile SSE2-intrinsics code? To find out, try the small snippet attached. If it compiles, then rewriting pixman using SSE2 intrinsics makes a lot of sense (if it isn't written that way already); it turns the code from "non maintainable" into "maintainable".

Greetings,

    Eeri Kask
/*
  gcc -O2 -msse2 -c -o sse2_YUVtoARGB32.o sse2_YUVtoARGB32.c
  gcc -O2 -msse2 -S -fverbose-asm -o sse2_YUVtoARGB32.S sse2_YUVtoARGB32.c
*/

/* Output pixel type.  Defaults to unsigned int unless RGBA32 has already
   been defined as a macro before this point. */
#ifndef RGBA32
#define RGBA32    unsigned int
#endif


/*
 * Two interchangeable implementations of
 *
 *     YUV2RGBA_2x(y1,u1,v1, y2,u2,v2, rgba)
 *
 * which converts two YUV pixels and stores two 32-bit results at rgba[0]
 * and rgba[1].  The SSE2-intrinsics version is selected for GCC-compatible
 * compilers targeting i386/x86-64, or whenever _WIN32 is defined; every
 * other configuration gets the plain C version.
 *
 * Both versions compute, per channel, with C = y-16, D = u-128, E = v-128:
 *
 *     R = (298*C           + 409*E + 128) >> 8
 *     G = (298*C - 100*D   - 208*E + 128) >> 8
 *     B = (298*C + 516*D           + 128) >> 8
 *
 * clamp each result to 0..255 and combine them as (R << 16) | (G << 8) | B.
 * Bits 24..31 of each stored pixel are zero.
 */
#if ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
     defined(_WIN32))

#include <emmintrin.h>

/*
 * YUV2RGBA_2x_Convert(R, G, B, rgba)
 *
 * R, G, B are __m128i values holding two pixels as eight signed 16-bit
 * lanes laid out (V, U, Y, 1,  V, U, Y, 1), already offset by -128/-128/-16.
 * (YUV2RGBA_2x below passes the same register for all three.)
 *
 * Per channel:
 *   - _mm_madd_epi16 multiplies by the coefficient table and adds adjacent
 *     lanes, leaving two 32-bit partial sums per pixel;
 *   - the dword swap + add puts the full sum in both dwords of each pixel;
 *   - _mm_srli_epi64 by 32 keeps one copy in the low dword and zeroes the
 *     high dword, _mm_srai_epi32 by 8 then performs the signed ">> 8";
 *   - the shufflehi/shufflelo pair moves the low 16-bit word of the sum into
 *     word slot 0 (blue), 1 (green) or 2 (red) and fills the remaining
 *     slots from word 3, which is zero after the 64-bit shift.
 * The three channels are OR-ed together, _mm_packus_epi16 saturates every
 * word to an unsigned byte (the 0..255 clamp), and the low 8 bytes -- two
 * pixels -- are stored at rgba.
 */
#define YUV2RGBA_2x_Convert(R,G,B,rgba)                         \
do {                                                            \
    __m128i             P, X, D;                                \
    static const union {                                        \
        signed short    d[8];                                   \
        __m128i         m128;                                   \
    }       /* V     U    Y   rnd */                            \
    MulR = {{+409,    0, 298, 128,      +409,    0, 298, 128}}, \
    MulG = {{-208, -100, 298, 128,      -208, -100, 298, 128}}, \
    MulB = {{   0, +516, 298, 128,         0, +516, 298, 128}}; \
    /* Blue -> word slot 0 of each pixel: */                    \
    P = _mm_madd_epi16 ((B), MulB.m128);                        \
    X = _mm_shuffle_epi32 (P, _MM_SHUFFLE(2,3,0,1)); /* swap dword pairs */ \
    P = _mm_add_epi32 (P, X);                                   \
    P = _mm_srli_epi64 (P, 32);                                 \
    P = _mm_srai_epi32 (P, 8);                                  \
    P = _mm_shufflehi_epi16 (P, _MM_SHUFFLE(3,3,3,0));          \
    P = _mm_shufflelo_epi16 (P, _MM_SHUFFLE(3,3,3,0));          \
    /* Green -> word slot 1: */                                 \
    D = _mm_madd_epi16 ((G), MulG.m128);                        \
    X = _mm_shuffle_epi32 (D, _MM_SHUFFLE(2,3,0,1));            \
    D = _mm_add_epi32 (D, X);                                   \
    D = _mm_srli_epi64 (D, 32);                                 \
    D = _mm_srai_epi32 (D, 8);                                  \
    D = _mm_shufflehi_epi16 (D, _MM_SHUFFLE(3,3,0,3));          \
    D = _mm_shufflelo_epi16 (D, _MM_SHUFFLE(3,3,0,3));          \
    P = _mm_or_si128 (P, D);                                    \
    /* Red -> word slot 2: */                                   \
    D = _mm_madd_epi16 ((R), MulR.m128);                        \
    X = _mm_shuffle_epi32 (D, _MM_SHUFFLE(2,3,0,1));            \
    D = _mm_add_epi32 (D, X);                                   \
    D = _mm_srli_epi64 (D, 32);                                 \
    D = _mm_srai_epi32 (D, 8);                                  \
    D = _mm_shufflehi_epi16 (D, _MM_SHUFFLE(3,0,3,3));          \
    D = _mm_shufflelo_epi16 (D, _MM_SHUFFLE(3,0,3,3));          \
    P = _mm_or_si128 (P, D);                                    \
    /* Saturate to bytes and store two consecutive pixels: */   \
    P = _mm_packus_epi16 (P, P);                                \
    _mm_storel_epi64 ((__m128i*)(rgba), P);                     \
} while (0)

/*
 * Loads the six samples into one XMM register as (v, u, y, 0) per pixel,
 * adds (-128, -128, -16, +1) so the fourth lane becomes the constant 1 that
 * multiplies the rounding term, then converts both pixels at once.
 */
#define YUV2RGBA_2x(y1,u1,v1,y2,u2,v2,rgba)                     \
do {                                                            \
    __m128i             T;                                      \
    static const union {                                        \
        signed short    d[8];                                   \
        __m128i         m128;                                   \
    }                                                           \
    AddA = {{-128, -128, -16,   1,      -128, -128, -16,   1}}; \
    T = _mm_setr_epi16 ((v1), (u1), (y1), 0, (v2), (u2), (y2), 0);  \
    T = _mm_add_epi16 (T, AddA.m128);                           \
    YUV2RGBA_2x_Convert(T,T,T,rgba);                            \
} while (0)


#else


/*
 * Plain C version of the same computation for a single pixel; the result
 * is written to *(rgba).  The sums are clamped at zero before the shift so
 * that a negative value is never right-shifted.
 */
#define YUV2RGBA(y,u,v,rgba)                                            \
do {                                                                    \
    int r_, g_, b_;                                                     \
    const int c_ = (int)(y) - 16;                                       \
    const int d_ = (int)(u) - 128;                                      \
    const int e_ = (int)(v) - 128;                                      \
    r_ = 298 * c_              + 409 * e_  + 128;                       \
    g_ = 298 * c_  - 100 * d_  - 208 * e_  + 128;                       \
    b_ = 298 * c_  + 516 * d_              + 128;                       \
    r_ = (r_ < 0) ? 0 : (((r_ >> 8) > 255) ? 255 : (r_ >> 8));          \
    g_ = (g_ < 0) ? 0 : (((g_ >> 8) > 255) ? 255 : (g_ >> 8));          \
    b_ = (b_ < 0) ? 0 : (((b_ >> 8) > 255) ? 255 : (b_ >> 8));          \
    (*(rgba)) = (r_ << 16) | (g_ << 8) | (b_);                          \
} while (0)

/* Converts two pixels by applying YUV2RGBA to rgba[0] and rgba[1]. */
#define YUV2RGBA_2x(y1,u1,v1,y2,u2,v2,rgba)                     \
do {                                                            \
    YUV2RGBA((y1), (u1), (v1), (rgba));                         \
    YUV2RGBA((y2), (u2), (v2), ((rgba) + 1));                   \
} while (0)


#endif



/*
 * Convert a packed 4:4:4 image to 32-bit pixels.
 *
 * Every input pixel is three bytes in the order U, Y, V.  Input and output
 * are both walked contiguously (no row padding): h rows of w pixels.
 *
 *   w, h    image size in pixels; nothing is done if either is <= 0
 *   yuv     input, 3 * w * h bytes
 *   rgba32  output, w * h pixels
 *
 * Pixels are converted two at a time with YUV2RGBA_2x.  When w is odd the
 * last pixel of each row is converted on its own through a scratch pair, so
 * exactly w pixels are read and written per row.
 */
void sse2_YUV444toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
    int         i, j;

    for (i = 0; i < h; ++i) {
        /* Full pairs: pixels j and j+1 both lie inside the row. */
        for (j = 0; j + 1 < w; j += 2) {
            YUV2RGBA_2x(/*Y*/yuv[1], /*U*/yuv[0], /*V*/yuv[2],
                        /*Y*/yuv[4], /*U*/yuv[3], /*V*/yuv[5], /*out*/rgba32);
            rgba32 += 2;
            yuv += 6;
        }
        /* Odd width: one pixel is left over.  Convert it twice into a
           scratch buffer and keep only the first result, so neither the
           input nor the output is touched beyond this pixel. */
        if (j < w) {
            RGBA32      pair[2];

            YUV2RGBA_2x(yuv[1], yuv[0], yuv[2],
                        yuv[1], yuv[0], yuv[2], pair);
            *rgba32 = pair[0];
            rgba32 += 1;
            yuv += 3;
        }
    }
}


/*
 * Convert a packed 4:2:2 image to 32-bit pixels.
 *
 * The input is consumed in 4-byte groups U, Y1, V, Y2; each group yields
 * two output pixels (Y1 and Y2) that share the group's U and V.
 *
 *   w, h    image size in pixels
 *   yuv     input, read contiguously, 4 bytes per loop step
 *   rgba32  output, written contiguously, 2 pixels per loop step
 *
 * NOTE: every loop step handles a whole group, so an odd w still produces
 * two pixels for the last step of each row.
 */
void sse2_YUV422toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
    int         row = 0;

    while (row < h) {
        int     col;

        for (col = 0; col < w; col += 2, yuv += 4, rgba32 += 2) {
            YUV2RGBA_2x(/*Y1*/yuv[1], /*U*/yuv[0], /*V*/yuv[2],
                        /*Y2*/yuv[3], /*U*/yuv[0], /*V*/yuv[2], rgba32);
        }
        ++row;
    }
}


/*
 * Convert a packed 4:1:1 image to 32-bit pixels.
 *
 * The input is consumed in 6-byte groups U, Y1, Y2, V, Y3, Y4; each group
 * yields four output pixels that all share the group's U and V.
 *
 *   w, h    image size in pixels
 *   yuv     input, read contiguously, 6 bytes per loop step
 *   rgba32  output, written contiguously, 4 pixels per loop step
 *
 * NOTE: every loop step handles a whole group, so a w that is not a
 * multiple of four still produces four pixels for the last step of a row.
 */
void sse2_YUV411toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
    int         row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < w; col += 4) {
            RGBA32      *first_pair  = rgba32;
            RGBA32      *second_pair = rgba32 + 2;

            /* Y1, Y2 */
            YUV2RGBA_2x(yuv[1], yuv[0], yuv[3],
                        yuv[2], yuv[0], yuv[3], first_pair);
            /* Y3, Y4 */
            YUV2RGBA_2x(yuv[4], yuv[0], yuv[3],
                        yuv[5], yuv[0], yuv[3], second_pair);

            rgba32 += 4;
            yuv += 6;
        }
    }
}


/*
 * Convert a planar 4:2:0 image to 32-bit pixels.
 *
 *   w, h    image size in pixels
 *   Y       luma plane, read contiguously, two samples per loop step
 *   Cr      first chroma plane; its samples are passed to YUV2RGBA_2x in
 *           the U position.  The second chroma plane is taken to start
 *           (w>>1)*(h>>1) bytes after it and is passed in the V position.
 *   rgba32  output, written contiguously, two pixels per loop step
 *
 * Each chroma sample covers two horizontally adjacent pixels.  After an
 * even-numbered row both chroma pointers are moved back by w>>1, so the
 * following odd-numbered row reads the same chroma samples again.
 *
 * NOTE(review): the plane-offset and rewind arithmetic only line up when w
 * is even (and the plane offset assumes an even h) -- confirm with callers.
 */
void sse2_YUV420PtoARGB32 (int w, int h, unsigned char *Y, unsigned char *Cr,
                           RGBA32 *rgba32)
{
    const int           half_w = w >> 1;
    unsigned char       *plane2 = Cr + half_w * (h >> 1);
    int                 row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < w; col += 2) {
            YUV2RGBA_2x(Y[0], *Cr, *plane2,
                        Y[1], *Cr, *plane2, rgba32);
            Y += 2;
            rgba32 += 2;
            Cr++;
            plane2++;
        }
        /* Even row just finished: rewind so the next row shares chroma. */
        if (!(row & 1)) {
            Cr -= half_w;
            plane2 -= half_w;
        }
    }
}

_______________________________________________
xorg mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/xorg

Reply via email to