Matthieu Herrb:
Does someone have some benchmarks or numbers from real life X usage
showing the benefits of using the SSE2 code in pixman 0.12.0 over non
SSE2 on x86 and/or x86_64 cpus?
I'm asking because OpenBSD is stuck with an old gcc version which can't
compile the SSE2 code, and I wonder if it's worth hacking something to
be able to ship a version compiled by gcc 4.2 or later with SSE2 support
with OpenBSD binary distributions.
I cannot answer the question you posed, but... surely it isn't a 1.x
version of gcc or something? :-)
Do you have a problem with inline assembler SSE2? Can you compile
SSE2-intrinsics code? To find out, try the small snippet attached.
If it goes well, then rewriting pixman using SSE2 intrinsics makes a lot
of sense (if it isn't written that way already); it turns the code from
"non maintainable" to "maintainable".
Greetings,
Eeri Kask
/*
gcc -O2 -msse2 -c -o sse2_YUVtoARGB32.o sse2_YUVtoARGB32.c
gcc -O2 -msse2 -S -fverbose-asm -o sse2_YUVtoARGB32.S sse2_YUVtoARGB32.c
*/
/*
 * RGBA32 is the type of one output pixel.  A build may pre-define it;
 * when it has not, fall back to plain `unsigned int`.
 */
#if !defined(RGBA32)
#  define RGBA32 unsigned int
#endif
/*
 * YUV -> packed 32-bit RGB conversion macros.
 *
 * Public entry point (both variants):
 *
 *   YUV2RGBA_2x(y1,u1,v1, y2,u2,v2, rgba)
 *
 * converts two pixels given as integer Y/U/V samples and writes two
 * consecutive 32-bit values to rgba[0] and rgba[1].  Each value is
 * (R << 16) | (G << 8) | B with the top byte left at zero.  The fixed-point
 * formula used by both variants is
 *
 *   R = clamp((298*(Y-16)                 + 409*(V-128) + 128) >> 8)
 *   G = clamp((298*(Y-16) - 100*(U-128)   - 208*(V-128) + 128) >> 8)
 *   B = clamp((298*(Y-16) + 516*(U-128)                 + 128) >> 8)
 *
 * with clamp() limiting the result to 0..255.
 *
 * Arguments should be free of side effects: the plain C variant expands
 * `rgba` twice.
 *
 * The SSE2 variant is selected for GCC-compatible compilers targeting
 * x86/x86-64 and for _WIN32 builds.  NOTE(review): the build commands at the
 * top of this file pass -msse2; presumably that is required on 32-bit x86.
 *
 * The #if below must stay one logical line; the backslash continuation is
 * required (a directive broken across lines without it does not compile).
 */
#if ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
     defined(_WIN32))
#include <emmintrin.h>

/*
 * YUV2RGBA_2x_Convert(R,G,B,rgba)
 *
 * R, G and B are __m128i values (the caller in this file passes the same
 * register three times).  Viewed as eight signed 16-bit lanes they hold, for
 * each of the two pixels, the four values  V-128, U-128, Y-16, 1  in that
 * order.  The coefficient tables use the same lane order ("V U Y I"); the
 * "I" lane multiplies the constant 1 by 128, which supplies the rounding
 * term of the formula.
 *
 * For every channel the dot product is formed with _mm_madd_epi16 plus one
 * 32-bit add, shifted right by 8, and moved into 16-bit word 0 (blue),
 * word 1 (green) or word 2 (red) of each pixel's 64-bit half; word 3 stays
 * zero.  _mm_packus_epi16 then saturates every word to 0..255, and the low
 * 8 bytes (two pixels) are stored at rgba.
 */
#define YUV2RGBA_2x_Convert(R,G,B,rgba) \
do { /* YUV-to-RGB computation for the given Y, U, V values: */ \
  __m128i P, X, D; \
  static const union { \
    signed short d[8]; \
    __m128i m128; \
  } /*      V     U    Y    I     V     U    Y    I */ \
  MulR = {{+409,    0, 298, 128, +409,    0, 298, 128}}, \
  MulG = {{-208, -100, 298, 128, -208, -100, 298, 128}}, \
  MulB = {{   0, +516, 298, 128,    0, +516, 298, 128}}; \
  /* Blue: */ \
  P = _mm_madd_epi16 ((B), MulB.m128); /* two 32-bit partial sums per pixel */ \
  X = _mm_shuffle_epi32 (P, _MM_SHUFFLE(2,3,0,1)); /* swap the partial sums */ \
  P = _mm_add_epi32 (P, X);            /* full sum in both dwords of a pixel */ \
  P = _mm_srli_epi64 (P, 32);          /* keep one copy, zero the upper dword */ \
  P = _mm_srai_epi32 (P, 8);           /* arithmetic >> 8 */ \
  P = _mm_shufflehi_epi16 (P, _MM_SHUFFLE(3,3,3,0)); /* value in word 0 */ \
  P = _mm_shufflelo_epi16 (P, _MM_SHUFFLE(3,3,3,0)); \
  /* Green: */ \
  D = _mm_madd_epi16 ((G), MulG.m128); \
  X = _mm_shuffle_epi32 (D, _MM_SHUFFLE(2,3,0,1)); \
  D = _mm_add_epi32 (D, X); \
  D = _mm_srli_epi64 (D, 32); \
  D = _mm_srai_epi32 (D, 8); \
  D = _mm_shufflehi_epi16 (D, _MM_SHUFFLE(3,3,0,3)); /* value in word 1 */ \
  D = _mm_shufflelo_epi16 (D, _MM_SHUFFLE(3,3,0,3)); \
  P = _mm_or_si128 (P, D); \
  /* Red: */ \
  D = _mm_madd_epi16 ((R), MulR.m128); \
  X = _mm_shuffle_epi32 (D, _MM_SHUFFLE(2,3,0,1)); \
  D = _mm_add_epi32 (D, X); \
  D = _mm_srli_epi64 (D, 32); \
  D = _mm_srai_epi32 (D, 8); \
  D = _mm_shufflehi_epi16 (D, _MM_SHUFFLE(3,0,3,3)); /* value in word 2 */ \
  D = _mm_shufflelo_epi16 (D, _MM_SHUFFLE(3,0,3,3)); \
  P = _mm_or_si128 (P, D); \
  /* Saturate to bytes and store two consecutive RGBA pixels: */ \
  P = _mm_packus_epi16 (P, P); \
  _mm_storel_epi64 ((__m128i*)(rgba), P); \
} while (0)

/*
 * SSE2 variant of YUV2RGBA_2x: packs both pixels' samples into one XMM
 * register in V,U,Y,0 lane order, applies the offsets (-128,-128,-16) and
 * turns the spare lane into the constant 1, then converts.
 */
#define YUV2RGBA_2x(y1,u1,v1,y2,u2,v2,rgba) \
do { \
  __m128i T; \
  static const union { \
    signed short d[8]; \
    __m128i m128; \
  } \
  AddA = {{-128, -128, -16, 1, -128, -128, -16, 1}}; \
  /* Load XMM register with current YUV data (for two pixels): */ \
  T = _mm_setr_epi16 ((v1), (u1), (y1), 0, (v2), (u2), (y2), 0); \
  T = _mm_add_epi16 (T, AddA.m128); \
  YUV2RGBA_2x_Convert(T,T,T,rgba); \
} while (0)

#else

/*
 * YUV2RGBA(y,u,v,rgba): plain C conversion of a single pixel; stores
 * (R << 16) | (G << 8) | B to *rgba.
 *
 * The intermediate sums can be negative; the code relies on `>>` keeping
 * them negative (arithmetic shift, implementation-defined in C) so that the
 * clamp below maps them to 0.
 */
#define YUV2RGBA(y,u,v,rgba) \
do { /* Plain C version of the YUV-to-RGB computation above: */ \
  short R, G, B; short C, D, E; \
  C = (short)(y) - 16; \
  D = (short)(u) - 128; \
  E = (short)(v) - 128; \
  R = (298 * C + 409 * E + 128) >> 8; \
  G = (298 * C - 100 * D - 208 * E + 128) >> 8; \
  B = (298 * C + 516 * D + 128) >> 8; \
  if (R > 255) \
    R = 255; \
  else if (R < 0) \
    R = 0; \
  if (G > 255) \
    G = 255; \
  else if (G < 0) \
    G = 0; \
  if (B > 255) \
    B = 255; \
  else if (B < 0) \
    B = 0; \
  (*(rgba)) = (R << 16) | (G << 8) | (B); \
} while (0)

/*
 * Plain C variant of YUV2RGBA_2x: two single-pixel conversions.  `rgba` is
 * parenthesized before the +1 so any pointer expression may be passed.
 */
#define YUV2RGBA_2x(y1,u1,v1,y2,u2,v2,rgba) \
do { \
  YUV2RGBA((y1),(u1),(v1),(rgba)); \
  YUV2RGBA((y2),(u2),(v2),((rgba)+1)); \
} while (0)

#endif
/*
 * Convert a packed 4:4:4 image (3 bytes per pixel, stored as U Y V) into
 * 32-bit pixels using YUV2RGBA_2x.
 *
 *   w, h    image width and height in pixels
 *   yuv     input bytes, read sequentially with no padding between rows
 *   rgba32  output pixels, written sequentially
 *
 * Pixels are handled in pairs: every step consumes 6 input bytes and
 * produces 2 output pixels, so an odd w still converts a full pair at the
 * end of each row.
 */
void sse2_YUV444toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; col += 2) {
      /* bytes 0..2 are U,Y,V of the first pixel, bytes 3..5 of the second */
      YUV2RGBA_2x(yuv[1], yuv[0], yuv[2],
                  yuv[4], yuv[3], yuv[5], rgba32);
      yuv += 6;
      rgba32 += 2;
    }
  }
}
/*
 * Convert a packed 4:2:2 image (4 bytes per pixel pair, stored as
 * U Y1 V Y2) into 32-bit pixels using YUV2RGBA_2x.
 *
 *   w, h    image width and height in pixels
 *   yuv     input bytes, read sequentially with no padding between rows
 *   rgba32  output pixels, written sequentially
 *
 * Both pixels of a pair share the same U and V sample.  Every step consumes
 * 4 input bytes and produces 2 output pixels, so an odd w still converts a
 * full pair at the end of each row.
 */
void sse2_YUV422toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; col += 2) {
      /* layout: [0]=U  [1]=Y1  [2]=V  [3]=Y2 */
      YUV2RGBA_2x(yuv[1], yuv[0], yuv[2],
                  yuv[3], yuv[0], yuv[2], rgba32);
      yuv += 4;
      rgba32 += 2;
    }
  }
}
/*
 * Convert a packed 4:1:1 image (6 bytes per group of four pixels, stored as
 * U Y1 Y2 V Y3 Y4) into 32-bit pixels using YUV2RGBA_2x.
 *
 *   w, h    image width and height in pixels
 *   yuv     input bytes, read sequentially with no padding between rows
 *   rgba32  output pixels, written sequentially
 *
 * All four pixels of a group share the same U and V sample.  Every step
 * consumes 6 input bytes and produces 4 output pixels, so a w that is not a
 * multiple of 4 still converts a full group at the end of each row.
 */
void sse2_YUV411toARGB32 (int w, int h, unsigned char *yuv, RGBA32 *rgba32)
{
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; col += 4) {
      /* layout: [0]=U  [1]=Y1  [2]=Y2  [3]=V  [4]=Y3  [5]=Y4 */

      /* pixels 1 and 2 */
      YUV2RGBA_2x(yuv[1], yuv[0], yuv[3],
                  yuv[2], yuv[0], yuv[3], rgba32);
      rgba32 += 2;

      /* pixels 3 and 4 */
      YUV2RGBA_2x(yuv[4], yuv[0], yuv[3],
                  yuv[5], yuv[0], yuv[3], rgba32);
      rgba32 += 2;

      yuv += 6;
    }
  }
}
/*
 * Convert a planar 4:2:0 image into 32-bit pixels using YUV2RGBA_2x.
 *
 *   w, h    image width and height in pixels
 *   Y       luma plane, read sequentially (two samples per step)
 *   Cr      first chroma plane; the second chroma plane is taken to start
 *           (w/2)*(h/2) bytes after it
 *   rgba32  output pixels, written sequentially
 *
 * Note on naming: the sample read through `Cr` is passed as the U argument
 * of YUV2RGBA_2x and the one read through the derived `Cb` pointer as the V
 * argument, i.e. the planes are consumed in Y, U, V order.
 *
 * One chroma sample pair serves two horizontally adjacent pixels, and each
 * chroma row is used for two consecutive image rows: after an even row the
 * chroma pointers are moved back by w/2 so the following odd row reads the
 * same samples again.
 */
void sse2_YUV420PtoARGB32 (int w, int h, unsigned char *Y, unsigned char *Cr,
                           RGBA32 *rgba32)
{
  const int chroma_row = w >> 1;   /* chroma samples per row */
  unsigned char *Cb = Cr + chroma_row * (h >> 1);
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; col += 2) {
      YUV2RGBA_2x(Y[0], *Cr, *Cb, Y[1], *Cr, *Cb, rgba32);
      ++Cr;
      ++Cb;
      Y += 2;
      rgba32 += 2;
    }
    if (!(row & 1)) {
      /* even row done: rewind so the next row reuses this chroma row */
      Cr -= chroma_row;
      Cb -= chroma_row;
    }
  }
}
_______________________________________________
xorg mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/xorg