The attached patch speeds up the 888 and 565 compositing
functions in xftcore.c a lot. I don't have standalone
benchmarks, but from previous timings of similar
functions and from the benchmarks in combination
with Pango, my estimation is that it more than
doubles the compositing speed.
- Get rid of the function calls. They hurt a lot.
CPU/memory speed ratios are high, but not that high.
- Get rid of passing around colors packed together
into words.
- Single assignment for temporary variables ...
GCC generates considerably better code if
you don't reassign variables.
- Do the 888 case with byte reads/writes rather
than word assignments ... for system memory
this is considerably faster on ia32 at least.
(But it's easy to produce a version of the patch
that doesn't do this, and that simplifies
a bit by removing the endian casing.)
I'm not sure how portable it is to include Xarch.h
to get compile-time endian defines. You could
do the check at run time, with two versions of the
relevant function, if necessary.
There is another factor of 2-3 of speed available
with MMX, but I don't think it's worth it in this
context...this patch already makes compositing much
less of the bottleneck.
Regards,
Owen
Index: xftcore.c
===================================================================
RCS file: /cvs/xc/lib/Xft/xftcore.c,v
retrieving revision 1.13
diff -u -p -r1.13 xftcore.c
--- xftcore.c 2003/02/15 22:30:51 1.13
+++ xftcore.c 2003/03/06 20:35:28
@@ -25,7 +25,13 @@
#include <stdlib.h>
#include "xftint.h"
#include <X11/Xmd.h>
+#include <X11/Xarch.h>
+#if !defined(X_BIG_ENDIAN) || !defined (X_LITTLE_ENDIAN) || \
+ (X_BYTE_ORDER != X_BIG_ENDIAN && X_BYTE_ORDER != X_LITTLE_ENDIAN)
+#error "X_BYTE_ORDER not properly defined"
+#endif
+
void
XftRectCore (XftDraw *draw,
_Xconst XftColor *color,
@@ -460,29 +466,42 @@ _XftSmoothGlyphGray8888 (XImage *im
int y,
_Xconst XftColor *color)
{
- CARD32 src, srca;
- CARD32 r, g, b;
- CARD32 *dstLine, *dst, d;
- CARD8 *maskLine, *mask, m;
+ CARD32 srca;
+ CARD32 color1, color2, color3;
+ CARD8 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
int dstStride, maskStride;
int width, height;
int w;
srca = color->color.alpha >> 8;
+
+#if X_BYTE_ORDER == X_BIG_ENDIAN
+# define COLOR1_INDEX 1
+# define COLOR2_INDEX 2
+# define COLOR3_INDEX 3
+#else
+# define COLOR1_INDEX 0
+# define COLOR2_INDEX 1
+# define COLOR3_INDEX 2
+#endif
/* This handles only RGB and BGR */
- g = (color->color.green & 0xff00);
+ color2 = color->color.green >> 8;
+#if X_BYTE_ORDER == X_BIG_ENDIAN
if (image->red_mask == 0xff0000)
+#else
+ if (image->red_mask == 0x0000ff)
+#endif
{
- r = (color->color.red & 0xff00) << 8;
- b = color->color.blue >> 8;
+ color1 = color->color.red >> 8;
+ color3 = color->color.blue >> 8;
}
else
{
- r = color->color.red >> 8;
- b = (color->color.blue & 0xff00) << 8;
+ color3 = color->color.red >> 8;
+ color1 = color->color.blue >> 8;
}
- src = (srca << 24) | r | g | b;
width = xftg->metrics.width;
height = xftg->metrics.height;
@@ -490,11 +509,11 @@ _XftSmoothGlyphGray8888 (XImage *im
x -= xftg->metrics.x;
y -= xftg->metrics.y;
- dstLine = (CARD32 *) (image->data + image->bytes_per_line * y + (x << 2));
- dstStride = image->bytes_per_line >> 2;
+ dstLine = image->data + image->bytes_per_line * y + (x << 2);
+ dstStride = image->bytes_per_line;
maskLine = (unsigned char *) xftg->bitmap;
maskStride = (width + 3) & ~3;
-
+
while (height--)
{
dst = dstLine;
@@ -505,20 +524,28 @@ _XftSmoothGlyphGray8888 (XImage *im
while (w--)
{
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- *dst = src;
- else
- *dst = fbOver24 (src, *dst);
- }
- else if (m)
+ if (*mask)
{
- d = fbIn (src, m);
- *dst = fbOver24 (d, *dst);
- }
- dst++;
+ CARD32 tc11, tc21, tc31;
+ CARD32 tc12, tc22, tc32;
+ CARD32 ta1 = *mask * srca + 0x80;
+ CARD8 ta = ((ta1 + (ta1 >> 8)) >> 8);
+
+ tc11 = (255 - ta) * dst[COLOR1_INDEX] + 0x80;
+ tc12 = ta * color1 + 0x80;
+ dst[COLOR1_INDEX] = ((tc11 + (tc11 >> 8)) >> 8) + ((tc12 + (tc12 >> 8)) >> 8);
+
+ tc21 = (255 - ta) * dst[COLOR2_INDEX] + 0x80;
+ tc22 = ta * color2 + 0x80;
+ dst[COLOR2_INDEX] = ((tc21 + (tc21 >> 8)) >> 8) + ((tc22 + (tc22 >> 8)) >> 8);
+
+ tc31 = (255 - ta) * dst[COLOR3_INDEX] + 0x80;
+ tc32 = ta * color3 + 0x80;
+ dst[COLOR3_INDEX] = ((tc31 + (tc31 >> 8)) >> 8) + ((tc32 + (tc32 >> 8)) >> 8);
+ }
+
+ mask++;
+ dst += 4;
}
}
}
@@ -532,9 +559,8 @@ _XftSmoothGlyphGray565 (XImage *ima
{
CARD32 src, srca;
CARD32 r, g, b;
- CARD32 d;
CARD16 *dstLine, *dst;
- CARD8 *maskLine, *mask, m;
+ CARD8 *maskLine, *mask;
int dstStride, maskStride;
int width, height;
int w;
@@ -576,24 +602,36 @@ _XftSmoothGlyphGray565 (XImage *ima
while (w--)
{
- m = *mask++;
- if (m == 0xff)
+ if (*mask)
{
- if (srca == 0xff)
- d = src;
- else
- {
- d = *dst;
- d = fbOver24 (src, cvt0565to8888(d));
- }
- *dst = cvt8888to0565(d);
+ CARD32 tr0, tg0, tb0;
+ CARD32 tr1, tg1, tb1;
+ CARD32 tr2, tg2, tb2;
+ CARD32 tr3, tg3, tb3;
+
+ CARD16 d = *dst;
+ CARD32 ta1 = *mask * srca + 0x80;
+ CARD8 ta = ((ta1 + (ta1 >> 8)) >> 8);
+
+ tr0 = (d & 0xf800);
+ tr1 = (255 - ta) * ((tr0 >> 8) + (tr0 >> 13)) + 0x80;
+ tr2 = ta * r + 0x80;
+ tr3 = ((tr1 + (tr1 >> 8)) >> 8) + ((tr2 + (tr2 >> 8)) >> 8);
+
+ tg0 = (d & 0x07e0);
+ tg1 = (255 - ta) * ((tg0 >> 3) + (tg0 >> 9)) + 0x80;
+ tg2 = ta * g + 0x80;
+ tg3 = ((tg1 + (tg1 >> 8)) >> 8) + ((tg2 + (tg2 >> 8)) >> 8);
+
+ tb0 = (d & 0x001f);
+ tb1 = (255 - ta) * ((tb0 << 3) + (tb0 >> 2)) + 0x80;
+ tb2 = ta * b + 0x80;
+ tb3 = ((tb1 + (tb1 >> 8)) >> 8) + ((tb2 + (tb2 >> 8)) >> 8);
+
+ *dst = (tr3 << 16) | (tg3 << 8) | tb3;
}
- else if (m)
- {
- d = *dst;
- d = fbOver24 (fbIn(src,m), cvt0565to8888(d));
- *dst = cvt8888to0565(d);
- }
+
+ mask++;
dst++;
}
}