The attached patch speeds up the 888 and 565 compositing
functions in xftcore.c a lot. I don't have standalone
benchmarks, but from previous timings of similar
functions and from the benchmarks in combination
with Pango, my estimation is that it more than 
doubles the compositing speed.

 - Get rid of the function calls. They hurt a lot.
   CPU/Memory ratios are high, but not that high.
  
 - Get rid of passing around colors packed together
   into words. 

 - Single assignment for temporary variables ...
   GCC generates considerably better code if
   you don't reassign variables.

 - Do the 888 case with byte reads/writes rather
   than word assignments ... for system memory
   this is considerably faster on ia32 at least.
 
   (But it's easy to produce a version of the patch
   that doesn't do this, and that simplifies
   a bit by removing the endian casing.)

I'm not sure how portable the inclusion of Xarch.h
is to get compile-time endian defines. You could
do it at run time with two versions of the relevant
function if necessary.

There is another factor of 2-3 in speed available
with MMX, but I don't think it's worth it in this 
context...this patch already makes compositing much 
less of the bottleneck.

Regards,
                             Owen


Index: xftcore.c
===================================================================
RCS file: /cvs/xc/lib/Xft/xftcore.c,v
retrieving revision 1.13
diff -u -p -r1.13 xftcore.c
--- xftcore.c	2003/02/15 22:30:51	1.13
+++ xftcore.c	2003/03/06 20:35:28
@@ -25,7 +25,13 @@
 #include <stdlib.h>
 #include "xftint.h"
 #include <X11/Xmd.h>
+#include <X11/Xarch.h>
 
+#if !defined(X_BIG_ENDIAN) || !defined (X_LITTLE_ENDIAN) || \
+    (X_BYTE_ORDER != X_BIG_ENDIAN && X_BYTE_ORDER != X_LITTLE_ENDIAN)
+#error "X_BYTE_ORDER not properly defined"
+#endif
+
 void
 XftRectCore (XftDraw		*draw,
 	     _Xconst XftColor	*color,
@@ -460,29 +466,42 @@ _XftSmoothGlyphGray8888 (XImage		    *im
 			 int		    y,
 			 _Xconst XftColor   *color)
 {
-    CARD32	src, srca;
-    CARD32	r, g, b;
-    CARD32	*dstLine, *dst, d;
-    CARD8	*maskLine, *mask, m;
+    CARD32	srca;
+    CARD32	color1, color2, color3;
+    CARD8	*dstLine, *dst;
+    CARD8	*maskLine, *mask;
     int		dstStride, maskStride;
     int		width, height;
     int		w;
 
     srca = color->color.alpha >> 8;
+
+#if X_BYTE_ORDER == X_BIG_ENDIAN
+#  define COLOR1_INDEX 1
+#  define COLOR2_INDEX 2
+#  define COLOR3_INDEX 3
+#else
+#  define COLOR1_INDEX 0    
+#  define COLOR2_INDEX 1
+#  define COLOR3_INDEX 2
+#endif    
     
     /* This handles only RGB and BGR */
-    g = (color->color.green & 0xff00);
+    color2 = color->color.green >> 8;
+#if X_BYTE_ORDER == X_BIG_ENDIAN    
     if (image->red_mask == 0xff0000)
+#else
+    if (image->red_mask == 0x0000ff)
+#endif	
     {
-	r = (color->color.red & 0xff00) << 8;
-	b = color->color.blue >> 8;
+	color1 = color->color.red >> 8;
+	color3 = color->color.blue >> 8;
     }
     else
     {
-	r = color->color.red >> 8;
-	b = (color->color.blue & 0xff00) << 8;
+	color3 = color->color.red >> 8;
+	color1 = color->color.blue >> 8;
     }
-    src = (srca << 24) | r | g | b;
     
     width = xftg->metrics.width;
     height = xftg->metrics.height;
@@ -490,11 +509,11 @@ _XftSmoothGlyphGray8888 (XImage		    *im
     x -= xftg->metrics.x;
     y -= xftg->metrics.y;
 
-    dstLine = (CARD32 *) (image->data + image->bytes_per_line * y + (x << 2));
-    dstStride = image->bytes_per_line >> 2;
+    dstLine = image->data + image->bytes_per_line * y + (x << 2);
+    dstStride = image->bytes_per_line;
     maskLine = (unsigned char *) xftg->bitmap;
     maskStride = (width + 3) & ~3;
-    
+
     while (height--)
     {
 	dst = dstLine;
@@ -505,20 +524,28 @@ _XftSmoothGlyphGray8888 (XImage		    *im
 
 	while (w--)
 	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		    *dst = src;
-		else
-		    *dst = fbOver24 (src, *dst);
-	    }
-	    else if (m)
+	    if (*mask)
 	    {
-		d = fbIn (src, m);
-		*dst = fbOver24 (d, *dst);
-	    }
-	    dst++;
+		CARD32 tc11, tc21, tc31;
+		CARD32 tc12, tc22, tc32;
+		CARD32 ta1 = *mask * srca + 0x80;
+		CARD8 ta = ((ta1 + (ta1 >> 8)) >> 8);
+		
+		tc11 = (255 - ta) * dst[COLOR1_INDEX] + 0x80;
+		tc12 = ta * color1 + 0x80;
+		dst[COLOR1_INDEX] = ((tc11 + (tc11 >> 8)) >> 8) + ((tc12 + (tc12 >> 8)) >> 8);
+		
+		tc21 = (255 - ta) * dst[COLOR2_INDEX] + 0x80;
+		tc22 = ta * color2 + 0x80;
+		dst[COLOR2_INDEX] = ((tc21 + (tc21 >> 8)) >> 8) + ((tc22 + (tc22 >> 8)) >> 8);
+		
+		tc31 = (255 - ta) * dst[COLOR3_INDEX] + 0x80;
+		tc32 = ta * color3 + 0x80;
+		dst[COLOR3_INDEX] = ((tc31 + (tc31 >> 8)) >> 8) + ((tc32 + (tc32 >> 8)) >> 8);
+	    }
+		
+	    mask++;
+	    dst += 4;
 	}
     }
 }
@@ -532,9 +559,8 @@ _XftSmoothGlyphGray565 (XImage		    *ima
 {
     CARD32	src, srca;
     CARD32	r, g, b;
-    CARD32	d;
     CARD16	*dstLine, *dst;
-    CARD8	*maskLine, *mask, m;
+    CARD8	*maskLine, *mask;
     int		dstStride, maskStride;
     int		width, height;
     int		w;
@@ -576,24 +602,36 @@ _XftSmoothGlyphGray565 (XImage		    *ima
 
 	while (w--)
 	{
-	    m = *mask++;
-	    if (m == 0xff)
+	    if (*mask)
 	    {
-		if (srca == 0xff)
-		    d = src;
-		else
-		{
-		    d = *dst;
-		    d = fbOver24 (src, cvt0565to8888(d));
-		}
-		*dst = cvt8888to0565(d);
+		CARD32 tr0, tg0, tb0;
+		CARD32 tr1, tg1, tb1;
+		CARD32 tr2, tg2, tb2;
+		CARD32 tr3, tg3, tb3;
+		
+		CARD16 d = *dst;
+		CARD32 ta1 = *mask * srca + 0x80;
+		CARD8 ta = ((ta1 + (ta1 >> 8)) >> 8);
+		
+		tr0 = (d & 0xf800);
+		tr1 = (255 - ta) * ((tr0 >> 8) + (tr0 >> 13)) + 0x80;
+		tr2 = ta * r + 0x80;
+		tr3 = ((tr1 + (tr1 >> 8)) >> 8) + ((tr2 + (tr2 >> 8)) >> 8);
+		
+		tg0 = (d & 0x07e0);
+		tg1 = (255 - ta) * ((tg0 >> 3) + (tg0 >> 9)) + 0x80;
+		tg2 = ta * g + 0x80;
+		tg3 = ((tg1 + (tg1 >> 8)) >> 8) + ((tg2 + (tg2 >> 8)) >> 8);
+		
+		tb0 = (d & 0x001f);
+		tb1 = (255 - ta) * ((tb0 << 3) + (tb0 >> 2)) + 0x80;
+		tb2 = ta * b + 0x80;
+		tb3 = ((tb1 + (tb1 >> 8)) >> 8) + ((tb2 + (tb2 >> 8)) >> 8);
+		
+		*dst = (tr3 << 16) | (tg3 << 8) | tb3;
 	    }
-	    else if (m)
-	    {
-		d = *dst;
-		d = fbOver24 (fbIn(src,m), cvt0565to8888(d));
-		*dst = cvt8888to0565(d);
-	    }
+		
+	    mask++;
 	    dst++;
 	}
     }

Reply via email to