diff --git a/exa/exa.c b/exa/exa.c
index 6e769a7..f723c90 100644
--- a/exa/exa.c
+++ b/exa/exa.c
@@ -754,6 +754,7 @@ exaCloseScreen(int i, ScreenPtr pScreen)
 	ps->Glyphs = pExaScr->SavedGlyphs;
 	ps->Trapezoids = pExaScr->SavedTrapezoids;
 	ps->Triangles = pExaScr->SavedTriangles;
+	ps->AddTraps = pExaScr->SavedAddTraps;
     }
 #endif
 
@@ -925,6 +926,9 @@ exaDriverInit (ScreenPtr		pScreen,
 
 	pExaScr->SavedTrapezoids = ps->Trapezoids;
 	ps->Trapezoids = exaTrapezoids;
+
+	pExaScr->SavedAddTraps = ps->AddTraps;
+	ps->AddTraps = exaAddTraps;
     }
 #endif
 
diff --git a/exa/exa_priv.h b/exa/exa_priv.h
index bd3c76e..acd3cb3 100644
--- a/exa/exa_priv.h
+++ b/exa/exa_priv.h
@@ -147,6 +147,7 @@ typedef struct {
     TrianglesProcPtr		 SavedTriangles;
     GlyphsProcPtr                SavedGlyphs;
     TrapezoidsProcPtr            SavedTrapezoids;
+    AddTrapsProcPtr           SavedAddTraps;
 #endif
   
     Bool			 swappedOut;
@@ -468,6 +469,13 @@ exaTriangles (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 	      PictFormatPtr maskFormat, INT16 xSrc, INT16 ySrc,
 	      int ntri, xTriangle *tris);
 
+void
+exaAddTraps (PicturePtr	pDst,	/* picture the traps are added to */
+		INT16	x_off,	/* x offset of the traps within pDst */
+		INT16	y_off,	/* y offset of the traps within pDst */
+		int		ntrap,	/* number of entries in traps */
+		xTrap	*traps);	/* traps to render */
+
 /* exa_glyph.c */
 void
 exaGlyphsInit(ScreenPtr pScreen);
diff --git a/exa/exa_render.c b/exa/exa_render.c
index f9c6b40..ac0a81d 100644
--- a/exa/exa_render.c
+++ b/exa/exa_render.c
@@ -1047,6 +1047,7 @@ exaTrapezoids (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
     PictureScreenPtr    ps = GetPictureScreen(pScreen);
     BoxRec		bounds;
     Bool		direct = op == PictOpAdd && miIsSolidAlpha (pSrc);
+    PixmapPtr pDstPix = exaGetDrawablePixmap (pDst->pDrawable);
 
     if (maskFormat || direct) {
 	miTrapezoidBounds (ntrap, traps, &bounds);
@@ -1055,14 +1056,18 @@ exaTrapezoids (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 	    return;
     }
 
+    /* Try to avoid migration if the destination mask is already offscreen. */
+    /* This assumes UploadToScreen is faster than DownloadFromScreen.  */
+    if (exaPixmapIsOffscreen(pDstPix))
+	direct = FALSE;
+
     /*
      * Check for solid alpha add
      */
     if (direct)
     {
 	DrawablePtr pDraw = pDst->pDrawable;
-	PixmapPtr pixmap = exaGetDrawablePixmap (pDraw);
-	ExaPixmapPriv (pixmap);
+	ExaPixmapPriv (pDstPix);
 
 	/* Damage manually, because Trapezoids expects to hit Composite normally. */
 	/* Composite is wrapped by damage, but Trapezoids isn't. */
@@ -1151,6 +1156,7 @@ exaTriangles (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
     PictureScreenPtr    ps = GetPictureScreen(pScreen);
     BoxRec		bounds;
     Bool		direct = op == PictOpAdd && miIsSolidAlpha (pSrc);
+    PixmapPtr pDstPix = exaGetDrawablePixmap (pDst->pDrawable);
 
     if (maskFormat || direct) {
 	miTriangleBounds (ntri, tris, &bounds);
@@ -1159,14 +1165,18 @@ exaTriangles (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 	    return;
     }
 
+    /* Try to avoid migration if the destination mask is already offscreen. */
+    /* This assumes UploadToScreen is faster than DownloadFromScreen.  */
+    if (exaPixmapIsOffscreen(pDstPix))
+	direct = FALSE;
+
     /*
      * Check for solid alpha add
      */
     if (direct)
     {
 	DrawablePtr pDraw = pDst->pDrawable;
-	PixmapPtr pixmap = exaGetDrawablePixmap (pDraw);
-	ExaPixmapPriv (pixmap);
+	ExaPixmapPriv (pDstPix);
 
 	/* Damage manually, because Triangles expects to hit Composite normally. */
 	/* Composite is wrapped by damage, but Triangles isn't. */
@@ -1227,3 +1237,82 @@ exaTriangles (CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 	    exaTriangles (op, pSrc, pDst, maskFormat, xSrc, ySrc, 1, tris);
     }
 }
+
+/* An xTrap counts as valid only when bot.y - top.y is positive.
+ * This could be moved to mi should it be needed there. */
+#define xTrapValid(t)  ((int) ((t)->bot.y - (t)->top.y) > 0)
+
+/* Set *box to the box enclosing every valid trap in traps[0..ntrap).
+ * Top/left edges are converted with xFixedToInt; bottom/right edges go
+ * through xFixedCeil first.  With no valid trap the box is left inverted
+ * (x1 > x2 and y1 > y2). */
+static void
+exaAddTrapsBounds (int ntrap, xTrap *traps, BoxPtr box)
+{
+    xTrap *t;
+
+    box->x1 = box->y1 = MAXSHORT;
+    box->x2 = box->y2 = MINSHORT;
+
+    for (t = traps; ntrap != 0; t++, ntrap--)
+    {
+	if (xTrapValid(t))
+	{
+	    INT16 yTop   = xFixedToInt (t->top.y);
+	    INT16 yBot   = xFixedToInt (xFixedCeil (t->bot.y));
+	    INT16 xLeft  = xFixedToInt (min(t->top.l, t->bot.l));
+	    INT16 xRight = xFixedToInt (xFixedCeil (max(t->top.r, t->bot.r)));
+
+	    box->y1 = min(box->y1, yTop);
+	    box->y2 = max(box->y2, yBot);
+	    box->x1 = min(box->x1, xLeft);
+	    box->x2 = max(box->x2, xRight);
+	}
+    }
+}
+
+/* AddTraps wrapper.  A destination that is not offscreen is rendered to
+ * directly.  Otherwise, rather than migrating it back into normal ram, the
+ * traps are software rendered onto a temporary alpha picture which is then
+ * composited (PictOpAdd) onto it, at the requested x_off/y_off.
+ * This assumes UploadToScreen is faster than DownloadFromScreen. */
+void
+exaAddTraps (PicturePtr	pDst,
+		INT16	x_off,
+		INT16	y_off,
+		int		ntrap,
+		xTrap	*traps)
+{
+	PixmapPtr pDstPix = exaGetDrawablePixmap (pDst->pDrawable);
+
+	if (!exaPixmapIsOffscreen(pDstPix)) {
+		exaPrepareAccess(pDst->pDrawable, EXA_PREPARE_DEST);
+		fbAddTraps(pDst, x_off, y_off, ntrap, traps);
+		exaFinishAccess(pDst->pDrawable, EXA_PREPARE_DEST);
+	} else {
+		ScreenPtr pScreen = pDst->pDrawable->pScreen;
+		PicturePtr pPicture;
+		BoxRec bounds;
+		exaAddTrapsBounds(ntrap, traps, &bounds);
+		if (bounds.y1 >= bounds.y2 || bounds.x1 >= bounds.x2)
+			return;
+
+		pPicture = exaCreateAlphaPicture (pScreen, pDst, pDst->pFormat,
+						bounds.x2 - bounds.x1,
+						bounds.y2 - bounds.y1);
+		if (!pPicture)
+			return;
+
+		/* bounds excludes x_off/y_off, so shift by its origin only: every
+		 * trap then fits the temporary.  The offset is applied on composite. */
+		exaPrepareAccess(pPicture->pDrawable, EXA_PREPARE_DEST);
+		fbAddTraps(pPicture, -bounds.x1, -bounds.y1, ntrap, traps);
+		exaFinishAccess(pPicture->pDrawable, EXA_PREPARE_DEST);
+
+		/* Source should start at (0,0) */
+		CompositePicture (PictOpAdd, pPicture, NULL, pDst,
+			  0, 0, 0, 0, bounds.x1 + x_off, bounds.y1 + y_off,
+			  bounds.x2 - bounds.x1, bounds.y2 - bounds.y1);
+		FreePicture (pPicture, 0);
+	}
+}
