Module Name:    xsrc
Committed By:   macallan
Date:           Fri Dec  8 22:49:37 UTC 2017

Modified Files:
        xsrc/external/mit/xf86-video-suncg14/dist/src: cg14_render.c

Log Message:
do up to 4 pixels at a time in CG14Comp_Over*()
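
In short: each inner loop now advances four pixels per iteration, clamps
the run length at the right edge of the rectangle, and steps the per-line
offsets by 16 bytes (four 32-bit pixels). A minimal standalone sketch of
that chunking pattern in plain C follows; process_run() is a hypothetical
stand-in for the SX load/blend/store sequence in the diff below, and the
addresses, pitches, and dimensions are made-up example values.

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

/* hypothetical stand-in for the per-run SX register/instruction setup */
static void
process_run(unsigned int srcx, unsigned int dstx, int num)
{
	printf("process %d pixel(s): src 0x%x -> dst 0x%x\n", num, srcx, dstx);
}

int
main(void)
{
	unsigned int src = 0x1000, dst = 0x2000;	/* example offsets */
	unsigned int srcpitch = 64, dstpitch = 64;	/* bytes per line */
	int width = 10, height = 2;			/* pixels */
	int line, x, num;

	for (line = 0; line < height; line++) {
		unsigned int srcx = src, dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* up to 4 pixels at a time; last run may be shorter */
			num = min(4, width - x);
			process_run(srcx, dstx, num);
			srcx += 16;	/* 4 pixels x 4 bytes each */
			dstx += 16;
		}
		src += srcpitch;
		dst += dstpitch;
	}
	return 0;
}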


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 \
    xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c
diff -u xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.11 xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.12
--- xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.11	Thu Dec  7 19:23:22 2017
+++ xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c	Fri Dec  8 22:49:37 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: cg14_render.c,v 1.11 2017/12/07 19:23:22 macallan Exp $ */
+/* $NetBSD: cg14_render.c,v 1.12 2017/12/08 22:49:37 macallan Exp $ */
 /*
  * Copyright (c) 2013 Michael Lorenz
  * All rights reserved.
@@ -471,8 +471,8 @@ void CG14Comp_Over32(Cg14Ptr p,
                    uint32_t dst, uint32_t dstpitch,
                    int width, int height, int flip)
 {
-	uint32_t srcx, dstx, m;
-	int line, x, i;
+	uint32_t srcx, dstx, mskx, m;
+	int line, x, i, num;
 
 	ENTER;
 
@@ -481,33 +481,44 @@ void CG14Comp_Over32(Cg14Ptr p,
 		srcx = src;
 		dstx = dst;
 
-		for (x = 0; x < width; x++) {
-			/* fetch source pixel */
-			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
+		for (x = 0; x < width; x += 4) {
+			/* we do up to 4 pixels at a time */
+			num = min(4, width - x);
+			if (num <= 0) {
+				xf86Msg(X_ERROR, "wtf?!\n");
+				continue;
+			}
+			/* fetch source pixels */
+			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
 			if (flip) {
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(13, 0, 40, 0));
+				    SX_GATHER(13, 4, 40, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(15, 0, 13, 0));
+				    SX_GATHER(15, 4, 44, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(40, 0, 15, 0));
+				    SX_SCATTER(40, 4, 15, num - 1));
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SCATTER(44, 4, 13, num - 1));
+			}
+			/* fetch dst pixels */
+			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
+			/* now process up to 4 pixels */
+			for (i = 0; i < num; i++) {
+				int ii = i << 2;
+				/* write inverted alpha into SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_XORS(12 + ii, 8, R_SCAM, 0));
+				/* dst * (1 - alpha) + src */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(44 + ii, 12 + ii, 76 + ii, 3));
 			}
-			/* fetch dst pixel */
-			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
-			/* src is premultiplied with alpha */
-			/* write inverted alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_XORV(12, 8, R_SCAM, 0));
-			/* dst * (1 - alpha) + R[13:15] */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(20, 12, 24, 3));
 			write_sx_io(p, dstx,
-			    SX_STUQ0C(24, 0, dstx & 7));
-			dstx += 4;
-			srcx += 4;
+			    SX_STUQ0C(76, num - 1, dstx & 7));
+			srcx += 16;
+			dstx += 16;
 		}
-		dst += dstpitch;
 		src += srcpitch;
+		dst += dstpitch;
 	}
 }
 
@@ -518,7 +529,7 @@ void CG14Comp_Over32Mask(Cg14Ptr p,
                    int width, int height, int flip)
 {
 	uint32_t srcx, dstx, mskx, m;
-	int line, x, i;
+	int line, x, i, num;
 
 	ENTER;
 
@@ -528,39 +539,50 @@ void CG14Comp_Over32Mask(Cg14Ptr p,
 		mskx = msk;
 		dstx = dst;
 
-		for (x = 0; x < width; x++) {
-			/* fetch source pixel */
-			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
+		for (x = 0; x < width; x += 4) {
+			/* we do up to 4 pixels at a time */
+			num = min(4, width - x);
+			if (num <= 0) {
+				xf86Msg(X_ERROR, "wtf?!\n");
+				continue;
+			}
+			/* fetch source pixels */
+			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
 			if (flip) {
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(13, 0, 40, 0));
+				    SX_GATHER(13, 4, 40, num - 1));
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_GATHER(15, 4, 44, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(15, 0, 13, 0));
+				    SX_SCATTER(40, 4, 15, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(40, 0, 15, 0));
+				    SX_SCATTER(44, 4, 13, num - 1));
 			}
 			/* fetch mask */
-			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
-			/* fetch dst pixel */
-			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
-			/* stick mask alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_ORS(9, 0, R_SCAM, 0));
-			/* apply mask */
-			/* src is premultiplied with alpha */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(12, 0, 16, 3));
-			/* write inverted alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_XORV(16, 8, R_SCAM, 0));
-			/* dst * (1 - alpha) + R[13:15] */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(20, 16, 24, 3));
+			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
+			/* fetch dst pixels */
+			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
+			/* now process up to 4 pixels */
+			for (i = 0; i < num; i++) {
+				int ii = i << 2;
+				/* mask alpha to SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_ORS(28 + i, 0, R_SCAM, 0));
+				/* src * alpha */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
+				/* write inverted alpha into SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_XORS(28 + i, 8, R_SCAM, 0));
+				/* dst * (1 - alpha) + R[60:] */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
+			}
 			write_sx_io(p, dstx,
-			    SX_STUQ0C(24, 0, dstx & 7));
-			srcx += 4;
-			mskx += 1;
-			dstx += 4;
+			    SX_STUQ0C(76, num - 1, dstx & 7));
+			srcx += 16;
+			mskx += 4;
+			dstx += 16;
 		}
 		src += srcpitch;
 		msk += mskpitch;
@@ -575,51 +597,65 @@ void CG14Comp_Over32Mask_noalpha(Cg14Ptr
                    int width, int height, int flip)
 {
 	uint32_t srcx, dstx, mskx, m;
-	int line, x, i;
+	int line, x, i, num;
 
 	ENTER;
 
 	write_sx_reg(p, SX_QUEUED(8), 0xff);
+	write_sx_reg(p, SX_QUEUED(9), 0xff);
+	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
 	for (line = 0; line < height; line++) {
 		srcx = src;
 		mskx = msk;
 		dstx = dst;
 
-		for (x = 0; x < width; x++) {
-			/* fetch source pixel */
-			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
+		for (x = 0; x < width; x += 4) {
+			/* we do up to 4 pixels at a time */
+			num = min(4, width - x);
+			if (num <= 0) {
+				xf86Msg(X_ERROR, "wtf?!\n");
+				continue;
+			}
+			/* fetch source pixels */
+			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
 			if (flip) {
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(13, 0, 40, 0));
+				    SX_GATHER(13, 4, 40, num - 1));
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_GATHER(15, 4, 44, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(15, 0, 13, 0));
+				    SX_SCATTER(40, 4, 15, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(40, 0, 15, 0));
+				    SX_SCATTER(44, 4, 13, num - 1));
 			}
-			/* set src alpha to 0xff */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_ORS(8, 0, 12, 0));
 			/* fetch mask */
-			write_sx_io(p, mskx & (~7), SX_LDB(9, 0, mskx & 7));
-			/* fetch dst pixel */
-			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
-			/* write alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_ORS(9, 0, R_SCAM, 0));
-			/* src * alpha + R0 */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(12, 0, 16, 3));
-			/* write inverted alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_XORV(9, 8, R_SCAM, 0));
-			/* dst * (1 - alpha) + R[13:15] */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(20, 16, 24, 3));
+			write_sx_io(p, mskx, SX_LDB(28, num - 1, mskx & 7));
+			/* fetch dst pixels */
+			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
+			/* set src alpha to 0xff */			
+			write_sx_reg(p, SX_INSTRUCTIONS,
+			    SX_SCATTER(8, 4, 12, num - 1));
+			/* now process up to 4 pixels */
+			for (i = 0; i < num; i++) {
+				int ii = i << 2;
+				/* mask alpha to SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_ORS(28 + i, 0, R_SCAM, 0));
+				/* src * alpha */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
+				/* write inverted alpha into SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_XORS(28 + i, 8, R_SCAM, 0));
+				/* dst * (1 - alpha) + R[60:] */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
+			}
 			write_sx_io(p, dstx,
-			    SX_STUQ0C(24, 0, dstx & 7));
-			srcx += 4;
-			mskx += 1;
-			dstx += 4;
+			    SX_STUQ0C(76, num - 1, dstx & 7));
+			srcx += 16;
+			mskx += 4;
+			dstx += 16;
 		}
 		src += srcpitch;
 		msk += mskpitch;
@@ -634,51 +670,65 @@ void CG14Comp_Over32Mask32_noalpha(Cg14P
                    int width, int height, int flip)
 {
 	uint32_t srcx, dstx, mskx, m;
-	int line, x, i;
+	int line, x, i, num;
 
 	ENTER;
 
 	write_sx_reg(p, SX_QUEUED(8), 0xff);
+	write_sx_reg(p, SX_QUEUED(9), 0xff);
+	write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(8, 0, 10, 1));
 	for (line = 0; line < height; line++) {
 		srcx = src;
 		mskx = msk;
 		dstx = dst;
 
-		for (x = 0; x < width; x++) {
-			/* fetch source pixel */
-			write_sx_io(p, srcx, SX_LDUQ0(12, 0, srcx & 7));
+		for (x = 0; x < width; x += 4) {
+			/* we do up to 4 pixels at a time */
+			num = min(4, width - x);
+			if (num <= 0) {
+				xf86Msg(X_ERROR, "wtf?!\n");
+				continue;
+			}
+			/* fetch source pixels */
+			write_sx_io(p, srcx, SX_LDUQ0(12, num - 1, srcx & 7));
 			if (flip) {
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(13, 0, 40, 0));
+				    SX_GATHER(13, 4, 40, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(15, 0, 13, 0));
+				    SX_GATHER(15, 4, 44, num - 1));
 				write_sx_reg(p, SX_INSTRUCTIONS,
-				    SX_ORS(40, 0, 15, 0));
+				    SX_SCATTER(40, 4, 15, num - 1));
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SCATTER(44, 4, 13, num - 1));
 			}
 			/* fetch mask */
-			write_sx_io(p, mskx, SX_LDUQ0(16, 0, mskx & 7));
-			/* fetch dst pixel */
-			write_sx_io(p, dstx, SX_LDUQ0(20, 0, dstx & 7));
-			/* set src alpha to 0xff */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_ORS(8, 0, 12, 0));
-			/* mask alpha to SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_ORS(16, 0, R_SCAM, 0));
-			/* src * alpha */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(12, 0, 24, 3));
-			/* write inverted alpha into SCAM */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_XORS(16, 8, R_SCAM, 0));
-			/* dst * (1 - alpha) + R[24:31] */
-			write_sx_reg(p, SX_INSTRUCTIONS,
-			    SX_SAXP16X16SR8(20, 24, 28, 3));
+			write_sx_io(p, mskx, SX_LDUQ0(28, num - 1, mskx & 7));
+			/* fetch dst pixels */
+			write_sx_io(p, dstx, SX_LDUQ0(44, num - 1, dstx & 7));
+			/* set src alpha to 0xff */			
+			write_sx_reg(p, SX_INSTRUCTIONS,
+			    SX_SCATTER(8, 4, 12, num - 1));
+			/* now process up to 4 pixels */
+			for (i = 0; i < num; i++) {
+				int ii = i << 2;
+				/* mask alpha to SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_ORS(28 + ii, 0, R_SCAM, 0));
+				/* src * alpha */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
+				/* write inverted alpha into SCAM */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_XORS(28 + ii, 8, R_SCAM, 0));
+				/* dst * (1 - alpha) + R[60:] */
+				write_sx_reg(p, SX_INSTRUCTIONS,
+				    SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
+			}
 			write_sx_io(p, dstx,
-			    SX_STUQ0C(28, 0, dstx & 7));
-			srcx += 4;
-			mskx += 4;
-			dstx += 4;
+			    SX_STUQ0C(76, num - 1, dstx & 7));
+			srcx += 16;
+			mskx += 16;
+			dstx += 16;
 		}
 		src += srcpitch;
 		msk += mskpitch;
