Module Name: xsrc
Committed By: macallan
Date: Fri Sep 16 22:07:25 UTC 2016
Modified Files:
xsrc/external/mit/xf86-video-suncg14/dist/src: cg14_render.c
Log Message:
do PictOpOver operations 4 pixels at a time - this is a vector processor,
so let's do some vector processing
-> about 40% speed increase
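
For reference, the per-pixel math being vectorized below is the Porter-Duff
"over" operator: dst = src * alpha + dst * (1 - alpha) for each channel,
with alpha taken from the mask. A minimal scalar sketch in C of one such
pixel, assuming SX_MUL16X16SR8 behaves as multiply-then-shift-right-8 and
SX_ADDV as a saturating add (the helper name is illustrative, not from the
driver):

#include <stdint.h>

/*
 * Scalar model of one PictOpOver pixel; the driver performs the same
 * arithmetic in SX registers, four pixels per pass.
 */
static inline uint32_t
over_one_pixel(uint32_t src, uint32_t dst, uint8_t alpha)
{
	uint32_t res = 0;
	int i;

	for (i = 0; i < 32; i += 8) {
		uint32_t s = (src >> i) & 0xff;
		uint32_t d = (dst >> i) & 0xff;
		/* the XOR with 0xff mirrors the "generate inverted
		 * alpha" step (SX_XORS) in the diff */
		uint32_t o = ((s * alpha) >> 8) +
		    ((d * (alpha ^ 0xff)) >> 8);
		if (o > 0xff)
			o = 0xff;	/* assumed: SX_ADDV saturates */
		res |= o << i;
	}
	return res;
}
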
To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 \
xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c
diff -u xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.8 xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.9
--- xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.8 Fri Sep 16 21:16:37 2016
+++ xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c Fri Sep 16 22:07:25 2016
@@ -1,4 +1,4 @@
-/* $NetBSD: cg14_render.c,v 1.8 2016/09/16 21:16:37 macallan Exp $ */
+/* $NetBSD: cg14_render.c,v 1.9 2016/09/16 22:07:25 macallan Exp $ */
/*
* Copyright (c) 2013 Michael Lorenz
* All rights reserved.
@@ -43,11 +43,11 @@
#include "cg14.h"
#include <sparc/sxreg.h>
-#define SX_SINGLE
+/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/
-#ifdef SX__RENDER_DEBUG
+#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
@@ -71,8 +71,47 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
for (line = 0; line < height; line++) {
mskx = msk;
dstx = dst;
-#ifdef SX_SINGLE
-
+#ifndef SX_SINGLE
+ int rest;
+ for (x = 0; x < width; x += 4) {
+ rest = width - x;
+ /* fetch 4 mask values */
+ write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
+ /* fetch destination pixels */
+ write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
+ /* duplicate them for all channels */
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
+ /* generate inverted alpha */
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_XORS(12, 8, 28, 15));
+ /* multiply source */
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_MUL16X16SR8(8, 12, 44, 3));
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_MUL16X16SR8(8, 16, 48, 3));
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_MUL16X16SR8(8, 20, 52, 3));
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_MUL16X16SR8(8, 24, 56, 3));
+ /* multiply dest */
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_MUL16X16SR8(28, 60, 76, 15));
+ /* add up */
+ write_sx_reg(p, SX_INSTRUCTIONS,
+ SX_ADDV(44, 76, 92, 15));
+ /* write back */
+ if (rest < 4) {
+ write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
+ } else {
+ write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+ }
+ dstx += 16;
+ mskx += 16;
+ }
+#else /* SX_SINGLE */
for (x = 0; x < width; x++) {
m = *(volatile uint32_t *)(p->fb + mskx);
m = m >> 24;
@@ -114,17 +153,43 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
dstx += 4;
mskx += 4;
}
-#else
+#endif /* SX_SINGLE */
+ dst += dstpitch;
+ msk += srcpitch;
+ }
+}
+
+void CG14Comp_Over8Solid(Cg14Ptr p,
+ uint32_t src, uint32_t srcpitch,
+ uint32_t dst, uint32_t dstpitch,
+ int width, int height)
+{
+ uint32_t msk = src, mskx, dstx, m;
+ int line, x, i;
+#ifdef SX_DEBUG
+ char buffer[256];
+#endif
+ ENTER;
+
+ DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
+ read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
+ *(uint32_t *)(p->fb + p->srcoff));
+ for (line = 0; line < height; line++) {
+ mskx = msk;
+ dstx = dst;
+#ifndef SX_SINGLE
+ int rest;
for (x = 0; x < width; x += 4) {
+ rest = width - x;
/* fetch 4 mask values */
- write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
+ write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
/* fetch destination pixels */
write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
/* duplicate them for all channels */
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
/* generate inverted alpha */
write_sx_reg(p, SX_INSTRUCTIONS,
SX_XORS(12, 8, 28, 15));
@@ -144,36 +209,15 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
write_sx_reg(p, SX_INSTRUCTIONS,
SX_ADDV(44, 76, 92, 15));
/* write back */
- write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+ if (rest < 4) {
+ write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
+ } else {
+ write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+ }
dstx += 16;
- mskx += 16;
+ mskx += 4;
}
-#endif
- dst += dstpitch;
- msk += srcpitch;
- }
-}
-
-void CG14Comp_Over8Solid(Cg14Ptr p,
- uint32_t src, uint32_t srcpitch,
- uint32_t dst, uint32_t dstpitch,
- int width, int height)
-{
- uint32_t msk = src, mskx, dstx, m;
- int line, x, i;
-#ifdef SX_DEBUG
- char buffer[256];
-#endif
- ENTER;
-
- DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
- read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
- *(uint32_t *)(p->fb + p->srcoff));
- for (line = 0; line < height; line++) {
- mskx = msk;
- dstx = dst;
-#ifdef SX_SINGLE
-
+#else /* SX_SINGLE */
for (x = 0; x < width; x++) {
m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
@@ -217,45 +261,11 @@ void CG14Comp_Over8Solid(Cg14Ptr p,
dstx += 4;
mskx += 1;
}
+#endif /* SX_SINGLE */
#ifdef SX_DEBUG
buffer[x] = 0;
xf86Msg(X_ERROR, "%s\n", buffer);
#endif
-#else
- for (x = 0; x < width; x += 4) {
- /* fetch 4 mask values */
- write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
- /* fetch destination pixels */
- write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
- /* duplicate them for all channels */
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
- write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
- /* generate inverted alpha */
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_XORS(12, 8, 28, 15));
- /* multiply source */
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_MUL16X16SR8(8, 12, 44, 3));
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_MUL16X16SR8(8, 16, 48, 3));
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_MUL16X16SR8(8, 20, 52, 3));
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_MUL16X16SR8(8, 24, 56, 3));
- /* multiply dest */
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_MUL16X16SR8(28, 60, 76, 15));
- /* add up */
- write_sx_reg(p, SX_INSTRUCTIONS,
- SX_ADDV(44, 76, 92, 15));
- /* write back */
- write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
- dstx += 16;
- mskx += 4;
- }
-#endif
dst += dstpitch;
msk += srcpitch;
}