Module Name:    xsrc
Committed By:   macallan
Date:           Fri Sep 16 22:07:25 UTC 2016
Modified Files:
        xsrc/external/mit/xf86-video-suncg14/dist/src: cg14_render.c

Log Message:
do PictOpOver operations 4 pixels at a time - this is a vector processor,
let's do some vector processing -> about 40% speed increase


To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 \
    xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:

Index: xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c
diff -u xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.8 xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.9
--- xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c:1.8    Fri Sep 16 21:16:37 2016
+++ xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_render.c        Fri Sep 16 22:07:25 2016
@@ -1,4 +1,4 @@
-/* $NetBSD: cg14_render.c,v 1.8 2016/09/16 21:16:37 macallan Exp $ */
+/* $NetBSD: cg14_render.c,v 1.9 2016/09/16 22:07:25 macallan Exp $ */
 /*
  * Copyright (c) 2013 Michael Lorenz
  * All rights reserved.
@@ -43,11 +43,11 @@
 #include "cg14.h"
 #include <sparc/sxreg.h>
 
-#define SX_SINGLE
+/*#define SX_SINGLE*/
 /*#define SX_RENDER_DEBUG*/
 /*#define SX_ADD_SOFTWARE*/
 
-#ifdef SX__RENDER_DEBUG
+#ifdef SX_RENDER_DEBUG
 #define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
 #define DPRINTF xf86Msg
 #else
@@ -71,8 +71,47 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
         for (line = 0; line < height; line++) {
                 mskx = msk;
                 dstx = dst;
-#ifdef SX_SINGLE
-
+#ifndef SX_SINGLE
+                int rest;
+                for (x = 0; x < width; x += 4) {
+                        rest = width - x;
+                        /* fetch 4 mask values */
+                        write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
+                        /* fetch destination pixels */
+                        write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
+                        /* duplicate them for all channels */
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
+                        /* generate inverted alpha */
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_XORS(12, 8, 28, 15));
+                        /* multiply source */
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_MUL16X16SR8(8, 12, 44, 3));
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_MUL16X16SR8(8, 16, 48, 3));
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_MUL16X16SR8(8, 20, 52, 3));
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_MUL16X16SR8(8, 24, 56, 3));
+                        /* multiply dest */
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_MUL16X16SR8(28, 60, 76, 15));
+                        /* add up */
+                        write_sx_reg(p, SX_INSTRUCTIONS,
+                            SX_ADDV(44, 76, 92, 15));
+                        /* write back */
+                        if (rest < 4) {
+                                write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
+                        } else {
+                                write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+                        }
+                        dstx += 16;
+                        mskx += 16;
+                }
+#else /* SX_SINGLE */
                 for (x = 0; x < width; x++) {
                         m = *(volatile uint32_t *)(p->fb + mskx);
                         m = m >> 24;
@@ -114,17 +153,43 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
                         dstx += 4;
                         mskx += 4;
                 }
-#else
+#endif /* SX_SINGLE */
+                dst += dstpitch;
+                msk += srcpitch;
+        }
+}
+
+void CG14Comp_Over8Solid(Cg14Ptr p,
+                   uint32_t src, uint32_t srcpitch,
+                   uint32_t dst, uint32_t dstpitch,
+                   int width, int height)
+{
+        uint32_t msk = src, mskx, dstx, m;
+        int line, x, i;
+#ifdef SX_DEBUG
+        char buffer[256];
+#endif
+        ENTER;
+
+        DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
+            read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
+            *(uint32_t *)(p->fb + p->srcoff));
+        for (line = 0; line < height; line++) {
+                mskx = msk;
+                dstx = dst;
+#ifndef SX_SINGLE
+                int rest;
                 for (x = 0; x < width; x += 4) {
+                        rest = width - x;
                         /* fetch 4 mask values */
-                        write_sx_io(p, mskx, SX_LDUQ0(12, 3, mskx & 7));
+                        write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
                         /* fetch destination pixels */
                         write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
                         /* duplicate them for all channels */
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
+                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
                         write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 16, 17, 2));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 20, 21, 2));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 24, 25, 2));
                         /* generate inverted alpha */
                         write_sx_reg(p, SX_INSTRUCTIONS,
                             SX_XORS(12, 8, 28, 15));
@@ -144,36 +209,15 @@ void CG14Comp_Over32Solid(Cg14Ptr p,
                         write_sx_reg(p, SX_INSTRUCTIONS,
                             SX_ADDV(44, 76, 92, 15));
                         /* write back */
-                        write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+                        if (rest < 4) {
+                                write_sx_io(p, dstx, SX_STUQ0C(92, rest - 1, dstx & 7));
+                        } else {
+                                write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
+                        }
                         dstx += 16;
-                        mskx += 16;
+                        mskx += 4;
                 }
-#endif
-                dst += dstpitch;
-                msk += srcpitch;
-        }
-}
-
-void CG14Comp_Over8Solid(Cg14Ptr p,
-                   uint32_t src, uint32_t srcpitch,
-                   uint32_t dst, uint32_t dstpitch,
-                   int width, int height)
-{
-        uint32_t msk = src, mskx, dstx, m;
-        int line, x, i;
-#ifdef SX_DEBUG
-        char buffer[256];
-#endif
-        ENTER;
-
-        DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
-            read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
-            *(uint32_t *)(p->fb + p->srcoff));
-        for (line = 0; line < height; line++) {
-                mskx = msk;
-                dstx = dst;
-#ifdef SX_SINGLE
-
+#else /* SX_SINGLE */
                 for (x = 0; x < width; x++) {
                         m = *(volatile uint8_t *)(p->fb + mskx);
 #ifdef SX_DEBUG
@@ -217,45 +261,11 @@ void CG14Comp_Over8Solid(Cg14Ptr p,
                         dstx += 4;
                         mskx += 1;
                 }
+#endif /* SX_SINGLE */
 #ifdef SX_DEBUG
                 buffer[x] = 0;
                 xf86Msg(X_ERROR, "%s\n", buffer);
 #endif
-#else
-                for (x = 0; x < width; x += 4) {
-                        /* fetch 4 mask values */
-                        write_sx_io(p, mskx, SX_LDB(12, 3, mskx & 7));
-                        /* fetch destination pixels */
-                        write_sx_io(p, dstx, SX_LDUQ0(60, 3, dstx & 7));
-                        /* duplicate them for all channels */
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 13, 16, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 14, 20, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 15, 24, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(0, 12, 13, 2));
-                        /* generate inverted alpha */
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_XORS(12, 8, 28, 15));
-                        /* multiply source */
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_MUL16X16SR8(8, 12, 44, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_MUL16X16SR8(8, 16, 48, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_MUL16X16SR8(8, 20, 52, 3));
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_MUL16X16SR8(8, 24, 56, 3));
-                        /* multiply dest */
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_MUL16X16SR8(28, 60, 76, 15));
-                        /* add up */
-                        write_sx_reg(p, SX_INSTRUCTIONS,
-                            SX_ADDV(44, 76, 92, 15));
-                        /* write back */
-                        write_sx_io(p, dstx, SX_STUQ0C(92, 3, dstx & 7));
-                        dstx += 16;
-                        mskx += 4;
-                }
-#endif
                 dst += dstpitch;
                 msk += srcpitch;
         }
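
For reference, each 4-pixel batch in the new code path performs the same
per-channel arithmetic as the old single-pixel loop: scale the solid source
by the mask alpha, scale the destination by the inverted alpha, and add the
two products. The scalar sketch below is purely illustrative - it is not part
of the driver and ignores the SX register allocation - and assumes an opaque
solid source, 8-bit channels, and the multiply-then-shift-right-by-8
approximation of dividing by 255 that the SX_MUL16X16SR8 name describes.

#include <stdint.h>

/* one channel of a PictOpOver blend: src scaled by mask alpha m,
 * dst scaled by the inverted alpha (m ^ 0xff), results added */
static uint8_t
over_channel(uint8_t src, uint8_t dst, uint8_t m)
{
        return (uint8_t)(((src * m) >> 8) + ((dst * (m ^ 0xff)) >> 8));
}

/* roughly what one SX batch covers: four pixels of four channels each,
 * with one solid source colour and one mask value per pixel */
static void
over_quad(const uint8_t src[4], uint8_t dst[4][4], const uint8_t m[4])
{
        for (int px = 0; px < 4; px++)
                for (int ch = 0; ch < 4; ch++)
                        dst[px][ch] = over_channel(src[ch], dst[px][ch], m[px]);
}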