Module: Mesa Branch: llvmpipe-rast-64 Commit: 98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f
Author: Zack Rusin <za...@vmware.com> Date: Tue Nov 19 16:02:34 2013 -0500 llvmpipe: add support for 32bit optimized sse paths if the triangle is bounded by a 128x128 box then we can use only 32bit arithmetic, which is nice because the sse paths for it can be nicely optimized. --- src/gallium/drivers/llvmpipe/lp_rast.c | 11 ++ src/gallium/drivers/llvmpipe/lp_rast.h | 33 ++++++- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 27 ++++++ src/gallium/drivers/llvmpipe/lp_rast_tri.c | 120 ++++++++++++++++++------ src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 6 +- src/gallium/drivers/llvmpipe/lp_setup_tri.c | 35 ++++++- 6 files changed, 191 insertions(+), 41 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index af661e9..0cd62c2 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -589,6 +589,17 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] = lp_rast_begin_query, lp_rast_end_query, lp_rast_set_state, + lp_rast_triangle_32_1, + lp_rast_triangle_32_2, + lp_rast_triangle_32_3, + lp_rast_triangle_32_4, + lp_rast_triangle_32_5, + lp_rast_triangle_32_6, + lp_rast_triangle_32_7, + lp_rast_triangle_32_8, + lp_rast_triangle_32_3_4, + lp_rast_triangle_32_3_16, + lp_rast_triangle_32_4_16 }; diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index ce60665..b81d94f 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -60,6 +60,8 @@ struct cmd_bin; */ #define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER)) +#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER)) + /* Rasterizer output size going to jit fs, width/height */ #define LP_RASTER_BLOCK_SIZE 4 @@ -104,9 +106,6 @@ struct lp_rast_shader_inputs { /* followed by a0, dadx, dady and planes[] */ }; -/* Note: the order of these values is important as they are loaded by - * sse code in rasterization: - */ struct 
lp_rast_plane { /* edge function values at minx,miny ?? */ int64_t c; @@ -279,8 +278,19 @@ lp_rast_arg_null( void ) #define LP_RAST_OP_BEGIN_QUERY 0xf #define LP_RAST_OP_END_QUERY 0x10 #define LP_RAST_OP_SET_STATE 0x11 - -#define LP_RAST_OP_MAX 0x12 +#define LP_RAST_OP_TRIANGLE_32_1 0x12 +#define LP_RAST_OP_TRIANGLE_32_2 0x13 +#define LP_RAST_OP_TRIANGLE_32_3 0x14 +#define LP_RAST_OP_TRIANGLE_32_4 0x15 +#define LP_RAST_OP_TRIANGLE_32_5 0x16 +#define LP_RAST_OP_TRIANGLE_32_6 0x17 +#define LP_RAST_OP_TRIANGLE_32_7 0x18 +#define LP_RAST_OP_TRIANGLE_32_8 0x19 +#define LP_RAST_OP_TRIANGLE_32_3_4 0x1a +#define LP_RAST_OP_TRIANGLE_32_3_16 0x1b +#define LP_RAST_OP_TRIANGLE_32_4_16 0x1c + +#define LP_RAST_OP_MAX 0x1d #define LP_RAST_OP_MASK 0xff void @@ -291,4 +301,17 @@ void lp_debug_draw_bins_by_coverage( struct lp_scene *scene ); +#ifdef PIPE_ARCH_SSE +#include <emmintrin.h> +#include "util/u_sse.h" + +static INLINE __m128i +lp_plane_to_m128i(const struct lp_rast_plane *plane) +{ + return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, + (int32_t)plane->dcdy, (int32_t)plane->eo); +} + +#endif + #endif diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index 41fe097..77ec329 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -355,6 +355,33 @@ void lp_rast_triangle_3_16( struct lp_rasterizer_task *, void lp_rast_triangle_4_16( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_1( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_2( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_3( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_4( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_5( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void 
lp_rast_triangle_32_6( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_7( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_8( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_3_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_4_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + void lp_rast_set_state(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg); diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 8ddda5e..41f6fbf 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -35,12 +35,6 @@ #include "lp_perf.h" #include "lp_rast_priv.h" -/* TODO */ -#undef PIPE_ARCH_SSE - - - - /** * Shade all pixels in a 4x4 block. 
*/ @@ -69,8 +63,6 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } -#if !defined(PIPE_ARCH_SSE) - static INLINE unsigned build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) { @@ -125,6 +117,13 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task, } void +lp_rast_triangle_3_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + lp_rast_triangle_3_16(task, arg); +} + +void lp_rast_triangle_4_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { @@ -134,11 +133,33 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task, lp_rast_triangle_4(task, arg2); } +#if !defined(PIPE_ARCH_SSE) + void -lp_rast_triangle_3_4(struct lp_rasterizer_task *task, +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<3)-1; + lp_rast_triangle_32_3(task, arg2); +} + +void +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_32_4(task, arg2); +} + +void +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { - lp_rast_triangle_3_16(task, arg); + lp_rast_triangle_32_3_16(task, arg); } #else @@ -147,12 +168,12 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task, static INLINE void -build_masks(int c, - int cdiff, - int dcdx, - int dcdy, - unsigned *outmask, - unsigned *partmask) +build_masks_32(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = _mm_set1_epi32(dcdy); @@ -193,7 +214,7 @@ build_masks(int c, static INLINE unsigned -build_mask_linear(int c, int dcdx, int dcdy) +build_mask_linear_32(int c, int dcdx, int dcdy) { __m128i cstep0 
= _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = _mm_set1_epi32(dcdy); @@ -251,7 +272,7 @@ sign_bits4(const __m128i *cstep, int cdiff) void -lp_rast_triangle_3_16(struct lp_rasterizer_task *task, +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; @@ -263,9 +284,9 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task, struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; - __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; @@ -365,7 +386,7 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task, void -lp_rast_triangle_3_4(struct lp_rasterizer_task *task, +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; @@ -373,9 +394,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task, unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; unsigned y = (arg.triangle.plane_mask >> 8) + task->y; - __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; @@ -453,7 +474,8 @@ 
lp_rast_triangle_3_4(struct lp_rasterizer_task *task, #endif - +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) #define TAG(x) x##_1 #define NR_PLANES 1 @@ -471,7 +493,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task, #define TAG(x) x##_4 #define NR_PLANES 4 -#define TRI_16 lp_rast_triangle_4_16 +/*#define TRI_16 lp_rast_triangle_4_16*/ #include "lp_rast_tri_tmp.h" #define TAG(x) x##_5 @@ -490,3 +512,47 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task, #define NR_PLANES 8 #include "lp_rast_tri_tmp.h" +#ifdef PIPE_ARCH_SSE +#undef BUILD_MASKS +#undef BUILD_MASK_LINEAR +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy) +#endif + +#define TAG(x) x##_32_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_2 +#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_3 +#define NR_PLANES 3 +/*#define TRI_4 lp_rast_triangle_3_4*/ +/*#define TRI_16 lp_rast_triangle_3_16*/ +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_4 +#define NR_PLANES 4 +#ifdef PIPE_ARCH_SSE +#define TRI_16 lp_rast_triangle_32_4_16 +#endif +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_8 +#define NR_PLANES 8 +#include "lp_rast_tri_tmp.h" + diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index a00cbb2..52f6e99 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -50,7 +50,7 @@ TAG(do_block_4)(struct 
lp_rasterizer_task *task, int j; for (j = 0; j < NR_PLANES; j++) { - mask &= ~build_mask_linear(c[j] - 1, + mask &= ~BUILD_MASK_LINEAR(c[j] - 1, -plane[j].dcdx, plane[j].dcdy); } @@ -85,7 +85,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; const int64_t cio = IMUL64(ei, 4) - 1; - build_masks(c[j] + cox, + BUILD_MASKS(c[j] + cox, cio - cox, dcdx, dcdy, &outmask, /* sign bits from c[i][0..15] + cox */ @@ -185,7 +185,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; const int64_t cio = IMUL64(ei, 16) - 1; - build_masks(c[j] + cox, + BUILD_MASKS(c[j] + cox, cio - cox, dcdx, dcdy, &outmask, /* sign bits from c[i][0..15] + cox */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 2a77987..62d2855 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -190,6 +190,19 @@ lp_rast_tri_tab[MAX_PLANES+1] = { LP_RAST_OP_TRIANGLE_8 }; +static unsigned +lp_rast_32_tri_tab[MAX_PLANES+1] = { + 0, /* should be impossible */ + LP_RAST_OP_TRIANGLE_32_1, + LP_RAST_OP_TRIANGLE_32_2, + LP_RAST_OP_TRIANGLE_32_3, + LP_RAST_OP_TRIANGLE_32_4, + LP_RAST_OP_TRIANGLE_32_5, + LP_RAST_OP_TRIANGLE_32_6, + LP_RAST_OP_TRIANGLE_32_7, + LP_RAST_OP_TRIANGLE_32_8 +}; + /** @@ -586,7 +599,6 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, struct lp_scene *scene = setup->scene; struct u_rect trimmed_box = *bbox; int i; - /* What is the largest power-of-two boundary this triangle crosses: */ int dx = floor_pot((bbox->x0 ^ bbox->x1) | @@ -595,8 +607,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, /* The largest dimension of the rasterized area of the triangle * (aligned to a 4x4 grid), rounded down to the nearest power of two: */ - int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) | - (bbox->y1 - (bbox->y0 & ~3))); + int max_sz = 
((bbox->x1 - (bbox->x0 & ~3)) | + (bbox->y1 - (bbox->y0 & ~3))); + int sz = floor_pot(max_sz); + boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32; /* Now apply scissor, etc to the bounding box. Could do this * earlier, but it confuses the logic for tri-16 and would force @@ -627,6 +641,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, assert(py + 4 <= TILE_SIZE); return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_3_4 : LP_RAST_OP_TRIANGLE_3_4, lp_rast_arg_triangle_contained(tri, px, py) ); } @@ -649,6 +665,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_3_16 : LP_RAST_OP_TRIANGLE_3_16, lp_rast_arg_triangle_contained(tri, px, py) ); } @@ -663,6 +681,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_4_16 : LP_RAST_OP_TRIANGLE_4_16, lp_rast_arg_triangle_contained(tri, px, py)); } @@ -670,9 +690,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, /* Triangle is contained in a single tile: */ - return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored, - lp_rast_tri_tab[nr_planes], - lp_rast_arg_triangle(tri, (1<<nr_planes)-1) ); + return lp_scene_bin_cmd_with_state( + scene, ix0, iy0, setup->fs.stored, + use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes], + lp_rast_arg_triangle(tri, (1<<nr_planes)-1)); } else { @@ -746,6 +767,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup, if (!lp_scene_bin_cmd_with_state( scene, x, y, setup->fs.stored, + use_32bits ? 
+ lp_rast_32_tri_tab[count] : lp_rast_tri_tab[count], lp_rast_arg_triangle(tri, partial) )) goto fail; _______________________________________________ mesa-commit mailing list mesa-commit@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-commit