Y-tiling makes a mess of our cacheline WCB, forcing evictions and writes between each pixel of the linear_to_ytiled routines, effectively reducing the upload to UC performance (i.e. terrible). This patch takes the simple approach of doing the tiling into a temporary page and then copying the page to WC (so we have a very simple and efficient upload into WC, whilst keeping the tiler working in cache). --- src/mesa/drivers/dri/i965/intel_tex_subimage.c | 13 +- src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 161 ++++++++++++++++++++++++- src/mesa/drivers/dri/i965/intel_tiled_memcpy.h | 1 + 3 files changed, 160 insertions(+), 15 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index 56c6cbf7b8..e7486300ab 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -83,9 +83,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, struct brw_context *brw = brw_context(ctx); struct intel_texture_image *image = intel_texture_image(texImage); - /* The miptree's buffer. */ - struct brw_bo *bo; - mem_copy_fn mem_copy; /* This fastpath is restricted to specific texture types: @@ -136,20 +133,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, if (brw->gen < 5 && brw->has_swizzling) return false; - bo = image->mt->bo; - - /* Uploading into Y-tiling surfaces using WC is slow as each sequential - * write falls outside of the WCB, completely nerfing the WC performance. - */ - if (!bo->cache_coherent && image->mt->surf.tiling == ISL_TILING_Y0) - return false; - /* Since we are going to write raw data to the miptree, we need to resolve * any pending fast color clears before we start. 
*/ assert(image->mt->surf.logical_level0_px.depth == 1); assert(image->mt->surf.logical_level0_px.array_len == 1); + struct brw_bo *bo = image->mt->bo; if (brw_batch_references(&brw->batch, bo)) { perf_debug("Flushing before mapping a referenced bo.\n"); intel_batchbuffer_flush(brw); @@ -209,6 +199,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, map, pixels - (ptrdiff_t) level_y * src_pitch - (ptrdiff_t) level_x * cpp, image->mt->surf.row_pitch, src_pitch, + bo->cache_coherent, brw->has_swizzling, image->mt->surf.tiling, mem_copy diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c index 65dd950c08..7c9fefeda9 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c @@ -324,7 +324,7 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, static inline void linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t y0, uint32_t y1, - char *dst, const char *src, + char * __restrict__ dst, const char * __restrict__ src, int32_t src_pitch, uint32_t swizzle_bit, mem_copy_fn mem_copy, @@ -378,6 +378,104 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, } } +#if defined(__SSSE3__) +static inline void +linear_to_ytiled0(char * __restrict__ dst, const char * __restrict__ src, + int32_t src_pitch, + uint32_t swizzle_bit, + mem_copy_fn mem_copy) +{ + /* Y tiles consist of columns that are 'ytile_span' wide (and the same height + * as the tile). Thus the destination offset for (x,y) is the sum of: + * (x % column_width) // position within column + * (x / column_width) * bytes_per_column // column number * bytes per column + * y * column_width + * + * The copy destination offset for each range copied is the sum of + * an X offset 'xo0' or 'xo' and a Y offset 'yo.' 
+ */ + const uint32_t column_width = ytile_span; + const uint32_t bytes_per_column = column_width * ytile_height; + + uint32_t x, y; + + for (y = 0; y < ytile_height; y++) { + __m128i xmm[8]; + uint32_t swizzle = 0; + + if (mem_copy == memcpy) { + for (x = 0; x < 8; x++) + xmm[x] = _mm_loadu_si128((__m128i *)src + x); + + for (x = 0; x < 8; x++) { + _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]); + swizzle ^= swizzle_bit; + } + } else { + for (x = 0; x < 4; x++) + xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x), + *(__m128i *) rgba8_permutation); + + for (x = 0; x < 4; x++) { + _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]); + swizzle ^= swizzle_bit; + } + + for (x = 4; x < 8; x++) + xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x), + *(__m128i *) rgba8_permutation); + + for (x = 4; x < 8; x++) { + _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]); + swizzle ^= swizzle_bit; + } + } + + src += src_pitch; + } +} + +/** + * Copy texture data from linear to Y tile layout through WC. + * + * \copydoc tile_copy_fn + */ +static inline void +linear_to_ytiled_wc(char * __restrict__ dst, + const char * __restrict__ src, + int32_t src_pitch, + mem_copy_fn mem_copy) +{ + /* Y tiles consist of columns that are 'ytile_span' wide (and the same height + * as the tile). Thus the destination offset for (x,y) is the sum of: + * (x % column_width) // position within column + * (x / column_width) * bytes_per_column // column number * bytes per column + * y * column_width + * + * The copy destination offset for each range copied is the sum of + * an X offset 'xo0' or 'xo' and a Y offset 'yo.' 
+ */ + const uint32_t column_width = ytile_span; + const uint32_t bytes_per_column = column_width * ytile_height; + uint8_t tmp[4096] __attribute__((aligned(16))); + + uint32_t x, y; + + for (y = 0; y < ytile_height; y++) { + __m128i xmm[8]; + + for (x = 0; x < 8; x++) + xmm[x] = _mm_loadu_si128((__m128i *)src + x); + for (x = 0; x < 8; x++) + _mm_store_si128((__m128i *)(tmp + y * column_width + x*bytes_per_column), xmm[x]); + + src += src_pitch; + } + + mem_copy(dst, tmp, sizeof(tmp)); +} +#endif + /** * Copy texture data from X tile layout to linear. * @@ -547,6 +645,14 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy_fn mem_copy) { if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { +#ifdef __SSSE3__ + if (mem_copy == memcpy) + return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit, + memcpy); + else if (mem_copy == rgba8_copy_aligned_dst) + return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit, + rgba8_copy_aligned_dst); +#else if (mem_copy == memcpy) return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, memcpy, memcpy); @@ -554,6 +660,7 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, rgba8_copy, rgba8_copy_aligned_dst); +#endif /* __SSSE3__ */ else return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, @@ -575,6 +682,45 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, } /** + * Copy texture data from linear to Y tile layout using WC, faster. + * + * Same as \ref linear_to_ytiled but optimised for WC. 
+ * + * \copydoc tile_copy_fn + */ +static FLATTEN void +linear_to_ytiled_wc_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, + uint32_t y0, uint32_t y1, + char *dst, const char *src, + int32_t src_pitch, + uint32_t swizzle_bit, + mem_copy_fn mem_copy) +{ +#if defined(__SSSE3__) + if (x3 - x0 == ytile_width && y1 - y0 == ytile_height) { + if (mem_copy == memcpy) + return linear_to_ytiled_wc(dst, src, src_pitch, memcpy); + else + return linear_to_ytiled_wc(dst, src, src_pitch, mem_copy); + } else +#endif + { + if (mem_copy == memcpy) + return linear_to_ytiled(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + memcpy, memcpy); + else if (mem_copy == rgba8_copy) + return linear_to_ytiled(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + rgba8_copy, rgba8_copy_aligned_dst); + else + return linear_to_ytiled(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + mem_copy, mem_copy); + } +} + +/** * Copy texture data from X tile layout to linear, faster. * * Same as \ref xtile_to_linear but faster, because it passes constant @@ -680,6 +826,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2, uint32_t yt1, uint32_t yt2, char *dst, const char *src, uint32_t dst_pitch, int32_t src_pitch, + bool cache_coherent, bool has_swizzling, enum isl_tiling tiling, mem_copy_fn mem_copy) @@ -700,7 +847,12 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2, tw = ytile_width; th = ytile_height; span = ytile_span; - tile_copy = linear_to_ytiled_faster; + if (cache_coherent) + tile_copy = linear_to_ytiled_faster; + else if (has_swizzling) + unreachable("unsupported tiling"); + else + tile_copy = linear_to_ytiled_wc_faster; } else { unreachable("unsupported tiling"); } @@ -718,14 +870,15 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2, * Looping x inside y is the faster memory access pattern. 
*/ for (yt = yt0; yt < yt3; yt += th) { + uint32_t y0 = MAX2(yt1, yt); + uint32_t y1 = MIN2(yt2, yt + th); + for (xt = xt0; xt < xt3; xt += tw) { /* The area to update is [x0,x3) x [y0,y1). * May not want the whole tile, hence the min and max. */ uint32_t x0 = MAX2(xt1, xt); - uint32_t y0 = MAX2(yt1, yt); uint32_t x3 = MIN2(xt2, xt + tw); - uint32_t y1 = MIN2(yt2, yt + th); /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that * the middle interval is the longest span-aligned part. diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h index e9c43920a1..9d6c71d1cf 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h @@ -42,6 +42,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2, uint32_t yt1, uint32_t yt2, char *dst, const char *src, uint32_t dst_pitch, int32_t src_pitch, + bool cache_coherent, bool has_swizzling, enum isl_tiling tiling, mem_copy_fn mem_copy); -- 2.13.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev