tiled_memcpy: ytiled_to_linear a cache line at a time

Jason Ekstrand Mon, 30 Apr 2018 12:08:41 -0700

Reviewed-by: Jason Ekstrand <[email protected]>

On Mon, Apr 30, 2018 at 10:25 AM, Scott D Phillips <
[email protected]> wrote:


> Similar to the transformation applied to linear_to_ytiled, also align
> each readback from the ytiled source to a cache line (i.e. transfer a
> whole cache line from the source before moving on to the next column).
> This will allow us to utilize movntdqa (_mm_stream_load_si128) in a
> subsequent patch to obtain near-WB readback performance when accessing
> the uncached ytiled memory, an order of magnitude improvement.
>
> Reviewed-by: Chris Wilson <[email protected]>
> ---
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 72
> +++++++++++++++++++++++---
>  1 file changed, 66 insertions(+), 6 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index 69306828d72..7c6bde990d6 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -451,7 +451,7 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t
> x2, uint32_t x3,
>   */
>  static inline void
>  ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> -                 uint32_t y0, uint32_t y1,
> +                 uint32_t y0, uint32_t y3,
>                   char *dst, const char *src,
>                   int32_t dst_pitch,
>                   uint32_t swizzle_bit,
> @@ -470,6 +470,9 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t
> x2, uint32_t x3,
>     const uint32_t column_width = ytile_span;
>     const uint32_t bytes_per_column = column_width * ytile_height;
>
> +   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
> +   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
> +
>     uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) *
> bytes_per_column;
>     uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) *
> bytes_per_column;
>
> @@ -485,24 +488,81 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t
> x2, uint32_t x3,
>
>     dst += (ptrdiff_t)y0 * dst_pitch;
>
> -   for (yo = y0 * column_width; yo < y1 * column_width; yo +=
> column_width) {
> +   if (y0 != y1) {
> +      for (yo = y0 * column_width; yo < y1 * column_width; yo +=
> column_width) {
> +         uint32_t xo = xo1;
> +         uint32_t swizzle = swizzle1;
> +
> +         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
> +
> +         /* Step by spans/columns.  As it happens, the swizzle bit flips
> +          * at each step so we don't need to calculate it explicitly.
> +          */
> +         for (x = x1; x < x2; x += ytile_span) {
> +            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle),
> ytile_span);
> +            xo += bytes_per_column;
> +            swizzle ^= swizzle_bit;
> +         }
> +
> +         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
> +
> +         dst += dst_pitch;
> +      }
> +   }
> +
> +   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 *
> column_width) {
>        uint32_t xo = xo1;
>        uint32_t swizzle = swizzle1;
>
> -      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
> +      if (x0 != x1) {
> +         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 *
> column_width) ^ swizzle0), x1 - x0);
> +         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 *
> column_width) ^ swizzle0), x1 - x0);
> +         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 *
> column_width) ^ swizzle0), x1 - x0);
> +         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 *
> column_width) ^ swizzle0), x1 - x0);
> +      }
>
>        /* Step by spans/columns.  As it happens, the swizzle bit flips
>         * at each step so we don't need to calculate it explicitly.
>         */
>        for (x = x1; x < x2; x += ytile_span) {
> -         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle),
> ytile_span);
> +         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 *
> column_width) ^ swizzle), ytile_span);
> +         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 *
> column_width) ^ swizzle), ytile_span);
> +         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 *
> column_width) ^ swizzle), ytile_span);
> +         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 *
> column_width) ^ swizzle), ytile_span);
>           xo += bytes_per_column;
>           swizzle ^= swizzle_bit;
>        }
>
> -      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
> +      if (x2 != x3) {
> +         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 *
> column_width) ^ swizzle), x3 - x2);
> +         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 *
> column_width) ^ swizzle), x3 - x2);
> +         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 *
> column_width) ^ swizzle), x3 - x2);
> +         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 *
> column_width) ^ swizzle), x3 - x2);
> +      }
>
> -      dst += dst_pitch;
> +      dst += 4 * dst_pitch;
> +   }
> +
> +   if (y2 != y3) {
> +      for (yo = y2 * column_width; yo < y3 * column_width; yo +=
> column_width) {
> +         uint32_t xo = xo1;
> +         uint32_t swizzle = swizzle1;
> +
> +         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
> +
> +         /* Step by spans/columns.  As it happens, the swizzle bit flips
> +          * at each step so we don't need to calculate it explicitly.
> +          */
> +         for (x = x1; x < x2; x += ytile_span) {
> +            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle),
> ytile_span);
> +            xo += bytes_per_column;
> +            swizzle ^= swizzle_bit;
> +         }
> +
> +         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
> +
> +         dst += dst_pitch;
> +      }
>     }
>  }
>
> --
> 2.14.3
>
>

_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 08/13] i965/tiled_memcpy: ytiled_to_linear a cache line at a time

Reply via email to