Re: [Mesa-dev] [PATCH 4/6] i965: Use blorp+userptr for GPU readback

2017-10-17 Thread Chris Wilson
Quoting Chris Wilson (2017-10-13 10:34:54)
> The primary benefit for this is that we get format conversion for
> "free", along with detiling and cache flushing (most relevant for !llc).
> Using the GPU does impose a bandwidth cost that is presumably better
> used for rendering, hence we limit the use to readback into client
> memory (not pbo) where we would need to stall on the GPU anyway.
> (Uploads remain direct/staged to avoid the synchronisation cost.)
> And we only use the GPU path if a direct read into client memory from
> video memory is unavailable.
> 
> The ultimate user of this is Xorg/glamor! On byt, bsw, bxt (and
> presumably but not measured ilk), x11perf -shmget500 is improved by
> 15-fold. Though conversely the overhead of executing and waiting upon an
> additional blorp batch is shown by x11perf -shmget10 being reduced by a
> factor of 2. I think it is fair to presume that large copies will
> dominate (and that the overhead of a single batch is something that we
> can iteratively reduce, for the benefit of all.) llc machines continue to
> use direct access where there is no format changes (which one hopes is
> the typical use case).

Ah, this needs some improvements to the direct read path
(intel_gettexsubimage_tiled_memcpy) to handle subimages for llc + Xorg.
I have those in the older patches to enable userptr readback, I'll dig
those out again.
-Chris
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/6] i965: Use blorp+userptr for GPU readback

2017-10-13 Thread Chris Wilson
The primary benefit for this is that we get format conversion for
"free", along with detiling and cache flushing (most relevant for !llc).
Using the GPU does impose a bandwidth cost that is presumably better
used for rendering, hence we limit the use to readback into client
memory (not pbo) where we would need to stall on the GPU anyway.
(Uploads remain direct/staged to avoid the synchronisation cost.)
And we only use the GPU path if a direct read into client memory from
video memory is unavailable.

The ultimate user of this is Xorg/glamor! On byt, bsw, bxt (and
presumably but not measured ilk), x11perf -shmget500 is improved by
15-fold. Though conversely the overhead of executing and waiting upon an
additional blorp batch is shown by x11perf -shmget10 being reduced by a
factor of 2. I think it is fair to presume that large copies will
dominate (and that the overhead of a single batch is something that we
can iteratively reduce, for the benefit of all.) llc machines continue to
use direct access where there is no format changes (which one hopes is
the typical use case).

Cc: Jason Ekstrand 
Cc: Topi Pohjolainen 
Cc: Kenneth Graunke 
---
 src/mesa/drivers/dri/i965/brw_blorp.c| 34 ++--
 src/mesa/drivers/dri/i965/intel_pixel_read.c | 21 -
 src/mesa/drivers/dri/i965/intel_tex_image.c  | 27 +++---
 3 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c 
b/src/mesa/drivers/dri/i965/brw_blorp.c
index ed4f9870f2..c2a549b204 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -36,6 +36,7 @@
 #include "brw_defines.h"
 #include "brw_meta_util.h"
 #include "brw_state.h"
+#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 #include "common/gen_debug.h"
@@ -806,13 +807,10 @@ blorp_get_client_bo(struct brw_context *brw,
 
   *offset_out = offset;
   return bo;
-   } else {
+   } else if (read_only) {
   /* Someone should have already checked that there is data to upload. */
   assert(pixels);
 
-  /* Creating a temp buffer currently only works for upload */
-  assert(read_only);
-
   /* This is not a user-provided PBO.  Instead, pixels is a pointer to CPU
* data which we need to copy into a BO.
*/
@@ -832,6 +830,23 @@ blorp_get_client_bo(struct brw_context *brw,
 
   *offset_out = 0;
   return bo;
+   } else if (brw->screen->kernel_featuers & KERNEL_ALLOWS_USERPTR) {
+  void *addr = (void *)pixels + first_pixel;
+  void *first_page = (void *)((GLintptr)addr & -4096);
+  void *last_page = (void *)(ALIGN((GLintptr)(pixels + last_pixel), 4096));
+
+  struct brw_bo *bo =
+brw_bo_alloc_userptr(brw->bufmgr, "tex_subimage_userptr",
+ first_page, last_page - first_page, 0);
+  if (bo == NULL) {
+ perf_debug("intel_texsubimage: userptr mapping failed\n");
+ return NULL;
+  }
+
+  *offset_out = addr - first_page;
+  return bo;
+   } else {
+  return NULL;
}
 }
 
@@ -974,6 +989,10 @@ brw_blorp_upload_miptree(struct brw_context *brw,
result = true;
 
 err:
+   if (src_bo->userptr) {
+  intel_batchbuffer_flush(brw);
+  brw_bo_wait_rendering(src_bo);
+   }
brw_bo_unreference(src_bo);
 
return result;
@@ -1019,9 +1038,6 @@ brw_blorp_download_miptree(struct brw_context *brw,
   break;
}
 
-   /* This pass only works for PBOs */
-   assert(_mesa_is_bufferobj(packing->BufferObj));
-
uint32_t dst_offset, dst_row_stride, dst_image_stride;
struct brw_bo *dst_bo =
   blorp_get_client_bo(brw, width, height, depth,
@@ -1117,6 +1133,10 @@ brw_blorp_download_miptree(struct brw_context *brw,
brw_emit_mi_flush(brw);
 
 err:
+   if (dst_bo->userptr) {
+  intel_batchbuffer_flush(brw);
+  brw_bo_wait_rendering(dst_bo);
+   }
brw_bo_unreference(dst_bo);
 
return result;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c 
b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 4528d6d265..699ce73b0f 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -259,8 +259,6 @@ intelReadPixels(struct gl_context * ctx,
 GLenum format, GLenum type,
 const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
 {
-   bool ok;
-
struct brw_context *brw = brw_context(ctx);
bool dirty;
 
@@ -273,18 +271,19 @@ intelReadPixels(struct gl_context * ctx,
intel_prepare_render(brw);
brw->front_buffer_dirty = dirty;
 
-   if (_mesa_is_bufferobj(pack->BufferObj)) {
-  if (intel_readpixels_blorp(ctx, x, y, width, height,
- format, type, pixels, pack))
- return;
-
-  perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
+   if