Re: [Mesa-dev] [PATCH 7/9] i965: Pack simple pipelined query objects into the same buffer

2018-10-06 Thread Chris Wilson
Quoting Kenneth Graunke (2018-10-06 02:57:29)
> On Tuesday, October 2, 2018 11:06:23 AM PDT Chris Wilson wrote:
> > Reuse the same query object buffer for multiple queries within the same
> > batch.
> > 
> > A task for the future is propagating the GL_NO_MEMORY errors.
> > 
> > Signed-off-by: Chris Wilson 
> > Cc: Kenneth Graunke 
> > Cc: Matt Turner 
> > ---
> >  src/mesa/drivers/dri/i965/brw_context.c   |  3 ++
> >  src/mesa/drivers/dri/i965/brw_context.h   | 10 +++--
> >  src/mesa/drivers/dri/i965/brw_queryobj.c  | 16 +++
> >  src/mesa/drivers/dri/i965/gen6_queryobj.c | 51 ++-
> >  4 files changed, 59 insertions(+), 21 deletions(-)
> 
> Don't want to do this.  This means that WaitQuery will wait on the whole
> group of packed queries instead of just the one the app asked about.
> 
> Vulkan has to pack queries by design, and it turns out this was a real
> issue there.  See b2c97bc789198427043cd902bc76e194e7e81c7d.  Jason ended
> up making it busy-wait to avoid bo_waiting on the entire pool, and it
> improved Serious Engine performance by 20%.
> 
> We could busy-wait in GL too, for lower latency but more CPU waste,
> but I think I prefer separate BOs + bo_wait.

It's the same latency for waits, since a new BO is used for each batch
and waits are on the batch fence, not on the individual write into the
query buffer. (So no change whatsoever for waits compared to the current
code.) Polling is improved, since there we can check the individual
fence. Pipelined queries are unaffected.

Now we could keep the query buffer across multiple batches and use
fences (userspace seqno + batch handles) to poll or wait on the partial
writes.
-Chris
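
A minimal sketch of that seqno idea (all names below are hypothetical,
not code from this series): each batch that writes query results also
bumps a seqno in a CPU-visible status page when it retires, a query only
remembers the seqno of its batch, and polling becomes a single read and
compare instead of a bo_wait on the shared buffer.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical per-query fence: the seqno of the batch that writes the
 * query's result slots.
 */
struct brw_query_fence {
   uint32_t seqno;
};

/* hw_status points at a CPU-visible page the GPU updates with the
 * seqno of the last completed batch; wraparound is handled by the
 * signed subtraction.
 */
static bool
brw_seqno_passed(const volatile uint32_t *hw_status, uint32_t seqno)
{
   return (int32_t)(*hw_status - seqno) >= 0;
}

static bool
brw_query_poll(const volatile uint32_t *hw_status,
               const struct brw_query_fence *fence)
{
   return brw_seqno_passed(hw_status, fence->seqno);
}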


Re: [Mesa-dev] [PATCH 7/9] i965: Pack simple pipelined query objects into the same buffer

2018-10-05 Thread Kenneth Graunke
On Tuesday, October 2, 2018 11:06:23 AM PDT Chris Wilson wrote:
> Reuse the same query object buffer for multiple queries within the same
> batch.
> 
> A task for the future is propagating the GL_NO_MEMORY errors.
> 
> Signed-off-by: Chris Wilson 
> Cc: Kenneth Graunke 
> Cc: Matt Turner 
> ---
>  src/mesa/drivers/dri/i965/brw_context.c   |  3 ++
>  src/mesa/drivers/dri/i965/brw_context.h   | 10 +++--
>  src/mesa/drivers/dri/i965/brw_queryobj.c  | 16 +++
>  src/mesa/drivers/dri/i965/gen6_queryobj.c | 51 ++-
>  4 files changed, 59 insertions(+), 21 deletions(-)

Don't want to do this.  This means that WaitQuery will wait on the whole
group of packed queries instead of just the one the app asked about.

Vulkan has to pack queries by design, and it turns out this was a real
issue there.  See b2c97bc789198427043cd902bc76e194e7e81c7d.  Jason ended
up making it busy-wait to avoid bo_waiting on the entire pool, and it
improved Serious Engine performance by 20%.

We could busy-wait in GL too, for lower latency but more CPU waste,
but I think I prefer separate BOs + bo_wait.
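
For reference, a minimal sketch of the busy-wait alternative being
weighed here (this is not the anv code from that commit; query_map and
QUERY_AVAILABLE_SLOT are made-up names): spin on the one query's
availability slot in the CPU mapping instead of bo_waiting on the whole
shared buffer, trading CPU time for latency.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical layout: one "available" slot per packed query. */
#define QUERY_AVAILABLE_SLOT 0

static bool
query_result_ready(const volatile uint64_t *query_map, unsigned index)
{
   return query_map[index + QUERY_AVAILABLE_SLOT] != 0;
}

static void
busy_wait_for_query(const volatile uint64_t *query_map, unsigned index)
{
   /* Burns CPU until the GPU writes this query's slot, but does not
    * block on every other query packed into the same buffer.
    */
   while (!query_result_ready(query_map, index))
      ; /* could sched_yield() or back off here */
}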




[Mesa-dev] [PATCH 7/9] i965: Pack simple pipelined query objects into the same buffer

2018-10-02 Thread Chris Wilson
Reuse the same query object buffer for multiple queries within the same
batch.

A task for the future is propagating the GL_NO_MEMORY errors.

Signed-off-by: Chris Wilson 
Cc: Kenneth Graunke 
Cc: Matt Turner 
---
 src/mesa/drivers/dri/i965/brw_context.c   |  3 ++
 src/mesa/drivers/dri/i965/brw_context.h   | 10 +++--
 src/mesa/drivers/dri/i965/brw_queryobj.c  | 16 +++
 src/mesa/drivers/dri/i965/gen6_queryobj.c | 51 ++-
 4 files changed, 59 insertions(+), 21 deletions(-)
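
A rough sketch of the packing scheme the gen6_queryobj.c changes imply
(the helper name and slot count are assumptions, not the patch itself):
each query grabs the next couple of uint64_t slots out of a shared
4096-byte brw->query.bo, and a fresh buffer is allocated when it fills
up; brw->query.last_index starts at 4096, so the first query in a
context always allocates.

/* Hypothetical helper, assuming the brw_context.h declarations below. */
static void
alloc_query_slots(struct brw_context *brw, struct brw_query_object *query)
{
   const unsigned slots = 2;                     /* e.g. begin/end snapshot */
   const unsigned max = 4096 / sizeof(uint64_t); /* slots in one page */

   if (brw->query.last_index + slots > max) {
      /* Buffer full (or never allocated, since last_index starts at
       * 4096): start a new shared buffer.
       */
      brw_bo_unreference(brw->query.bo);
      brw->query.bo = brw_bo_alloc(brw->bufmgr, "query", 4096,
                                   BRW_MEMZONE_OTHER);
      brw->query.last_index = 0;
   }

   brw_bo_unreference(query->bo);
   brw_bo_reference(brw->query.bo);
   query->bo = brw->query.bo;
   query->index = brw->query.last_index;
   brw->query.last_index += slots;
}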

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 6ba64e4e06d..53912c9c98e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -953,6 +953,8 @@ brwCreateContext(gl_api api,
 
brw->isl_dev = screen->isl_dev;
 
+   brw->query.last_index = 4096;
+
brw->vs.base.stage = MESA_SHADER_VERTEX;
brw->tcs.base.stage = MESA_SHADER_TESS_CTRL;
brw->tes.base.stage = MESA_SHADER_TESS_EVAL;
@@ -1164,6 +1166,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
brw_bo_unreference(brw->tes.base.push_const_bo);
brw_bo_unreference(brw->gs.base.push_const_bo);
brw_bo_unreference(brw->wm.base.push_const_bo);
+   brw_bo_unreference(brw->query.bo);
 
brw_destroy_hw_context(brw->bufmgr, brw->hw_ctx);
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 418941c9194..917bb3a7baf 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -462,7 +462,7 @@ struct brw_query_object {
uint64_t *results;
 
/** Last index in bo with query data for this object. */
-   int last_index;
+   unsigned index;
 
/** True if we know the batch has been flushed since we ended the query. */
bool flushed;
@@ -474,13 +474,13 @@ struct brw_query_object {
 static inline unsigned
 gen6_query_predicate_offset(const struct brw_query_object *query)
 {
-   return GEN6_QUERY_PREDICATE * sizeof(uint64_t);
+   return (query->index + GEN6_QUERY_PREDICATE) * sizeof(uint64_t);
 }
 
 static inline unsigned
 gen6_query_results_offset(const struct brw_query_object *query, unsigned idx)
 {
-   return (GEN6_QUERY_RESULTS + idx) * sizeof(uint64_t);
+   return (query->index + GEN6_QUERY_RESULTS + idx) * sizeof(uint64_t);
 }
 
 struct brw_reloc_list {
@@ -1199,6 +1199,10 @@ struct brw_context
} cc;
 
struct {
+  struct brw_bo *bo;
+  uint64_t *map;
+  unsigned last_index;
+
   struct brw_query_object *obj;
   bool begin_emitted;
} query;
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index bc4b8c43e7b..c77d630a138 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -188,7 +188,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
* run out of space in the query's BO and allocated a new one.  If so,
* this function was already called to accumulate the results so far.
*/
-  for (i = 0; i < query->last_index; i++) {
+  for (i = 0; i < query->index; i++) {
 query->Base.Result += results[i * 2 + 1] - results[i * 2];
   }
   break;
@@ -198,7 +198,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
   /* If the starting and ending PS_DEPTH_COUNT from any of the batches
* differ, then some fragments passed the depth test.
*/
-  for (i = 0; i < query->last_index; i++) {
+  for (i = 0; i < query->index; i++) {
 if (results[i * 2 + 1] != results[i * 2]) {
 query->Base.Result = GL_TRUE;
 break;
@@ -304,7 +304,7 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
*/
   brw_bo_unreference(query->bo);
   query->bo = NULL;
-  query->last_index = -1;
+  query->index = -1;
 
   brw->query.obj = query;
 
@@ -441,7 +441,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
 
assert(devinfo->gen < 6);
 
-   if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
+   if (!query->bo || query->index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
 
   if (query->bo != NULL) {
  /* The old query BO did not have enough space, so we allocated a new
@@ -452,7 +452,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
   }
 
   query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096, BRW_MEMZONE_OTHER);
-  query->last_index = 0;
+  query->index = 0;
}
 }
 
@@ -490,7 +490,7 @@ brw_emit_query_begin(struct brw_context *brw)
 
ensure_bo_has_space(ctx, query);
 
-   brw_write_depth_count(brw, query->bo, query->last_index * 2);
+   brw_write_depth_count(brw, query->bo, query->index * 2);
 
brw->query.begin_emitted = true;
 }
@@ -509,10 +509,10 @@ brw_emit_query_end(struct brw_context *brw)
if (!brw->query.begin_emitted)
   return;
 
-   brw_write_depth_count(brw, query->bo,