Re: [Mesa-dev] [PATCH 10/10] gallium/u_threaded: don't run out of memory with staging uploads

2018-01-22 Thread Marek Olšák
On Mon, Jan 22, 2018 at 2:07 PM, Nicolai Hähnle  wrote:
> On 10.01.2018 20:49, Marek Olšák wrote:
>>
>> From: Marek Olšák 
>>
>> Cc: 17.2 17.3 
>> ---
>>   src/gallium/auxiliary/util/u_threaded_context.c | 13 +
>>   src/gallium/auxiliary/util/u_threaded_context.h |  8 
>>   2 files changed, 21 insertions(+)
>>
>> diff --git a/src/gallium/auxiliary/util/u_threaded_context.c
>> b/src/gallium/auxiliary/util/u_threaded_context.c
>> index ffa8247..7bd13cf 100644
>> --- a/src/gallium/auxiliary/util/u_threaded_context.c
>> +++ b/src/gallium/auxiliary/util/u_threaded_context.c
>> @@ -1508,35 +1508,47 @@ struct tc_resource_copy_region {
>>   };
>> static void
>>   tc_resource_copy_region(struct pipe_context *_pipe,
>>   struct pipe_resource *dst, unsigned dst_level,
>>   unsigned dstx, unsigned dsty, unsigned dstz,
>>   struct pipe_resource *src, unsigned src_level,
>>   const struct pipe_box *src_box);
>> static void
>> +tc_notify_staging_upload_done(struct threaded_context *tc, unsigned size)
>> +{
>> +   tc->unflushed_transfer_size += size;
>> +
>> +   if (tc->unflushed_transfer_size >
>> TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE) {
>> +  tc->base.flush(>base, NULL, PIPE_FLUSH_ASYNC);
>> +  tc->unflushed_transfer_size = 0;
>> +   }
>> +}
>> +
>> +static void
>>   tc_buffer_do_flush_region(struct threaded_context *tc,
>> struct threaded_transfer *ttrans,
>> const struct pipe_box *box)
>>   {
>>  struct threaded_resource *tres =
>> threaded_resource(ttrans->b.resource);
>>if (ttrans->staging) {
>> struct pipe_box src_box;
>>   u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
>>  box->width, _box);
>>   /* Copy the staging buffer into the original one. */
>> tc_resource_copy_region(>base, ttrans->b.resource, 0, box->x,
>> 0, 0,
>> ttrans->staging, 0, _box);
>> +  tc_notify_staging_upload_done(tc, box->width);
>>  }
>>util_range_add(tres->base_valid_buffer_range, box->x, box->x +
>> box->width);
>>   }
>> static void
>>   tc_transfer_flush_region(struct pipe_context *_pipe,
>>struct pipe_transfer *transfer,
>>const struct pipe_box *rel_box)
>>   {
>> @@ -1653,20 +1665,21 @@ tc_buffer_subdata(struct pipe_context *_pipe,
>>/* The upload is small. Enqueue it. */
>>  struct tc_buffer_subdata *p =
>> tc_add_slot_based_call(tc, TC_CALL_buffer_subdata,
>> tc_buffer_subdata, size);
>>tc_set_resource_reference(>resource, resource);
>>  p->usage = usage;
>>  p->offset = offset;
>>  p->size = size;
>>  memcpy(p->slot, data, size);
>> +   tc_notify_staging_upload_done(tc, size);
>>   }
>> struct tc_texture_subdata {
>>  struct pipe_resource *resource;
>>  unsigned level, usage, stride, layer_stride;
>>  struct pipe_box box;
>>  char slot[0]; /* more will be allocated if needed */
>>   };
>> static void
>> diff --git a/src/gallium/auxiliary/util/u_threaded_context.h
>> b/src/gallium/auxiliary/util/u_threaded_context.h
>> index 53c5a7e..295464a 100644
>> --- a/src/gallium/auxiliary/util/u_threaded_context.h
>> +++ b/src/gallium/auxiliary/util/u_threaded_context.h
>> @@ -225,20 +225,27 @@ struct tc_unflushed_batch_token;
>>   /* Threshold for when to use the queue or sync. */
>>   #define TC_MAX_STRING_MARKER_BYTES  512
>> /* Threshold for when to enqueue buffer/texture_subdata as-is.
>>* If the upload size is greater than this, it will do instead:
>>* - for buffers: DISCARD_RANGE is done by the threaded context
>>* - for textures: sync and call the driver directly
>>*/
>>   #define TC_MAX_SUBDATA_BYTES320
>>   +/* Every staging upload allocates memory. If we have too many uploads
>> + * in a row without flushes, we might run out of memory. This limit
>> controls
>> + * how many bytes of queued uploads we can have at a time. If we go over,
>> + * the threaded context triggers a context flush.
>> + */
>> +#define TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE (512 * 1024 * 1024)
>
>
> This seems very aggressive. In reality, this should probably scale with free
> space in GART, but we don't know that here. Can you reduce it to something
> like 64MB? Unless there's concrete evidence that having it higher is
> beneficial, of course.

Well the current limit is infinite. I think 64MB is too low. I would
expect games to use 2-4GB of VRAM per IB. 512MB seems reasonable for a
GTT upload limit. 512MB might not be great for small APUs, but it's
definitely not very high for mid-sized dGPUs, and context flushes can
be bad for perf.

I know that Metro 2033 is slower if I decrease the limit a lot, but I
don't remember what size I 

Re: [Mesa-dev] [PATCH 10/10] gallium/u_threaded: don't run out of memory with staging uploads

2018-01-22 Thread Nicolai Hähnle

On 10.01.2018 20:49, Marek Olšák wrote:

From: Marek Olšák 

Cc: 17.2 17.3 
---
  src/gallium/auxiliary/util/u_threaded_context.c | 13 +
  src/gallium/auxiliary/util/u_threaded_context.h |  8 
  2 files changed, 21 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c 
b/src/gallium/auxiliary/util/u_threaded_context.c
index ffa8247..7bd13cf 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1508,35 +1508,47 @@ struct tc_resource_copy_region {
  };
  
  static void

  tc_resource_copy_region(struct pipe_context *_pipe,
  struct pipe_resource *dst, unsigned dst_level,
  unsigned dstx, unsigned dsty, unsigned dstz,
  struct pipe_resource *src, unsigned src_level,
  const struct pipe_box *src_box);
  
  static void

+tc_notify_staging_upload_done(struct threaded_context *tc, unsigned size)
+{
+   tc->unflushed_transfer_size += size;
+
+   if (tc->unflushed_transfer_size > TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE) {
+  tc->base.flush(>base, NULL, PIPE_FLUSH_ASYNC);
+  tc->unflushed_transfer_size = 0;
+   }
+}
+
+static void
  tc_buffer_do_flush_region(struct threaded_context *tc,
struct threaded_transfer *ttrans,
const struct pipe_box *box)
  {
 struct threaded_resource *tres = threaded_resource(ttrans->b.resource);
  
 if (ttrans->staging) {

struct pipe_box src_box;
  
u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,

 box->width, _box);
  
/* Copy the staging buffer into the original one. */

tc_resource_copy_region(>base, ttrans->b.resource, 0, box->x, 0, 0,
ttrans->staging, 0, _box);
+  tc_notify_staging_upload_done(tc, box->width);
 }
  
 util_range_add(tres->base_valid_buffer_range, box->x, box->x + box->width);

  }
  
  static void

  tc_transfer_flush_region(struct pipe_context *_pipe,
   struct pipe_transfer *transfer,
   const struct pipe_box *rel_box)
  {
@@ -1653,20 +1665,21 @@ tc_buffer_subdata(struct pipe_context *_pipe,
  
 /* The upload is small. Enqueue it. */

 struct tc_buffer_subdata *p =
tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, 
size);
  
 tc_set_resource_reference(>resource, resource);

 p->usage = usage;
 p->offset = offset;
 p->size = size;
 memcpy(p->slot, data, size);
+   tc_notify_staging_upload_done(tc, size);
  }
  
  struct tc_texture_subdata {

 struct pipe_resource *resource;
 unsigned level, usage, stride, layer_stride;
 struct pipe_box box;
 char slot[0]; /* more will be allocated if needed */
  };
  
  static void

diff --git a/src/gallium/auxiliary/util/u_threaded_context.h 
b/src/gallium/auxiliary/util/u_threaded_context.h
index 53c5a7e..295464a 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -225,20 +225,27 @@ struct tc_unflushed_batch_token;
  /* Threshold for when to use the queue or sync. */
  #define TC_MAX_STRING_MARKER_BYTES  512
  
  /* Threshold for when to enqueue buffer/texture_subdata as-is.

   * If the upload size is greater than this, it will do instead:
   * - for buffers: DISCARD_RANGE is done by the threaded context
   * - for textures: sync and call the driver directly
   */
  #define TC_MAX_SUBDATA_BYTES320
  
+/* Every staging upload allocates memory. If we have too many uploads

+ * in a row without flushes, we might run out of memory. This limit controls
+ * how many bytes of queued uploads we can have at a time. If we go over,
+ * the threaded context triggers a context flush.
+ */
+#define TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE (512 * 1024 * 1024)


This seems very aggressive. In reality, this should probably scale with 
free space in GART, but we don't know that here. Can you reduce it to 
something like 64MB? Unless there's concrete evidence that having it 
higher is beneficial, of course.


With that, patches 9 & 10:

Reviewed-by: Nicolai Hähnle 



+
  typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
 struct pipe_resource *dst,
 struct pipe_resource *src);
  typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context 
*ctx,
struct 
tc_unflushed_batch_token *token);
  
  struct threaded_resource {

 struct pipe_resource b;
 const struct u_resource_vtbl *vtbl;
  
@@ -346,20 +353,21 @@ struct tc_batch {

 struct tc_call call[TC_CALLS_PER_BATCH];
  };
  
  struct threaded_context {

 

Re: [Mesa-dev] [PATCH 10/10] gallium/u_threaded: don't run out of memory with staging uploads

2018-01-10 Thread Dieter Nützel

For the series:

Tested-by: Dieter Nützel 

on RX580 with UH, UV, glmark2 and Blender 2.79 with and without nir.

Dieter

Am 10.01.2018 20:49, schrieb Marek Olšák:

From: Marek Olšák 

Cc: 17.2 17.3 
---
 src/gallium/auxiliary/util/u_threaded_context.c | 13 +
 src/gallium/auxiliary/util/u_threaded_context.h |  8 
 2 files changed, 21 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c
b/src/gallium/auxiliary/util/u_threaded_context.c
index ffa8247..7bd13cf 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1508,35 +1508,47 @@ struct tc_resource_copy_region {
 };

 static void
 tc_resource_copy_region(struct pipe_context *_pipe,
 struct pipe_resource *dst, unsigned dst_level,
 unsigned dstx, unsigned dsty, unsigned dstz,
 struct pipe_resource *src, unsigned src_level,
 const struct pipe_box *src_box);

 static void
+tc_notify_staging_upload_done(struct threaded_context *tc, unsigned 
size)

+{
+   tc->unflushed_transfer_size += size;
+
+   if (tc->unflushed_transfer_size > 
TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE) {

+  tc->base.flush(>base, NULL, PIPE_FLUSH_ASYNC);
+  tc->unflushed_transfer_size = 0;
+   }
+}
+
+static void
 tc_buffer_do_flush_region(struct threaded_context *tc,
   struct threaded_transfer *ttrans,
   const struct pipe_box *box)
 {
struct threaded_resource *tres = 
threaded_resource(ttrans->b.resource);


if (ttrans->staging) {
   struct pipe_box src_box;

   u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
box->width, _box);

   /* Copy the staging buffer into the original one. */
   tc_resource_copy_region(>base, ttrans->b.resource, 0, 
box->x, 0, 0,

   ttrans->staging, 0, _box);
+  tc_notify_staging_upload_done(tc, box->width);
}

util_range_add(tres->base_valid_buffer_range, box->x, box->x + 
box->width);

 }

 static void
 tc_transfer_flush_region(struct pipe_context *_pipe,
  struct pipe_transfer *transfer,
  const struct pipe_box *rel_box)
 {
@@ -1653,20 +1665,21 @@ tc_buffer_subdata(struct pipe_context *_pipe,

/* The upload is small. Enqueue it. */
struct tc_buffer_subdata *p =
   tc_add_slot_based_call(tc, TC_CALL_buffer_subdata,
tc_buffer_subdata, size);

tc_set_resource_reference(>resource, resource);
p->usage = usage;
p->offset = offset;
p->size = size;
memcpy(p->slot, data, size);
+   tc_notify_staging_upload_done(tc, size);
 }

 struct tc_texture_subdata {
struct pipe_resource *resource;
unsigned level, usage, stride, layer_stride;
struct pipe_box box;
char slot[0]; /* more will be allocated if needed */
 };

 static void
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h
b/src/gallium/auxiliary/util/u_threaded_context.h
index 53c5a7e..295464a 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -225,20 +225,27 @@ struct tc_unflushed_batch_token;
 /* Threshold for when to use the queue or sync. */
 #define TC_MAX_STRING_MARKER_BYTES  512

 /* Threshold for when to enqueue buffer/texture_subdata as-is.
  * If the upload size is greater than this, it will do instead:
  * - for buffers: DISCARD_RANGE is done by the threaded context
  * - for textures: sync and call the driver directly
  */
 #define TC_MAX_SUBDATA_BYTES320

+/* Every staging upload allocates memory. If we have too many uploads
+ * in a row without flushes, we might run out of memory. This limit 
controls
+ * how many bytes of queued uploads we can have at a time. If we go 
over,

+ * the threaded context triggers a context flush.
+ */
+#define TC_MAX_UNFLUSHED_STAGING_UPLOAD_SIZE (512 * 1024 * 1024)
+
 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context 
*ctx,
struct pipe_resource 
*dst,
struct pipe_resource 
*src);

 typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct
pipe_context *ctx,
   struct
tc_unflushed_batch_token *token);

 struct threaded_resource {
struct pipe_resource b;
const struct u_resource_vtbl *vtbl;

@@ -346,20 +353,21 @@ struct tc_batch {
struct tc_call call[TC_CALLS_PER_BATCH];
 };

 struct threaded_context {
struct pipe_context base;
struct pipe_context *pipe;
struct slab_child_pool pool_transfers;
tc_replace_buffer_storage_func replace_buffer_storage;
tc_create_fence_func create_fence;
unsigned map_buffer_alignment;
+   unsigned unflushed_transfer_size;

struct list_head