Re: [Mesa-dev] [PATCH 07/10] i965: Prepare batchbuffer module for softpin support.

2018-05-08 Thread Scott D Phillips
Kenneth Graunke writes:

> If EXEC_OBJECT_PINNED is set, we don't want to emit any relocations.
> We simply want to add the BO to the validation list, and possibly mark
> it as writeable.  The new brw_use_pinned_bo() interface does just that.
>
> To avoid having to make every caller consider both the relocation and
> softpin cases, we make emit_reloc() call brw_use_pinned_bo() when given
> a softpinned buffer.
>
> We also can't grow buffers that are softpinned - the mechanism places a
> larger BO at the same offset as the original, which requires moving BOs
> around in the VMA.  With softpin, we only allocate enough VMA for the
> original size of the BO.
> ---
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 40 +--
>  src/mesa/drivers/dri/i965/intel_batchbuffer.h |  4 ++
>  2 files changed, 40 insertions(+), 4 deletions(-)
>
> Overallocating is gross, but I have to make incremental progress somehow.
> For batch buffers, the ultimate plan is to switch from growing to batch
> chaining (just create another batch and MI_BATCH_BUFFER_START to GOTO
> the new batch and carry on).  We can do that on Gen8+.  It's easier to
> do that in the softpin world - otherwise, we'd need a third set of
> relocation lists, which gets messier.
>
> For state buffers, the plan is to set Dynamic State Base Address to a
> fixed 4GB region of the VMA, then just use intel_upload.c to make
> however many buffers we need, allocated out of that memzone.  Being able
> to fix BO addresses within 4GB of the base address eliminates the need
> to force all state to be in a single BO, and gives us a lot more
> flexibility with less magic required.
>
> But again...can't convert everything overnight, especially when having
> to care about older hardware and ancient kernels.

The plan to change overallocation to chaining in a future patch sounds
good to me. A nice enhancement at some point might be an
INTEL_DEBUG=wasted-memory mode where we print out how much memory we're
wasting due to overallocation, either here or by bucketing.
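
For illustration, a minimal sketch of what that accounting might look
like (the flag, counter, and helper names here are hypothetical, not an
existing INTEL_DEBUG option):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical counters for an INTEL_DEBUG=wasted-memory mode. */
static uint64_t total_allocated_bytes;
static uint64_t total_requested_bytes;

/* Call wherever we knowingly overallocate (growing buffers, bucketing),
 * with the size the caller asked for and the size actually allocated.
 */
static void
note_overallocation(uint64_t requested, uint64_t allocated)
{
   total_requested_bytes += requested;
   total_allocated_bytes += allocated;
}

static void
report_wasted_memory(void)
{
   fprintf(stderr, "wasted-memory: %" PRIu64 " of %" PRIu64 " bytes unused\n",
           total_allocated_bytes - total_requested_bytes,
           total_allocated_bytes);
}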

I don't quite understand Chris's comment, so this isn't meant as an
assertion that what he's raising isn't an issue, but fwiw:

Reviewed-by: Scott D Phillips 


Re: [Mesa-dev] [PATCH 07/10] i965: Prepare batchbuffer module for softpin support.

2018-05-04 Thread Chris Wilson
Quoting Kenneth Graunke (2018-05-04 02:12:37)
> @@ -933,6 +945,14 @@ emit_reloc(struct intel_batchbuffer *batch,
>  {
> assert(target != NULL);
>  
> +   if (target->kflags & EXEC_OBJECT_PINNED) {
> +  brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
> +  return target->gtt_offset + target_offset;
> +   }
> +
> +   unsigned int index = add_exec_bo(batch, target);
> +   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
> +
> if (rlist->reloc_count == rlist->reloc_array_size) {
>rlist->reloc_array_size *= 2;
>rlist->relocs = realloc(rlist->relocs,
> @@ -940,9 +960,6 @@ emit_reloc(struct intel_batchbuffer *batch,
>sizeof(struct drm_i915_gem_relocation_entry));
> }
>  
> -   unsigned int index = add_exec_bo(batch, target);
> -   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
> -
> if (reloc_flags & RELOC_32BIT) {
>/* Restrict this buffer to the low 32 bits of the address space.
> *
> @@ -976,6 +993,21 @@ emit_reloc(struct intel_batchbuffer *batch,
> return entry->offset + target_offset;
>  }
>  
> +void
> +brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
> +  unsigned writable_flag)
> +{
> +   assert(bo->kflags & EXEC_OBJECT_PINNED);
> +   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
> +
> +   unsigned int index = add_exec_bo(batch, bo);
> +   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
> +   assert(entry->offset == bo->gtt_offset);
> +
> +   if (writable_flag)
> +  entry->flags |= EXEC_OBJECT_WRITE;
> +}

I'm not fond of this (at least from the emit_reloc() perspective). In
emit_reloc() we were very careful to order the writes to ensure the
validation object was always consistent with the batchbuffer entry,
and this throws it all away (granted, it's not a real concern; my
worry is just that the code looks dangerous).

My preference would be something like:

static uint64_t
emit_reloc(struct intel_batchbuffer *batch,
   struct brw_reloc_list *rlist, uint32_t offset,
   struct brw_bo *target, int32_t target_offset,
   unsigned int reloc_flags)
{
   assert(target != NULL);

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   if (target->kflags & EXEC_OBJECT_PINNED) {
  assert(!(reloc_flags & ~EXEC_OBJECT_WRITE));
  goto skip_relocs;
   }

   if (rlist->reloc_count == rlist->reloc_array_size) {
  rlist->reloc_array_size *= 2;
  rlist->relocs = realloc(rlist->relocs,
  rlist->reloc_array_size *
  sizeof(struct drm_i915_gem_relocation_entry));
   }

   if (reloc_flags & RELOC_32BIT) {
  /* Restrict this buffer to the low 32 bits of the address space.
   *
   * Altering the validation list flags restricts it for this batch,
   * but we also alter the BO's kflags to restrict it permanently
   * (until the BO is destroyed and put back in the cache).  Buffers
   * may stay bound across batches, and we want to keep it constrained.
   */
  target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
  entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

  /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
  reloc_flags &= ~RELOC_32BIT;
   }

   rlist->relocs[rlist->reloc_count++] =
  (struct drm_i915_gem_relocation_entry) {
 .offset = offset,
 .delta = target_offset,
 .target_handle = batch->use_batch_first ? index : target->gem_handle,
 .presumed_offset = entry->offset,
  };

skip_relocs:
   if (reloc_flags)
  entry->flags |= reloc_flags & batch->valid_reloc_flags;

   /* Using the old buffer offset, write in what the right data would be, in
* case the buffer doesn't move and we can short-circuit the relocation
* processing in the kernel
*/
   return entry->offset + target_offset;
}

The relationship between validation object entry and the batch buffer
contents is much easier to verify.
-Chris


[Mesa-dev] [PATCH 07/10] i965: Prepare batchbuffer module for softpin support.

2018-05-03 Thread Kenneth Graunke
If EXEC_OBJECT_PINNED is set, we don't want to emit any relocations.
We simply want to add the BO to the validation list, and possibly mark
it as writeable.  The new brw_use_pinned_bo() interface does just that.

To avoid having to make every caller consider both the relocation and
softpin cases, we make emit_reloc() call brw_use_pinned_bo() when given
a softpinned buffer.

We also can't grow buffers that are softpinned - the mechanism places a
larger BO at the same offset as the original, which requires moving BOs
around in the VMA.  With softpin, we only allocate enough VMA for the
original size of the BO.
---
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 40 +--
 src/mesa/drivers/dri/i965/intel_batchbuffer.h |  4 ++
 2 files changed, 40 insertions(+), 4 deletions(-)

Overallocating is gross, but I have to make incremental progress somehow.
For batch buffers, the ultimate plan is to switch from growing to batch
chaining (just create another batch and MI_BATCH_BUFFER_START to GOTO
the new batch and carry on).  We can do that on Gen8+.  It's easier to
do that in the softpin world - otherwise, we'd need a third set of
relocation lists, which gets messier.
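
For reference, a rough sketch of that chaining idea on Gen8+, assuming
softpinned (fixed-address) batches; the helper name and surrounding
structure are illustrative, not code from this series:

#include <stdint.h>

#define MI_BATCH_BUFFER_START (0x31 << 23)
#define MI_BATCH_PPGTT        (1 << 8)   /* Gen8+: address space = PPGTT */

/* Instead of growing a full batch, write a jump at its tail to the
 * softpinned address of a freshly allocated batch and carry on there.
 */
static void
chain_to_new_batch(uint32_t *old_batch_tail, uint64_t new_batch_address)
{
   /* On Gen8+, MI_BATCH_BUFFER_START is 3 dwords: the command dword
    * followed by a 64-bit graphics address (DWord Length = 3 - 2).
    */
   old_batch_tail[0] = MI_BATCH_BUFFER_START | MI_BATCH_PPGTT | (3 - 2);
   old_batch_tail[1] = new_batch_address & 0xffffffff;
   old_batch_tail[2] = new_batch_address >> 32;
}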

For state buffers, the plan is to set Dynamic State Base Address to a
fixed 4GB region of the VMA, then just use intel_upload.c to make
however many buffers we need, allocated out of that memzone.  Being able
to fix BO addresses within 4GB of the base address eliminates the need
to force all state to be in a single BO, and gives us a lot more
flexibility with less magic required.
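
The memzone plan amounts to a bump allocator over a fixed 4GB range; a
simplified sketch with illustrative names (a real allocator would also
need to reclaim freed ranges):

#include <assert.h>
#include <stdint.h>

struct memzone {
   uint64_t base;   /* e.g. programmed as Dynamic State Base Address */
   uint64_t next;   /* next free address; initialize to base */
};

/* Hand out VMA from [base, base + 4GB).  Every returned address stays
 * within 4GB of the base, so 32-bit offsets from the base address
 * always reach it.
 */
static uint64_t
memzone_alloc(struct memzone *zone, uint64_t size, uint64_t align)
{
   uint64_t addr = (zone->next + align - 1) & ~(align - 1);
   assert(addr + size <= zone->base + (1ull << 32));
   zone->next = addr + size;
   return addr;
}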

But again...can't convert everything overnight, especially when having
to care about older hardware and ancient kernels.

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c 
b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index fe1ea02ca41..96f1a238c0d 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -232,6 +232,10 @@ recreate_growing_buffer(struct brw_context *brw,
   struct intel_batchbuffer *batch = &brw->batch;
struct brw_bufmgr *bufmgr = screen->bufmgr;
 
+   /* We can't grow buffers when using softpin, so just overallocate them. */
+   if (brw_using_softpin(bufmgr))
+  size *= 2;
+
grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
grow->partial_bo = NULL;
@@ -389,6 +393,13 @@ grow_buffer(struct brw_context *brw,
struct brw_bufmgr *bufmgr = brw->bufmgr;
struct brw_bo *bo = grow->bo;
 
+   /* We can't grow buffers that are softpinned, as the growing mechanism
+* involves putting a larger buffer at the same gtt_offset...and we've
+* only allocated the smaller amount of VMA.  Without relocations, this
+* simply won't work.  This should never happen, however.
+*/
+   assert(!(bo->kflags & EXEC_OBJECT_PINNED));
+
perf_debug("Growing %s - ran out of space\n", bo->name);
 
if (grow->partial_bo) {
@@ -730,7 +741,8 @@ execbuffer(int fd,
   bo->index = -1;
 
   /* Update brw_bo::gtt_offset */
-  if (batch->validation_list[i].offset != bo->gtt_offset) {
+  if (!(bo->kflags & EXEC_OBJECT_PINNED) &&
+  batch->validation_list[i].offset != bo->gtt_offset) {
  DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
  bo->gem_handle, bo->gtt_offset,
  batch->validation_list[i].offset);
@@ -933,6 +945,14 @@ emit_reloc(struct intel_batchbuffer *batch,
 {
assert(target != NULL);
 
+   if (target->kflags & EXEC_OBJECT_PINNED) {
+  brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
+  return target->gtt_offset + target_offset;
+   }
+
+   unsigned int index = add_exec_bo(batch, target);
+   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+
if (rlist->reloc_count == rlist->reloc_array_size) {
   rlist->reloc_array_size *= 2;
   rlist->relocs = realloc(rlist->relocs,
@@ -940,9 +960,6 @@ emit_reloc(struct intel_batchbuffer *batch,
   sizeof(struct drm_i915_gem_relocation_entry));
}
 
-   unsigned int index = add_exec_bo(batch, target);
-   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
-
if (reloc_flags & RELOC_32BIT) {
   /* Restrict this buffer to the low 32 bits of the address space.
*
@@ -976,6 +993,21 @@ emit_reloc(struct intel_batchbuffer *batch,
return entry->offset + target_offset;
 }
 
+void
+brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
+  unsigned writable_flag)
+{
+   assert(bo->kflags & EXEC_OBJECT_PINNED);
+   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
+
+   unsigned int index = add_exec_bo(batch, bo);
+   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+   assert(entry->offset == bo->gtt_offset);
+
+   if (writable_flag)
+  entry->flags |= EXEC_OBJECT_WRITE;
+}
+
 uint64_t
 brw_batch_reloc(struct intel_batchbuffer *batch,