On 07/06/2015 01:33 PM, Chris Wilson wrote:
> +/*
> + * Add a relocation entry for the target buffer into the current batch.
> + *
> + * This is the heart of performing fast relocations, both here and in
> + * the corresponding kernel relocation routines.
> + *
> + * - Instead of passing in handles for the kernel to convert back into
> + *   buffers for every relocation, we tell the kernel which execobject
> + *   slot corresponds with the relocation. The kernel is able to use a
> + *   simple LUT constructed as it first looks up each buffer for the
> + *   batch rather than search a small, overfull hashtable. As both the
> + *   number of relocations and buffers in a batch grow, the simple LUT
> + *   is much more efficient (though the LUT itself is less cache
> + *   friendly).
> + *   However, as the batch buffer is by definition the last object in
> + *   the execbuffer array we have to perform a pass to relabel the
> + *   target of all relocations pointing to the batch. (Except when the
> + *   kernel supports batch-first, in which case we can do the relocation
> + *   target processing for the batch inline.)
> + *
> + * - If the kernel has not moved the buffer, it will still be in the same
> + *   location as last time we used it. If we tell the kernel that all the
> + *   relocation entries are the same as the offset for the buffer, then
> + *   the kernel need only check that all the buffers are still in the same
> + *   location and then skip performing relocations entirely. A huge win.
> + *
> + * - As a consequence of telling the kernel to skip processing the
> + *   relocations, we need to tell the kernel about the read/write domains
> + *   and special needs of the buffers.
> + *
> + * - Alternatively, we can request the kernel place the buffer exactly
> + *   where we want it and forgo all relocations to that buffer entirely.
> + *   The buffer is effectively pinned for its lifetime (if the kernel
> + *   does have to move it, for example to swap it out to recover memory,
> + *   the kernel will return it back to our requested location at the start
> + *   of the next batch.) This of course imposes a lot of constraints on
> + *   where we can say the buffers are: they must meet all the alignment
> + *   constraints and not overlap.
> + *
> + * - Essential to all these techniques is that we always use the same
> + *   presumed_offset for the relocations as for submitting the execobject.
> + *   That value must be written into the batch and it must match the value
> + *   we tell the kernel. (This breaks down when using relocation trees
> + *   shared between multiple contexts, hence the need for context-local
> + *   batch management.)
> + *
> + * In contrast to libdrm, we can build the execbuffer array along with
> + * the batch by forgoing the ability to handle general relocation trees.
> + * This avoids having multiple passes to build the execbuffer parameter,
> + * and also gives us a means to cheaply track when a buffer has been
> + * referenced by the batch.
> + */
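The skip-relocation path described above took me a moment, so here is my
mental model of the kernel-side check, just to confirm I'm reading it right.
This is purely illustrative (my own sketch, not code from this patch or from
the actual i915 execbuffer code):

   /* Struct definitions from include/uapi/drm/i915_drm.h.  If every
    * relocation still records the target's current offset, nothing in the
    * batch needs rewriting and the relocation pass can be skipped.
    */
   static bool
   relocs_match_presumed_offsets(const struct drm_i915_gem_relocation_entry *relocs,
                                 unsigned count,
                                 const struct drm_i915_gem_exec_object2 *exec)
   {
      for (unsigned i = 0; i < count; i++) {
         /* With the LUT flag, target_handle indexes the execobject array. */
         if (relocs[i].presumed_offset != exec[relocs[i].target_handle].offset)
            return false;   /* buffer moved: batch dwords must be rewritten */
      }
      return true;          /* nothing moved: skip relocation processing */
   }

which is exactly why the same presumed_offset must be written into the batch
and handed to the kernel, as the comment says.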
> +uint64_t __brw_batch_reloc(struct brw_batch *batch,
> +                           uint32_t batch_offset,
> +                           struct brw_bo *target_bo,
> +                           uint64_t target_offset,
> +                           unsigned read_domains,
> +                           unsigned write_domain)
> +{
> +   assert(target_bo->refcnt);
> +   if (unlikely(target_bo->batch != batch)) {
> +      /* XXX legal sharing between contexts/threads? */
> +      target_bo = brw_bo_import(batch, target_bo->base, true);
> +      if (unlikely(target_bo == NULL))
> +         longjmp(batch->jmpbuf, -ENOMEM);
> +      target_bo->refcnt--; /* kept alive by the implicit active reference */
> +   }
> +   assert(target_bo->batch == batch);
> +
> +   if (target_bo->exec == NULL) {
> +      int n;
> +
> +      /* reserve one exec entry for the batch */
> +      if (unlikely(batch->emit.nexec + 1 == batch->exec_size))
> +         __brw_batch_grow_exec(batch);
> +
> +      n = batch->emit.nexec++;
> +      target_bo->target_handle = has_lut(batch) ? n : target_bo->handle;
> +      target_bo->exec = memset(batch->exec + n, 0, sizeof(*target_bo->exec));
> +      target_bo->exec->handle = target_bo->handle;
> +      target_bo->exec->alignment = target_bo->alignment;
> +      target_bo->exec->offset = target_bo->offset;
> +      if (target_bo->pinned)
> +         target_bo->exec->flags = EXEC_OBJECT_PINNED;
> +
> +      /* Track the total amount of memory in use by all active requests */
> +      if (target_bo->read.rq == NULL) {
> +         batch->rss += target_bo->size;
> +         if (batch->rss > batch->peak_rss)
> +            batch->peak_rss = batch->rss;
> +      }
> +      target_bo->read.rq = batch->next_request;
> +      list_move_tail(&target_bo->read.link, &batch->next_request->read);
> +
> +      batch->aperture += target_bo->size;
> +   }
> +
> +   if (!target_bo->pinned) {
> +      int n;
> +
> +      if (unlikely(batch->emit.nreloc == batch->reloc_size))
> +         __brw_batch_grow_reloc(batch);
> +
> +      n = batch->emit.nreloc++;
> +      batch->reloc[n].offset = batch_offset;
> +      batch->reloc[n].delta = target_offset;
> +      batch->reloc[n].target_handle = target_bo->target_handle;
> +      batch->reloc[n].presumed_offset = target_bo->offset;
> +      batch->reloc[n].read_domains = read_domains;
> +      batch->reloc[n].write_domain = write_domain;
> +
> +      /* If we haven't added the batch to the execobject array yet, we
> +       * will have to process all the relocations pointing to the
> +       * batch when finalizing the request for submission.
> +       */
> +      if (target_bo->target_handle == -1) {
> +         int m = batch->emit.nself++;
> +         if (m < 256)
> +            batch->self_reloc[m] = n;
> +      }
> +   }
> +
> +   if (write_domain && !target_bo->dirty) {
> +      assert(target_bo != batch->bo);
> +      target_bo->write.rq = batch->next_request;
> +      list_move(&target_bo->write.link, &batch->next_request->write);
> +      assert(target_bo->write.rq == target_bo->read.rq);
> +      target_bo->dirty = true;
> +      target_bo->domain = DOMAIN_GPU;
> +      if (has_lut(batch)) {
> +         target_bo->exec->flags |= EXEC_OBJECT_WRITE;
> +         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
> +             batch->needs_pipecontrol_ggtt_wa)
> +            target_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
> +      }
> +   }
> +
> +   return target_bo->offset + target_offset;
> +}
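One detail worth calling out for other readers: relocations that target the
batch itself are recorded with target_handle == -1 and their indices stashed
in self_reloc[], since the batch only gets its execobject slot when the
request is finalized. My guess at what the relabel pass looks like at
finalize time (names and details are mine, not from this patch; presumably a
full scan of the reloc array is the fallback once more than 256 such entries
have been recorded):

   /* Hypothetical sketch: point the deferred relocations at the batch's
    * execobject slot once that slot is finally known.
    */
   static void
   relabel_batch_relocs(struct brw_batch *batch, uint32_t batch_exec_index)
   {
      int count = MIN2(batch->emit.nself, 256);
      for (int i = 0; i < count; i++)
         batch->reloc[batch->self_reloc[i]].target_handle = batch_exec_index;
   }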
I looked at your Surface State setup changes in:

   brw_wm_surface_state.c
   gen6_blorp.cpp
   gen6_surface_state.c
   gen7_blorp.cpp
   gen7_wm_surface_state.c
   gen8_surface_state.c

and I especially like the removal of the additional call to
drm_intel_bo_emit_reloc() just to mark those surfaces for relocation after
specifying every Surface Base Address. The new setup certainly looks more
intuitive! I see no issues in those sections, except:

>
>  /* Creates a new WM constant buffer reflecting the current fragment program's
> @@ -560,7 +553,7 @@ brw_emit_null_surface_state(struct brw_context *brw,
>   * - Surface Format must be R8G8B8A8_UNORM.
>   */
>    unsigned surface_type = BRW_SURFACE_NULL;
> -  drm_intel_bo *bo = NULL;
> +  struct brw_bo *bo = NULL;
>    unsigned pitch_minus_1 = 0;
>    uint32_t multisampling_state = 0;
>    uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
> @@ -600,7 +593,10 @@ brw_emit_null_surface_state(struct brw_context *brw,
>                  1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT |
>                  1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT);
>    }
> -  surf[1] = bo ? bo->offset64 : 0;
> +  surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
> +                            bo, 0,
> +                            I915_GEM_DOMAIN_RENDER,
> +                            I915_GEM_DOMAIN_RENDER);

null check for bo?

>    surf[2] = ((width - 1) << BRW_SURFACE_WIDTH_SHIFT |
>               (height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
>
> @@ -613,13 +609,6 @@ brw_emit_null_surface_state(struct brw_context *brw,
>               pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
>    surf[4] = multisampling_state;
>    surf[5] = 0;
> -
> -  if (bo) {
> -     drm_intel_bo_emit_reloc(brw->batch.bo,
> -                             *out_offset + 4,
> -                             bo, 0,
> -                             I915_GEM_DOMAIN_RENDER,
> -                             I915_GEM_DOMAIN_RENDER);
> -  }
>  }
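On the "null check for bo?" above: the old code only emitted a relocation
when bo was non-NULL and otherwise left surf[1] at 0, while
__brw_batch_reloc() dereferences its target unconditionally
(assert(target_bo->refcnt), target_bo->batch, ...). Unless brw_batch_reloc()
is intended to accept a NULL target, I think this wants something along the
lines of (untested sketch, same arguments as in the patch):

   /* Keep surf[1] at 0 for the true null surface; only emit a relocation
    * when a buffer actually backs the surface.
    */
   surf[1] = bo ? brw_batch_reloc(&brw->batch, *out_offset + 4,
                                  bo, 0,
                                  I915_GEM_DOMAIN_RENDER,
                                  I915_GEM_DOMAIN_RENDER)
                : 0;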