On 2018-02-26 16:05:46, Kenneth Graunke wrote: > This allows most GPU objects to use the full 48-bit address space > offered by Gen8+ platforms, rather than being stuck with 32-bit. > This expands the available GPU memory from 4G to 256TB or so. > > A few objects - instruction, scratch, and vertex buffers - need to > remain pinned in the low 4GB of the address space for various reasons. > We default everything to 48-bit but disable it in those cases. > > Thanks to Jason Ekstrand for blazing this trail in anv first and > finding the nasty undocumented hardware issues. This patch simply > rips off all of his findings. > --- > src/mesa/drivers/dri/i965/brw_bufmgr.c | 23 +++++++++ > src/mesa/drivers/dri/i965/brw_misc_state.c | 13 +++-- > src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 23 ++++++--- > src/mesa/drivers/dri/i965/genX_blorp_exec.c | 9 ++++ > src/mesa/drivers/dri/i965/genX_state_upload.c | 60 > ++++++++++++++++++++---- > src/mesa/drivers/dri/i965/intel_batchbuffer.c | 15 ++++++ > src/mesa/drivers/dri/i965/intel_batchbuffer.h | 2 + > 7 files changed, 127 insertions(+), 18 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c > b/src/mesa/drivers/dri/i965/brw_bufmgr.c > index fb180289a0c..2e54adb3ed2 100644 > --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c > +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c > @@ -119,6 +119,7 @@ struct brw_bufmgr { > bool has_llc:1; > bool has_mmap_wc:1; > bool bo_reuse:1; > + bool supports_48b_addresses:1; > }; > > static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode, > @@ -409,6 +410,8 @@ retry: > bo->reusable = true; > bo->cache_coherent = bufmgr->has_llc; > bo->index = -1; > + if (bufmgr->supports_48b_addresses) > + bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS; > > mtx_unlock(&bufmgr->lock); > > @@ -1385,6 +1388,24 @@ gem_param(int fd, int name) > return v; > } > > +static bool > +gem_supports_48b_addresses(int fd) > +{ > + struct drm_i915_gem_exec_object2 obj = { > + .flags = 
EXEC_OBJECT_SUPPORTS_48B_ADDRESS, > + }; > + > + struct drm_i915_gem_execbuffer2 execbuf = { > + .buffers_ptr = (uintptr_t)&obj, > + .buffer_count = 1, > + .rsvd1 = 0xffffffu, > + }; > + > + int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf); > + > + return ret == -1 && errno == ENOENT; > +} > + > /** > * Initializes the GEM buffer manager, which uses the kernel to allocate, > map, > * and manage map buffer objections. > @@ -1418,6 +1439,8 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd) > > bufmgr->has_llc = devinfo->has_llc; > bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0; > + bufmgr->supports_48b_addresses = > + devinfo->gen >= 8 && gem_supports_48b_addresses(fd); > > init_cache_buckets(bufmgr); > > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c > b/src/mesa/drivers/dri/i965/brw_misc_state.c > index c4ef6812bff..29d74876c27 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -634,6 +634,12 @@ brw_upload_state_base_address(struct brw_context *brw) > } > > if (devinfo->gen >= 8) { > + /* STATE_BASE_ADDRESS has issues with 48-bit address spaces. If the > + * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits, > + * the GPU appears to treat all accesses to the buffer as being out > + * of bounds and returns zero. To work around this, we pin all SBAs > + * to the bottom 4GB. > + */ > uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; > int pkt_len = devinfo->gen >= 9 ? 19 : 16; > > @@ -644,15 +650,14 @@ brw_upload_state_base_address(struct brw_context *brw) > OUT_BATCH(0); > OUT_BATCH(mocs_wb << 16); > /* Surface state base address: */ > - OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1); > + OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
Lines like this seem a little confusing with the RELOC_32BIT name. What about something like RELOC_BELOW4G? Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com> > /* Dynamic state base address: */ > - OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1); > + OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1); > /* Indirect object base address: MEDIA_OBJECT data */ > OUT_BATCH(mocs_wb << 4 | 1); > OUT_BATCH(0); > /* Instruction base address: shader kernels (incl. SIP) */ > - OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1); > - > + OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1); > /* General state buffer size */ > OUT_BATCH(0xfffff001); > /* Dynamic state buffer size */ > diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > index 0b6016427bd..55e752261a5 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > @@ -203,12 +203,23 @@ brw_emit_surface_state(struct brw_context *brw, > * FIXME: move to the point of assignment. 
> */ > assert((aux_offset & 0xfff) == 0); > - uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset; > - *aux_addr = brw_state_reloc(&brw->batch, > - *surf_offset + > - brw->isl_dev.ss.aux_addr_offset, > - aux_bo, *aux_addr, > - reloc_flags); > + > + if (devinfo->gen >= 8) { > + uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset; > + *aux_addr = brw_state_reloc(&brw->batch, > + *surf_offset + > + brw->isl_dev.ss.aux_addr_offset, > + aux_bo, *aux_addr, > + reloc_flags); > + } else { > + uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset; > + *aux_addr = brw_state_reloc(&brw->batch, > + *surf_offset + > + brw->isl_dev.ss.aux_addr_offset, > + aux_bo, *aux_addr, > + reloc_flags); > + > + } > } > } > > diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c > b/src/mesa/drivers/dri/i965/genX_blorp_exec.c > index 062171af600..2bdd93e9bdd 100644 > --- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c > +++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c > @@ -166,6 +166,15 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, > uint32_t size, > .buffer = brw->batch.state.bo, > .offset = offset, > > + /* The VF cache designers apparently cut corners, and made the cache > + * only consider the bottom 32 bits of memory addresses. If you happen > + * to have two vertex buffers which get placed exactly 4 GiB apart and > + * use them in back-to-back draw calls, you can get collisions. To > work > + * around this problem, we restrict vertex buffers to the low 32 bits > of > + * the address space. 
> + */ > + .reloc_flags = RELOC_32BIT, > + > #if GEN_GEN == 10 > .mocs = CNL_MOCS_WB, > #elif GEN_GEN == 9 > diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c > b/src/mesa/drivers/dri/i965/genX_state_upload.c > index 8668abd591f..0ab1f12152a 100644 > --- a/src/mesa/drivers/dri/i965/genX_state_upload.c > +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c > @@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void > *location, > } > } > > -static struct brw_address > +UNUSED static struct brw_address > rw_bo(struct brw_bo *bo, uint32_t offset) > { > return (struct brw_address) { > @@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset) > }; > } > > +static struct brw_address > +rw_32_bo(struct brw_bo *bo, uint32_t offset) > +{ > + return (struct brw_address) { > + .bo = bo, > + .offset = offset, > + .reloc_flags = RELOC_WRITE | RELOC_32BIT, > + }; > +} > + > +static struct brw_address > +ro_32_bo(struct brw_bo *bo, uint32_t offset) > +{ > + return (struct brw_address) { > + .bo = bo, > + .offset = offset, > + .reloc_flags = RELOC_32BIT, > + }; > +} > + > UNUSED static struct brw_address > ggtt_bo(struct brw_bo *bo, uint32_t offset) > { > @@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw, > struct GENX(VERTEX_BUFFER_STATE) buf_state = { > .VertexBufferIndex = buffer_nr, > .BufferPitch = stride, > - .BufferStartingAddress = ro_bo(bo, start_offset), > + > + /* The VF cache designers apparently cut corners, and made the cache > + * only consider the bottom 32 bits of memory addresses. If you happen > + * to have two vertex buffers which get placed exactly 4 GiB apart and > + * use them in back-to-back draw calls, you can get collisions. To > work > + * around this problem, we restrict vertex buffers to the low 32 bits > of > + * the address space. 
> + */ > + .BufferStartingAddress = ro_32_bo(bo, start_offset), > #if GEN_GEN >= 8 > .BufferSize = end_offset - start_offset, > #endif > @@ -858,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw) > ib.CutIndexEnable = brw->prim_restart.enable_cut_index; > #endif > ib.IndexFormat = brw_get_index_type(index_buffer->index_size); > - ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0); > + > + /* The VF cache designers apparently cut corners, and made the cache > + * only consider the bottom 32 bits of memory addresses. If you happen > + * to have two index buffers which get placed exactly 4 GiB apart and > + * use them in back-to-back draw calls, you can get collisions. To > work > + * around this problem, we restrict index buffers to the low 32 bits of > + * the address space. > + */ > + ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0); > #if GEN_GEN >= 8 > ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; > ib.BufferSize = brw->ib.size; > @@ -1895,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw) > #endif > > if (wm_prog_data->base.total_scratch) { > - wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); > + wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); > wm.PerThreadScratchSpace = > ffs(stage_state->per_thread_scratch) - 11; > } > @@ -2014,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = > { > > /* ---------------------------------------------------------------------- */ > > +/* We restrict scratch buffers to the bottom 32 bits of the address space > + * by using rw_32_bo(). > + * > + * General State Base Address is a bit broken. If the address + size as > + * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat > + * all accesses to the buffer as being out of bounds and returns zero. 
> + */ > + > #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \ > pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \ > pkt.SamplerCount = \ > @@ -2023,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = { > pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \ > \ > if (stage_prog_data->total_scratch) { \ > - pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); \ > + pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \ > pkt.PerThreadScratchSpace = \ > ffs(stage_state->per_thread_scratch) - 11; \ > } \ > @@ -3892,8 +3936,8 @@ genX(upload_ps)(struct brw_context *brw) > > if (prog_data->base.total_scratch) { > ps.ScratchSpaceBasePointer = > - rw_bo(stage_state->scratch_bo, > - ffs(stage_state->per_thread_scratch) - 11); > + rw_32_bo(stage_state->scratch_bo, > + ffs(stage_state->per_thread_scratch) - 11); > } > } > } > @@ -4214,7 +4258,7 @@ genX(upload_cs_state)(struct brw_context *brw) > */ > per_thread_scratch_value = stage_state->per_thread_scratch / > 1024 - 1; > } > - vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); > + vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); > vfe.PerThreadScratchSpace = per_thread_scratch_value; > } > > diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c > b/src/mesa/drivers/dri/i965/intel_batchbuffer.c > index 26718e0d1a2..0a8d3a80b64 100644 > --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c > +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c > @@ -1093,6 +1093,21 @@ emit_reloc(struct intel_batchbuffer *batch, > unsigned int index = add_exec_bo(batch, target); > struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index]; > > + if (reloc_flags & RELOC_32BIT) { > + /* Restrict this buffer to the low 32 bits of the address space. 
> + * Altering the validation list flags restricts it for this batch, > + * but we also alter the BO's kflags to restrict it permanently > + * (until the BO is destroyed and put back in the cache). Buffers > + * may stay bound across batches, and we want to keep it constrained. > + */ > + target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; > + entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; > + > + /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */ > + reloc_flags &= ~RELOC_32BIT; > + } > + > if (reloc_flags) > entry->flags |= reloc_flags & batch->valid_reloc_flags; > > diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h > b/src/mesa/drivers/dri/i965/intel_batchbuffer.h > index a9a34600ad1..7be5b10f3ab 100644 > --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h > +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h > @@ -53,6 +53,8 @@ bool brw_batch_references(struct intel_batchbuffer *batch, > struct brw_bo *bo); > > #define RELOC_WRITE EXEC_OBJECT_WRITE > #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT > +/* Inverted meaning, but using the same bit...emit_reloc will flip it. */ > +#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS > uint64_t brw_batch_reloc(struct intel_batchbuffer *batch, > uint32_t batch_offset, > struct brw_bo *target, > -- > 2.16.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev