On Mon, Oct 15, 2018 at 8:34 AM Lionel Landwerlin <lionel.g.landwer...@intel.com> wrote:
> On 13/10/2018 14:09, Jason Ekstrand wrote: > > --- > > src/intel/vulkan/anv_cmd_buffer.c | 29 +++++++ > > src/intel/vulkan/anv_device.c | 24 ++++++ > > src/intel/vulkan/anv_extensions.py | 2 +- > > src/intel/vulkan/anv_pipeline.c | 10 ++- > > src/intel/vulkan/anv_private.h | 13 +++ > > src/intel/vulkan/genX_cmd_buffer.c | 125 +++++++++++++++++++++++++++++ > > src/intel/vulkan/genX_pipeline.c | 122 ++++++++++++++++++++++++++++ > > 7 files changed, 323 insertions(+), 2 deletions(-) > > > ... > > > > uint32_t topology; > > > > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > b/src/intel/vulkan/genX_cmd_buffer.c > > index c3a7e5c83c3..90469abbf21 100644 > > --- a/src/intel/vulkan/genX_cmd_buffer.c > > +++ b/src/intel/vulkan/genX_cmd_buffer.c > > @@ -2571,6 +2571,30 @@ genX(cmd_buffer_flush_state)(struct > anv_cmd_buffer *cmd_buffer) > > > > cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; > > > > +#if GEN_GEN >= 8 > > + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) { > > + /* We don't need any per-buffer dirty tracking because you're not > > + * allowed to bind different XFB buffers while XFB is enabled. > > + */ > > + for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { > > + struct anv_xfb_binding *xfb = > &cmd_buffer->state.xfb_bindings[idx]; > > + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), > sob) { > > + sob.SOBufferIndex = idx; > > + > > + if (cmd_buffer->state.xfb_enabled && xfb->buffer) { > > + sob.SOBufferEnable = true; > > + sob.SOBufferMOCS = cmd_buffer->device->default_mocs, > > + sob.StreamOffsetWriteEnable = false; > > + sob.SurfaceBaseAddress = > anv_address_add(xfb->buffer->address, > > + xfb->offset); > > + /* Size is in DWords - 1 */ > > + sob.SurfaceSize = xfb->size / 4 - 1; > > + } > > + } > > > Apparently documentation says we need a PIPE_CONTROL with CS Stall bit > set after 3DSTATE_SO_BUFFER. > So it does. I've added it for GEN_GEN >= 10. 
--Jason > > + } > > + } > > +#endif > > + > > if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { > > anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); > > > > @@ -2970,6 +2994,107 @@ void genX(CmdDrawIndexedIndirect)( > > } > > } > > > > +void genX(CmdBeginTransformFeedbackEXT)( > > + VkCommandBuffer commandBuffer, > > + uint32_t firstCounterBuffer, > > + uint32_t counterBufferCount, > > + const VkBuffer* pCounterBuffers, > > + const VkDeviceSize* pCounterBufferOffsets) > > +{ > > + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); > > + > > + assert(firstCounterBuffer < MAX_XFB_BUFFERS); > > + assert(counterBufferCount < MAX_XFB_BUFFERS); > > + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); > > + > > + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: > > + * > > + * "Ssoftware must ensure that no HW stream output operations can > be in > > + * process or otherwise pending at the point that the > MI_LOAD/STORE > > + * commands are processed. This will likely require a pipeline > flush." > > + */ > > + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; > > + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); > > + > > + for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) { > > + /* If we have a counter buffer, this is a resume so we need to > load the > > + * value into the streamout offset register. Otherwise, this is > a begin > > + * and we need to reset it to zero. > > + */ > > + if (pCounterBuffers && > > + idx >= firstCounterBuffer && > > + idx - firstCounterBuffer < counterBufferCount && > > + pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) { > > + uint32_t cb_idx = idx - firstCounterBuffer; > > + ANV_FROM_HANDLE(anv_buffer, counter_buffer, > pCounterBuffers[cb_idx]); > > + uint64_t offset = pCounterBufferOffsets ? 
> > + pCounterBufferOffsets[cb_idx] : 0; > > + > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), > lrm) { > > + lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; > > + lrm.MemoryAddress = > anv_address_add(counter_buffer->address, > > + offset); > > + } > > + } else { > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), > lri) { > > + lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4; > > + lri.DataDWord = 0; > > + } > > + } > > + } > > + > > + cmd_buffer->state.xfb_enabled = true; > > + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; > > +} > > + > > +void genX(CmdEndTransformFeedbackEXT)( > > + VkCommandBuffer commandBuffer, > > + uint32_t firstCounterBuffer, > > + uint32_t counterBufferCount, > > + const VkBuffer* pCounterBuffers, > > + const VkDeviceSize* pCounterBufferOffsets) > > +{ > > + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); > > + > > + assert(firstCounterBuffer < MAX_XFB_BUFFERS); > > + assert(counterBufferCount < MAX_XFB_BUFFERS); > > + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); > > + > > + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: > > + * > > + * "Ssoftware must ensure that no HW stream output operations can > be in > > + * process or otherwise pending at the point that the > MI_LOAD/STORE > > + * commands are processed. This will likely require a pipeline > flush." > > + */ > > + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; > > + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); > > + > > + for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) { > > + unsigned idx = firstCounterBuffer + cb_idx; > > + > > + /* If we have a counter buffer, this is a resume so we need to > load the > > + * value into the streamout offset register. Otherwise, this is > a begin > > + * and we need to reset it to zero. 
> > + */ > > + if (pCounterBuffers && > > + cb_idx < counterBufferCount && > > + pCounterBuffers[cb_idx] != VK_NULL_HANDLE) { > > + ANV_FROM_HANDLE(anv_buffer, counter_buffer, > pCounterBuffers[cb_idx]); > > + uint64_t offset = pCounterBufferOffsets ? > > + pCounterBufferOffsets[cb_idx] : 0; > > + > > + anv_batch_emit(&cmd_buffer->batch, > GENX(MI_STORE_REGISTER_MEM), srm) { > > + srm.MemoryAddress = > anv_address_add(counter_buffer->address, > > + offset); > > + srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; > > + } > > + } > > + } > > + > > + cmd_buffer->state.xfb_enabled = false; > > + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; > > +} > > + > > static VkResult > > flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) > > { > > diff --git a/src/intel/vulkan/genX_pipeline.c > b/src/intel/vulkan/genX_pipeline.c > > index 9595a7133ae..5dd78a18fb5 100644 > > --- a/src/intel/vulkan/genX_pipeline.c > > +++ b/src/intel/vulkan/genX_pipeline.c > > @@ -28,6 +28,7 @@ > > > > #include "common/gen_l3_config.h" > > #include "common/gen_sample_positions.h" > > +#include "nir/nir_xfb_info.h" > > #include "vk_util.h" > > #include "vk_format_info.h" > > > > @@ -1097,9 +1098,130 @@ static void > > emit_3dstate_streamout(struct anv_pipeline *pipeline, > > const VkPipelineRasterizationStateCreateInfo > *rs_info) > > { > > +#if GEN_GEN >= 8 > > + const struct brw_vue_prog_data *prog_data = > > + anv_pipeline_get_last_vue_prog_data(pipeline); > > + const struct brw_vue_map *vue_map = &prog_data->vue_map; > > +#endif > > + > > + nir_xfb_info *xfb_info; > > + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) > > + xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info; > > + else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) > > + xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info; > > + else > > + xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info; > > + > > + pipeline->xfb_used = xfb_info ? 
xfb_info->buffers_written : 0; > > + > > anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) { > > so.RenderingDisable = rs_info->rasterizerDiscardEnable; > > + > > +#if GEN_GEN >= 8 > > + if (xfb_info) { > > + so.SOFunctionEnable = true; > > + > > + const VkPipelineRasterizationStateStreamCreateInfoEXT > *stream_info = > > + vk_find_struct_const(rs_info, > PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT); > > + so.RenderStreamSelect = stream_info ? > > + stream_info->rasterizationStream : 0; > > + > > + so.Buffer0SurfacePitch = xfb_info->strides[0]; > > + so.Buffer1SurfacePitch = xfb_info->strides[1]; > > + so.Buffer2SurfacePitch = xfb_info->strides[2]; > > + so.Buffer3SurfacePitch = xfb_info->strides[3]; > > + > > + int urb_entry_read_offset = 0; > > + int urb_entry_read_length = > > + (prog_data->vue_map.num_slots + 1) / 2 - > urb_entry_read_offset; > > + > > + /* We always read the whole vertex. This could be reduced at > some > > + * point by reading less and offsetting the register index in > the > > + * SO_DECLs. 
> > + */ > > + so.Stream0VertexReadOffset = urb_entry_read_offset; > > + so.Stream0VertexReadLength = urb_entry_read_length - 1; > > + so.Stream1VertexReadOffset = urb_entry_read_offset; > > + so.Stream1VertexReadLength = urb_entry_read_length - 1; > > + so.Stream2VertexReadOffset = urb_entry_read_offset; > > + so.Stream2VertexReadLength = urb_entry_read_length - 1; > > + so.Stream3VertexReadOffset = urb_entry_read_offset; > > + so.Stream3VertexReadLength = urb_entry_read_length - 1; > > + } > > +#endif /* GEN_GEN >= 8 */ > > + } > > + > > +#if GEN_GEN >= 8 > > + if (xfb_info) { > > + struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128]; > > + int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0}; > > + int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0}; > > + > > + memset(so_decl, 0, sizeof(so_decl)); > > + > > + for (unsigned i = 0; i < xfb_info->output_count; i++) { > > + const nir_xfb_output_info *output = &xfb_info->outputs[i]; > > + unsigned buffer = output->buffer; > > + unsigned stream = xfb_info->buffer_to_stream[buffer]; > > + > > + /* Our hardware is unusual in that it requires us to program > SO_DECLs > > + * for fake "hole" components, rather than simply taking the > offset > > + * for each real varying. Each hole can have size 1, 2, 3, or > 4; we > > + * program as many size = 4 holes as we can, then a final hole > to > > + * accommodate the final 1, 2, or 3 remaining. 
> > + */ > > + int hole_dwords = (output->offset - next_offset[buffer]) / 4; > > + while (hole_dwords > 0) { > > + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { > > + .HoleFlag = 1, > > + .OutputBufferSlot = buffer, > > + .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1, > > + }; > > + hole_dwords -= 4; > > + } > > + > > + next_offset[buffer] = output->offset + > > + > __builtin_popcount(output->component_mask) * 4; > > + > > + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { > > + .OutputBufferSlot = buffer, > > + .RegisterIndex = vue_map->varying_to_slot[output->location], > > + .ComponentMask = output->component_mask, > > + }; > > + } > > + > > + int max_decls = 0; > > + for (unsigned s = 0; s < MAX_XFB_STREAMS; s++) > > + max_decls = MAX2(max_decls, decls[s]); > > + > > + uint8_t sbs[MAX_XFB_STREAMS] = { }; > > + for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) { > > + if (xfb_info->buffers_written & (1 << b)) > > + sbs[xfb_info->buffer_to_stream[b]] |= 1 << b; > > + } > > + > > + uint32_t *dw = anv_batch_emitn(&pipeline->batch, 3 + 2 * > max_decls, > > + GENX(3DSTATE_SO_DECL_LIST), > > + .StreamtoBufferSelects0 = sbs[0], > > + .StreamtoBufferSelects1 = sbs[1], > > + .StreamtoBufferSelects2 = sbs[2], > > + .StreamtoBufferSelects3 = sbs[3], > > + .NumEntries0 = decls[0], > > + .NumEntries1 = decls[1], > > + .NumEntries2 = decls[2], > > + .NumEntries3 = decls[3]); > > + > > + for (int i = 0; i < max_decls; i++) { > > + GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2, > > + &(struct GENX(SO_DECL_ENTRY)) { > > + .Stream0Decl = so_decl[0][i], > > + .Stream1Decl = so_decl[1][i], > > + .Stream2Decl = so_decl[2][i], > > + .Stream3Decl = so_decl[3][i], > > + }); > > + } > > } > > +#endif /* GEN_GEN >= 8 */ > > } > > > > static uint32_t > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev