On 2016-05-07 03:14:43, Kenneth Graunke wrote: > My old implementation accumulated <start, end> pairs in a buffer, > and eventually processed that data on the CPU. This meant flushing > the batchbuffer and waiting for it to completely execute before we > could map it, resulting in really long stalls. We could also run out > of space in the buffer, and have to do this early. > > Instead, we can use Haswell's MI_MATH command to do the (end - start) > subtraction, as well as the multiplication by 2 or 3 to convert from > the number of primitives written to the number of vertices written. > We still need to CS stall to read the counters, but otherwise everything > is completely pipelined - there's no CPU<->GPU synchronization required. > It also uses only 80 bytes in the buffer, no matter what. > > Improves performance in Manhattan on Skylake GT3e at 800x600 by > 6.1086% +/- 0.954166% (n=9). At 1920x1080, improves performance > by 2.82103% +/- 0.148596% (n=84). > > v2: Fix number of primitives -> number of vertices calculation for > GL_TRIANGLES (I was multiplying by 4 instead of 3.) Caught by > Jordan Justen. > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/Makefile.sources | 1 + > src/mesa/drivers/dri/i965/brw_context.c | 14 +- > src/mesa/drivers/dri/i965/brw_context.h | 14 ++ > src/mesa/drivers/dri/i965/brw_draw.c | 38 ++++- > src/mesa/drivers/dri/i965/hsw_sol.c | 263 > +++++++++++++++++++++++++++++ > 5 files changed, 318 insertions(+), 12 deletions(-) > create mode 100644 src/mesa/drivers/dri/i965/hsw_sol.c > > Ouch. Good catch, Jordan! This look better? > > diff --git a/src/mesa/drivers/dri/i965/Makefile.sources > b/src/mesa/drivers/dri/i965/Makefile.sources > index 8c60954..d35775e 100644 > --- a/src/mesa/drivers/dri/i965/Makefile.sources > +++ b/src/mesa/drivers/dri/i965/Makefile.sources > @@ -228,6 +228,7 @@ i965_FILES = \ > gen8_vs_state.c \ > gen8_wm_depth_stencil.c \ > hsw_queryobj.c \ > + hsw_sol.c \ > intel_batchbuffer.c \ > intel_batchbuffer.h \ > intel_blit.c \ > diff --git a/src/mesa/drivers/dri/i965/brw_context.c > b/src/mesa/drivers/dri/i965/brw_context.c > index 1380d41..26514a0 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.c > +++ b/src/mesa/drivers/dri/i965/brw_context.c > @@ -372,13 +372,18 @@ brw_init_driver_functions(struct brw_context *brw, > > functions->NewTransformFeedback = brw_new_transform_feedback; > functions->DeleteTransformFeedback = brw_delete_transform_feedback; > - functions->GetTransformFeedbackVertexCount = > - brw_get_transform_feedback_vertex_count; > - if (brw->gen >= 7) { > + if (brw->intelScreen->has_mi_math_and_lrr) { > + functions->BeginTransformFeedback = hsw_begin_transform_feedback; > + functions->EndTransformFeedback = hsw_end_transform_feedback; > + functions->PauseTransformFeedback = hsw_pause_transform_feedback; > + functions->ResumeTransformFeedback = hsw_resume_transform_feedback; > + } else if (brw->gen >= 7) { > functions->BeginTransformFeedback = gen7_begin_transform_feedback; > functions->EndTransformFeedback = gen7_end_transform_feedback; > functions->PauseTransformFeedback = gen7_pause_transform_feedback; > functions->ResumeTransformFeedback = gen7_resume_transform_feedback; > + functions->GetTransformFeedbackVertexCount = > + brw_get_transform_feedback_vertex_count; > } else { > functions->BeginTransformFeedback = brw_begin_transform_feedback; > functions->EndTransformFeedback = brw_end_transform_feedback; > @@ -494,7 +499,8 @@ brw_initialize_context_constants(struct brw_context *brw) > ctx->Const.MaxTransformFeedbackSeparateComponents = > BRW_MAX_SOL_BINDINGS / BRW_MAX_SOL_BUFFERS; > > - ctx->Const.AlwaysUseGetTransformFeedbackVertexCount = true; > + ctx->Const.AlwaysUseGetTransformFeedbackVertexCount = > + !brw->intelScreen->has_mi_math_and_lrr; > > int max_samples; > const int *msaa_modes = intel_supported_msaa_modes(brw->intelScreen); > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index b620f14..035cbe9 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1656,6 +1656,20 @@ void > gen7_resume_transform_feedback(struct gl_context *ctx, > struct gl_transform_feedback_object *obj); > > +/* hsw_sol.c */ > +void > +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode, > + struct gl_transform_feedback_object *obj); > +void > +hsw_end_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj); > +void > +hsw_pause_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj); > +void > +hsw_resume_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj); > + > /* brw_blorp_blit.cpp */ > GLbitfield > brw_blorp_framebuffer(struct brw_context *brw, > diff --git a/src/mesa/drivers/dri/i965/brw_draw.c > b/src/mesa/drivers/dri/i965/brw_draw.c > index afa8a4e..9d034cf 100644 > --- a/src/mesa/drivers/dri/i965/brw_draw.c > +++ b/src/mesa/drivers/dri/i965/brw_draw.c > @@ -153,7 +153,9 @@ trim(GLenum prim, GLuint length) > static void > brw_emit_prim(struct brw_context *brw, > const struct _mesa_prim *prim, > - uint32_t hw_prim) > + uint32_t hw_prim, > + struct brw_transform_feedback_object *xfb_obj, > + unsigned stream) > { > int verts_per_instance; > int vertex_access_type; > @@ -185,7 +187,7 @@ brw_emit_prim(struct brw_context *brw, > verts_per_instance = prim->count; > > /* If nothing to emit, just return. */ > - if (verts_per_instance == 0 && !prim->is_indirect) > + if (verts_per_instance == 0 && !prim->is_indirect && !xfb_obj) > return; > > /* If we're set to always flush, do it before and after the primitive > emit. > @@ -197,7 +199,25 @@ brw_emit_prim(struct brw_context *brw, > brw_emit_mi_flush(brw); > > /* If indirect, emit a bunch of loads from the indirect BO. */ > - if (prim->is_indirect) { > + if (xfb_obj) { > + indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE; > + > + brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT, > + xfb_obj->prim_count_bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + stream * sizeof(uint32_t)); > + BEGIN_BATCH(9); > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (9 - 2)); > + OUT_BATCH(GEN7_3DPRIM_INSTANCE_COUNT); > + OUT_BATCH(prim->num_instances); > + OUT_BATCH(GEN7_3DPRIM_START_VERTEX); > + OUT_BATCH(0); > + OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX); > + OUT_BATCH(0); > + OUT_BATCH(GEN7_3DPRIM_START_INSTANCE); > + OUT_BATCH(0); > + ADVANCE_BATCH(); > + } else if (prim->is_indirect) { > struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer; > drm_intel_bo *bo = intel_bufferobj_buffer(brw, > intel_buffer_object(indirect_buffer), > @@ -382,6 +402,8 @@ brw_try_draw_prims(struct gl_context *ctx, > const struct _mesa_index_buffer *ib, > GLuint min_index, > GLuint max_index, > + struct brw_transform_feedback_object *xfb_obj, > + unsigned stream, > struct gl_buffer_object *indirect) > { > struct brw_context *brw = brw_context(ctx); > @@ -531,7 +553,7 @@ retry: > brw_upload_render_state(brw); > } > > - brw_emit_prim(brw, &prims[i], brw->primitive); > + brw_emit_prim(brw, &prims[i], brw->primitive, xfb_obj, stream); > > brw->no_batch_wrap = false; > > @@ -573,14 +595,14 @@ brw_draw_prims(struct gl_context *ctx, > GLboolean index_bounds_valid, > GLuint min_index, > GLuint max_index, > - struct gl_transform_feedback_object *unused_tfb_object, > + struct gl_transform_feedback_object *gl_xfb_obj, > unsigned stream, > struct gl_buffer_object *indirect) > { > struct brw_context *brw = brw_context(ctx); > const struct gl_client_array **arrays = ctx->Array._DrawArrays; > - > - assert(unused_tfb_object == NULL); > + struct brw_transform_feedback_object *xfb_obj = > + (struct brw_transform_feedback_object *) gl_xfb_obj; > > if (!brw_check_conditional_render(brw)) > return; > @@ -619,7 +641,7 @@ brw_draw_prims(struct gl_context *ctx, > * to it. > */ > brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index, > - indirect); > + xfb_obj, stream, indirect); > } > > void > diff --git a/src/mesa/drivers/dri/i965/hsw_sol.c > b/src/mesa/drivers/dri/i965/hsw_sol.c > new file mode 100644 > index 0000000..ef8fcf4 > --- /dev/null > +++ b/src/mesa/drivers/dri/i965/hsw_sol.c > @@ -0,0 +1,263 @@ > +/* > + * Copyright © 2016 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS > + * IN THE SOFTWARE. > + */ > + > +/** > + * An implementation of the transform feedback driver hooks for Haswell > + * and later hardware. This uses MI_MATH to compute the number of vertices > + * written (for use by DrawTransformFeedback()) without any CPU<->GPU > + * synchronization which could stall. > + */ > + > +#include "brw_context.h" > +#include "brw_state.h" > +#include "brw_defines.h" > +#include "intel_batchbuffer.h" > +#include "intel_buffer_objects.h" > +#include "main/transformfeedback.h" > + > +/** > + * We store several values in obj->prim_count_bo: > + * > + * [4x 32-bit values]: Final Number of Vertices Written > + * [4x 32-bit values]: Tally of Primitives Written So Far > + * [4x 64-bit values]: Starting SO_NUM_PRIMS_WRITTEN Counter Snapshots > + * > + * The first set of values is used by DrawTransformFeedback(), which > + * copies one of them into the 3DPRIM_VERTEX_COUNT register and performs > + * an indirect draw. The other values are just temporary storage. > + */ > + > +#define TALLY_OFFSET (BRW_MAX_XFB_STREAMS * sizeof(uint32_t)) > +#define START_OFFSET (TALLY_OFFSET * 2) > + > +/** > + * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t > values) > + * to prim_count_bo. > + */ > +static void > +save_prim_start_values(struct brw_context *brw, > + struct brw_transform_feedback_object *obj) > +{ > + /* Flush any drawing so that the counters have the right values. */ > + brw_emit_mi_flush(brw); > + > + /* Emit MI_STORE_REGISTER_MEM commands to write the values. */ > + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { > + brw_store_register_mem64(brw, obj->prim_count_bo, > + GEN7_SO_NUM_PRIMS_WRITTEN(i), > + START_OFFSET + i * sizeof(uint64_t)); > + } > +} > + > +/** > + * Compute the number of primitives written during our most recent > + * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value > + * minus the stashed "start" value), and add it to our running tally. > + * > + * If \p finalize is true, also compute the number of vertices written > + * (by multiplying by the number of vertices per primitive), and store > + * that to the "final" location. > + * > + * Otherwise, just overwrite the old tally with the new one. > + */ > +static void > +tally_prims_written(struct brw_context *brw, > + struct brw_transform_feedback_object *obj, > + bool finalize) > +{ > + /* Flush any drawing so that the counters have the right values. */ > + brw_emit_mi_flush(brw); > + > + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { > + /* GPR0 = Tally */ > + brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0); > + brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo, > + I915_GEM_DOMAIN_INSTRUCTION, > + I915_GEM_DOMAIN_INSTRUCTION, > + TALLY_OFFSET + i * sizeof(uint32_t)); > + /* GPR1 = Start Snapshot */ > + brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo, > + I915_GEM_DOMAIN_INSTRUCTION, > + I915_GEM_DOMAIN_INSTRUCTION, > + START_OFFSET + i * sizeof(uint64_t)); > + /* GPR2 = Ending Snapshot */ > + brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i), > HSW_CS_GPR(2)); > + > + BEGIN_BATCH(9); > + OUT_BATCH(HSW_MI_MATH | (9 - 2)); > + /* GPR1 = GPR2 (End) - GPR1 (Start) */ > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1)); > + OUT_BATCH(MI_MATH_ALU0(SUB)); > + OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU)); > + /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */ > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1)); > + OUT_BATCH(MI_MATH_ALU0(ADD)); > + OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
I don't know if it would have worked (I wonder if STORE changes ACCU), but my idea for v1 was to add: OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU)); here, and then keep the loop from v1 except always load SRCB from R1. It doesn't seem to make a big difference either way. This way is actually clearer code. Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com> Did you get a chance to see if this made a difference in Manhattan? > + ADVANCE_BATCH(); > + > + if (!finalize) { > + /* Write back the new tally */ > + brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0), > + TALLY_OFFSET + i * sizeof(uint32_t)); > + } else { > + /* Convert the number of primitives to the number of vertices. */ > + if (obj->primitive_mode == GL_LINES) { > + /* Double R0 (R0 = R0 + R0) */ > + BEGIN_BATCH(5); > + OUT_BATCH(HSW_MI_MATH | (5 - 2)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0)); > + OUT_BATCH(MI_MATH_ALU0(ADD)); > + OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU)); > + ADVANCE_BATCH(); > + } else if (obj->primitive_mode == GL_TRIANGLES) { > + /* Triple R0 (R1 = R0 + R0, R0 = R0 + R1) */ > + BEGIN_BATCH(9); > + OUT_BATCH(HSW_MI_MATH | (9 - 2)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0)); > + OUT_BATCH(MI_MATH_ALU0(ADD)); > + OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0)); > + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1)); > + OUT_BATCH(MI_MATH_ALU0(ADD)); > + OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU)); > + ADVANCE_BATCH(); > + } > + /* Store it to the final result */ > + brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0), > + i * sizeof(uint32_t)); > + } > + } > +} > + > +/** > + * BeginTransformFeedback() driver hook. > + */ > +void > +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode, > + struct gl_transform_feedback_object *obj) > +{ > + struct brw_context *brw = brw_context(ctx); > + struct brw_transform_feedback_object *brw_obj = > + (struct brw_transform_feedback_object *) obj; > + > + brw_obj->primitive_mode = mode; > + > + /* Reset the SO buffer offsets to 0. */ > + if (brw->gen >= 8) { > + brw_obj->zero_offsets = true; > + } else { > + BEGIN_BATCH(1 + 2 * BRW_MAX_XFB_STREAMS); > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (1 + 2 * BRW_MAX_XFB_STREAMS - 2)); > + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { > + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i)); > + OUT_BATCH(0); > + } > + ADVANCE_BATCH(); > + } > + > + /* Zero out the initial tallies */ > + brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET, 0ull); > + brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET + 8, 0ull); > + > + /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */ > + save_prim_start_values(brw, brw_obj); > +} > + > +/** > + * PauseTransformFeedback() driver hook. > + */ > +void > +hsw_pause_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj) > +{ > + struct brw_context *brw = brw_context(ctx); > + struct brw_transform_feedback_object *brw_obj = > + (struct brw_transform_feedback_object *) obj; > + > + if (brw->is_haswell) { > + /* Save the SOL buffer offset register values. */ > + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { > + BEGIN_BATCH(3); > + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); > + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i)); > + OUT_RELOC(brw_obj->offset_bo, > + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, > + i * sizeof(uint32_t)); > + ADVANCE_BATCH(); > + } > + } > + > + /* Add any primitives written to our tally */ > + tally_prims_written(brw, brw_obj, false); > +} > + > +/** > + * ResumeTransformFeedback() driver hook. > + */ > +void > +hsw_resume_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj) > +{ > + struct brw_context *brw = brw_context(ctx); > + struct brw_transform_feedback_object *brw_obj = > + (struct brw_transform_feedback_object *) obj; > + > + if (brw->is_haswell) { > + /* Reload the SOL buffer offset registers. */ > + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) { > + BEGIN_BATCH(3); > + OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2)); > + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i)); > + OUT_RELOC(brw_obj->offset_bo, > + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, > + i * sizeof(uint32_t)); > + ADVANCE_BATCH(); > + } > + } > + > + /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */ > + save_prim_start_values(brw, brw_obj); > +} > + > +/** > + * EndTransformFeedback() driver hook. > + */ > +void > +hsw_end_transform_feedback(struct gl_context *ctx, > + struct gl_transform_feedback_object *obj) > +{ > + struct brw_context *brw = brw_context(ctx); > + struct brw_transform_feedback_object *brw_obj = > + (struct brw_transform_feedback_object *) obj; > + > + /* Add any primitives written to our tally, convert it from the number > + * of primitives written to the number of vertices written, and store > + * it in the "final" location in the buffer which DrawTransformFeedback() > + * will use as the vertex count. > + */ > + tally_prims_written(brw, brw_obj, true); > +} > -- > 2.8.2 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev