Thank you so much for reviewing these patches Curro! I'll make the suggested changes and resubmit.
On Thu, 5 Apr 2018, 19:25 Francisco Jerez, <curroje...@riseup.net> wrote: > Plamena Manolova <plamena.n.manol...@gmail.com> writes: > > > Adds suppport for ARB_fragment_shader_interlock. We achieve > > the interlock and fragment ordering by issuing a memory fence > > via sendc. > > > > Signed-off-by: Plamena Manolova <plamena.n.manol...@gmail.com> > > --- > > docs/features.txt | 2 +- > > docs/relnotes/18.1.0.html | 1 + > > src/intel/compiler/brw_eu.h | 3 ++- > > src/intel/compiler/brw_eu_defines.h | 2 ++ > > src/intel/compiler/brw_eu_emit.c | 7 ++++--- > > src/intel/compiler/brw_fs_generator.cpp | 7 ++++++- > > src/intel/compiler/brw_fs_nir.cpp | 15 +++++++++++++++ > > src/intel/compiler/brw_shader.cpp | 4 ++++ > > src/intel/compiler/brw_vec4_generator.cpp | 2 +- > > src/mesa/drivers/dri/i965/intel_extensions.c | 1 + > > 10 files changed, 37 insertions(+), 7 deletions(-) > > > > diff --git a/docs/features.txt b/docs/features.txt > > index 5eae34bf0d..a621251efd 100644 > > --- a/docs/features.txt > > +++ b/docs/features.txt > > @@ -297,7 +297,7 @@ Khronos, ARB, and OES extensions that are not part > of any OpenGL or OpenGL ES ve > > GL_ARB_cl_event not started > > GL_ARB_compute_variable_group_size DONE (nvc0, > radeonsi) > > GL_ARB_ES3_2_compatibility DONE > (i965/gen8+) > > - GL_ARB_fragment_shader_interlock not started > > + GL_ARB_fragment_shader_interlock DONE (i965) > > GL_ARB_gpu_shader_int64 DONE > (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe) > > GL_ARB_parallel_shader_compile not started, > but Chia-I Wu did some related work in 2014 > > GL_ARB_post_depth_coverage DONE (i965) > > diff --git a/docs/relnotes/18.1.0.html b/docs/relnotes/18.1.0.html > > index 1d5201717f..9d8e63855d 100644 > > --- a/docs/relnotes/18.1.0.html > > +++ b/docs/relnotes/18.1.0.html > > @@ -51,6 +51,7 @@ Note: some of the new features are only available with > certain drivers. > > <li>GL_EXT_shader_framebuffer_fetch on i965 on desktop GL (GLES was > already supported)</li> > > <li>GL_EXT_shader_framebuffer_fetch_non_coherent on i965</li> > > <li>Disk shader cache support for i965 enabled by default</li> > > +<li>GL_ARB_fragment_shader_interlock on i965</li> > > </ul> > > > > <h2>Bug fixes</h2> > > diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h > > index ca72666a55..b2c36d3ea1 100644 > > --- a/src/intel/compiler/brw_eu.h > > +++ b/src/intel/compiler/brw_eu.h > > @@ -509,7 +509,8 @@ brw_byte_scattered_write(struct brw_codegen *p, > > > > void > > brw_memory_fence(struct brw_codegen *p, > > - struct brw_reg dst); > > + struct brw_reg dst, > > + uint32_t send_op); > > > > The new argument should probably be of type "enum opcode" in order to > avoid losing type information. > > > void > > brw_pixel_interpolator_query(struct brw_codegen *p, > > diff --git a/src/intel/compiler/brw_eu_defines.h > b/src/intel/compiler/brw_eu_defines.h > > index 332d627bc3..2980e98a58 100644 > > --- a/src/intel/compiler/brw_eu_defines.h > > +++ b/src/intel/compiler/brw_eu_defines.h > > @@ -480,6 +480,8 @@ enum opcode { > > > > SHADER_OPCODE_GET_BUFFER_SIZE, > > > > + SHADER_OPCODE_INTERLOCK, > > + > > VEC4_OPCODE_MOV_BYTES, > > VEC4_OPCODE_PACK_BYTES, > > VEC4_OPCODE_UNPACK_UNIFORM, > > diff --git a/src/intel/compiler/brw_eu_emit.c > b/src/intel/compiler/brw_eu_emit.c > > index f039af56d0..6a57397a41 100644 > > --- a/src/intel/compiler/brw_eu_emit.c > > +++ b/src/intel/compiler/brw_eu_emit.c > > @@ -3285,7 +3285,8 @@ brw_set_memory_fence_message(struct brw_codegen *p, > > > > void > > brw_memory_fence(struct brw_codegen *p, > > - struct brw_reg dst) > > + struct brw_reg dst, > > + uint32_t send_op) > > { > > const struct gen_device_info *devinfo = p->devinfo; > > const bool commit_enable = > > @@ -3301,7 +3302,7 @@ brw_memory_fence(struct brw_codegen *p, > > /* Set dst as destination for dependency tracking, the MEMORY_FENCE > > * message doesn't write anything back. > > */ > > - insn = next_insn(p, BRW_OPCODE_SEND); > > + insn = next_insn(p, send_op); > > dst = retype(dst, BRW_REGISTER_TYPE_UW); > > brw_set_dest(p, insn, dst); > > brw_set_src0(p, insn, dst); > > @@ -3313,7 +3314,7 @@ brw_memory_fence(struct brw_codegen *p, > > * flush it too. Use a different register so both flushes can be > > * pipelined by the hardware. > > */ > > - insn = next_insn(p, BRW_OPCODE_SEND); > > + insn = next_insn(p, send_op); > > brw_set_dest(p, insn, offset(dst, 1)); > > brw_set_src0(p, insn, offset(dst, 1)); > > brw_set_memory_fence_message(p, insn, > GEN6_SFID_DATAPORT_RENDER_CACHE, > > diff --git a/src/intel/compiler/brw_fs_generator.cpp > b/src/intel/compiler/brw_fs_generator.cpp > > index 0c85eb8e1e..f099d092d1 100644 > > --- a/src/intel/compiler/brw_fs_generator.cpp > > +++ b/src/intel/compiler/brw_fs_generator.cpp > > @@ -2277,7 +2277,12 @@ fs_generator::generate_code(const cfg_t *cfg, int > dispatch_width) > > break; > > > > case SHADER_OPCODE_MEMORY_FENCE: > > - brw_memory_fence(p, dst); > > + brw_memory_fence(p, dst, BRW_OPCODE_SEND); > > + break; > > + > > + case SHADER_OPCODE_INTERLOCK: > > + /* The interlock is basically a memory fence issued via sendc > */ > > + brw_memory_fence(p, dst, BRW_OPCODE_SENDC); > > break; > > > > case SHADER_OPCODE_FIND_LIVE_CHANNEL: { > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > > index dbd2105f7e..c07201f889 100644 > > --- a/src/intel/compiler/brw_fs_nir.cpp > > +++ b/src/intel/compiler/brw_fs_nir.cpp > > @@ -4771,6 +4771,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder > &bld, nir_intrinsic_instr *instr > > break; > > } > > > > + case nir_intrinsic_begin_invocation_interlock_ARB: { > > + const fs_builder ubld = bld.group(8, 0); > > + const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); > > + > > + ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * > > + REG_SIZE; > > + > > + break; > > + } > > + > > + case nir_intrinsic_end_invocation_interlock_ARB: { > > + /* We don't need to do anything here */ > > + break; > > + } > > + > > default: > > unreachable("unknown intrinsic"); > > } > > diff --git a/src/intel/compiler/brw_shader.cpp > b/src/intel/compiler/brw_shader.cpp > > index ffe8a7403d..750ef1e886 100644 > > --- a/src/intel/compiler/brw_shader.cpp > > +++ b/src/intel/compiler/brw_shader.cpp > > @@ -292,6 +292,9 @@ brw_instruction_name(const struct gen_device_info > *devinfo, enum opcode op) > > return "typed_surface_write_logical"; > > case SHADER_OPCODE_MEMORY_FENCE: > > return "memory_fence"; > > + case SHADER_OPCODE_INTERLOCK: > > + /* For an interlock we actually issue a memory fence via sendc. */ > > + return "interlock"; > > > > case SHADER_OPCODE_BYTE_SCATTERED_READ: > > return "byte_scattered_read"; > > @@ -991,6 +994,7 @@ backend_instruction::has_side_effects() const > > case SHADER_OPCODE_TYPED_SURFACE_WRITE: > > case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: > > case SHADER_OPCODE_MEMORY_FENCE: > > + case SHADER_OPCODE_INTERLOCK: > > case SHADER_OPCODE_URB_WRITE_SIMD8: > > case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: > > case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: > > diff --git a/src/intel/compiler/brw_vec4_generator.cpp > b/src/intel/compiler/brw_vec4_generator.cpp > > index a3ed8609a2..d6a864ce62 100644 > > --- a/src/intel/compiler/brw_vec4_generator.cpp > > +++ b/src/intel/compiler/brw_vec4_generator.cpp > > @@ -1904,7 +1904,7 @@ generate_code(struct brw_codegen *p, > > break; > > > > case SHADER_OPCODE_MEMORY_FENCE: > > - brw_memory_fence(p, dst); > > + brw_memory_fence(p, dst, BRW_OPCODE_SEND); > > break; > > > > case SHADER_OPCODE_FIND_LIVE_CHANNEL: { > > diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c > b/src/mesa/drivers/dri/i965/intel_extensions.c > > index 73a6c73f53..f786e9917e 100644 > > --- a/src/mesa/drivers/dri/i965/intel_extensions.c > > +++ b/src/mesa/drivers/dri/i965/intel_extensions.c > > @@ -240,6 +240,7 @@ intelInitExtensions(struct gl_context *ctx) > > ctx->Extensions.ARB_transform_feedback2 = true; > > ctx->Extensions.ARB_transform_feedback3 = true; > > ctx->Extensions.ARB_transform_feedback_instanced = true; > > + ctx->Extensions.ARB_fragment_shader_interlock = true; > > > > I don't think there is any reason to make this conditional on > can_do_pipelined_register_writes(). The extension could technically be > implemented trivially all the way back to the original i965 (the > implementation for Gen4-5 would in fact be a no-op because the hardware > implicitly orders *all* fragment shader invocations with overlapping > fragments), but because it would be kind of difficult to take advantage > of the functionality on such hardware in any way it's probably more > reasonable to enable it on Gen7+ only for the moment. > > With these fixed patch is: > > Reviewed-by: Francisco Jerez <curroje...@riseup.net> > > > if (can_do_compute_dispatch(brw->screen) && > > ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) { > > -- > > 2.11.0 >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev