[Mesa-dev] [PATCH] ac/nir: fix txf_ms with an offset
Seems to fix some hair artifacts in Max Payne 3: https://github.com/daniel-schuermann/mesa/issues/76 Signed-off-by: Rhys Perry Fixes: f4e499ec791 ('radv: add initial non-conformant radv vulkan driver') --- src/amd/common/ac_nir_to_llvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 96bf89a8bf9..549a26ea243 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3784,7 +3784,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) goto write_result; } - if (args.offset && instr->op != nir_texop_txf) { + if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { LLVMValueRef offset[3], pack; for (unsigned chan = 0; chan < 3; ++chan) offset[chan] = ctx->ac.i32_0; @@ -3919,7 +3919,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) args.coords[sample_chan], fmask_ptr); } - if (args.offset && instr->op == nir_texop_txf) { + if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { int num_offsets = instr->src[offset_src].src.ssa->num_components; num_offsets = MIN2(num_offsets, instr->coord_components); for (unsigned i = 0; i < num_offsets; ++i) { -- 2.21.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent
The first and last hunks are needed to pass on the shader_info to the middle hunk, which needs it so that it can test if the compute shader has a derivative group. On Fri, 31 May 2019 at 18:38, Marek Olšák wrote: > > The first and last hunks look like they shouldn't be there. Other than that: > > Reviewed-by: Marek Olšák > > Marek > > On Fri, May 31, 2019 at 11:53 AM Rhys Perry wrote: >> >> Otherwise LLVM can sink them and their texture coordinate calculations >> into divergent branches. >> >> v2: simplify the conditions on which the intrinsic is marked as convergent >> v3: only mark as convergent in FS and CS with derivative groups >> >> Cc: >> Signed-off-by: Rhys Perry >> --- >> src/amd/common/ac_nir_to_llvm.c | 18 ++ >> 1 file changed, 18 insertions(+) >> >> diff --git a/src/amd/common/ac_nir_to_llvm.c >> b/src/amd/common/ac_nir_to_llvm.c >> index 265e3b636c4..9e9fade7227 100644 >> --- a/src/amd/common/ac_nir_to_llvm.c >> +++ b/src/amd/common/ac_nir_to_llvm.c >> @@ -38,6 +38,7 @@ struct ac_nir_context { >> struct ac_shader_abi *abi; >> >> gl_shader_stage stage; >> + shader_info *info; >> >> LLVMValueRef *ssa_defs; >> >> @@ -1394,6 +1395,22 @@ static LLVMValueRef build_tex_intrinsic(struct >> ac_nir_context *ctx, >> } >> >> args->attributes = AC_FUNC_ATTR_READNONE; >> + bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE && >> +ctx->info->cs.derivative_group != >> DERIVATIVE_GROUP_NONE; >> + if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) { >> + /* Prevent texture instructions with implicit derivatives >> from being >> +* sinked into branches. 
*/ >> + switch (instr->op) { >> + case nir_texop_tex: >> + case nir_texop_txb: >> + case nir_texop_lod: >> + args->attributes |= AC_FUNC_ATTR_CONVERGENT; >> + break; >> + default: >> + break; >> + } >> + } >> + >> return ac_build_image_opcode(&ctx->ac, args); >> } >> >> @@ -4350,6 +4367,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, >> struct ac_shader_abi *abi, >> ctx.abi = abi; >> >> ctx.stage = nir->info.stage; >> + ctx.info = &nir->info; >> >> ctx.main_function = >> LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); >> >> -- >> 2.21.0 >> ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent
Otherwise LLVM can sink them and their texture coordinate calculations into divergent branches. v2: simplify the conditions on which the intrinsic is marked as convergent v3: only mark as convergent in FS and CS with derivative groups Cc: Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 265e3b636c4..9e9fade7227 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -38,6 +38,7 @@ struct ac_nir_context { struct ac_shader_abi *abi; gl_shader_stage stage; + shader_info *info; LLVMValueRef *ssa_defs; @@ -1394,6 +1395,22 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, } args->attributes = AC_FUNC_ATTR_READNONE; + bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE && +ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE; + if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) { + /* Prevent texture instructions with implicit derivatives from being +* sinked into branches. */ + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + break; + default: + break; + } + } + return ac_build_image_opcode(&ctx->ac, args); } @@ -4350,6 +4367,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, ctx.abi = abi; ctx.stage = nir->info.stage; + ctx.info = &nir->info; ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); -- 2.21.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent
Otherwise LLVM can sink them and their texture coordinate calculations into divergent branches. v2: simplify the conditions on which the intrinsic is marked as convergent Cc: Signed-off-by: Rhys Perry Reviewed-By: Bas Nieuwenhuizen --- src/amd/common/ac_nir_to_llvm.c | 12 1 file changed, 12 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 265e3b636c4..b1a191ac24c 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1394,6 +1394,18 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, } args->attributes = AC_FUNC_ATTR_READNONE; + /* Prevent texture instructions with implicit derivatives from being +* sinked into branches. */ + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + break; + default: + break; + } + return ac_build_image_opcode(&ctx->ac, args); } -- 2.21.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent
Seems txf can(should?) have a lod supplied. txf_ms and tg4 always use the 0th level. I'll add txf, txf_ms and tg4 to the list of nir_texop which don't ever have implicit derivatives. On Thu, 30 May 2019 at 19:43, Ilia Mirkin wrote: > > txf supplies an lod, but tg4's is implicitly always 0. > > On Thu, May 30, 2019 at 2:26 PM Bas Nieuwenhuizen > wrote: > > > > On Thu, May 30, 2019 at 6:50 PM Rhys Perry wrote: > > > > > > Otherwise LLVM can sink them and their texture coordinate calculations > > > into divergent branches. > > > > > > Cc: > > > Signed-off-by: Rhys Perry > > > --- > > > src/amd/common/ac_nir_to_llvm.c | 29 + > > > 1 file changed, 29 insertions(+) > > > > > > diff --git a/src/amd/common/ac_nir_to_llvm.c > > > b/src/amd/common/ac_nir_to_llvm.c > > > index 265e3b636c4..d2dc617de36 100644 > > > --- a/src/amd/common/ac_nir_to_llvm.c > > > +++ b/src/amd/common/ac_nir_to_llvm.c > > > @@ -1316,6 +1316,30 @@ static nir_deref_instr > > > *get_tex_texture_deref(const nir_tex_instr *instr) > > > return texture_deref_instr; > > > } > > > > > > +static bool has_implicit_derivatives(const nir_tex_instr *instr) > > > +{ > > > + switch (instr->op) { > > > + case nir_texop_txs: > > > + case nir_texop_query_levels: > > > + case nir_texop_texture_samples: > > > + case nir_texop_samples_identical: > > > + return false; > > > + default: > > > + break; > > > + } > > > + for (unsigned i = 0; i < instr->num_srcs; i++) { > > > + switch (instr->src[i].src_type) { > > > + case nir_tex_src_lod: > > > + case nir_tex_src_ddx: > > > + case nir_tex_src_ddy: > > > + return false; > > > + default: > > > + break; > > > + } > > > + } > > > + return true; > > > +} > > > > txf, tg4 and friends do not provide any of lod/ddx/ddy do they? 
> > > > > + > > > static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, > > > const nir_tex_instr *instr, > > > struct ac_image_args *args) > > > @@ -1394,6 +1418,11 @@ static LLVMValueRef build_tex_intrinsic(struct > > > ac_nir_context *ctx, > > > } > > > > > > args->attributes = AC_FUNC_ATTR_READNONE; > > > + /* Prevent texture instructions with implicit derivatives from > > > being > > > +* sinked into branches. */ > > > + if (has_implicit_derivatives(instr)) > > > + args->attributes |= AC_FUNC_ATTR_CONVERGENT; > > > + > > > return ac_build_image_opcode(&ctx->ac, args); > > > } > > > > > > -- > > > 2.21.0 > > > > > ___ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent
Otherwise LLVM can sink them and their texture coordinate calculations into divergent branches. Cc: Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 29 + 1 file changed, 29 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 265e3b636c4..d2dc617de36 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1316,6 +1316,30 @@ static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr) return texture_deref_instr; } +static bool has_implicit_derivatives(const nir_tex_instr *instr) +{ + switch (instr->op) { + case nir_texop_txs: + case nir_texop_query_levels: + case nir_texop_texture_samples: + case nir_texop_samples_identical: + return false; + default: + break; + } + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_lod: + case nir_tex_src_ddx: + case nir_tex_src_ddy: + return false; + default: + break; + } + } + return true; +} + static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_tex_instr *instr, struct ac_image_args *args) @@ -1394,6 +1418,11 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, } args->attributes = AC_FUNC_ATTR_READNONE; + /* Prevent texture instructions with implicit derivatives from being +* sinked into branches. */ + if (has_implicit_derivatives(instr)) + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + return ac_build_image_opcode(&ctx->ac, args); } -- 2.21.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radv: fix some compiler warnings
Fixes -Woverflow warnings with GCC 9.1.1 Signed-off-by: Rhys Perry --- src/amd/vulkan/si_cmd_buffer.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index aae8d578c10..d87c00b94e9 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -1360,7 +1360,7 @@ void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples default: case 1: radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority_1x); + radeon_emit(cs, centroid_priority_1x & 0xffffffff); radeon_emit(cs, centroid_priority_1x >> 32); radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x); radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x); @@ -1369,7 +1369,7 @@ void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples break; case 2: radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority_2x); + radeon_emit(cs, centroid_priority_2x & 0xffffffff); radeon_emit(cs, centroid_priority_2x >> 32); radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x); radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x); @@ -1378,7 +1378,7 @@ void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples break; case 4: radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority_4x); + radeon_emit(cs, centroid_priority_4x & 0xffffffff); radeon_emit(cs, centroid_priority_4x >> 32); radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x); radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x); @@ -1387,7 +1387,7 @@ void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples break; case 8: radeon_set_context_reg_seq(cs, 
R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority_8x); + radeon_emit(cs, centroid_priority_8x & 0xffffffff); radeon_emit(cs, centroid_priority_8x >> 32); radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14); radeon_emit_array(cs, sample_locs_8x, 4); -- 2.21.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/2] radeonsi: use new atomic LLVM helpers
The patch this depends on, "ac,ac/nir: use a better sync scope for shared atomics", has been pushed: https://gitlab.freedesktop.org/mesa/mesa/commit/bd4c661ad08e772fdccb562ffbb2f45705c4fec8 On Fri, 26 Apr 2019 at 21:41, Marek Olšák wrote: > > From: Marek Olšák > > This depends on "ac,ac/nir: use a better sync scope for shared atomics" > --- > src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 12 > 1 file changed, 4 insertions(+), 8 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c > b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c > index eb90bfb10ff..5e540fc5098 100644 > --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c > +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c > @@ -776,38 +776,36 @@ static void store_emit( > emit_data->output[emit_data->chan] = > ac_build_image_opcode(&ctx->ac, &args); > } > } > > static void atomic_emit_memory(struct si_shader_context *ctx, > struct lp_build_emit_data *emit_data) { > LLVMBuilderRef builder = ctx->ac.builder; > const struct tgsi_full_instruction * inst = emit_data->inst; > LLVMValueRef ptr, result, arg; > + const char *sync_scope = HAVE_LLVM >= 0x0900 ? 
"workgroup-one-as" : > "workgroup"; > > ptr = get_memory_ptr(ctx, inst, ctx->i32, 1); > > arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0); > arg = ac_to_integer(&ctx->ac, arg); > > if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { > LLVMValueRef new_data; > new_data = lp_build_emit_fetch(&ctx->bld_base, >inst, 3, 0); > > new_data = ac_to_integer(&ctx->ac, new_data); > > - result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data, > - > LLVMAtomicOrderingSequentiallyConsistent, > - > LLVMAtomicOrderingSequentiallyConsistent, > - false); > - > + result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, > new_data, > + sync_scope); > result = LLVMBuildExtractValue(builder, result, 0, ""); > } else { > LLVMAtomicRMWBinOp op; > > switch(inst->Instruction.Opcode) { > case TGSI_OPCODE_ATOMUADD: > op = LLVMAtomicRMWBinOpAdd; > break; > case TGSI_OPCODE_ATOMXCHG: > op = LLVMAtomicRMWBinOpXchg; > @@ -830,23 +828,21 @@ static void atomic_emit_memory(struct si_shader_context > *ctx, > case TGSI_OPCODE_ATOMIMIN: > op = LLVMAtomicRMWBinOpMin; > break; > case TGSI_OPCODE_ATOMIMAX: > op = LLVMAtomicRMWBinOpMax; > break; > default: > unreachable("unknown atomic opcode"); > } > > - result = LLVMBuildAtomicRMW(builder, op, ptr, arg, > - > LLVMAtomicOrderingSequentiallyConsistent, > - false); > + result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, > sync_scope); > } > emit_data->output[emit_data->chan] = > LLVMBuildBitCast(builder, result, ctx->f32, ""); > } > > static void atomic_emit( > const struct lp_build_tgsi_action *action, > struct lp_build_tgsi_context *bld_base, > struct lp_build_emit_data *emit_data) > { > -- > 2.17.1 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3] radv: fix set_output_usage_mask() with composite and 64-bit types
It previously used var->type instead of deref_instr->type and didn't handle 64-bit outputs. This fixes lots of transform feedback CTS tests involving transform feedback and geometry shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*) v2: fix writemask widening when comp != 0 v3: fix 64-bit variables when comp != 0, again Signed-off-by: Rhys Perry Cc: 19.0 --- src/amd/vulkan/radv_shader_info.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 932a1852266..e771ad79878 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, } } +static uint32_t +widen_writemask(uint32_t wrmask) +{ + uint32_t new_wrmask = 0; + for(unsigned i = 0; i < 4; i++) + new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2); + return new_wrmask; +} + static void set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, uint8_t *output_usage_mask) @@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref_instr); - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); + unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false); unsigned idx = var->data.location; unsigned comp = var->data.location_frac; unsigned const_offset = 0; @@ -127,15 +136,19 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, get_deref_offset(deref_instr, &const_offset); if (var->data.compact) { + assert(!glsl_type_is_64bit(deref_instr->type)); const_offset += comp; output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4); return; } - for (unsigned i = 0; i < attrib_count; i++) { + uint32_t wrmask = nir_intrinsic_write_mask(instr); + if 
(glsl_type_is_64bit(deref_instr->type)) + wrmask = widen_writemask(wrmask); + + for (unsigned i = 0; i < attrib_count; i++) output_usage_mask[idx + i + const_offset] |= - instr->const_index[0] << comp; - } + ((wrmask >> (i * 4)) & 0xf) << comp; } static void -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2] radv: fix set_output_usage_mask() with composite and 64-bit types
It previously used var->type instead of deref_instr->type and didn't handle 64-bit outputs. This fixes lots of transform feedback CTS tests involving transform feedback and geometry shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*) v2: fix writemask widening when comp != 0 Signed-off-by: Rhys Perry Cc: 19.0 --- src/amd/vulkan/radv_shader_info.c | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 932a1852266..63ee25ab7c9 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, } } +static uint32_t +widen_writemask(uint32_t wrmask) +{ + uint32_t new_wrmask = 0; + for(unsigned i = 0; i < 4; i++) + new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2); + return new_wrmask; +} + static void set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, uint8_t *output_usage_mask) @@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref_instr); - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); + unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false); unsigned idx = var->data.location; unsigned comp = var->data.location_frac; unsigned const_offset = 0; @@ -127,15 +136,21 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, get_deref_offset(deref_instr, &const_offset); if (var->data.compact) { + assert(!glsl_type_is_64bit(deref_instr->type)); const_offset += comp; output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4); return; } - for (unsigned i = 0; i < attrib_count; i++) { + uint32_t wrmask = nir_intrinsic_write_mask(instr); + if (glsl_type_is_64bit(deref_instr->type)) + 
wrmask = widen_writemask(wrmask) << (comp * 2); + else + wrmask = wrmask << comp; + + for (unsigned i = 0; i < attrib_count; i++) output_usage_mask[idx + i + const_offset] |= - instr->const_index[0] << comp; - } + (wrmask >> (i * 4)) & 0xf; } static void -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radv: fix set_output_usage_mask() with composite and 64-bit types
It previously used var->type instead of deref_instr->type and didn't handle 64-bit outputs. This fixes lots of transform feedback CTS tests involving transform feedback and geometry shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*) Signed-off-by: Rhys Perry Cc: 19.0 --- src/amd/vulkan/radv_shader_info.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 932a1852266..a3bfc81808e 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, } } +static uint32_t +widen_writemask(uint32_t wrmask) +{ + uint32_t new_wrmask = 0; + for(unsigned i = 0; i < 4; i++) + new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2); + return new_wrmask; +} + static void set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, uint8_t *output_usage_mask) @@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref_instr); - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); + unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false); unsigned idx = var->data.location; unsigned comp = var->data.location_frac; unsigned const_offset = 0; @@ -127,15 +136,19 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, get_deref_offset(deref_instr, &const_offset); if (var->data.compact) { + assert(!glsl_type_is_64bit(deref_instr->type)); const_offset += comp; output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4); return; } - for (unsigned i = 0; i < attrib_count; i++) { + uint32_t wrmask = nir_intrinsic_write_mask(instr) << comp; + if (glsl_type_is_64bit(deref_instr->type)) + wrmask = widen_writemask(wrmask); + + 
for (unsigned i = 0; i < attrib_count; i++) output_usage_mask[idx + i + const_offset] |= - instr->const_index[0] << comp; - } + (wrmask >> (i * 4)) & 0xf; } static void -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics
https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed the meaning of the "system" sync scope, making it no longer restricted to the memory operation's address space. So a single address space sync scope is needed for shared atomic operations (such as "system-one-as" or "workgroup-one-as") otherwise buffer_wbinvl1 and s_waitcnt instructions can be created at each shared atomic operation. This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg to allow for more sync scopes and uses the new functions in ac->nir with the "workgroup-one-as" or "workgroup" sync scopes. F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%) Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%) RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%) RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%) RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%) Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%) Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%) Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.h| 10 +- src/amd/common/ac_llvm_helper.cpp | 59 +++ src/amd/common/ac_nir_to_llvm.c | 12 +++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index f4cee667153..98f856106d6 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -26,7 +26,7 @@ #define AC_LLVM_BUILD_H #include -#include +#include #include "compiler/nir/nir.h" #include "amd_family.h" @@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope); + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp index dcfb8008546..e5030c6f472 100644 --- a/src/amd/common/ac_llvm_helper.cpp +++ b/src/amd/common/ac_llvm_helper.cpp @@ -31,6 +31,7 @@ #include "ac_binary.h" #include "ac_llvm_util.h" +#include "ac_llvm_build.h" #include #include @@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm) { reinterpret_cast(tm)->setGlobalISel(true); } + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope) { + llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( + binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, +
[Mesa-dev] [PATCH] nir,ac/nir: fix cube_face_coord
Seems it was missing the "/ ma + 0.5" and the order was swapped. Fixes: a1a2a8dfda7b9cac7e ('nir: add AMD_gcn_shader extended instructions') Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 11 +-- src/compiler/nir/nir_opcodes.py | 21 +++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 0c8891d26a0..12c4c21a8d9 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1081,10 +1081,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) LLVMValueRef in[3]; for (unsigned chan = 0; chan < 3; chan++) in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", +ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); result = ac_build_gather_values(&ctx->ac, results, 2); break; } diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 90f7aed0c0d..0f56dd9596c 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -410,12 +410,21 @@ dst.x = dst.y = 0.0; float absX = fabs(src0.x); float absY = fabs(src0.y); float absZ = fabs(src0.z); -if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; } -if (src0.x < 0 && absX >= absY 
&& absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; } -if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; } -if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; } -if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; } -if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; } + +float ma = 0.0; +if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; } +if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; } +if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; } + +if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; } +if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; } +if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; } +if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; } +if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; } +if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; } + +dst.x = dst.x / ma + 0.5; +dst.y = dst.y / ma + 0.5; """) unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """ -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
The CTS is buggy because the input_output_float_64_to_16 tests are run even though they shouldn't be run because they try to use an unadvertised (and unimplemented) optional feature. Some of them crash for unrelated reasons though: load_tess_varyings() from ac_nir_to_llvm.c doesn't handle 64-bit varyings. So not all of them would work even if VK_FORMAT_R64_SFLOAT was an implemented vertex format. On Mon, 18 Feb 2019 at 08:53, Samuel Pitoiset wrote: > > > On 2/16/19 1:21 AM, Rhys Perry wrote: > > This series adds support for: > > - VK_KHR_shader_float16_int8 > > - VK_AMD_gpu_shader_half_float > > - VK_AMD_gpu_shader_int16 > > - VK_KHR_8bit_storage > > on VI+. Half floats are disabled on LLVM 7 because of a bug causing large > > memory usage and long (or unbounded) compilation times with some CTS > > tests. > > > > It is written against the following patch series: > > - https://patchwork.freedesktop.org/series/53454/ (v4) > > - https://patchwork.freedesktop.org/series/53660/ (v1) > > > > With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega > > and VI except for > > dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.* > > which fails or crashes because of unrelated radv bugs with 64-bit varyings > > and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even > > though radv does not support it. > > test bug? > > The two NIR related patches (22 and 25) should be sent separately, > otherwise people working on NIR might miss them. > > > > > With LLVM 9, there are no reproducible piglit regressions except for > > glsl-array-bounds-12.shader_test because of an LLVM bug when > > SLP vectorization is enabled. > > > > With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega > > and VI except for those with LLVM 9 and a couple of tests because of an > > LLVM bug after the SLP vectorizer and with the current lack of fallback > > for 16-bit interpolation on LLVM versions before LLVM 9. 
> > > > With LLVM 7, there are no reproducable Vulkan CTS regressions with Vega > > and VI except for those with LLVM 9 and a couple of tests because of a > > LLVM bug after the SLP vectorizer. > > > > The SLP vectorization patch is marked as WIP because it exposes LLVM bugs > > with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and > > some shader-db test for a game I can't remember. It also over-vectorizes > > 32-bit code which can cause significant worsening in generated code > > quality. > > > > The 16-bit interpolation patch is marked as WIP because it currently > > requires intrinsics only available in LLVM 9 and does not have a fallback. > > > > A branch on Github containing this series can be found at: > > https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2 > > > > v2: rebase > > v2: implement 16-bit interpolation > > v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass > > v2: run vectorization unconditionally on GFX9 and later > > v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() > > v2: remove ac_int_of_size() > > v2: fix 64-bit visit_load_var() > > v2: mark VK_KHR_8bit_storage as DONE in features.txt > > v2: mark SLP vectorization patch as WIP > > v2: fix C++ style comment > > > > Rhys Perry (41): > >radv: bitcast 16-bit outputs to integers > >radv: ensure export arguments are always float > >ac: add various helpers for float16/int16/int8 > >ac/nir: implement 8-bit push constant, ssbo and ubo loads > >ac/nir: implement 8-bit ssbo stores > >ac/nir: fix 16-bit ssbo stores > >ac/nir: implement 8-bit nir_load_const_instr > >ac/nir: implement 8-bit conversions > >ac/nir: fix 64-bit nir_op_f2f16_rtz > >ac/nir: make ac_build_clamp work on all bit sizes > >ac/nir: make ac_build_fract work on all bit sizes > >ac/nir: make ac_build_isign work on all bit sizes > >ac/nir: make ac_build_fsign work on all bit sizes > >ac/nir: make ac_build_fdiv support 16-bit floats > >ac/nir: implement 
half-float nir_op_frcp > >ac/nir: implement half-float nir_op_frsq > >ac/nir: implement half-float nir_op_ldexp > >radv: lower 16-bit flrp > >ac/nir: support half floats in emit_b2f > >ac/nir: make emit_b2i work on all bit sizes > >ac/nir: implement 16-bit shifts > >compiler/nir: add lowering option for 16-bit ffma > >ac/nir: implement 16-bit ac_build_ddxy > >ac/nir: implement 8 and 16 bit ac
Re: [Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores
I don't see a 16-bit version of tbuffer.store in IntrinsicsAMDGPU.td and simply changing "llvm.amdgcn.tbuffer.store.i32" to "llvm.amdgcn.tbuffer.store.i16" and removing the zext doesn't seem to work. On Mon, 18 Feb 2019 at 08:55, Samuel Pitoiset wrote: > > Does this fix anything now? There is a 16-bit version of tbuffer.store, > maybe we should use it? > > On 2/16/19 1:21 AM, Rhys Perry wrote: > > Signed-off-by: Rhys Perry > > --- > > src/amd/common/ac_nir_to_llvm.c | 2 ++ > > 1 file changed, 2 insertions(+) > > > > diff --git a/src/amd/common/ac_nir_to_llvm.c > > b/src/amd/common/ac_nir_to_llvm.c > > index 89a78b43c6f..b260142c177 100644 > > --- a/src/amd/common/ac_nir_to_llvm.c > > +++ b/src/amd/common/ac_nir_to_llvm.c > > @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context > > *ctx, > > } else if (num_bytes == 2) { > > store_name = "llvm.amdgcn.tbuffer.store.i32"; > > data_type = ctx->ac.i32; > > + data = LLVMBuildBitCast(ctx->ac.builder, data, > > ctx->ac.i16, ""); > > + data = LLVMBuildZExt(ctx->ac.builder, data, > > data_type, ""); > > LLVMValueRef tbuffer_params[] = { > > data, > > rsrc, ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 31/41] ac/nir: implement 16-bit pack/unpack opcodes
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 24 1 file changed, 24 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bad1c2a990e..f6ad1aa7e77 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1015,6 +1015,30 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_1, ""); + break; + } + case nir_op_cube_face_coord: { src[0] = ac_to_float(&ctx->ac, src[0]); LLVMValueRef results[2]; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 32/41] ac/nir: add 8-bit types to glsl_base_to_llvm_type
v2: remove 16-bit additions and rebase Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index f6ad1aa7e77..defbfdf4297 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3969,6 +3969,9 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac, case GLSL_TYPE_BOOL: case GLSL_TYPE_SUBROUTINE: return ac->i32; + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; case GLSL_TYPE_INT16: case GLSL_TYPE_UINT16: return ac->i16; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 25/41] nir: make bitfield_reverse and ifind_msb work with all integers
Signed-off-by: Rhys Perry --- src/compiler/nir/nir_opcodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index dc4cd9ac63d..0f40bd6c548 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -350,7 +350,7 @@ unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32") # Bit operations, part of ARB_gpu_shader5. -unop("bitfield_reverse", tuint32, """ +unop("bitfield_reverse", tuint, """ /* we're not winning any awards for speed here, but that's ok */ dst = 0; for (unsigned bit = 0; bit < 32; bit++) @@ -374,7 +374,7 @@ for (int bit = bit_size - 1; bit >= 0; bit--) { } """) -unop("ifind_msb", tint32, """ +unop_convert("ifind_msb", tint32, tint, """ dst = -1; for (int bit = 31; bit >= 0; bit--) { /* If src0 < 0, we're looking for the first 0 bit. -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 33/41] ac/nir, radv: create an array of varying output types
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 68 +++ src/amd/common/ac_shader_abi.h| 1 + src/amd/vulkan/radv_nir_to_llvm.c | 3 ++ 3 files changed, 72 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index defbfdf4297..5821c18aeb1 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -4238,6 +4238,68 @@ static void visit_cf_list(struct ac_nir_context *ctx, } } +static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool vs_in, +struct nir_variable *var, unsigned cur_offset, +const struct glsl_type *cur_type, +void (*cb)(struct ac_llvm_context *, unsigned, enum glsl_base_type, void *), +void *cbdata) +{ + if (glsl_type_is_struct(cur_type)) { + for (unsigned i = 0; i < glsl_get_length(cur_type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(cur_type, i); + cur_offset = traverse_var_component_slots(ctx, vs_in, var, cur_offset, ft, cb, cbdata); + } + return (cur_offset + 3) / 4 * 4; + } + + enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array_or_matrix(cur_type)); + + unsigned stride = glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); + if (!var->data.compact) + stride = (stride + 3) / 4 * 4; + unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1); + if (glsl_type_is_array(cur_type)) + arr_len *= glsl_get_aoa_size(cur_type); + for (unsigned i = 0; i < arr_len; i++) { + for (unsigned j = 0; j < glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) { + cb(ctx, cur_offset + var->data.location_frac + j, base_type, cbdata); + } + cur_offset += stride; + } + return cur_offset; +} + +static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, enum glsl_base_type base, void *output_types) +{ + LLVMTypeRef type; + switch (base) { + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + type = ctx->i8; + break; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + type = ctx->i16; + break; + 
case GLSL_TYPE_FLOAT16: + type = ctx->f16; + break; + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + type = ctx->i32; + break; + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: + default: + type = ctx->f32; + break; + } + ((LLVMTypeRef*)output_types)[index] = type; +} + void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi, @@ -4275,6 +4337,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, ac_build_alloca_undef(ctx, type, ""); } } + + traverse_var_component_slots(ctx, false, variable, output_loc * 4, +variable->type, &setup_output_type, abi->output_types); } static void @@ -4328,6 +4393,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++) + ctx.abi->output_types[i] = ac->i32; + nir_foreach_variable(variable, &nir->outputs) ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage); diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index ee18e6c1923..274deeb13a4 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -69,6 +69,7 @@ struct ac_shader_abi { LLVMValueRef view_index; LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; + LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4]; /* For VS and PS: pre-loaded shader inputs. * diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index d3795eec403..8fdaee72036 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -3910,6 +3910,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out; ac_setup_rings(&ctx); + for (unsigned i = 0; i < AC
[Mesa-dev] [PATCH v2 35/41] radv: store all fragment shader inputs as f32
v2: rebase Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 14 -- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 2002a744545..01b8b097ea1 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2056,7 +2056,6 @@ static void interp_fs_input(struct radv_shader_context *ctx, LLVMValueRef attr_number; unsigned chan; LLVMValueRef i, j; - bool interp = !LLVMIsUndef(interp_param); attr_number = LLVMConstInt(ctx->ac.i32, attr, false); @@ -2070,7 +2069,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, * fs.interp cannot be used on integers, because they can be equal * to NaN. */ - if (interp) { + if (interp_param) { interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, ""); @@ -2083,7 +2082,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp_param) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2095,7 +2094,6 @@ static void interp_fs_input(struct radv_shader_context *ctx, attr_number, prim_mask); result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), ""); } } } @@ -2123,10 +2121,6 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? 
ctx->ac.i16 : ctx->ac.i32; - if (interp == NULL) - interp = LLVMGetUndef(type); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; @@ -2187,7 +2181,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (ctx->shader_info->info.ps.uses_input_attachments || ctx->shader_info->info.needs_multiview_view_index) { ctx->input_mask |= 1ull << VARYING_SLOT_LAYER; - ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = LLVMGetUndef(ctx->ac.i32); + ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = NULL; } for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { @@ -2203,7 +2197,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, inputs); - if (LLVMIsUndef(interp_param)) + if (!interp_param) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; if (i >= VARYING_SLOT_VAR0) ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation
This patch can be ignored. I forgot to delete it and it ended up getting sent. "[PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation" is the correct one. On Sat, 16 Feb 2019 at 00:23, Rhys Perry wrote: > > v2: add to patch series > > Signed-off-by: Rhys Perry > --- > src/amd/common/ac_llvm_build.c | 33 +--- > src/amd/common/ac_llvm_build.h | 3 ++- > src/amd/common/ac_nir_to_llvm.c | 14 +++--- > src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- > src/amd/vulkan/radv_pipeline.c | 19 -- > src/amd/vulkan/radv_shader.h | 1 + > src/gallium/drivers/radeonsi/si_shader.c | 2 +- > 7 files changed, 69 insertions(+), 30 deletions(-) > > diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c > index dff369aae7f..be2c2251a21 100644 > --- a/src/amd/common/ac_llvm_build.c > +++ b/src/amd/common/ac_llvm_build.c > @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, >LLVMValueRef attr_number, >LLVMValueRef params, >LLVMValueRef i, > - LLVMValueRef j) > + LLVMValueRef j, > + int word) > { > - LLVMValueRef args[5]; > + LLVMValueRef args[6]; > LLVMValueRef p1; > > args[0] = i; > args[1] = llvm_chan; > args[2] = attr_number; > - args[3] = params; > - > - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", > - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); > + if (word >= 0) { > + args[3] = LLVMConstInt(ctx->i1, word, false); > + args[4] = params; > + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", > + ctx->f16, args, 5, > AC_FUNC_ATTR_READNONE); > + } else { > + args[3] = params; > + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", > + ctx->f32, args, 4, > AC_FUNC_ATTR_READNONE); > + } > > args[0] = p1; > args[1] = j; > args[2] = llvm_chan; > args[3] = attr_number; > - args[4] = params; > - > - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", > - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); > + if (word >= 0) { > + args[4] = LLVMConstInt(ctx->i1, word, false); > + args[5] = params; > + return ac_build_intrinsic(ctx, 
"llvm.amdgcn.interp.p2.f16", > + ctx->f16, args, 6, > AC_FUNC_ATTR_READNONE); > + } else { > + args[4] = params; > + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", > + ctx->f32, args, 5, > AC_FUNC_ATTR_READNONE); > + } > } > > LLVMValueRef > diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h > index 61c9b5e4b6c..655427567c4 100644 > --- a/src/amd/common/ac_llvm_build.h > +++ b/src/amd/common/ac_llvm_build.h > @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, >LLVMValueRef attr_number, >LLVMValueRef params, >LLVMValueRef i, > - LLVMValueRef j); > + LLVMValueRef j, > + int word); > > LLVMValueRef > ac_build_fs_interp_mov(struct ac_llvm_context *ctx, > diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c > index bf7024c68e4..939b8eb13de 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context > *ctx, > LLVMValueRef j = LLVMBuildExtractElement( > ctx->ac.builder, interp_param, > ctx->ac.i32_1, ""); > > + /* This fp16 handling isn't technically > correct > +* but should be correct for the attributes we > +* are actually going to use. */ > + bool fp16 = instr->dest.ssa.bit_size == 16; > + int word = fp16 ? 0 : -1; > v = ac_build_fs_interp(&ctx->ac, llvm_chan, > attr_number, > -
[Mesa-dev] [PATCH v2 34/41] ac/nir: store all outputs as f32
v2: rebase v2: fix 64-bit visit_load_var() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 14 ++ src/amd/vulkan/radv_nir_to_llvm.c | 22 +- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 5821c18aeb1..bf7024c68e4 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -2114,7 +2114,10 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, unreachable("unhandle variable mode"); } ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + if (instr->dest.ssa.bit_size == 16) + return ac_build_reinterpret(&ctx->ac, ret, get_def_type(ctx, &instr->dest.ssa)); + else + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } static void @@ -2152,6 +2155,11 @@ visit_store_var(struct ac_nir_context *ctx, writemask = writemask << comp; + LLVMTypeRef type = ctx->ac.f32; + if (LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMVectorTypeKind) + type = LLVMVectorType(ctx->ac.f32, LLVMGetVectorSize(LLVMTypeOf(src))); + src = ac_build_reinterpret(&ctx->ac, src, type); + switch (deref->mode) { case nir_var_shader_out: @@ -4329,12 +4337,10 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, } } - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? 
ctx->f16 : ctx->f32; for (unsigned i = 0; i < attrib_count; ++i) { for (unsigned chan = 0; chan < 4; chan++) { abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] = - ac_build_alloca_undef(ctx, type, ""); + ac_build_alloca_undef(ctx, ctx->f32, ""); } } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 8fdaee72036..2002a744545 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2305,6 +2305,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { + bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; @@ -2421,16 +2422,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, return; } - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) { - values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, ""); - args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, ""); - } - } else - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - - for (unsigned i = 0; i < 4; ++i) - args->out[i] = ac_to_float(&ctx->ac, args->out[i]); + for (unsigned chan = 0; chan < 4; chan++) + args->out[chan] = ac_build_reinterpret(&ctx->ac, values[chan], ctx->ac.f32); } static void @@ -3137,9 +3130,12 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) if (i < FRAG_RESULT_DATA0) continue; - for (unsigned j = 0; j < 4; j++) - values[j] = ac_to_float(&ctx->ac, - radv_load_output(ctx, i, j)); + for (unsigned j = 0; j < 4; j++) { + values[j] = radv_load_output(ctx, i, j); + unsigned index = ac_llvm_reg_index_soa(i, 0); + LLVMTypeRef new_type = ctx->abi.output_types[index]; + values[j] = ac_build_reinterpret(&ctx->ac, values[j], new_type); + } bool ret = 
si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation
v2: add to patch series Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +--- src/amd/common/ac_llvm_build.h | 3 ++- src/amd/common/ac_nir_to_llvm.c | 14 +++--- src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- src/amd/vulkan/radv_pipeline.c | 19 -- src/amd/vulkan/radv_shader.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 2 +- 7 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index dff369aae7f..be2c2251a21 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + int word) { - LLVMValueRef args[5]; + LLVMValueRef args[6]; LLVMValueRef p1; args[0] = i; args[1] = llvm_chan; args[2] = attr_number; - args[3] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[3] = LLVMConstInt(ctx->i1, word, false); + args[4] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f16, args, 5, AC_FUNC_ATTR_READNONE); + } else { + args[3] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + } args[0] = p1; args[1] = j; args[2] = llvm_chan; args[3] = attr_number; - args[4] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[4] = LLVMConstInt(ctx->i1, word, false); + args[5] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); + } else { + args[4] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + } } LLVMValueRef diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 61c9b5e4b6c..655427567c4 100644 --- 
a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + int word); LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bf7024c68e4..939b8eb13de 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef j = LLVMBuildExtractElement( ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + /* This fp16 handling isn't technically correct +* but should be correct for the attributes we +* are actually going to use. */ + bool fp16 = instr->dest.ssa.bit_size == 16; + int word = fp16 ? 0 : -1; v = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); + ctx->abi->prim_mask, i, j, word); + if (fp16) + v = ac_build_reinterpret(&ctx->ac, v, ctx->ac.f32); } else { v = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false), llvm_chan, attr_number, ctx->abi->prim_mask); @@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, attrib_idx, ""); } - retu
[Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation
v2: add to patch series Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +--- src/amd/common/ac_llvm_build.h | 3 ++- src/amd/common/ac_nir_to_llvm.c | 14 +++--- src/amd/vulkan/radv_nir_to_llvm.c| 27 ++- src/amd/vulkan/radv_pipeline.c | 19 -- src/amd/vulkan/radv_shader.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 2 +- 7 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index dff369aae7f..be2c2251a21 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + int word) { - LLVMValueRef args[5]; + LLVMValueRef args[6]; LLVMValueRef p1; args[0] = i; args[1] = llvm_chan; args[2] = attr_number; - args[3] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[3] = LLVMConstInt(ctx->i1, word, false); + args[4] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f16, args, 5, AC_FUNC_ATTR_READNONE); + } else { + args[3] = params; + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + } args[0] = p1; args[1] = j; args[2] = llvm_chan; args[3] = attr_number; - args[4] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + if (word >= 0) { + args[4] = LLVMConstInt(ctx->i1, word, false); + args[5] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); + } else { + args[4] = params; + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + } } LLVMValueRef diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 61c9b5e4b6c..655427567c4 100644 --- 
a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + int word); LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bf7024c68e4..939b8eb13de 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef j = LLVMBuildExtractElement( ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + /* This fp16 handling isn't technically correct +* but should be correct for the attributes we +* are actually going to use. */ + bool fp16 = instr->dest.ssa.bit_size == 16; + int word = fp16 ? 0 : -1; v = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); + ctx->abi->prim_mask, i, j, word); + if (fp16) + v = ac_build_reinterpret(&ctx->ac, v, ctx->ac.f32); } else { v = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, false), llvm_chan, attr_number, ctx->abi->prim_mask); @@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, attrib_idx, ""); } - retu
[Mesa-dev] [PATCH v2 38/41] WIP: ac, radv: run LLVM's SLP vectorizer
v2: rebase v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass v2: run unconditionally on GFX9 and later v2: mark as WIP because it can make 32-bit code much worse Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_util.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c index 69446863b95..8d78b5a850b 100644 --- a/src/amd/common/ac_llvm_util.c +++ b/src/amd/common/ac_llvm_util.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "c11/threads.h" #include "gallivm/lp_bld_misc.h" #include "util/u_math.h" @@ -175,7 +176,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, } static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, - bool check_ir) + bool check_ir, enum radeon_family family) { LLVMPassManagerRef passmgr = LLVMCreatePassManager(); if (!passmgr) @@ -203,6 +204,9 @@ static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_libr LLVMAddCFGSimplificationPass(passmgr); /* This is recommended by the instruction combining pass. */ LLVMAddEarlyCSEMemSSAPass(passmgr); + /* vectorization is disabled on pre-GFX9 because it's not very useful there */ + if (family >= CHIP_VEGA10) + LLVMAddSLPVectorizePass(passmgr); LLVMAddInstructionCombiningPass(passmgr); return passmgr; } @@ -327,7 +331,7 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, goto fail; compiler->passmgr = ac_create_passmgr(compiler->target_library_info, - tm_options & AC_TM_CHECK_IR); + tm_options & AC_TM_CHECK_IR, family); if (!compiler->passmgr) goto fail; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 39/41] ac/nir: generate better code for nir_op_f2f16_rtz
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 939b8eb13de..8bfc63958ca 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -889,7 +889,9 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + // generates better code than an extractelement with slp vectorization + result = LLVMBuildBitCast(ctx->ac.builder, result, ctx->ac.i32, ""); + result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, ""); break; case nir_op_f2f16_rtne: case nir_op_f2f16: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 41/41] radv, docs: expose float16, int16 and int8 features and extensions
v2: rebase v2: mark VK_KHR_8bit_storage as DONE in features.txt Signed-off-by: Rhys Perry --- docs/features.txt | 2 +- src/amd/vulkan/radv_device.c | 17 + src/amd/vulkan/radv_extensions.py | 4 src/amd/vulkan/radv_shader.c | 3 +++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/features.txt b/docs/features.txt index 6c2b6d59377..ded753b0182 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -439,7 +439,7 @@ Vulkan 1.1 -- all DONE: anv, radv VK_KHR_variable_pointers DONE (anv, radv) Khronos extensions that are not part of any Vulkan version: - VK_KHR_8bit_storage DONE (anv) + VK_KHR_8bit_storage DONE (anv, radv) VK_KHR_android_surfacenot started VK_KHR_create_renderpass2 DONE (anv, radv) VK_KHR_displayDONE (anv, radv) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 0fef92773e1..4137b778466 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -877,6 +877,23 @@ void radv_GetPhysicalDeviceFeatures2( features->bufferDeviceAddressMultiDevice = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceFloat16Int8FeaturesKHR *features = + (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->shaderFloat16 = enabled && HAVE_LLVM >= 0x0800; + features->shaderInt8 = enabled; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { + VkPhysicalDevice8BitStorageFeaturesKHR *features = + (VkPhysicalDevice8BitStorageFeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->storageBuffer8BitAccess = enabled; + features->uniformAndStorageBuffer8BitAccess = enabled; + features->storagePushConstant8 = enabled; + break; + } default: break; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index f218598f123..e38cfcfdcbe 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -91,6 
+91,8 @@ EXTENSIONS = [ Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'), Extension('VK_KHR_multiview', 1, True), Extension('VK_KHR_display', 23, 'VK_USE_PLATFORM_DISPLAY_KHR'), +Extension('VK_KHR_shader_float16_int8', 1, 'device->rad_info.chip_class >= VI'), +Extension('VK_KHR_8bit_storage', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_EXT_direct_mode_display', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_acquire_xlib_display', 1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'), Extension('VK_EXT_buffer_device_address', 1, True), @@ -121,6 +123,8 @@ EXTENSIONS = [ Extension('VK_AMD_shader_core_properties',1, True), Extension('VK_AMD_shader_info', 1, True), Extension('VK_AMD_shader_trinary_minmax', 1, True), +Extension('VK_AMD_gpu_shader_half_float', 1, 'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'), +Extension('VK_AMD_gpu_shader_int16', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_GOOGLE_decorate_string',1, True), Extension('VK_GOOGLE_hlsl_functionality1',1, True), ] diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index adba730ad8b..44dea8e7203 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -249,6 +249,9 @@ radv_shader_compile_to_nir(struct radv_device *device, .transform_feedback = true, .trinary_minmax = true, .variable_pointers = true, + .float16 = true, + .storage_8bit = true, + .int8 =
[Mesa-dev] [PATCH v2 40/41] ac/nir: have nir_op_f2f16 round to zero
In the hope that one day LLVM will then be able to generate code with vectorized v_cvt_pkrtz_f16_f32 instructions. Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 8bfc63958ca..7a5e95506f2 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; case nir_op_f2f16_rtz: + case nir_op_f2f16: src[0] = ac_to_float(&ctx->ac, src[0]); if (LLVMTypeOf(src[0]) == ctx->ac.f64) src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); @@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, ""); break; case nir_op_f2f16_rtne: - case nir_op_f2f16: case nir_op_f2f32: case nir_op_f2f64: src[0] = ac_to_float(&ctx->ac, src[0]); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 28/41] ac/nir: implement 8 and 16 bit ac_build_imsb
v2: fix C++ style comment Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 4 1 file changed, 4 insertions(+) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index ec87a7b9343..c986f800fa4 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1531,6 +1531,10 @@ ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) { + /* TODO: support 64-bit integers */ + if (LLVMTypeOf(arg) != ctx->i32) + arg = LLVMBuildSExt(ctx->builder, arg, ctx->i32, ""); + LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 10/41] ac/nir: make ac_build_clamp work on all bit sizes
v2: don't use ac_get_zerof() and ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index b53d9c7ff8c..667f9700764 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1597,16 +1597,20 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2, + return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2, + return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } @@ -1633,8 +1637,9 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) { - return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0), - ctx->f32_1); + LLVMTypeRef t = LLVMTypeOf(value); + return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), + LLVMConstReal(t, 1.0)); } void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 14/41] ac/nir: make ac_build_fdiv support 16-bit floats
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 23e454385d7..fb871a47400 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -661,7 +661,7 @@ ac_build_fdiv(struct ac_llvm_context *ctx, * If we do (num * (1 / den)), LLVM does: * return num * v_rcp_f32(den); */ - LLVMValueRef one = LLVMTypeOf(num) == ctx->f64 ? ctx->f64_1 : ctx->f32_1; + LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 21/41] ac/nir: implement 16-bit shifts
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 75bb19031bf..bad1c2a990e 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -672,20 +672,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_ishl: result = LLVMBuildShl(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ishr: result = LLVMBuildAShr(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ushr: result = LLVMBuildLShr(ctx->ac.builder, src[0], - LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""), + ac_build_ui_cast(&ctx->ac, src[1], LLVMTypeOf(src[0])), ""); break; case nir_op_ilt32: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 11/41] ac/nir: make ac_build_fract work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 667f9700764..db937eb66fb 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2049,16 +2049,9 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16) LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMTypeRef type; - char *intr; - - if (bitsize == 32) { - intr = "llvm.floor.f32"; - type = ctx->f32; - } else { - intr = "llvm.floor.f64"; - type = ctx->f64; - } + LLVMTypeRef type = ac_float_of_size(ctx, bitsize); + char intr[64]; + snprintf(intr, sizeof(intr), "llvm.floor.f%d", bitsize); LLVMValueRef params[] = { src0, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 16/41] ac/nir: implement half-float nir_op_frsq
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index cba0cec3e8f..8b0e07d2930 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -788,8 +788,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_frsq: result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]); - result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, - result); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result); break; case nir_op_frexp_exp: src[0] = ac_to_float(&ctx->ac, src[0]); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 15/41] ac/nir: implement half-float nir_op_frcp
v2: don't use ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 741059b5f1a..cba0cec3e8f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -657,8 +657,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_frcp: src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1, - src[0]); + result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]); break; case nir_op_iand: result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 18/41] radv: lower 16-bit flrp
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_shader.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 1dcb0606246..adba730ad8b 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -53,6 +53,7 @@ static const struct nir_shader_compiler_options nir_options = { .vertex_id_zero_based = true, .lower_scmp = true, + .lower_flrp16 = true, .lower_flrp32 = true, .lower_flrp64 = true, .lower_device_index_to_zero = true, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 17/41] ac/nir: implement half-float nir_op_ldexp
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 8b0e07d2930..0e5946dfdb3 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -829,8 +829,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_ldexp: src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 32) + if (ac_get_elem_bits(&ctx->ac, def_type) == 32) result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); + else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); else result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); break; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 26/41] ac/nir: make ac_find_lsb work on all bit sizes
v2: don't use ac_get_zero() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 ++--- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index aa92c55c822..61085db9320 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2474,30 +2474,11 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0) { - unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef zero; - - switch (src0_bitsize) { - case 64: - intrin_name = "llvm.cttz.i64"; - type = ctx->i64; - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.cttz.i32"; - type = ctx->i32; - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.cttz.i16"; - type = ctx->i16; - zero = ctx->i16_0; - break; - default: - unreachable(!"invalid bitsize"); - } + LLVMTypeRef type = LLVMTypeOf(src0); + unsigned src0_bitsize = ac_get_elem_bits(ctx, type); + char intrin_name[64]; + LLVMValueRef zero = LLVMConstInt(type, 0, false); + snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", src0_bitsize); LLVMValueRef params[2] = { src0, @@ -2518,9 +2499,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, params, 2, AC_FUNC_ATTR_READNONE); - if (src0_bitsize == 64) { - lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); - } + lsb = ac_build_ui_cast(ctx, lsb, ctx->i32); /* TODO: We need an intrinsic to skip this conditional. */ /* Check for zero: */ -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 12/41] ac/nir: make ac_build_isign work on all bit sizes
v2: don't use ac_get_zero(), ac_get_one() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 27 --- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index db937eb66fb..3b2257e8bf0 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2064,30 +2064,11 @@ LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val, zero, one; - LLVMTypeRef type; - - switch (bitsize) { - case 64: - type = ctx->i64; - zero = ctx->i64_0; - one = ctx->i64_1; - break; - case 32: - type = ctx->i32; - zero = ctx->i32_0; - one = ctx->i32_1; - break; - case 16: - type = ctx->i16; - zero = ctx->i16_0; - one = ctx->i16_1; - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + LLVMValueRef one = LLVMConstInt(type, 1, false); + LLVMValueRef cmp, val; cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 19/41] ac/nir: support half floats in emit_b2f
This seems to generate fine code, even though the IR is a bit ugly. Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 0e5946dfdb3..e459001c1cf 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -316,14 +316,20 @@ static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, unsigned bitsize) { LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, - LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), + LLVMBuildBitCast(ctx->builder, ctx->f32_1, ctx->i32, ""), ""); result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); - if (bitsize == 32) + switch (bitsize) { + case 16: + return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, ""); + case 32: return result; - - return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + case 64: + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + default: + unreachable("Unsupported bit size."); + } } static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 27/41] ac/nir: make ac_build_umsb work on all bit sizes
v2: don't use ac_get_zero() and ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 38 +++--- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 61085db9320..ec87a7b9343 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1555,36 +1555,12 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) { - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef highest_bit; - LLVMValueRef zero; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); - switch (bitsize) { - case 64: - intrin_name = "llvm.ctlz.i64"; - type = ctx->i64; - highest_bit = LLVMConstInt(ctx->i64, 63, false); - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.ctlz.i32"; - type = ctx->i32; - highest_bit = LLVMConstInt(ctx->i32, 31, false); - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.ctlz.i16"; - type = ctx->i16; - highest_bit = LLVMConstInt(ctx->i16, 15, false); - zero = ctx->i16_0; - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMTypeRef type = LLVMTypeOf(arg); + unsigned bitsize = ac_get_elem_bits(ctx, type); + LLVMValueRef highest_bit = LLVMConstInt(type, bitsize - 1, false); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + char intrin_name[64]; + snprintf(intrin_name, sizeof(intrin_name), "llvm.ctlz.i%d", bitsize); LLVMValueRef params[2] = { arg, @@ -1598,7 +1574,7 @@ ac_build_umsb(struct ac_llvm_context *ctx, /* The HW returns the last bit index from MSB, but TGSI/NIR wants * the index from LSB. Invert it by doing "31 - msb". 
*/ msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); - msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, ""); + msb = ac_build_ui_cast(ctx, msb, dst_type); /* check for zero */ return LLVMBuildSelect(ctx->builder, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 24/41] ac/nir: implement 8 and 16 bit ac_build_readlane
v2: don't use ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 71eaac4b7bd..aa92c55c822 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2868,9 +2868,15 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la { LLVMTypeRef src_type = LLVMTypeOf(src); src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + unsigned src_bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + unsigned bits = src_bits; LLVMValueRef ret; + if (bits < 32) { + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + bits = 32; + } + if (bits == 32) { ret = _ac_build_readlane(ctx, src, lane); } else { @@ -2887,6 +2893,10 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la LLVMConstInt(ctx->i32, i, 0), ""); } } + + if (src_bits < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, LLVMIntTypeInContext(ctx->context, src_bits), ""); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 30/41] ac/nir: make ac_build_bitfield_reverse work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 26 ++ 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 46738faea9d..dff369aae7f 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2100,28 +2100,14 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - switch (bitsize) { - case 32: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - default: - unreachable(!"invalid bitsize"); - break; - } + char name[64]; + snprintf(name, sizeof(name), "llvm.bitreverse.i%d", bitsize); - return result; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(src0), + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); } #define AC_EXP_TARGET 0 -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 23/41] ac/nir: implement 16-bit ac_build_ddxy
v2: rebase Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index fb871a47400..71eaac4b7bd 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -1481,6 +1481,11 @@ ac_build_ddxy(struct ac_llvm_context *ctx, LLVMValueRef tl, trbl; LLVMValueRef result; + int size = ac_get_type_size(LLVMTypeOf(val)); + + if (size == 2) + val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + for (unsigned i = 0; i < 4; ++i) { tl_lanes[i] = i & mask; trbl_lanes[i] = (i & mask) + idx; @@ -1493,12 +1498,19 @@ ac_build_ddxy(struct ac_llvm_context *ctx, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]); - tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); - trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, ""); + if (size == 2) { + tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); + trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); + } + + LLVMTypeRef type = ac_float_of_size(ctx, size * 8); + tl = LLVMBuildBitCast(ctx->builder, tl, type, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, type, ""); result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); - result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, - &result, 1, 0); + result = ac_build_intrinsic(ctx, + LLVMTypeOf(val) == ctx->f32 ? "llvm.amdgcn.wqm.f32" : "llvm.amdgcn.wqm.f16", type, + &result, 1, 0); return result; } -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 29/41] ac/nir: make ac_build_bit_count work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 33 +++-- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index c986f800fa4..46738faea9d 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2085,35 +2085,16 @@ LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - unsigned bitsize; + unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + char name[64]; + snprintf(name, sizeof(name), "llvm.ctpop.i%d", bitsize); - switch (bitsize) { - case 64: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - default: - unreachable(!"invalid bitsize"); - break; - } + LLVMValueRef result = ac_build_intrinsic(ctx, name, LLVMTypeOf(src0), +(LLVMValueRef []) { src0 }, 1, +AC_FUNC_ATTR_READNONE); - return result; + return ac_build_ui_cast(ctx, result, ctx->i32); } LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 36/41] radv: handle all fragment output types
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 55 --- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 01b8b097ea1..c46eabf3656 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2297,9 +2297,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (!values) return; - bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { - bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; @@ -2310,6 +2308,28 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, bool hi) = NULL; + if (LLVMTypeOf(values[0]) == ctx->ac.f16 && + col_format != V_028714_SPI_SHADER_FP16_ABGR) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildFPExt(ctx->ac.builder, + values[chan], + ctx->ac.f32, ""); + } + + if (LLVMTypeOf(values[0]) == ctx->ac.i16 || LLVMTypeOf(values[0]) == ctx->ac.i8) { + if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildSExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } else { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildZExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } + } + switch(col_format) { case V_028714_SPI_SHADER_ZERO: args->enabled_channels = 0; /* writemask */ @@ -2335,12 +2355,16 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_FP16_ABGR: args->enabled_channels = 0x5; - packf = ac_build_cvt_pkrtz_f16; - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = 
LLVMBuildFPExt(ctx->ac.builder, - values[chan], - ctx->ac.f32, ""); + if (LLVMTypeOf(values[0]) == ctx->ac.f16) { + packi = ac_build_cvt_pk_u16; + for (unsigned chan = 0; chan < 4; chan++) { + values[chan] = ac_to_integer(&ctx->ac, values[chan]); + values[chan] = LLVMBuildZExt(ctx->ac.builder, + values[chan], + ctx->ac.i32, ""); + } + } else { + packf = ac_build_cvt_pkrtz_f16; } break; @@ -2357,23 +2381,11 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_UINT16_ABGR: args->enabled_channels = 0x5; packi = ac_build_cvt_pk_u16; - if (is_16bit) { - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = LLVMBuildZExt(ctx->ac.builder, - ac_to_integer(&ctx->ac, values[chan]), - ctx->ac.i32, ""); - } break; case V_028714_SPI_SHADER_SIN
[Mesa-dev] [PATCH v2 22/41] compiler/nir: add lowering option for 16-bit ffma
The lowering needs to be disabled for sufficient precision to pass deqp-vk's 16-bit fma test on radv. Signed-off-by: Rhys Perry --- src/broadcom/compiler/nir_to_vir.c| 1 + src/compiler/nir/nir.h| 1 + src/compiler/nir/nir_opt_algebraic.py | 4 +++- src/gallium/drivers/radeonsi/si_get.c | 1 + src/gallium/drivers/vc4/vc4_program.c | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index d983f91e718..6c0a623096a 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -2471,6 +2471,7 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_fdiv = true, .lower_find_lsb = true, .lower_ffma = true, +.lower_ffma16 = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 740c64d2a94..8df275f4aa3 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2111,6 +2111,7 @@ typedef struct nir_function { typedef struct nir_shader_compiler_options { bool lower_fdiv; + bool lower_ffma16; bool lower_ffma; bool fuse_ffma; bool lower_flrp16; diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 71c626e1b3f..63dff878d35 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -136,7 +136,9 @@ optimizations = [ (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a, ('bcsel', c, b, a), 'options->lower_flrp32'), (('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', a, b, c), '!options->lower_flrp32'), (('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', a, b, c), '!options->lower_flrp64'), - (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), + (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'), + (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), + (('ffma@64', a, b, c), ('fadd', ('fmul', a, 
b), c), 'options->lower_ffma'), (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'), (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)), diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index f8ca02d4fcf..5bf107ef6fe 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -491,6 +491,7 @@ static const struct nir_shader_compiler_options nir_options = { .lower_fdiv = true, .lower_sub = true, .lower_ffma = true, + .lower_ffma16 = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, .lower_pack_unorm_2x16 = true, diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 2d0a52bb5fb..8be258cbba4 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2234,6 +2234,7 @@ static const nir_shader_compiler_options nir_options = { .lower_extract_word = true, .lower_fdiv = true, .lower_ffma = true, +.lower_ffma16 = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 20/41] ac/nir: make emit_b2i work on all bit sizes
v2: don't use ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index e459001c1cf..75bb19031bf 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -347,11 +347,7 @@ static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, unsigned bitsize) { LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); - - if (bitsize == 32) - return result; - - return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); + return ac_build_ui_cast(ctx, result, LLVMIntTypeInContext(ctx->context, bitsize)); } static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 05/41] ac/nir: implement 8-bit ssbo stores
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 17d952d1ae8..89a78b43c6f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1524,7 +1524,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, get_src(ctx, instr->src[1]), true); - LLVMValueRef base_data = ac_to_float(&ctx->ac, src_data); + LLVMValueRef base_data = src_data; base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); LLVMValueRef base_offset = get_src(ctx, instr->src[2]); @@ -1565,7 +1565,25 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, offset = LLVMBuildAdd(ctx->ac.builder, base_offset, LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); } - if (num_bytes == 2) { + if (num_bytes == 1) { + store_name = "llvm.amdgcn.tbuffer.store.i32"; + data_type = ctx->ac.i32; + data = LLVMBuildZExt(ctx->ac.builder, data, data_type, ""); + LLVMValueRef tbuffer_params[] = { + data, + rsrc, + ctx->ac.i32_0, /* vindex */ + offset,/* voffset */ + ctx->ac.i32_0, + ctx->ac.i32_0, + LLVMConstInt(ctx->ac.i32, 1, false), // dfmt (= 8bit) + LLVMConstInt(ctx->ac.i32, 4, false), // nfmt (= uint) + glc, + ctx->ac.i1false, + }; + ac_build_intrinsic(&ctx->ac, store_name, + ctx->ac.voidt, tbuffer_params, 10, 0); + } else if (num_bytes == 2) { store_name = "llvm.amdgcn.tbuffer.store.i32"; data_type = ctx->ac.i32; LLVMValueRef tbuffer_params[] = { -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 03/41] ac: add various helpers for float16/int16/int8
v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() v2: remove ac_int_of_size() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 55 ++--- src/amd/common/ac_llvm_build.h | 15 +++-- src/amd/common/ac_nir_to_llvm.c | 30 +- 3 files changed, 79 insertions(+), 21 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 9395bd1bbda..b53d9c7ff8c 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); @@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type) static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) { - if (t == ctx->f16 || t == ctx->i16) + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->f16 || t == ctx->i16) return ctx->i16; else if (t == ctx->f32 || t == ctx->i32) return ctx->i32; @@ -281,6 +287,42 @@ ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); } +LLVMTypeRef ac_float_of_size(struct ac_llvm_context *ctx, unsigned bit_size) +{ + switch (bit_size) { + case 16: + return ctx->f16; + case 32: + return ctx->f32; + case 64: + return ctx->f64; + default: + unreachable("Unhandled bit size"); + } +} + +LLVMValueRef 
ac_build_ui_cast(struct ac_llvm_context *ctx, LLVMValueRef v, LLVMTypeRef t) +{ + unsigned new_bit_size = ac_get_elem_bits(ctx, t); + unsigned old_bit_size = ac_get_elem_bits(ctx, LLVMTypeOf(v)); + if (new_bit_size > old_bit_size) + return LLVMBuildZExt(ctx->builder, v, t, ""); + else if (new_bit_size < old_bit_size) + return LLVMBuildTrunc(ctx->builder, v, t, ""); + else + return v; +} + +LLVMValueRef ac_build_reinterpret(struct ac_llvm_context *ctx, LLVMValueRef v, LLVMTypeRef t) +{ + if (LLVMTypeOf(v) == t) + return v; + + v = ac_to_integer(ctx, v); + v = ac_build_ui_cast(ctx, v, ac_to_integer_type(ctx, t)); + return LLVMBuildBitCast(ctx->builder, v, t, ""); +} + LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, @@ -1338,15 +1380,18 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx, } LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, +ac_build_tbuffer_load_short_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset, - LLVMValueRef glc) + LLVMValueRef glc, + unsigned size) { + assert(size == 1 || size == 2); const char *name = "llvm.amdgcn.tbuffer.load.i32"; + int data_format = size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : V_008F0C_BUF_DATA_FORMAT_16; LLVMTypeRef type = ctx->i32; LLVMValueRef params[] = { rsrc, @@ -1354,13 +1399,13 @@ ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, voffset, soffset, immoffset, - LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false), + LLVMConstInt(ctx->i32, data_format, false), LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false), glc, ctx->i1false, }; LLVMValueRef res = ac_build
[Mesa-dev] [PATCH v2 09/41] ac/nir: fix 64-bit nir_op_f2f16_rtz
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 691d444db05..741059b5f1a 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -886,6 +886,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; case nir_op_f2f16_rtz: src[0] = ac_to_float(&ctx->ac, src[0]); + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 02/41] radv: ensure export arguments are always float
So that the signature is correct and consistent, the inputs to an export intrinsic should always be 32-bit floats. This and the previous commit fix a large number of crashes from dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_* tests. Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index a8268c44ecf..d3795eec403 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2429,12 +2429,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, } else memcpy(&args->out[0], values, sizeof(values[0]) * 4); - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; - + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 07/41] ac/nir: implement 8-bit nir_load_const_instr
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 4 1 file changed, 4 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index b260142c177..f39232b91a1 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1114,6 +1114,10 @@ static void visit_load_const(struct ac_nir_context *ctx, for (unsigned i = 0; i < instr->def.num_components; ++i) { switch (instr->def.bit_size) { + case 8: + values[i] = LLVMConstInt(element_type, +instr->value.u8[i], false); + break; case 16: values[i] = LLVMConstInt(element_type, instr->value.u16[i], false); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 13/41] ac/nir: make ac_build_fsign work on all bit sizes
v2: don't use ac_get_zerof() and ac_get_onef() Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 16 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 3b2257e8bf0..23e454385d7 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2079,19 +2079,11 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef cmp, val, zero, one; - LLVMTypeRef type; - - if (bitsize == 32) { - type = ctx->f32; - zero = ctx->f32_0; - one = ctx->f32_1; - } else { - type = ctx->f64; - zero = ctx->f64_0; - one = ctx->f64_1; - } + LLVMTypeRef type = ac_float_of_size(ctx, bitsize); + LLVMValueRef zero = LLVMConstReal(type, 0.0); + LLVMValueRef one = LLVMConstReal(type, 1.0); + LLVMValueRef cmp, val; cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 08/41] ac/nir: implement 8-bit conversions
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index f39232b91a1..691d444db05 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -858,12 +858,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) src[i] = ac_to_integer(&ctx->ac, src[i]); result = ac_build_gather_values(&ctx->ac, src, num_components); break; + case nir_op_f2i8: case nir_op_f2i16: case nir_op_f2i32: case nir_op_f2i64: src[0] = ac_to_float(&ctx->ac, src[0]); result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); break; + case nir_op_f2u8: case nir_op_f2u16: case nir_op_f2u32: case nir_op_f2u64: @@ -898,15 +900,14 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) else result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; + case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: src[0] = ac_to_integer(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + result = ac_build_ui_cast(&ctx->ac, src[0], def_type); break; + case nir_op_i2i8: case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 04/41] ac/nir: implement 8-bit push constant, ssbo and ubo loads
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 37 +++-- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bed52490bad..17d952d1ae8 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1399,7 +1399,30 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr); - if (instr->dest.ssa.bit_size == 16) { + if (instr->dest.ssa.bit_size == 8) { + unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + + LLVMValueRef params[3]; + if (load_dwords > 1) { + LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), ""); + params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), ""); + params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), ""); + } else { + res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); + params[0] = ctx->ac.i32_0; + params[1] = res; + } + params[2] = addr; + res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); + + res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); + if (instr->dest.ssa.num_components > 1) + res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), ""); + return res; + } else if (instr->dest.ssa.bit_size == 16) { unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords); ptr = ac_cast_ptr(&ctx->ac, ptr, 
vec_type); @@ -1676,7 +1699,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); LLVMValueRef ret; - if (load_bytes == 2) { + if (load_bytes <= 2) { ret = ac_build_tbuffer_load_short_byte(&ctx->ac, rsrc, vindex, @@ -1684,7 +1707,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, ctx->ac.i32_0, immoffset, glc, - 2); + load_bytes); } else { const char *load_name; LLVMTypeRef data_type; @@ -1700,6 +1723,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, data_type = ctx->ac.v2f32; break; case 4: + case 3: load_name = "llvm.amdgcn.buffer.load.f32"; data_type = ctx->ac.f32; break; @@ -1746,7 +1770,8 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, if (instr->dest.ssa.bit_size == 64) num_components *= 2; - if (instr->dest.ssa.bit_size == 16) { + if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { + unsigned size = instr->dest.ssa.bit_size / 8; LLVMValueRef results[num_components]; for (unsigned i = 0; i < num_components; ++i) { results[i] = ac_build_tbuffer_load_short_byte(&ctx->ac, @@ -1754,9 +1779,9 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, ctx->ac.i32_0,
[Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 89a78b43c6f..b260142c177 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, } else if (num_bytes == 2) { store_name = "llvm.amdgcn.tbuffer.store.i32"; data_type = ctx->ac.i32; + data = LLVMBuildBitCast(ctx->ac.builder, data, ctx->ac.i16, ""); + data = LLVMBuildZExt(ctx->ac.builder, data, data_type, ""); LLVMValueRef tbuffer_params[] = { data, rsrc, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 01/41] radv: bitcast 16-bit outputs to integers
16-bit outputs are stored as 16-bit floats in the outputs array, so they have to be bitcast. Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_nir_to_llvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 7f74678d5f1..a8268c44ecf 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2365,7 +2365,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildZExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2376,7 +2376,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildSExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
This series adds support for: - VK_KHR_shader_float16_int8 - VK_AMD_gpu_shader_half_float - VK_AMD_gpu_shader_int16 - VK_KHR_8bit_storage on VI+. Half floats are disabled on LLVM 7 because of a bug causing large memory usage and long (or unbounded) compilation times with some CTS tests. It is written against the following patch series: - https://patchwork.freedesktop.org/series/53454/ (v4) - https://patchwork.freedesktop.org/series/53660/ (v1) With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega and VI except for dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.* which fail or crash because of unrelated radv bugs with 64-bit varyings and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even though radv does not support it. With LLVM 9, there are no reproducible piglit regressions except for glsl-array-bounds-12.shader_test because of an LLVM bug when SLP vectorization is enabled. With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega and VI except for those with LLVM 9 and a couple of tests because of an LLVM bug after the SLP vectorizer and with the current lack of fallback for 16-bit interpolation on LLVM versions before LLVM 9. With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega and VI except for those with LLVM 9 and a couple of tests because of an LLVM bug after the SLP vectorizer. The SLP vectorization patch is marked as WIP because it exposes LLVM bugs with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and a shader-db test for a game I can't remember. It also over-vectorizes 32-bit code which can cause significant worsening in generated code quality. The 16-bit interpolation patch is marked as WIP because it currently requires intrinsics only available in LLVM 9 and does not have a fallback.
A branch on Github containing this series can be found at: https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2 v2: rebase v2: implement 16-bit interpolation v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass v2: run vectorization unconditionally on GFX9 and later v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof() v2: remove ac_int_of_size() v2: fix 64-bit visit_load_var() v2: mark VK_KHR_8bit_storage as DONE in features.txt v2: mark SLP vectorization patch as WIP v2: fix C++ style comment Rhys Perry (41): radv: bitcast 16-bit outputs to integers radv: ensure export arguments are always float ac: add various helpers for float16/int16/int8 ac/nir: implement 8-bit push constant, ssbo and ubo loads ac/nir: implement 8-bit ssbo stores ac/nir: fix 16-bit ssbo stores ac/nir: implement 8-bit nir_load_const_instr ac/nir: implement 8-bit conversions ac/nir: fix 64-bit nir_op_f2f16_rtz ac/nir: make ac_build_clamp work on all bit sizes ac/nir: make ac_build_fract work on all bit sizes ac/nir: make ac_build_isign work on all bit sizes ac/nir: make ac_build_fsign work on all bit sizes ac/nir: make ac_build_fdiv support 16-bit floats ac/nir: implement half-float nir_op_frcp ac/nir: implement half-float nir_op_frsq ac/nir: implement half-float nir_op_ldexp radv: lower 16-bit flrp ac/nir: support half floats in emit_b2f ac/nir: make emit_b2i work on all bit sizes ac/nir: implement 16-bit shifts compiler/nir: add lowering option for 16-bit ffma ac/nir: implement 16-bit ac_build_ddxy ac/nir: implement 8 and 16 bit ac_build_readlane nir: make bitfield_reverse and ifind_msb work with all integers ac/nir: make ac_find_lsb work on all bit sizes ac/nir: make ac_build_umsb work on all bit sizes ac/nir: implement 8 and 16 bit ac_build_imsb ac/nir: make ac_build_bit_count work on all bit sizes ac/nir: make ac_build_bitfield_reverse work on all bit sizes ac/nir: implement 16-bit pack/unpack opcodes ac/nir: add 8-bit types to 
glsl_base_to_llvm_type ac/nir,radv: create an array of varying output types ac/nir: store all outputs as f32 radv: store all fragment shader inputs as f32 radv: handle all fragment output types WIP: radv,ac: implement 16-bit interpolation WIP: ac,radv: run LLVM's SLP vectorizer ac/nir: generate better code for nir_op_f2f16_rtz ac/nir: have nir_op_f2f16 round to zero radv,docs: expose float16, int16 and int8 features and extensions docs/features.txt| 2 +- src/amd/common/ac_llvm_build.c | 325 +++ src/amd/common/ac_llvm_build.h | 18 +- src/amd/common/ac_llvm_util.c| 8 +- src/amd/common/ac_nir_to_llvm.c | 268 +++ src/amd/common/ac_shader_abi.h | 1 + src/amd/vulkan/radv_device.c | 17 ++ src/amd/vulkan/radv_extensions.py| 4 + src/amd/vulkan/radv_nir_to_llvm.c| 123 + src/amd/vulkan/radv_pipeline.c | 19 +- src/amd/vulkan/radv_shader.c | 4 +
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
Quite a bit of the patches aren't specific to a single extension as many make code size-generic and some of the extensions intersect in functionality. It might still be possible to roughly order the patches by functionality but I'm not sure if it would be very useful (possible order in attachment). I didn't look at the actual content of the patches when creating the attachment, this is from memory and looking at the descriptions. Would you like me to send out a v2 of this series doing like that? On Tue, 12 Feb 2019 at 17:08, Samuel Pitoiset wrote: > > How about splitting this series in four different parts? One for every > extension? Is this doable without too much troubles? > > On 2/12/19 6:02 PM, Rhys Perry wrote: > > It currently requires review (and possibly rebasing). Marek Olšák send > > some feedback for a few of the patches but other than that, it hasn't > > gotten much attention. > > > > Also patch 35 seems to vectorize 32-bit code which can help or hurt > > shaders quite a bit and seems to hurt shaders overall. I'm not yet > > sure how to solve this without removing it or changing the result of > > LLVM's SLP vectorizer significantly. > > IIRC enabling SLP vectorizer also uncovered a RA bug with a shader. > > > > I think I'll look into the issues with patch 35 again. > > > > On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset > > wrote: > >> What's the status of this? > >> > >> On 12/7/18 6:21 PM, Rhys Perry wrote: > >>> This series add support for: > >>> - VK_KHR_shader_float16_int8 > >>> - VK_AMD_gpu_shader_half_float > >>> - VK_AMD_gpu_shader_int16 > >>> - VK_KHR_8bit_storage > >>> on VI+. Half floats are currently disabled on LLVM 7 because of a bug > >>> causing large memory usage and long (or unbounded) compilation times with > >>> some tests. 
> >>> > >>> It depends on the follow patch series: > >>> - https://patchwork.freedesktop.org/series/53454/ > >>> - https://patchwork.freedesktop.org/series/53602/ > >>> - https://patchwork.freedesktop.org/series/53660/ > >>> > >>> An older version was tested on my Polaris card, but due to hardware issues > >>> I currently can't test the latest version of the series. > >>> > >>> deqp-vk has no regressions and none of the newly enabled tests fail. > >>> > >>> Rhys Perry (38): > >>> ac: add various helpers for float16/int16/int8 > >>> ac/nir: implement 8-bit push constant, ssbo and ubo loads > >>> ac/nir: implement 8-bit ssbo stores > >>> ac/nir: fix 16-bit ssbo stores > >>> ac/nir: implement 8-bit nir_load_const_instr > >>> ac/nir: implement 8-bit conversions > >>> ac/nir: fix 64-bit nir_op_f2f16_rtz > >>> ac/nir: make ac_build_clamp work on all bit sizes > >>> ac/nir: make ac_build_fract work on all bit sizes > >>> ac/nir: make ac_build_isign work on all bit sizes > >>> ac/nir: make ac_build_fsign work on all bit sizes > >>> ac/nir: make ac_build_fdiv support 16-bit floats > >>> ac/nir: implement half-float nir_op_frcp > >>> ac/nir: implement half-float nir_op_frsq > >>> ac/nir: implement half-float nir_op_ldexp > >>> radv: lower 16-bit flrp > >>> ac/nir: support half floats in emit_b2f > >>> ac/nir: make emit_b2i work on all bit sizes > >>> ac/nir: implement 16-bit shifts > >>> compiler/nir: add lowering option for 16-bit ffma > >>> ac/nir: implement 16-bit ac_build_ddxy > >>> ac/nir: implement 8 and 16 bit ac_build_readlane > >>> nir: make bitfield_reverse and ifind_msb work with all integers > >>> ac/nir: make ac_find_lsb work on all bit sizes > >>> ac/nir: make ac_build_umsb work on all bit sizes > >>> ac/nir: implement 8 and 16 bit ac_build_imsb > >>> ac/nir: make ac_build_bit_count work on all bit sizes > >>> ac/nir: make ac_build_bitfield_reverse work on all bit sizes > >>> ac/nir: implement 16-bit pack/unpack opcodes > >>> ac/nir: add 8-bit and 16-bit types 
to glsl_base_to_llvm_type > >>> ac/nir,radv: create an array of varying output types > >>> ac/nir: store all outputs as f32 > >>> radv: store all fragment shader inputs as f32 > >>> radv: handle all fragment output types > >>> ac,ra
Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage
It currently requires review (and possibly rebasing). Marek Olšák sent some feedback for a few of the patches but other than that, it hasn't gotten much attention. Also patch 35 seems to vectorize 32-bit code which can help or hurt shaders quite a bit and seems to hurt shaders overall. I'm not yet sure how to solve this without removing it or changing the result of LLVM's SLP vectorizer significantly. IIRC enabling the SLP vectorizer also uncovered an RA bug with a shader. I think I'll look into the issues with patch 35 again. On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset wrote: > > What's the status of this? > > On 12/7/18 6:21 PM, Rhys Perry wrote: > > This series add support for: > > - VK_KHR_shader_float16_int8 > > - VK_AMD_gpu_shader_half_float > > - VK_AMD_gpu_shader_int16 > > - VK_KHR_8bit_storage > > on VI+. Half floats are currently disabled on LLVM 7 because of a bug > > causing large memory usage and long (or unbounded) compilation times with > > some tests. > > > > It depends on the follow patch series: > > - https://patchwork.freedesktop.org/series/53454/ > > - https://patchwork.freedesktop.org/series/53602/ > > - https://patchwork.freedesktop.org/series/53660/ > > > > An older version was tested on my Polaris card, but due to hardware issues > > I currently can't test the latest version of the series. > > > > deqp-vk has no regressions and none of the newly enabled tests fail.
> > > > Rhys Perry (38): > >ac: add various helpers for float16/int16/int8 > >ac/nir: implement 8-bit push constant, ssbo and ubo loads > >ac/nir: implement 8-bit ssbo stores > >ac/nir: fix 16-bit ssbo stores > >ac/nir: implement 8-bit nir_load_const_instr > >ac/nir: implement 8-bit conversions > >ac/nir: fix 64-bit nir_op_f2f16_rtz > >ac/nir: make ac_build_clamp work on all bit sizes > >ac/nir: make ac_build_fract work on all bit sizes > >ac/nir: make ac_build_isign work on all bit sizes > >ac/nir: make ac_build_fsign work on all bit sizes > >ac/nir: make ac_build_fdiv support 16-bit floats > >ac/nir: implement half-float nir_op_frcp > >ac/nir: implement half-float nir_op_frsq > >ac/nir: implement half-float nir_op_ldexp > >radv: lower 16-bit flrp > >ac/nir: support half floats in emit_b2f > >ac/nir: make emit_b2i work on all bit sizes > >ac/nir: implement 16-bit shifts > >compiler/nir: add lowering option for 16-bit ffma > >ac/nir: implement 16-bit ac_build_ddxy > >ac/nir: implement 8 and 16 bit ac_build_readlane > >nir: make bitfield_reverse and ifind_msb work with all integers > >ac/nir: make ac_find_lsb work on all bit sizes > >ac/nir: make ac_build_umsb work on all bit sizes > >ac/nir: implement 8 and 16 bit ac_build_imsb > >ac/nir: make ac_build_bit_count work on all bit sizes > >ac/nir: make ac_build_bitfield_reverse work on all bit sizes > >ac/nir: implement 16-bit pack/unpack opcodes > >ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type > >ac/nir,radv: create an array of varying output types > >ac/nir: store all outputs as f32 > >radv: store all fragment shader inputs as f32 > >radv: handle all fragment output types > >ac,radv: run LLVM's SLP vectorizer > >ac/nir: generate better code for nir_op_f2f16_rtz > >ac/nir: have nir_op_f2f16 round to zero > >radv: expose float16, int16 and int8 features and extensions > > > > src/amd/common/ac_llvm_build.c| 355 ++ > > src/amd/common/ac_llvm_build.h| 22 +- > > src/amd/common/ac_llvm_util.c | 9 +- > > 
src/amd/common/ac_llvm_util.h | 1 + > > src/amd/common/ac_nir_to_llvm.c | 258 +++ > > src/amd/common/ac_shader_abi.h| 1 + > > src/amd/vulkan/radv_device.c | 17 ++ > > src/amd/vulkan/radv_extensions.py | 4 + > > src/amd/vulkan/radv_nir_to_llvm.c | 92 --- > > src/amd/vulkan/radv_shader.c | 7 + > > src/broadcom/compiler/nir_to_vir.c| 1 + > > src/compiler/nir/nir.h| 1 + > > src/compiler/nir/nir_opcodes.py | 4 +- > > src/compiler/nir/nir_opt_algebraic.py | 4 +- > > src/gallium/drivers/radeonsi/si_get.c | 1 + > > src/gallium/drivers/vc4/vc4_program.c | 1 + > > 16 files changed, 516 insertions(+), 262 deletions(-) > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] radv: add missed situations for scissor bug workaround
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 65 src/amd/vulkan/radv_private.h| 2 + 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f430b4f20dd..6d538d7e88a 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -920,6 +920,8 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.dynamic.scissor.scissors, cmd_buffer->state.dynamic.viewport.viewports, cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband); + + cmd_buffer->state.workaround_scissor_bug = false; } static void @@ -1217,6 +1219,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, radv_update_zrange_precision(cmd_buffer, &att->ds, image, layout, false); } + + cmd_buffer->state.workaround_scissor_bug = true; } /** @@ -1442,6 +1446,8 @@ radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); radeon_emit(cs, color_values[0]); radeon_emit(cs, color_values[1]); + + cmd_buffer->state.workaround_scissor_bug = true; } /** @@ -1704,6 +1710,8 @@ void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) } radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -2185,6 +2193,27 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, state->last_primitive_reset_index = primitive_reset_index; } } + + if (draw_info->strmout_buffer) { + uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo); + + va += draw_info->strmout_buffer->offset + + draw_info->strmout_buffer_offset; + + radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, + draw_info->stride); + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_REG) | + 
COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); + radeon_emit(cs, 0); /* unused */ + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo); + } } static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, @@ -3470,27 +3499,6 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys *ws = cmd_buffer->device->ws; struct radeon_cmdbuf *cs = cmd_buffer->cs; - if (info->strmout_buffer) { - uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo); - - va += info->strmout_buffer->offset + - info->strmout_buffer_offset; - - radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, - info->stride); - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_REG) | - COPY_DATA_WR_CONFIRM); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); - radeon_emit(cs, 0); /* unused */ - - radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo); - } - if (info->indirect) { uint64_t va = radv_buffer_get_va(info->indirect->bo); uint64_t count_va = 0; @@ -3609,13 +3617,16 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, * any context registers. */ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, -bool indexed_draw) +const struct radv_draw_info *info) { struct radv_cmd_state *state = &cmd_buffer->state; if (!cmd_buffer->device->physical_device->has_scissor_bug) return false; + if (cmd_buffer->state.workaround_scissor_bug || info->strmout_buffer) + return true; + uint32_
[Mesa-dev] [PATCH 1/2] radv: pass radv_draw_info to radv_emit_draw_registers()
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 118 +++ 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f41d6c0b3e7..f430b4f20dd 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2074,10 +2074,60 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); } +struct radv_draw_info { + /** +* Number of vertices. +*/ + uint32_t count; + + /** +* Index of the first vertex. +*/ + int32_t vertex_offset; + + /** +* First instance id. +*/ + uint32_t first_instance; + + /** +* Number of instances. +*/ + uint32_t instance_count; + + /** +* First index (indexed draws only). +*/ + uint32_t first_index; + + /** +* Whether it's an indexed draw. +*/ + bool indexed; + + /** +* Indirect draw parameters resource. +*/ + struct radv_buffer *indirect; + uint64_t indirect_offset; + uint32_t stride; + + /** +* Draw count parameters resource. +*/ + struct radv_buffer *count_buffer; + uint64_t count_buffer_offset; + + /** +* Stream output parameters resource. +*/ + struct radv_buffer *strmout_buffer; + uint64_t strmout_buffer_offset; +}; + static void -radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, -bool instanced_draw, bool indirect_draw, -uint32_t draw_vertex_count) +radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, +const struct radv_draw_info *draw_info) { struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; struct radv_cmd_state *state = &cmd_buffer->state; @@ -2087,8 +2137,9 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, /* Draw state. 
*/ ia_multi_vgt_param = - si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, - indirect_draw, draw_vertex_count); + si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, + draw_info->indirect, + draw_info->indirect ? 0 : draw_info->count); if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { if (info->chip_class >= GFX9) { @@ -2108,7 +2159,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw, /* Primitive restart. */ primitive_reset_en = - indexed_draw && state->pipeline->graphics.prim_restart_enable; + draw_info->indexed && state->pipeline->graphics.prim_restart_enable; if (primitive_reset_en != state->last_primitive_reset_en) { state->last_primitive_reset_en = primitive_reset_en; @@ -3411,57 +3462,6 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, } } -struct radv_draw_info { - /** -* Number of vertices. -*/ - uint32_t count; - - /** -* Index of the first vertex. -*/ - int32_t vertex_offset; - - /** -* First instance id. -*/ - uint32_t first_instance; - - /** -* Number of instances. -*/ - uint32_t instance_count; - - /** -* First index (indexed draws only). -*/ - uint32_t first_index; - - /** -* Whether it's an indexed draw. -*/ - bool indexed; - - /** -* Indirect draw parameters resource. -*/ - struct radv_buffer *indirect; - uint64_t indirect_offset; - uint32_t stride; - - /** -* Draw count parameters resource. -*/ - struct radv_buffer *count_buffer; - uint64_t count_buffer_offset; - - /** -* Stream output parameters resource. -*/ - struct radv_buffer *strmout_buffer; - uint64_t strmout_buffer_offset; -}; - static void radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) @@ -3672,9 +3672,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, radv_cmd_buffer_flush_dynamic_state(cmd_buffer); - radv_emit_draw_registers(cmd_buffer, info->indexed, -info->instance_count > 1, info->indirect, -info->indirect ? 
0 : info->count); + radv_emit_draw_registers(cmd_
[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
It's common in some applications to bind a new graphics pipeline without ending up changing any context registers. This makes a pipeline have two command buffers: one for setting context registers and one for everything else. The context register command buffer is only emitted if it differs from the previous pipeline's. v2: ensure late scissor emission is done when radv_emit_rbplus_state() is called v2: make use of cmd_buffer->state.workaround_scissor_bug Signed-off-by: Rhys Perry --- This second version depends on the patch "radv: add missed situations for scissor bug workaround". src/amd/vulkan/radv_cmd_buffer.c | 30 - src/amd/vulkan/radv_pipeline.c | 217 --- src/amd/vulkan/radv_private.h| 2 + 3 files changed, 141 insertions(+), 108 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 6d538d7e88a..f406a3a42f3 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -857,10 +859,13 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); } + /* TODO: avoid redundantly setting context registers */ radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); radeon_emit(cmd_buffer->cs, sx_ps_downconvert); radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon); radeon_emit(cmd_buffer->cs, sx_blend_opt_control); + + cmd_buffer->state.workaround_scissor_bug = true; } static void @@ -884,6 +889,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); + if 
(!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, + pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw); + cmd_buffer->state.workaround_scissor_bug = true; + } + for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { if (!pipeline->shaders[i]) continue; @@ -2939,6 +2953,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) return; + assert(!pipeline->ctx_cs.cdw); + cmd_buffer->state.emitted_compute_pipeline = pipeline; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); @@ -3630,20 +3646,16 @@ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; /* Index, vertex and streamout buffers don't change context regs, and -* pipeline is handled later. +* pipeline is already handled. */ used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE); - /* Assume all state changes except these two can imply context rolls. */ if (cmd_buffer->state.dirty & used_states) return true; - if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) - return true; - if (info->indexed && state->pipeline->graphics.prim_restart_enable && (state->index_type ? 
0xu : 0xu) != state->last_primitive_reset_index) return true; @@ -3655,7 +3667,7 @@ static void radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) { - bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info); + bool late_scissor_emission; if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipel
[Mesa-dev] [PATCH v3 0/5] nvc0: Implement EXT_shader_image_load_formatted
This patch series implements EXT_shader_image_load_formatted on Maxwell+. It should implement all of the spec except that, if the extension is enabled, passing image variables without a format qualifier to atomic operations will not raise a compilation error as it should. This is because knowing the format used in an image operation before function inlining can be difficult, because formats don't have to (and currently can't) be specified in the parameter declaration. So this series leaves this issue to hopefully be resolved in a later patch. I tested the second version of this series when it was released in June 2018 but I can't easily test this version. Nothing changed too much though so it should be fine. v2: change from PIPE_SHADER_CAP_* to PIPE_CAP_* v2: fix broken feature detection in the state tracker v2: move code in AlgebraicOpt::handleSULDP() to nv50_ir_ra.cpp v3: rebase v3: make use of u_pipe_screen_get_param_defaults v3: move RA code into its own function Rhys Perry (5): gallium: add support for formatted image loads mesa,glsl: add support for EXT_shader_image_load_formatted st/mesa: add support for EXT_shader_image_load_formatted nv50/ir: use suld.p on GM107+ nvc0,nv50/ir: enable support for formatted image loads on GM107+ src/compiler/glsl/ast_to_hir.cpp | 5 +++ src/compiler/glsl/glsl_parser_extras.cpp | 1 + src/compiler/glsl/glsl_parser_extras.h| 7 src/gallium/auxiliary/util/u_screen.c | 1 + src/gallium/docs/source/screen.rst| 1 + src/gallium/drivers/nouveau/codegen/nv50_ir.h | 4 +++ .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 --- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++ .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 + .../drivers/nouveau/nv30/nv30_screen.c| 1 + .../drivers/nouveau/nv50/nv50_screen.c| 1 + .../drivers/nouveau/nvc0/nvc0_screen.c| 2 ++ src/gallium/drivers/swr/swr_screen.cpp| 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/include/pipe/p_defines.h | 1 + 
src/mesa/main/extensions_table.h | 1 + src/mesa/main/mtypes.h| 1 + src/mesa/state_tracker/st_extensions.c| 1 + 19 files changed, 100 insertions(+), 14 deletions(-) -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 5/5] nvc0, nv50/ir: enable support for formatted image loads on GM107+
v3: rebase Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +-- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c| 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 295497be2f..6c134962b4 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -2414,12 +2414,11 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su) bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), TYPE_U32, bld.mkImm(0), loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); - if (su->op != OP_SUSTP && su->tex.format) { + if (su->op != OP_SUSTP && su->tex.format && su->tex.format->components > 0) { const TexInstruction::ImgFormatDesc *format = su->tex.format; int blockwidth = format->bits[0] + format->bits[1] + format->bits[2] + format->bits[3]; - assert(format->components != 0); // make sure that the format doesn't mismatch when it's not FMT_NONE bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), TYPE_U32, bld.loadImm(NULL, blockwidth / 8), diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index b7cf2cd2e4..c47502cae1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -288,6 +288,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return class_3d >= GM200_3D_CLASS; case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: return class_3d >= GP100_3D_CLASS; + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + return class_3d >= GM107_3D_CLASS; /* unsupported caps */ case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: @@ -334,7 +336,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case 
PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 4/5] nv50/ir: use suld.p on GM107+
v3: rebase v3: move RA code into its own function Signed-off-by: Rhys Perry --- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 4 +++ .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 --- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++ .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 + 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 8085bb2f54..2388f3923c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -224,6 +224,10 @@ enum operation #define NV50_IR_SUBOP_SULD_ZERO0 #define NV50_IR_SUBOP_SULD_TRAP1 #define NV50_IR_SUBOP_SULD_SDCL3 +// These three are only for GM107+ and are set during register allocation +#define NV50_IR_SUBOP_SULDP_RGBA (0 << 2) +#define NV50_IR_SUBOP_SULDP_RG (1 << 2) +#define NV50_IR_SUBOP_SULDP_R (2 << 2) #define NV50_IR_SUBOP_SUBFM_3D 1 #define NV50_IR_SUBOP_SUCLAMP_2D 0x10 #define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 
0x10 : 0)) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index be00db3131..d7f4380b34 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -3257,26 +3257,36 @@ void CodeEmitterGM107::emitSULDx() { const TexInstruction *insn = this->insn->asTex(); - int type = 0; emitInsn(0xeb00); if (insn->op == OP_SULDB) emitField(0x34, 1, 1); emitSUTarget(); - switch (insn->dType) { - case TYPE_S8: type = 1; break; - case TYPE_U16: type = 2; break; - case TYPE_S16: type = 3; break; - case TYPE_U32: type = 4; break; - case TYPE_U64: type = 5; break; - case TYPE_B128: type = 6; break; - default: - assert(insn->dType == TYPE_U8); - break; + if (insn->op == OP_SULDB) { + int type = 0; + switch (insn->dType) { + case TYPE_S8: type = 1; break; + case TYPE_U16: type = 2; break; + case TYPE_S16: type = 3; break; + case TYPE_U32: type = 4; break; + case TYPE_U64: type = 5; break; + case TYPE_B128: type = 6; break; + default: + assert(insn->dType == TYPE_U8); + break; + } + emitField(0x14, 3, type); + } else { + int type = 0; + switch (insn->subOp & 0xc) { + case NV50_IR_SUBOP_SULDP_R:type = 0x1; break; + case NV50_IR_SUBOP_SULDP_RG: type = 0x3; break; + case NV50_IR_SUBOP_SULDP_RGBA: type = 0xf; break; + } + emitField(0x14, 4, type); } emitLDSTc(0x18); - emitField(0x14, 3, type); emitGPR (0x00, insn->def(0)); emitGPR (0x08, insn->src(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index 5dcbf3c3e0..43011c23af 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -246,6 +246,16 @@ static const char *xmadOpCModeStr[] = "clo", "chi", "csfu", "cbcc" }; +static const char *suldOpStr[] = +{ + "zero", "trap", "sdcl" +}; + +static const char *suldSwizzleOpStr[] = +{ + 
"rgba", "rg", "r" +}; + static const char *DataTypeStr[] = { "-", @@ -672,6 +682,13 @@ void Instruction::print() const PRINT("h%d ", (subOp & NV50_IR_SUBOP_XMAD_H1(i)) ? 1 : 0); break; } + case OP_SULDB: + case OP_SULDP: + if ((subOp & 0x3) < ARRAY_SIZE(suldOpStr)) +PRINT("%s ", suldOpStr[subOp & 0x3]); + if (op == OP_SULDP && subOp >> 2 < (int)ARRAY_SIZE(suldSwizzleOpStr)) +PRINT("%s ", suldSwizzleOpStr[subOp >> 2]); + break; default: if (subOp) PRINT("(SUBOP:%u) ", subOp); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 322b79fe62..8e57bda254 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -264,6 +264,7 @@ private: void addHazard(Instruction *i, const ValueRef *src); void textureMask(TexInstruction *); + void suldpMask(TexInstruction *); void addConstraint(Instruction *, int s, int n); bool detectConflict(Instruction *, int s); @@ -1996,6 +1997,33 @@ RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex) tex->setDef(c, NULL); } +void +RegAlloc::InsertConstraintsPass::suldpMask(TexInstruction *tex) +{ + int max = 0; +
[Mesa-dev] [PATCH v3 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted
v3: rebase Signed-off-by: Rhys Perry Reviewed-by: Marek Olšák (v2) --- src/compiler/glsl/ast_to_hir.cpp | 5 + src/compiler/glsl/glsl_parser_extras.cpp | 1 + src/compiler/glsl/glsl_parser_extras.h | 7 +++ src/mesa/main/extensions_table.h | 1 + src/mesa/main/mtypes.h | 1 + 5 files changed, 15 insertions(+) diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 67a5a8c050..d9a57d37f6 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -3476,6 +3476,11 @@ apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual, } var->data.image_format = qual->image_format; + } else if (state->has_image_load_formatted()) { + if (var->data.mode == ir_var_uniform && + state->EXT_shader_image_load_formatted_warn) { + _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used"); + } } else { if (var->data.mode == ir_var_uniform) { if (state->es_shader) { diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 2048a7f900..1e035e94d8 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -721,6 +721,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(EXT_separate_shader_objects), EXT(EXT_shader_framebuffer_fetch), EXT(EXT_shader_framebuffer_fetch_non_coherent), + EXT(EXT_shader_image_load_formatted), EXT(EXT_shader_implicit_conversions), EXT(EXT_shader_integer_mix), EXT_AEP(EXT_shader_io_blocks), diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index b17b5125e0..63a5cca5d2 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -344,6 +344,11 @@ struct _mesa_glsl_parse_state { return ARB_bindless_texture_enable; } + bool has_image_load_formatted() const + { + return EXT_shader_image_load_formatted_enable; + } + bool has_implicit_conversions() const { return 
EXT_shader_implicit_conversions_enable || is_version(120, 0); @@ -816,6 +821,8 @@ struct _mesa_glsl_parse_state { bool EXT_shader_framebuffer_fetch_warn; bool EXT_shader_framebuffer_fetch_non_coherent_enable; bool EXT_shader_framebuffer_fetch_non_coherent_warn; + bool EXT_shader_image_load_formatted_enable; + bool EXT_shader_image_load_formatted_warn; bool EXT_shader_implicit_conversions_enable; bool EXT_shader_implicit_conversions_warn; bool EXT_shader_integer_mix_enable; diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index dad38124d5..c3eb019f81 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -264,6 +264,7 @@ EXT(EXT_separate_shader_objects , dummy_true EXT(EXT_separate_specular_color , dummy_true , GLL, x , x , x , 1997) EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch , GLL, GLC, x , ES2, 2013) EXT(EXT_shader_framebuffer_fetch_non_coherent, EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC, x, ES2, 2018) +EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted , GLL, GLC, x , x , 2014) EXT(EXT_shader_implicit_conversions , dummy_true , x , x , x , 31, 2013) EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, x , 30, 2013) EXT(EXT_shader_io_blocks, dummy_true , x , x , x , 31, 2014) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 241c2b92f7..bd90727e26 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -4264,6 +4264,7 @@ struct gl_extensions GLboolean EXT_render_snorm; GLboolean EXT_semaphore; GLboolean EXT_semaphore_fd; + GLboolean EXT_shader_image_load_formatted; GLboolean EXT_shader_integer_mix; GLboolean EXT_shader_samples_identical; GLboolean EXT_stencil_two_side; -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for EXT_shader_image_load_formatted
v3: rebase Signed-off-by: Rhys Perry Reviewed-by: Marek Olšák (v2) --- src/mesa/state_tracker/st_extensions.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 4628079260..b713eed969 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -717,6 +717,7 @@ void st_init_extensions(struct pipe_screen *screen, { o(ARB_shader_clock), PIPE_CAP_TGSI_CLOCK }, { o(ARB_shader_draw_parameters), PIPE_CAP_DRAW_PARAMETERS }, { o(ARB_shader_group_vote),PIPE_CAP_TGSI_VOTE }, + { o(EXT_shader_image_load_formatted), PIPE_CAP_IMAGE_LOAD_FORMATTED }, { o(ARB_shader_stencil_export),PIPE_CAP_SHADER_STENCIL_EXPORT }, { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS }, { o(ARB_shader_texture_lod), PIPE_CAP_SM3 }, -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 1/5] gallium: add support for formatted image loads
v3: rebase v3: make use of u_pipe_screen_get_param_defaults Signed-off-by: Rhys Perry --- src/gallium/auxiliary/util/u_screen.c | 1 + src/gallium/docs/source/screen.rst | 1 + src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 + src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 + src/gallium/drivers/swr/swr_screen.cpp | 1 + src/gallium/drivers/vc4/vc4_screen.c | 1 + src/gallium/include/pipe/p_defines.h | 1 + 8 files changed, 8 insertions(+) diff --git a/src/gallium/auxiliary/util/u_screen.c b/src/gallium/auxiliary/util/u_screen.c index c14edde859..470632f5ec 100644 --- a/src/gallium/auxiliary/util/u_screen.c +++ b/src/gallium/auxiliary/util/u_screen.c @@ -314,6 +314,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen, case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_TGSI_ATOMFADD: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index 9b75a407db..b2d0c401d5 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -483,6 +483,7 @@ The integer capabilities: * ``PIPE_CAP_TGSI_ATOMFADD``: Atomic floating point adds are supported on images, buffers, and shared memory. * ``PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND``: True if the driver needs blend state to use zero/one instead of destination alpha for RGB/XRGB formats. +* ``PIPE_CAP_IMAGE_LOAD_FORMATTED``: True if a format for image loads does not need to be specified in the shader IR .. 
_pipe_capf: diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2b69a8f696..d6e0f43f6c 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -243,6 +243,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE: case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index d83926f2b1..ff92012894 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -310,6 +310,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: case PIPE_CAP_TGSI_ATOMFADD: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index f5f3cf..b7cf2cd2e4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -334,6 +334,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS: case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS: case PIPE_CAP_SURFACE_SAMPLE_COUNT: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index de9008ddf6..38b76366cb 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -364,6 +364,7 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) case 
PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: return 32; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c27..22de60f02c 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -293,6 +293,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: +case PIPE_SHADER_CAP_IMAGE_LOAD_FORMATTED: return 0; case PIPE_SHADER_CAP_SCALAR_ISA: return 1; diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index ae53c723c7..5c0652d7a9 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -854,6 +854,7 @@ enum pipe_cap PIPE_CAP_TGSI_ATOMFADD
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Seems I accidentally had it use Fedora 29's mesa build in both the before and after runs... Running again I get (again, average of 3 runs): GeothermalValley: 58.2 fps -> 59.633 fps (+2.5%) ProphetsTomb: 59 fps -> 60 fps (+1.7%) SpineOfTheMountain: 64 fps -> 64.06667 fps (+0.1%) (1 extreme from "before" run excluded) Sorry for the noise. On Wed, 16 Jan 2019 at 11:46, Rhys Perry wrote: > > Rise of the Tomb Raider from without to with the change (average of 3 runs): > SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%) > ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%) > GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%) > > So not much improvement (if any). > > On Wed, 16 Jan 2019 at 00:39, Rhys Perry wrote: > > > > I did a before/after comparison during development with multiple runs > > but only 1 before and after run to produce the numbers I sent. They > > seemed to match up well enough to the runs during development, so I > > wasn't too concerned. > > > > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. > > The kernel/distro was 4.19.13 and Fedora 29. Also > > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to > > "performance" and > > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was > > set to "high" while running. > > > > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I > > get anything too different. > > > > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen > > wrote: > > > > > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry > > > wrote: > > > > > > > > I did and found small improvements in Rise of the Tomb Raider. I > > > > measured framerates ~104.3% that of without the changes for the > > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > > > for Prophets Tomb. > > > > > > My main question would be what the statistical significance is. e.g. > > > did you do one run of each, did you do multiple, and what was your > > > test setup? 
> > > > > > Just curious because I have tried the exact same thing before and > > > could not find anything more than noise. > > > > > > > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > > > wrote: > > > > > > > > > > Did you benchmark? > > > > > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > > > It's common in some applications to bind a new graphics pipeline > > > > > > without > > > > > > ending up changing any context registers. > > > > > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > > > registers and one for everything else. The context register command > > > > > > buffer > > > > > > is only emitted if it differs from the previous pipeline's. > > > > > > > > > > > > Signed-off-by: Rhys Perry > > > > > > --- > > > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > > > src/amd/vulkan/radv_pipeline.c | 217 > > > > > > --- > > > > > > src/amd/vulkan/radv_private.h| 2 + > > > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > > > radv_cmd_buffer *cmd_buffer, > > > > > > } > > > > > > } > > > > > > > > > > > > -static void > > > > > > +static bool > > > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > > > struct radv_pipeline *pipeline) > > > > > > { > > > > > > @@ -646,7 +646,7 @@ radv_update_multisamp
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Rise of the Tomb Raider from without to with the change (average of 3 runs): SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%) ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%) GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%) So not much improvement (if any). On Wed, 16 Jan 2019 at 00:39, Rhys Perry wrote: > > I did a before/after comparison during development with multiple runs > but only 1 before and after run to produce the numbers I sent. They > seemed to match up well enough to the runs during development, so I > wasn't too concerned. > > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. > The kernel/distro was 4.19.13 and Fedora 29. Also > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to > "performance" and > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was > set to "high" while running. > > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I > get anything too different. > > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen > wrote: > > > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry wrote: > > > > > > I did and found small improvements in Rise of the Tomb Raider. I > > > measured framerates ~104.3% that of without the changes for the > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > > for Prophets Tomb. > > > > My main question would be what the statistical significance is. e.g. > > did you do one run of each, did you do multiple, and what was your > > test setup? > > > > Just curious because I have tried the exact same thing before and > > could not find anything more than noise. > > > > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > > wrote: > > > > > > > > Did you benchmark? 
> > > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > > It's common in some applications to bind a new graphics pipeline > > > > > without > > > > > ending up changing any context registers. > > > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > > registers and one for everything else. The context register command > > > > > buffer > > > > > is only emitted if it differs from the previous pipeline's. > > > > > > > > > > Signed-off-by: Rhys Perry > > > > > --- > > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > > src/amd/vulkan/radv_pipeline.c | 217 > > > > > --- > > > > > src/amd/vulkan/radv_private.h| 2 + > > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > } > > > > > } > > > > > > > > > > -static void > > > > > +static bool > > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > > struct radv_pipeline *pipeline) > > > > > { > > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > cmd_buffer->sample_positions_needed = true; > > > > > > > > > > if (old_pipeline && num_samples == > > > > > old_pipeline->graphics.ms.num_samples) > > > > > - return; > > > > > + return false; > > > > > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > > > > > R_028BDC_PA_SC_LINE_CNTL, 2); > > > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct > > > > > radv_cmd_buffer *cmd_buffer, > > > > > radeon_emit(cmd_buffer-&g
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
I did a before/after comparison during development with multiple runs but only 1 before and after run to produce the numbers I sent. They seemed to match up well enough to the runs during development, so I wasn't too concerned. IIRC, the two runs were with a Vega 64 at 1080p with "High" settings. The kernel/distro was 4.19.13 and Fedora 29. Also "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to "performance" and "/sys/class/drm/card*/device/power_dpm_force_performance_level" was set to "high" while running. I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I get anything too different. On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen wrote: > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry wrote: > > > > I did and found small improvements in Rise of the Tomb Raider. I > > measured framerates ~104.3% that of without the changes for the > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > for Prophets Tomb. > > My main question would be what the statistical significance is. e.g. > did you do one run of each, did you do multiple, and what was your > test setup? > > Just curious because I have tried the exact same thing before and > could not find anything more than noise. > > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > wrote: > > > > > > Did you benchmark? > > > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > > It's common in some applications to bind a new graphics pipeline without > > > > ending up changing any context registers. > > > > > > > > This has a pipline have two command buffers: one for setting context > > > > registers and one for everything else. The context register command > > > > buffer > > > > is only emitted if it differs from the previous pipeline's. 
> > > > > > > > Signed-off-by: Rhys Perry > > > > --- > > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > > src/amd/vulkan/radv_pipeline.c | 217 --- > > > > src/amd/vulkan/radv_private.h| 2 + > > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > > index f41d6c0b3e7..59903ab64d8 100644 > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > } > > > > } > > > > > > > > -static void > > > > +static bool > > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > > struct radv_pipeline *pipeline) > > > > { > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > cmd_buffer->sample_positions_needed = true; > > > > > > > > if (old_pipeline && num_samples == > > > > old_pipeline->graphics.ms.num_samples) > > > > - return; > > > > + return false; > > > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > > > > R_028BDC_PA_SC_LINE_CNTL, 2); > > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct > > > > radv_cmd_buffer *cmd_buffer, > > > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > > > > radeon_emit(cmd_buffer->cs, > > > > EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); > > > > } > > > > + > > > > + return true; > > > > } > > > > > > > > static void > > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > > > *cmd_buffer) > > > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > > > } > > > > > > > > -static void > > > > +static bool > > > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > > > { > > > > struct radv_pipeline *pipeline =
Re: [Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change
I misread some code and forgot to remove it. It was always unrelated to this patch. On Wed, 16 Jan 2019 at 00:22, Bas Nieuwenhuizen wrote: > > On Tue, Jan 15, 2019 at 10:59 PM Rhys Perry wrote: > > > > DXVK often sets dynamic state without actually changing it. > > > > Signed-off-by: Rhys Perry > > --- > > src/amd/vulkan/radv_cmd_buffer.c | 92 ++-- > > 1 file changed, 76 insertions(+), 16 deletions(-) > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > b/src/amd/vulkan/radv_cmd_buffer.c > > index 59903ab64d8..56b3c934c2e 100644 > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport( > > assert(firstViewport < MAX_VIEWPORTS); > > assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); > > > > + if (!memcmp(state->dynamic.viewport.viewports + firstViewport, > > + pViewports, viewportCount * sizeof(*pViewports))) { > > + return; > > + } > > + > > memcpy(state->dynamic.viewport.viewports + firstViewport, > > pViewports, > >viewportCount * sizeof(*pViewports)); > > > > @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor( > > assert(firstScissor < MAX_SCISSORS); > > assert(total_count >= 1 && total_count <= MAX_SCISSORS); > > > > + if (!memcmp(state->dynamic.scissor.scissors + firstScissor, > > pScissors, > > + scissorCount * sizeof(*pScissors))) { > > + return; > > + } > > + > > memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, > >scissorCount * sizeof(*pScissors)); > > > > @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth( > > float lineWidth) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); > > + > > + if (cmd_buffer->state.dynamic.line_width == lineWidth) > > + return; > > + > > cmd_buffer->state.dynamic.line_width = lineWidth; > > cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; > > } > > @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias( > > float depthBiasSlopeFactor) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, 
commandBuffer); > > + struct radv_cmd_state *state = &cmd_buffer->state; > > > > - cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor; > > - cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp; > > - cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor; > > + if (state->dynamic.depth_bias.bias == depthBiasConstantFactor && > > + state->dynamic.depth_bias.clamp == depthBiasClamp && > > + state->dynamic.depth_bias.slope == depthBiasSlopeFactor) { > > + return; > > + } > > > > - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; > > + state->dynamic.depth_bias.bias = depthBiasConstantFactor; > > + state->dynamic.depth_bias.clamp = depthBiasClamp; > > + state->dynamic.depth_bias.slope = depthBiasSlopeFactor; > > + > > + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; > > } > > > > void radv_CmdSetBlendConstants( > > @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants( > > const float blendConstants[4]) > > { > > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); > > + struct radv_cmd_state *state = &cmd_buffer->state; > > > > - memcpy(cmd_buffer->state.dynamic.blend_constants, > > - blendConstants, sizeof(float) * 4); > > + if (!memcmp(state->dynamic.blend_constants, blendConstants, > > sizeof(float) * 4)) > > + return; > > + > > + memcpy(state->dynamic.blend_constants, blendConstants, > > sizeof(float) * 4); > > > > - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; > > + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; > > } > > > > void radv_CmdSetDepthBounds( > > @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds( > > float maxDepthB
[Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change
DXVK often sets dynamic state without actually changing it. Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 92 ++-- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 59903ab64d8..56b3c934c2e 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport( assert(firstViewport < MAX_VIEWPORTS); assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); + if (!memcmp(state->dynamic.viewport.viewports + firstViewport, + pViewports, viewportCount * sizeof(*pViewports))) { + return; + } + memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports)); @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor( assert(firstScissor < MAX_SCISSORS); assert(total_count >= 1 && total_count <= MAX_SCISSORS); + if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors, + scissorCount * sizeof(*pScissors))) { + return; + } + memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors)); @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth( float lineWidth) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + if (cmd_buffer->state.dynamic.line_width == lineWidth) + return; + cmd_buffer->state.dynamic.line_width = lineWidth; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; } @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias( float depthBiasSlopeFactor) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor; - cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp; - cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor; + if (state->dynamic.depth_bias.bias == depthBiasConstantFactor && + state->dynamic.depth_bias.clamp == depthBiasClamp && + 
state->dynamic.depth_bias.slope == depthBiasSlopeFactor) { + return; + } - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; + state->dynamic.depth_bias.bias = depthBiasConstantFactor; + state->dynamic.depth_bias.clamp = depthBiasClamp; + state->dynamic.depth_bias.slope = depthBiasSlopeFactor; + + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; } void radv_CmdSetBlendConstants( @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants( const float blendConstants[4]) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - memcpy(cmd_buffer->state.dynamic.blend_constants, - blendConstants, sizeof(float) * 4); + if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4)) + return; + + memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4); - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; } void radv_CmdSetDepthBounds( @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds( float maxDepthBounds) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; - cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; - cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; + if (state->dynamic.depth_bounds.min == minDepthBounds && + state->dynamic.depth_bounds.max == maxDepthBounds) { + return; + } + + state->dynamic.depth_bounds.min = minDepthBounds; + state->dynamic.depth_bounds.max = maxDepthBounds; - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; } void radv_CmdSetStencilCompareMask( @@ -3045,13 +3075,21 @@ void radv_CmdSetStencilCompareMask( uint32_tcompareMask) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; + bool front_same = state->dynamic.stencil_compare_mask.front == 
compareMask; + bool back_same = state->dyn
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
Sure On Mon, 14 Jan 2019 at 16:50, Samuel Pitoiset wrote: > > While you are on it, can you experiment the tracked ctx stuff that > RadeonSI implements (ie. SI_TRACKED_XXX)? > > This approach will likely be more costly from the CPU side, but it will > reduce the number of register changes a lot more. > > Not sure if that will improve anything though, but I think it's worth to > try? > > On 1/14/19 5:12 PM, Rhys Perry wrote: > > I did and found small improvements in Rise of the Tomb Raider. I > > measured framerates ~104.3% that of without the changes for the > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > for Prophets Tomb. > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > > wrote: > >> Did you benchmark? > >> > >> On 1/14/19 5:01 PM, Rhys Perry wrote: > >>> It's common in some applications to bind a new graphics pipeline without > >>> ending up changing any context registers. > >>> > >>> This has a pipline have two command buffers: one for setting context > >>> registers and one for everything else. The context register command buffer > >>> is only emitted if it differs from the previous pipeline's. 
> >>> > >>> Signed-off-by: Rhys Perry > >>> --- > >>>src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > >>>src/amd/vulkan/radv_pipeline.c | 217 --- > >>>src/amd/vulkan/radv_private.h| 2 + > >>>3 files changed, 150 insertions(+), 115 deletions(-) > >>> > >>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c > >>> b/src/amd/vulkan/radv_cmd_buffer.c > >>> index f41d6c0b3e7..59903ab64d8 100644 > >>> --- a/src/amd/vulkan/radv_cmd_buffer.c > >>> +++ b/src/amd/vulkan/radv_cmd_buffer.c > >>> @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>} > >>>} > >>> > >>> -static void > >>> +static bool > >>>radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > >>> struct radv_pipeline *pipeline) > >>>{ > >>> @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>cmd_buffer->sample_positions_needed = true; > >>> > >>>if (old_pipeline && num_samples == > >>> old_pipeline->graphics.ms.num_samples) > >>> - return; > >>> + return false; > >>> > >>>radeon_set_context_reg_seq(cmd_buffer->cs, > >>> R_028BDC_PA_SC_LINE_CNTL, 2); > >>>radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > >>> @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>>radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > >>>radeon_emit(cmd_buffer->cs, > >>> EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); > >>>} > >>> + > >>> + return true; > >>>} > >>> > >>>static void > >>> @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > >>> *cmd_buffer) > >>>radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > >>>} > >>> > >>> -static void > >>> +static bool > >>>radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > >>>{ > >>>struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > >>> > >>>if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > >>> - return; > >>> + return false; > >>> > >>> - radv_update_multisample_state(cmd_buffer, 
pipeline); > >>> + bool context_roll = radv_update_multisample_state(cmd_buffer, > >>> pipeline); > >>> > >>>cmd_buffer->scratch_size_needed = > >>> MAX2(cmd_buffer->scratch_size_needed, > >>> @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> > >>>radeon_emit_array(c
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
This is with Rise of the Tomb Raider's graphics settings set to "High" by the way. On Mon, 14 Jan 2019 at 16:12, Rhys Perry wrote: > > I did and found small improvements in Rise of the Tomb Raider. I > measured framerates ~104.3% that of without the changes for the > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > for Prophets Tomb. > > I found no change with Dota 2 but I've heard it's cpu-bound. > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset > wrote: > > > > Did you benchmark? > > > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > > It's common in some applications to bind a new graphics pipeline without > > > ending up changing any context registers. > > > > > > This has a pipline have two command buffers: one for setting context > > > registers and one for everything else. The context register command buffer > > > is only emitted if it differs from the previous pipeline's. > > > > > > Signed-off-by: Rhys Perry > > > --- > > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > > src/amd/vulkan/radv_pipeline.c | 217 --- > > > src/amd/vulkan/radv_private.h| 2 + > > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > > b/src/amd/vulkan/radv_cmd_buffer.c > > > index f41d6c0b3e7..59903ab64d8 100644 > > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > > > *cmd_buffer, > > > } > > > } > > > > > > -static void > > > +static bool > > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > > struct radv_pipeline *pipeline) > > > { > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > > > *cmd_buffer, > > > cmd_buffer->sample_positions_needed = true; > > > > > > if (old_pipeline && num_samples == > > > old_pipeline->graphics.ms.num_samples) > > > - return; > > > + return false; > > > > > > radeon_set_context_reg_seq(cmd_buffer->cs, > 
> > R_028BDC_PA_SC_LINE_CNTL, 2); > > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > > > *cmd_buffer, > > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > > > radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) > > > | EVENT_INDEX(0)); > > > } > > > + > > > + return true; > > > } > > > > > > static void > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > > *cmd_buffer) > > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > > } > > > > > > -static void > > > +static bool > > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > > { > > > struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > > > > > > if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > > > - return; > > > + return false; > > > > > > - radv_update_multisample_state(cmd_buffer, pipeline); > > > + bool context_roll = radv_update_multisample_state(cmd_buffer, > > > pipeline); > > > > > > cmd_buffer->scratch_size_needed = > > > MAX2(cmd_buffer->scratch_size_needed, > > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > > *cmd_buffer) > > > > > > radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, > > > pipeline->cs.cdw); > > > > > > + if (!cmd_buffer->state.emitted_pipeline || > > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != > > > pipeline->ctx_cs.cdw || > > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != > > > pipeline->ctx_cs_hash || > > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, > > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { > > >
Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
I did and found small improvements in Rise of the Tomb Raider. I measured framerates ~104.3% that of without the changes for the Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% for Prophets Tomb. I found no change with Dota 2 but I've heard it's cpu-bound. On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset wrote: > > Did you benchmark? > > On 1/14/19 5:01 PM, Rhys Perry wrote: > > It's common in some applications to bind a new graphics pipeline without > > ending up changing any context registers. > > > > This has a pipline have two command buffers: one for setting context > > registers and one for everything else. The context register command buffer > > is only emitted if it differs from the previous pipeline's. > > > > Signed-off-by: Rhys Perry > > --- > > src/amd/vulkan/radv_cmd_buffer.c | 46 +-- > > src/amd/vulkan/radv_pipeline.c | 217 --- > > src/amd/vulkan/radv_private.h| 2 + > > 3 files changed, 150 insertions(+), 115 deletions(-) > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > > b/src/amd/vulkan/radv_cmd_buffer.c > > index f41d6c0b3e7..59903ab64d8 100644 > > --- a/src/amd/vulkan/radv_cmd_buffer.c > > +++ b/src/amd/vulkan/radv_cmd_buffer.c > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > > *cmd_buffer, > > } > > } > > > > -static void > > +static bool > > radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > > struct radv_pipeline *pipeline) > > { > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > > *cmd_buffer, > > cmd_buffer->sample_positions_needed = true; > > > > if (old_pipeline && num_samples == > > old_pipeline->graphics.ms.num_samples) > > - return; > > + return false; > > > > radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, > > 2); > > radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > > *cmd_buffer, > > radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 
0)); > > radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | > > EVENT_INDEX(0)); > > } > > + > > + return true; > > } > > > > static void > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > > *cmd_buffer) > > radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > > } > > > > -static void > > +static bool > > radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > > { > > struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > > > > if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > > - return; > > + return false; > > > > - radv_update_multisample_state(cmd_buffer, pipeline); > > + bool context_roll = radv_update_multisample_state(cmd_buffer, > > pipeline); > > > > cmd_buffer->scratch_size_needed = > > MAX2(cmd_buffer->scratch_size_needed, > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > *cmd_buffer) > > > > radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); > > > > + if (!cmd_buffer->state.emitted_pipeline || > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != > > pipeline->ctx_cs.cdw || > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != > > pipeline->ctx_cs_hash || > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { > > + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, > > pipeline->ctx_cs.cdw); > > + context_roll = true; > > + } > > + > > for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { > > if (!pipeline->shaders[i]) > > continue; > > @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > > *cmd_buffer) > > cmd_buffer->state.emitted_pipeline = pipeline; > > > > cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; > > + > > + return context_roll;
[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines
It's common in some applications to bind a new graphics pipeline without ending up changing any context registers. This makes a pipeline have two command buffers: one for setting context registers and one for everything else. The context register command buffer is only emitted if it differs from the previous pipeline's. Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 46 +-- src/amd/vulkan/radv_pipeline.c | 217 --- src/amd/vulkan/radv_private.h| 2 + 3 files changed, 150 insertions(+), 115 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index f41d6c0b3e7..59903ab64d8 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, } } -static void +static bool radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) { @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->sample_positions_needed = true; if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples) - return; + return false; radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2); radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } + + return true; } static void @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) radeon_emit(cmd_buffer->cs, sx_blend_opt_control); } -static void +static bool radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) - return; + return false; - radv_update_multisample_state(cmd_buffer, pipeline); + bool context_roll = 
radv_update_multisample_state(cmd_buffer, pipeline); cmd_buffer->scratch_size_needed = MAX2(cmd_buffer->scratch_size_needed, @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); + if (!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, + pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw); + context_roll = true; + } + for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { if (!pipeline->shaders[i]) continue; @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.emitted_pipeline = pipeline; cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; + + return context_roll; } static void @@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer) if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline) return; + assert(!pipeline->ctx_cs.cdw); + cmd_buffer->state.emitted_compute_pipeline = pipeline; radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); @@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer, * any context registers. */ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, -bool indexed_draw) +bool indexed_draw, +bool pipeline_context_roll) { struct radv_cmd_state *state = &cmd_buffer->state; if (!cmd_buffer->device->physical_device->has_scissor_bug) return false; + if (pipeline_context_roll) + return true; + uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; /* Index, vertex and streamout buffers don't change context regs, and -* pipeline is handle
[Mesa-dev] [PATCH] nir: fix copy-paste error in nir_lower_constant_initializers
Fixes: 393b59e0772e7bf0426bdf61c740752c4e09dde1 ('nir: Rework nir_lower_constant_initializers() to handle functions') --- src/compiler/nir/nir_lower_constant_initializers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_lower_constant_initializers.c b/src/compiler/nir/nir_lower_constant_initializers.c index cbee59b1f30..959d1eabfca 100644 --- a/src/compiler/nir/nir_lower_constant_initializers.c +++ b/src/compiler/nir/nir_lower_constant_initializers.c @@ -104,10 +104,10 @@ nir_lower_constant_initializers(nir_shader *shader, nir_variable_mode modes) impl_progress |= lower_const_initializer(&builder, &shader->outputs); if ((modes & nir_var_private) && function->is_entrypoint) - impl_progress |= lower_const_initializer(&builder, &shader->outputs); + impl_progress |= lower_const_initializer(&builder, &shader->globals); if ((modes & nir_var_system_value) && function->is_entrypoint) - impl_progress |= lower_const_initializer(&builder, &shader->outputs); + impl_progress |= lower_const_initializer(&builder, &shader->system_values); if (modes & nir_var_function) impl_progress |= lower_const_initializer(&builder, &function->impl->locals); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] radv: use dithered alpha-to-coverage
Seems I sent the wrong commit message. It was meant to be: This matches the behaviour of AMDVLK and hides banding. It also seems to be allowed by the Vulkan spec. Signed-off-by: Rhys Perry On Wed, 9 Jan 2019 at 14:40, Rhys Perry wrote: > > This matches the behaviour of AMDVLK and hides banding > > TODO: run tests > --- > src/amd/vulkan/radv_pipeline.c | 9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c > index 3561d17aaba..26ee59f11dd 100644 > --- a/src/amd/vulkan/radv_pipeline.c > +++ b/src/amd/vulkan/radv_pipeline.c > @@ -681,10 +681,11 @@ radv_pipeline_init_blend_state(struct radv_pipeline > *pipeline, > else > blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY); > > - blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | > - S_028B70_ALPHA_TO_MASK_OFFSET1(2) | > - S_028B70_ALPHA_TO_MASK_OFFSET2(2) | > - S_028B70_ALPHA_TO_MASK_OFFSET3(2); > + blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | > + S_028B70_ALPHA_TO_MASK_OFFSET1(1) | > + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | > + S_028B70_ALPHA_TO_MASK_OFFSET3(2) | > + S_028B70_OFFSET_ROUND(1); > > if (vkms && vkms->alphaToCoverageEnable) { > blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); > -- > 2.20.1 > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radv: use dithered alpha-to-coverage
This matches the behaviour of AMDVLK and hides banding TODO: run tests --- src/amd/vulkan/radv_pipeline.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 3561d17aaba..26ee59f11dd 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -681,10 +681,11 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, else blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY); - blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | - S_028B70_ALPHA_TO_MASK_OFFSET1(2) | - S_028B70_ALPHA_TO_MASK_OFFSET2(2) | - S_028B70_ALPHA_TO_MASK_OFFSET3(2); + blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | + S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | + S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1); if (vkms && vkms->alphaToCoverageEnable) { blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] ac/nir, radv, radeonsi/nir: use correct indices for interpolation intrinsics
Fixes artifacts in World of Warcraft when Multi-sample Alpha-Test is enabled. It also fixes various piglit interpolateAt*() tests with NIR. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106595 Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 +- src/amd/common/ac_shader_abi.h | 2 ++ src/amd/vulkan/radv_nir_to_llvm.c| 2 ++ src/gallium/drivers/radeonsi/si_shader_nir.c | 3 +++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 6d97212b805..8fd8580087f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -2829,7 +2829,7 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef src0 = NULL; nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - int input_index = var->data.location - VARYING_SLOT_VAR0; + int input_index = ctx->abi->fs_input_attr_indices[var->data.location - VARYING_SLOT_VAR0]; switch (instr->intrinsic) { case nir_intrinsic_interp_deref_at_centroid: location = INTERP_CENTROID; diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index 6b9a91c92a9..4f51aa9b0c0 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -76,6 +76,8 @@ struct ac_shader_abi { * driver_location. */ LLVMValueRef *inputs; + /* Varying -> attribute number mapping. 
Also NIR-only */ + unsigned fs_input_attr_indices[MAX_VARYING]; void (*emit_outputs)(struct ac_shader_abi *abi, unsigned max_outputs, diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 322b10b67a0..cd58167b766 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2239,6 +2239,8 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (i >= VARYING_SLOT_VAR0) + ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; } else if (i == VARYING_SLOT_CLIP_DIST0) { int length = ctx->shader_info->info.ps.num_input_clips_culls; diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 0a692277f64..d5b8a8416d9 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -1019,6 +1019,9 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) LLVMValueRef data[4]; unsigned loc = variable->data.location; + if (loc >= VARYING_SLOT_VAR0 && nir->info.stage == MESA_SHADER_FRAGMENT) + ctx->abi.fs_input_attr_indices[loc - VARYING_SLOT_VAR0] = input_idx / 4; + for (unsigned i = 0; i < attrib_count; i++) { /* Packed components share the same location so skip * them if we have already processed the location. -- 2.20.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radv: allow secondary command buffers to inherit unknown framebuffers
Fixes: f4e499ec79 ('radv: add initial non-conformant radv vulkan driver') Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107986 Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_cmd_buffer.c | 59 ++-- src/amd/vulkan/radv_meta_clear.c | 8 + src/amd/vulkan/radv_private.h| 2 ++ 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c61310f3fc9..96fe5acb3bf 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -730,6 +730,9 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; + /* FIXME: handle when the framebuffer is unknown in secondary framebuffers */ + assert(!cmd_buffer->inherit_unknown_fb); + unsigned sx_ps_downconvert = 0; unsigned sx_blend_opt_epsilon = 0; unsigned sx_blend_opt_control = 0; @@ -1189,19 +1192,22 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; struct radeon_cmdbuf *cs = cmd_buffer->cs; - struct radv_attachment_info *att; - uint32_t att_idx; + struct radv_attachment_info *att = NULL; - if (!framebuffer || !subpass) + if (!subpass) return; - - att_idx = subpass->depth_stencil_attachment.attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + if (!framebuffer && !cmd_buffer->inherit_unknown_fb) return; - att = &framebuffer->attachments[att_idx]; - if (att->attachment->image != image) - return; + if (framebuffer) { + uint32_t att_idx = subpass->depth_stencil_attachment.attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + return; + + att = &framebuffer->attachments[att_idx]; + if (att->attachment->image != image) + return; + } radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); radeon_emit(cs, ds_clear_value.stencil); @@ 
-1212,6 +1218,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, */ if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) { + assert(att); + VkImageLayout layout = subpass->depth_stencil_attachment.layout; radv_update_zrange_precision(cmd_buffer, &att->ds, image, @@ -1426,19 +1434,22 @@ radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; struct radeon_cmdbuf *cs = cmd_buffer->cs; - struct radv_attachment_info *att; - uint32_t att_idx; - if (!framebuffer || !subpass) + if (!subpass) return; - - att_idx = subpass->color_attachments[cb_idx].attachment; - if (att_idx == VK_ATTACHMENT_UNUSED) + if (!framebuffer && !cmd_buffer->inherit_unknown_fb) return; - att = &framebuffer->attachments[att_idx]; - if (att->attachment->image != image) - return; + if (framebuffer) { + struct radv_attachment_info *att; + uint32_t att_idx = subpass->color_attachments[cb_idx].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + return; + + att = &framebuffer->attachments[att_idx]; + if (att->attachment->image != image) + return; + } radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); radeon_emit(cs, color_values[0]); @@ -2528,6 +2539,7 @@ VkResult radv_BeginCommandBuffer( cmd_buffer->state.last_first_instance = -1; cmd_buffer->state.predication_type = -1; cmd_buffer->usage_flags = pBeginInfo->flags; + cmd_buffer->inherit_unknown_fb = false; if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { @@ -2535,6 +2547,9 @@ VkResult radv_BeginCommandBuffer( cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); + if (cmd_buffer->state.pa
Re: [Mesa-dev] [PATCH 01/38] ac: add various helpers for float16/int16/int8
I would expect these helpers to be much more efficient than the functions you suggested. They are also (in my opinion) more readable than the suggested functions. I don't think it matters much though, so I'm fine either way. On Tue, 18 Dec 2018 at 02:48, Marek Olšák wrote: > > On Fri, Dec 7, 2018 at 12:22 PM Rhys Perry wrote: >> >> Signed-off-by: Rhys Perry >> --- >> src/amd/common/ac_llvm_build.c | 123 ++-- >> src/amd/common/ac_llvm_build.h | 22 +- >> src/amd/common/ac_nir_to_llvm.c | 30 >> 3 files changed, 154 insertions(+), 21 deletions(-) >> >> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c >> index 154cc696a2..cc7c6da5a4 100644 >> --- a/src/amd/common/ac_llvm_build.c >> +++ b/src/amd/common/ac_llvm_build.c >> @@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, >> ctx->v4f32 = LLVMVectorType(ctx->f32, 4); >> ctx->v8i32 = LLVMVectorType(ctx->i32, 8); >> >> + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); >> + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); >> ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); >> ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); >> ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); >> ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); >> ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); >> ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); >> + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); >> + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); >> ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); >> ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); >> ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); >> @@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type) >> >> static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, >> LLVMTypeRef t) >> { >> - if (t == ctx->f16 || t == ctx->i16) >> + if (t == ctx->i8) >> + return ctx->i8; >> + else if (t == ctx->f16 || t == ctx->i16) >> return ctx->i16; >> else if (t == ctx->f32 || t == ctx->i32) >> return ctx->i32; >> @@ -268,6 +274,110 @@ ac_to_float(struct ac_llvm_context *ctx, 
LLVMValueRef >> v) >> return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, >> type), ""); >> } >> >> +LLVMValueRef ac_get_zerof(struct ac_llvm_context *ctx, LLVMTypeRef t) >> +{ >> + if (t == ctx->f16) >> + return ctx->f16_0; >> + else if (t == ctx->f32) >> + return ctx->f32_0; >> + else if (t == ctx->f64) >> + return ctx->f64_0; >> + else >> + unreachable("Unhandled float size"); >> +} >> + >> +LLVMValueRef ac_get_onef(struct ac_llvm_context *ctx, LLVMTypeRef t) >> +{ >> + if (t == ctx->f16) >> + return ctx->f16_1; >> + else if (t == ctx->f32) >> + return ctx->f32_1; >> + else if (t == ctx->f64) >> + return ctx->f64_1; >> + else >> + unreachable("Unhandled float size"); >> +} >> + >> +LLVMValueRef ac_get_zero(struct ac_llvm_context *ctx, LLVMTypeRef t) >> +{ >> + if (t == ctx->i8) >> + return ctx->i8_0; >> + else if (t == ctx->i16) >> + return ctx->i16_0; >> + else if (t == ctx->i32) >> + return ctx->i32_0; >> + else if (t == ctx->i64) >> + return ctx->i64_0; >> + else >> + unreachable("Unhandled bit size"); >> +} >> + >> +LLVMValueRef ac_get_one(struct ac_llvm_context *ctx, LLVMTypeRef t) >> +{ >> + if (t == ctx->i8) >> + return ctx->i8_1; >> + else if (t == ctx->i16) >> + return ctx->i16_1; >> + else if (t == ctx->i32) >> + return ctx->i32_1; >> + else if (t == ctx->i64) >> + return ctx->i64_1; >> + else >> + unreachable("Unhandled bit size"); >> +} > > > You don't need these helpers. You can just use LLVMConstInt and LLVMConstReal. > &g
Re: [Mesa-dev] [PATCH 0/2] radv/query: Use 1-bit booleans in query shaders
You missed this change (or something functionally similar): diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index e7bb81489f6..5d35af05579 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -630,8 +630,8 @@ build_tfb_query_shader(struct radv_device *device) avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1), nir_channel(&b, &load2->dest.ssa, 3)); nir_ssa_def *result_is_available = - nir_iand(&b, nir_iand(&b, avails[0], avails[1]), -nir_imm_int(&b, 0x8000)); + nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), +nir_imm_int(&b, 0x8000))); /* Only compute result if available. */ nir_if *available_if = nir_if_create(b.shader); Other than that, this looks fine and seems to work correctly on my Vega. With that change (and for what it's worth), this is: Reviewed-by: Rhys Perry On Wed, 19 Dec 2018 at 19:45, Jason Ekstrand wrote: > > When we switched over to 1-bit booleans, the radv query shaders ended up > still using 32-bit booleans for most stuff. While this is technically > valid from an IR perspective, most of the NIR passes don't really support > 32-bit booleans correctly anymore now that we've moved to 1-bit. This tiny > series attempts to convert the radv query shaders over to using 1-bit > Booleans. > > I've only compile-tested it and read through it a couple times but am not > really set up for testing radv. I would very much appreciate if someone > more familiar with radv could review and test these patches (and possibly > rewrite them if appropriate). 
> > Cc: Dave Airlie > Cc: Timothy Arceri > Cc: Bas Nieuwenhuizen > > Jason Ekstrand (2): > radv/query: Add a nir_flag_set helper > radv/query: Use 1-bit booleans in query shaders > > src/amd/vulkan/radv_query.c | 67 +++-- > 1 file changed, 34 insertions(+), 33 deletions(-) > > -- > 2.19.2 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] nir: create 32-bit bcsel for 32-bit conditions
The 32-bit condition was in one of radv's meta shaders and had it from the start. 32-bit conditions seem to be valid after lowering booleans to 32-bit. Are they supposed to be invalid before the lowering? On Wed, 19 Dec 2018 at 00:59, Jason Ekstrand wrote: > > Seems reasonable though I'm a bit surprised your running peephole_select > after lowering booleans. > > On December 18, 2018 18:16:46 Timothy Arceri wrote: > > > Reviewed-by: Timothy Arceri > > > > On 18/12/18 3:16 am, Rhys Perry wrote: > >> Signed-off-by: Rhys Perry > >> --- > >> src/compiler/nir/nir_opt_peephole_select.c | 4 +++- > >> 1 file changed, 3 insertions(+), 1 deletion(-) > >> > >> diff --git a/src/compiler/nir/nir_opt_peephole_select.c > >> b/src/compiler/nir/nir_opt_peephole_select.c > >> index ad9d0abec0..241627ed99 100644 > >> --- a/src/compiler/nir/nir_opt_peephole_select.c > >> +++ b/src/compiler/nir/nir_opt_peephole_select.c > >> @@ -205,7 +205,9 @@ nir_opt_peephole_select_block(nir_block *block, > >> nir_shader *shader, > >>break; > >> > >> nir_phi_instr *phi = nir_instr_as_phi(instr); > >> - nir_alu_instr *sel = nir_alu_instr_create(shader, nir_op_bcsel); > >> + nir_op sel_op = nir_src_bit_size(if_stmt->condition) == 1 ? > >> + nir_op_bcsel : nir_op_b32csel; > >> + nir_alu_instr *sel = nir_alu_instr_create(shader, sel_op); > >> nir_src_copy(&sel->src[0].src, &if_stmt->condition, sel); > >> /* Splat the condition to all channels */ > >> memset(sel->src[0].swizzle, 0, sizeof sel->src[0].swizzle); > >> > > ___ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nir: create 32-bit bcsel for 32-bit conditions
Signed-off-by: Rhys Perry --- src/compiler/nir/nir_opt_peephole_select.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index ad9d0abec0..241627ed99 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -205,7 +205,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, break; nir_phi_instr *phi = nir_instr_as_phi(instr); - nir_alu_instr *sel = nir_alu_instr_create(shader, nir_op_bcsel); + nir_op sel_op = nir_src_bit_size(if_stmt->condition) == 1 ? + nir_op_bcsel : nir_op_b32csel; + nir_alu_instr *sel = nir_alu_instr_create(shader, sel_op); nir_src_copy(&sel->src[0].src, &if_stmt->condition, sel); /* Splat the condition to all channels */ memset(sel->src[0].swizzle, 0, sizeof sel->src[0].swizzle); -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radv: don't set surf_index for stencil-only images
Fixes: f8d5b377c8b ('radv: set cb base tile swizzles for MRT speedups (v4)') Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108116 Signed-off-by: Rhys Perry --- Unfortunately I was not able to test this patch on a Polaris due to hardware issues. It fixed the deqp-vk tests mentioned in the bugzilla without regressions on Vega though. src/amd/vulkan/radv_image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c index 2cff4d5283..2bd74e202f 100644 --- a/src/amd/vulkan/radv_image.c +++ b/src/amd/vulkan/radv_image.c @@ -986,7 +986,7 @@ radv_image_create(VkDevice _device, image->shareable = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL; - if (!vk_format_is_depth(pCreateInfo->format) && !create_info->scanout && !image->shareable) { + if (!vk_format_is_depth_or_stencil(pCreateInfo->format) && !create_info->scanout && !image->shareable) { image->info.surf_index = &device->image_mrt_offset_counter; } -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] ac: refactor visit_load_buffer
This is so that we can split different types of loads more easily. Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 8 ++-- src/amd/common/ac_nir_to_llvm.c | 80 - src/compiler/nir/nir.h | 2 +- 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index abc18da13d..154cc696a2 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2943,9 +2943,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, if (count == num_components) return value; - LLVMValueRef masks[] = { - ctx->i32_0, ctx->i32_1, - LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)}; + LLVMValueRef masks[MAX2(count, 2)]; + masks[0] = ctx->i32_0; + masks[1] = ctx->i32_1; + for (unsigned i = 2; i < count; i++) + masks[i] = LLVMConstInt(ctx->i32, i, false); if (count == 1) return LLVMBuildExtractElement(ctx->builder, value, masks[0], diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index a109f5a815..c05b45e084 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1623,37 +1623,43 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef results[2]; - int load_bytes; int elem_size_bytes = instr->dest.ssa.bit_size / 8; int num_components = instr->num_components; - int num_bytes = num_components * elem_size_bytes; enum gl_access_qualifier access = nir_intrinsic_access(instr); LLVMValueRef glc = ctx->ac.i1false; if (access & (ACCESS_VOLATILE | ACCESS_COHERENT)) glc = ctx->ac.i1true; - for (int i = 0; i < num_bytes; i += load_bytes) { - load_bytes = MIN2(num_bytes - i, 16); - const char *load_name; - LLVMTypeRef data_type; - LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false); - LLVMValueRef rsrc = 
ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), false); - LLVMValueRef vindex = ctx->ac.i32_0; - - int idx = i ? 1 : 0; + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, + get_src(ctx, instr->src[0]), false); + LLVMValueRef vindex = ctx->ac.i32_0; + + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; + + LLVMValueRef results[4]; + for (int i = 0; i < num_components;) { + int num_elems = num_components - i; + if (num_elems * elem_size_bytes > 16) + num_elems = 16 / elem_size_bytes; + int load_bytes = num_elems * elem_size_bytes; + + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); + + LLVMValueRef ret; if (load_bytes == 2) { - results[idx] = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - vindex, - offset, - ctx->ac.i32_0, - immoffset, - glc); + ret = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + vindex, + offset, + ctx->ac.i32_0, + immoffset, + glc); } else { + const char *load_name; + LLVMTypeRef data_type; switch (load_bytes) { case 16: case 12: @@ -1679,33 +1685,23 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, glc,
[Mesa-dev] [PATCH 2/2] ac: split 16-bit ssbo loads that may not be dword aligned
Fixes: 7e7ee826982 ('ac: add support for 16bit buffer loads') Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108114 Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index c05b45e084..4a4c09cf5f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1642,6 +1642,8 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, LLVMValueRef results[4]; for (int i = 0; i < num_components;) { int num_elems = num_components - i; + if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) + num_elems = 1; if (num_elems * elem_size_bytes > 16) num_elems = 16 / elem_size_bytes; int load_bytes = num_elems * elem_size_bytes; -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/2] radv: ensure export arguments are always float
(accidentally sent an incomplete email) Seems my LLVM configuration was messed up and I might have used my distro's LLVM too. LLVM 8 and 7 with a release build passes. A debug build of 8 (and my messed up builds of 7 and 8 which I thought were release ones) results in an assert. On Thu, 13 Dec 2018 at 08:38, Samuel Pitoiset wrote: > > > > On 12/6/18 3:18 PM, Rhys Perry wrote: > > ./deqp-vk > > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_frag > > should crash with something like: > > deqp-vk: lib/IR/Instructions.cpp:2590: static llvm::CastInst* > > llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*, > > llvm::Type*, const llvm::Twine&, llvm::Instruction*): Assertion > > `castIsValid(op, S, Ty) && "Invalid cast!"' failed. > > because it's trying to zext/sext a half float to a i32. > > > > and ./deqp-vk > > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_vert > > should crash with something like: > > deqp-vk: lib/IR/Instructions.cpp:348: void > > llvm::CallInst::init(llvm::FunctionType*, llvm::Value*, > > llvm::ArrayRef, > > llvm::ArrayRef >, const > > llvm::Twine&): Assertion `(i >= FTy->getNumParams() || > > FTy->getParamType(i) == Args[i]->getType()) && "Calling a function > > with a bad signature!"' failed. > > because it's calling the export intrinsic with incorrect argument types. > > > > For both tests, it seems to only assert with LLVM 8 for some reason. > > I guess you use a debug llvm build? Can you figure out what change > introduces this crash? > > > On Thu, 6 Dec 2018 at 13:31, Samuel Pitoiset > > wrote: > >> > >> > >> > >> On 12/6/18 2:15 PM, Rhys Perry wrote: > >>> So that the signature is correct and consistent, the inputs to a export > >>> intrinsic should always be 32-bit floats. 
> >>> > >>> This and the previous commit fixes a large amount crashes from > >>> dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_* > >>> tests > >>> > >> > >> They don't crash for me? Please explain how to reproduce. > >> > >>> Fixes: b722b29f10d ('radv: add support for 16bit input/output') > >>> Signed-off-by: Rhys Perry > >>> --- > >>>src/amd/vulkan/radv_nir_to_llvm.c | 6 +- > >>>1 file changed, 1 insertion(+), 5 deletions(-) > >>> > >>> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c > >>> b/src/amd/vulkan/radv_nir_to_llvm.c > >>> index 0c91118e5a..90bcc8dbfe 100644 > >>> --- a/src/amd/vulkan/radv_nir_to_llvm.c > >>> +++ b/src/amd/vulkan/radv_nir_to_llvm.c > >>> @@ -2464,12 +2464,8 @@ si_llvm_init_export_args(struct > >>> radv_shader_context *ctx, > >>>} else > >>>memcpy(&args->out[0], values, sizeof(values[0]) * 4); > >>> > >>> - for (unsigned i = 0; i < 4; ++i) { > >>> - if (!(args->enabled_channels & (1 << i))) > >>> - continue; > >>> - > >>> + for (unsigned i = 0; i < 4; ++i) > >>>args->out[i] = ac_to_float(&ctx->ac, args->out[i]); > >>> - } > >>>} > >>> > >>>static void > >>> ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/2] radv: ensure export arguments are always float
Seems my LLVM configuration was messed up and I might have used my distro's LLVM too. On Thu, 13 Dec 2018 at 08:38, Samuel Pitoiset wrote: > > > > On 12/6/18 3:18 PM, Rhys Perry wrote: > > ./deqp-vk > > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_frag > > should crash with something like: > > deqp-vk: lib/IR/Instructions.cpp:2590: static llvm::CastInst* > > llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*, > > llvm::Type*, const llvm::Twine&, llvm::Instruction*): Assertion > > `castIsValid(op, S, Ty) && "Invalid cast!"' failed. > > because it's trying to zext/sext a half float to a i32. > > > > and ./deqp-vk > > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_vert > > should crash with something like: > > deqp-vk: lib/IR/Instructions.cpp:348: void > > llvm::CallInst::init(llvm::FunctionType*, llvm::Value*, > > llvm::ArrayRef, > > llvm::ArrayRef >, const > > llvm::Twine&): Assertion `(i >= FTy->getNumParams() || > > FTy->getParamType(i) == Args[i]->getType()) && "Calling a function > > with a bad signature!"' failed. > > because it's calling the export intrinsic with incorrect argument types. > > > > For both tests, it seems to only assert with LLVM 8 for some reason. > > I guess you use a debug llvm build? Can you figure out what change > introduces this crash? > > > On Thu, 6 Dec 2018 at 13:31, Samuel Pitoiset > > wrote: > >> > >> > >> > >> On 12/6/18 2:15 PM, Rhys Perry wrote: > >>> So that the signature is correct and consistent, the inputs to a export > >>> intrinsic should always be 32-bit floats. > >>> > >>> This and the previous commit fixes a large amount crashes from > >>> dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_* > >>> tests > >>> > >> > >> They don't crash for me? Please explain how to reproduce. 
> >> > >>> Fixes: b722b29f10d ('radv: add support for 16bit input/output') > >>> Signed-off-by: Rhys Perry > >>> --- > >>>src/amd/vulkan/radv_nir_to_llvm.c | 6 +- > >>>1 file changed, 1 insertion(+), 5 deletions(-) > >>> > >>> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c > >>> b/src/amd/vulkan/radv_nir_to_llvm.c > >>> index 0c91118e5a..90bcc8dbfe 100644 > >>> --- a/src/amd/vulkan/radv_nir_to_llvm.c > >>> +++ b/src/amd/vulkan/radv_nir_to_llvm.c > >>> @@ -2464,12 +2464,8 @@ si_llvm_init_export_args(struct > >>> radv_shader_context *ctx, > >>>} else > >>>memcpy(&args->out[0], values, sizeof(values[0]) * 4); > >>> > >>> - for (unsigned i = 0; i < 4; ++i) { > >>> - if (!(args->enabled_channels & (1 << i))) > >>> - continue; > >>> - > >>> + for (unsigned i = 0; i < 4; ++i) > >>>args->out[i] = ac_to_float(&ctx->ac, args->out[i]); > >>> - } > >>>} > >>> > >>>static void > >>> ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] radv: implement VK_EXT_sample_locations
A small number of questions/concerns: - sampleLocationCoordinateRange[1] should probably be set to 0.9375, because of how the sample locations are encoded - gl_SamplePosition doesn't seem like it would return the new sample locations - R_028BD4_PA_SC_CENTROID_PRIORITY_{0,1} isn't updated. I'm not sure if this is required, but it's probably best to do so. - I think it can pointlessly call radv_cayman_emit_msaa_sample_locs() before radv_emit_sample_locations() - unlike AMDVLK, this doesn't seem to make use of sample location information during layout transitions? You said that this implements the bare minimum, so you might already know about some of these though (unless you were just talking about the variableSampleLocations thing). On Fri, 7 Dec 2018 at 16:19, Samuel Pitoiset wrote: > > Basically, this extension allows applications to use custom > sample locations. This only implements the barely minimum. > It doesn't support variable sample locations during subpass. > > Most of the dEQP-VK.pipeline.multisample.sample_locations_ext.* > CTS now pass. > > Only enabled on VI+ because it's untested on older chips. 
> > Signed-off-by: Samuel Pitoiset > --- > src/amd/vulkan/radv_cmd_buffer.c | 177 +- > src/amd/vulkan/radv_device.c | 27 + > src/amd/vulkan/radv_extensions.py | 1 + > src/amd/vulkan/radv_pipeline.c| 30 + > src/amd/vulkan/radv_private.h | 26 +++-- > 5 files changed, 253 insertions(+), 8 deletions(-) > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > b/src/amd/vulkan/radv_cmd_buffer.c > index b4aea5bc898..c4bebeda0ce 100644 > --- a/src/amd/vulkan/radv_cmd_buffer.c > +++ b/src/amd/vulkan/radv_cmd_buffer.c > @@ -105,6 +105,7 @@ radv_bind_dynamic_state(struct radv_cmd_buffer > *cmd_buffer, > dest->viewport.count = src->viewport.count; > dest->scissor.count = src->scissor.count; > dest->discard_rectangle.count = src->discard_rectangle.count; > + dest->sample_location.count = src->sample_location.count; > > if (copy_mask & RADV_DYNAMIC_VIEWPORT) { > if (memcmp(&dest->viewport.viewports, > &src->viewport.viewports, > @@ -192,6 +193,22 @@ radv_bind_dynamic_state(struct radv_cmd_buffer > *cmd_buffer, > } > } > > + if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) { > + if (dest->sample_location.per_pixel != > src->sample_location.per_pixel || > + dest->sample_location.grid_size.width != > src->sample_location.grid_size.width || > + dest->sample_location.grid_size.height != > src->sample_location.grid_size.height || > + memcmp(&dest->sample_location.locations, > + &src->sample_location.locations, > + src->sample_location.count * > sizeof(VkSampleLocationEXT))) { > + dest->sample_location.per_pixel = > src->sample_location.per_pixel; > + dest->sample_location.grid_size = > src->sample_location.grid_size; > + typed_memcpy(dest->sample_location.locations, > +src->sample_location.locations, > +src->sample_location.count); > + dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS; > + } > + } > + > cmd_buffer->state.dirty |= dest_mask; > } > > @@ -634,6 +651,135 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > *cmd_buffer, > } > } > > +/** > + * Convert the user sample locations to 
hardware sample locations (the values > + * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*). > + */ > +static void > +radv_convert_user_sample_locs(struct radv_sample_locations_state *state, > + uint32_t x, uint32_t y, VkOffset2D *sample_locs) > +{ > + uint32_t x_offset = x % state->grid_size.width; > + uint32_t y_offset = y % state->grid_size.height; > + uint32_t num_samples = (uint32_t)state->per_pixel; > + VkSampleLocationEXT *user_locs; > + uint32_t pixel_offset; > + > + pixel_offset = (x_offset + y_offset * state->grid_size.width) * > num_samples; > + > + assert(pixel_offset <= MAX_SAMPLE_LOCATIONS); > + user_locs = &state->locations[pixel_offset]; > + > + for (uint32_t i = 0; i < num_samples; i++) { > + float shifted_pos_x = user_locs[i].x - 0.5; > + float shifted_pos_y = user_locs[i].y - 0.5; > + > + int32_t scaled_pos_x = floor(shifted_pos_x * 16); > + int32_t scaled_pos_y = floor(shifted_pos_y * 16); > + > + sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7); > + sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7); > + } > +} > + > +/** > + * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample > + * locations. > + */ > +static void > +radv_compute_sample_locs_pixel(uint32_t
[Mesa-dev] [PATCH 29/38] ac/nir: implement 16-bit pack/unpack opcodes
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 24 1 file changed, 24 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index aac3330c0d..d69135cc25 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1011,6 +1011,30 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, +ctx->ac.i32_1, ""); + break; + } + case nir_op_cube_face_coord: { src[0] = ac_to_float(&ctx->ac, src[0]); LLVMValueRef results[2]; -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 24/38] ac/nir: make ac_find_lsb work on all bit sizes
Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.c | 31 +-- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 6266058b77..754ceda89b 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2752,29 +2752,10 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMValueRef src0) { unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef zero; - - switch (src0_bitsize) { - case 64: - intrin_name = "llvm.cttz.i64"; - type = ctx->i64; - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.cttz.i32"; - type = ctx->i32; - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.cttz.i16"; - type = ctx->i16; - zero = ctx->i16_0; - break; - default: - unreachable(!"invalid bitsize"); - } + char intrin_name[64]; + LLVMTypeRef type = ac_int_of_size(ctx, src0_bitsize); + LLVMValueRef zero = ac_get_zero(ctx, type); + snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", src0_bitsize); LLVMValueRef params[2] = { src0, @@ -2795,9 +2776,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, params, 2, AC_FUNC_ATTR_READNONE); - if (src0_bitsize == 64) { - lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); - } + lsb = ac_build_ui_cast(ctx, lsb, ctx->i32); /* TODO: We need an intrinsic to skip this conditional. */ /* Check for zero: */ -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 38/38] radv: expose float16, int16 and int8 features and extensions
Signed-off-by: Rhys Perry --- src/amd/vulkan/radv_device.c | 17 + src/amd/vulkan/radv_extensions.py | 4 src/amd/vulkan/radv_shader.c | 3 +++ 3 files changed, 24 insertions(+) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index ad057a8750..8444651a84 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -848,6 +848,23 @@ void radv_GetPhysicalDeviceFeatures2( features->geometryStreams = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { + VkPhysicalDeviceFloat16Int8FeaturesKHR *features = + (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->shaderFloat16 = enabled && HAVE_LLVM >= 0x0800; + features->shaderInt8 = enabled; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { + VkPhysicalDevice8BitStorageFeaturesKHR *features = + (VkPhysicalDevice8BitStorageFeaturesKHR*)ext; + bool enabled = pdevice->rad_info.chip_class >= VI; + features->storageBuffer8BitAccess = enabled; + features->uniformAndStorageBuffer8BitAccess = enabled; + features->storagePushConstant8 = enabled; + break; + } default: break; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 6bdf988d11..62c58e98af 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -91,6 +91,8 @@ EXTENSIONS = [ Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'), Extension('VK_KHR_multiview', 1, True), Extension('VK_KHR_display', 23, 'VK_USE_PLATFORM_DISPLAY_KHR'), +Extension('VK_KHR_shader_float16_int8', 1, 'device->rad_info.chip_class >= VI'), +Extension('VK_KHR_8bit_storage', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_EXT_direct_mode_display', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_acquire_xlib_display', 1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'), Extension('VK_EXT_calibrated_timestamps', 1, True), @@ -117,6 +119,8 @@ EXTENSIONS = [ 
Extension('VK_AMD_shader_core_properties',1, True), Extension('VK_AMD_shader_info', 1, True), Extension('VK_AMD_shader_trinary_minmax', 1, True), +Extension('VK_AMD_gpu_shader_half_float', 1, 'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'), +Extension('VK_AMD_gpu_shader_int16', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_GOOGLE_decorate_string',1, True), Extension('VK_GOOGLE_hlsl_functionality1',1, True), ] diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index a2ddf17680..921b9669f0 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -246,6 +246,9 @@ radv_shader_compile_to_nir(struct radv_device *device, .storage_16bit = true, .geometry_streams = true, .transform_feedback = true, + .float16 = true, + .storage_8bit = true, + .int8 = true, }, }; entry_point = spirv_to_nir(spirv, module->size / 4, -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 31/38] ac/nir, radv: create an array of varying output types
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 68 +++ src/amd/common/ac_shader_abi.h| 1 + src/amd/vulkan/radv_nir_to_llvm.c | 3 ++ 3 files changed, 72 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index e4ae85a1ae..fa7b8c70f0 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3917,6 +3917,68 @@ static void visit_cf_list(struct ac_nir_context *ctx, } } +static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool vs_in, +struct nir_variable *var, unsigned cur_offset, +const struct glsl_type *cur_type, +void (*cb)(struct ac_llvm_context *, unsigned, enum glsl_base_type, void *), +void *cbdata) +{ + if (glsl_type_is_struct(cur_type)) { + for (unsigned i = 0; i < glsl_get_length(cur_type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(cur_type, i); + cur_offset = traverse_var_component_slots(ctx, vs_in, var, cur_offset, ft, cb, cbdata); + } + return (cur_offset + 3) / 4 * 4; + } + + enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array_or_matrix(cur_type)); + + unsigned stride = glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); + if (!var->data.compact) + stride = (stride + 3) / 4 * 4; + unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1); + if (glsl_type_is_array(cur_type)) + arr_len *= glsl_get_aoa_size(cur_type); + for (unsigned i = 0; i < arr_len; i++) { + for (unsigned j = 0; j < glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) { + cb(ctx, cur_offset + var->data.location_frac + j, base_type, cbdata); + } + cur_offset += stride; + } + return cur_offset; +} + +static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, enum glsl_base_type base, void *output_types) +{ + LLVMTypeRef type; + switch (base) { + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + type = ctx->i8; + break; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + type = ctx->i16; + break; + case 
GLSL_TYPE_FLOAT16: + type = ctx->f16; + break; + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + type = ctx->i32; + break; + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: + default: + type = ctx->f32; + break; + } + ((LLVMTypeRef*)output_types)[index] = type; +} + void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi, @@ -3954,6 +4016,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, ac_build_alloca_undef(ctx, type, ""); } } + + traverse_var_component_slots(ctx, false, variable, output_loc * 4, +variable->type, &setup_output_type, abi->output_types); } static LLVMTypeRef @@ -4077,6 +4142,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++) + ctx.abi->output_types[i] = ac->i32; + nir_foreach_variable(variable, &nir->outputs) ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage); diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index 6b9a91c92a..1d078fc42d 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -69,6 +69,7 @@ struct ac_shader_abi { LLVMValueRef view_index; LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; + LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4]; /* For VS and PS: pre-loaded shader inputs. * diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 90bcc8dbfe..f114a86018 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -3945,6 +3945,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out; ac_setup_rings(&ctx); + for (unsigned i
[Mesa-dev] [PATCH 37/38] ac/nir: have nir_op_f2f16 round to zero
Make nir_op_f2f16 round toward zero, in the hope that one day LLVM will be able to generate code with vectorized v_cvt_pkrtz_f16_f32 instructions. Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 92b773981b..88b26e019f 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; case nir_op_f2f16_rtz: + case nir_op_f2f16: src[0] = ac_to_float(&ctx->ac, src[0]); if (LLVMTypeOf(src[0]) == ctx->ac.f64) src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); @@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, ""); break; case nir_op_f2f16_rtne: - case nir_op_f2f16: case nir_op_f2f32: case nir_op_f2f64: src[0] = ac_to_float(&ctx->ac, src[0]); -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 30/38] ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
Signed-off-by: Rhys Perry --- src/amd/common/ac_nir_to_llvm.c | 8 1 file changed, 8 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index d69135cc25..e4ae85a1ae 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3961,11 +3961,19 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac, enum glsl_base_type type) { switch (type) { + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; case GLSL_TYPE_INT: case GLSL_TYPE_UINT: case GLSL_TYPE_BOOL: case GLSL_TYPE_SUBROUTINE: return ac->i32; + case GLSL_TYPE_FLOAT16: + return ac->f16; case GLSL_TYPE_FLOAT: /* TODO handle mediump */ return ac->f32; case GLSL_TYPE_INT64: -- 2.19.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev