Re: [Mesa-dev] [PATCH 06/25] amd/common: scan/reduce across waves of a workgroup
On 06.12.18 15:20, Connor Abbott wrote: > Is this going to be used by an extension? If you don't have a use for > it yet, it would probably be better to wait. Well, I have been using it quite extensively in a branch I've been working on, but that's not quite ready yet. Cheers, Nicolai > On Thu, Dec 6, 2018 at 3:01 PM Nicolai Hähnle wrote: >> >> From: Nicolai Hähnle >> >> Order-aware scan/reduce can trade-off LDS traffic for external atomics >> memory traffic in producer/consumer compute shaders. >> --- >> src/amd/common/ac_llvm_build.c | 195 - >> src/amd/common/ac_llvm_build.h | 36 ++ >> 2 files changed, 227 insertions(+), 4 deletions(-) >> >> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c >> index 68c8bad9e83..932f4bbdeef 100644 >> --- a/src/amd/common/ac_llvm_build.c >> +++ b/src/amd/common/ac_llvm_build.c >> @@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, >> LLVMValueRef lhs, LLVMValueRef rhs, >> _64bit ? ctx->f64 : ctx->f32, >> (LLVMValueRef[]){lhs, rhs}, 2, >> AC_FUNC_ATTR_READNONE); >> case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); >> case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); >> case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); >> default: >> unreachable("bad reduction intrinsic"); >> } >> } >> >> -/* TODO: add inclusive and excluse scan functions for SI chip class. */ >> +/** >> + * \param maxprefix specifies that the result only needs to be correct for a >> + * prefix of this many threads >> + * >> + * TODO: add inclusive and excluse scan functions for SI chip class. 
>> + */ >> static LLVMValueRef >> -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, >> LLVMValueRef identity) >> +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, >> LLVMValueRef identity, >> + unsigned maxprefix) >> { >> LLVMValueRef result, tmp; >> result = src; >> + if (maxprefix <= 1) >> + return result; >> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, >> false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 2) >> + return result; >> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, >> false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 3) >> + return result; >> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, >> false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 4) >> + return result; >> tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, >> false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 8) >> + return result; >> tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, >> false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 16) >> + return result; >> tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, >> 0xf, false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> + if (maxprefix <= 32) >> + return result; >> tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, >> 0xf, false); >> result = ac_build_alu_op(ctx, result, tmp, op); >> return result; >> } >> >> LLVMValueRef >> ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, >> nir_op op) >> { >> ac_build_optimization_barrier(ctx, ); >> LLVMValueRef result; >> LLVMValueRef identity = >> get_reduction_identity(ctx, op, >> ac_get_type_size(LLVMTypeOf(src))); >> result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, >> src, identity), >>LLVMTypeOf(identity), ""); >> - result = ac_build_scan(ctx, 
op, result, identity); >> + result = ac_build_scan(ctx, op, result, identity, 64); >> >> return ac_build_wwm(ctx, result); >> } >> >> LLVMValueRef >> ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, >> nir_op op) >> { >> ac_build_optimization_barrier(ctx, &src); >> LLVMValueRef result; >> LLVMValueRef identity = >> get_reduction_identity(ctx, op, >> ac_get_type_size(LLVMTypeOf(src))); >> result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, >> src, identity), >>LLVMTypeOf(identity), ""); >> result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, >> false); >> - result =
Re: [Mesa-dev] [PATCH 06/25] amd/common: scan/reduce across waves of a workgroup
Is this going to be used by an extension? If you don't have a use for it yet, it would probably be better to wait. On Thu, Dec 6, 2018 at 3:01 PM Nicolai Hähnle wrote: > > From: Nicolai Hähnle > > Order-aware scan/reduce can trade-off LDS traffic for external atomics > memory traffic in producer/consumer compute shaders. > --- > src/amd/common/ac_llvm_build.c | 195 - > src/amd/common/ac_llvm_build.h | 36 ++ > 2 files changed, 227 insertions(+), 4 deletions(-) > > diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c > index 68c8bad9e83..932f4bbdeef 100644 > --- a/src/amd/common/ac_llvm_build.c > +++ b/src/amd/common/ac_llvm_build.c > @@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, > LLVMValueRef lhs, LLVMValueRef rhs, > _64bit ? ctx->f64 : ctx->f32, > (LLVMValueRef[]){lhs, rhs}, 2, > AC_FUNC_ATTR_READNONE); > case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); > case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); > case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); > default: > unreachable("bad reduction intrinsic"); > } > } > > -/* TODO: add inclusive and excluse scan functions for SI chip class. */ > +/** > + * \param maxprefix specifies that the result only needs to be correct for a > + * prefix of this many threads > + * > + * TODO: add inclusive and excluse scan functions for SI chip class. 
> + */ > static LLVMValueRef > -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, > LLVMValueRef identity) > +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, > LLVMValueRef identity, > + unsigned maxprefix) > { > LLVMValueRef result, tmp; > result = src; > + if (maxprefix <= 1) > + return result; > tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 2) > + return result; > tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 3) > + return result; > tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 4) > + return result; > tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 8) > + return result; > tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 16) > + return result; > tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > + if (maxprefix <= 32) > + return result; > tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, > false); > result = ac_build_alu_op(ctx, result, tmp, op); > return result; > } > > LLVMValueRef > ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, > nir_op op) > { > ac_build_optimization_barrier(ctx, ); > LLVMValueRef result; > LLVMValueRef identity = > get_reduction_identity(ctx, op, > ac_get_type_size(LLVMTypeOf(src))); > result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, > src, identity), > LLVMTypeOf(identity), ""); > - result = ac_build_scan(ctx, op, result, identity); > + result = ac_build_scan(ctx, op, 
result, identity, 64); > > return ac_build_wwm(ctx, result); > } > > LLVMValueRef > ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, > nir_op op) > { > ac_build_optimization_barrier(ctx, &src); > LLVMValueRef result; > LLVMValueRef identity = > get_reduction_identity(ctx, op, > ac_get_type_size(LLVMTypeOf(src))); > result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, > src, identity), > LLVMTypeOf(identity), ""); > result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, > false); > - result = ac_build_scan(ctx, op, result, identity); > + result = ac_build_scan(ctx, op, result, identity, 64); > > return ac_build_wwm(ctx, result); > } > > LLVMValueRef > ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, > unsigned cluster_size) > { > if (cluster_size == 1) return src; >
[Mesa-dev] [PATCH 06/25] amd/common: scan/reduce across waves of a workgroup
From: Nicolai Hähnle Order-aware scan/reduce can trade-off LDS traffic for external atomics memory traffic in producer/consumer compute shaders. --- src/amd/common/ac_llvm_build.c | 195 - src/amd/common/ac_llvm_build.h | 36 ++ 2 files changed, 227 insertions(+), 4 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 68c8bad9e83..932f4bbdeef 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, _64bit ? ctx->f64 : ctx->f32, (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); default: unreachable("bad reduction intrinsic"); } } -/* TODO: add inclusive and excluse scan functions for SI chip class. */ +/** + * \param maxprefix specifies that the result only needs to be correct for a + * prefix of this many threads + * + * TODO: add inclusive and excluse scan functions for SI chip class. 
+ */ static LLVMValueRef -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity) +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, + unsigned maxprefix) { LLVMValueRef result, tmp; result = src; + if (maxprefix <= 1) + return result; tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 2) + return result; tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 3) + return result; tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 4) + return result; tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 8) + return result; tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 16) + return result; tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) + return result; tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); result = ac_build_alu_op(ctx, result, tmp, op); return result; } LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) { ac_build_optimization_barrier(ctx, ); LLVMValueRef result; LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), LLVMTypeOf(identity), ""); - result = ac_build_scan(ctx, op, result, identity); + result = ac_build_scan(ctx, op, result, identity, 64); return ac_build_wwm(ctx, result); } LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, 
LLVMValueRef src, nir_op op) { ac_build_optimization_barrier(ctx, &src); LLVMValueRef result; LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), LLVMTypeOf(identity), ""); result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false); - result = ac_build_scan(ctx, op, result, identity); + result = ac_build_scan(ctx, op, result, identity, 64); return ac_build_wwm(ctx, result); } LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) { if (cluster_size == 1) return src; ac_build_optimization_barrier(ctx, &src); LLVMValueRef result, swap; @@ -3450,20 +3470,187 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); return ac_build_wwm(ctx, result); } else { swap = ac_build_readlane(ctx, result, ctx->i32_0); result