Re: [Mesa-dev] [PATCH 06/25] amd/common: scan/reduce across waves of a workgroup

2018-12-07 Thread Haehnle, Nicolai
On 06.12.18 15:20, Connor Abbott wrote:
> Is this going to be used by an extension? If you don't have a use for
> it yet, it would probably be better to wait.

Well, I have been using it quite extensively in a branch I've been 
working on, but that's not quite ready yet.

Cheers,
Nicolai


> On Thu, Dec 6, 2018 at 3:01 PM Nicolai Hähnle  wrote:
>>
>> From: Nicolai Hähnle 
>>
>> Order-aware scan/reduce can trade-off LDS traffic for external atomics
>> memory traffic in producer/consumer compute shaders.
>> ---
>>   src/amd/common/ac_llvm_build.c | 195 -
>>   src/amd/common/ac_llvm_build.h |  36 ++
>>   2 files changed, 227 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
>> index 68c8bad9e83..932f4bbdeef 100644
>> --- a/src/amd/common/ac_llvm_build.c
>> +++ b/src/amd/common/ac_llvm_build.c
>> @@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, 
>> LLVMValueRef lhs, LLVMValueRef rhs,
>>  _64bit ? ctx->f64 : ctx->f32,
>>  (LLVMValueRef[]){lhs, rhs}, 2, 
>> AC_FUNC_ATTR_READNONE);
>>  case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
>>  case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
>>  case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
>>  default:
>>  unreachable("bad reduction intrinsic");
>>  }
>>   }
>>
>> -/* TODO: add inclusive and excluse scan functions for SI chip class.  */
>> +/**
>> + * \param maxprefix specifies that the result only needs to be correct for a
>> + * prefix of this many threads
>> + *
>> + * TODO: add inclusive and excluse scan functions for SI chip class.
>> + */
>>   static LLVMValueRef
>> -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 
>> LLVMValueRef identity)
>> +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 
>> LLVMValueRef identity,
>> + unsigned maxprefix)
>>   {
>>  LLVMValueRef result, tmp;
>>  result = src;
>> +   if (maxprefix <= 1)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, 
>> false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 2)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, 
>> false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 3)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, 
>> false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 4)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, 
>> false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 8)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, 
>> false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 16)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 
>> 0xf, false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>> +   if (maxprefix <= 32)
>> +   return result;
>>  tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 
>> 0xf, false);
>>  result = ac_build_alu_op(ctx, result, tmp, op);
>>  return result;
>>   }
>>
>>   LLVMValueRef
>>   ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, 
>> nir_op op)
>>   {
>>  ac_build_optimization_barrier(ctx, &src);
>>  LLVMValueRef result;
>>  LLVMValueRef identity =
>>  get_reduction_identity(ctx, op, 
>> ac_get_type_size(LLVMTypeOf(src)));
>>  result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, 
>> src, identity),
>>LLVMTypeOf(identity), "");
>> -   result = ac_build_scan(ctx, op, result, identity);
>> +   result = ac_build_scan(ctx, op, result, identity, 64);
>>
>>  return ac_build_wwm(ctx, result);
>>   }
>>
>>   LLVMValueRef
>>   ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, 
>> nir_op op)
>>   {
>>  ac_build_optimization_barrier(ctx, &src);
>>  LLVMValueRef result;
>>  LLVMValueRef identity =
>>  get_reduction_identity(ctx, op, 
>> ac_get_type_size(LLVMTypeOf(src)));
>>  result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, 
>> src, identity),
>>LLVMTypeOf(identity), "");
>>  result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, 
>> false);
>> -   result = 

Re: [Mesa-dev] [PATCH 06/25] amd/common: scan/reduce across waves of a workgroup

2018-12-06 Thread Connor Abbott
Is this going to be used by an extension? If you don't have a use for
it yet, it would probably be better to wait.
On Thu, Dec 6, 2018 at 3:01 PM Nicolai Hähnle  wrote:
>
> From: Nicolai Hähnle 
>
> Order-aware scan/reduce can trade-off LDS traffic for external atomics
> memory traffic in producer/consumer compute shaders.
> ---
>  src/amd/common/ac_llvm_build.c | 195 -
>  src/amd/common/ac_llvm_build.h |  36 ++
>  2 files changed, 227 insertions(+), 4 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 68c8bad9e83..932f4bbdeef 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -3345,68 +3345,88 @@ ac_build_alu_op(struct ac_llvm_context *ctx, 
> LLVMValueRef lhs, LLVMValueRef rhs,
> _64bit ? ctx->f64 : ctx->f32,
> (LLVMValueRef[]){lhs, rhs}, 2, 
> AC_FUNC_ATTR_READNONE);
> case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
> case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
> case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
> default:
> unreachable("bad reduction intrinsic");
> }
>  }
>
> -/* TODO: add inclusive and excluse scan functions for SI chip class.  */
> +/**
> + * \param maxprefix specifies that the result only needs to be correct for a
> + * prefix of this many threads
> + *
> + * TODO: add inclusive and excluse scan functions for SI chip class.
> + */
>  static LLVMValueRef
> -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 
> LLVMValueRef identity)
> +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 
> LLVMValueRef identity,
> + unsigned maxprefix)
>  {
> LLVMValueRef result, tmp;
> result = src;
> +   if (maxprefix <= 1)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 2)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 3)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 4)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 8)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 16)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> +   if (maxprefix <= 32)
> +   return result;
> tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, 
> false);
> result = ac_build_alu_op(ctx, result, tmp, op);
> return result;
>  }
>
>  LLVMValueRef
>  ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, 
> nir_op op)
>  {
> ac_build_optimization_barrier(ctx, &src);
> LLVMValueRef result;
> LLVMValueRef identity =
> get_reduction_identity(ctx, op, 
> ac_get_type_size(LLVMTypeOf(src)));
> result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, 
> src, identity),
>   LLVMTypeOf(identity), "");
> -   result = ac_build_scan(ctx, op, result, identity);
> +   result = ac_build_scan(ctx, op, result, identity, 64);
>
> return ac_build_wwm(ctx, result);
>  }
>
>  LLVMValueRef
>  ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, 
> nir_op op)
>  {
> ac_build_optimization_barrier(ctx, &src);
> LLVMValueRef result;
> LLVMValueRef identity =
> get_reduction_identity(ctx, op, 
> ac_get_type_size(LLVMTypeOf(src)));
> result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, 
> src, identity),
>   LLVMTypeOf(identity), "");
> result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, 
> false);
> -   result = ac_build_scan(ctx, op, result, identity);
> +   result = ac_build_scan(ctx, op, result, identity, 64);
>
> return ac_build_wwm(ctx, result);
>  }
>
>  LLVMValueRef
>  ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, 
> unsigned cluster_size)
>  {
> if (cluster_size == 1) return src;
>