Re: [Mesa-dev] [PATCH v3 5/7] ac: add LLVM build functions for subgroup intrinsics
I'm a bit late, but just some stylistic nitpicks for the future: As a general stylistic note, prefer `false' as the last argument to LLVMConstInt (since it's a boolean, whether to sign-extend). [snip] +{ + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result; + LLVMValueRef identity = get_reduction_identity(ctx, op, + ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, + ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); Weird whitespace, are you using a non-default tab width? + result = ac_build_scan(ctx, op, result, identity); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result; + LLVMValueRef identity = get_reduction_identity(ctx, op, + ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, + ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false); + result = ac_build_scan(ctx, op, result, identity); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) +{ + if (cluster_size == 1) return src; Prefer to put the return on a new line, please. Thanks, Nicolai + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result, swap; + LLVMValueRef identity = get_reduction_identity(ctx, op, + ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, + ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); Weird whitespace again. 
With those style nitpicks fixed, Reviewed-by: Nicolai Hähnle + swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 2) return ac_build_wwm(ctx, result); + + swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 4) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= VI) + swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 8) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= VI) + swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 16) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= VI && cluster_size != 32) + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 32) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= VI) { + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, swap, op); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + return ac_build_wwm(ctx, result); + } else { + swap = ac_build_readlane(ctx, result, ctx->i32_0); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); + result = ac_build_alu_op(ctx, result, swap, op); + return ac_build_wwm(ctx, result); + } +} + +LLVMValueRef +ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) 
+{ + unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); + if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) { + return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); + } else { +
[Mesa-dev] [PATCH v3 5/7] ac: add LLVM build functions for subgroup intrinsics
Co-authored-by: Connor Abbott --- src/amd/common/ac_llvm_build.c | 456 + src/amd/common/ac_llvm_build.h | 30 ++- 2 files changed, 485 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 2fb8aeaac6..9a00bb1114 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2507,3 +2507,459 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], ""); } + +static LLVMValueRef +_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + ac_build_optimization_barrier(ctx, &src); + return ac_build_intrinsic(ctx, + lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", + LLVMTypeOf(src), (LLVMValueRef []) { + src, lane }, + lane == NULL ? 1 : 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +/** + * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. 
+ * @param ctx + * @param src + * @param lane - id of the lane or NULL for the first active lane + * @return value of the lane + */ +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + + if (bits == 32) { + ret = _ac_build_readlane(ctx, src, lane); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, 0), ""); + LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane); + ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, + LLVMConstInt(ctx->i32, i, 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) +{ + /* TODO: Use the actual instruction when LLVM adds an intrinsic for it. 
+*/ + LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane, + ac_get_thread_id(ctx), ""); + return LLVMBuildSelect(ctx->builder, pred, value, src, ""); +} + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) +{ + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, +LLVMVectorType(ctx->i32, 2), +""); + LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_0, ""); + LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_1, ""); + LLVMValueRef val = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask_lo, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, +(LLVMValueRef []) { mask_hi, val }, +2, AC_FUNC_ATTR_READNONE); + return val; +} + +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +static inline enum dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); +} + +static inline enum dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sl | amount; +} + +static inline enum dpp_ctrl +dpp_row_sr(unsigned amount)