Module: Mesa Branch: main Commit: f98871608cbae6f7fd16561e9c92f5c22334e5f1 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=f98871608cbae6f7fd16561e9c92f5c22334e5f1
Author: Marek Olšák <[email protected]> Date: Thu Apr 27 03:49:10 2023 -0400 ac/llvm: rewrite and unify how GLC, DLC, SLC are set Use ACCESS_* flags in call sites instead of GLC/DLC/SLC. ACCESS_* flags are extended to describe other aspects of memory instructions like load/store/atomic/smem. Then add a function that converts the access flags to GLC, DLC, SLC. The new functions are also usable by ACO. Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22770> --- src/amd/common/ac_shader_util.c | 140 +++++++++++++++++++++ src/amd/common/ac_shader_util.h | 40 ++++++ src/amd/llvm/ac_llvm_build.c | 84 ++++++------- src/amd/llvm/ac_llvm_build.h | 29 ++--- src/amd/llvm/ac_nir_to_llvm.c | 89 +++++-------- src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 11 +- 6 files changed, 270 insertions(+), 123 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 29015ec271e..464a2fccd13 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -1014,3 +1014,143 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info, *tmpring_size = S_0286E8_WAVES(max_scratch_waves) | S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift); } + +/* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC) + * from a NIR memory intrinsic. + */ +enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr) +{ + enum gl_access_qualifier access = + nir_intrinsic_has_access(instr) ? nir_intrinsic_access(instr) : 0; + + /* Determine ACCESS_MAY_STORE_SUBDWORD. (for the GFX6 TC L1 bug workaround) */ + if (!nir_intrinsic_infos[instr->intrinsic].has_dest) { + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_store: + access |= ACCESS_MAY_STORE_SUBDWORD; + break; + + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_buffer_amd: + case nir_intrinsic_store_global: + case nir_intrinsic_store_global_amd: + if (access & ACCESS_USES_FORMAT_AMD || + (nir_intrinsic_has_align_offset(instr) && nir_intrinsic_align(instr) % 4 != 0) || + ((instr->src[0].ssa->bit_size / 8) * instr->src[0].ssa->num_components) % 4 != 0) + access |= ACCESS_MAY_STORE_SUBDWORD; + break; + + default: + unreachable("unexpected store instruction"); + } + } + + return access; +} + +/* Convert chip-agnostic memory access flags into hw-specific cache flags. + * + * "access" must be a result of ac_get_mem_access_flags() with the appropriate ACCESS_TYPE_* + * flags set. + */ +union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level, + enum gl_access_qualifier access) +{ + union ac_hw_cache_flags result; + result.value = 0; + + assert(util_bitcount(access & (ACCESS_TYPE_LOAD | ACCESS_TYPE_STORE | + ACCESS_TYPE_ATOMIC)) == 1); + assert(!(access & ACCESS_TYPE_SMEM) || access & ACCESS_TYPE_LOAD); + assert(!(access & ACCESS_IS_SWIZZLED_AMD) || !(access & ACCESS_TYPE_SMEM)); + assert(!(access & ACCESS_MAY_STORE_SUBDWORD) || access & ACCESS_TYPE_STORE); + + bool scope_is_device = access & (ACCESS_COHERENT | ACCESS_VOLATILE); + + if (gfx_level >= GFX11) { + /* GFX11 simplified it and exposes what is actually useful. + * + * GLC means device scope for loads only. (stores and atomics are always device scope) + * SLC means non-temporal for GL1 and GL2 caches. (GL1 = hit-evict, GL2 = stream, unavailable in SMEM) + * DLC means non-temporal for MALL. (noalloc, i.e. coherent bypass) + * + * GL0 doesn't have a non-temporal flag, so you always get LRU caching in CU scope. + */ + if (access & ACCESS_TYPE_LOAD && scope_is_device) + result.value |= ac_glc; + + if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM)) + result.value |= ac_slc; + } else if (gfx_level >= GFX10) { + /* GFX10-10.3: + * + * VMEM and SMEM loads (SMEM only supports the first four): + * !GLC && !DLC && !SLC means CU scope <== use for normal loads with CU scope + * GLC && !DLC && !SLC means SA scope + * !GLC && DLC && !SLC means CU scope, GL1 bypass + * GLC && DLC && !SLC means device scope <== use for normal loads with device scope + * !GLC && !DLC && SLC means CU scope, non-temporal (GL0 = GL1 = hit-evict, GL2 = stream) <== use for non-temporal loads with CU scope + * GLC && !DLC && SLC means SA scope, non-temporal (GL1 = hit-evict, GL2 = stream) + * !GLC && DLC && SLC means CU scope, GL0 non-temporal, GL1-GL2 coherent bypass (GL0 = hit-evict, GL1 = bypass, GL2 = noalloc) + * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) <== use for non-temporal loads with device scope + * + * VMEM stores/atomics (stores are CU scope only if they overwrite the whole cache line, + * atomics are always device scope, GL1 is always bypassed): + * !GLC && !DLC && !SLC means CU scope <== use for normal stores with CU scope + * GLC && !DLC && !SLC means device scope <== use for normal stores with device scope + * !GLC && DLC && !SLC means CU scope, GL2 non-coherent bypass + * GLC && DLC && !SLC means device scope, GL2 non-coherent bypass + * !GLC && !DLC && SLC means CU scope, GL2 non-temporal (stream) <== use for non-temporal stores with CU scope + * GLC && !DLC && SLC means device scope, GL2 non-temporal (stream) <== use for non-temporal stores with device scope + * !GLC && DLC && SLC means CU scope, GL2 coherent bypass (noalloc) + * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) + * + * "stream" allows write combining in GL2. "coherent bypass" doesn't. + * "non-coherent bypass" doesn't guarantee ordering with any coherent stores. + */ + if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) + result.value |= ac_glc | (access & ACCESS_TYPE_LOAD ? ac_dlc : 0); + + if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM)) + result.value |= ac_slc; + } else { + /* GFX6-GFX9: + * + * VMEM loads: + * !GLC && !SLC means CU scope + * GLC && !SLC means (GFX6: device scope, GFX7-9: device scope [*]) + * !GLC && SLC means (GFX6: CU scope, GFX7: device scope, GFX8-9: CU scope), GL2 non-temporal (stream) + * GLC && SLC means device scope, GL2 non-temporal (stream) + * + * VMEM stores (atomics don't have [*]): + * !GLC && !SLC means (GFX6: CU scope, GFX7-9: device scope [*]) + * GLC && !SLC means (GFX6-7: device scope, GFX8-9: device scope [*]) + * !GLC && SLC means (GFX6: CU scope, GFX7-9: device scope [*]), GL2 non-temporal (stream) + * GLC && SLC means device scope, GL2 non-temporal (stream) + * + * [*] data can be cached in GL1 for future CU scope + * + * SMEM loads: + * GLC means device scope (available on GFX8+) + */ + if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) { + /* SMEM doesn't support the device scope on GFX6-7. */ + assert(gfx_level >= GFX8 || !(access & ACCESS_TYPE_SMEM)); + result.value |= ac_glc; + } + + if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM)) + result.value |= ac_slc; + + /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All store opcodes not + * aligned to a dword are affected. + */ + if (gfx_level == GFX6 && access & ACCESS_MAY_STORE_SUBDWORD) + result.value |= ac_glc; + } + + if (access & ACCESS_IS_SWIZZLED_AMD) + result.value |= ac_swizzled; + + return result; +} diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 7c2b5ea4035..dc36cb373ea 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -46,6 +46,41 @@ extern "C" { #define AC_SENDMSG_GS_OP_EMIT (2 << 4) #define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) +/* An extension of gl_access_qualifier describing other aspects of memory operations + * for code generation. + */ +enum { + /* Only one of LOAD/STORE/ATOMIC can be set. */ + ACCESS_TYPE_LOAD = BITFIELD_BIT(27), + ACCESS_TYPE_STORE = BITFIELD_BIT(28), + ACCESS_TYPE_ATOMIC = BITFIELD_BIT(29), + + /* This access is expected to use an SMEM instruction if source operands are non-divergent. + * Only loads can set this. + */ + ACCESS_TYPE_SMEM = BITFIELD_BIT(30), + + /* Whether a store offset or size alignment is less than 4. */ + ACCESS_MAY_STORE_SUBDWORD = BITFIELD_BIT(31), +}; + +/* The meaning of these enums is different between chips. They match LLVM definitions, + * but they can also be used by ACO. Use ac_get_hw_cache_flags to get these. + */ +enum ac_cache_flags +{ + ac_glc = BITFIELD_BIT(0), + ac_slc = BITFIELD_BIT(1), + ac_dlc = BITFIELD_BIT(2), + ac_swizzled = BITFIELD_BIT(3), +}; + +union ac_hw_cache_flags +{ + /* NOTE: This will contain more fields in the future. */ + enum ac_cache_flags value; +}; + enum ac_image_dim { ac_image_1d, @@ -199,6 +234,11 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage, bool streamout_enabled, bool can_cull); +enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr); + +union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level, + enum gl_access_qualifier access); + #ifdef __cplusplus } #endif diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 89c3af49c93..423cedd04b0 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -1221,23 +1221,15 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, false); } -static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy) +static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access) { - return cache_policy | - (ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0); -} - -static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy) -{ - if (ctx->gfx_level >= GFX11) - cache_policy &= ~ac_glc; /* GLC has no effect on stores */ - return cache_policy; + return ac_get_hw_cache_flags(ctx->gfx_level, access).value; } static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy, bool use_format) + enum gl_access_qualifier access, bool use_format) { LLVMValueRef args[6]; int idx = 0; @@ -1247,7 +1239,7 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR args[idx++] = vindex ? vindex : ctx->i32_0; args[idx++] = voffset ? voffset : ctx->i32_0; args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0); const char *indexing_kind = vindex ? "struct" : "raw"; char name[256], type_name[8]; @@ -1264,15 +1256,15 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR } void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data, - LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy) + LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access) { - ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true); + ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, access, true); } /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy) + enum gl_access_qualifier access) { unsigned num_channels = ac_get_llvm_num_components(vdata); @@ -1288,19 +1280,19 @@ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, LLVMConstInt(ctx->i32, 8, 0), ""); - ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy); - ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy); + ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access); + ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access); return; } ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset, - cache_policy, false); + access, false); } static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, unsigned num_channels, - LLVMTypeRef channel_type, unsigned cache_policy, + LLVMTypeRef channel_type, enum gl_access_qualifier access, bool can_speculate, bool use_format) { LLVMValueRef args[5]; @@ -1310,7 +1302,7 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV args[idx++] = vindex; args[idx++] = voffset ? voffset : ctx->i32_0; args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0); unsigned func = !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels; const char *indexing_kind = vindex ? "struct" : "raw"; @@ -1339,11 +1331,10 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - LLVMTypeRef channel_type, unsigned cache_policy, + LLVMTypeRef channel_type, enum gl_access_qualifier access, bool can_speculate, bool allow_smem) { - if (allow_smem && !(cache_policy & ac_slc) && - (!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) { + if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) { assert(vindex == NULL); LLVMValueRef result[32]; @@ -1365,7 +1356,8 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc LLVMValueRef args[3] = { rsrc, offset, - LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), + LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD | + ACCESS_TYPE_SMEM), 0), }; result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD); } @@ -1386,7 +1378,7 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), ""); LLVMValueRef item = ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels, - channel_type, cache_policy, can_speculate, false); + channel_type, access, can_speculate, false); result = ac_build_concat(ctx, result, item); } @@ -1395,13 +1387,13 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, - unsigned num_channels, unsigned cache_policy, + unsigned num_channels, enum gl_access_qualifier access, bool can_speculate, bool d16, bool tfe) { if (tfe) { assert(!d16); - cache_policy = get_load_cache_policy(ctx, cache_policy); + unsigned cache_flags = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD); char code[256]; /* The definition in the assembly and the one in the constraint string @@ -1415,9 +1407,9 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR "v_mov_b32 v4, 0\n" "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n" "s_waitcnt vmcnt(0)", - cache_policy & ac_glc ? "glc" : "", - cache_policy & ac_slc ? "slc" : "", - cache_policy & ac_dlc ? "dlc" : ""); + cache_flags & ac_glc ? "glc" : "", + cache_flags & ac_slc ? "slc" : "", + cache_flags & ac_dlc ? "dlc" : ""); LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32}; LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false); @@ -1435,7 +1427,7 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR } return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, - num_channels, d16 ? ctx->f16 : ctx->f32, cache_policy, + num_channels, d16 ? ctx->f16 : ctx->f32, access, can_speculate, true); } @@ -1443,7 +1435,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, unsigned num_channels, unsigned tbuffer_format, LLVMTypeRef channel_type, - unsigned cache_policy, bool can_speculate) + enum gl_access_qualifier access, bool can_speculate) { LLVMValueRef args[6]; int idx = 0; @@ -1453,7 +1445,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue args[idx++] = voffset ? voffset : ctx->i32_0; args[idx++] = soffset ? soffset : ctx->i32_0; args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0); - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0); const char *indexing_kind = vindex ? "struct" : "raw"; char name[256], type_name[8]; @@ -1474,7 +1466,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe unsigned align_offset, unsigned align_mul, unsigned num_channels, - unsigned cache_policy, + enum gl_access_qualifier access, bool can_speculate) { const unsigned max_channels = vtx_info->num_channels; @@ -1503,7 +1495,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe LLVMValueRef item = ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset, fetch_num_channels, fetch_format, channel_type, - cache_policy, can_speculate); + access, can_speculate); result = ac_build_concat(ctx, result, item); } @@ -1513,35 +1505,35 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy) + enum gl_access_qualifier access) { return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16, - cache_policy, false, false); + access, false, false); } LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy) + enum gl_access_qualifier access) { - return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy, + return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, access, false, false); } void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy) + enum gl_access_qualifier access) { vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); - ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false); + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false); } void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, - LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy) + LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access) { vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); - ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false); + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false); } /** @@ -2025,7 +2017,11 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_ args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */ args[num_args++] = LLVMConstInt( - ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false); + ctx->i32, get_cache_flags(ctx, + a->access | + (atomic ? ACCESS_TYPE_ATOMIC : + load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)), + false); const char *name; const char *atomic_subop = ""; diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index b80e86e5af0..840ab74b172 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -281,28 +281,28 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy); + enum gl_access_qualifier access); void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data, - LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy); + LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access); LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - LLVMTypeRef channel_type, unsigned cache_policy, + LLVMTypeRef channel_type, enum gl_access_qualifier access, bool can_speculate, bool allow_smem); LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, - unsigned num_channels, unsigned cache_policy, + unsigned num_channels, enum gl_access_qualifier access, bool can_speculate, bool d16, bool tfe); LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy); + enum gl_access_qualifier access); LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy); + enum gl_access_qualifier access); LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, @@ -312,15 +312,15 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe unsigned align_offset, unsigned align_mul, unsigned num_channels, - unsigned cache_policy, + enum gl_access_qualifier access, bool can_speculate); void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned cache_policy); + enum gl_access_qualifier access); void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, - LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy); + LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access); void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo, unsigned hi); @@ -391,21 +391,12 @@ enum ac_atomic_op ac_atomic_fmax, }; -/* These cache policy bits match the definitions used by the LLVM intrinsics. */ -enum ac_image_cache_policy -{ - ac_glc = 1 << 0, /* per-CU cache control */ - ac_slc = 1 << 1, /* global L2 cache control */ - ac_dlc = 1 << 2, /* per-shader-array cache control */ - ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */ -}; - struct ac_image_args { enum ac_image_opcode opcode; enum ac_atomic_op atomic; /* for the ac_image_atomic opcode */ enum ac_image_dim dim; + enum gl_access_qualifier access; unsigned dmask : 4; - unsigned cache_policy : 3; bool unorm : 1; bool level_zero : 1; bool d16 : 1; /* GFX8+: data and return values are 16-bit */ diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index fe887798ea4..97373ee5c62 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -1804,26 +1804,6 @@ static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueR } } -static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access, - bool may_store_unaligned) -{ - unsigned cache_policy = 0; - - /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All - * store opcodes not aligned to a dword are affected. The only way to - * get unaligned stores is through shader images. - */ - if (((may_store_unaligned && ctx->ac.gfx_level == GFX6) || - access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { - cache_policy |= ac_glc; - } - - if (access & ACCESS_NON_TEMPORAL) - cache_policy |= ac_slc | ac_glc; - - return cache_policy; -} - static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx, const nir_intrinsic_instr *instr, nir_src src) { @@ -1841,8 +1821,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in LLVMValueRef src_data = get_src(ctx, instr->src[0]); int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; unsigned writemask = nir_intrinsic_write_mask(instr); - enum gl_access_qualifier access = nir_intrinsic_access(instr); - unsigned cache_policy = get_cache_policy(ctx, access, false); + enum gl_access_qualifier access = ac_get_mem_access_flags(instr); struct waterfall_context wctx; LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]); @@ -1897,9 +1876,9 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); if (num_bytes == 1) { - ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy); + ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access); } else if (num_bytes == 2) { - ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy); + ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access); } else { switch (num_bytes) { case 16: /* v4f32 */ @@ -1920,7 +1899,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); ac_build_buffer_store_dword(&ctx->ac, rsrc, data, NULL, offset, - ctx->ac.i32_0, cache_policy); + ctx->ac.i32_0, access); } } @@ -2066,11 +2045,16 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_ data = ac_to_float(&ctx->ac, data); return_type = LLVMTypeOf(data); } + + unsigned cache_flags = + ac_get_hw_cache_flags(ctx->ac.gfx_level, + ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value; + params[arg_count++] = data; params[arg_count++] = descriptor; params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ params[arg_count++] = ctx->ac.i32_0; /* soffset */ - params[arg_count++] = ctx->ac.i32_0; /* slc */ + params[arg_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0); ac_build_type_name_for_intr(return_type, type, sizeof(type)); snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); @@ -2095,8 +2079,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_ int elem_size_bytes = instr->dest.ssa.bit_size / 8; int num_components = instr->num_components; - enum gl_access_qualifier access = nir_intrinsic_access(instr); - unsigned cache_policy = get_cache_policy(ctx, access, false); + enum gl_access_qualifier access = ac_get_mem_access_flags(instr); LLVMValueRef offset = get_src(ctx, instr->src[1]); LLVMValueRef rsrc = ctx->abi->load_ssbo ? @@ -2122,16 +2105,16 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_ if (load_bytes == 1) { ret = ac_build_buffer_load_byte(&ctx->ac, rsrc, voffset, ctx->ac.i32_0, - cache_policy); + access); } else if (load_bytes == 2) { ret = ac_build_buffer_load_short(&ctx->ac, rsrc, voffset, ctx->ac.i32_0, - cache_policy); + access); } else { int num_channels = util_next_power_of_two(load_bytes) / 4; bool can_speculate = access & ACCESS_CAN_REORDER; ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, voffset, ctx->ac.i32_0, - ctx->ac.f32, cache_policy, can_speculate, false); + ctx->ac.f32, access, can_speculate, false); } LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); @@ -2507,7 +2490,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri struct ac_image_args args = {0}; - args.cache_policy = get_cache_policy(ctx, access, false); + args.access = ac_get_mem_access_flags(instr); args.tfe = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load; if (dim == GLSL_SAMPLER_DIM_BUF) { @@ -2523,7 +2506,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri assert(instr->dest.is_ssa); bool can_speculate = access & ACCESS_CAN_REORDER; res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels, - args.cache_policy, can_speculate, + args.access, can_speculate, instr->dest.ssa.bit_size == 16, args.tfe); res = ac_build_expand(&ctx->ac, res, num_channels, args.tfe ? 5 : 4); @@ -2588,14 +2571,13 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in } enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); - enum gl_access_qualifier access = nir_intrinsic_access(instr); bool is_array = nir_intrinsic_image_array(instr); struct waterfall_context wctx; LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); struct ac_image_args args = {0}; - args.cache_policy = get_cache_policy(ctx, access, true); + args.access = ac_get_mem_access_flags(instr); LLVMValueRef src = get_src(ctx, instr->src[3]); if (instr->src[3].ssa->bit_size == 64) { @@ -2617,7 +2599,7 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); - ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy); + ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.access); } else { bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; @@ -2730,9 +2712,12 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int } else { LLVMTypeRef data_type = LLVMTypeOf(params[0]); char type[8]; + unsigned cache_flags = + ac_get_hw_cache_flags(ctx->ac.gfx_level, + ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value; params[param_count++] = ctx->ac.i32_0; /* soffset */ - params[param_count++] = ctx->ac.i32_0; /* slc */ + params[param_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0); ac_build_type_name_for_intr(data_type, type, sizeof(type)); length = snprintf(intrinsic_name, sizeof(intrinsic_name), @@ -2752,6 +2737,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int args.resource = ctx->abi->load_sampler_desc(ctx->abi, dynamic_index, AC_DESC_IMAGE); get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); args.dim = ac_get_image_dim(ctx->ac.gfx_level, dim, is_array); + args.access = ac_get_mem_access_flags(instr); result = ac_build_image_opcode(&ctx->ac, &args); } @@ -3805,19 +3791,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[src_base + 3]) : NULL; unsigned num_components = instr->dest.ssa.num_components; unsigned const_offset = nir_intrinsic_base(instr); - bool swizzled = nir_intrinsic_access(instr) & ACCESS_IS_SWIZZLED_AMD; bool reorder = nir_intrinsic_can_reorder(instr); - bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT; - bool slc = nir_intrinsic_access(instr) & ACCESS_NON_TEMPORAL; - bool uses_format = nir_intrinsic_access(instr) & ACCESS_USES_FORMAT_AMD; - - enum ac_image_cache_policy cache_policy = 0; - if (swizzled) - cache_policy |= ac_swizzled; - if (slc) - cache_policy |= ac_slc; - if (coherent) - cache_policy |= ac_glc; + enum gl_access_qualifier access = ac_get_mem_access_flags(instr); + bool uses_format = access & ACCESS_USES_FORMAT_AMD; LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, addr_voffset, LLVMConstInt(ctx->ac.i32, const_offset, 0), ""); @@ -3825,12 +3801,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins if (instr->intrinsic == nir_intrinsic_load_buffer_amd && uses_format) { assert(instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 32); result = ac_build_buffer_load_format(&ctx->ac, descriptor, vidx, voffset, num_components, - cache_policy, reorder, + access, reorder, instr->dest.ssa.bit_size == 16, false); result = ac_to_integer(&ctx->ac, result); } else if (instr->intrinsic == nir_intrinsic_store_buffer_amd && uses_format) { assert(instr->src[0].ssa->bit_size == 16 || instr->src[0].ssa->bit_size == 32); - ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, cache_policy); + ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, access); } else if (instr->intrinsic == nir_intrinsic_load_buffer_amd || instr->intrinsic == nir_intrinsic_load_typed_buffer_amd) { /* LLVM is unable to select instructions for larger than 32-bit channel types. @@ -3843,7 +3819,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins if (instr->intrinsic == nir_intrinsic_load_buffer_amd) { result = ac_build_buffer_load(&ctx->ac, descriptor, fetch_num_components, vidx, voffset, - addr_soffset, channel_type, cache_policy, reorder, false); + addr_soffset, channel_type, access, reorder, false); } else { const unsigned align_offset = nir_intrinsic_align_offset(instr); const unsigned align_mul = nir_intrinsic_align_mul(instr); @@ -3854,7 +3830,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ac_build_safe_tbuffer_load(&ctx->ac, descriptor, vidx, addr_voffset, addr_soffset, channel_type, vtx_info, const_offset, align_offset, - align_mul, fetch_num_components, cache_policy, reorder); + align_mul, fetch_num_components, access, reorder); } /* Trim to needed vector components. */ @@ -3884,7 +3860,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins LLVMValueRef data = extract_vector_range(&ctx->ac, store_data, start, count); ac_build_buffer_store_dword(&ctx->ac, descriptor, data, vidx, voffset, addr_soffset, - cache_policy); + access); } } break; @@ -3933,12 +3909,15 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins LLVMValueRef data = get_src(ctx, instr->src[1]); unsigned base = nir_intrinsic_base(instr); LLVMTypeRef return_type = LLVMTypeOf(data); + unsigned cache_flags = + ac_get_hw_cache_flags(ctx->ac.gfx_level, + ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value; LLVMValueRef args[] = { data, desc, LLVMConstInt(ctx->ac.i32, base, false), ctx->ac.i32_0, /* soffset */ - ctx->ac.i32_0, /* cachepolicy */ + LLVMConstInt(ctx->ac.i32, cache_flags, 0), }; char name[64], type[8]; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 53fb4991594..a2dad78a78c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -372,7 +372,8 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504); ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), - NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base, ac_glc); + NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base, + ACCESS_COHERENT); ac_build_endif(&ctx->ac, 6504); offset += 4; } @@ -381,13 +382,13 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, LLVMBuildAdd(ctx->ac.builder, byteoffset, LLVMConstInt(ctx->ac.i32, offset, 0), ""), - tf_base, ac_glc); + tf_base, ACCESS_COHERENT); offset += 16; if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, NULL, LLVMBuildAdd(ctx->ac.builder, byteoffset, LLVMConstInt(ctx->ac.i32, offset, 0), ""), - tf_base, ac_glc); + tf_base, ACCESS_COHERENT); /* Store the tess factors into the offchip buffer if TES reads them. */ if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) { @@ -405,7 +406,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_comps); ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, NULL, tf_outer_offset, - base, ac_glc); + base, ACCESS_COHERENT); if (inner_comps) { param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER); tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, @@ -413,7 +414,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader inner_vec = ac_build_gather_values(&ctx->ac, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, NULL, - tf_inner_offset, base, ac_glc); + tf_inner_offset, base, ACCESS_COHERENT); } }
