This is based more on documentation reading than on testing, as only limited MI300 testing has been done so far and this code seemingly does not usually get touched.
MI300's "9.1.10 Memory Scope and Temporal Control" distinguishes between scalar memory (9.1.10.1), for which a single control bit exists: GLC (Globally Coherent) [+ dlc, slc, scc, but not used by MI300], and vector memory (9.1.10.2; flat, global, scratch, buffer), for which there is the system cache level SC[1:0] (wave, group, device, system) and also NT (non-temporal). This patch moves back to 'glc' for scalar memory access. OK for mainline? Tobias PS: Some more smaller fixes are in the pipeline and there are some known MI300 issues, not all fully understood. Likewise in the (to-do) pipeline is more in-depth testing.
gcn: Fix glc vs. sc0 handling for scalar memory access gfx942 still uses glc for scalar access ('s_...') and only uses sc0/nt/sc1 for vector access. gcc/ChangeLog: * config/gcn/gcn-opts.h (TARGET_GLC_NAME): Fix and extend the description in the comment. (TARGET_KERNARG_PRELOAD): Define. * config/gcn/gcn.cc (gcn_hsa_declare_function_name): Use it. (print_operand): Extend the comment about 'G'/'g'. * config/gcn/gcn.md: Use 'glc' instead of %G where appropriate. gcc/config/gcn/gcn-opts.h | 10 ++++++++-- gcc/config/gcn/gcn.cc | 8 ++++++++ gcc/config/gcn/gcn.md | 30 +++++++++++++++--------------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index bcea14f3fe7..2c17e30df3c 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -84,8 +84,11 @@ enum hsaco_attr_type #define TARGET_DPP8 TARGET_RDNA2_PLUS /* Device requires CDNA1-style manually inserted wait states for AVGPRs. */ #define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1 -/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0, sc1) flag - for scalar memory operations. The string starts on purpose with a space. */ +/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag + for non-scalar memory operations. The string starts on purpose with a space. + Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used. + CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however, + there is no non-scalar user so far. */ #define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc") /* The metadata on different devices need different granularity. */ #define TARGET_VGPR_GRANULARITY \ @@ -94,6 +97,9 @@ enum hsaco_attr_type : 4) /* This mostly affects the metadata. */ #define TARGET_ARCHITECTED_FLAT_SCRATCH (TARGET_RDNA3 || TARGET_CDNA3) +/* Whether kernarg_preload is supported; note that not all gfx90a + support it (MI210 does not). 
*/ +#define TARGET_KERNARG_PRELOAD TARGET_CDNA2_PLUS /* Device has Sub-DWord Addressing instrucions. */ #define TARGET_SDWA (!TARGET_RDNA3) /* Different devices uses different cache control instructions. */ diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 2d8dfa3232e..e37f379a318 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -6653,6 +6653,12 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, if (!TARGET_ARCHITECTED_FLAT_SCRATCH) fprintf (file, "\t .amdhsa_reserve_flat_scratch\t0\n"); + /* Currently, set to the default value of 0; note that gfx90a's MI210 does + not support it, only other gfx90a devices. */ + if (TARGET_KERNARG_PRELOAD) + fprintf (file, + "\t .amdhsa_user_sgpr_kernarg_preload_length\t0\n" + "\t .amdhsa_user_sgpr_kernarg_preload_offset\t0\n"); if (TARGET_AVGPR_COMBINED) fprintf (file, "\t .amdhsa_accum_offset\t%i\n", @@ -7103,6 +7109,8 @@ print_operand_address (FILE *file, rtx mem) O - print offset:n for data share operations. G - print "glc" (or for gfx94x: sc0) unconditionally [+ indep. of regnum] g - print "glc" (or for gfx94x: sc0), if appropriate for given MEM + NOTE: Do not use 'G' or 'g with scalar memory access ('s_...') as those + require "glc" also with gfx94x. L - print low-part of a multi-reg value H - print second part of a multi-reg value (high-part of 2-reg value) J - print third part of a multi-reg value diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 1998931e052..2ce2e054fbf 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -206,7 +206,7 @@ ; vdata: vgpr0-255 ; srsrc: sgpr0-102 ; soffset: sgpr0-102 -; flags: offen, idxen, %G, lds, slc, tfe +; flags: offen, idxen, glc, lds, slc, tfe ; ; mtbuf - Typed memory buffer operation. 
Two words ; offset: 12-bit constant @@ -216,10 +216,10 @@ ; vdata: vgpr0-255 ; srsrc: sgpr0-102 ; soffset: sgpr0-102 -; flags: offen, idxen, %G, lds, slc, tfe +; flags: offen, idxen, glc, lds, slc, tfe ; ; flat - flat or global memory operations -; flags: %G, slc +; flags: {CDNA3: sc0, nt, sc1 | otherwise: glc, slc, scc} ; addr: vgpr0-255 ; data: vgpr0-255 ; vdst: vgpr0-255 @@ -1987,7 +1987,7 @@ (use (match_operand 3 "const_int_operand"))] "0 /* Disabled. */" "@ - s_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\tlgkmcnt(0) + s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0) flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") @@ -2054,7 +2054,7 @@ UNSPECV_ATOMIC))] "" "@ - s_atomic_cmpswap<X>\t%0, %1, %2 %G2\;s_waitcnt\tlgkmcnt(0) + s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0) flat_atomic_cmpswap<X>\t%0, %1, %2 %G2\;s_waitcnt\t0 global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") @@ -2096,7 +2096,7 @@ switch (which_alternative) { case 0: - return "s_load%o0\t%0, %A1 %G1\;s_waitcnt\tlgkmcnt(0)"; + return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)"; case 1: return (TARGET_RDNA2 /* Not GFX11. */ ? 
"flat_load%o0\t%0, %A1%O1 %G1 dlc\;s_waitcnt\t0" @@ -2113,7 +2113,7 @@ switch (which_alternative) { case 0: - return "s_load%o0\t%0, %A1 %G1\;s_waitcnt\tlgkmcnt(0)\;" + return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;" "s_dcache_wb_vol"; case 1: return (TARGET_RDNA2 @@ -2147,7 +2147,7 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 %G1\;" + return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;" "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; case 1: return (TARGET_RDNA2 @@ -2196,7 +2196,7 @@ switch (which_alternative) { case 0: - return "s_store%o1\t%1, %A0 %G1\;s_waitcnt\tlgkmcnt(0)"; + return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)"; case 1: return "flat_store%o1\t%A0, %1%O0 %G1\;s_waitcnt\t0"; case 2: @@ -2208,7 +2208,7 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 %G1"; + return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc"; case 1: return (TARGET_GLn_CACHE ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 %G1" @@ -2233,7 +2233,7 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 %G1\;" + return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;" "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; case 1: return (TARGET_GLn_CACHE @@ -2282,7 +2282,7 @@ switch (which_alternative) { case 0: - return "s_atomic_swap<X>\t%0, %1, %2 %G1\;s_waitcnt\tlgkmcnt(0)"; + return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)"; case 1: return "flat_atomic_swap<X>\t%0, %1, %2 %G1\;s_waitcnt\t0"; case 2: @@ -2296,7 +2296,7 @@ switch (which_alternative) { case 0: - return "s_atomic_swap<X>\t%0, %1, %2 %G1\;s_waitcnt\tlgkmcnt(0)\;" + return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;" "s_dcache_wb_vol\;s_dcache_inv_vol"; case 1: return (TARGET_GLn_CACHE @@ -2327,7 +2327,7 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 %G1\;" + return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;" 
"s_waitcnt\tlgkmcnt(0)"; case 1: return (TARGET_GLn_CACHE @@ -2362,7 +2362,7 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 %G1\;" + return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;" "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; case 1: return (TARGET_GLn_CACHE