On Fri, 26 Jan 2024, Richard Biener wrote: > On Wed, 24 Jan 2024, Andrew Stubbs wrote: > > > This is enough to get gfx1100 working for most purposes, on top of the > > patch that Tobias committed a week or so ago; there are still some test > > failures to investigate, and probably some tuning to do. > > > > It might also get gfx1030 working too. @Richi, could you test it, > > please? > > I can report partial success here. I do see quite some FAILs because of > > /space/rguenther/src/gcc-autopar_devel/libgomp/testsuite/libgomp.fortran/examples-4/declare_target-4.f90: > > In function 'accum_._omp_fn.1':^M > /space/rguenther/src/gcc-autopar_devel/libgomp/testsuite/libgomp.fortran/examples-4/declare_target-4.f90:20:38: > > error: unrecognizable insn:^M > (insn 108 107 109 6 (set (reg:V8SF 849)^M > (unspec:V8SF [^M > (reg:V8SF 844 [ vect__43.12_106 ]) repeated x2^M > (const_int 1 [0x1])^M > ] UNSPEC_PLUS_DPP_SHR)) > "/space/rguenther/src/gcc-autopar_devel/libgomp/testsuite/libgomp.fortran/examples-4/declare_target-4.f90":22:29 > > discrim 1 -1^M > (nil))^M > during RTL pass: vregs^M > /space/rguenther/src/gcc-autopar_devel/libgomp/testsuite/libgomp.fortran/examples-4/declare_target-4.f90:20:38: > > internal compiler error: in extract_insn, at recog.cc:2812^M > > there are also quite a number of execution FAILs like > > icv-5.exe: > /space/rguenther/src/gcc-autopar_devel/libgomp/plugin/plugin-gcn.c:2462: > isa_matches_agent: Assertion `agent_isa_s' failed. > FAIL: libgomp.c/../libgomp.c-c++-common/icv-5.c execution test > > (the assert in question looks bad - yeah, somehow we got past > device initialization - not sure how - but end up here) > > Maybe HSA behaves odd here - I didn't constrain the device it should > choose but it works (most of the time). GCN_DEBUG prints me all the > HSA agents available but I don't see any debug on which agent > is actually initialized during libgomp device init (at least nothing > I can easily match up). Maybe sth to improve. > > I'll followup with a test summary once the (serial) run of libgomp > testing finished. At least there are quite some number of > actual kernel executions and PASSing testcases.
=== libgomp Summary === # of expected passes 29126 # of unexpected failures 697 # of unexpected successes 1 # of expected failures 703 # of unresolved testcases 318 # of unsupported tests 766 full summary attached (compressed). Even compressed libgomp.log is too big to send. Richard. > > Richard. > > > I can't test the other multilibs right now. @PA, can you test it please? > > > > I can self-approve the patch, but I'll hold off the commit until the > > test results come back. > > > > Andrew > > > > gcc/ChangeLog: > > > > * config/gcn/gcn-opts.h (TARGET_PACKED_WORK_ITEMS): Add TARGET_RDNA3. > > * config/gcn/gcn-valu.md (all_convert): New iterator. > > (<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>): New > > define_expand, and rename the old one to ... > > (*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>): ... this. > > (extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>): Likewise, to ... > > (extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>): .. this. > > (*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_shift<exec>): New. > > * config/gcn/gcn.cc (gcn_global_address_p): Use "offsetbits" correctly. > > (gcn_hsa_declare_function_name): Update the vgpr counting for gfx1100. > > * config/gcn/gcn.md (<u>mulhisi3): Disable on RDNA3. > > (<u>mulqihi3_scalar): Likewise. > > > > libgcc/ChangeLog: > > > > * config/gcn/amdgcn_veclib.h (CDNA3_PLUS): Handle RDNA3. > > > > libgomp/ChangeLog: > > > > * config/gcn/time.c (RTC_TICKS): Configure RDNA3. > > (omp_get_wtime): Add RDNA3-compatible variant. > > * plugin/plugin-gcn.c (max_isa_vgprs): Tune for gfx1030 and gfx1100. > > > > Signed-off-by: Andrew Stubbs <a...@baylibre.com> > > --- > > gcc/config/gcn/gcn-opts.h | 2 +- > > gcc/config/gcn/gcn-valu.md | 41 ++++++++++++++++++++++++++++--- > > gcc/config/gcn/gcn.cc | 31 ++++++++++++++++------- > > gcc/config/gcn/gcn.md | 4 +-- > > libgcc/config/gcn/amdgcn_veclib.h | 2 +- > > libgomp/config/gcn/time.c | 10 ++++++++ > > libgomp/plugin/plugin-gcn.c | 6 +++-- > > 7 files changed, 77 insertions(+), 19 deletions(-) > > > > diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h > > index 79fbda3ab25..6be2c9204fa 100644 > > --- a/gcc/config/gcn/gcn-opts.h > > +++ b/gcc/config/gcn/gcn-opts.h > > @@ -62,7 +62,7 @@ extern enum gcn_isa { > > > > > > #define TARGET_M0_LDS_LIMIT (TARGET_GCN3) > > -#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS) > > +#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS || TARGET_RDNA3) > > > > #define TARGET_XNACK (flag_xnack != HSACO_ATTR_OFF) > > > > diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md > > index 3d5b6271ee6..cd027f8b369 100644 > > --- a/gcc/config/gcn/gcn-valu.md > > +++ b/gcc/config/gcn/gcn-valu.md > > @@ -3555,30 +3555,63 @@ > > ;; }}} > > ;; {{{ Int/int conversions > > > > +(define_code_iterator all_convert [truncate zero_extend sign_extend]) > > (define_code_iterator zero_convert [truncate zero_extend]) > > (define_code_attr convop [ > > (sign_extend "extend") > > (zero_extend "zero_extend") > > (truncate "trunc")]) > > > > -(define_insn "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>" > > +(define_expand "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>" > > + [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") > > + (all_convert:V_INT_1REG > > + (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] > > + "") > > + > > +(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>" > > [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") > > (zero_convert:V_INT_1REG > > (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] > > - "" > > + "!TARGET_RDNA3" > > "v_mov_b32_sdwa\t%0, %1 dst_sel:<V_INT_1REG:sdwa> dst_unused:UNUSED_PAD > > src0_sel:<V_INT_1REG_ALT:sdwa>" > > [(set_attr "type" "vop_sdwa") > > (set_attr "length" "8")]) > > > > -(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>" > > +(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>" > > [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") > > (sign_extend:V_INT_1REG > > (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] > > - "" > > + "!TARGET_RDNA3" > > "v_mov_b32_sdwa\t%0, sext(%1) src0_sel:<V_INT_1REG_ALT:sdwa>" > > [(set_attr "type" "vop_sdwa") > > (set_attr "length" "8")]) > > > > +(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_shift<exec>" > > + [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") > > + (all_convert:V_INT_1REG > > + (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] > > + "TARGET_RDNA3" > > + { > > + enum {extend, zero_extend, trunc}; > > + rtx shiftwidth = (<V_INT_1REG_ALT:SCALAR_MODE>mode == QImode > > + || <V_INT_1REG:SCALAR_MODE>mode == QImode > > + ? GEN_INT (24) > > + : <V_INT_1REG_ALT:SCALAR_MODE>mode == HImode > > + || <V_INT_1REG:SCALAR_MODE>mode == HImode > > + ? GEN_INT (16) > > + : NULL); > > + operands[2] = shiftwidth; > > + > > + if (!shiftwidth) > > + return "v_mov_b32 %0, %1"; > > + else if (<convop> == extend || <convop> == trunc) > > + return "v_lshlrev_b32\t%0, %2, %1\;v_ashrrev_i32\t%0, %2, %0"; > > + else > > + return "v_lshlrev_b32\t%0, %2, %1\;v_lshrrev_b32\t%0, %2, %0"; > > + } > > + [(set_attr "type" "mult") > > + (set_attr "length" "8")]) > > + > > ;; GCC can already do these for scalar types, but not for vector types. > > ;; Unfortunately you can't just do SUBREG on a vector to select the low > > part, > > ;; so there must be a few tricks here. > > diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc > > index e668ce7c69e..e80de2ce056 100644 > > --- a/gcc/config/gcn/gcn.cc > > +++ b/gcc/config/gcn/gcn.cc > > @@ -1597,8 +1597,8 @@ gcn_global_address_p (rtx addr) > > rtx offset = XEXP (addr, 1); > > int offsetbits = (TARGET_RDNA2_PLUS ? 11 : 12); > > bool immediate_p = (CONST_INT_P (offset) > > - && INTVAL (offset) >= -(1 << 12) > > - && INTVAL (offset) < (1 << 12)); > > + && INTVAL (offset) >= -(1 << offsetbits) > > + && INTVAL (offset) < (1 << offsetbits)); > > > > if ((gcn_address_register_p (base, DImode, false) > > || gcn_vec_address_register_p (base, DImode, false)) > > @@ -6597,8 +6597,10 @@ gcn_hsa_declare_function_name (FILE *file, const > > char *name, > > if (df_regs_ever_live_p (FIRST_AVGPR_REG + avgpr)) > > break; > > avgpr++; > > - vgpr = (vgpr + 3) & ~3; > > - avgpr = (avgpr + 3) & ~3; > > + > > + /* The main function epilogue uses v8, but df doesn't see that. */ > > + if (vgpr < 9) > > + vgpr = 9; > > > > if (!leaf_function_p ()) > > { > > @@ -6611,9 +6613,18 @@ gcn_hsa_declare_function_name (FILE *file, const > > char *name, > > avgpr = MAX_NORMAL_AVGPR_COUNT; > > } > > > > - /* The gfx90a accum_offset field can't represent 0 registers. */ > > - if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4) > > - vgpr = 4; > > + /* SIMD32 devices count double in wavefront64 mode. */ > > + if (TARGET_RDNA2_PLUS) > > + vgpr *= 2; > > + > > + /* Round up to the allocation block size. */ > > + int vgpr_block_size = (TARGET_RDNA3 ? 12 > > + : TARGET_RDNA2_PLUS || TARGET_CDNA2_PLUS ? 8 > > + : 4); > > + if (vgpr % vgpr_block_size) > > + vgpr += vgpr_block_size - (vgpr % vgpr_block_size); > > + if (avgpr % vgpr_block_size) > > + avgpr += vgpr_block_size - (avgpr % vgpr_block_size); > > > > fputs ("\t.rodata\n" > > "\t.p2align\t6\n" > > @@ -6714,12 +6725,14 @@ gcn_hsa_declare_function_name (FILE *file, const > > char *name, > > " .private_segment_fixed_size: 0\n" > > " .wavefront_size: 64\n" > > " .sgpr_count: %i\n" > > - " .vgpr_count: %i\n" > > + " .vgpr_count: %i%s\n" > > " .max_flat_workgroup_size: 1024\n", > > cfun->machine->kernarg_segment_byte_size, > > cfun->machine->kernarg_segment_alignment, > > LDS_SIZE, > > - sgpr, next_free_vgpr); > > + sgpr, next_free_vgpr, > > + (TARGET_RDNA2_PLUS ? " ; wavefrontsize64 counts double on SIMD32" > > + : "")); > > if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908) > > fprintf (file, " .agpr_count: %i\n", avgpr); > > fputs (" .end_amdgpu_metadata\n", file); > > diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md > > index 492b833e255..1f3c692b7a6 100644 > > --- a/gcc/config/gcn/gcn.md > > +++ b/gcc/config/gcn/gcn.md > > @@ -1618,7 +1618,7 @@ > > (mult:SI > > (any_extend:SI (match_operand:HI 1 "register_operand" "%v")) > > (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))] > > - "" > > + "!TARGET_RDNA3" > > "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 > > src1_sel:WORD_0" > > [(set_attr "type" "vop_sdwa") > > (set_attr "length" "8")]) > > @@ -1628,7 +1628,7 @@ > > (mult:HI > > (any_extend:HI (match_operand:QI 1 "register_operand" "%v")) > > (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))] > > - "" > > + "!TARGET_RDNA3" > > "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 > > src1_sel:BYTE_0" > > [(set_attr "type" "vop_sdwa") > > (set_attr "length" "8")]) > > diff --git a/libgcc/config/gcn/amdgcn_veclib.h > > b/libgcc/config/gcn/amdgcn_veclib.h > > index 821f6386dd6..d268c6cac16 100644 > > --- a/libgcc/config/gcn/amdgcn_veclib.h > > +++ b/libgcc/config/gcn/amdgcn_veclib.h > > @@ -230,7 +230,7 @@ do { \ > > > > #if defined (__GCN3__) || defined (__GCN5__) \ > > || defined (__CDNA1__) || defined (__CDNA2__) \ > > - || defined (__RDNA2__) > > + || defined (__RDNA2__) || defined (__RDNA3__) > > #define CDNA3_PLUS 0 > > #else > > #define CDNA3_PLUS 1 > > diff --git a/libgomp/config/gcn/time.c b/libgomp/config/gcn/time.c > > index 30a0d0188e4..efcd04f5f43 100644 > > --- a/libgomp/config/gcn/time.c > > +++ b/libgomp/config/gcn/time.c > > @@ -30,15 +30,25 @@ > > /* According to AMD: > > dGPU RTC is 27MHz > > AGPU RTC is 100MHz > > + RDNA3 ISA manual states "typically 100MHz" > > FIXME: DTRT on an APU. */ > > +#ifdef __RDNA3__ > > +#define RTC_TICKS (1.0 / 100000000.0) /* 100MHz */ > > +#else > > #define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */ > > +#endif > > > > double > > omp_get_wtime (void) > > { > > uint64_t clock; > > +#ifdef __RDNA3__ > > + asm ("s_sendmsg_rtn_b64 %0 0x83 ;Get REALTIME\n\t" > > + "s_waitcnt 0" : "=r" (clock)); > > +#else > > asm ("s_memrealtime %0\n\t" > > "s_waitcnt 0" : "=r" (clock)); > > +#endif > > return clock * RTC_TICKS; > > } > > > > diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c > > index 0339848451e..db28781dedb 100644 > > --- a/libgomp/plugin/plugin-gcn.c > > +++ b/libgomp/plugin/plugin-gcn.c > > @@ -1741,11 +1741,13 @@ max_isa_vgprs (int isa) > > case EF_AMDGPU_MACH_AMDGCN_GFX900: > > case EF_AMDGPU_MACH_AMDGCN_GFX906: > > case EF_AMDGPU_MACH_AMDGCN_GFX908: > > - case EF_AMDGPU_MACH_AMDGCN_GFX1030: > > - case EF_AMDGPU_MACH_AMDGCN_GFX1100: > > return 256; > > case EF_AMDGPU_MACH_AMDGCN_GFX90a: > > return 512; > > + case EF_AMDGPU_MACH_AMDGCN_GFX1030: > > + return 512; /* 512 SIMD32 = 256 wavefrontsize64. */ > > + case EF_AMDGPU_MACH_AMDGCN_GFX1100: > > + return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */ > > } > > GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs"); > > } > > > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
r.gz
Description: test summary