Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers
On 21/03/2023 13:35, Andrew Jenner wrote: I have updated this patch to incorporate the feedback from Andrew Stubbs. Tested on CDNA2 GFX90a. gcc/ChangeLog: * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): Declare. * config/gcn/gcn-valu.md (@dpp_swap_pairs) (@dpp_distribute_even, @dpp_distribute_odd) (cmul3, cml4, vec_addsub3) (cadd3, vec_fmaddsub4, vec_fmsubadd4) (fms4, fms4_negop2, fms4) (fms4_negop2): New patterns. * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): New functions. * config/gcn/gcn.md: Add entries to unspec enum. gcc/testsuite/ChangeLog: * gcc.target/gcn/complex.c: New test. OK. Andrew
Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers
I have updated this patch to incorporate the feedback from Andrew Stubbs. Tested on CDNA2 GFX90a. gcc/ChangeLog: * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): Declare. * config/gcn/gcn-valu.md (@dpp_swap_pairs) (@dpp_distribute_even, @dpp_distribute_odd) (cmul3, cml4, vec_addsub3) (cadd3, vec_fmaddsub4, vec_fmsubadd4) (fms4, fms4_negop2, fms4) (fms4_negop2): New patterns. * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): New functions. * config/gcn/gcn.md: Add entries to unspec enum. gcc/testsuite/ChangeLog: * gcc.target/gcn/complex.c: New test.diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 861044e77f0..d7862b21a2a 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno); extern rtx get_exec (int64_t); extern rtx get_exec (machine_mode mode); extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int); +extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int); +extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *, + int unspec); +extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *, + int unspec); extern void gcn_expand_epilogue (); extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale, bool unsigned_p, rtx exec); diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 75e9a59600b..787d7709d0d 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -1224,6 +1224,45 @@ [(set_attr "type" "vop_dpp") (set_attr "length" "16")]) +(define_insn "@dpp_swap_pairs" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_SWAP_PAIRS))] + "" + { +return gcn_expand_dpp_swap_pairs_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_SWAP_PAIRS); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_even" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))] + "" + { +return gcn_expand_dpp_distribute_even_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_odd" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))] + "" + { +return gcn_expand_dpp_distribute_odd_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_ODD); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + ;; }}} ;; {{{ ALU special case: add/sub @@ -2185,6 +2224,180 @@ DONE; }) +(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ]) +(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")]) +(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")]) +(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")]) + +(define_expand "cmul3" + [(set (match_operand:V_noHI 0 "register_operand""=") +(unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v")] + UNSPEC_CMUL_OP))] + "" + { +// operands[1] a b +// operands[2] c d +rtx t1 = gen_reg_rtx (mode); +emit_insn (gen_mul3 (t1, operands[1], operands[2])); // a*c b*d + +rtx s2_perm = gen_reg_rtx (mode); +emit_insn (gen_dpp_swap_pairs (s2_perm, operands[2])); // d c + +rtx t2 = gen_reg_rtx (mode); +emit_insn (gen_mul3 (t2, operands[1], s2_perm)); // a*d b*c + +rtx t1_perm = gen_reg_rtx (mode); +emit_insn (gen_dpp_swap_pairs (t1_perm, t1)); // b*d a*c + +rtx even = gen_rtx_REG (DImode, EXEC_REG); +emit_move_insn (even, get_exec (0xUL)); +rtx dest = operands[0]; +emit_insn (gen_3_exec (dest, t1, t1_perm, dest, even)); + // a*c-b*d 0 + +rtx t2_perm = gen_reg_rtx (mode); +emit_insn (gen_dpp_swap_pairs (t2_perm, t2)); // b*c a*d + +rtx odd
Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers
On 09/02/2023 20:13, Andrew Jenner wrote: This patch introduces instruction patterns for complex number operations in the GCN machine description. These patterns are cmul, cmul_conj, vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls (cmla_conj and cmls_conj were not found to be favorable to implement). As a side effect of adding cmls, I also added fms patterns corresponding to the existing fma patterns. Tested on CDNA2 GFX90a. OK to commit? gcc/ChangeLog: * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): Declare. * config/gcn/gcn-valu.md (@dpp_swap_pairs) (@dpp_distribute_even, @dpp_distribute_odd) (cmul3, cml4, vec_addsub3) (cadd3, vec_fmaddsub4, vec_fmsubadd4) (fms4, fms4_negop2, fms4) (fms4_negop2): New patterns. * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): New functions. * config/gcn/gcn.md: Add entries to unspec enum. gcc/testsuite/ChangeLog: * gcc.target/gcn/complex.c: New test. +;; It would be possible to represent these without the UNSPEC as +;; +;; (vec_merge +;; (fma op1 op2 op3) +;; (fma op1 op2 (neg op3)) +;; (merge-const)) +;; +;; But this doesn't seem useful in practice. + +(define_expand "vec_fmaddsub4" + [(set (match_operand:V_noHI 0 "register_operand" "=") +(unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v") + (match_operand:V_noHI 3 "register_operand" "v")] + UNSPEC_FMADDSUB))] This is a define_expand pattern that has a custom-code expansion with an unconditional "DONE", so the actual RTL representation is irrelevant here: it only needs to have the match_operand entries. The UNSPEC_FMADDSUB is therefore dead (as in, it will never appear in the IR). We can safely remove those, although I don't hate them for readability purposes. The UNSPEC_CMUL and UNSPEC_CMUL_CONJ are similarly "dead", but since you use them for an iterator they're still useful in the machine description. +(define_insn "fms4" + [(set (match_operand:V_FP 0 "register_operand" "= v, v") + (fma:V_FP + (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA") + (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA") + (neg:V_FP + (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"] + "" + "v_fma%i0\t%0, %1, %2, -%3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) Please ensure that the alternatives are vertically aligned in the same style as the rest of the file. +/* Generate DPP pairwise swap instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn, + int ARG_UNUSED (unspec)) +/* Generate DPP distribute even instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn, +int ARG_UNUSED (unspec)) +/* Generate DPP distribute odd instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn, + int ARG_UNUSED (unspec)) Please add a comment that isn't just the function name in words. Explain what operation happens here and maybe show an example of what it produces. +++ b/gcc/testsuite/gcc.target/gcn/complex.c @@ -0,0 +1,640 @@ +// { dg-do run } +// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" } Does the -fopenmp-simd option do anything here? There are no "omp declare simd" directives. +void cmulF(float *td, float *te, float *tf, float *tg, int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) +{ + d[i] = e[i] * f[i]; +} +} Tests in gcc.target/gcn won't do anything with "omp target" directives. I would expect the loop to vectorize without, at -O2 or above (or "-O1 -ftree-vectorize"), but you might find the output easier to read with "__restrict" on the parameters as that will avoid emitting the runtime alias check and scalar code implementation. I'd also expect you to have to do something to avoid inlining. + td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + tg = (float*)omp_aligned_alloc(ALIGNMENT,
[PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers
This patch introduces instruction patterns for complex number operations in the GCN machine description. These patterns are cmul, cmul_conj, vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls (cmla_conj and cmls_conj were not found to be favorable to implement). As a side effect of adding cmls, I also added fms patterns corresponding to the existing fma patterns. Tested on CDNA2 GFX90a. OK to commit? gcc/ChangeLog: * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): Declare. * config/gcn/gcn-valu.md (@dpp_swap_pairs) (@dpp_distribute_even, @dpp_distribute_odd) (cmul3, cml4, vec_addsub3) (cadd3, vec_fmaddsub4, vec_fmsubadd4) (fms4, fms4_negop2, fms4) (fms4_negop2): New patterns. * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): New functions. * config/gcn/gcn.md: Add entries to unspec enum. gcc/testsuite/ChangeLog: * gcc.target/gcn/complex.c: New test.diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 861044e77f0..d7862b21a2a 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno); extern rtx get_exec (int64_t); extern rtx get_exec (machine_mode mode); extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int); +extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int); +extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *, + int unspec); +extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *, + int unspec); extern void gcn_expand_epilogue (); extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale, bool unsigned_p, rtx exec); diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 47d9d87d58a..cb650bca3ff 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -1224,6 +1224,45 @@ [(set_attr "type" "vop_dpp") (set_attr "length" "16")]) +(define_insn "@dpp_swap_pairs" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_SWAP_PAIRS))] + "" + { +return gcn_expand_dpp_swap_pairs_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_SWAP_PAIRS); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_even" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))] + "" + { +return gcn_expand_dpp_distribute_even_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_odd" + [(set (match_operand:V_noHI 0 "register_operand""=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))] + "" + { +return gcn_expand_dpp_distribute_odd_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_ODD); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + ;; }}} ;; {{{ ALU special case: add/sub @@ -2185,6 +2224,194 @@ DONE; }) +(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ]) +(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")]) +(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")]) +(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")]) + +(define_expand "cmul3" + [(set (match_operand:V_noHI 0 "register_operand" "= ") +(unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v")] + UNSPEC_CMUL_OP))] + "" + { +// operands[1] a b +// operands[2] c d +rtx t1 = gen_reg_rtx (mode); +emit_insn (gen_mul3 (t1, operands[1], operands[2])); // a*c b*d + +rtx s2_perm = gen_reg_rtx (mode); +emit_insn (gen_dpp_swap_pairs (s2_perm, operands[2])); // d c + +rtx t2 = gen_reg_rtx (mode); +emit_insn (gen_mul3 (t2, operands[1], s2_perm)); // a*d b*c + +rtx t1_perm = gen_reg_rtx (mode); +emit_insn (gen_dpp_swap_pairs (t1_perm, t1)); // b*d a*c + +rtx even