Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers

2023-03-21 Thread Andrew Stubbs

On 21/03/2023 13:35, Andrew Jenner wrote:
I have updated this patch to incorporate the feedback from Andrew 
Stubbs. Tested on CDNA2 GFX90a.


gcc/ChangeLog:

     * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
     (gcn_expand_dpp_distribute_even_insn)
     (gcn_expand_dpp_distribute_odd_insn): Declare.
     * config/gcn/gcn-valu.md (@dpp_swap_pairs)
     (@dpp_distribute_even, @dpp_distribute_odd)
     (cmul3, cml4, vec_addsub3)
     (cadd3, vec_fmaddsub4, vec_fmsubadd4)
     (fms4, fms4_negop2, fms4)
     (fms4_negop2): New patterns.
     * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
     (gcn_expand_dpp_distribute_even_insn)
     (gcn_expand_dpp_distribute_odd_insn): New functions.
     * config/gcn/gcn.md: Add entries to unspec enum.

gcc/testsuite/ChangeLog:

     * gcc.target/gcn/complex.c: New test.


OK.

Andrew


Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers

2023-03-21 Thread Andrew Jenner
I have updated this patch to incorporate the feedback from Andrew 
Stubbs. Tested on CDNA2 GFX90a.


gcc/ChangeLog:

* config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): Declare.
* config/gcn/gcn-valu.md (@dpp_swap_pairs)
(@dpp_distribute_even, @dpp_distribute_odd)
(cmul3, cml4, vec_addsub3)
(cadd3, vec_fmaddsub4, vec_fmsubadd4)
(fms4, fms4_negop2, fms4)
(fms4_negop2): New patterns.
* config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): New functions.
* config/gcn/gcn.md: Add entries to unspec enum.

gcc/testsuite/ChangeLog:

* gcc.target/gcn/complex.c: New test.diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 861044e77f0..d7862b21a2a 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int 
regno);
 extern rtx get_exec (int64_t);
 extern rtx get_exec (machine_mode mode);
 extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+  int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+ int unspec);
 extern void gcn_expand_epilogue ();
 extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
  rtx scale, bool unsigned_p, rtx exec);
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 75e9a59600b..787d7709d0d 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1224,6 +1224,45 @@
   [(set_attr "type" "vop_dpp")
(set_attr "length" "16")])
 
+(define_insn "@dpp_swap_pairs"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_SWAP_PAIRS))]
+  ""
+  {
+return gcn_expand_dpp_swap_pairs_insn (mode, "v_mov_b32",
+  UNSPEC_MOV_DPP_SWAP_PAIRS);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+  ""
+  {
+return gcn_expand_dpp_distribute_even_insn (mode, "v_mov_b32",
+   UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+  ""
+  {
+return gcn_expand_dpp_distribute_odd_insn (mode, "v_mov_b32",
+  UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
 ;; }}}
 ;; {{{ ALU special case: add/sub
 
@@ -2185,6 +2224,180 @@
 DONE;
   })
 
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul3"
+  [(set (match_operand:V_noHI 0 "register_operand""=")
+(unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+  (match_operand:V_noHI 2 "register_operand" "v")]
+ UNSPEC_CMUL_OP))]
+  ""
+  {
+// operands[1]  a   b
+// operands[2]  c   d
+rtx t1 = gen_reg_rtx (mode);
+emit_insn (gen_mul3 (t1, operands[1], operands[2]));   // a*c b*d
+
+rtx s2_perm = gen_reg_rtx (mode);
+emit_insn (gen_dpp_swap_pairs (s2_perm, operands[2])); // d   c
+
+rtx t2 = gen_reg_rtx (mode);
+emit_insn (gen_mul3 (t2, operands[1], s2_perm));   // a*d b*c
+
+rtx t1_perm = gen_reg_rtx (mode);
+emit_insn (gen_dpp_swap_pairs (t1_perm, t1));  // b*d a*c
+
+rtx even = gen_rtx_REG (DImode, EXEC_REG);
+emit_move_insn (even, get_exec (0xUL));
+rtx dest = operands[0];
+emit_insn (gen_3_exec (dest, t1, t1_perm, dest, even));
+ // a*c-b*d 0
+
+rtx t2_perm = gen_reg_rtx (mode);
+emit_insn (gen_dpp_swap_pairs (t2_perm, t2));  // b*c a*d
+
+rtx odd 

Re: [PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers

2023-02-14 Thread Andrew Stubbs

On 09/02/2023 20:13, Andrew Jenner wrote:
This patch introduces instruction patterns for complex number operations 
in the GCN machine description. These patterns are cmul, cmul_conj, 
vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls 
(cmla_conj and cmls_conj were not found to be favorable to implement). 
As a side effect of adding cmls, I also added fms patterns corresponding 
to the existing fma patterns. Tested on CDNA2 GFX90a.


OK to commit?


gcc/ChangeLog:

 * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
     (gcn_expand_dpp_distribute_even_insn)
     (gcn_expand_dpp_distribute_odd_insn): Declare.
     * config/gcn/gcn-valu.md (@dpp_swap_pairs)
     (@dpp_distribute_even, @dpp_distribute_odd)
     (cmul3, cml4, vec_addsub3)
     (cadd3, vec_fmaddsub4, vec_fmsubadd4)
     (fms4, fms4_negop2, fms4)
     (fms4_negop2): New patterns.
     * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
     (gcn_expand_dpp_distribute_even_insn)
     (gcn_expand_dpp_distribute_odd_insn): New functions.
     * config/gcn/gcn.md: Add entries to unspec enum.

gcc/testsuite/ChangeLog:

 * gcc.target/gcn/complex.c: New test.


+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;;   (fma op1 op2 op3)
+;;   (fma op1 op2 (neg op3))
+;;   (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "vec_fmaddsub4"
+  [(set (match_operand:V_noHI 0 "register_operand" "=")
+(unspec:V_noHI
+  [(match_operand:V_noHI 1 "register_operand" "v")
+   (match_operand:V_noHI 2 "register_operand" "v")
+   (match_operand:V_noHI 3 "register_operand" "v")]
+  UNSPEC_FMADDSUB))]

This is a define_expand pattern that has a custom-code expansion with an 
unconditional "DONE", so the actual RTL representation is irrelevant 
here: it only needs to have the match_operand entries. The 
UNSPEC_FMADDSUB is therefore dead (as in, it will never appear in the 
IR). We can safely remove those, although I don't hate them for 
readability purposes.


The UNSPEC_CMUL and UNSPEC_CMUL_CONJ are similarly "dead", but since you 
use them for an iterator they're still useful in the machine description.


+(define_insn "fms4"
+  [(set (match_operand:V_FP 0 "register_operand"  "=  v,   v")
+   (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" "% vA,  vA")
+   (match_operand:V_FP 2 "gcn_alu_operand" "  vA,vSvA")
+   (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA,  vA"]
+  ""
+  "v_fma%i0\t%0, %1, %2, -%3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])

Please ensure that the alternatives are vertically aligned in the same 
style as the rest of the file.


+/* Generate DPP pairwise swap instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+   int ARG_UNUSED (unspec))

+/* Generate DPP distribute even instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+int ARG_UNUSED (unspec))

+/* Generate DPP distribute odd instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+   int ARG_UNUSED (unspec))

Please add a comment that isn't just the function name in words. Explain 
what operation happens here and maybe show an example of what it produces.


+++ b/gcc/testsuite/gcc.target/gcn/complex.c
@@ -0,0 +1,640 @@
+// { dg-do run }
+// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" }

Does the -fopenmp-simd option do anything here? There are no "omp 
declare simd" directives.


+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+{
+  d[i] = e[i] * f[i];
+}
+}

Tests in gcc.target/gcn won't do anything with "omp target" directives. 
I would expect the loop to vectorize without, at -O2 or above (or "-O1 
-ftree-vectorize"), but you might find the output easier to read with 
"__restrict" on the parameters as that will avoid emitting the runtime 
alias check and scalar code implementation.


I'd also expect you to have to do something to avoid inlining.

+  td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  tg = (float*)omp_aligned_alloc(ALIGNMENT, 

[PATCH] amdgcn: Add instruction patterns for vector operations on complex numbers

2023-02-09 Thread Andrew Jenner
This patch introduces instruction patterns for complex number operations 
in the GCN machine description. These patterns are cmul, cmul_conj, 
vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls 
(cmla_conj and cmls_conj were not found to be favorable to implement). 
As a side effect of adding cmls, I also added fms patterns corresponding 
to the existing fma patterns. Tested on CDNA2 GFX90a.


OK to commit?


gcc/ChangeLog:

* config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): Declare.
* config/gcn/gcn-valu.md (@dpp_swap_pairs)
(@dpp_distribute_even, @dpp_distribute_odd)
(cmul3, cml4, vec_addsub3)
(cadd3, vec_fmaddsub4, vec_fmsubadd4)
(fms4, fms4_negop2, fms4)
(fms4_negop2): New patterns.
* config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): New functions.
* config/gcn/gcn.md: Add entries to unspec enum.

gcc/testsuite/ChangeLog:

* gcc.target/gcn/complex.c: New test.diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 861044e77f0..d7862b21a2a 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int 
regno);
 extern rtx get_exec (int64_t);
 extern rtx get_exec (machine_mode mode);
 extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+  int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+ int unspec);
 extern void gcn_expand_epilogue ();
 extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
  rtx scale, bool unsigned_p, rtx exec);
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 47d9d87d58a..cb650bca3ff 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1224,6 +1224,45 @@
   [(set_attr "type" "vop_dpp")
(set_attr "length" "16")])
 
+(define_insn "@dpp_swap_pairs"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_SWAP_PAIRS))]
+  ""
+  {
+return gcn_expand_dpp_swap_pairs_insn (mode, "v_mov_b32",
+  UNSPEC_MOV_DPP_SWAP_PAIRS);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+  ""
+  {
+return gcn_expand_dpp_distribute_even_insn (mode, "v_mov_b32",
+   UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd"
+  [(set (match_operand:V_noHI 0 "register_operand""=v")
+   (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+  ""
+  {
+return gcn_expand_dpp_distribute_odd_insn (mode, "v_mov_b32",
+  UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
 ;; }}}
 ;; {{{ ALU special case: add/sub
 
@@ -2185,6 +2224,194 @@
 DONE;
   })
 
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul3"
+  [(set (match_operand:V_noHI 0 "register_operand" "= ")
+(unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+  (match_operand:V_noHI 2 "register_operand" "v")]
+ UNSPEC_CMUL_OP))]
+  ""
+  {
+// operands[1]  a   b
+// operands[2]  c   d
+rtx t1 = gen_reg_rtx (mode);
+emit_insn (gen_mul3 (t1, operands[1], operands[2]));   // a*c b*d
+
+rtx s2_perm = gen_reg_rtx (mode);
+emit_insn (gen_dpp_swap_pairs (s2_perm, operands[2])); // d   c
+
+rtx t2 = gen_reg_rtx (mode);
+emit_insn (gen_mul3 (t2, operands[1], s2_perm));   // a*d b*c
+
+rtx t1_perm = gen_reg_rtx (mode);
+emit_insn (gen_dpp_swap_pairs (t1_perm, t1));  // b*d a*c
+
+rtx even