This patch introduces instruction patterns for complex number operations in the GCN machine description. These patterns are cmul, cmul_conj, vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls (cmla_conj and cmls_conj were not found to be favorable to implement). As a side effect of adding cmls, I also added fms patterns corresponding to the existing fma patterns. Tested on CDNA2 GFX90a.

OK to commit?


gcc/ChangeLog:

        * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
        (gcn_expand_dpp_distribute_even_insn)
        (gcn_expand_dpp_distribute_odd_insn): Declare.
        * config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
        (@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
        (cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
        (cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
        (fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
        (fms<mode>4_negop2): New patterns.
        * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
        (gcn_expand_dpp_distribute_even_insn)
        (gcn_expand_dpp_distribute_odd_insn): New functions.
        * config/gcn/gcn.md: Add entries to unspec enum.

gcc/testsuite/ChangeLog:

        * gcc.target/gcn/complex.c: New test.
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 861044e77f0..d7862b21a2a 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int 
regno);
 extern rtx get_exec (int64_t);
 extern rtx get_exec (machine_mode mode);
 extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+                                                  int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+                                                 int unspec);
 extern void gcn_expand_epilogue ();
 extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
                                      rtx scale, bool unsigned_p, rtx exec);
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 47d9d87d58a..cb650bca3ff 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1224,6 +1224,45 @@
   [(set_attr "type" "vop_dpp")
    (set_attr "length" "16")])
 
+(define_insn "@dpp_swap_pairs<mode>"
+  [(set (match_operand:V_noHI 0 "register_operand"    "=v")
+       (unspec:V_noHI
+         [(match_operand:V_noHI 1 "register_operand" " v")]
+         UNSPEC_MOV_DPP_SWAP_PAIRS))]
+  ""
+  {
+    return gcn_expand_dpp_swap_pairs_insn (<MODE>mode, "v_mov_b32",
+                                          UNSPEC_MOV_DPP_SWAP_PAIRS);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even<mode>"
+  [(set (match_operand:V_noHI 0 "register_operand"    "=v")
+       (unspec:V_noHI
+         [(match_operand:V_noHI 1 "register_operand" " v")]
+         UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+  ""
+  {
+    return gcn_expand_dpp_distribute_even_insn (<MODE>mode, "v_mov_b32",
+                                               UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd<mode>"
+  [(set (match_operand:V_noHI 0 "register_operand"    "=v")
+	(unspec:V_noHI
+	  [(match_operand:V_noHI 1 "register_operand" " v")]
+	  UNSPEC_MOV_DPP_DISTRIBUTE_ODD))]
+  ""
+  {
+    return gcn_expand_dpp_distribute_odd_insn (<MODE>mode, "v_mov_b32",
+					       UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
 ;; }}}
 ;; {{{ ALU special case: add/sub
 
@@ -2185,6 +2224,194 @@
     DONE;
   })
 
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:V_noHI 0 "register_operand" "= &v")
+        (unspec:V_noHI
+         [(match_operand:V_noHI 1 "register_operand" "v")
+          (match_operand:V_noHI 2 "register_operand" "v")]
+         UNSPEC_CMUL_OP))]
+  ""
+  {
+    // operands[1]                                                  a   b
+    // operands[2]                                                  c   d
+    rtx t1 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));   // a*c b*d
+
+    rtx s2_perm = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_swap_pairs<mode> (s2_perm, operands[2])); // d   c
+
+    rtx t2 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (t2, operands[1], s2_perm));       // a*d b*c
+
+    rtx t1_perm = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_swap_pairs<mode> (t1_perm, t1));          // b*d a*c
+
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    rtx dest = operands[0];
+    emit_insn (gen_<cmul_subadd><mode>3_exec (dest, t1, t1_perm, dest, even));
+                                                             // a*c-b*d 0
+
+    rtx t2_perm = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2));          // b*c a*d
+
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_<cmul_addsub><mode>3_exec (dest, t2, t2_perm, dest, odd));
+                                                                   // 0 a*d+b*c
+    DONE;
+  })
+
+(define_code_iterator addsub [plus minus])
+(define_code_attr addsub_as [(plus "a") (minus "s")])
+
+(define_expand "cml<addsub_as><mode>4"
+  [(set (match_operand:V_FP 0 "register_operand" "= &v")
+       (addsub:V_FP
+         (unspec:V_FP
+           [(match_operand:V_FP 1 "register_operand" "v")
+            (match_operand:V_FP 2 "register_operand" "v")]
+           UNSPEC_CMUL)
+         (match_operand:V_FP 3 "register_operand" "v")))]
+  ""
+  {
+    rtx a = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_distribute_even<mode> (a, operands[1]));    // a   a
+
+    rtx t1 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_fm<addsub_as><mode>4 (t1, a, operands[2], operands[3]));
+                                                                   // a*c a*d
+
+    rtx b = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_distribute_odd<mode> (b, operands[1]));     // b   b
+
+    rtx t2 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (t2, b, operands[2]));               // b*c b*d
+
+    rtx t2_perm = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2));            // b*d b*c
+
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    rtx dest = operands[0];
+    emit_insn (gen_sub<mode>3_exec (dest, t1, t2_perm, dest, even));
+
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_add<mode>3_exec (dest, t1, t2_perm, dest, odd));
+
+    DONE;
+  })
+
+(define_expand "vec_addsub<mode>3"
+  [(set (match_operand:V_noHI 0 "register_operand" "= &v")
+        (vec_merge:V_noHI
+          (minus:V_noHI
+            (match_operand:V_noHI 1 "register_operand" "v")
+            (match_operand:V_noHI 2 "register_operand" "v"))
+          (plus:V_noHI (match_dup 1) (match_dup 2))
+          (const_int 6148914691236517205)))]
+  ""
+  {
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    rtx dest = operands[0];
+    rtx x = operands[1];
+    rtx y = operands[2];
+    emit_insn (gen_sub<mode>3_exec (dest, x, y, dest, even));
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_add<mode>3_exec (dest, x, y, dest, odd));
+
+    DONE;
+  })
+
+(define_int_iterator CADD [UNSPEC_CADD90 UNSPEC_CADD270])
+(define_int_attr rot [(UNSPEC_CADD90 "90") (UNSPEC_CADD270 "270")])
+(define_int_attr cadd_subadd [(UNSPEC_CADD90 "sub") (UNSPEC_CADD270 "add")])
+(define_int_attr cadd_addsub [(UNSPEC_CADD90 "add") (UNSPEC_CADD270 "sub")])
+
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+        (unspec:V_noHI [(match_operand:V_noHI 1 "register_operand" "v")
+                        (match_operand:V_noHI 2 "register_operand" "v")]
+                        CADD))]
+  ""
+  {
+    rtx dest = operands[0];
+    rtx x = operands[1];
+    rtx y = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_dpp_swap_pairs<mode> (y, operands[2]));
+
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    emit_insn (gen_<cadd_subadd><mode>3_exec (dest, x, y, dest, even));
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_<cadd_addsub><mode>3_exec (dest, x, y, dest, odd));
+
+    DONE;
+  })
+
+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;;   (fma op1 op2 op3)
+;;   (fma op1 op2 (neg op3))
+;;   (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "vec_fmaddsub<mode>4"
+  [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+        (unspec:V_noHI
+          [(match_operand:V_noHI 1 "register_operand" "v")
+           (match_operand:V_noHI 2 "register_operand" "v")
+           (match_operand:V_noHI 3 "register_operand" "v")]
+          UNSPEC_FMADDSUB))]
+  ""
+  {
+    rtx t1 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    rtx dest = operands[0];
+    emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, even));
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+    DONE;
+  })
+
+(define_expand "vec_fmsubadd<mode>4"
+  [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+        (unspec:V_noHI
+          [(match_operand:V_noHI 1 "register_operand" "v")
+           (match_operand:V_noHI 2 "register_operand" "v")
+           (neg:V_noHI
+             (match_operand:V_noHI 3 "register_operand" "v"))]
+          UNSPEC_FMADDSUB))]
+  ""
+  {
+    rtx t1 = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+    rtx even = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (even, get_exec (0x5555555555555555UL));
+    rtx dest = operands[0];
+    emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, even));
+    rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+    emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+    DONE;
+  })
+
 ;; }}}
 ;; {{{ ALU generic case
 
@@ -2768,6 +2995,56 @@
   [(set_attr "type" "vop3a")
    (set_attr "length" "8")])
 
+(define_insn "fms<mode>4<exec>"
+  [(set (match_operand:V_FP 0 "register_operand"  "=  v,   v")
+       (fma:V_FP
+         (match_operand:V_FP 1 "gcn_alu_operand" "% vA,  vA")
+       (match_operand:V_FP 2 "gcn_alu_operand" "  vA,vSvA")
+       (neg:V_FP
+         (match_operand:V_FP 3 "gcn_alu_operand" "vSvA,  vA"))))]
+  ""
+  "v_fma%i0\t%0, %1, %2, -%3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2<exec>"
+  [(set (match_operand:V_FP 0 "register_operand"    "=  v,   v,   v")
+       (fma:V_FP
+         (match_operand:V_FP 1 "gcn_alu_operand"   "  vA,  vA,vSvA")
+         (neg:V_FP
+           (match_operand:V_FP 2 "gcn_alu_operand" "  vA,vSvA,  vA"))
+         (neg:V_FP
+           (match_operand:V_FP 3 "gcn_alu_operand"   "vSvA,  vA,  vA"))))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, -%3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fms<mode>4"
+  [(set (match_operand:FP 0 "register_operand"  "=  v,   v")
+       (fma:FP
+         (match_operand:FP 1 "gcn_alu_operand" "% vA,  vA")
+         (match_operand:FP 2 "gcn_alu_operand" "  vA,vSvA")
+         (neg:FP
+           (match_operand:FP 3 "gcn_alu_operand" "vSvA,  vA"))))]
+  ""
+  "v_fma%i0\t%0, %1, %2, -%3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2"
+  [(set (match_operand:FP 0 "register_operand"    "=  v,   v,   v")
+       (fma:FP
+         (match_operand:FP 1 "gcn_alu_operand"   "  vA,  vA,vSvA")
+         (neg:FP
+           (match_operand:FP 2 "gcn_alu_operand" "  vA,vSvA,  vA"))
+         (neg:FP
+           (match_operand:FP 3 "gcn_alu_operand"   "vSvA,  vA,  vA"))))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, -%3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
 ;; }}}
 ;; {{{ FP division
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 23ab01e75d8..c04fae2650f 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5012,6 +5012,72 @@ gcn_vector_alignment_reachable (const_tree ARG_UNUSED 
(type), bool is_packed)
   return !is_packed;
 }
 
+/* Generate DPP pairwise swap instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+                               int ARG_UNUSED (unspec))
+{
+  static char buf[128];
+  const char *dpp;
+
+  /* Add the DPP modifiers.  */
+  dpp = "quad_perm:[1,0,3,2]";
+
+  if (vgpr_2reg_mode_p (mode))
+    sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+            insn, dpp, insn, dpp);
+  else
+    sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+  return buf;
+}
+
+/* Generate DPP distribute even instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+                                    int ARG_UNUSED (unspec))
+{
+  static char buf[128];
+  const char *dpp;
+
+  /* Add the DPP modifiers.  */
+  dpp = "quad_perm:[0,0,2,2]";
+
+  if (vgpr_2reg_mode_p (mode))
+    sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+            insn, dpp, insn, dpp);
+  else
+    sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+  return buf;
+}
+
+/* Generate DPP distribute odd instruction.
+   The opcode is given by INSN.  */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+                                   int ARG_UNUSED (unspec))
+{
+  static char buf[128];
+  const char *dpp;
+
+  /* Add the DPP modifiers.  */
+  dpp = "quad_perm:[1,1,3,3]";
+
+  if (vgpr_2reg_mode_p (mode))
+    sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+            insn, dpp, insn, dpp);
+  else
+    sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+  return buf;
+}
+
 /* Generate DPP instructions used for vector reductions.
 
    The opcode is given by INSN.
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 10d2b874cce..dc14da6a058 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -78,6 +78,14 @@
   UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
   UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
   UNSPEC_MOV_DPP_SHR
+  UNSPEC_MOV_DPP_SWAP_PAIRS
+  UNSPEC_MOV_DPP_DISTRIBUTE_EVEN
+  UNSPEC_MOV_DPP_DISTRIBUTE_ODD
+  UNSPEC_CMUL UNSPEC_CMUL_CONJ
+  UNSPEC_CMUL_ADD UNSPEC_CMUL_SUB
+  UNSPEC_FMADDSUB
+  UNSPEC_CADD90
+  UNSPEC_CADD270
   UNSPEC_GATHER
   UNSPEC_SCATTER
   UNSPEC_RCP
diff --git a/gcc/testsuite/gcc.target/gcn/complex.c 
b/gcc/testsuite/gcc.target/gcn/complex.c
new file mode 100644
index 00000000000..3b8a6cc854c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/complex.c
@@ -0,0 +1,640 @@
+// { dg-do run }
+// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" }
+
+#include <stdlib.h>
+#include <omp.h>
+#include <stdbool.h>
+
+#define COUNT 1000
+#define MAX 1000
+#define ALIGNMENT (2*1024*1024) // 2MB
+
+_Complex double conj(_Complex double);
+_Complex float conjf(_Complex float);
+
+unsigned int device = 0;
+
+// cmul
+
+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i];
+    }
+}
+
+bool cmulFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+        return false;
+    }
+  return true;
+}
+
+void cmulD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  typedef _Complex double complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i];
+    }
+}
+
+bool cmulDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+        return false;
+    }
+  return true;
+}
+
+
+// cmul_conj
+
+void cmul_conjF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * conj(f[i]);
+    }
+}
+
+bool cmul_conjFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+        return false;
+    }
+  return true;
+}
+
+void cmul_conjD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  typedef _Complex double complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * conj(f[i]);
+    }
+}
+
+bool cmul_conjDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+        return false;
+    }
+  return true;
+}
+
+
+// addsub
+
+void addsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] - conjf(f[i]);
+    }
+}
+
+bool addsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      if (td[i*2] != a-c || td[i*2+1] != b+d)
+        return false;
+    }
+  return true;
+}
+
+void addsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  typedef _Complex double complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] - conj(f[i]);
+    }
+}
+
+bool addsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      if (td[i*2] != a-c || td[i*2+1] != b+d)
+        return false;
+    }
+  return true;
+}
+
+
+// fmaddsub
+
+void fmaddsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+      td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+    }
+}
+
+bool fmaddsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      float e = tg[i*2];
+      float f = tg[i*2+1];
+      if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+        return false;
+    }
+  return true;
+}
+
+void fmaddsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+      td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+    }
+}
+
+bool fmaddsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      double e = tg[i*2];
+      double f = tg[i*2+1];
+      if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+        return false;
+    }
+  return true;
+}
+
+
+// fmsubadd
+
+void fmsubaddF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+      td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+    }
+}
+
+bool fmsubaddFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      float e = tg[i*2];
+      float f = tg[i*2+1];
+      if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+        return false;
+    }
+  return true;
+}
+
+void fmsubaddD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+      td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+    }
+}
+
+bool fmsubaddDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      double e = tg[i*2];
+      double f = tg[i*2+1];
+      if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+        return false;
+    }
+  return true;
+}
+
+
+// cadd90
+
+void cadd90F(float *td, float *te, float *tf, float *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2] - tf[i*2+1];
+      td[i*2+1] = te[i*2+1] + tf[i*2];
+    }
+}
+
+bool cadd90Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      if (td[i*2] != a-d || td[i*2+1] != b+c)
+        return false;
+    }
+  return true;
+}
+
+void cadd90D(double *td, double *te, double *tf, double *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2] - tf[i*2+1];
+      td[i*2+1] = te[i*2+1] + tf[i*2];
+    }
+}
+
+bool cadd90Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      if (td[i*2] != a-d || td[i*2+1] != b+c)
+        return false;
+    }
+  return true;
+}
+
+// cadd270
+
+void cadd270F(float *td, float *te, float *tf, float *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2] + tf[i*2+1];
+      td[i*2+1] = te[i*2+1] - tf[i*2];
+    }
+}
+
+bool cadd270Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      if (td[i*2] != a+d || td[i*2+1] != b-c)
+        return false;
+    }
+  return true;
+}
+
+void cadd270D(double *td, double *te, double *tf, double *tg, int tas)
+{
+  int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      td[i*2] = te[i*2] + tf[i*2+1];
+      td[i*2+1] = te[i*2+1] - tf[i*2];
+    }
+}
+
+bool cadd270Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      if (td[i*2] != a+d || td[i*2+1] != b-c)
+        return false;
+    }
+  return true;
+}
+
+
+// cmla
+
+void cmlaF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+  complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i] + g[i];
+    }
+}
+
+bool cmlaFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      float e = tg[i*2];
+      float f = tg[i*2+1];
+      if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+        return false;
+    }
+  return true;
+}
+
+void cmlaD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  typedef _Complex double complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+  complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i] + g[i];
+    }
+}
+
+bool cmlaDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      double e = tg[i*2];
+      double f = tg[i*2+1];
+      if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+        return false;
+    }
+  return true;
+}
+
+
+// cmls
+
+void cmlsF(float *td, float *te, float *tf, float *tg, int tas)
+{
+  typedef _Complex float complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+  complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i] - g[i];
+    }
+}
+
+bool cmlsFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      float a = te[i*2];
+      float b = te[i*2+1];
+      float c = tf[i*2];
+      float d = tf[i*2+1];
+      float e = tg[i*2];
+      float f = tg[i*2+1];
+      if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+        return false;
+    }
+  return true;
+}
+
+void cmlsD(double *td, double *te, double *tf, double *tg, int tas)
+{
+  typedef _Complex double complexT;
+  int array_size = tas/2;
+  complexT *d = (complexT*)(td);
+  complexT *e = (complexT*)(te);
+  complexT *f = (complexT*)(tf);
+  complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < array_size; i++)
+    {
+      d[i] = e[i] * f[i] - g[i];
+    }
+}
+
+bool cmlsDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+  for (int i = 0; i < tas/2; ++i)
+    {
+      double a = te[i*2];
+      double b = te[i*2+1];
+      double c = tf[i*2];
+      double d = tf[i*2+1];
+      double e = tg[i*2];
+      double f = tg[i*2+1];
+      if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+        return false;
+    }
+  return true;
+}
+
+
+typedef void(*runF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef void(*runD)(double *td, double *te, double *tf, double *tg, int tas);
+typedef bool(*checkF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef bool(*checkD)(double *td, double *te, double *tf, double *tg, int tas);
+
+typedef struct
+{
+  runF rF;
+  runD rD;
+  checkF cF;
+  checkD cD;
+} operation;
+
+operation ops[] = {
+  {cmulF, cmulD, cmulFcheck, cmulDcheck},
+  {cmul_conjF, cmul_conjD, cmul_conjFcheck, cmul_conjDcheck},
+  {addsubF, addsubD, addsubFcheck, addsubDcheck},
+  {fmaddsubF, fmaddsubD, fmaddsubFcheck, fmaddsubDcheck},
+  {fmsubaddF, fmsubaddD, fmsubaddFcheck, fmsubaddDcheck},
+  {cadd90F, cadd90D, cadd90Fcheck, cadd90Dcheck},
+  {cadd270F, cadd270D, cadd270Fcheck, cadd270Dcheck},
+  {cmlaF, cmlaD, cmlaFcheck, cmlaDcheck},
+  {cmlsF, cmlsD, cmlsFcheck, cmlsDcheck}
+};
+
+void testF(operation* op)
+{
+  float* td;
+  float* te;
+  float* tf;
+  float* tg;
+  int array_size = COUNT;
+  td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  tg = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc);
+  omp_set_default_device(device);
+  float* dd = td;
+  float* ee = te;
+  float* ff = tf;
+  float* gg = tg;
+  for (int i = 0; i < COUNT; ++i)
+    {
+      te[i] = (float)(rand() % MAX);
+      tf[i] = (float)(rand() % MAX);
+      tg[i] = (float)(rand() % MAX);
+    }
+  // Set up data region on device
+#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], 
ff[0:array_size], gg[0:array_size])
+  {}
+  op->rF(td, te, tf, tg, COUNT);
+#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], 
ff[0:array_size], gg[0:array_size])
+  {}
+  if (!op->cF(td, te, tf, tg, COUNT))
+    abort();
+}
+
+void testD(operation* op)
+{
+  double* td;
+  double* te;
+  double* tf;
+  double* tg;
+  int array_size = COUNT;
+  td = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, 
omp_default_mem_alloc);
+  te = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, 
omp_default_mem_alloc);
+  tf = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, 
omp_default_mem_alloc);
+  tg = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, 
omp_default_mem_alloc);
+  omp_set_default_device(device);
+  double* dd = td;
+  double* ee = te;
+  double* ff = tf;
+  double* gg = tg;
+  for (int i = 0; i < COUNT; ++i)
+    {
+      te[i] = (double)(rand() % MAX);
+      tf[i] = (double)(rand() % MAX);
+      tg[i] = (double)(rand() % MAX);
+    }
+  // Set up data region on device
+#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], 
ff[0:array_size], gg[0:array_size])
+  {}
+  op->rD(td, te, tf, tg, COUNT);
+#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], 
ff[0:array_size], gg[0:array_size])
+  {}
+  if (!op->cD(td, te, tf, tg, COUNT))
+    abort();
+}
+
+int main()
+{
+   for (int i = 0; i < 9; ++i)
+    {
+      testF(&ops[i]);
+      testD(&ops[i]);
+    }
+}
+

Reply via email to