[PATCH][AArch64] Leveraging the use of STP instruction for vec_duplicate

2021-03-18 Thread Victor Do Nascimento via Gcc-patches
The backend pattern for storing a pair of identical values in 32-bit and 64-bit
modes with the STP machine instruction was missing. As a result, the RTL pattern
match failed in the combine pass and multiple instructions were needed to
reproduce this behavior.

For the test case :

typedef long long v2di __attribute__((vector_size (16)));
typedef int v2si __attribute__((vector_size (8)));

void
foo (v2di *x, long long a)
{
v2di tmp = {a, a};
*x = tmp;
}

void
foo2 (v2si *x, int a)
{
v2si tmp = {a, a};
*x = tmp;
}

at -O2 on aarch64 gives:

foo:
stp x1, x1, [x0]
ret
foo2:
stp w1, w1, [x0]
ret

instead of:

foo:
dup v0.2d, x1
str q0, [x0]
ret
foo2:
dup v0.2s, w1
str d0, [x0]
ret

In preparation for the next stage 1 phase of development, I have added a new
RTL template and a unit test, and checked for regressions on a bootstrapped
aarch64-none-linux-gnu.

gcc/ChangeLog

2021-02-04 Victor Do Nascimento

* config/aarch64/aarch64-simd.md: Implement RTX pattern for
mapping 'vec_duplicate' RTX onto 'STP' ASM insn.
* config/aarch64/iterators.md: Implement ldpstp_vel_sz iterator
to map STP/LDP vector element mode to correct suffix in
attribute type definition of aarch64_simd_stp pattern.

gcc/testsuite/ChangeLog

2021-02-04 Victor Do Nascimento 

* gcc.target/aarch64/stp_vec_dup_32_64-1.c: New test.

Regards,
Victor

---
 gcc/config/aarch64/aarch64-simd.md            | 10 ++++++++++
 gcc/config/aarch64/iterators.md               |  3 +++
 .../gcc.target/aarch64/stp_vec_dup_32_64-1.c  | 22 ++++++++++++++++++++++
 3 files changed, 35 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 71aa77dd010..3d53bab0018 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -205,6 +205,16 @@
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "aarch64_simd_stp<mode>"
+  [(set (match_operand:VP_2E 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+	(vec_duplicate:VP_2E (match_operand:<VEL> 1 "register_operand" "w,r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%<Vetype>1, %<Vetype>1, %z0
+   stp\\t%<vw>1, %<vw>1, %z0"
+  [(set_attr "type" "neon_stp, store_<ldpstp_vel_sz>")]
+)
+
 (define_insn "load_pair"
   [(set (match_operand:VQ 0 "register_operand" "=w")
(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fb6e228651e..196055d31e5 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -898,6 +898,9 @@
 ;; Likewise for load/store pair.
 (define_mode_attr ldpstp_sz [(SI "8") (DI "16")])
 
+;; Size of element access for STP/LDP-generated vectors.
+(define_mode_attr ldpstp_vel_sz [(V2SI "8") (V2SF "8") (V2DI "16") (V2DF "16")])
+
 ;; For inequal width int to float conversion
 (define_mode_attr w1 [(HF "w") (SF "w") (DF "x")])
 (define_mode_attr w2 [(HF "x") (SF "x") (DF "w")])
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
new file mode 100644
index 00000000000..a37c903dfd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (8)));
+
+void
+foo (v2di *x, long long a)
+{
+  v2di tmp = {a, a};
+  *x = tmp;
+}
+
+void
+foo2 (v2si *x, int a)
+{
+  v2si tmp = {a, a};
+  *x = tmp;
+}
+
+/* { dg-final { scan-assembler-times "stp\t" 2 } } */
+/* { dg-final { scan-assembler-not "dup\t" } } */
-- 
2.17.1



[PATCH][AArch64] Leveraging the use of STP instruction for vec_duplicate

2021-02-11 Thread Victor Do Nascimento via Gcc-patches
Dear GCC community,

The backend pattern for storing a pair of identical values in 32-bit and 64-bit
modes with the STP machine instruction was missing. As a result, the RTL pattern
match failed in the combine pass and multiple instructions were needed to
reproduce this behavior.

For the test case :

typedef long long v2di __attribute__((vector_size (16)));
typedef int v2si __attribute__((vector_size (8)));

void
foo (v2di *x, long long a)
{
  v2di tmp = {a, a};
  *x = tmp;
}

void
foo2 (v2si *x, int a)
{
  v2si tmp = {a, a};
  *x = tmp;
}

at -O2 on aarch64 gives:

foo:
    stp x1, x1, [x0]
    ret
foo2:
    stp w1, w1, [x0]
    ret

instead of:

foo:
    dup v0.2d, x1
    str q0, [x0]
    ret
foo2:
    dup v0.2s, w1
    str d0, [x0]
    ret
    
I have added a new RTL template and a unit test, and checked for regressions on
a bootstrapped aarch64-none-linux-gnu.

gcc/ChangeLog

2021-02-04 Victor Do Nascimento

* config/aarch64/aarch64-simd.md: Implement RTX pattern for
mapping 'vec_duplicate' RTX onto 'STP' ASM insn.
* config/aarch64/iterators.md: Implement ldpstp_vel_sz iterator
to map STP/LDP vector element mode to correct suffix in 
attribute type definition of aarch64_simd_stp pattern.
    
gcc/testsuite/ChangeLog
    
2021-02-04 Victor Do Nascimento 

* gcc.target/aarch64/stp_vec_dup_32_64-1.c: New test.

Regards,
Victor

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 68baf416045..4623cbb95f4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -205,6 +205,16 @@
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "aarch64_simd_stp<mode>"
+[(set (match_operand:VP_2E 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+  (vec_duplicate:VP_2E (match_operand:<VEL> 1 "register_operand" "w,r")))]
+"TARGET_SIMD"
+"@
+ stp\\t%<Vetype>1, %<Vetype>1, %z0
+ stp\\t%<vw>1, %<vw>1, %z0"
+[(set_attr "type" "neon_stp, store_<ldpstp_vel_sz>")]
+)
+
 (define_insn "load_pair"
   [(set (match_operand:VQ 0 "register_operand" "=w")
(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fb1426b7752..aac6e0b5bd9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -880,6 +880,9 @@
 ;; Likewise for load/store pair.
 (define_mode_attr ldpstp_sz [(SI "8") (DI "16")])
 
+;; Size of element access for STP/LDP-generated vectors.
+(define_mode_attr ldpstp_vel_sz [(V2SI "8") (V2SF "8") (V2DI "16") (V2DF "16")])
+
 ;; For inequal width int to float conversion
 (define_mode_attr w1 [(HF "w") (SF "w") (DF "x")])
 (define_mode_attr w2 [(HF "x") (SF "x") (DF "w")])
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
new file mode 100644
index 00000000000..a37c903dfd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (8)));
+
+void
+foo (v2di *x, long long a)
+{
+  v2di tmp = {a, a};
+  *x = tmp;
+}
+
+void
+foo2 (v2si *x, int a)
+{
+  v2si tmp = {a, a};
+  *x = tmp;
+}
+
+/* { dg-final { scan-assembler-times "stp\t" 2 } } */
+/* { dg-final { scan-assembler-not "dup\t" } } */