Hi all,
In PR target/55426 the 128-bit load-duplicate intrinsics in neon.exp ICE on
big-endian with an unrecognizable insn error:
neon-vld1_dupQ.c:24:1: error: unrecognizable insn:
(insn 94 93 31 (set (subreg:DI (reg:V2DI 95 d16 [orig:137 D.14400 ] [137]) 0)
(subreg:DI (reg:V2DI 95 d16 [orig:137 D.14400 ] [137]) 8))
The problem is that the neon_vld1_dupv2di splitter uses gen_lowpart and
gen_highpart to generate subregs after reload, and no pattern in the backend
recognises the resulting subreg-to-subreg move of a hard NEON register. Since
that splitter only matches after reload, the hard register numbers are already
known, so we can manipulate them directly to extract the two doubleword parts
of a quadword register.
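Concretely, since each D register spans two SImode hard registers in the ARM
backend, the two doubleword halves of a hard Q register sit two hard register
numbers apart, so the split body can build them directly. This is the gist of
the neon.md hunk below:

  /* After reload operands[0] is a hard V2DI (Q) register, so its two
     D-register halves can be addressed by hard register number.  */
  rtx lo_reg = gen_rtx_REG (DImode, REGNO (operands[0]));
  rtx hi_reg = gen_rtx_REG (DImode, REGNO (operands[0]) + 2);

  emit_insn (gen_neon_vld1_dupdi (lo_reg, operands[1]));
  emit_move_insn (hi_reg, lo_reg);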
While we're at it, we might as well use a more general move instruction when
the alignment is natural, to potentially take advantage of more complex
addressing modes. We are allowed to do that because the vld1Q_dup*64
intrinsics describe a behaviour and do not guarantee that a particular
instruction will be used. The vld1Q_dup*64 tests are therefore converted to
run-time tests of the functionality. New *_misaligned tests are added,
however, to make sure that we still generate vld1.64 when the address is
explicitly unaligned, since vld1.64 is the only instruction that can handle
that case.
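For reference, the shape of the misaligned case the new tests pin down is a
packed struct that strips the natural 64-bit alignment from the loaded
element (a sketch in the style of the new *_misaligned tests; my_struct and
load_dup are just illustrative names):

  #include "arm_neon.h"

  /* STR loses its natural doubleword alignment inside the packed struct,
     so the load below must be done with vld1.64.  */
  struct __attribute__ ((__packed__)) my_struct
  {
    int i;
    uint64_t str;
  };

  uint64x2_t
  load_dup (struct my_struct *s)
  {
    return vld1q_dup_u64 (&s->str);
  }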
Tested with an armeb-none-linux-gnueabihf build; the vld1Q_dup*64* tests now
pass on both big- and little-endian.
arm-none-linux-gnueabihf bootstrap on Chromebook successful.
This is a regression since 4.7. I've tested this on trunk. Will test this on the
4.8 and 4.7 branches.
Ok for those branches if no regressions?
Thanks,
Kyrill
2014-02-11 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
PR target/55426
* config/arm/neon.md (neon_vld1_dupv2di): Do not generate
low and high part subregs, use hard reg numbers.
* config/arm/arm.c (arm_mem_aligned_p): New function.
(arm_init_neon_builtins): Allow for memory operands
in load operations.
* config/arm/arm-protos.h (arm_mem_aligned_p): Declare
extern.
* config/arm/constraints.md (Uo): New constraint.
2014-02-11 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
PR target/55426
* gcc.target/arm/neon/vld1Q_dupp64.c: Change to run-time test.
* gcc.target/arm/neon/vld1Q_dups64.c: Likewise.
* gcc.target/arm/neon/vld1Q_dupu64.c: Likewise.
* gcc.target/arm/neon/vld1Q_dupp64_misaligned.c: New test.
* gcc.target/arm/neon/vld1Q_dups64_misaligned.c: Likewise.
* gcc.target/arm/neon/vld1Q_dupu64_misaligned.c: Likewise.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 13874ee..56f46e3 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -95,6 +95,7 @@ extern enum reg_class coproc_secondary_reload_class (enum machine_mode, rtx,
extern bool arm_tls_referenced_p (rtx);
extern int arm_coproc_mem_operand (rtx, bool);
+extern bool arm_mem_aligned_p (rtx, unsigned int);
extern int neon_vector_mem_operand (rtx, int, bool);
extern int neon_struct_mem_operand (rtx);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index fc81bf6..33c829d 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12499,6 +12499,14 @@ arm_coproc_mem_operand (rtx op, bool wb)
return FALSE;
}
+/* Return true if the MEM rtx X is aligned to at least ALIGNMENT bits.  */
+bool
+arm_mem_aligned_p (rtx x, unsigned int alignment)
+{
+ gcc_assert (MEM_P (x));
+ return MEM_ALIGN (x) >= alignment;
+}
+
/* Return TRUE if OP is a memory operand which we can load or store a vector
to/from. TYPE is one of the following values:
0 - Vector load/stores (vldr)
@@ -23644,7 +23652,9 @@ arm_init_neon_builtins (void)
/* Neon load patterns always have the memory
operand in the operand 1 position. */
gcc_assert (insn_data[d->code].operand[k].predicate
- == neon_struct_operand);
+ == neon_struct_operand
+ || insn_data[d->code].operand[k].predicate
+ == memory_operand);
switch (d->mode)
{
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 85dd116..86947dd 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -381,6 +381,14 @@
(and (match_code "mem")
(match_test "TARGET_32BIT && neon_vector_mem_operand (op, 2, true)")))
+(define_memory_constraint "Uo"
+ "@internal
+ In ARM/Thumb-2 state a valid address for Neon element and structure
+ load/store instructions, or for a normal load when the address is
+ doubleword-aligned."
+ (and (match_code "mem")
+ (match_test "TARGET_32BIT && (arm_mem_aligned_p (op, DOUBLEWORD_ALIGNMENT)
+ || neon_vector_mem_operand (op, 2, true))")))
+
(define_memory_constraint "Us"
"@internal
In ARM/Thumb-2 state a valid address for non-offset loads/stores of
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 2f06e42..e4490ba 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -4406,19 +4406,28 @@
(define_insn_and_split "neon_vld1_dupv2di"
[(set (match_operand:V2DI 0 "s_register_operand" "=w")
- (vec_duplicate:V2DI (match_operand:DI 1 "neon_struct_operand" "Um")))]
+ (vec_duplicate:V2DI (match_operand:DI 1 "memory_operand" "Uo")))]
"TARGET_NEON"
"#"
"&& reload_completed"
[(const_int 0)]
{
- rtx tmprtx = gen_lowpart (DImode, operands[0]);
- emit_insn (gen_neon_vld1_dupdi (tmprtx, operands[1]));
- emit_move_insn (gen_highpart (DImode, operands[0]), tmprtx );
- DONE;
- }
- [(set_attr "length" "8")
- (set_attr "type" "neon_load1_all_lanes_q")]
+ rtx lo_reg = gen_rtx_REG (DImode, REGNO (operands[0]));
+ rtx hi_reg = gen_rtx_REG (DImode, REGNO (operands[0]) + 2);
+
+ /* If the alignment is not natural, we have to use vld1.
+ In all other cases we can try to take advantage of a more general
+ move operation. */
+
+ if (arm_mem_aligned_p (operands[1], DOUBLEWORD_ALIGNMENT))
+ emit_move_insn (lo_reg, operands[1]);
+ else
+ emit_insn (gen_neon_vld1_dupdi (lo_reg, operands[1]));
+
+ emit_move_insn (hi_reg, lo_reg);
+ DONE;
+ }
+ [(set_attr "length" "8")]
)
(define_expand "vec_store_lanes<mode><mode>"
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
index 2d504c1..a606eef 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
@@ -1,19 +1,35 @@
-/* Test the `vld1Q_dupp64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
+/* Test the `vld1q_dup_p64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_p64 is naturally aligned, the compiler is
+   free to choose any instruction equivalent to vld1.64, so we do not scan
+   for it explicitly but instead test the run-time functionality.  */
-/* { dg-do assemble } */
+/* { dg-do run } */
/* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2" } */
/* { dg-add-options arm_crypto } */
#include "arm_neon.h"
-void test_vld1Q_dupp64 (void)
-{
- poly64x2_t out_poly64x2_t;
+extern void abort (void);
- out_poly64x2_t = vld1q_dup_p64 (0);
+poly64x2_t
+test_vld1Q_dupp64 (poly64_t *p)
+{
+ return vld1q_dup_p64 (p);
}
-/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+ uint64_t c = 0xf00ba1;
+ poly64_t pol = vcreate_p64 (c);
+ poly64x2_t b = test_vld1Q_dupp64 (&pol);
+
+ if (vreinterpret_u64_p64 (vget_high_p64 (b)) != c
+ || vreinterpret_u64_p64 (vget_low_p64 (b)) != c)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c
new file mode 100644
index 0000000..3a916ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c
@@ -0,0 +1,43 @@
+/* Test the `vld1q_dup_p64' ARM Neon intrinsic with a misaligned argument.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+ int i;
+ poly64_t str;
+};
+
+poly64x2_t
+test_vld1Q_dupp64 (struct my_struct *s)
+{
+ return vld1q_dup_p64 (&(s->str));
+}
+
+int
+main (void)
+{
+ uint64_t c = 0xf00ba1;
+ poly64_t pol = vcreate_p64 (c);
+ struct my_struct st;
+ st.i = -1;
+ st.str = pol;
+ poly64x2_t b = test_vld1Q_dupp64 (&st);
+
+ if (vreinterpret_u64_p64 (vget_high_p64 (b)) != c
+ || vreinterpret_u64_p64 (vget_low_p64 (b)) != c)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
index 4fceee8..a7e57f4 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
@@ -1,19 +1,33 @@
-/* Test the `vld1Q_dups64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
+/* Test the `vld1q_dup_s64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_s64 is naturally aligned, the compiler is
+   free to choose any instruction equivalent to vld1.64, so we do not scan
+   for it explicitly but instead test the run-time functionality.  */
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2" } */
/* { dg-add-options arm_neon } */
#include "arm_neon.h"
-void test_vld1Q_dups64 (void)
-{
- int64x2_t out_int64x2_t;
+extern void abort (void);
- out_int64x2_t = vld1q_dup_s64 (0);
+int64x2_t
+test_vld1Q_dups64 (int64_t *p)
+{
+ return vld1q_dup_s64 (p);
}
-/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+ int64_t c = 0xf00ba1;
+ int64x2_t b = test_vld1Q_dups64 (&c);
+
+ if (vget_high_s64 (b) != c
+ || vget_low_s64 (b) != c)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c
new file mode 100644
index 0000000..2079a66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c
@@ -0,0 +1,41 @@
+/* Test the `vld1q_dup_s64' ARM Neon intrinsic with a misaligned argument.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+ int i;
+ int64_t str;
+};
+
+int64x2_t
+test_vld1Q_dups64 (struct my_struct *s)
+{
+ return vld1q_dup_s64 (&(s->str));
+}
+
+int
+main (void)
+{
+ int64_t c = 0xf00ba1;
+ struct my_struct st;
+ st.i = -1;
+ st.str = c;
+ int64x2_t b = test_vld1Q_dups64 (&st);
+
+ if (vget_high_s64 (b) != c
+ || vget_low_s64 (b) != c)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
index ef0a382..985670f 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
@@ -1,19 +1,33 @@
-/* Test the `vld1Q_dupu64' ARM Neon intrinsic. */
-/* This file was autogenerated by neon-testgen. */
+/* Test the `vld1q_dup_u64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_u64 is naturally aligned, the compiler is
+   free to choose any instruction equivalent to vld1.64, so we do not scan
+   for it explicitly but instead test the run-time functionality.  */
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2" } */
/* { dg-add-options arm_neon } */
#include "arm_neon.h"
-void test_vld1Q_dupu64 (void)
-{
- uint64x2_t out_uint64x2_t;
+extern void abort (void);
- out_uint64x2_t = vld1q_dup_u64 (0);
+uint64x2_t
+test_vld1Q_dupu64 (uint64_t *p)
+{
+ return vld1q_dup_u64 (p);
}
-/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+ uint64_t c = 0xf00ba1;
+ uint64x2_t b = test_vld1Q_dupu64 (&c);
+
+ if (vget_high_u64 (b) != c
+ || vget_low_u64 (b) != c)
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c
new file mode 100644
index 0000000..5468792
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c
@@ -0,0 +1,42 @@
+/* Test the `vld1q_dup_u64' ARM Neon intrinsic with a misaligned argument.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+ int i;
+ uint64_t str;
+};
+
+uint64x2_t
+test_vld1Q_dupu64 (struct my_struct *s)
+{
+ return vld1q_dup_u64 (&(s->str));
+}
+
+int
+main (void)
+{
+ uint64_t c = 0xf00ba1;
+ struct my_struct st;
+ st.i = -1;
+ st.str = c;
+ uint64x2_t b = test_vld1Q_dupu64 (&st);
+
+ if (vget_high_u64 (b) != c
+ || vget_low_u64 (b) != c)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */