On 06/01/25 11:44, Andrew Pinski wrote:
External email: Use caution opening links or attachments
On Sun, Jan 5, 2025 at 10:06 PM Dhruv Chawla <dhr...@nvidia.com> wrote:
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r[0] = *x;
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_1.c: New test.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++++
.../gcc.target/aarch64/simd/ldr_first_1.c | 55 +++++++++++++++++++
2 files changed, 67 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
This is NOT correct for big-endian.
See https://gcc.gnu.org/pipermail/gcc-patches/2024-October/667088.html
for a similar patch which had the big-endian fixes and handles more
cases too.
Hi,
Sorry for the (very) delayed reply. I've attached an updated patch that fixes
this on big-endian.
-- >8 --
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r = vsetq_lane_u8(*x, r, 0);
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu. Tested on
aarch64_be-unknown-linux-gnu as well.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_le.c: New test.
* gcc.target/aarch64/simd/ldr_first_be.c: Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++
.../gcc.target/aarch64/simd/ldr_first_be.c | 140 ++++++++++++++++++
.../gcc.target/aarch64/simd/ldr_first_le.c | 139 +++++++++++++++++
3 files changed, 291 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..7be1c685fcf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 0"
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
new file mode 100644
index 00000000000..12d4c965a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
@@ -0,0 +1,140 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mbig-endian -march=armv8-a+bf16" } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
new file mode 100644
index 00000000000..1d60f47854e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlittle-endian -march=armv8-a+bf16" } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0
Thanks,
Andrew
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)
\
+ __attribute__ ((noinline)) S S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0
--
Regards,
Dhruv
From 0c3e14d30fc7618a57b9fbc4fc9fe13c86663cfe Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhr...@nvidia.com>
Date: Thu, 19 Dec 2024 19:56:23 -0800
Subject: [PATCH] aarch64: Use LDR for first-element loads for Advanced SIMD
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r = vsetq_lane_u8(*x, r, 0);
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu. Tested on
aarch64_be-unknown-linux-gnu as well.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_le.c: New test.
* gcc.target/aarch64/simd/ldr_first_be.c: Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++
.../gcc.target/aarch64/simd/ldr_first_be.c | 140 ++++++++++++++++++
.../gcc.target/aarch64/simd/ldr_first_le.c | 139 +++++++++++++++++
3 files changed, 291 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..7be1c685fcf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 0"
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
new file mode 100644
index 00000000000..12d4c965a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
@@ -0,0 +1,140 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mbig-endian -march=armv8-a+bf16" } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
new file mode 100644
index 00000000000..1d60f47854e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlittle-endian -march=armv8-a+bf16" } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0