On 06/01/25 11:44, Andrew Pinski wrote:
External email: Use caution opening links or attachments
On Sun, Jan 5, 2025 at 10:06 PM Dhruv Chawla <dhr...@nvidia.com> wrote:
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r[0] = *x;
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_1.c: New test.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++++
.../gcc.target/aarch64/simd/ldr_first_1.c | 55 +++++++++++++++++++
2 files changed, 67 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
This is NOT correct for big-endian.
See https://gcc.gnu.org/pipermail/gcc-patches/2024-October/667088.html
for a similar patch which had the big-endian fixes and handles more
cases too.
Hi,
Sorry for the (very) delayed reply. I've attached an updated patch that fixes
this on big-endian.
-- >8 --
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r = vsetq_lane_u8(*x, r, 0);
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu. Tested on
aarch64_be-unknown-linux-gnu as well.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_le.c: New test.
* gcc.target/aarch64/simd/ldr_first_be.c: Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++
.../gcc.target/aarch64/simd/ldr_first_be.c | 140 ++++++++++++++++++
.../gcc.target/aarch64/simd/ldr_first_le.c | 139 +++++++++++++++++
3 files changed, 291 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..7be1c685fcf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 0"
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
new file mode 100644
index 00000000000..12d4c965a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
@@ -0,0 +1,140 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mbig-endian -march=armv8-a+bf16" } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
new file mode 100644
index 00000000000..1d60f47854e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlittle-endian -march=armv8-a+bf16" } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0
Thanks,
Andrew
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)
\
+ __attribute__ ((noinline)) S S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0
--
Regards,
Dhruv
From 0c3e14d30fc7618a57b9fbc4fc9fe13c86663cfe Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhr...@nvidia.com>
Date: Thu, 19 Dec 2024 19:56:23 -0800
Subject: [PATCH] aarch64: Use LDR for first-element loads for Advanced SIMD
This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with the
other elements being zero.
This is similar to what *aarch64_combinez<mode> already does.
Example:
uint8x16_t foo(uint8_t *x) {
uint8x16_t r = vdupq_n_u8(0);
r = vsetq_lane_u8(*x, r, 0);
return r;
}
Currently, this generates:
foo:
movi v0.4s, 0
ld1 {v0.b}[0], [x0]
ret
After applying the patch, this generates:
foo:
ldr b0, [x0]
ret
Bootstrapped and regtested on aarch64-linux-gnu. Tested on
aarch64_be-unknown-linux-gnu as well.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_simd_vec_set_low<mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/ldr_first_le.c: New test.
* gcc.target/aarch64/simd/ldr_first_be.c: Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 12 ++
.../gcc.target/aarch64/simd/ldr_first_be.c | 140 ++++++++++++++++++
.../gcc.target/aarch64/simd/ldr_first_le.c | 139 +++++++++++++++++
3 files changed, 291 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..7be1c685fcf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
[(set_attr "type" "neon_logic<q>")]
)
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+ (vec_merge:VALL_F16
+ (vec_duplicate:VALL_F16
+ (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+ (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_FLOAT && ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 0"
+ "ldr\\t%<Vetype>0, %1"
+ [(set_attr "type" "f_loads")]
+)
+
(define_insn "@aarch64_simd_vec_set<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
new file mode 100644
index 00000000000..12d4c965a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_be.c
@@ -0,0 +1,140 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mbig-endian -march=armv8-a+bf16" } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[sizeof (S) / sizeof (T) - 1] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
new file mode 100644
index 00000000000..1d60f47854e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_le.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlittle-endian -march=armv8-a+bf16" } */
+
+/* Tests using ACLE intrinsics. */
+
+#include <arm_neon.h>
+
+#define LDR_ACLE(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdupq_n_##U (0);
\
+ r = vsetq_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE (int8x16_t, int8_t, s8)
+LDR_ACLE (int16x8_t, int16_t, s16)
+LDR_ACLE (int32x4_t, int32_t, s32)
+LDR_ACLE (int64x2_t, int64_t, s64)
+
+LDR_ACLE (uint8x16_t, uint8_t, u8)
+LDR_ACLE (uint16x8_t, uint16_t, u16)
+LDR_ACLE (uint32x4_t, uint32_t, u32)
+LDR_ACLE (uint64x2_t, uint64_t, u64)
+
+LDR_ACLE (float16x8_t, float16_t, f16)
+LDR_ACLE (float32x4_t, float32_t, f32)
+LDR_ACLE (float64x2_t, float64_t, f64)
+
+LDR_ACLE (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_ACLE_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S acle_##S##T##U (T *x)
\
+ {
\
+ S r = vdup_n_##U (0);
\
+ r = vset_lane_##U (*x, r, 0);
\
+ return r;
\
+ }
+
+LDR_ACLE_NARROW (int8x8_t, int8_t, s8)
+LDR_ACLE_NARROW (int16x4_t, int16_t, s16)
+LDR_ACLE_NARROW (int32x2_t, int32_t, s32)
+LDR_ACLE_NARROW (int64x1_t, int64_t, s64)
+
+LDR_ACLE_NARROW (uint8x8_t, uint8_t, u8)
+LDR_ACLE_NARROW (uint16x4_t, uint16_t, u16)
+LDR_ACLE_NARROW (uint32x2_t, uint32_t, u32)
+LDR_ACLE_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_ACLE_NARROW (float16x4_t, float16_t, f16)
+LDR_ACLE_NARROW (float32x2_t, float32_t, f32)
+LDR_ACLE_NARROW (float64x1_t, float64_t, f64)
+
+LDR_ACLE_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* Tests using GCC vector types. */
+
+typedef int8_t v16i8 __attribute__ ((vector_size (16)));
+typedef int16_t v8i16 __attribute__ ((vector_size (16)));
+typedef int32_t v4i32 __attribute__ ((vector_size (16)));
+typedef int64_t v2i64 __attribute__ ((vector_size (16)));
+
+typedef uint8_t v16u8 __attribute__ ((vector_size (16)));
+typedef uint16_t v8u16 __attribute__ ((vector_size (16)));
+typedef uint32_t v4u32 __attribute__ ((vector_size (16)));
+typedef uint64_t v2u64 __attribute__ ((vector_size (16)));
+
+typedef float16_t v8f16 __attribute__ ((vector_size (16)));
+typedef float32_t v4f32 __attribute__ ((vector_size (16)));
+typedef float64_t v2f64 __attribute__ ((vector_size (16)));
+
+typedef bfloat16_t v8bf16 __attribute__ ((vector_size (16)));
+
+#define LDR_GCC(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC (v16i8, int8_t, s8)
+LDR_GCC (v8i16, int16_t, s16)
+LDR_GCC (v4i32, int32_t, s32)
+LDR_GCC (v2i64, int64_t, s64)
+
+LDR_GCC (v16u8, uint8_t, u8)
+LDR_GCC (v8u16, uint16_t, u16)
+LDR_GCC (v4u32, uint32_t, u32)
+LDR_GCC (v2u64, uint64_t, u64)
+
+LDR_GCC (v8f16, float16_t, f16)
+LDR_GCC (v4f32, float32_t, f32)
+LDR_GCC (v2f64, float64_t, f64)
+
+LDR_GCC (v8bf16, bfloat16_t, bf16)
+
+typedef int8_t v8i8 __attribute__ ((vector_size (8)));
+typedef int16_t v4i16 __attribute__ ((vector_size (8)));
+typedef int32_t v2i32 __attribute__ ((vector_size (8)));
+typedef int64_t v1i64 __attribute__ ((vector_size (8)));
+
+typedef uint8_t v8u8 __attribute__ ((vector_size (8)));
+typedef uint16_t v4u16 __attribute__ ((vector_size (8)));
+typedef uint32_t v2u32 __attribute__ ((vector_size (8)));
+typedef uint64_t v1u64 __attribute__ ((vector_size (8)));
+
+typedef float16_t v4f16 __attribute__ ((vector_size (8)));
+typedef float32_t v2f32 __attribute__ ((vector_size (8)));
+typedef float64_t v1f64 __attribute__ ((vector_size (8)));
+
+typedef bfloat16_t v4bf16 __attribute__ ((vector_size (8)));
+
+#define LDR_GCC_NARROW(S, T, U)
\
+ __attribute__ ((noinline)) S gcc_##S##T##U (T *x)
\
+ {
\
+ S r = {0};
\
+ r[0] = *x;
\
+ return r;
\
+ }
+
+LDR_GCC_NARROW (v8i8, int8_t, s8)
+LDR_GCC_NARROW (v4i16, int16_t, s16)
+LDR_GCC_NARROW (v2i32, int32_t, s32)
+LDR_GCC_NARROW (v1i64, int64_t, s64)
+
+LDR_GCC_NARROW (v8u8, uint8_t, u8)
+LDR_GCC_NARROW (v4u16, uint16_t, u16)
+LDR_GCC_NARROW (v2u32, uint32_t, u32)
+LDR_GCC_NARROW (v1u64, uint64_t, u64)
+
+LDR_GCC_NARROW (v4f16, float16_t, f16)
+LDR_GCC_NARROW (v2f32, float32_t, f32)
+LDR_GCC_NARROW (v1f64, float64_t, f64)
+
+LDR_GCC_NARROW (v4bf16, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 48 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
--
2.44.0