[AArch64] Upgrade integer MLA intrinsics to GCC vector extensions

2020-08-12 Thread James Greenhalgh

Hi,

As subject, this patch rewrites the mla intrinsics to use a + b * c rather
than inline assembler, thereby opening them to CSE, scheduling, etc.
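
A minimal sketch of the kind of optimisation this enables (illustrative
only, not part of the patch; the function below is hypothetical):

#include <arm_neon.h>

/* With vmla_s8 defined as __a + __b * __c, the common multiply
   __b * __c is visible to GIMPLE and can be CSEd between the two
   calls; the old inline-asm definition was opaque to the optimisers.  */
int8x8_t
two_accumulates (int8x8_t acc0, int8x8_t acc1, int8x8_t b, int8x8_t c)
{
  int8x8_t r0 = vmla_s8 (acc0, b, c);
  int8x8_t r1 = vmla_s8 (acc1, b, c);
  return vadd_s8 (r0, r1);
}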

Bootstrapped and tested on aarch64-none-linux-gnu.

OK?

Thanks,
James

---

gcc/ChangeLog:

2020-08-11  James Greenhalgh  

* config/aarch64/arm_neon.h (vmla_s8): Upgrade to C rather than asm.
(vmla_s16): Likewise.
(vmla_s32): Likewise.
(vmla_u8): Likewise.
(vmla_u16): Likewise.
(vmla_u32): Likewise.
(vmlaq_s8): Likewise.
(vmlaq_s16): Likewise.
(vmlaq_s32): Likewise.
(vmlaq_u8): Likewise.
(vmlaq_u16): Likewise.
(vmlaq_u32): Likewise.

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..aa548e4e6c7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7400,72 +7400,42 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 #define vmlal_high_lane_s16(a, b, c, d) \
@@ -7941,72 +7911,42 @@ __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute

[AArch64] Move vmull_* to intrinsics

2020-02-18 Thread James Greenhalgh

Hi,

As title, move some arm_neon.h functions which currently use assembly over
to intrinsics.
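
The user-visible behaviour is unchanged; for example (an illustrative
sketch, not from the patch):

#include <arm_neon.h>

/* vmull_high_s8 now expands through the new vec_widen_smult_hi
   builtin rather than a literal "smull2" asm string, so the RTL
   optimisers can schedule and combine it like any other insn.  */
int16x8_t
widening_mul_high (int8x16_t a, int8x16_t b)
{
  return vmull_high_s8 (a, b);	/* smull2  v0.8h, v0.16b, v1.16b */
}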

Bootstrapped and tested on aarch64-none-linux-gnu.

OK? If so, can someone please apply it on my behalf?

Thanks,
James

---
gcc/

2020-02-18  James Greenhalgh  

* config/aarch64/aarch64-simd-builtins.def
(intrinsic_vec_smult_lo_): New.
(intrinsic_vec_umult_lo_): Likewise.
(vec_widen_smult_hi_): Likewise.
(vec_widen_umult_hi_): Likewise.
* config/aarch64/aarch64-simd.md
(aarch64_intrinsic_vec_<su>mult_lo_<mode>): New.
* config/aarch64/arm_neon.h (vmull_high_s8): Use intrinsics.
(vmull_high_s16): Likewise.
(vmull_high_s32): Likewise.
(vmull_high_u8): Likewise.
(vmull_high_u16): Likewise.
(vmull_high_u32): Likewise.
(vmull_s8): Likewise.
(vmull_s16): Likewise.
(vmull_s32): Likewise.
(vmull_u8): Likewise.
(vmull_u16): Likewise.
(vmull_u32): Likewise.

gcc/testsuite/

2020-02-18  James Greenhalgh  

* gcc.target/aarch64/vmull_high.c: New.

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 57fc5933b43..f86866b9e78 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -185,6 +185,12 @@
   BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0)
   BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0)
 
+  BUILTIN_VD_BHSI (BINOP, intrinsic_vec_smult_lo_, 0)
+  BUILTIN_VD_BHSI (BINOPU, intrinsic_vec_umult_lo_, 0)
+
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+
   BUILTIN_VSD_HSI (BINOP, sqdmull, 0)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4e28cf97516..281b9ce93b9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1791,6 +1791,17 @@
  [(set_attr "type" "neon_mul_<Vetype>_long")]
 )
 
+(define_insn "aarch64_intrinsic_vec_<su>mult_lo_<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+   (mult:<VWIDE> (ANY_EXTEND:<VWIDE>
+			 (match_operand:VD_BHSI 1 "register_operand" "w"))
+		 (ANY_EXTEND:<VWIDE>
+			 (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+  "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
 (define_expand "vec_widen_<su>mult_lo_<mode>"
   [(match_operand:<VWIDE> 0 "register_operand")
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c7425346b86..0b11d670837 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9218,72 +9218,42 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_s8 (int8x16_t __a, int8x16_t __b)
 {
-  int16x8_t __result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int32x4_t __result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_s32 (int32x4_t __a, int32x4_t __b)
 {
-  int64x2_t __result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  uint16x8_t __result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  uint32x4_t __result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-   : "=w"(__result)

Remove my name from AArch64 port maintainers

2019-11-20 Thread James Greenhalgh

Hi,

After personal reflection on my current day-to-day involvement with the
GCC project and the expected behaviours and responsibilities delegated to
GNU project maintainers, I have come to the conclusion that the AArch64
port maintenance role is not one I am able to continue to commit to.

This patch therefore removes my name from the AArch64 maintainers list.
I've left my name under write-after-approval, just in case I need it in
future.

Thanks to the steering committee for the opportunity to contribute to GCC
as a maintainer, I've very much enjoyed seeing the many contributions to
the AArch64 port over the past two years.

Kyrill Tkachov, Richard Earnshaw, Richard Sandiford and Marcus Shawcroft
make for a great team of maintainers - I fully expect the AArch64 port to
continue to thrive under their watch.

Best Regards,
James

2019-11-19  James Greenhalgh  

* MAINTAINERS (aarch64 port): Remove my name, move to...
(Write After Approval): ...Here.

diff --git a/MAINTAINERS b/MAINTAINERS
index 1385214f789..54edab3f177 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -44,7 +44,6 @@ docs, and the testsuite related to that.
 			CPU Port Maintainers	(CPU alphabetical order)
 
 aarch64 port		Richard Earnshaw	
-aarch64 port		James Greenhalgh	
 aarch64 port		Richard Sandiford	
 aarch64 port		Marcus Shawcroft	
 aarch64 port		Kyrylo Tkachov		
@@ -399,6 +398,7 @@ Jan-Benedict Glaw
 Marc Glisse	
 Prachi Godbole	
 Torbjorn Granlund
+James Greenhalgh
 Doug Gregor	
 Matthew Gretton-Dann
 Yury Gribov	


Re: [AArch64] Fix cost of (plus ... (const_int -C))

2019-09-25 Thread James Greenhalgh
On Mon, Sep 23, 2019 at 10:45:29AM +0100, Richard Sandiford wrote:
> The PLUS handling in aarch64_rtx_costs only checked for nonnegative
> constants, meaning that simple immediate subtractions like:
> 
>   (set (reg R1) (plus (reg R2) (const_int -8)))
> 
> had a cost of two instructions.
> 
> Tested on aarch64-linux-gnu (with and without SVE).  OK to install?

OK.

Thanks,
James

> 
> Richard
> 
> 
> 2019-09-23  Richard Sandiford  
> 
> gcc/
>   * config/aarch64/aarch64.c (aarch64_rtx_costs): Use
>   aarch64_plus_immediate rather than aarch64_uimm12_shift
>   to test for valid PLUS immediates.
> 


Re: [PATCH][AArch64] Don't split 64-bit constant stores to volatile location

2019-09-25 Thread James Greenhalgh
On Tue, Sep 24, 2019 at 02:40:20PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> On 8/22/19 10:16 AM, Kyrill Tkachov wrote:
> > Hi all,
> >
> > The optimisation to optimise:
> >    typedef unsigned long long u64;
> >
> >    void bar(u64 *x)
> >    {
> >  *x = 0xabcdef10abcdef10;
> >    }
> >
> > from:
> >     mov x1, 61200
> >     movk    x1, 0xabcd, lsl 16
> >     movk    x1, 0xef10, lsl 32
> >     movk    x1, 0xabcd, lsl 48
> >     str x1, [x0]
> >
> > into:
> >     mov w1, 61200
> >     movk    w1, 0xabcd, lsl 16
> >     stp w1, w1, [x0]
> >
> > ends up producing two distinct stores if the destination is volatile:
> >   void bar(u64 *x)
> >   {
> >     *(volatile u64 *)x = 0xabcdef10abcdef10;
> >   }
> >     mov w1, 61200
> >     movk    w1, 0xabcd, lsl 16
> >     str w1, [x0]
> >     str w1, [x0, 4]
> >
> > because we end up not merging the strs into an stp. It's questionable 
> > whether the use of STP is valid for volatile in the first place.
> > To avoid unnecessary pain in a context where it's unlikely to be 
> > performance critical [1] (use of volatile), this patch avoids this
> > transformation for volatile destinations, so we produce the original 
> > single STR-X.
> >
> > Bootstrapped and tested on aarch64-none-linux-gnu.
> >
> > Ok for trunk (and eventual backports)?
> >
> This has been approved by James offline.
> 
> Committed to trunk with r276098.

Does this need backporting?

Thanks,
James

> 
> Thanks,
> 
> Kyrill
> 
> > Thanks,
> > Kyrill
> >
> > [1] 
> > https://lore.kernel.org/lkml/20190821103200.kpufwtviqhpbuv2n@willie-the-truck/
> >
> >
> > gcc/
> > 2019-08-22  Kyrylo Tkachov 
> >
> >     * config/aarch64/aarch64.md (mov): Don't call
> >     aarch64_split_dimode_const_store on volatile MEM.
> >
> > gcc/testsuite/
> > 2019-08-22  Kyrylo Tkachov 
> >
> >     * gcc.target/aarch64/nosplit-di-const-volatile_1.c: New test.
> >


Re: [AArch64] Split built-in function codes into major and minor codes

2019-09-25 Thread James Greenhalgh
On Wed, Aug 07, 2019 at 08:28:50PM +0100, Richard Sandiford wrote:
> It was easier to add the SVE ACLE support without enumerating every
> function at build time.  This in turn meant that it was easier if the
> SVE builtins occupied a distinct numberspace from the existing AArch64
> ones, which *are* enumerated at build time.  This patch therefore
> divides the built-in functions codes into "major" and "minor" codes.
> At present the major code is just "general", but the SVE patch will add
> "SVE" as well.
> 
> Also, it was convenient to put the SVE ACLE support in its own file,
> so the patch makes aarch64.c provide the frontline target hooks directly,
> forwarding to the other files for the real work.
> 
> The reason for organising the files this way is that aarch64.c needs
> to define the target hook macros whatever happens, and having aarch64.c
> macros forward to aarch64-builtins.c functions and aarch64-bulitins.c
> functions forward to the SVE file seemed a bit indirect.  Doing things
> the way the patch does them puts aarch64-builtins.c and the SVE code on
> more of an equal footing.
> 
> The aarch64_(general_)gimple_fold_builtin change is mostly just
> reindentation.  I've attached a -b version of the diff as well.
> 
> Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
> OK to install when the ACLE patch itself is ready to install?

OK.

Thanks,
James
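
For readers, an illustrative sketch of the packing scheme described
above; only AARCH64_BUILTIN_SHIFT and AARCH64_BUILTIN_CLASS come from
the patch, the helpers are hypothetical:

/* The class lives in the low bits of a function code; the
   class-specific subcode sits above it.  */
enum aarch64_builtin_class { AARCH64_BUILTIN_GENERAL };	/* SVE adds a second class.  */

enum { AARCH64_BUILTIN_SHIFT = 1,
       AARCH64_BUILTIN_CLASS = (1 << AARCH64_BUILTIN_SHIFT) - 1 };

static inline unsigned int
pack_builtin_code (enum aarch64_builtin_class klass, unsigned int subcode)
{
  return (subcode << AARCH64_BUILTIN_SHIFT) | klass;
}

static inline enum aarch64_builtin_class
builtin_code_class (unsigned int code)
{
  return (enum aarch64_builtin_class) (code & AARCH64_BUILTIN_CLASS);
}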

> 
> Richard
> 
> 
> 2019-08-07  Richard Sandiford  
> 
> gcc/
>   * config/aarch64/aarch64-protos.h (aarch64_builtin_class): New enum.
>   (AARCH64_BUILTIN_SHIFT, AARCH64_BUILTIN_CLASS): New constants.
>   (aarch64_gimple_fold_builtin, aarch64_mangle_builtin_type)
>   (aarch64_fold_builtin, aarch64_init_builtins, aarch64_expand_builtin):
>   (aarch64_builtin_decl, aarch64_builtin_rsqrt): Delete.
>   (aarch64_general_mangle_builtin_type, aarch64_general_init_builtins):
>   (aarch64_general_fold_builtin, aarch64_general_gimple_fold_builtin):
>   (aarch64_general_expand_builtin, aarch64_general_builtin_decl):
>   (aarch64_general_builtin_rsqrt): Declare.
>   * config/aarch64/aarch64-builtins.c (aarch64_general_add_builtin):
>   New function.
>   (aarch64_mangle_builtin_type): Rename to...
>   (aarch64_general_mangle_builtin_type): ...this.
>   (aarch64_init_fcmla_laneq_builtins, aarch64_init_simd_builtins)
>   (aarch64_init_crc32_builtins, aarch64_init_builtin_rsqrt)
>   (aarch64_init_pauth_hint_builtins, aarch64_init_tme_builtins): Use
>   aarch64_general_add_builtin instead of add_builtin_function.
>   (aarch64_init_builtins): Rename to...
>   (aarch64_general_init_builtins): ...this.  Use
>   aarch64_general_add_builtin instead of add_builtin_function.
>   (aarch64_builtin_decl): Rename to...
>   (aarch64_general_builtin_decl): ...this and remove the unused
>   arguments.
>   (aarch64_expand_builtin): Rename to...
>   (aarch64_general_expand_builtin): ...this and remove the unused
>   arguments.
>   (aarch64_builtin_rsqrt): Rename to...
>   (aarch64_general_builtin_rsqrt): ...this.
>   (aarch64_fold_builtin): Rename to...
>   (aarch64_general_fold_builtin): ...this.  Take the function subcode
>   and return type as arguments.  Remove the "ignored" argument.
>   (aarch64_gimple_fold_builtin): Rename to...
>   (aarch64_general_gimple_fold_builtin): ...this.  Take the function
>   subcode and gcall as arguments, and return the new function call.
>   * config/aarch64/aarch64.c (aarch64_init_builtins)
>   (aarch64_fold_builtin, aarch64_gimple_fold_builtin)
>   (aarch64_expand_builtin, aarch64_builtin_decl): New functions.
>   (aarch64_builtin_reciprocal): Call aarch64_general_builtin_rsqrt
>   instead of aarch64_builtin_rsqrt.
>   (aarch64_mangle_type): Call aarch64_general_mangle_builtin_type
>   instead of aarch64_mangle_builtin_type.
> 


Re: [PATCH][AArch64] Add support for missing CPUs

2019-09-02 Thread James Greenhalgh
On Thu, Aug 22, 2019 at 12:03:33PM +0100, Kyrill Tkachov wrote:
> Hi Dennis,
> 
> On 8/21/19 10:27 AM, Dennis Zhang wrote:
> > Hi all,
> >
> > This patch adds '-mcpu' options for following CPUs:
> > Cortex-A77, Cortex-A76AE, Cortex-A65, Cortex-A65AE, and Cortex-A34.
> >
> > Related specifications are as following:
> > https://developer.arm.com/ip-products/processors/cortex-a
> >
> > Bootstrapped/regtested for aarch64-none-linux-gnu.
> >
> > Please help to check if it's ready.
> >
> This looks ok to me but you'll need maintainer approval.

At this point Kyrill, I fully trust your OK without looking at the
patch in any more detail...

I think at Cauldron we ought to add some time during the Arm/AArch64 BoF
to discuss what the community would like us to do about maintainership in
AArch64. It seems clear to me that I'm just slowing you and others down now
by rubberstamping your decisions.

To be clear, this particular patch is OK for trunk - but I think it is
time to have a conversation about how we can make this experience easier
for everyone.

Thanks,
James

> 
> Thanks,
> 
> Kyrill
> 
> 
> > Many thanks!
> > Dennis
> >
> > gcc/ChangeLog:
> >
> > 2019-08-21  Dennis Zhang  
> >
> >     * config/aarch64/aarch64-cores.def (AARCH64_CORE): New entries
> >     for Cortex-A77, Cortex-A76AE, Cortex-A65, Cortex-A65AE, and
> >     Cortex-A34.
> >     * config/aarch64/aarch64-tune.md: Regenerated.
> >     * doc/invoke.texi: Document the new processors.


Re: [PATCH][AArch64] Add Linux hwcap strings for some extensions

2019-09-02 Thread James Greenhalgh
On Fri, Aug 23, 2019 at 05:42:30PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch adds feature strings for some of the extensions. This string 
> is what is read from /proc/cpuinfo on Linux systems
> and used during -march=native detection.
> 
> The strings are taken from the kernel source tree at:
> https://github.com/torvalds/linux/blob/master/arch/arm64/kernel/cpuinfo.c#L45
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

OK.

Thanks,
James

> Thanks,
> Kyrill
> 
> 2019-08-23  Kyrylo Tkachov  
> 
>      * config/aarch64/aarch64-option-extensions.def (sb): Add feature
>      string.
>      (ssbs): Likewise.
>      (sve2): Likewise.
>      (sve2-sm4): Likewise.
>      (sveaes): Likewise.
>      (svesha3): Likewise.
>      (svebitperm): Likewise.
> 


Re: [PATCH][AArch64] Add support for __jcvt intrinsic

2019-09-02 Thread James Greenhalgh
On Mon, Sep 02, 2019 at 01:16:32PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch implements the __jcvt ACLE intrinsic [1] that maps down to 
> the FJCVTZS [2] instruction from Armv8.3-a.
> No fancy mode iterators or nothing. Just a single builtin, UNSPEC and 
> define_insn and the associate plumbing.
> This patch also defines __ARM_FEATURE_JCVT to indicate when the 
> intrinsic is available.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

OK.

Thanks,
James
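
A usage sketch (illustrative, not from the patch):

#include <arm_acle.h>
#include <stdint.h>

/* Requires -march=armv8.3-a or later.  __jcvt maps to FJCVTZS, which
   rounds toward zero and wraps modulo 2^32 like ECMAScript's ToInt32,
   rather than saturating as a plain FCVTZS would.  */
int32_t
js_to_int32 (double d)
{
  return __jcvt (d);
}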

> Thanks,
> Kyrill
> 
> [1] https://developer.arm.com/docs/101028/latest/data-processing-intrinsics
> [2] 
> https://developer.arm.com/docs/ddi0596/latest/simd-and-floating-point-instructions-alphabetic-order/fjcvtzs-floating-point-javascript-convert-to-signed-fixed-point-rounding-toward-zero
> 
> 2019-09-02  Kyrylo Tkachov  
> 
>      * config/aarch64/aarch64.md (UNSPEC_FJCVTZS): Define.
>      (aarch64_fjcvtzs): New define_insn.
>      * config/aarch64/aarch64.h (TARGET_JSCVT): Define.
>      * config/aarch64/aarch64-builtins.c (aarch64_builtins):
>      Add AARCH64_JSCVT.
>      (aarch64_init_builtins): Initialize __builtin_aarch64_jcvtzs.
>      (aarch64_expand_builtin): Handle AARCH64_JSCVT.
>      * config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
>      __ARM_FEATURE_JCVT where appropriate.
>      * config/aarch64/arm_acle.h (__jcvt): Define.
> 
> 2019-09-02  Kyrylo Tkachov  
> 
>      * gcc.target/aarch64/acle/jcvt_1.c: New test.
> 


Re: [PATCH][AArch64] Expand DImode constant stores to two SImode stores when profitable

2019-08-21 Thread James Greenhalgh
On Mon, Oct 24, 2016 at 03:27:10PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> When storing a 64-bit immediate that has equal bottom and top halves we 
> currently
> synthesize the repeating 32-bit pattern twice and perform a single X-store.
> With this patch we synthesize the 32-bit pattern once into a W register and 
> store
> that twice using an STP. This reduces codesize bloat from synthesising the 
> same
> constant multiple times at the expense of converting a store to a store-pair.
> It will only trigger if we can save two or more instructions, so it will only 
> transform:
>  mov x1, 49370
> >  movk	x1, 0xc0da, lsl 32
>  str x1, [x0]
> 
> into:
> 
>  mov w1, 49370
>  stp w1, w1, [x0]
> 
> when optimising for -Os, whereas it will always transform a 4-insn synthesis
> sequence into a two-insn sequence + STP (see comments in the patch).
> 
> This patch triggers already but will trigger more with the store merging pass
> that I'm working on since that will generate more of these repeating 64-bit 
> constants.
> This helps improve codegen on 456.hmmer where store merging can sometimes 
> create very
> complex repeating constants and target-specific expand needs to break them 
> down.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

Hi Kyrill,

Does this do the right thing for:

  void bar(u64 *x)
  {
*(volatile u64 *)x = 0xabcdef10abcdef10;
  }

C.f. 
https://lore.kernel.org/lkml/20190821103200.kpufwtviqhpbuv2n@willie-the-truck/

i.e. is this optimization still valid for volatile?

Thanks,
James

> 
> Thanks,
> Kyrill
> 
> 2016-10-24  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64.md (mov): Call
>  aarch64_split_dimode_const_store on DImode constant stores.
>  * config/aarch64/aarch64-protos.h (aarch64_split_dimode_const_store):
>  New prototype.
>  * config/aarch64/aarch64.c (aarch64_split_dimode_const_store): New
>  function.
> 
> 2016-10-24  Kyrylo Tkachov  
> 
>  * gcc.target/aarch64/store_repeating_constant_1.c: New test.
>  * gcc.target/aarch64/store_repeating_constant_2.c: Likewise.



Re: [AArch64] Tweak handling of fp moves via int registers

2019-08-19 Thread James Greenhalgh
On Wed, Aug 07, 2019 at 07:12:19PM +0100, Richard Sandiford wrote:
> The AArch64 port uses define_splits to prefer moving certain float
> constants via integer registers over loading them from memory.  E.g.:
> 
> (set (reg:SF X) (const_double:SF C))
> 
> splits to:
> 
> (set (reg:SI tmp) (const_int C'))
> (set (reg:SF X) (subreg:SF (reg:SI tmp) 0))
> 
> The problem with using splits for this -- especially when the split
> instruction is a constant move -- is that the original form is still
> valid and can be recreated by later pre-RA passes.  (And I think that's
> a valid thing for them to do, since they're folding away what appears in
> rtl terms to be a redundant instruction.)
> 
> One pass that can do this is ira's combine_and_move_insns, which among
> other things looks for registers that are set once and used once.
> If the register is set to a rematerialisable value, the code tries
> to fold that value into the single use.
> 
> We don't normally see this effect at -O2 and above because
> combine_and_move_insns isn't run when -fsched-pressure is enabled
> (which it is by default on AArch64).  But arguably the combine part is
> useful independently of -fsched-pressure, and only the move part is
> suspect.  So I don't think we should rely on the combination not
> happening here.
> 
> The new tests demonstrate the problem by running the original tests
> at -O instead of -O2.
> 
> This patch does the optimisation by splitting the moves at generation
> time and rejecting the combined form while the split is still possible.
> REG_EQUAL notes on the second move still give the original floating-point
> value for passes that need it.
> 
> Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
> OK to install?

OK.

Thanks,
James

> Richard
> 
> 
> 2019-08-07  Richard Sandiford  
> 
> gcc/
>   * config/aarch64/aarch64-protos.h (aarch64_move_float_via_int_p):
>   Declare.
>   * config/aarch64/aarch64.c (aarch64_move_float_via_int_p): New
>   function, extracted from the GPF_HF move splitter.
>   * config/aarch64/aarch64.md: Remove GPF_HF move splitter.
>   (mov): Move via an integer register if
>   aarch64_move_float_via_int_p.
>   (*movhf_aarch64, *movsf_aarch64, *movdf_aarch64): Check
>   aarch64_move_float_via_int_p.
>   * config/aarch64/iterators.md (fcvt_target): Handle TI and TF.
>   (FCVT_TARGET): Likewise.
> 
> gcc/testsuite/
>   * gcc.target/aarch64/dbl_mov_immediate_2.c: New test.
>   * gcc.target/aarch64/f16_mov_immediate_5.c: Likewise.
>   * gcc.target/aarch64/flt_mov_immediate_2.c: Likewise.


Re: PR90724 - ICE with __sync_bool_compare_and_swap with -march=armv8.2-a

2019-08-19 Thread James Greenhalgh
On Thu, Aug 15, 2019 at 02:11:25PM +0100, Prathamesh Kulkarni wrote:
> On Thu, 8 Aug 2019 at 11:22, Prathamesh Kulkarni
>  wrote:
> >
> > On Thu, 1 Aug 2019 at 15:34, Prathamesh Kulkarni
> >  wrote:
> > >
> > > On Thu, 25 Jul 2019 at 11:56, Prathamesh Kulkarni
> > >  wrote:
> > > >
> > > > On Wed, 17 Jul 2019 at 18:15, Prathamesh Kulkarni
> > > >  wrote:
> > > > >
> > > > > On Wed, 17 Jul 2019 at 13:45, Kyrill Tkachov
> > > > >  wrote:
> > > > > >
> > > > > > Hi Prathamesh
> > > > > >
> > > > > > On 7/10/19 12:24 PM, Prathamesh Kulkarni wrote:
> > > > > > > Hi,
> > > > > > > For following test-case,
> > > > > > > static long long AL[24];
> > > > > > >
> > > > > > > int
> > > > > > > check_ok (void)
> > > > > > > {
> > > > > > >   return (__sync_bool_compare_and_swap (AL+1, 0x200000003ll,
> > > > > > > 0x1234567890ll));
> > > > > > > }
> > > > > > >
> > > > > > > Compiling with -O2 -march=armv8.2-a results in:
> > > > > > > pr90724.c: In function ‘check_ok’:
> > > > > > > pr90724.c:7:1: error: unrecognizable insn:
> > > > > > > 7 | }
> > > > > > >   | ^
> > > > > > > (insn 11 10 12 2 (set (reg:CC 66 cc)
> > > > > > > (compare:CC (reg:DI 95)
> > > > > > > (const_int 8589934595 [0x200000003]))) 
> > > > > > > "pr90724.c":6:11 -1
> > > > > > >  (nil))
> > > > > > >
> > > > > > > IIUC, the issue is that 0x200000003 falls outside the range of
> > > > > > > allowable immediate in cmp?  If it's replaced by a small constant 
> > > > > > > then
> > > > > > > it works.
> > > > > > >
> > > > > > > The ICE results with -march=armv8.2-a because, we enter if
> > > > > > > (TARGET_LSE) { ... } condition
> > > > > > > in aarch64_expand_compare_and_swap, while with -march=armv8-a it 
> > > > > > > goes
> > > > > > > into else,
> > > > > > > which forces oldval into register if the predicate fails to match.
> > > > > > >
> > > > > > > The attached patch checks if y (oldval) satisfies 
> > > > > > > aarch64_plus_operand
> > > > > > > predicate and if not, forces it to be in register, which resolves 
> > > > > > > ICE.
> > > > > > > Does it look OK ?
> > > > > > >
> > > > > > > Bootstrap+testing in progress on aarch64-linux-gnu.
> > > > > > >
> > > > > > > PS: The issue has nothing to do with SVE, which I incorrectly
> > > > > > > mentioned in bug report.
> > > > > > >
> > > > > > This looks ok to me (but you'll need maintainer approval).
> > > > > >
> > > > > > Does this fail on the branches as well?
> > > > > Hi Kyrill,
> > > > > Thanks for the review. The test also fails on gcc-9-branch (but not 
> > > > > on gcc-8).
> > > > Hi James,
> > > > Is the patch OK to commit?
> > > > https://gcc.gnu.org/ml/gcc-patches/2019-07/msg00793.html
> > > ping * 3: https://gcc.gnu.org/ml/gcc-patches/2019-07/msg00793.html
> > ping * 4: https://gcc.gnu.org/ml/gcc-patches/2019-07/msg00793.html
> ping * 5: https://gcc.gnu.org/ml/gcc-patches/2019-07/msg00793.html

Hi,

Sorry, this missed my filters as it didn't mention AArch64 in the subject
line.

This is good for trunk, thanks for waiting.

James



Re: [PING][AArch64] Use scvtf fbits option where appropriate

2019-08-19 Thread James Greenhalgh
On Mon, Jul 08, 2019 at 04:41:06PM +0100, Joel Hutton wrote:
> On 01/07/2019 18:03, James Greenhalgh wrote:
> 
> >> gcc/testsuite/ChangeLog:
> >>
> >> 2019-06-12  Joel Hutton  
> >>
> >>   * gcc.target/aarch64/fmul_scvtf_1.c: New test.
> > This testcase will fail on ILP32 targets where unsigned long will still
> > live in a 'w' register.
> Updated to use long long and unsigned long long.

Sorry, this slipped through the cracks.

OK for trunk.

Thanks,
James


> 
> Joel
> 

> From e10d5fdb9430799cd2050b8a2f567d1b4e43cde1 Mon Sep 17 00:00:00 2001
> From: Joel Hutton 
> Date: Mon, 8 Jul 2019 11:59:50 +0100
> Subject: [PATCH] SCVTF
> 
> ---
>  gcc/config/aarch64/aarch64-protos.h   |   1 +
>  gcc/config/aarch64/aarch64.c  |  23 +++
>  gcc/config/aarch64/aarch64.md |  39 +
>  gcc/config/aarch64/constraints.md |   7 +
>  gcc/config/aarch64/predicates.md  |   4 +
>  .../gcc.target/aarch64/fmul_scvtf_1.c | 140 ++
>  6 files changed, 214 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c
> 


Re: [patch][aarch64]: add intrinsics for vld1(q)_x4 and vst1(q)_x4

2019-08-19 Thread James Greenhalgh
On Thu, Aug 15, 2019 at 12:28:27PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> On 8/6/19 10:51 AM, Richard Earnshaw (lists) wrote:
> On 18/07/2019 18:18, James Greenhalgh wrote:
> > On Mon, Jun 10, 2019 at 06:21:05PM +0100, Sylvia Taylor wrote:
> >> Greetings,
> >>
> >> This patch adds the intrinsic functions for:
> >> - vld1_<mode>_x4
> >> - vst1_<mode>_x4
> >> - vld1q_<mode>_x4
> >> - vst1q_<mode>_x4
> >>
> >> Bootstrapped and tested on aarch64-none-linux-gnu.
> >>
> >> Ok for trunk? If yes, I don't have any commit rights, so can someone
> >> please commit it on my behalf.
> >
> > Hi,
> >
> > I'm concerned by this strategy for implementing the arm_neon.h builtins:
> >
> >> +__extension__ extern __inline int8x8x4_t
> >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >> +vld1_s8_x4 (const int8_t *__a)
> >> +{
> >> +  union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
> >> +  __au.__o
> >> += __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) 
> >> __a);
> >> +  return __au.__i;
> >> +}
> >
> > As far as I know this is undefined behaviour in C++11. This was the best
> > resource I could find pointing to the relevant standards paragraphs.
> >
> >
> > https://stackoverflow.com/questions/11373203/accessing-inactive-union-member-and-undefined-behavior
> >
> > That said, GCC explicitly allows it, so maybe this is fine?
> >
> >
> > https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/Optimize-Options.html#Type-punning
> >
> > Can anyone from the languages side chime in on whether we're exposing
> > undefined behaviour (in either C or C++) here?
> 
> Yes, this is a GNU extension.  My only question is whether or not this
> can be disabled within GCC if you're trying to check for strict
> standards conformance of your code?  And if so, is there a way of making
> sure that this header still works in that case?  A number of GNU
> extensions can be protected with __extension__ but it's not clear how
> that could be applied in this case.  Perhaps the outer __extension__ on
> the function will already do that.
> 
> 
> It should still work. The only relevant flag is -fstrict-aliasing and it is
> documented to preserve this case:
> 
> https://gcc.gnu.org/onlinedocs/gcc-9.2.0/gcc/Optimize-Options.html#Optimize-Options
> 
> Note that we've already been using this idiom in arm_neon.h since 2014 [1]
> and it's worked fine.

Based on that input, this is OK for trunk.

Thanks,
James

> 
> Thanks,
> 
> Kyrill
> 
> [1] http://gcc.gnu.org/r209880
> 
> 
> 
> R.


Re: [PATCH][AArch64] Increase default function alignment

2019-08-12 Thread James Greenhalgh
On Fri, May 31, 2019 at 12:52:32PM +0100, Wilco Dijkstra wrote:
> With -mcpu=generic the function alignment is currently 8, however almost all
> supported cores prefer 16 or higher, so increase the default to 16:12.
> This gives ~0.2% performance increase on SPECINT2017, while codesize is 0.12%
> larger.

OK.

Thanks,
James

> ChangeLog:
> 2019-05-31  Wilco Dijkstra  
> 
>   * config/aarch64/aarch64.c (generic_tunings): Set function alignment to 
> 16.
> 
> --
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 0023cb37bbae5afe9387840c1bb6b43586d4fac2..ed1422af6aab5e3c6eeea37ec57e69b64092a0ab
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -693,7 +693,7 @@ static const struct tune_params generic_tunings =
>4, /* memmov_cost  */
>2, /* issue_rate  */
>(AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
> -  "8",   /* function_align.  */
> +  "16:12",   /* function_align.  */
>"4",   /* jump_align.  */
>"8",   /* loop_align.  */
>2, /* int_reassoc_width.  */
> 
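
For readers unfamiliar with the "16:12" syntax above: it is GCC's N:M
alignment format (as for -falign-functions), i.e. align to 16 bytes,
but only when at most 11 padding bytes are needed.  A rough model of
the rule (hypothetical helper, not GCC code):

unsigned int
apply_align (unsigned int addr, unsigned int n, unsigned int m)
{
  unsigned int pad = (n - addr % n) % n;	/* bytes to the next n-boundary */
  return pad < m ? addr + pad : addr;		/* align only if pad <= m-1 */
}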


Re: [PATCH][aarch64] Use neoversen1 tuning struct for -mcpu=cortex-a76

2019-08-12 Thread James Greenhalgh
On Tue, Jul 30, 2019 at 05:59:15PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> The neoversen1 tuning struct gives better performance on the Cortex-A76, 
> so use that.
> The only difference from the current tuning is the function and label 
> alignment settings.
> 
> This gives about 1.3% improvement on SPEC2006 int and 0.3% on SPEC2006 fp.
> 
> Tested on aarch64-none-elf.
> 
> Ok for trunk?

OK.

Thanks,
James

> Thanks,
> Kyrill
> 
> 2019-07-31  Kyrylo Tkachov  
> 
>      * config/aarch64/aarch64-cores.def (cortex-a76): Use neoversen1 tuning
>      struct.
> 


Re: [PATCH][AArch64] Fix PR81800

2019-08-12 Thread James Greenhalgh
On Tue, May 28, 2019 at 06:11:29PM +0100, Wilco Dijkstra wrote:
> PR81800 is about the lrint inline giving spurious FE_INEXACT exceptions.
> The previous change for PR81800 didn't fix this: when lrint is disabled
> in the backend, the midend will simply use llrint.  This actually makes
> things worse since llrint now also ignores FE_INVALID exceptions!
> The fix is to disable lrint/llrint on double if the size of a long is
> smaller (ie. ilp32).
> 
> Passes regress and bootstrap on AArch64. OK for commit?

OK.

Thanks,
James

> 
> ChangeLog
> 2018-11-13  Wilco Dijkstra
> 
> gcc/
>   PR target/81800
>   * gcc/config/aarch64/aarch64.md (lrint<GPF:mode><GPI:mode>2): Disable
>   lrint pattern if GPF
>   operand is larger than a long int.
> 
> testsuite/
>   PR target/81800
>   * gcc.target/aarch64/no-inline-lrint_3.c: New test.
> 
> --
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> 5a1894063a1ed2db1cc947c9c449d48808ed96ae..f08cd0930b3fc6527fbca218ad3c464f1ead0103
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -6304,7 +6304,7 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>[(match_operand:GPI 0 "register_operand")
> (match_operand:GPF 1 "register_operand")]
>"TARGET_FLOAT
> -   && ((GET_MODE_SIZE (<GPF:MODE>mode) <= GET_MODE_SIZE (<GPI:MODE>mode))
> +   && ((GET_MODE_BITSIZE (<GPF:MODE>mode) <= LONG_TYPE_SIZE)
> || !flag_trapping_math || flag_fp_int_builtin_inexact)"
>  {
>rtx cvt = gen_reg_rtx (<GPI:MODE>mode);
> diff --git a/gcc/testsuite/gcc.target/aarch64/no-inline-lrint_3.c 
> b/gcc/testsuite/gcc.target/aarch64/no-inline-lrint_3.c
> new file mode 100644
> index 
> ..ca772cb999e7b6cfbd3f080111d3eb479d43f47b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/no-inline-lrint_3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target ilp32 } */
> +/* { dg-options "-O3 -fno-math-errno -fno-fp-int-builtin-inexact" } */
> +
> +#define TEST(name, float_type, int_type, fn) void f_##name (float_type x) \
> +{								\
> +  volatile int_type b = __builtin_##fn (x);			\
> +}
> +
> +TEST (dld, double, long, lrint)
> +TEST (flf, float , long, lrintf)
> +
> +TEST (did, double, int, lrint)
> +TEST (fif, float , int, lrintf)
> +
> +/* { dg-final { scan-assembler-times "fcvtzs\tw\[0-9\]+, \[d,s\]\[0-9\]+" 2 } } */
> +/* { dg-final { scan-assembler-times "bl\tlrint" 2 } } */
> 
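
An illustration of the underlying problem (not from the patch): on
ILP32, long is 32 bits, so inlining lrint on a double operand could
both truncate and lose the FE_INVALID exception that the out-of-line
call raises on overflow:

#include <math.h>

long
round_to_long (double x)
{
  return lrint (x);	/* must stay "bl lrint" for -mabi=ilp32 */
}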


Re: [AArch64] Add a "y" constraint for V0-V7

2019-08-12 Thread James Greenhalgh
On Wed, Aug 07, 2019 at 07:19:12PM +0100, Richard Sandiford wrote:
> Some indexed SVE FCMLA operations have a 3-bit register field that
> requires one of Z0-Z7.  This patch adds a public "y" constraint for that.
> 
> The patch also documents "x", which is again intended to be a public
> constraint.
> 
> Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
> OK to install?


I had the vague recollection that 'y' already meant something... I'm
guessing you already checked, but just in case, please check.

Otherwise, this is OK.

Thanks,
James
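
A usage sketch of the new constraint (hypothetical example, not from
the patch):

#include <arm_neon.h>

/* "y" restricts the operand to V0-V7 (Z0-Z7 for SVE), matching the
   3-bit register field of indexed FCMLA; "w" would allow V0-V31.  */
float32x4_t
force_low_register (float32x4_t x)
{
  __asm__ ("" : "+y" (x));	/* x is allocated to v0-v7 here.  */
  return x;
}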


> 
> Richard
> 
> 
> 2019-08-07  Richard Sandiford  
> 
> gcc/
>   * doc/md.texi: Document the x and y constraints for AArch64.
>   * config/aarch64/aarch64.h (FP_LO8_REGNUM_P): New macro.
>   (FP_LO8_REGS): New reg_class.
>   (REG_CLASS_NAMES, REG_CLASS_CONTENTS): Add an entry for FP_LO8_REGS.
>   * config/aarch64/aarch64.c (aarch64_hard_regno_nregs)
>   (aarch64_regno_regclass, aarch64_class_max_nregs): Handle FP_LO8_REGS.
>   * config/aarch64/predicates.md (aarch64_simd_register): Use
>   FP_REGNUM_P instead of checking the classes manually.
>   * config/aarch64/constraints.md (y): New constraint.
> 
> gcc/testsuite/
>   * gcc.target/aarch64/asm-x-constraint-1.c: New test.
>   * gcc.target/aarch64/asm-y-constraint-1.c: Likewise.
> 


Re: [AArch64] Make aarch64_classify_vector_mode use a switch statement

2019-08-12 Thread James Greenhalgh
On Wed, Aug 07, 2019 at 07:24:18PM +0100, Richard Sandiford wrote:
> aarch64_classify_vector_mode used properties of a mode to test whether
> the mode was a single Advanced SIMD vector, a single SVE vector, or a
> tuple of SVE vectors.  That works well for current trunk and is simpler
> than checking for modes by name.
> 
> However, for the ACLE and for planned autovec improvements, we also
> need partial SVE vector modes that hold:
> 
> - half of the available 32-bit elements
> - a half or quarter of the available 16-bit elements
> - a half, quarter, or eighth of the available 8-bit elements
> 
> These should be packed in memory and unpacked in registers.  E.g.
> VNx2SI has half the number of elements of VNx4SI, and so is half the
> size in memory.  When stored in registers, each VNx2SI element occupies
> the low 32 bits of a VNx2DI element, with the upper bits being undefined.
> 
> The upshot is that:
> 
>   GET_MODE_SIZE (VNx4SImode) == 2 * GET_MODE_SIZE (VNx2SImode)
> 
> since GET_MODE_SIZE must always be the memory size.  This in turn means
> that for fixed-length SVE, some partial modes can have the same size as
> Advanced SIMD modes.  We then need to be specific about which mode we're
> dealing with.
> 
> This patch prepares for that by switching based on the mode instead
> of querying properties.
> 
> A later patch makes sure that Advanced SIMD modes always win over
> partial SVE vector modes in normal queries.
> 
> Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
> OK to install?

OK.

Thanks,
James
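
A simplified, self-contained sketch of the stylistic change (the mode
and class names below are invented stand-ins for the ones in
aarch64.c):

enum vec_class { VEC_NONE, VEC_ADVSIMD, VEC_SVE_DATA };
enum mode_id { M_V2SI, M_V4SI, M_VNX4SI, M_VNX2SI, M_DI };

/* Switch on the mode identity itself rather than querying its size
   or scalability; this stays correct once partial SVE modes can
   share a size with Advanced SIMD modes.  */
static enum vec_class
classify_vector_mode (enum mode_id mode)
{
  switch (mode)
    {
    case M_V2SI:
    case M_V4SI:
      return VEC_ADVSIMD;	/* Advanced SIMD vectors.  */
    case M_VNX4SI:
    case M_VNX2SI:
      return VEC_SVE_DATA;	/* Full or partial SVE vectors.  */
    default:
      return VEC_NONE;
    }
}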

> 
> Richard
> 
> 
> 2019-08-07  Richard Sandiford  
> 
> gcc/
>   * config/aarch64/aarch64.c (aarch64_classify_vector_mode): Switch
>   based on the mode instead of testing properties of it.


Re: [AArch64] Make <perm_insn> the complete mnemonic

2019-08-12 Thread James Greenhalgh
On Wed, Aug 07, 2019 at 08:23:48PM +0100, Richard Sandiford wrote:
> The Advanced SIMD and SVE permute patterns both split the permute
> operation into a base name and a hilo suffix.  That works well, but it
> means that for "@" patterns, we need to pass the permute code twice,
> once for the base name and once for the suffix.
> 
> Having a unified name avoids that and also makes the definitions
> slightly simpler.
> 
> Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
> OK to install?

OK.

Thanks,
James

> 
> 2019-08-07  Richard Sandiford  
> 
> gcc/
>   * config/aarch64/iterators.md (perm_insn): Include the "1"/"2" suffix.
>   (perm_hilo): Remove UNSPEC_ZIP*, UNSPEC_TRN* and UNSPEC_UZP*.
>   * config/aarch64/aarch64-simd.md
>   (aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): Rename to...
>   (aarch64_<PERMUTE:perm_insn><mode>): ...this and remove perm_hilo
>   from the asm template.
>   * config/aarch64/aarch64-sve.md
>   (aarch64_<perm_insn><perm_hilo><mode>): Rename to...
>   (aarch64_<perm_insn><mode>): ...this and remove perm_hilo
>   from the asm template.
>   (aarch64_<perm_insn><perm_hilo><PRED_ALL:mode>): Rename to...
>   (aarch64_<perm_insn><PRED_ALL:mode>): ...this and remove perm_hilo
>   from the asm template.
>   * config/aarch64/aarch64-simd-builtins.def: Update comment.


Re: [PATCH, GCC, AArch64] Enable Transactional Memory Extension

2019-07-22 Thread James Greenhalgh
On Wed, Jul 10, 2019 at 07:55:42PM +0100, Sudakshina Das wrote:
> Hi
> 
> This patch enables the new Transactional Memory Extension announced 
> recently as part of Arm's new architecture technologies.
> We introduce a new optional extension "tme" to enable this. The 
> following instructions are part of the extension:
> * tstart 
> * ttest 
> * tcommit
> * tcancel #
> The documentation for the above can be found here:
> https://developer.arm.com/docs/ddi0602/latest/base-instructions-alphabetic-order
> 
> We have also added ACLE intrinsics for the instructions above according to:
> https://developer.arm.com/docs/101028/latest/transactional-memory-extension-tme-intrinsics
> 
> Builds and regression tested on aarch64-none-linux-gnu and added new 
> tests for the new instructions.
> 
> Is this okay for trunk?

This looks good to me.

OK for trunk.

Thanks,
James
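
A usage sketch of the new intrinsics (hypothetical example; assumes
-march=armv8-a+tme):

#include <arm_acle.h>
#include <stdint.h>

/* __tstart returns 0 when the transaction starts; on abort, control
   resumes after __tstart with a nonzero status built from the
   _TMFAILURE_* bits.  Real code would retry or fall back to a lock.  */
int
try_increment (int *counter)
{
  uint64_t status = __tstart ();
  if (status == 0)
    {
      ++*counter;
      __tcommit ();
      return 1;					/* committed */
    }
  return (status & _TMFAILURE_RTRY) ? -1 : 0;	/* retryable or not */
}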

> 
> Thanks
> Sudi
> 
> *** gcc/ChangeLog ***
> 
> 2019-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64-builtins.c (enum aarch64_builtins): Add
>   AARCH64_TME_BUILTIN_TSTART, AARCH64_TME_BUILTIN_TCOMMIT,
>   AARCH64_TME_BUILTIN_TTEST and AARCH64_TME_BUILTIN_TCANCEL.
>   (aarch64_init_tme_builtins): New.
>   (aarch64_init_builtins): Call aarch64_init_tme_builtins.
>   (aarch64_expand_builtin_tme): New.
>   (aarch64_expand_builtin): Handle TME builtins.
>   * config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
>   __ARM_FEATURE_TME when enabled.
>   * config/aarch64/aarch64-option-extensions.def: Add "tme".
>   * config/aarch64/aarch64.h (AARCH64_FL_TME, AARCH64_ISA_TME): New.
>   (TARGET_TME): New.
>   * config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_TTEST.
>   (define_c_enum "unspecv"): Add UNSPECV_TSTART, UNSPECV_TCOMMIT and
>   UNSPECV_TCANCEL.
>   (tstart, ttest, tcommit, tcancel): New instructions.
>   * config/aarch64/arm_acle.h (__tstart, __tcommit): New.
>   (__tcancel, __ttest): New.
>   (_TMFAILURE_REASON, _TMFAILURE_RTRY, _TMFAILURE_CNCL): New macro.
>   (_TMFAILURE_MEM, _TMFAILURE_IMP, _TMFAILURE_ERR): Likewise.
>   (_TMFAILURE_SIZE, _TMFAILURE_NEST, _TMFAILURE_DBG): Likewise.
>   (_TMFAILURE_INT, _TMFAILURE_TRIVIAL): Likewise.
>   * config/arm/types.md: Add new tme type attr.
>   * doc/invoke.texi: Document "tme".
> 
> *** gcc/testsuite/ChangeLog ***
> 
> 2019-xx-xx  Sudakshina Das  
> 
>   * gcc.target/aarch64/acle/tme.c: New test.
>   * gcc.target/aarch64/pragma_cpp_predefs_2.c: New test.

> diff --git a/gcc/config/aarch64/aarch64-builtins.c 
> b/gcc/config/aarch64/aarch64-builtins.c
> index 
> 549a6c249243372eacb5d29923b5d1abce4ac79a..16c1d42ea2be0f477692be592e30ba8ce27f05a7
>  100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -438,6 +438,11 @@ enum aarch64_builtins
>/* Special cased Armv8.3-A Complex FMA by Lane quad Builtins.  */
>AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE,
>AARCH64_SIMD_FCMLA_LANEQ_BUILTINS
> +  /* TME builtins.  */
> +  AARCH64_TME_BUILTIN_TSTART,
> +  AARCH64_TME_BUILTIN_TCOMMIT,
> +  AARCH64_TME_BUILTIN_TTEST,
> +  AARCH64_TME_BUILTIN_TCANCEL,
>AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1067,6 +1072,35 @@ aarch64_init_pauth_hint_builtins (void)
>   NULL_TREE);
>  }
>  
> +/* Initialize the transactional memory extension (TME) builtins.  */
> +static void
> +aarch64_init_tme_builtins (void)
> +{
> +  tree ftype_uint64_void
> += build_function_type_list (uint64_type_node, NULL);
> +  tree ftype_void_void
> += build_function_type_list (void_type_node, NULL);
> +  tree ftype_void_uint64
> += build_function_type_list (void_type_node, uint64_type_node, NULL);
> +
> +  aarch64_builtin_decls[AARCH64_TME_BUILTIN_TSTART]
> += add_builtin_function ("__builtin_aarch64_tstart", ftype_uint64_void,
> + AARCH64_TME_BUILTIN_TSTART, BUILT_IN_MD,
> + NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_TME_BUILTIN_TTEST]
> += add_builtin_function ("__builtin_aarch64_ttest", ftype_uint64_void,
> + AARCH64_TME_BUILTIN_TTEST, BUILT_IN_MD,
> + NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCOMMIT]
> += add_builtin_function ("__builtin_aarch64_tcommit", ftype_void_void,
> + AARCH64_TME_BUILTIN_TCOMMIT, BUILT_IN_MD,
> + NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCANCEL]
> += add_builtin_function ("__builtin_aarch64_tcancel", ftype_void_uint64,
> + AARCH64_TME_BUILTIN_TCANCEL, BUILT_IN_MD,
> + NULL, NULL_TREE);
> +}
> +
>  void
>  aarch64_init_builtins (void)
>  {
> @@ -1104,6 +1138,9 @@ aarch64_init_builtins (void)
>   register them.  */
>if (!TARGET_ILP32)
>  aarch64_init_pauth_hint_builtins ();
> +
> +  if (TARGET_TME)
> +

Re: [patch][aarch64]: add usra and ssra combine patterns

2019-07-22 Thread James Greenhalgh
On Mon, Jun 17, 2019 at 05:42:45PM +0100, Sylvia Taylor wrote:
> Updating patch with missing scan-assembler checks.

This is OK. I committed it on your behalf as r273703.

Thanks,
James

> Cheers,
> Syl
> 
> -Original Message-
> From: Sylvia Taylor 
> Sent: 04 June 2019 12:24
> To: James Greenhalgh 
> Cc: Richard Earnshaw ; Marcus Shawcroft 
> ; gcc-patches@gcc.gnu.org; nd 
> Subject: RE: [patch][aarch64]: add usra and ssra combine patterns
> 
> Hi James,
> 
> I've managed to remove the odd redundant git diff change.
> 
> Regarding aarch64_<sur>sra_n<mode>, this patch shouldn't affect it.
> 
> I am also not aware of any way of enabling this combine inside the pattern 
> used for those intrinsics, so I kept them separate.
> 
> Cheers,
> Syl
> 
> -Original Message-
> From: James Greenhalgh 
> Sent: 03 June 2019 11:20
> To: Sylvia Taylor 
> Cc: Richard Earnshaw ; Marcus Shawcroft 
> ; gcc-patches@gcc.gnu.org; nd 
> Subject: Re: [patch][aarch64]: add usra and ssra combine patterns
> 
> On Thu, May 30, 2019 at 03:25:19PM +0100, Sylvia Taylor wrote:
> > Greetings,
> > 
> > This patch adds support to combine:
> > 
> > 1) ushr and add into usra, example:
> > 
> > ushr	v0.16b, v0.16b, 2
> > add	v0.16b, v0.16b, v2.16b
> > ---
> > usra	v2.16b, v0.16b, 2
> > 
> > 2) sshr and add into ssra, example:
> > 
> > sshr	v1.16b, v1.16b, 2
> > add	v1.16b, v1.16b, v3.16b
> > ---
> > ssra	v3.16b, v1.16b, 2
> > 
> > Bootstrapped and tested on aarch64-none-linux-gnu.
> > 
> > Ok for trunk? If yes, I don't have any commit rights, so can someone 
> > please commit it on my behalf.
> 
> This patch has an unrelated change to
> aarch64_get_lane_zero_extend Please revert that and 
> resend.
> 
> What changes (if any) should we make to aarch64_<sur>sra_n<mode> based on 
> this patch, and to the vsra_n intrinsics in arm_neon.h?
> 
> Thanks,
> James
> 
> > 
> > Cheers,
> > Syl
> > 
> > gcc/ChangeLog:
> > 
> > 2019-05-30  Sylvia Taylor  
> > 
> > * config/aarch64/aarch64-simd.md
> > (*aarch64_simd_sra): New.
> > * config/aarch64/iterators.md
> > (SHIFTRT): New iterator.
> > (sra_op): New attribute.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 2019-05-30  Sylvia Taylor  
> > 
> > * gcc.target/aarch64/simd/ssra.c: New test.
> > * gcc.target/aarch64/simd/usra.c: New test.
> 


Re: [patch][aarch64]: add intrinsics for vld1(q)_x4 and vst1(q)_x4

2019-07-18 Thread James Greenhalgh
On Mon, Jun 10, 2019 at 06:21:05PM +0100, Sylvia Taylor wrote:
> Greetings,
> 
> This patch adds the intrinsic functions for:
> - vld1_<mode>_x4
> - vst1_<mode>_x4
> - vld1q_<mode>_x4
> - vst1q_<mode>_x4
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk? If yes, I don't have any commit rights, so can someone 
> please commit it on my behalf.

Hi,

I'm concerned by this strategy for implementing the arm_neon.h builtins:

> +__extension__ extern __inline int8x8x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vld1_s8_x4 (const int8_t *__a)
> +{
> +  union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
> +  __au.__o
> += __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a);
> +  return __au.__i;
> +}

As far as I know this is undefined behaviour in C++11. This was the best
resource I could find pointing to the relevant standards paragraphs.

  
https://stackoverflow.com/questions/11373203/accessing-inactive-union-member-and-undefined-behavior

That said, GCC explicitly allows it, so maybe this is fine?

  
https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/Optimize-Options.html#Type-punning

Can anyone from the languages side chime in on whether we're exposing
undefined behaviour (in either C or C++) here?

Thanks,
James
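
A standalone illustration of the idiom in question, together with the
memcpy form that is well defined in ISO C and C++ (struct names are
hypothetical):

#include <string.h>

struct big  { int v[8]; };
struct quad { int v[2][4]; };

/* Write one union member, read the other: undefined in ISO C++, but
   documented as supported by GCC, including under -fstrict-aliasing.  */
struct quad
via_union (struct big b)
{
  union { struct big o; struct quad i; } u;
  u.o = b;
  return u.i;
}

/* The portable alternative, which GCC typically optimises equally well.  */
struct quad
via_memcpy (struct big b)
{
  struct quad q;
  memcpy (&q, &b, sizeof q);
  return q;
}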



> 
> Cheers,
> Syl
> 
> gcc/ChangeLog:
> 
> 2019-06-10  Sylvia Taylor  
> 
>   * config/aarch64/aarch64-simd-builtins.def:
>   (ld1x4): New.
>   (st1x4): Likewise.
>   * config/aarch64/aarch64-simd.md:
>   (aarch64_ld1x4<VALLDIF:mode>): New pattern.
>   (aarch64_st1x4<VALLDIF:mode>): Likewise.
>   (aarch64_ld1_x4_<mode>): Likewise.
>   (aarch64_st1_x4_<mode>): Likewise.
>   * config/aarch64/arm_neon.h:
>   (vld1_s8_x4): New function.
>   (vld1q_s8_x4): Likewise.
>   (vld1_s16_x4): Likewise.
>   (vld1q_s16_x4): Likewise.
>   (vld1_s32_x4): Likewise.
>   (vld1q_s32_x4): Likewise.
>   (vld1_u8_x4): Likewise.
>   (vld1q_u8_x4): Likewise.
>   (vld1_u16_x4): Likewise.
>   (vld1q_u16_x4): Likewise.
>   (vld1_u32_x4): Likewise.
>   (vld1q_u32_x4): Likewise.
>   (vld1_f16_x4): Likewise.
>   (vld1q_f16_x4): Likewise.
>   (vld1_f32_x4): Likewise.
>   (vld1q_f32_x4): Likewise.
>   (vld1_p8_x4): Likewise.
>   (vld1q_p8_x4): Likewise.
>   (vld1_p16_x4): Likewise.
>   (vld1q_p16_x4): Likewise.
>   (vld1_s64_x4): Likewise.
>   (vld1_u64_x4): Likewise.
>   (vld1_p64_x4): Likewise.
>   (vld1q_s64_x4): Likewise.
>   (vld1q_u64_x4): Likewise.
>   (vld1q_p64_x4): Likewise.
>   (vld1_f64_x4): Likewise.
>   (vld1q_f64_x4): Likewise.
>   (vst1_s8_x4): Likewise.
>   (vst1q_s8_x4): Likewise.
>   (vst1_s16_x4): Likewise.
>   (vst1q_s16_x4): Likewise.
>   (vst1_s32_x4): Likewise.
>   (vst1q_s32_x4): Likewise.
>   (vst1_u8_x4): Likewise.
>   (vst1q_u8_x4): Likewise.
>   (vst1_u16_x4): Likewise.
>   (vst1q_u16_x4): Likewise.
>   (vst1_u32_x4): Likewise.
>   (vst1q_u32_x4): Likewise.
>   (vst1_f16_x4): Likewise.
>   (vst1q_f16_x4): Likewise.
>   (vst1_f32_x4): Likewise.
>   (vst1q_f32_x4): Likewise.
>   (vst1_p8_x4): Likewise.
>   (vst1q_p8_x4): Likewise.
>   (vst1_p16_x4): Likewise.
>   (vst1q_p16_x4): Likewise.
>   (vst1_s64_x4): Likewise.
>   (vst1_u64_x4): Likewise.
>   (vst1_p64_x4): Likewise.
>   (vst1q_s64_x4): Likewise.
>   (vst1q_u64_x4): Likewise.
>   (vst1q_p64_x4): Likewise.
>   (vst1_f64_x4): Likewise.
>   (vst1q_f64_x4): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-06-10  Sylvia Taylor  
> 
>   * gcc.target/aarch64/advsimd-intrinsics/vld1x4.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vst1x4.c: New test.



Re: [PATCH][GCC][AArch64] Make processing less fragile in config.gcc

2019-07-08 Thread James Greenhalgh
On Tue, Jun 25, 2019 at 09:30:30AM +0100, Tamar Christina wrote:
> Hi All,
> 
> This is an update to the patch rebased to after the SVE2 options have been 
> merged.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for trunk?

OK.

Thanks,
James

> 
> Thanks,
> Tamar
> 



Re: [patch 1/2][aarch64]: redefine aes patterns

2019-07-08 Thread James Greenhalgh
On Fri, Jul 05, 2019 at 12:24:42PM +0100, Sylvia Taylor wrote:
> Greetings,
> 
> This first patch removes aarch64 usage of the aese/aesmc and aesd/aesimc
> fusions (i.e. aes fusion) implemented in the scheduler due to unpredictable
> behaviour observed in cases such as:
> - when register allocation goes bad (e.g. extra movs)
> - aes operations with xor and zeroed keys among interleaved operations
> 
> A more stable version should be provided by instead doing the aes fusion 
> during the combine pass. Since the aese and aesd patterns have been 
> rewritten as encapsulating a xor operation, the existing combine fusion 
> patterns have also been updated. The purpose is to simplify the need of 
> having additional combine patterns for cases like the ones below:
> 
> For AESE (though it also applies to AESD as both have a xor operation):
> 
> data = data ^ key;
> data = vaeseq_u8(data, zero);
> ---
> eor   v1.16b, v0.16b, v1.16b
> aese  v1.16b, v2.16b
> 
> Should mean and generate the same as:
> 
> data = vaeseq_u8(data, key);
> ---
> aese  v1.16b, v0.16b
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.

OK for trunk.

I couldn't see you in the maintainers file; do you need someone to apply
this on your behalf?

Thanks,
James

> 
> Cheers,
> Syl
> 
> gcc/ChangeLog:
> 
> 2019-07-05  Sylvia Taylor  
> 
>   * config/aarch64/aarch64-simd.md
>   (aarch64_crypto_aesv16qi): Redefine pattern with xor.
>   (aarch64_crypto_aesv16qi): Remove attribute enabled.
>   (*aarch64_crypto_aesv16qi_xor_combine): Remove both.
>   (*aarch64_crypto_aese_fused,
>   *aarch64_crypto_aesd_fused): Update to new definition.
>   * config/aarch64/aarch64.c
>   (aarch_macro_fusion_pair_p): Remove aese/aesmc fusion check.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-07-05  Sylvia Taylor  
> 
>   * gcc.target/aarch64/crypto-fuse-1.c: Remove.
>   * gcc.target/aarch64/crypto-fuse-2.c: Remove.
>   * gcc.target/aarch64/aes-fuse-1.c: New testcase.
>   * gcc.target/aarch64/aes-fuse-2.c: New testcase.




Re: [PING][AArch64] Use scvtf fbits option where appropriate

2019-07-01 Thread James Greenhalgh
On Wed, Jun 26, 2019 at 10:35:00AM +0100, Joel Hutton wrote:
> Ping, plus minor rework (mostly non-functional changes)
> 
> gcc/ChangeLog:
> 
> 2019-06-12  Joel Hutton  
> 
>  * config/aarch64/aarch64-protos.h (aarch64_fpconst_pow2_recip): New
> prototype.
>  * config/aarch64/aarch64.c (aarch64_fpconst_pow2_recip): New function.
>  * config/aarch64/aarch64.md
> (*aarch64_<su_optab>cvtf<fcvt_target><GPF:mode>2_mult): New pattern.

Cool; I learned a new instruction!

>  (*aarch64_<su_optab>cvtf<fcvt_iesize><GPF:mode>2_mult): New pattern.
>  * config/aarch64/constraints.md (Dt): New constraint.
>  * config/aarch64/predicates.md (aarch64_fpconst_pow2_recip): New
> predicate.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-06-12  Joel Hutton  
> 
>  * gcc.target/aarch64/fmul_scvtf_1.c: New test.

This testcase will fail on ILP32 targets where unsigned long will still
live in a 'w' register.

Thanks,
James
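
An illustration of the combination the new pattern catches (sketch,
not from the patch):

/* Dividing a converted integer by a power of two - equivalently,
   multiplying by its reciprocal - can use the fixed-point form of
   scvtf:
	scvtf	s0, w0, #4
   in place of separate scvtf and fmul instructions.  */
float
fixed_point_to_float (int x)
{
  return ((float) x) / 16.0f;
}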



Re: [PATCH][AArch64] Remove constraint strings from define_expand constructs in the back end

2019-07-01 Thread James Greenhalgh
On Mon, Jun 24, 2019 at 04:33:40PM +0100, Dennis Zhang wrote:
> Hi,
> 
> A number of AArch64 define_expand patterns have specified constraints 
> for their operands. But the constraint strings are ignored at expand 
> time and are therefore redundant/useless. We now avoid specifying 
> constraints in new define_expands, but we should clean up the existing 
> define_expand definitions.
> 
> For example, the constraint "=w" is removed in the following case:
> (define_expand "sqrt2"
>[(set (match_operand:GPF_F16 0 "register_operand" "=w")
> The "" marks with an empty constraint in define_expand are removed as well.
> 
> The patch is tested with the build configuration of 
> --target=aarch64-none-linux-gnu, and it passes gcc/testsuite.

This is OK for trunk.

Thanks,
James

> gcc/ChangeLog:
> 
> 2019-06-21  Dennis Zhang  
> 
>   * config/aarch64/aarch64-simd.md: Remove redundant constraints
>   from define_expand.
>   * config/aarch64/aarch64-sve.md: Likewise.
>   * config/aarch64/aarch64.md: Likewise.
>   * config/aarch64/atomics.md: Likewise.




Re: [PATCH][arm/AArch64] Assume unhandled NEON types are neon_arith_basic types when scheduling for Cortex-A57

2019-07-01 Thread James Greenhalgh
On Mon, Jul 01, 2019 at 04:13:40PM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> Some scheduling descriptions, like the Cortex-A57 one, are reused for 
> multiple -mcpu options.
> Sometimes those other -mcpu cores support more architecture features 
> than the Armv8-A Cortex-A57.
> For example, the Cortex-A75 and Cortex-A76 support Armv8.2-A as well as 
> the Dot Product instructions.
> These Dot Product instructions have the neon_dot and neon_dot_q 
> scheduling type, but that type is not
> handled in cortex-a57.md, since the Cortex-A57 itself doesn't need to 
> care about these instructions.
> 
> But if we just ignore the neon_dot(_q) type at scheduling we get really 
> terrible codegen when compiling
> for -mcpu=cortex-a76, for example, because the scheduler just pools all 
> the UDOT instructions at the end
> of the basic block, since it doesn't assume anything about their behaviour.
> 
> This patch ameliorates the situation somewhat by telling the Cortex-A57 
> scheduling model to treat any
> insn that doesn't get assigned a cortex_a57_neon_type but is actually a 
> is_neon_type instruction as
> a simple neon_arith_basic instruction. This allows us to treat 
> post-Armv8-A SIMD instructions more sanely
> without having to model each of them explicitly in cortex-a57.md.
> 
> Bootstrapped and tested on arm-none-linux-gnueabihf and 
> aarch64-none-linux-gnu.
> 
> Ok for trunk from an aarch64 perspective?

OK.

Thanks,
James

> 
> Thanks,
> Kyrill
> 


Re: [PATCH] aarch64: fix asm visibility for extern symbols

2019-06-04 Thread James Greenhalgh
On Tue, Jun 04, 2019 at 03:58:07PM +0100, Szabolcs Nagy wrote:
> Commit r271869 broke visibility declarations in asm for extern symbols, 
> because
> the new ASM_OUTPUT_EXTERNAL hook failed to call the default hook for elf.

OK.

In future, you can consider a patch like this to fall under the "obvious"
rule and commit it without review.

Thanks,
James

> gcc/ChangeLog:
> 
> 2019-06-04  Szabolcs Nagy  
> 
>   * config/aarch64/aarch64-protos.h (aarch64_asm_output_external): Remove
>   const.
>   * config/aarch64/aarch64.c (aarch64_asm_output_external): Call
>   default_elf_asm_output_external.

> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 6dccabc8cf7..1e3b1c91db1 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -437,7 +437,7 @@ bool aarch64_is_noplt_call_p (rtx);
>  bool aarch64_label_mentioned_p (rtx);
>  void aarch64_declare_function_name (FILE *, const char*, tree);
>  void aarch64_asm_output_alias (FILE *, const tree, const tree);
> -void aarch64_asm_output_external (FILE *, const tree, const char*);
> +void aarch64_asm_output_external (FILE *, tree, const char*);
>  bool aarch64_legitimate_pic_operand_p (rtx);
>  bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
>  bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned 
> HOST_WIDE_INT,
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 263ed21442c..7acc3227a78 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -15650,8 +15650,9 @@ aarch64_asm_output_alias (FILE *stream, const tree 
> decl, const tree target)
> function symbol references.  */
>  
>  void
> -aarch64_asm_output_external (FILE *stream, const tree decl, const char* name)
> +aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
>  {
> +  default_elf_asm_output_external (stream, decl, name);
>aarch64_asm_output_variant_pcs (stream, decl, name);
>  }
>  



Re: [PATCH][GCC][AArch64] Add support for hint intrinsics: __yield, __wfe, __wfi, __sev and __sevl.

2019-06-03 Thread James Greenhalgh
On Wed, May 29, 2019 at 03:48:29PM +0100, Srinath Parvathaneni wrote:
> Hi All,
> 
> This patch implements the __yield(), __wfe(), __wfi(), __sev() and 
> __sevl() ACLE (hint) intrinsics for AArch64 as yield, wfe, wfi, sev and 
> sevl (hint) instructions respectively.
> 
> The instructions are documented in the ArmARM[1] and the intrinsics
> specification is published on the Arm website [2].
> 
> [1] 
> https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> [2] 
> https://developer.arm.com/docs/ihi0053/latest/arm-c-language-extensions-21-architecture-specification
> 
> Bootstrapped on aarch64-none-linux-gnu and regression tested on 
> aarch64-none-elf with no regressions.
> 
> Ok for trunk? If ok, could someone please commit the patch on my behalf,
> as I don't have commit rights.

I can't tell from the documentation: is this the expected behaviour?

  $ cat yield.c
#include "arm_acle.h"
int foo (int* counter)
{
  *counter = 5;
  __yield();
  *counter = 5;
  __yield();
  *counter = 5;
  __yield();
  *counter = 5;
  __yield();
}
  
  $ gcc -O2 yield.c
  
foo:
  mov w1, 5
  str w1, [x0]
  yield
  yield
  yield
  yield
  ret

i.e. should we expect the memory writes to be considered dead and eliminated,
or should they live over the __yield?

I have similar questions as to whether GCC's unspec_volatile actually
implements the required semantics from the ACLE.
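For contrast, a plain inline-asm rendering with an explicit memory clobber
(a sketch of the alternative semantics, not what the patch implements) would
force the stores before each yield to survive:

  static inline void yield_with_barrier (void)
  {
    /* The "memory" clobber tells GCC the asm may read or write any
       memory, so earlier stores cannot be treated as dead.  */
    __asm__ volatile ("yield" ::: "memory");
  }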

Thanks,
James

  
> 
> Thanks,
> Srinath
> 
> gcc/ChangeLog:
> 
> 2019-05-29  Srinath Parvathaneni  
> 
>   * config/aarch64/aarch64.md (UNSPECV_YIELD): New volatile unspec.
>   (UNSPECV_WFE): Likewise.
>   (UNSPECV_WFI): Likewise.
>   (UNSPECV_SEV): Likewise.
>   (UNSPECV_SEVL): Likewise.
>   (yield): New pattern name.
>   (wfe): Likewise.
>   (wfi): Likewise.
>   (sev): Likewise.
>   (sevl): Likewise.
>   * config/aarch64/aarch64-builtins.c (aarch64_builtins):
>   AARCH64_BUILTIN_YIELD: New builtin.
>   AARCH64_BUILTIN_WFE: Likewise.
>   AARCH64_BUILTIN_WFI: Likewise.
>   AARCH64_BUILTIN_SEV: Likewise.
>   AARCH64_BUILTIN_SEVL: Likewise.
>   (aarch64_init_syshintop_builtins): New function.
>   (aarch64_init_builtins): New call statement.
>   (aarch64_expand_builtin): New case.
>   * config/aarch64/arm_acle.h (__yield): New inline function.
>   (__sev): Likewise.
>   (__sevl): Likewise.
>   (__wfi): Likewise.
>   (__wfe): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-05-29  Srinath Parvathaneni  
> 
>   * gcc.target/aarch64/acle/hint-1.c: New test.
> 




Re: [PATCH v2] aarch64: emit .variant_pcs for aarch64_vector_pcs symbol references

2019-06-03 Thread James Greenhalgh
On Wed, May 29, 2019 at 11:00:46AM +0100, Richard Sandiford wrote:
> Szabolcs Nagy  writes:
> > v2:
> > - use aarch64_simd_decl_p to check for aarch64_vector_pcs.
> > - emit the .variant_pcs directive even for local functions.
> > - don't require .variant_pcs asm support in compile only tests.
> > - add weakref tests.
> >
> > A dynamic linker with lazy binding support may need to handle vector PCS
> > function symbols specially, so an ELF symbol table marking was
> > introduced for such symbols.
> >
> > Function symbol references and definitions that follow the vector PCS
> > are marked in the generated assembly with .variant_pcs and then the
> > STO_AARCH64_VARIANT_PCS st_other flag is set on the symbol in the object
> > file.  The marking is propagated to the dynamic symbol table by the
> > static linker so a dynamic linker can handle such symbols specially.
> >
> > For this to work, the assembler, the static linker and the dynamic
> > linker has to be updated on a system.  Old assembler does not support
> > the new .variant_pcs directive, so a toolchain with old binutils won't
> > be able to compile code that references vector PCS symbols.
> >
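As a concrete illustration of what gets marked (a sketch; aarch64_vector_pcs
is the attribute GCC uses for the vector PCS, the function names are made
up):

  #include <arm_neon.h>

  /* References to f should be accompanied by a ".variant_pcs f"
     directive in the generated assembly.  */
  __attribute__ ((aarch64_vector_pcs)) float32x4_t f (float32x4_t);

  float32x4_t g (float32x4_t x)
  {
    return f (x);
  }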
> > gcc/ChangeLog:
> >
> > 2019-05-28  Szabolcs Nagy  
> >
> > * config/aarch64/aarch64-protos.h (aarch64_asm_output_alias): Declare.
> > (aarch64_asm_output_external): Declare.
> > * config/aarch64/aarch64.c (aarch64_asm_output_variant_pcs): New.
> > (aarch64_declare_function_name): Call aarch64_asm_output_variant_pcs.
> > (aarch64_asm_output_alias): New.
> > (aarch64_asm_output_external): New.
> > * config/aarch64/aarch64.h (ASM_OUTPUT_DEF_FROM_DECLS): Define.
> > (ASM_OUTPUT_EXTERNAL): Define.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 2019-05-28  Szabolcs Nagy  
> >
> > * gcc.target/aarch64/pcs_attribute-2.c: New test.
> > * gcc.target/aarch64/torture/simd-abi-4.c: Check .variant_pcs support.
> > * lib/target-supports.exp (check_effective_target_aarch64_variant_pcs):
> > New.
> 
> LGTM, but an AArch64 maintainer will need to approve.

OK with Richard's change suggested below.

Thanks,
James

> 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c 
> > b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
> > index e399690f364..80ebd955e10 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
> > @@ -1,4 +1,5 @@
> >  /* dg-do run */
> > +/* { dg-require-effective-target aarch64_variant_pcs } */
> >  /* { dg-additional-options "-std=c99" }  */
> 
> Not your problem of course, but mind fixing the dg-do markup while
> you're there?  It should be
> 
> /* { dg-do run } */
> 
> instead.  As things stand, the test only gets compiled, not run.
> 
> Thanks,
> Richard


Re: [PATCH][AArch64] Emit TARGET_DOTPROD-specific sequence for sadv16qi

2019-06-03 Thread James Greenhalgh
On Mon, May 13, 2019 at 12:18:25PM +0100, Kyrill Tkachov wrote:
> Hi Richard,
> 
> On 5/9/19 9:06 AM, Richard Sandiford wrote:
> > Kyrill Tkachov  writes:
> >> +;; Helper expander for aarch64_abd_3 to save the callers
> >> +;; the hassle of constructing the other arm of the MINUS.
> >> +(define_expand "abd_3"
> >> +  [(use (match_operand:VDQ_BHSI 0 "register_operand"))
> >> +   (USMAX:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand")
> >> + (match_operand:VDQ_BHSI 2 "register_operand"))]
> >> +  "TARGET_SIMD"
> >> +  {
> >> +rtx other_arm
> >> +  = gen_rtx_ (mode, operands[1], operands[2]);
> >> +emit_insn (gen_aarch64_abd_3 (operands[0], operands[1],
> >> + operands[2], other_arm));
> > Should be indented to the innermost "(" instead.
> >
> > LGTM otherwise, but an AArch64 maintainer should have the final say.
> 
> Thanks.
> 
> After your recent r271107 I've updated the patch and this helper pattern 
> is no longer necessary.
> 
> This version is shorter and has been bootstrapped and tested on 
> aarch64-none-linux-gnu.
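For context, the reduction sadv16qi covers looks like this in C (a sketch;
with TARGET_DOTPROD the absolute differences can be accumulated with UDOT
against an all-ones vector instead of by widening adds):

  unsigned sad16 (const unsigned char *a, const unsigned char *b)
  {
    unsigned sum = 0;
    for (int i = 0; i < 16; i++)
      sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
    return sum;
  }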

OK.

Thanks,
James

> 
> Thanks,
> 
> Kyrill
> 
> 
> 2019-05-13  Kyrylo Tkachov  
> 
>      * config/aarch64/iterators.md (MAX_OPP): New code attr.
>      * config/aarch64/aarch64-simd.md (*aarch64_abd_3): Rename 
> to...
>      (aarch64_abd_3): ... This.
>      (sadv16qi): Add TARGET_DOTPROD expansion.
> 
> 2019-05-13  Kyrylo Tkachov  
> 
>      * gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
>      * gcc.target/aarch64/usadv16qi.c: Likewise.
>      * gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
>      * gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.
> 
> 
> > Thanks,
> > Richard


Re: [PATCH] AARCH64: ILP32: Fix aarch64_asan_shadow_offset

2019-06-03 Thread James Greenhalgh
On Thu, May 23, 2019 at 04:54:30AM +0100, Andrew Pinski wrote:
> aarch64_asan_shadow_offset is using the wrong
> offset for ILP32.  Change it to be a decent one.
> 
> OK?  Bootstrapped and tested on aarch64-linux-gnu
> with no regressions,

OK.

Thanks,
James

> 
> Thanks,
> Andrew Pinski
> 
> ChangeLog:
> * config/aarch64/aarch64.c (aarch64_asan_shadow_offset):
> Fix ILP32 value.
> ---
>  gcc/config/aarch64/aarch64.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 3d872438..e5fefe93 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -17258,7 +17258,10 @@ aarch64_expand_subvti (rtx op0, rtx low_dest, rtx 
> low_in1,
>  static unsigned HOST_WIDE_INT
>  aarch64_asan_shadow_offset (void)
>  {
> -  return (HOST_WIDE_INT_1 << 36);
> +  if (TARGET_ILP32)
> +return (HOST_WIDE_INT_1 << 29);
> +  else
> +return (HOST_WIDE_INT_1 << 36);
>  }
>  
>  static rtx
> -- 
> 1.8.3.1
> 
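For background, the offset feeds libsanitizer's shadow mapping; in rough C
terms the scheme is (a sketch, with the constants as in the patch; the value
must agree with what libsanitizer itself uses):

  /* shadow_address = (address >> 3) + offset.  1 << 36 suits the LP64
     address space; 1 << 29 fits inside the 32-bit ILP32 one.  */
  static inline unsigned long
  shadow_of (unsigned long addr, unsigned long offset)
  {
    return (addr >> 3) + offset;
  }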


Re: [PATCH][AArch64] PR tree-optimization/90332: Implement vec_init where N is a vector mode

2019-06-03 Thread James Greenhalgh
On Fri, May 10, 2019 at 10:32:22AM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch fixes the failing gcc.dg/vect/slp-reduc-sad-2.c testcase on 
> aarch64
> by implementing a vec_init optab that can handle two half-width vectors 
> producing a full-width one
> by concatenating them.
> 
> In the gcc.dg/vect/slp-reduc-sad-2.c case it's a V8QI reg concatenated 
> with a V8QI const_vector of zeroes.
> This can be implemented efficiently using the aarch64_combinez pattern 
> that just loads a D-register to make
> use of the implicit zero-extending semantics of that load.
> Otherwise it concatenates the two vector using aarch64_simd_combine.
> 
> With this patch I'm seeing the effect from richi's original patch that 
> added gcc.dg/vect/slp-reduc-sad-2.c on aarch64
> and 525.x264_r improves by about 1.5%.
> 
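In intrinsics terms, the shape being handled is roughly (a sketch, not taken
from the testcase):

  #include <arm_neon.h>

  /* Concatenate a half-width vector with zeroes: with the patch the
     zero half should come for free from the implicit zero-extension of
     a D-register write (the aarch64_combinez pattern).  */
  uint8x16_t widen_with_zeroes (uint8x8_t lo)
  {
    return vcombine_u8 (lo, vdup_n_u8 (0));
  }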
> Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on 
> aarch64_be-none-elf.
> 
> Ok for trunk?

I have a question on the patch. Otherwise, this is OK for trunk.

> 2019-05-10  Kyrylo Tkachov  
> 
>      PR tree-optimization/90332
>      * config/aarch64/aarch64.c (aarch64_expand_vector_init):
>      Handle VALS containing two vectors.
>      * config/aarch64/aarch64-simd.md (*aarch64_combinez): Rename
>      to...
>      (@aarch64_combinez): ... This.
>      (*aarch64_combinez_be): Rename to...
>      (@aarch64_combinez_be): ... This.
>      (vec_init): New define_expand.
>      * config/aarch64/iterators.md (Vhalf): Handle V8HF.
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 0c2c17ed8269923723d066b250974ee1ff423d26..52c933cfdac20c5c566c13ae2528f039efda4c46
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -15075,6 +15075,43 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>rtx v0 = XVECEXP (vals, 0, 0);
>bool all_same = true;
>  
> +  /* This is a special vec_init where N is not an element mode but a
> + vector mode with half the elements of M.  We expect to find two entries
> + of mode N in VALS and we must put their concatenation into TARGET.  */
> +  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 
> 0

Should you validate the two vector modes are actually half-size vectors here,
and not something unexpected?

Thanks,
James


> +{
> +  rtx lo = XVECEXP (vals, 0, 0);
> +  rtx hi = XVECEXP (vals, 0, 1);
> +  machine_mode narrow_mode = GET_MODE (lo);
> +  gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
> +  gcc_assert (narrow_mode == GET_MODE (hi));
> +
> +  /* When we want to concatenate a half-width vector with zeroes we can
> +  use the aarch64_combinez[_be] patterns.  Just make sure that the
> +  zeroes are in the right half.  */
> +  if (BYTES_BIG_ENDIAN
> +   && aarch64_simd_imm_zero (lo, narrow_mode)
> +   && general_operand (hi, narrow_mode))
> + emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
> +  else if (!BYTES_BIG_ENDIAN
> +&& aarch64_simd_imm_zero (hi, narrow_mode)
> +&& general_operand (lo, narrow_mode))
> + emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
> +  else
> + {
> +   /* Else create the two half-width registers and combine them.  */
> +   if (!REG_P (lo))
> + lo = force_reg (GET_MODE (lo), lo);
> +   if (!REG_P (hi))
> + hi = force_reg (GET_MODE (hi), hi);
> +
> +   if (BYTES_BIG_ENDIAN)
> + std::swap (lo, hi);
> +   emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
> + }
> + return;
> +   }
> +
>/* Count the number of variable elements to initialise.  */
>for (int i = 0; i < n_elts; ++i)
>  {
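One possible shape for the validation James asks about above (a sketch,
untested; names as in the quoted hunk):

  /* Reject anything where the two entries are not exactly the
     half-width vector mode of the target.  */
  gcc_assert (known_eq (GET_MODE_NUNITS (narrow_mode) * 2,
                        GET_MODE_NUNITS (GET_MODE (target))));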


Re: [patch][aarch64]: add usra and ssra combine patterns

2019-06-03 Thread James Greenhalgh
On Thu, May 30, 2019 at 03:25:19PM +0100, Sylvia Taylor wrote:
> Greetings,
> 
> This patch adds support to combine:
> 
> 1) ushr and add into usra, example:
> 
> ushr  v0.16b, v0.16b, 2
> add   v0.16b, v0.16b, v2.16b
> ---
> usra  v2.16b, v0.16b, 2
> 
> 2) sshr and add into ssra, example:
> 
> sshr  v1.16b, v1.16b, 2
> add   v1.16b, v1.16b, v3.16b
> ---
> ssra  v3.16b, v1.16b, 2
> 
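In intrinsics terms the combination corresponds to source like this (a
sketch; the instruction in the comment is the hoped-for output):

  #include <arm_neon.h>

  uint8x16_t shift_accumulate (uint8x16_t acc, uint8x16_t x)
  {
    /* Shift right, then accumulate: should now combine to
       "usra v0.16b, v1.16b, 2".  */
    return vaddq_u8 (acc, vshrq_n_u8 (x, 2));
  }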
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk? If yes, I don't have any commit rights,
> so can someone please commit it on my behalf?

This patch has an unrelated change to aarch64_get_lane_zero_extend.
Please revert that and resend.

What changes (if any) should we make to aarch64_sra_n based on this
patch, and to the vsra_n intrinsics in arm_neon.h?

Thanks,
James

> 
> Cheers,
> Syl
> 
> gcc/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * config/aarch64/aarch64-simd.md
>   (*aarch64_simd_sra): New.
>   * config/aarch64/iterators.md
>   (SHIFTRT): New iterator.
>   (sra_op): New attribute.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * gcc.target/aarch64/simd/ssra.c: New test.
>   * gcc.target/aarch64/simd/usra.c: New test.

> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> e3852c5d182b70978d7603225fce55c0b8ee2894..502ac5f3b45a1da059bb07701150a531091378ed
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3110,22 +3122,22 @@
>  operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2]));
>  return "smov\\t%0, %1.[%2]";
>}
> -  [(set_attr "type" "neon_to_gp")]
> -)
> -
> -(define_insn "*aarch64_get_lane_zero_extend"
> -  [(set (match_operand:GPI 0 "register_operand" "=r")
> - (zero_extend:GPI
> -   (vec_select:
> - (match_operand:VDQQH 1 "register_operand" "w")
> - (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> -  "TARGET_SIMD"
> -  {
> -operands[2] = aarch64_endian_lane_rtx (mode,
> -INTVAL (operands[2]));
> -return "umov\\t%w0, %1.[%2]";
> -  }
> -  [(set_attr "type" "neon_to_gp")]
> +  [(set_attr "type" "neon_to_gp")]
> +)
> +
> +(define_insn "*aarch64_get_lane_zero_extend"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> + (zero_extend:GPI
> +   (vec_select:
> + (match_operand:VDQQH 1 "register_operand" "w")
> + (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> +  "TARGET_SIMD"
> +  {
> +operands[2] = aarch64_endian_lane_rtx (mode,
> +INTVAL (operands[2]));
> +return "umov\\t%w0, %1.[%2]";
> +  }
> +  [(set_attr "type" "neon_to_gp")]
>  )
>  
>  ;; Lane extraction of a value, neither sign nor zero extension

These changes should be dropped.




Re: [PATCH, GCC, AARCH64] Add GNU note section with BTI and PAC.

2019-04-18 Thread James Greenhalgh
On Thu, Apr 04, 2019 at 05:01:06PM +0100, Sudakshina Das wrote:
> Hi Richard
> 
> On 03/04/2019 11:28, Richard Henderson wrote:
> > On 4/3/19 5:19 PM, Sudakshina Das wrote:
> >> +  /* PT_NOTE header: namesz, descsz, type.
> >> +   namesz = 4 ("GNU\0")
> >> +   descsz = 16 (Size of the program property array)
> >> +   type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
> >> +  assemble_align (POINTER_SIZE);
> >> +  assemble_integer (GEN_INT (4), 4, 32, 1);
> >> +  assemble_integer (GEN_INT (16), 4, 32, 1);
> > 
> > So, it's 16 only if POINTER_SIZE == 64.
> > 
> > I think ROUND_UP (12, POINTER_BYTES) is what you want here.
> >
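Spelling out the arithmetic (assumed from the generic ELF note layout):

  /* Program property array:
       4 bytes  pr_type   (e.g. GNU_PROPERTY_AARCH64_FEATURE_1_AND)
       4 bytes  pr_datasz (= 4)
       4 bytes  feature bitmask (the BTI/PAC bits)
     padded to pointer alignment, so:
       ROUND_UP (12, 8) == 16 for LP64, ROUND_UP (12, 4) == 12 for ILP32.  */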
> 
> 
> Ah yes. I have made that change now.

This is OK, but instead of:

> diff --git a/gcc/testsuite/gcc.target/aarch64/va_arg_1.c 
> b/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
> index 
> e8e3cdac51350b545e5c2a644a3e1f4d1c37f88d..1fe92ff08935d4c6f08affcbd77ea91537030640
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
> @@ -4,7 +4,9 @@
>  int
>  f (int a, ...)
>  {
> -  /* { dg-final { scan-assembler-not "str" } } */
> +  /* Fails on aarch64*-*-linux* if configured with
> +--enable-standard-branch-protection because of the GNU NOTE section.  */
> +  /* { dg-final { scan-assembler-not "str" { target { ! aarch64*-*-linux* } 
> || { ! default_branch_protection } } } } */
>return a;
>  }

Can you just change the regex to check for str followed by a tab, or
something else which looks like the instruction and doesn't match
against 'string'?
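For instance (assuming GCC's usual mnemonic-then-tab layout in the assembly
output), combined with the target selector already in the patch:

  /* { dg-final { scan-assembler-not "str\t" } } */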

Thanks,
James


> 
> Thanks
> Sudi
> 
> > 
> > r~
> > 
> 


Re: Re : add tsv110 pipeline scheduling

2019-04-08 Thread James Greenhalgh
Thank you for the ChangeLog entry for your patch.

I have applied it to trunk as revision 270212.

We're very late in GCC 9 development, but this patch only impacts tsv110
scheduling.

Thanks,
James


On Thu, Apr 04, 2019 at 02:11:12AM +0100, wuyuan (E) wrote:
> Hi, James:
>  Thank you for your review. Please attach the following author
> information to the patch.
> 
> 2019-04-04  wu yuan 
> 
>   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
>   * config/aarch64/aarch64.md : Add "tsv110.md"
>   * config/aarch64/tsv110.md: New file.
>   
> Thanks,
>       wuyuan
> 
> -----Original Message-----
> From: James Greenhalgh [mailto:james.greenha...@arm.com]
> Sent: 4 April 2019, 1:58
> To: wuyuan (E)
> Cc: Kyrill Tkachov ; gcc-patches@gcc.gnu.org;
> Zhangyichao (AB) ; Zhanghaijian (A)
> ; nd
> Subject: Re: Re : add tsv110 pipeline scheduling
> 
> On Tue, Apr 02, 2019 at 03:26:22PM +0100, wuyuan (E) wrote:
> > Hi, James:
> > Has the submitted patch been merged into the trunk?  Looking forward to
> > your reply, thank you very much!
> > 
> > 
> > 
> > Best Regards,
> > 
> > wuyuan
> 
> Hi Wuyuan,
> 
> This patch is OK for trunk. Thank you for your many clarifications.
> 
> Will you need one of us to apply this to trunk on your behalf?
> 
> If you would like me to apply your patch, please provide the full ChangeLog 
> with author information, like so:
> 
> 2019-04-03  James Greenhalgh  
>   Second Author  
>   Third Author  
> 
>   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
>   * config/aarch64/aarch64.md : Add "tsv110.md"
>   * config/aarch64/tsv110.md: New file.
> 
> Thanks,
> James
> 
> 
> > -----Original Message-----
> > From: wuyuan (E)
> > Sent: 15 March 2019, 21:57
> > To: 'James Greenhalgh'
> > Cc: Kyrill Tkachov ;
> > gcc-patches@gcc.gnu.org; Zhangyichao (AB)
> > ; Zhanghaijian (A)
> > ; nd ; wufeng (O)
> > ; Yangfei (Felix)
> > Subject: Re : add tsv110 pipeline scheduling
> > 
> Hi, James:
>  Thank you very much for your meticulous review work. The explanation
> of the two questions is as follows:
>  The first problem is caused by my negligence and should be changed to
> "crypto_sha256_fast".
>   The second question I have verified with the hardware engineer. Only
> ALU2/ALU3 could support PSTATE register update, so any instruction that
> intends to update NZCV will be issued to ALU2/ALU3.  MDU could provide a
> better pipeline efficiency for multi-cycle ALU instructions, so we issue
> 2-cycle ALU w/o PSTATE update to the MDU unit.  The current pipeline
> processing is OK, except the pipeline "tsv110_alu2" should be replaced
> with "tsv110_alu2|tsv110_alu3".
> > 
> > 
> >  
> > 
> > The detailed patches are as follows:
> > 
> >   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
> >   * config/aarch64/aarch64.md : Add "tsv110.md"
> >   * config/aarch64/tsv110.md: New file.
> > 
> > 
> > diff --git a/gcc/config/aarch64/aarch64-cores.def 
> > b/gcc/config/aarch64/aarch64-cores.def
> > index ed56e5e..82d91d6
> > --- a/gcc/config/aarch64/aarch64-cores.def
> > +++ b/gcc/config/aarch64/aarch64-cores.def
> > @@ -105,7 +105,7 @@ AARCH64_CORE("neoverse-n1",  neoversen1, 
> > cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_  AARCH64_CORE("neoverse-e1",  
> > neoversee1, cortexa53, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 
> > | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa53, 
> > 0x41, 0xd4a, -1)
> >  
> >  /* HiSilicon ('H') cores. */
> > -AARCH64_CORE("tsv110",  tsv110, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | 
> > AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, 
> > tsv110,   0x48, 0xd01, -1)
> >

Re: Re : add tsv110 pipeline scheduling

2019-04-03 Thread James Greenhalgh
On Tue, Apr 02, 2019 at 03:26:22PM +0100, wuyuan (E) wrote:
> Hi, James:
> Has the submitted patch been merged into the trunk?  Looking forward to your
> reply, thank you very much!
>   
>  
>   
> Best Regards,
>   
> wuyuan

Hi Wuyuan,

This patch is OK for trunk. Thank you for your many clarifications.

Will you need one of us to apply this to trunk on your behalf?

If you would like me to apply your patch, please provide the full ChangeLog
with author information, like so:

2019-04-03  James Greenhalgh  
Second Author  
Third Author  

* config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
* config/aarch64/aarch64.md : Add "tsv110.md"
* config/aarch64/tsv110.md: New file.

Thanks,
James


> -----Original Message-----
> From: wuyuan (E)
> Sent: 15 March 2019, 21:57
> To: 'James Greenhalgh'
> Cc: Kyrill Tkachov ; gcc-patches@gcc.gnu.org;
> Zhangyichao (AB) ; Zhanghaijian (A)
> ; nd ; wufeng (O)
> ; Yangfei (Felix)
> Subject: Re : add tsv110 pipeline scheduling
> 
> Hi, James:
>  Thank you very much for your meticulous review work. The explanation of
> the two questions is as follows:
>  The first problem is caused by my negligence and should be changed to
> "crypto_sha256_fast".
>   The second question I have verified with the hardware engineer. Only
> ALU2/ALU3 could support PSTATE register update, so any instruction that
> intends to update NZCV will be issued to ALU2/ALU3.  MDU could provide a
> better pipeline efficiency for multi-cycle ALU instructions, so we issue
> 2-cycle ALU w/o PSTATE update to the MDU unit.  The current pipeline
> processing is OK, except the pipeline "tsv110_alu2" should be replaced
> with "tsv110_alu2|tsv110_alu3".
>   
>   
>
> 
> The detailed patches are as follows:
> 
>   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
>   * config/aarch64/aarch64.md : Add "tsv110.md"
>   * config/aarch64/tsv110.md: New file.
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-cores.def 
> b/gcc/config/aarch64/aarch64-cores.def
> index ed56e5e..82d91d6
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -105,7 +105,7 @@ AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, 8_2A, 
>  AARCH64_FL_FOR_ARCH8_  AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, 
> 8_2A,  AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | 
> AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa53, 0x41, 0xd4a, -1)
>  
>  /* HiSilicon ('H') cores. */
> -AARCH64_CORE("tsv110",  tsv110, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | 
> AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, 
> tsv110,   0x48, 0xd01, -1)
> +AARCH64_CORE("tsv110",  tsv110, tsv110, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | 
> AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, 
> tsv110,   0x48, 0xd01, -1)
>  
>  /* ARMv8.4-A Architecture Processors.  */
>  
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md 
> index b7cd9fc..861f059 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -361,6 +361,7 @@
>  (include "thunderx.md")
>  (include "../arm/xgene1.md")
>  (include "thunderx2t99.md")
> +(include "tsv110.md")
>  
>  ;; ---
>  ;; Jumps and other miscellaneous insns
> diff --git a/gcc/config/aarch64/tsv110.md b/gcc/config/aarch64/tsv110.md new 
> file mode 100644 index 000..9d12839
> --- /dev/null
> +++ b/gcc/config/aarch64/tsv110.md
> @@ -0,0 +1,708 @@
> +;; tsv110 pipeline description
> +;; Copyright (C) 2018 Free Software Foundation, Inc.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it ;; 
> +under the terms of the GNU General Public License as published by ;; 
> +the Free Software Foundation; either version 3, or (at your option) ;; 
> +any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but ;; 
> +WITHOUT ANY WARRANTY; without even the implied

Re: [Patch, aarch64] PR 89628 - fix register allocation in SIMD functions

2019-03-22 Thread James Greenhalgh
On Fri, Mar 22, 2019 at 05:35:02PM +, James Greenhalgh wrote:
> On Mon, Mar 11, 2019 at 04:10:15PM +, Steve Ellcey wrote:
> > Richard,
> > 
> > I don't necessarily disagree with anything in your comments and long
> > term I think that is the right direction, but I wonder if that level of
> > change is appropriate for GCC Stage 4 which is where we are now.  Your
> > changes would require fixes in shared code, whereas setting
> > REG_ALLOC_ORDER only affects Aarch64 and seems like a safer change.
> > I am not sure how long it would take me to implement something along
> > the lines of what you are suggesting.
> 
> I'll leave it to Richard to decide, but your workaround seems like the
> right level of risk for this time of the release. I'd be happy taking it.

Excuse me, I missed the fork in this thread (didn't hit my "AArch64"
filter). I'll try to take a look at Richard's approach early next week.

Thanks,
James

> 
> Thanks,
> James
> 
> > 
> > On Sat, 2019-03-09 at 08:03 +, Richard Sandiford wrote:
> > 
> > > Steve Ellcey  writes:
> > > > This is a patch to fix the register allocation in SIMD functions.  In
> > > > normal functions registers V16 to V31 are all caller saved.  In SIMD
> > > > functions V16 to V23 are callee saved and V24 to V31 are caller saved.
> > > > This means that SIMD functions should use V24 to V31 before V16 to V23
> > > > in order to avoid some saves and restores.
> > > > 
> > > > My fix for this is to define REG_ALLOC_ORDER.  Right now it is not
> > > > defined on Aarch64, so I just defined it as all the registers in order
> > > > except for putting V24 to V31 before V16 to V23.  This fixes the
> > > > register allocation in SIMD functions.  It also changes the register
> > > > allocation order in regular functions but since all the registers (V16
> > > > to V31) are caller saved in that case, it doesn't matter.  We could use
> > > > ADJUST_REG_ALLOC_ORDER to only affect SIMD functions but I did not see
> > > > any reason to do that.
> > > 
> > > REG_ALLOC_ORDER shouldn't really be needed for testcases like the ones
> > > in the PR.  Like you say, we don't currently need it to handle the
> > > equivalent situation for the standard ABI.
> > > 
> > > I think the problem is that the RA is still using the register sets
> > > for the default ABI when evaluating the prologue/epilogue cost of using
> > > a hard register.  E.g. see calculate_saved_nregs.
> > > 
> > > Maybe one fix would be to add an equivalent of call_used_reg_set to
> > > rtl_data.  By default it would be the same as call_used_reg_set,
> > > but the target could have an opportunity to change it.  Then code like
> > > calculate_saved_nregs should use the new set to find out what registers
> > > the current function can use without spilling.
> > > 
> > > This would also be useful for targets that implement interrupt handler
> > > attributes.
> > > 
> > > It would be good to add the testcase in the PR to the testsuite,
> > > with a scan-assembler to check for spills.
> > > 
> > > > diff --git a/gcc/config/aarch64/aarch64.h
> > > > b/gcc/config/aarch64/aarch64.h
> > > > index 7bd3bf5..d3723ff 100644
> > > > --- a/gcc/config/aarch64/aarch64.h
> > > > +++ b/gcc/config/aarch64/aarch64.h
> > > > @@ -328,7 +328,9 @@ extern unsigned aarch64_architecture_version;
> > > > ZR  zero register, encoded as X/R31 elsewhere
> > > >  
> > > > 32 x 128-bit floating-point/vector registers
> > > > -   V16-V31 Caller-saved (temporary) registers
> > > > +   V24-V31 Caller-saved (temporary) registers
> > > > +   V16-V23 Caller-saved (temporary) registers in most functions;
> > > > +   Callee-saved in SIMD functions.
> > > > V8-V15  Callee-saved registers
> > > > V0-V7   Parameter/result registers
> > > 
> > > Probably better as s/SIMD/vector PCS/.  The hunk above is OK with
> > > that
> > > change, independently of the rest.
> > > 
> > > Thanks,
> > > Richard


Re: [PATCH/AARCH64] Fix zero_extendsidi2_aarch64 type attribute

2019-03-22 Thread James Greenhalgh
On Sun, Mar 10, 2019 at 06:26:07PM +, Andrew Pinski wrote:
> Hi,
>   "uxtw x0, w1" is an alias for "mov w0, w1"  but currently the
> back-end marks it as extend type rather than mov_reg.  This patch
> fixes that.  For most schedule models, this does not matter; I am
> adding one where mov (both 32bit and 64bit register mov) can be
> considered as zero latency in some cases so being able to find them is
> important.
> 
> OK?  Bootstrapped and tested on aarch64-linux-gnu with no regressions.

OK.

Thanks,
James

> 
> Thanks,
> Andrew Pinski
> 
> ChangeLog:
> * config/aarch64/aarch64.md (zero_extendsidi2_aarch64): Fix type
> attrribute for uxtw.

> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index ff83974aeb0..70f04186127 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1621,7 +1621,7 @@
> ldr\t%s0, %1
> fmov\t%w0, %s1
> fmov\t%s0, %s1"
> -  [(set_attr "type" "extend,load_4,f_mcr,f_loads,f_mrc,fmov")
> +  [(set_attr "type" "mov_reg,load_4,f_mcr,f_loads,f_mrc,fmov")
> (set_attr "arch" "*,*,fp,fp,fp,fp")]
>  )
>  



Re: [Patch, aarch64] PR 89628 - fix register allocation in SIMD functions

2019-03-22 Thread James Greenhalgh
On Mon, Mar 11, 2019 at 04:10:15PM +, Steve Ellcey wrote:
> Richard,
> 
> I don't necessarily disagree with anything in your comments and long
> term I think that is the right direction, but I wonder if that level of
> change is appropriate for GCC Stage 4 which is where we are now.  Your
> changes would require fixes in shared code, whereas setting
> REG_ALLOC_ORDER only affects Aarch64 and seems like a safer change.
> I am not sure how long it would take me to implement something along
> the lines of what you are suggesting.

I'll leave it to Richard to decide, but your workaround seems like the
right level of risk for this time of the release. I'd be happy taking it.

Thanks,
James

> 
> On Sat, 2019-03-09 at 08:03 +, Richard Sandiford wrote:
> 
> > Steve Ellcey  writes:
> > > This is a patch to fix the register allocation in SIMD functions.  In
> > > normal functions registers V16 to V31 are all caller saved.  In SIMD
> > > functions V16 to V23 are callee saved and V24 to V31 are caller saved.
> > > This means that SIMD functions should use V24 to V31 before V16 to V23
> > > in order to avoid some saves and restores.
> > > 
> > > My fix for this is to define REG_ALLOC_ORDER.  Right now it is not
> > > defined on Aarch64, so I just defined it as all the registers in order
> > > except for putting V24 to V31 before V16 to V23.  This fixes the
> > > register allocation in SIMD functions.  It also changes the register
> > > allocation order in regular functions but since all the registers (V16
> > > to V31) are caller saved in that case, it doesn't matter.  We could use
> > > ADJUST_REG_ALLOC_ORDER to only affect SIMD functions but I did not see
> > > any reason to do that.
> > 
> > REG_ALLOC_ORDER shouldn't really be needed for testcases like the ones
> > in the PR.  Like you say, we don't currently need it to handle the
> > equivalent situation for the standard ABI.
> > 
> > I think the problem is that the RA is still using the register sets
> > for the default ABI when evaluating the prologue/epilogue cost of using
> > a hard register.  E.g. see calculate_saved_nregs.
> > 
> > Maybe one fix would be to add an equivalent of call_used_reg_set to
> > rtl_data.  By default it would be the same as call_used_reg_set,
> > but the target could have an opportunity to change it.  Then code like
> > calculate_saved_nregs should use the new set to find out what registers
> > the current function can use without spilling.
> > 
> > This would also be useful for targets that implement interrupt handler
> > attributes.
> > 
> > It would be good to add the testcase in the PR to the testsuite,
> > with a scan-assembler to check for spills.
> > 
> > > diff --git a/gcc/config/aarch64/aarch64.h
> > > b/gcc/config/aarch64/aarch64.h
> > > index 7bd3bf5..d3723ff 100644
> > > --- a/gcc/config/aarch64/aarch64.h
> > > +++ b/gcc/config/aarch64/aarch64.h
> > > @@ -328,7 +328,9 @@ extern unsigned aarch64_architecture_version;
> > > ZRzero register, encoded as X/R31 elsewhere
> > >  
> > > 32 x 128-bit floating-point/vector registers
> > > -   V16-V31   Caller-saved (temporary) registers
> > > +   V24-V31   Caller-saved (temporary) registers
> > > +   V16-V23   Caller-saved (temporary) registers in most functions;
> > > + Callee-saved in SIMD functions.
> > > V8-V15Callee-saved registers
> > > V0-V7 Parameter/result registers
> > 
> > Probably better as s/SIMD/vector PCS/.  The hunk above is OK with
> > that
> > change, independently of the rest.
> > 
> > Thanks,
> > Richard


Re: [PATCH, wwwdocs] Mention -march=armv8.5-a and other new command line options for AArch64 and Arm for GCC 9

2019-03-22 Thread James Greenhalgh
On Wed, Mar 20, 2019 at 10:17:41AM +, Sudakshina Das wrote:
> Hi Kyrill
> 
> On 12/03/2019 12:03, Kyrill Tkachov wrote:
> > Hi Sudi,
> > 
> > On 2/22/19 10:45 AM, Sudakshina Das wrote:
> >> Hi
> >>
> >> This patch documents the addition of the new Armv8.5-A and corresponding
> >> extensions in the gcc-9/changes.html.
> >> As per https://gcc.gnu.org/about.html, I have used W3 Validator.
> >> Is this ok for cvs?
> >>
> >> Thanks
> >> Sudi
> > 
> > 
> > Index: htdocs/gcc-9/changes.html
> > ===
> > RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-9/changes.html,v
> > retrieving revision 1.43
> > diff -u -r1.43 changes.html
> > --- htdocs/gcc-9/changes.html    21 Feb 2019 10:32:55 -    1.43
> > +++ htdocs/gcc-9/changes.html    21 Feb 2019 18:25:09 -
> > @@ -283,6 +283,19 @@
> >   
> >   The intrinsics are defined by the ACLE specification.
> >     
> > +  
> > +    The Armv8.5-A architecture is now supported. This can be used by 
> > specifying the
> > +   -march=armv8.5-a option.
> > 
> > 
> > I tend to prefer the wording "... is now supported through the 
> > -march=armv8.5-a option".
> > Otherwise it reads as the compiler "using" the architecture, whereas we 
> > usually talk about "targeting" an architecture.
> > 
> > +  
> > +   The Armv8.5-A architecture also adds some security features that 
> > are optional to all older
> > +    architecture versions. These are also supported now and only effect 
> > the assembler.
> > +    
> > +     Speculation Barrier instruction using 
> > -march=armv8-a+sb.
> > +     Execution and Data Prediction Restriction instructions using 
> > -march=armv8-a+predres.
> > +     Speculative Store Bypass Safe instruction using 
> > -march=armv8-a+ssbs. This does not
> > + require a compiler option for Arm and thus 
> > -march=armv8-a+ssbs is a AArch64 specific option.
> > 
> > "AArch64-specific"
> > 
> > 
> > LGTM otherwise.
> > Thanks,
> > Kyrill
> 
> Thanks for the review and sorry for the delay in response. I had edited 
> the language for adding new options in a few other places as well.
> 
> +   The Armv8.5-A architecture also adds some security features that are
> +optional to all older architecture versions. These are also supported now

s/also supported now/now supported/

> +and only effect the assembler.

s/effect/affect/

> +
> +  Speculation Barrier instruction through the
> +  -march=armv8-a+sb option.
> +  Execution and Data Prediction Restriction instructions through
> +  the -march=armv8-a+predres option.
> +  Speculative Store Bypass Safe instruction through the
> +  -march=armv8-a+ssbs option. This does not require a
> +  compiler option for Arm and thus -march=armv8-a+ssbs
> +  is an AArch64-specific option.
> +
> +  
>  
>  
>  AArch64 specific
> @@ -362,6 +380,23 @@
>  The default value is 16 (64Kb) and can be changed at configure
>  time using the flag 
> --with-stack-clash-protection-guard-size=12|16.
>
> +  
> +The option -msign-return-address= has been deprecated. This
> +has been replaced by the new -mbranch-protection= option. 
> This
> +new option can now be used to enable the return address signing as well 
> as
> +the new Branch Target Identification feature of Armv8.5-A architecture. 
> For
> +more information on the arguments accepted by this option, please refer 
> to
> +  href="https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#AArch64-Options;>AArch64-Options.
> +  
> +   The following optional extensions to Armv8.5-A architecture are also
> +   supported now and only effect the assembler.

s/effect/affect/

> +
> +  Random Number Generation instructions through the
> +  -march=armv8.5-a+rng option.
> +  Memory Tagging Extension through the
> +  -march=armv8.5-a+memtag option.
> +
> +  
>  
>  
>  Arm specific

Otherwise, OK by me but feel free to wait for people with gooder
grammar than me to have their say.

Thanks,
James


Re: Re : add tsv110 pipeline scheduling

2019-03-14 Thread James Greenhalgh
On Sat, Feb 23, 2019 at 01:28:22PM +, wuyuan (E) wrote:
> Hi, James:
> Sorry for not responding to your email in time because of the Chinese New
> Year holiday and urgent work. The three questions you mentioned in your
> last email are due to my misunderstanding of the pipeline.
> The first question: these instructions will occupy both the tsv110_ls* and
> tsv110_fsu* pipelines at the same time.

Hi Wuyuan,

Please accept my apologies for how long it has taken me to revisit your
patch and review it.

I have two questions:

> +(define_insn_reservation "tsv110_crypto_sha256_fast" 2
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "crypto_sha1_fast"))
> +  "tsv110_fsu1")

I think you intended to check for type crypto_sha256_fast here.

> +;; ALU ops with shift
> +(define_insn_reservation "tsv110_alu_shift" 2
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "extend,\
> + alu_shift_imm,alu_shift_reg,\
> + crc,logic_shift_imm,logic_shift_reg,\
> + mov_shift,mvn_shift,\
> + mov_shift_reg,mvn_shift_reg"))
> +  "tsv110_mdu")
> +  
> +(define_insn_reservation "tsv110_alus_shift" 2
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "alus_shift_imm,alus_shift_reg,\
> + logics_shift_imm,logics_shift_reg"))
> +  "tsv110_alu2")

Is this the correct description? This code says that ALU operations with
shift are executed in MDU, but ALU operations with shift that are also
flag setting are executed in ALU2?

Otherwise, this patch is OK for trunk. Thank you for your patience.

Best Regards,
James

> rewritten as follows:
> (define_insn_reservation
>   "tsv110_neon_ld4_lane" 9
>   (and (eq_attr "tune" "tsv110")
>(eq_attr "type" "neon_load4_all_lanes,neon_load4_all_lanes_q,\
>  neon_load4_one_lane,neon_load4_one_lane_q"))
>   "(tsv110_ls1 + tsv110_fsu1)|(tsv110_ls1 + tsv110_fsu2)|(tsv110_ls2 + 
> tsv110_fsu1)|(tsv110_ls2 + tsv110_fsu2)")
> 
> The second question: these instructions will use the tsv110_fsu1 or
> tsv110_fsu2 pipeline.
> rewritten as follows:
> (define_insn_reservation  "tsv110_neon_abd_aba" 4
>   (and (eq_attr "tune" "tsv110")
>(eq_attr "type" "neon_abd,neon_arith_acc"))
>   "tsv110_fsu1|tsv110_fsu2")
> 
> The third question: these instructions will use the tsv110_fsu1 or
> tsv110_fsu2 pipeline.
> rewritten as follows:
> (define_insn_reservation  "tsv110_neon_abd_aba_q" 4
>   (and (eq_attr "tune" "tsv110")
>(eq_attr "type" "neon_arith_acc_q"))
>   "tsv110_fsu1|tsv110_fsu2")
> 
> In addition to the above changes, I asked hardware engineers and colleagues 
> to review my  patch and modify some of the errors. The detailed patches are 
> as follows:
> 
>   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
>   * config/aarch64/aarch64.md : Add "tsv110.md"
>   * config/aarch64/tsv110.md: New file.
> 


Re: [PATCH][GCC][AArch64] Have empty HWCAPs string ignored during native feature detection

2019-02-27 Thread James Greenhalgh
On Thu, Feb 07, 2019 at 04:43:24AM -0600, Tamar Christina wrote:
> Hi All,
> 
> Since this hasn't been reviewed yet anyway I've updated this patch to also 
> fix the memory leaks etc.
> 
> --
> 
> This patch makes the feature detection code for AArch64 GCC not add features
> automatically when the feature had no hwcaps string to match against.
> 
> This means that -mcpu=native no longer adds feature flags such as +profile.
> The behavior wasn't noticed before because at the time +profile was added a 
> bug
> was preventing any feature bits from being added by native detections.
> 
> The loop has also been changed as Jakub specified in order to avoid a memory
> leak that was present in the existing code and to be slightly more efficient.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for trunk?

OK. Is this also desirable for a backport?

Thanks,
James

> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 2019-02-07  Tamar Christina  
> 
>   PR target/88530
>   * config/aarch64/aarch64-option-extensions.def: Document it.
>   * config/aarch64/driver-aarch64.c (host_detect_local_cpu): Skip feature
>   if empty hwcaps.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-02-07  Tamar Christina  
> 
>   PR target/88530
>   * gcc.target/aarch64/options_set_10.c: New test.
> 


Re: [PATCH] Improve arm and aarch64 casesi (PR target/70341)

2019-02-27 Thread James Greenhalgh
On Fri, Feb 22, 2019 at 06:20:51PM -0600, Jakub Jelinek wrote:
> Hi!
> 
> The testcase in the PR doesn't hoist any memory loads from the large switch
> before the switch on aarch64 and arm (unlike e.g. x86), because the
> arm/aarch64 casesi patterns don't properly annotate the memory load from the
> jump table.  It is created by gen_* and in RTL directly one can't specify
> the needed flags (MEM_READONLY_P and MEM_NOTRAP_P).
> 
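A minimal illustration of the missed optimisation (a sketch; the PR's
testcase is much larger):

  /* Without MEM_READONLY_P/MEM_NOTRAP_P on the jump-table load, the
     load of *p common to every case was not being hoisted above the
     dispatch on arm/aarch64.  */
  int f (int i, const int *p)
  {
    switch (i)
      {
      case 0: return *p + 1;
      case 1: return *p + 2;
      case 2: return *p + 3;
      case 3: return *p + 5;
      default: return *p;
      }
  }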
> Fixed thusly, bootstrapped/regtested on armv7hl-linux-gnueabi and
> aarch64-linux, ok for trunk?
> 
> 2019-02-23  Jakub Jelinek  
> 
>   PR target/70341
>   * config/aarch64/aarch64.md (casesi): Create the casesi_dispatch
>   MEM manually here, set MEM_READONLY_P and MEM_NOTRAP_P on it.

This AArch64 part is OK for trunk.

Thanks,
James

> --- gcc/config/aarch64/aarch64.md.jj  2019-01-19 09:39:18.847831222 +0100
> +++ gcc/config/aarch64/aarch64.md 2019-02-21 15:25:27.874532191 +0100
> @@ -622,13 +622,27 @@ (define_expand "casesi"
>   operands[0], operands[2], operands[4]));
>  
>  operands[2] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, 
> operands[3]));
> -emit_jump_insn (gen_casesi_dispatch (operands[2], operands[0],
> -  operands[3]));
> +operands[2]
> +  = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, operands[2], operands[0]),
> + UNSPEC_CASESI);
> +operands[2] = gen_rtx_MEM (DImode, operands[2]);
> +MEM_READONLY_P (operands[2]) = 1;
> +MEM_NOTRAP_P (operands[2]) = 1;
> +emit_jump_insn (gen_casesi_dispatch (operands[2], operands[3]));
>  DONE;
>}
>  )
>  
> -(define_insn "casesi_dispatch"
> +(define_expand "casesi_dispatch"
> +  [(parallel
> +[(set (pc) (match_operand:DI 0 ""))
> + (clobber (reg:CC CC_REGNUM))
> + (clobber (match_scratch:DI 2))
> + (clobber (match_scratch:DI 3))
> + (use (label_ref:DI (match_operand 1 "")))])]
> +  "")
> +
> +(define_insn "*casesi_dispatch"
>[(parallel
>  [(set (pc)
> (mem:DI (unspec [(match_operand:DI 0 "register_operand" "r")
> @@ -637,7 +651,7 @@ (define_insn "casesi_dispatch"
>   (clobber (reg:CC CC_REGNUM))
>   (clobber (match_scratch:DI 3 "=r"))
>   (clobber (match_scratch:DI 4 "=r"))
> - (use (label_ref (match_operand 2 "" "")))])]
> + (use (label_ref:DI (match_operand 2 "" "")))])]
>""
>"*
>return aarch64_output_casesi (operands);
> 
>   Jakub


Re: [Patch] [aarch64] PR target/89324 Handle stack pointer for SUBS/ADDS instructions

2019-02-22 Thread James Greenhalgh
On Fri, Feb 22, 2019 at 09:39:59AM -0600, Matthew Malcomson wrote:
> Hi James,
> 
> On 22/02/19 00:09, James Greenhalgh wrote:
> > On Mon, Feb 18, 2019 at 08:40:12AM -0600, Matthew Malcomson wrote:
> >>
> >> Additionally, this patch contains two tidy-ups (happy to remove them or 
> >> put in
> >> a separate patch if people want):
> > 
> > Generally, yes I prefer that.
> > 
> 
> I've removed the tidy ups and retested -- modified patch attached.
> 
> >>
> >> OK for trunk?
> > 
> > This patch is fine for GCC 10, so not on trunk yet please unless there is
> > a corectness reason for the change.
> > 
> 
> The patch is a correctness change to fix the P1 regression that occurs 
> when the stack pointer was passed to the define_insn from the 
> define_peephole2.
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89324
> 
> Ok for trunk?

Yes, please.

Thanks,
James



Re: [Patch] [aarch64] PR target/89324 Handle stack pointer for SUBS/ADDS instructions

2019-02-21 Thread James Greenhalgh
On Mon, Feb 18, 2019 at 08:40:12AM -0600, Matthew Malcomson wrote:
> Handle stack pointer with SUBS/ADDS instructions.
> 
> In general the stack pointer was not handled for many SUBS/ADDS patterns in
> aarch64.md.
> Both the "extended register" and "immediate" forms allow the stack pointer to 
> be
> used as the source register, while no form allows the stack pointer for the
> destination register.
> 
> The define_insn patterns generating ADDS/SUBS did not allow the stack pointer
> for any operand, while the define_peephole2 patterns that generated RTX to be
> matched by these patterns allowed the stack pointer for any operand.
> 
> The patterns are fixed by adding the 'k' constraint for the first source 
> operand
> to all define_insns that generate the ADDS/SUBS "extended register" and
> "immediate" forms (but not the "shifted register" form).
> 
> In peephole optimizations, constraint strings are ignored (see "(gccint) C
> Constraint Interface" info node in the documentation), so the decision to act 
> or
> not is based solely on the predicate and condition.
> This patch introduces a new predicate "aarch64_general_reg" to be used in
> define_peephole2 patterns where only GENERAL_REGS registers are acceptable and
> uses that predicate in the peepholes that generate patterns for ADDS/SUBS.
> 
> Additionally, this patch contains two tidy-ups (happy to remove them or put in
> a separate patch if people want):

Generally, yes I prefer that.

> We change the condition of sub3_compare1_imm pattern from checking
> "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
> to checking
> "INTVAL (operands[2]) == -INTVAL (operands[3])"
> for clarity, since the values checked are signed integers, there are negations
> involved in the check, and the condition used by the corresponding peepholes
> also uses INTVAL.
> 
> The superfluous  iterator in the assembly template for
> add3_compareV_imm is removed -- it was applied to an operand that is
> known to be a const_int.
> 
> Full bootstrap and regtest done on aarch64-none-linux-gnu.
> Regression tests done on aarch64-none-linux-gnu and aarch64-none-elf cross
> compiler.
> 
> OK for trunk?

This patch is fine for GCC 10, so not on trunk yet please unless there is
a corectness reason for the change.

> NOTE: I have included a bunch of RTL testcases that I used in development, 
> these
> don't exercise much of the compiler and are pretty specific to the backend as 
> it
> currently is, so I'm not sure they give much value. I'd appreciate feedback on
> whether this is in general considered useful.

I would happily take the RTL test; do you want to check with a testsuite
maintainer?

> gcc/ChangeLog:
> 
> 2019-02-18  Matthew Malcomson  
> 
>   PR target/89324
>   * config/aarch64/aarch64.md: Use aarch64_general_reg predicate on
>   destination register in peepholes generating patterns for ADDS/SUBS.
>   (add3_compare0,
>   *addsi3_compare0_uxtw, add3_compareC,
>   add3_compareV_imm, add3_compareV,
>   *adds__,
>   *subs__,
>   *adds__shift_,
>   *subs__shift_,
>   *adds__multp2, *subs__multp2,
>   *sub3_compare0, *subsi3_compare0_uxtw,
>   sub3_compare1): Allow stack pointer for source register.
>   * config/aarch64/predicates.md (aarch64_general_reg): New predicate.
> 
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-02-18  Matthew Malcomson  
> 
>   PR target/89324
>   * gcc.dg/rtl/aarch64/subs_adds_sp.c: New test.
>   * gfortran.fortran-torture/compile/pr89324.f90: New test.
> 
> 
> 
> ### Attachment also inlined for ease of reply
> ###

I appreciate that.


Thanks,
James



Re: [PATCH, GCC, AArch64] Fix a couple of bugs in BTI

2019-02-21 Thread James Greenhalgh
On Thu, Feb 21, 2019 at 06:19:10AM -0600, Sudakshina Das wrote:
> Hi
> 
> While doing more testing I found a couple of issues with my BTI patches. 
> This patch fixes them:
> 1) Remove a reference to return address key. The original patch was 
> written based on a different not yet committed patch ([PATCH 
> 3/3][GCC][AARCH64] Add support for pointer authentication B key) and I 
> missed out on cleaning this up. This is hidden behind the configuration 
> option and thus went unnoticed.
> 2) Add a missed case for adding the BTI instruction in thunk functions.
> 
> Bootstrapped on aarch64-none-linux-gnu and regression tested on 
> aarch64-none-elf with configuration turned on.

OK.

Thanks,
James

> 
> gcc/ChangeLog:
> 
> 2019-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64.c (aarch64_output_mi_thunk): Add bti
>   instruction if enabled.
>   (aarch64_override_options): Remove reference to return address
>   key.
> 
> 
> Is this ok for trunk?
> Sudi



Re: [PATCH 1/2][GCC][AArch64] Update Armv8.4-a's FP16 FML intrinsics

2019-02-21 Thread James Greenhalgh
On Wed, Feb 20, 2019 at 08:00:13AM -0600, Tamar Christina wrote:
> Hi All,
> 
> This patch updates the Armv8.4-a FP16 FML intrinsics's suffixes from u32 to 
> f16
> to be more consistent with the naming convention for intrinsics.
> 
> The specifications for these intrinsics have not been published yet so we do
> not need to maintain the old names.
> 
> The patch was created with the following script:
> 
> grep -lIE "(vfml[as].+)_u32" -r gcc/ | grep -iEv ".+Changelog.*" \
>   | xargs sed -i -E -e "s/(vfml[as].+)_u32/\1_f16/g"

Big bonus points for including this!

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for trunk? and eventual backport to GCC 8?

Woops.

Yes, OK for trunk and backport it please.

Thanks,
James

> gcc/ChangeLog:
> 
> 2019-02-20  Tamar Christina  
> 
>   * config/aarch64/arm_neon.h (vfmlal_low_u32, vfmlsl_low_u32,
>   vfmlalq_low_u32, vfmlslq_low_u32, vfmlal_high_u32, vfmlsl_high_u32,
>   vfmlalq_high_u32, vfmlslq_high_u32, vfmlal_lane_low_u32,
>   vfmlsl_lane_low_u32, vfmlal_laneq_low_u32, vfmlsl_laneq_low_u32,
>   vfmlalq_lane_low_u32, vfmlslq_lane_low_u32, vfmlalq_laneq_low_u32,
>   vfmlslq_laneq_low_u32, vfmlal_lane_high_u32, vfmlsl_lane_high_u32,
>   vfmlal_laneq_high_u32, vfmlsl_laneq_high_u32, vfmlalq_lane_high_u32,
>   vfmlslq_lane_high_u32, vfmlalq_laneq_high_u32, vfmlslq_laneq_high_u32):
>   Rename ...
>   (vfmlal_low_f16, vfmlsl_low_f16, vfmlalq_low_f16, vfmlslq_low_f16,
>   vfmlal_high_f16, vfmlsl_high_f16, vfmlalq_high_f16, vfmlslq_high_f16,
>   vfmlal_lane_low_f16, vfmlsl_lane_low_f16, vfmlal_laneq_low_f16,
>   vfmlsl_laneq_low_f16, vfmlalq_lane_low_f16, vfmlslq_lane_low_f16,
>   vfmlalq_laneq_low_f16, vfmlslq_laneq_low_f16, vfmlal_lane_high_f16,
>   vfmlsl_lane_high_f16, vfmlal_laneq_high_f16, vfmlsl_laneq_high_f16,
>   vfmlalq_lane_high_f16, vfmlslq_lane_high_f16, vfmlalq_laneq_high_f16,
>   vfmlslq_laneq_high_f16): ... To this.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-02-20  Tamar Christina  
> 
>   * gcc.target/aarch64/fp16_fmul_high.h (test_vfmlal_high_u32,
>   test_vfmlalq_high_u32, test_vfmlsl_high_u32, test_vfmlslq_high_u32):
>   Rename ...
>   (test_vfmlal_high_f16, test_vfmlalq_high_f16, test_vfmlsl_high_f16,
>   test_vfmlslq_high_f16): ... To this.
>   * gcc.target/aarch64/fp16_fmul_lane_high.h (test_vfmlal_lane_high_u32,
>   tets_vfmlsl_lane_high_u32, test_vfmlal_laneq_high_u32,
>   test_vfmlsl_laneq_high_u32, test_vfmlalq_lane_high_u32,
>   test_vfmlslq_lane_high_u32, test_vfmlalq_laneq_high_u32,
>   test_vfmlslq_laneq_high_u32): Rename ...
>   (test_vfmlal_lane_high_f16, tets_vfmlsl_lane_high_f16,
>   test_vfmlal_laneq_high_f16, test_vfmlsl_laneq_high_f16,
>   test_vfmlalq_lane_high_f16, test_vfmlslq_lane_high_f16,
>   test_vfmlalq_laneq_high_f16, test_vfmlslq_laneq_high_f16): ... To this.
>   * gcc.target/aarch64/fp16_fmul_lane_low.h (test_vfmlal_lane_low_u32,
>   test_vfmlsl_lane_low_u32, test_vfmlal_laneq_low_u32,
>   test_vfmlsl_laneq_low_u32, test_vfmlalq_lane_low_u32,
>   test_vfmlslq_lane_low_u32, test_vfmlalq_laneq_low_u32,
>   test_vfmlslq_laneq_low_u32): Rename ...
>   (test_vfmlal_lane_low_f16, test_vfmlsl_lane_low_f16,
>   test_vfmlal_laneq_low_f16, test_vfmlsl_laneq_low_f16,
>   test_vfmlalq_lane_low_f16, test_vfmlslq_lane_low_f16,
>   test_vfmlalq_laneq_low_f16, test_vfmlslq_laneq_low_f16): ... To this.
>   * gcc.target/aarch64/fp16_fmul_low.h (test_vfmlal_low_u32,
>   test_vfmlalq_low_u32, test_vfmlsl_low_u32, test_vfmlslq_low_u32):
>   Rename ...
>   (test_vfmlal_low_f16, test_vfmlalq_low_f16, test_vfmlsl_low_f16,
>   test_vfmlslq_low_f16): ... To This.
>   * lib/target-supports.exp
>   (check_effective_target_arm_fp16fml_neon_ok_nocache): Update test.
> 
> -- 

> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 
> f405a325cf5f3f8970e5f4b78322335c280fa7a4..314ef30187d1ba1882eaf5c610770d380344e920
>  100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -33777,63 +33777,63 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
> float32x4_t __a, float32x4_t __b,
>  
>  __extension__ extern __inline float32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
> +vfmlal_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
>  {
>return __builtin_aarch64_fmlal_lowv2sf (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline float32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
> +vfmlsl_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
>  {
>return __builtin_aarch64_fmlsl_lowv2sf (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline float32x4_t
>  

Re: [PATCH][AArch64] Add support for Neoverse E1

2019-02-21 Thread James Greenhalgh
On Thu, Feb 21, 2019 at 11:43:08AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch adds -mcpu and -mtune support for the Neoverse E1 CPU [1].
> The new option is -mcpu=neoverse-e1.
> Bootstrapped and tested on aarch64-none-linux-gnu.

OK.

Thanks,
James

> [1] 
> https://community.arm.com/processors/b/blog/posts/arm-neoverse-e1-platform-empowering-the-infrastructure-to-meet-next-generation-throughput-demands
> 
> 2019-02-21  Kyrylo Tkachov  
> 
>      * config/aarch64/aarch64-cores.def (neoverse-e1): Define.
>      * config/aarch64/aarch64-tune.md: Regenerate.
>      * doc/invoke.texi (AArch64 Options): Document neoverse-e1 -mcpu option.

> diff --git a/gcc/config/aarch64/aarch64-cores.def 
> b/gcc/config/aarch64/aarch64-cores.def
> index 
> b0c7d2f23ace1e3c3a89f4e3ab10c9ad08f56b22..ed56e5eded13664597343659db859c5ed481627d
>  100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -102,6 +102,7 @@ AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, 8_2A,  
> AARCH64_FL_FOR_ARCH8_2
>  AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, 8_2A,  
> AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | 
> AARCH64_FL_DOTPROD, cortexa72, 0x41, 0xd0b, -1)
>  AARCH64_CORE("ares",  ares, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | 
> AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, 
> neoversen1, 0x41, 0xd0c, -1)
>  AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, 8_2A,  
> AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | 
> AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1)
> +AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, 8_2A,  
> AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | 
> AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa53, 0x41, 0xd4a, -1)
>  
>  /* HiSilicon ('H') cores. */
>  AARCH64_CORE("tsv110",  tsv110, cortexa57, 8_2A,  AARCH64_FL_FOR_ARCH8_2 | 
> AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, 
> tsv110,   0x48, 0xd01, -1)
> diff --git a/gcc/config/aarch64/aarch64-tune.md 
> b/gcc/config/aarch64/aarch64-tune.md
> index 
> 5b1341525e9c2e3fe6306e7c9fef41f5d658420c..2b1ec85ae3190ec62f70d8abacd88e825244f2b1
>  100644
> --- a/gcc/config/aarch64/aarch64-tune.md
> +++ b/gcc/config/aarch64/aarch64-tune.md
> @@ -1,5 +1,5 @@
>  ;; -*- buffer-read-only: t -*-
>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>  (define_attr "tune"
> - 
> "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,ares,neoversen1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
> + 
> "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,ares,neoversen1,neoversee1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
>   (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 
> 8ecc28c4acccd619ff9dcc25202fcf87582145a4..24513970e0508611c4810aabbb6089dca7d5501c
>  100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -15771,10 +15771,11 @@ performance of the code.  Permissible values for 
> this option are:
>  @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
>  @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
>  @samp{cortex-a76}, @samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor},
> -@samp{neoverse-n1},@samp{qdf24xx}, @samp{saphira}, @samp{phecda}, 
> @samp{xgene1},
> -@samp{vulcan}, @samp{octeontx}, @samp{octeontx81},  @samp{octeontx83},
> -@samp{thunderx}, @samp{thunderxt88}, @samp{thunderxt88p1}, 
> @samp{thunderxt81},
> -@samp{tsv110}, @samp{thunderxt83}, @samp{thunderx2t99},
> +@samp{neoverse-e1},@samp{neoverse-n1},@samp{qdf24xx}, @samp{saphira},
> +@samp{phecda}, @samp{xgene1}, @samp{vulcan}, @samp{octeontx},
> +@samp{octeontx81},  @samp{octeontx83}, @samp{thunderx}, @samp{thunderxt88},
> +@samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110},
> +@samp{thunderxt83}, @samp{thunderx2t99},
>  @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
>  @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
>  @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55}



Re: [PATCH][AArch64] Add support for Neoverse N1

2019-02-21 Thread James Greenhalgh
On Thu, Feb 21, 2019 at 11:42:56AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch adds support for the Neoverse N1 CPU [1]. This was supported 
> in GCC earlier through the codename Ares,
> which it now replaces. -mcpu=ares is still accepted as there's been a 
> binutils release supporting it,
> but the internal structures are renamed to use Neoverse N1-related 
> identifiers.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

OK.

Thanks,
James

> 
> [1] 
> https://community.arm.com/processors/b/blog/posts/arm-neoverse-n1-platform-accelerating-the-transformation-to-a-scalable-cloud-to-edge-infrastructure
> 
> 2019-02-21  Kyrylo Tkachov  
> 
>      * config/aarch64/aarch64.c (ares_tunings): Rename to...
>      (neoversen1_tunings): ... This.
>      * config/aarch64/aarch64-cores.def (ares): Change tuning to the above.
>      (neoverse-n1): New CPU.
>      * config/aarch64/aarch64-tune.md: Regenerate.
>      * doc/invoke.texi (AArch64 Options): Document neoverse-n1.
> 



Re: [PATCH][GCC][AArch64] Fix command line options canonicalization version #2. (PR target/88530)

2019-02-21 Thread James Greenhalgh
On Wed, Feb 20, 2019 at 08:00:38AM -0600, Tamar Christina wrote:
> Hi All,
> 
> Commandline options on AArch64 don't get canonicalized into the smallest
> possible set before output to the assembler. This means that overlapping 
> feature
> sets are emitted with superfluous parts.
> 
> Normally this isn't an issue, but in the case of crypto we have retro-actively
> split it into aes and sha2. We need to emit only +crypto to the assembler
> so old assemblers continue to work.
> 
> Because of how -mcpu=native and -march=native work they end up enabling all
> feature bits. Instead we need to get the smallest possible set, which would 
> also fix the problem with the older assemblers and the retroactive split.
> 
> The function that handles this is called quite often.  It is called for any
> push/pop options or attribute that changes arch, cpu etc.  In order to not 
> make
> this search for the smallest set too expensive we sort the options based on 
> the
> number of features (bits) they enable.  This allows us to process the list
> linearly instead of quadratically (Once we have enabled a feature, we know 
> that
> anything else that enables it can be ignored.  By sorting we'll get the 
> biggest
> groups first and thus the smallest combination of commandline flags).
> 
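
To make the sort-then-scan idea concrete, here is a minimal standalone
sketch of the selection pass described above.  It is illustrative only:
the struct, the function names and the bit assignments are invented for
this note and are not the patch's own code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ext { const char *name; uint64_t bits; };

/* Sort by decreasing popcount so the biggest feature groups come first.  */
static int
cmp_bits_desc (const void *a, const void *b)
{
  const struct ext *x = (const struct ext *) a;
  const struct ext *y = (const struct ext *) b;
  return __builtin_popcountll (y->bits) - __builtin_popcountll (x->bits);
}

/* Print the smallest set of +ext flags whose union covers ISA.  */
static void
print_smallest_set (struct ext *exts, size_t n, uint64_t isa)
{
  uint64_t covered = 0;
  qsort (exts, n, sizeof *exts, cmp_bits_desc);
  for (size_t i = 0; i < n; i++)
    /* Usable only if all of its features are enabled; useful only if it
       still adds a feature we have not yet covered.  */
    if ((isa & exts[i].bits) == exts[i].bits && (exts[i].bits & ~covered))
      {
        printf ("+%s", exts[i].name);
        covered |= exts[i].bits;
      }
  putchar ('\n');
}

int
main (void)
{
  struct ext exts[] = {
    { "crypto", 0x3 },  /* synthetic: just aes | sha2 */
    { "aes",    0x1 },
    { "sha2",   0x2 },
  };
  print_smallest_set (exts, 3, 0x3);  /* prints "+crypto" */
  return 0;
}
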
> The option handling structures have been extended with a boolean that
> indicates whether the option is synthetic; by that I mean whether the option
> flag itself enables a new feature.
> 
> e.g. +crypto isn't an actual feature, it just enables other features, but
> others, like +rdma, enable multiple dependent features while also being
> features in their own right.
> 
> There are two ways to solve this.
> 
> 1) Either have the options that are feature bits also turn themselves on, e.g.
>change rdma to turn on FP, SIMD and RDMA as dependency bits.
> 
> 2) Make a distinction between these two different type of features and have 
> the
>framework handle it correctly.
> 
> Even though it's more code I went for the second approach, as it's the one
> that'll be less fragile (people can't forget it) and gives the least 
> surprises.
> 
> Effectively this patch changes the following:
> 
> The values before the => are the old compiler and after the => the new code.
> 
> -march=armv8.2-a+crypto+sha2 => -march=armv8.2-a+crypto
> -march=armv8.2-a+sha2+aes => -march=armv8.2-a+crypto
> 
> The remaining behaviors stay the same.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for trunk?

OK, but I don't understand why the CRC special case is needed. My copy of
the Arm Architecture Reference Manual suggests that all versions of the
architecture from Armv8.1-A are required to implement the CRC32 extension.
Is there some old assembler that doesn't honour that? Whatever is driving
that requirement could usefully be added to the comments.

I find it very hard to believe that this code is what we need for correct
behaviour on AArch64; this level of complexity implies we're doing something
very wrong with either the definition or the implementation of these features
bits which makes it hard for us to maintain, and hard for users and dependent
tools (e.g. assemblers) to know what to expect from the compiler.

I've seen a number of bugs in this code recently. While I appreciate your
patch for fixing one of them, I find the cases and expectations so hard
to reason about that I can't say I am sure we are now bug free.

OK for trunk as an improvement over today, and to help us get towards a
release; but I'm very unhappy with this corner of the compiler!

Thanks,
James

> gcc/ChangeLog:
> 
> 2019-02-20  Tamar Christina  
> 
>   PR target/88530
>   * common/config/aarch64/aarch64-common.c
>   (struct aarch64_option_extension): Add is_synthetic.
>   (all_extensions): Use it.
>   (TARGET_OPTION_INIT_STRUCT): Define hook.
>   (struct gcc_targetm_common): Moved to end.
>   (all_extensions_by_on): New.
>   (opt_ext_cmp, typedef opt_ext): New.
>   (aarch64_option_init_struct): New.
>   (aarch64_contains_opt): New.
>   (aarch64_get_extension_string_for_isa_flags): Output smallest set.
>   * config/aarch64/aarch64-option-extensions.def
>   (AARCH64_OPT_EXTENSION): Explicitly include AES and SHA2 in crypto.
>   (fp, simd, crc, lse, fp16, rcpc, rdma, dotprod, aes, sha2, sha3,
>   sm4, fp16fml, sve, profile, rng, memtag, sb, ssbs, predres):
>   Set is_synthetic to false.
>   (crypto): Set is_synthetic to true.
>   * config/aarch64/driver-aarch64.c (AARCH64_OPT_EXTENSION): Add
>   SYNTHETIC.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-02-20  Tamar Christina  
> 
>   PR target/88530
>   * gcc.target/aarch64/options_set_1.c: New test.
>   * gcc.target/aarch64/options_set_2.c: New test.
>   * gcc.target/aarch64/options_set_3.c: New test.
>   * gcc.target/aarch64/options_set_4.c: New test.
>   * gcc.target/aarch64/options_set_5.c: New test.
>   * 

Re: [Aarch64][SVE] Vectorise sum-of-absolute-differences

2019-02-06 Thread James Greenhalgh
On Mon, Feb 04, 2019 at 07:34:05AM -0600, Alejandro Martinez Vicente wrote:
> Hi,
> 
> This patch adds support to vectorize sum of absolute differences (SAD_EXPR)
> using SVE. It also uses the new functionality to ensure that the resulting 
> loop
> is masked. Therefore, it depends on
> 
> https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00016.html
> 
> Given this input code:
> 
> int
> sum_abs (uint8_t *restrict x, uint8_t *restrict y, int n)
> {
>   int sum = 0;
> 
>   for (int i = 0; i < n; i++)
> {
>   sum += __builtin_abs (x[i] - y[i]);
> }
> 
>   return sum;
> }
> 
> The resulting SVE code is:
> 
> 0000000000000000 <sum_abs>:
>    0: 7100005f  cmp     w2, #0x0
>    4: 5400026d  b.le    50 <sum_abs+0x50>
>    8: d2800003  mov     x3, #0x0                // #0
>    c: 93407c42  sxtw    x2, w2
>   10: 2538c002  mov     z2.b, #0
>   14: 25221fe0  whilelo p0.b, xzr, x2
>   18: 2538c023  mov     z3.b, #1
>   1c: 2518e3e1  ptrue   p1.b
>   20: a4034000  ld1b    {z0.b}, p0/z, [x0, x3]
>   24: a4034021  ld1b    {z1.b}, p0/z, [x1, x3]
>   28: 0430e3e3  incb    x3
>   2c: 0520c021  sel     z1.b, p0, z1.b, z0.b
>   30: 25221c60  whilelo p0.b, x3, x2
>   34: 040d0420  uabd    z0.b, p1/m, z0.b, z1.b
>   38: 44830402  udot    z2.s, z0.b, z3.b
>   3c: 54ffff21  b.ne    20 <sum_abs+0x20>       // b.any
>   40: 2598e3e0  ptrue   p0.s
>   44: 04812042  uaddv   d2, p0, z2.s
>   48: 1e260040  fmov    w0, s2
>   4c: d65f03c0  ret
>   50: 1e2703e2  fmov    s2, wzr
>   54: 1e260040  fmov    w0, s2
>   58: d65f03c0  ret
> 
> Notice how udot is used inside a fully masked loop.
> 
> I tested this patch on an aarch64 machine, bootstrapping the compiler and
> running the checks.

This doesn't give us much confidence in SVE coverage; unless you have
been running in an environment using SVE by default? Do you have some set
of workloads you could test the compiler against to ensure correct operation
of the SVE vectorization?

> 
> I admit it is too late to merge this into gcc 9, but I'm posting it anyway so
> it can be considered for gcc 10.

Richard Sandiford has the call on whether this patch is OK for trunk now or
GCC 10. With the minimal testing it has had, I'd be uncomfortable with it as
a GCC 9 patch. That said, it is a fairly self-contained pattern for the
compiler and it would be good to see this optimization in GCC 9.

> 
> Alejandro
> 
> 
> gcc/Changelog:
> 
> 2019-02-04  Alejandro Martinez  
> 
>   * config/aarch64/aarch64-sve.md (abd_3): New define_expand.
>   (aarch64_abd_3): Likewise.
>   (*aarch64_abd_3): New define_insn.
>   (sad): New define_expand.
>   * config/aarch64/iterators.md: Added MAX_OPP and max_opp attributes.
>   Added USMAX iterator.
>   * config/aarch64/predicates.md: Added aarch64_smin and aarch64_umin
>   predicates.
>   * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR.
>   (build_vect_cond_expr): Likewise.
> 
> gcc/testsuite/Changelog:
>  
> 2019-02-04  Alejandro Martinez  
> 
>   * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute
>   differences.




Re: [PATCH][AArch64] Change representation of SABD in RTL

2019-02-06 Thread James Greenhalgh
On Mon, Feb 04, 2019 at 04:23:32AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> Richard raised a concern about the RTL we use to represent the AdvSIMD SABD
> (vector signed absolute difference) instruction.
> We currently represent it as ABS (MINUS op1 op2).
> 
> This isn't exactly what SABD does. ABS treats its input as a signed value
> and returns the absolute of that.
> 
> For example:
> (sabd:QI 64 -128) == 192 (unsigned) aka -64 (signed)
> whereas
> (minus:QI 64 -128) == 192 (unsigned) aka -64 (signed), (abs ...) of that is 
> 64.
> 
> A better way to describe the instruction is with MINUS (SMAX (op1 op2) SMIN 
> (op1 op2)).
> This patch implements that, and also implements similar semantics for the 
> UABD instruction
> that uses UMAX and UMIN.
> 
> That way for the example above we'll have:
> (minus:QI (smax:QI (64 -128)) (smin:QI (64 -128))) == (minus:QI 64 -128) == 
> 192 (or -64 signed) which matches
> what SABD does.
> 
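
A quick C sanity check of that arithmetic (a hypothetical demo, not part
of the patch):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int8_t a = 64, b = -128;

  /* ABS (MINUS a b): the QImode subtraction wraps 192 to -64, and the
     signed abs of that is 64 -- not what SABD returns.  */
  int8_t diff = (int8_t) (a - b);              /* -64 */
  int8_t via_abs = diff < 0 ? -diff : diff;    /* 64 */

  /* MINUS (SMAX a b) (SMIN a b): 64 - (-128) wraps to -64 (0xc0), i.e.
     192 unsigned -- exactly what SABD produces.  */
  int8_t smax = a > b ? a : b;
  int8_t smin = a < b ? a : b;
  int8_t via_minmax = (int8_t) (smax - smin);  /* -64, i.e. 0xc0 */

  printf ("%d %d\n", via_abs, via_minmax);     /* prints: 64 -64 */
  return 0;
}
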
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

Not without a comment explaining the above subtlety and preferably a
testcase which would fail today on trunk.

Otherwise, OK.

James

> 
> Thanks,
> Kyrill
> 
> 2019-04-02  Kyrylo Tkachov  
> 
>  * config/aarch64/iterators.md (max_opp): New code_attr.
>  (USMAX): New code iterator.
>  * config/aarch64/predicates.md (aarch64_smin): New predicate.
>  (aarch64_smax): Likewise.
>  * config/aarch64/aarch64-simd.md (abd_3): Rename to...
>  (*aarch64_abd_3): ... Change RTL representation to
>  MINUS (MAX MIN).
> 
> 2019-04-02  Kyrylo Tkachov  
> 
>  * gcc.target/aarch64/abd_1.c: New test.


Re: [PATCH][AArch64] Use neon_dot_q type for 128-bit [US]DOT instructions where appropriate

2019-02-06 Thread James Greenhalgh
On Tue, Feb 05, 2019 at 11:52:10AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> For the Dot Product instructions we have the scheduling types neon_dot and 
> neon_dot_q for the 128-bit versions.
> It seems that we're only using the former though, not assigning the 
> neon_dot_q type anywhere.
> 
> This patch fixes that by adding the <q> mode attribute suffix to the type,
> similar to how we do it for other
> types in aarch64-simd.md.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

OK.

James

> 2019-05-02  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64-simd.md (aarch64_dot): Use 
> neon_dot<q> for type.
>  (aarch64_dot_lane): Likewise.
>  (aarch64_dot_laneq): Likewise.


Re: [PATCH][AArch64] Use implementation namespace consistently in arm_neon.h

2019-02-06 Thread James Greenhalgh
On Wed, Feb 06, 2019 at 07:52:42AM -0600, Kyrill Tkachov wrote:
> [resending with patch compressed]
> 
> Hi all,
> 
> We're somewhat inconsistent in arm_neon.h when it comes to using the 
> implementation namespace for local
> identifiers. This means things like:
> #define hash_abcd 0
> #define hash_e 1
> #define wk 2
> 
> #include "arm_neon.h"
> 
> uint32x4_t
> foo (uint32x4_t a, uint32_t b, uint32x4_t c)
> {
>return vsha1cq_u32 (a, b, c);
> }
> 
> don't compile.
> This patch fixes these issues throughout the whole of arm_neon.h
> Bootstrapped and tested on aarch64-none-linux-gnu.
> The advsimd-intrinsics.exp tests pass just fine.
> 
> Don't feel sorry for me having to write the ChangeLog. ./contrib/mklog.pl 
> automated the whole thing.
> 
> Ok for trunk?

I assume you've just run some simple seds over this file.

I'd rather review them than the patch; what were they?

James

> 2019-02-06  Kyrylo Tkachov  
> 
>  * config/aarch64/arm_neon.h (vaba_s8): Use __ in identifiers
>  consistently.




Re: [PATCH][wwwdocs][Arm][AArch64] Update changes with new features and flags.

2019-01-30 Thread James Greenhalgh
On Wed, Jan 23, 2019 at 04:43:02AM -0600, Tamar Christina wrote:
> Hi All,
> 
> This patch adds the documentation for Stack clash protection and Armv8.3-a 
> support to
> changes.html for GCC 9.
> I have validated the html using the W3C validator.
> 
> Ok for cvs?

Almost OK by me.

> 
> Thanks,
> Tamar
> 
> -- 

> Index: htdocs/gcc-9/changes.html
> ===
> RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-9/changes.html,v
> retrieving revision 1.35
> diff -u -r1.35 changes.html
> --- htdocs/gcc-9/changes.html 15 Jan 2019 13:17:49 -  1.35
> +++ htdocs/gcc-9/changes.html 22 Jan 2019 11:16:07 -
> @@ -214,6 +214,27 @@
>  -mtune=cortex-a76.cortex-a55 or as arguments to the 
> equivalent target
>  attributes and pragmas.
>
> +  
> +The AArch64 port now has support for stack clash protection using the
> +-fstack-clash-protection option.  The protection also works 
> for
> +SVE systems.  The probing interval/guard size can be set by using

I would drop this "also works" part. The option is just available for AArch64,
SVE is a part of AArch64.

Otherwise OK, though I don't remember if that is for me to OK, or someone
else.

Thanks,
James

> +--param stack-clash-protection-guard-size=12|16.
> +The value of this parameter must be in bytes represented as a power of 
> two.
> +The only two supported values for this parameter are 12 and 16 being
> +4Kb (2^12) and 64Kb (2^16) respectively.
> +
> +The default value is 16 (64Kb) and can be changed at configure
> +time using the flag 
> --with-stack-clash-protection-guard-size=12|16.
> +  
> +  
> +The Armv8.3-A complex number instructions are now supported via 
> intrinsics
> +when the option -march=armv8.3-a or equivalent is specified.
> +For the half-precision floating-point variants of these instructions use 
> the
> +architecture extension flag +fp16, e.g.
> +-march=armv8.3-a+fp16.
> +
> +The intrinsics are defined by the ACLE specification.
> +  
>  
>  
>  ARC
> @@ -250,6 +271,15 @@
>   (which have no known implementations) has been removed.
>   Note that Armv5T, Armv5TE and Armv5TEJ architectures remain supported.
>
> +  
> +The Armv8.3-A complex number instructions are now supported via 
> intrinsics
> +when the option -march=armv8.3-a or equivalent is specified.
> +For the half-precision floating-point variants of these instructions use 
> the
> +architecture extension flag +fp16, e.g.
> +-march=armv8.3-a+fp16.
> +
> +The intrinsics are defined by the ACLE specification.
> +  
>  
>  
>  
> 



Re: add tsv110 pipeline scheduling

2019-01-17 Thread James Greenhalgh
On Mon, Jan 14, 2019 at 08:02:45AM -0600, wuyuan (E) wrote:
> Hi  Kyrill:
>  The gcc 7.3.0 does not discard the store1 and load1 command; I did 
> not expect the community's latest gcc changes so large .   
>  now I downloaded the latest GCC code, put the patch into GCC source 
> code, the compiler can pass, thank you very much for your work!
>   
> Best Regards,
>   
> wuyuan

Please check your modeling of Advanced SIMD operations.

> +(define_insn_reservation
> +  "tsv110_neon_ld4_lane" 9
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "neon_load4_all_lanes,neon_load4_all_lanes_q,\
> +neon_load4_one_lane,neon_load4_one_lane_q"))
> +  "((tsv110_ls1*8)|(tsv110_ls2*8)|(tsv110_fsu1*8)|(tsv110_fsu2*8))")
> +

This model says you will reserve
 LS1 for 8 cycles,
  OR LS2 for 8 cycles,
  OR FSU1 for 8 cycles,
  OR FSU2 for 8 cycles.

> +(define_insn_reservation  "tsv110_neon_abd_aba" 4
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "neon_abd,neon_arith_acc"))
> +  "tsv110_fsu1,tsv110_fsu2")

This model says you will reserve
   FSU1 for 1 cycle,
  THEN FSU2 for 1 cycle.

> +(define_insn_reservation  "tsv110_neon_abd_aba_q" 4
> +  (and (eq_attr "tune" "tsv110")
> +   (eq_attr "type" "neon_arith_acc_q"))
> +  "(tsv110_fsu1,tsv110_fsu2)+(tsv110_fsu1,tsv110_fsu2)")
> +

This model says you will reserve:
  FSU1 for 1 cycle,
 THEN FSU2 for 1 cycle
  AND
  FSU1 for 1 cycle,
 THEN FSU2 for 1 cycle

Which would be a redundant AND.

Is that how you intend to model these operations?

Remember,

If you are looking to model a 'THEN' relationship you can use the ',' operator,
If you are looking to model an 'AND' relationship you can use the '+' operator,
If you are looking to model an 'OR' relationship you can use the '|' operator.

Taking Cortex-A57 as an example:

> (define_insn_reservation
>   "cortex_a57_neon_load_d" 11
>   (and (eq_attr "tune" "cortexa57")
>(eq_attr "cortex_a57_neon_type" "neon_load_d"))
>   "ca57_cx1_issue+ca57_cx2_issue,
>ca57_ls_issue+ca57_ls_issue,ca57_ldr*2")

This model says you will reserve:

   CX1_ISSUE AND CX2_ISSUE,
  THEN LS_ISSUE AND LS_ISSUE,
  THEN LDR for 2 cycles.

Please let me know if you plan to update the model. If I have misunderstood
your intentions, please accept my apologies.

Best Regards,
James Greenhalgh


> 
> 
>   * config/aarch64/aarch64-cores.def (tsv1100): Change scheduling model.
>   * config/aarch64/aarch64.md : Add "tsv110.md"
>   * config/aarch64/tsv110.md: New file.


Re: [PATCH] Fix arm_neon.h #pragma GCC target syntax (PR target/88734)

2019-01-17 Thread James Greenhalgh
On Thu, Jan 17, 2019 at 07:47:32AM -0600, Jakub Jelinek wrote:
> Hi!
> 
> arm_neon.h on both targets contained a couple of spots with invalid
> #pragma GCC target syntax.  This doesn't result in errors, just warnings and
> those warnings are suppressed in system headers, so are visible with
> -Wsystem-headers only. Anyway, the end result was that these pragmas were
> ignored when they were meant to be there.
> 
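
For illustration, the shape of the broken and fixed forms (a sketch of
the syntax pattern only; the exact strings in the headers differ):

/* Invalid: the extra parentheses do not parse, GCC warns (and that
   warning is suppressed inside system headers), and the pragma is
   silently ignored.  */
#pragma GCC target (("arch=armv8.2-a+sha3"))

/* Valid: the target pragma takes a plain string.  */
#pragma GCC target ("arch=armv8.2-a+sha3")
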
> The following patch fixes it.  Also, on aarch64 the sha3 intrinsics were
> wrapped with arch=armv8.2-a+crypto rather than arch=armv8.2-a+sha3, but
> because of the invalid syntax it wasn't covered in the testsuite.
> 
> Without the patch, besides -Wsystem-headers warnings on it, if somebody
> attempts to use those intrinsics in code compiled with target options that
> do not include the necessary ISA features, one will get ICEs rather than
> errors.
> 
> Bootstrapped/regtested on aarch64-linux, ok for trunk?
> 
> Note, I haven't included a testcase, as I'm not familiar enough with
> gcc.target/aarch64/ test style, but a test would roughly be: include the
> testcase from the PR, compile it with -march=something that doesn't include
> the needed ISA options, probably have a dg-skip-if if somebody overrides it
> from the --target_board and make sure it emits a dg-error message rather
> than ICE.

AArch64 parts of this are OK by me. Thanks for the fix.

James

> 
> 2019-01-17  Jakub Jelinek  
> 
>   PR target/88734
>   * config/arm/arm_neon.h: Fix #pragma GCC target syntax - replace
>   (("..."))) with ("...").
>   * config/aarch64/arm_neon.h: Likewise.  Use arch=armv8.2-a+sha3
>   instead of arch=armv8.2-a+crypto for vsha512hq_u64 etc. intrinsics.


Re: [PATCH] PR target/85596 Add --with-multilib-list doc for aarch64

2019-01-17 Thread James Greenhalgh
On Mon, Jan 07, 2019 at 09:07:35AM -0600, Christophe Lyon wrote:
> Hi,
> 
> This small patch adds a short description of --with-multilib-list for aarch64.
> OK?

OK.

Thanks,
James

> 
> Thanks,
> 
> Christophe

> 2019-01-07  Christophe Lyon  
> 
>   PR target/85596
>   * doc/install.texi (with-multilib-list): Document for aarch64.
> 


Re: [PATCH][AArch64] Initial -mcpu=ares tuning

2019-01-16 Thread James Greenhalgh
On Tue, Jan 15, 2019 at 09:29:46AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> This patch adds a tuning struct for the Arm Ares CPU and uses it for 
> -m{cpu,tune}=ares.
> The tunings are an initial attempt and may be improved upon in the future, 
> but they serve
> as a decent starting point for GCC 9.
> 
> With this I see a 1.3% improvement on SPEC2006 int and 0.3% on SPEC2006 fp 
> with -mcpu=ares.
> On SPEC2017 I see a 0.6% improvement in intrate and changes in the noise for 
> fprate.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

This only changes non-default tuning.

OK.

Are we nearly done with these types of changes in AArch64 for GCC 9? I'd
like to see us start acting like it is stage 4 soon!

James

> 2019-01-15  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64.c (ares_tunings): Define.
>  * config/aarch64/aarch64-cores.def (ares): Use the above.


Re: [PATCH][GCC][AArch64] Rename stack-clash CFA register to avoid clash.

2019-01-16 Thread James Greenhalgh
On Wed, Jan 16, 2019 at 11:03:41AM -0600, Tamar Christina wrote:
> Hi All,
> 
> We had multiple patches in flight that required the use of scratch registers in
> frame layout code.  As it happens two of these features picked the same 
> register
> and landed at around the same time.  As such there is a clash when both are 
> used
> at the same time.   This patch changes the temporary r15 to r11 for stack 
> clash
> and documents the "reserved" registers in the frame layout comment.
> 
> Cross compiled and regtested on aarch64-none-elf with SVE on by default and no
> issues.
> Bootstrapped on aarch64-none-linux-gnu and no issues.
> 
> Ok for trunk?

My comments are all on your new comment detailing the register allocations,
which I fully support.

This patch is OK with those changes.

> gcc/ChangeLog:
> 
> 2019-01-16  Tamar Christina  
> 
>   PR target/88851
>   * config/aarch64/aarch64.md (STACK_CLASH_SVE_CFA_REGNUM): New.
>   * config/aarch64/aarch64.c (aarch64_allocate_and_probe_stack_space): Use
>   it and document registers.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-01-16  Tamar Christina  
> 
>   PR target/88851
>   * gcc.target/aarch64/stack-check-cfa-3.c: Update test.
> 
> -- 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> fd60bddb1e1cbcb3dd46c319ccd182c7b9d1cd41..6a5f4956247b89932f955abbe96776a1b1ffb9cb
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -5317,11 +5317,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, 
> rtx temp2,
>   {
> /* This is done to provide unwinding information for the stack
>adjustments we're about to do, however to prevent the optimizers
> -  from removing the R15 move and leaving the CFA note (which would be
> +  from removing the R11 move and leaving the CFA note (which would be
>very wrong) we tie the old and new stack pointer together.
>The tie will expand to nothing but the optimizers will not touch
>the instruction.  */
> -   rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> +   rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
> emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
>  
> @@ -5548,7 +5548,18 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned 
> int reg,
> to the stack we track as implicit probes are the FP/LR stores.
>  
> For outgoing arguments we probe if the size is larger than 1KB, such that
> -   the ABI specified buffer is maintained for the next callee.  */
> +   the ABI specified buffer is maintained for the next callee.
> +
> +   Aside from LR, FP, IP1 and IP0 there are a few other registers that if 
> used
> +   would clash with other features:

How about...

 The following registers are reserved during frame layout and should not be
 used for any other purpose.

  - LR 

> +
> +   - r14 and r15: Used by mitigation code.

"Used for speculation tracking." seems more correct. 'Mitigation Code' is
too broad I think.

> +   - r16 and r17: Used by indirect tailcalls
> +   - r12 and r13: Used as temporaries for stack adjustment
> + (EP0_REGNUM/EP1_REGNUM)
> +   - r11: Used by stack clash protection when SVE is enabled.

Put them in numerical (or other logical) order?

> +
> +   These registers should be avoided in frame layout related code.  */

s/should/must/
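
Folding those review comments together, the final comment might read
something like the sketch below; this is the requested shape, not the
committed wording.

/* The following registers are reserved during frame layout and must not
   be used for any other purpose:

   - r11: used by stack clash protection when SVE is enabled
     (STACK_CLASH_SVE_CFA_REGNUM).
   - r12 and r13: used as temporaries for stack adjustment
     (EP0_REGNUM/EP1_REGNUM).
   - r14 and r15: used for speculation tracking.
   - r16 and r17: used by indirect tailcalls.  */
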

>  
>  /* Generate the prologue instructions for entry into a function.
> Establish the stack frame by decreasing the stack pointer with a



Re: [PATCH][GCC][AArch64] Fix big-endian neon-intrinsics ICEs

2019-01-16 Thread James Greenhalgh
On Mon, Jan 14, 2019 at 08:01:47AM -0600, Tamar Christina wrote:
> Hi All,
> 
> 
> This patch fixes some ICEs when the fcmla_lane intrinsics are used on
> big endian by correcting the lane indices and removing the hardcoded byte
> offset from subreg calls and instead use subreg_lowpart_offset.

Woops.

> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> Cross compiled and regtested on aarch64_be-none-elf and no issues.
> 
> Ok for trunk?

OK.

Thanks,
James

> gcc/ChangeLog:
> 
> 2019-01-14  Tamar Christina  
> 
>   * config/aarch64/aarch64-builtins.c (aarch64_simd_expand_args): Use 
> correct
>   max nunits for endian swap.
>   (aarch64_expand_fcmla_builtin): Correct subreg code.
>   * config/aarch64/aarch64-simd.md (aarch64_fcmla_lane,
>   aarch64_fcmla_laneqv4hf, aarch64_fcmlaq_lane): Correct 
> lane
>   endianness.
> 
> -- 



Re: [RFC][AArch64] Add support for system register based stack protector canary access

2019-01-10 Thread James Greenhalgh
On Mon, Dec 03, 2018 at 03:55:36AM -0600, Ramana Radhakrishnan wrote:
> For quite some time the kernel guys (more specifically Ard) have been
> talking about using a system register (sp_el0) and an offset from that 
> for a canary based access. This patchset adds support for a new set of
> command line options similar to how powerpc has done this.
> 
> I don't intend to change the defaults in userland, we've discussed this 
> for user-land in the past and as far as glibc and userland is concerned 
> we stick to the options as currently existing. The system register 
> option is really for the kernel to use along with an offset as they 
> control their ABI and this is a decision for them to make.
> 
> I did consider sticking this all under a mcmodel=kernel-small option but
> thought that would be a bit too aggressive. There is very little error
> checking I can do in terms of the system register being used and really
> the assembler would barf quite quickly in case things go wrong. I've
> managed to rebuild Ard's kernel tree with an additional patch that
> I will send to him. I haven't managed to boot this kernel.
> 
> There was an additional question asked about the performance 
> characteristics of this but it's a security feature and the kernel 
> doesn't have the luxury of a hidden symbol. Further since the kernel 
> uses sp_el0 for access everywhere and if they choose to use the same 
> register I don't think the performance characteristics would be too bad, 
> but that's a decision for the kernel folks to make when taking in the 
> feature into the kernel.
> 
> I still need to add some tests and documentation in invoke.texi but
> this is at the stage where it would be nice for some other folks
> to look at this.
> 
> The difference in code generated is as below.
> 
> extern void bar (char *);
> int foo (void)
> {
>char a[100];
>    bar (a);
> }
> 
> $GCC -O2  -fstack-protector-strong  vs 
> -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard=sysreg 
> -mstack-protector-guard-offset=1024 -fstack-protector-strong
> 
>   
> --- tst.s 2018-12-03 09:46:21.174167443 +
> +++ tst.s.1   2018-12-03 09:46:03.546257203 +
> @@ -15,15 +15,14 @@
>   mov x29, sp
>   str x19, [sp, 16]
>   .cfi_offset 19, -128
> - adrp x19, __stack_chk_guard
> - add x19, x19, :lo12:__stack_chk_guard
> - ldr x0, [x19]
> - str x0, [sp, 136]
> - mov x0, 0
> + mrs x19, sp_el0
>   add x0, sp, 32
> + ldr x1, [x19, 1024]
> + str x1, [sp, 136]
> + mov x1, 0
>   bl  bar
>   ldr x0, [sp, 136]
> - ldr x1, [x19]
> + ldr x1, [x19, 1024]
>   eor x1, x0, x1
>   cbnz x1, .L5
> 
> 
> 
> 
> I will be afk tomorrow and day after but this is to elicit some comments 
> and for Ard to try this out with his kernel patches.
> 
> Thoughts ?

I didn't see an answer on list to Ard's questions about the command-line logic.
Remember to also fix up the error message concerns Florian raised.

That said, if Jakub is happy with this in Stage 4, I am too.

My biggest concern is the -mstack-protector-guard-reg interface, which
is unchecked user input and so opens up nasty ways to force the compiler
towards out of bounds accesses (e.g.
-mstack-protector-guard-reg="What memory is at %10")

Thanks,
James

> 
> regards
> Ramana
> 
> gcc/ChangeLog:
> 
> 2018-11-23  Ramana Radhakrishnan  
> 
>  * config/aarch64/aarch64-opts.h (enum stack_protector_guard): New
>  * config/aarch64/aarch64.c (aarch64_override_options_internal): 
> Handle
>  and put in error checks for stack protector guard options.
>  (aarch64_stack_protect_guard): New.
>  (TARGET_STACK_PROTECT_GUARD): Define.
>  * config/aarch64/aarch64.md (UNSPEC_SSP_SYSREG): New.
>  (reg_stack_protect_address): New.
>  (stack_protect_set): Adjust for SSP_GLOBAL.
>  (stack_protect_test): Likewise.
>  * config/aarch64/aarch64.opt (-mstack-protector-guard-reg): New.
>  (-mstack-protector-guard): Likewise.
>  (-mstack-protector-guard-offset): Likewise.
>  * doc/invoke.texi: Document new AArch64 options.


Re: [PATCH][AArch64] Use Q-reg loads/stores in movmem expansion

2019-01-09 Thread James Greenhalgh
On Fri, Dec 21, 2018 at 06:30:49AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> Our movmem expansion currently emits TImode loads and stores when copying 
> 128-bit chunks.
> This generates X-register LDP/STP sequences as these are the most preferred 
> registers for that mode.
> 
> For the purpose of copying memory, however, we want to prefer Q-registers.
> This uses one fewer register, so helping with register pressure.
> It also allows merging of 256-bit and larger copies into Q-reg LDP/STP, 
> further helping code size.
> 
> The implementation of that is easy: we just use a 128-bit vector mode 
> (V4SImode in this patch)
> rather than a TImode.
> 
> With this patch the testcase:
> #define N 8
> int src[N], dst[N];
> 
> void
> foo (void)
> {
>__builtin_memcpy (dst, src, N * sizeof (int));
> }
> 
> generates:
> foo:
>  adrpx1, src
>  add x1, x1, :lo12:src
>  adrpx0, dst
>  add x0, x0, :lo12:dst
>  ldp q1, q0, [x1]
>  stp q1, q0, [x0]
>  ret
> 
> instead of:
> foo:
>  adrpx1, src
>  add x1, x1, :lo12:src
>  adrpx0, dst
>  add x0, x0, :lo12:dst
>  ldp x2, x3, [x1]
>  stp x2, x3, [x0]
>  ldp x2, x3, [x1, 16]
>  stp x2, x3, [x0, 16]
>  ret
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> I hope this is a small enough change for GCC 9.
> One could argue that it is finishing up the work done this cycle to support 
> Q-register LDP/STPs
> 
> I've seen this give about 1.8% on 541.leela_r on Cortex-A57 with other 
> changes in SPEC2017 in the noise
> but there is reduction in code size everywhere (due to more LDP/STP-Q pairs 
> being formed)
> 
> Ok for trunk?

I'm surprised by the logic. If we want to use 256-bit copies, shouldn't we
be explicit about that in the movmem code, rather than using 128-bit copies
that get merged. Why do TImode loads require two X registers? Shouldn't we
just fix TImode loads to use Q registers if that is preferable?

I'm not opposed to the principle of using LDP-Q in our movmem, but is this
the best way to make that happen?

Thanks,
James

> 2018-12-21  Kyrylo Tkachov  
> 
>  * config/aarch64/aarch64.c (aarch64_expand_movmem): Use V4SImode for
>  128-bit moves.
> 
> 2018-12-21  Kyrylo Tkachov  
> 
>  * gcc.target/aarch64/movmem-q-reg_1.c: New test.



Re: [PATCH 6/9][GCC][AArch64] Add Armv8.3-a complex intrinsics

2019-01-09 Thread James Greenhalgh
On Fri, Dec 21, 2018 at 11:57:55AM -0600, Tamar Christina wrote:
> Hi All,
> 
> This updated patch adds NEON intrinsics and tests for the Armv8.3-a complex
> multiplication and add instructions with a rotate along the Argand plane.
> 
> The instructions are documented in the ArmARM[1] and the intrinsics 
> specification
> will be published on the Arm website [2].
> 
> The Lane versions of these instructions are special in that they always 
> select a pair: using index 0 means selecting lanes 0 and 1.  Because of this
> the range check for the intrinsics requires special handling.
> 
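
A usage sketch of that pairwise lane selection (the intrinsic name and
signature follow the ACLE specification cited below; the wrapper
function and its name are invented):

#include <arm_neon.h>

/* Compile with -march=armv8.3-a or equivalent.  Lane index 0 selects
   the complex pair held in lanes 0 and 1 of B; for the 64-bit
   float32x2_t form it is the only valid index, since the vector holds
   exactly one complex number.  */
float32x2_t
mac_pair (float32x2_t r, float32x2_t a, float32x2_t b)
{
  return vcmla_lane_f32 (r, a, b, 0);
}
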
> There're a few complexities with the intrinsics for the laneq variants for 
> AArch64:
> 
> 1) The architecture does not have a version for V2SF. However since the 
> instructions always
>    select a pair of values, the only valid index for V2SF would have been 0.
> As such the lane
>versions for V2SF are all mapped to the 3SAME variant of the instructions 
> and not the By element
>variant.
> 
> 2) Because of #1 above, the laneq versions of the instruction become
> tricky. The valid indices are 0 and 1.
>For index 0 we treat it the same as the lane version of this instruction 
> and just pass the lower half of the
>register to the 3SAME instruction.  When index is 1 we extract the upper 
> half of the register and pass that to
>the 3SAME version of the instruction.
> 
> 3) The architecture forbids the laneq version of the V4HF instruction from
> having an index greater than 1.  For index 0-1
>we do no extra work. For index 2-3 we extract the upper parts of the 
> register and pass that to the instruction it would
>have normally used, and re-map the index into a range of 0-1.
> 
> [1] 
> https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> [2] https://developer.arm.com/docs/101028/latest
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> Additional runtime checks done but not posted with the patch.
> 
> Ok for trunk?

OK with a refactor.

This isn't a great fit for Stage 4, but it is also completely self-contained.

I hope we can slow down new content in the AArch64 back-end and start
stabilising the port for release.

Thanks,
James

> @@ -1395,6 +1494,80 @@ aarch64_expand_builtin (tree exp,
>   }
>  
>return target;
> +
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF:
> +case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF:

Pull all of this out to a new function please.

> +  int bcode = fcode - AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE - 1;
> +  aarch64_fcmla_laneq_builtin_datum* d
> + = &aarch64_fcmla_lane_builtin_data[bcode];
> +  machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require ();
> +  op0 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 0)));
> +  rtx op1 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 1)));
> +  rtx op2 = force_reg (quadmode, expand_normal (CALL_EXPR_ARG (exp, 2)));
> +  tree tmp = CALL_EXPR_ARG (exp, 3);
> +  rtx lane_idx = expand_expr (tmp, NULL_RTX, VOIDmode, 
> EXPAND_INITIALIZER);
> +
> +  /* Validate that the lane index is a constant.  */
> +  if (!CONST_INT_P (lane_idx))
> + {
> +   error ("%Kargument %d must be a constant immediate", exp, 4);
> +   return const0_rtx;
> + }
> +
> +  /* Validate that the index is within the expected range.  */
> +  int nunits = GET_MODE_NUNITS (quadmode).to_constant ();
> +  aarch64_simd_lane_bounds (lane_idx, 0, nunits / 2, exp);
> +
> +  /* Keep to GCC-vector-extension lane indices in the RTL.  */
> +  lane_idx = aarch64_endian_lane_rtx (quadmode, INTVAL (lane_idx));
> +
> +  /* Generate the correct register and mode.  */
> +  int lane = INTVAL (lane_idx);
> +
> +  if (lane < nunits / 4)
> + op2 = simplify_gen_subreg (d->mode, op2, quadmode, 0);
> +  else
> + {
> +   /* Select the upper 64 bits, either a V2SF or V4HF, this however
> +  is quite messy, as the operation required even though simple
> +  doesn't have a simple RTL pattern, and seems it's quite hard to
> +  define using a single RTL pattern.  The target generic version
> +  gen_highpart_mode generates code that isn't optimal.  */
> +   rtx temp1 = gen_reg_rtx (d->mode);
> +   rtx temp2 = gen_reg_rtx (DImode);
> +   temp1 = simplify_gen_subreg (d->mode, op2, quadmode, 0);
> +   temp1 = simplify_gen_subreg (V2DImode, temp1, d->mode, 0);
> +   emit_insn (gen_aarch64_get_lanev2di (temp2, temp1 , const1_rtx));
> +   op2 = simplify_gen_subreg (d->mode, temp2, GET_MODE 

Re: [PATCH 3/3][GCC][AARCH64] Add support for pointer authentication B key

2019-01-07 Thread James Greenhalgh
On Fri, Dec 21, 2018 at 09:00:10AM -0600, Sam Tebbs wrote:
> On 11/9/18 11:04 AM, Sam Tebbs wrote:
 


> Attached is an improved patch with "hint" removed from the test scans, 
> pauth_hint_num_a and pauth_hint_num_b merged into pauth_hint_num and the 
> "gcc_assert (cfun->machine->frame.laid_out)" removal reverted since was 
> an unnecessary change.
> 
> OK for trunk?

While the AArch64 parts look OK to me and are buried behind an option so are
relatively safe even though we're late in development, you'll need someone
else to approve the libgcc changes. Especially as you change a generic
routine with an undocumented (?) AArch64-specific change.

Thanks,
James

> 
> gcc/
> 2018-12-21  Sam Tebbs
> 
>   * config/aarch64/aarch64-builtins.c (aarch64_builtins): Add
>   AARCH64_PAUTH_BUILTIN_AUTIB1716 and AARCH64_PAUTH_BUILTIN_PACIB1716.
>   * config/aarch64/aarch64-builtins.c (aarch64_init_pauth_hint_builtins):
>   Add autib1716 and pacib1716 initialisation.
>   * config/aarch64/aarch64-builtins.c (aarch64_expand_builtin): Add checks
>   for autib1716 and pacib1716.
>   * config/aarch64/aarch64-protos.h (aarch64_key_type,
>   aarch64_post_cfi_startproc): Define.
>   * config/aarch64/aarch64-protos.h (aarch64_ra_sign_key): Define extern.
>   * config/aarch64/aarch64.c (aarch64_return_address_signing_enabled): Add
>   check for b-key.
>   * config/aarch64/aarch64.c (aarch64_ra_sign_key,
>   aarch64_post_cfi_startproc, aarch64_handle_pac_ret_b_key): Define.
>   * config/aarch64/aarch64.h (TARGET_ASM_POST_CFI_STARTPROC): Define.
>   * config/aarch64/aarch64.c (aarch64_pac_ret_subtypes): Add "b-key".
>   * config/aarch64/aarch64.md (unspec): Add UNSPEC_AUTIA1716,
>   UNSPEC_AUTIB1716, UNSPEC_AUTIASP, UNSPEC_AUTIBSP, UNSPEC_PACIA1716,
>   UNSPEC_PACIB1716, UNSPEC_PACIASP, UNSPEC_PACIBSP.
>   * config/aarch64/aarch64.md (do_return): Add check for b-key.
>   * config/aarch64/aarch64.md (sp): Replace
>   pauth_hint_num_a with pauth_hint_num.
>   * config/aarch64/aarch64.md (1716): Replace
>   pauth_hint_num_a with pauth_hint_num.
>   * config/aarch64/aarch64.opt (msign-return-address=): Deprecate.
>   * config/aarch64/iterators.md (PAUTH_LR_SP): Add UNSPEC_AUTIASP,
>   UNSPEC_AUTIBSP, UNSPEC_PACIASP, UNSPEC_PACIBSP.
>   * config/aarch64/iterators.md (PAUTH_17_16): Add UNSPEC_AUTIA1716,
>   UNSPEC_AUTIB1716, UNSPEC_PACIA1716, UNSPEC_PACIB1716.
>   * config/aarch64/iterators.md (pauth_mnem_prefix): Add UNSPEC_AUTIA1716,
>   UNSPEC_AUTIB1716, UNSPEC_PACIA1716, UNSPEC_PACIB1716, UNSPEC_AUTIASP,
>   UNSPEC_AUTIBSP, UNSPEC_PACIASP, UNSPEC_PACIBSP.
>   * config/aarch64/iterators.md (pauth_hint_num_a): Replace
>   UNSPEC_PACI1716 and UNSPEC_AUTI1716 with UNSPEC_PACIA1716 and
>   UNSPEC_AUTIA1716 respectively.
>   * config/aarch64/iterators.md (pauth_hint_num_a): Rename to 
> pauth_hint_num
>   and add UNSPEC_PACIBSP, UNSPEC_AUTIBSP, UNSPEC_PACIB1716, 
> UNSPEC_AUTIB1716.
> 
> gcc/testsuite
> 2018-12-21  Sam Tebbs
> 
>   * gcc.target/aarch64/return_address_sign_1.c (dg-final): Replace
>   "autiasp" and "paciasp" with "hint\t29 // autisp" and
>   "hint\t25 // pacisp" respectively.
>   * gcc.target/aarch64/return_address_sign_2.c (dg-final): Replace
>   "paciasp" with "hint\t25 // pacisp".
>   * gcc.target/aarch64/return_address_sign_3.c (dg-final): Replace
>   "paciasp" and "autiasp" with "pacisp" and "autisp" respectively.
>   * gcc.target/aarch64/return_address_sign_b_1.c: New file.
>   * gcc.target/aarch64/return_address_sign_b_2.c: New file.
>   * gcc.target/aarch64/return_address_sign_b_3.c: New file.
>   * gcc.target/aarch64/return_address_sign_b_exception.c: New file.
>   * gcc.target/aarch64/return_address_sign_builtin.c: New file
> 
> libgcc/
> 2018-12-21  Sam Tebbs
> 
>   * config/aarch64/aarch64-unwind.h (aarch64_cie_signed_with_b_key): New
>   function.
>   * config/aarch64/aarch64-unwind.h (aarch64_post_extract_frame_addr,
>   aarch64_post_frob_eh_handler_addr): Add check for b-key.
>   * unwind-dw2-fde.c (get_cie_encoding): Add check for 'B' in augmentation
>   string.
>   * unwind-dw2.c (extract_cie_info): Add check for 'B' in augmentation
>   string.
> 


Re: [PATCH 2/3][GCC][AARCH64] Add new -mbranch-protection option to combine pointer signing and BTI

2019-01-07 Thread James Greenhalgh
On Thu, Dec 20, 2018 at 10:38:42AM -0600, Sam Tebbs wrote:
> On 11/22/18 4:54 PM, Sam Tebbs wrote:



> 
> Hi all,
> 
> Attached is an updated patch with branch_protec_type renamed to 
> branch_protect_type, some unneeded ATTRIBUTE_USED removed and an added 
> use of ARRAY_SIZE.
> 
> Below is the updated changelog.
> 
> OK for trunk? I have committed the preceding patch in the series.


OK. Please get this in soon, as we really want to be closing down for Stage 4
(and fix a few bugs in return :-) ).

Thanks,
James

> 
> gcc/ChangeLog:
> 
> 2018-12-20  Sam Tebbs
> 
>   * config/aarch64/aarch64.c (BRANCH_PROTECT_STR_MAX,
>   aarch64_parse_branch_protection,
>   struct aarch64_branch_protect_type,
>   aarch64_handle_no_branch_protection,
>   aarch64_handle_standard_branch_protection,
>   aarch64_validate_mbranch_protection,
>   aarch64_handle_pac_ret_protection,
>   aarch64_handle_attr_branch_protection,
>   accepted_branch_protection_string,
>   aarch64_pac_ret_subtypes,
>   aarch64_branch_protect_types,
>   aarch64_handle_pac_ret_leaf): Define.
>   (aarch64_override_options_after_change_1): Add check for
>   accepted_branch_protection_string.
>   (aarch64_override_options): Add check for
>   accepted_branch_protection_string.
>   (aarch64_option_save): Save accepted_branch_protection_string.
>   (aarch64_option_restore): Save
>   accepted_branch_protection_string.
>   * config/aarch64/aarch64.c (aarch64_attributes): Add branch-protection.
>   * config/aarch64/aarch64.opt: Add mbranch-protection. Deprecate
>   msign-return-address.
>   * doc/invoke.texi: Add mbranch-protection.
> 
> gcc/testsuite/ChangeLog:
> 
> 2018-12-20  Sam Tebbs
> 
>   * (gcc.target/aarch64/return_address_sign_1.c,
>   gcc.target/aarch64/return_address_sign_2.c,
>   gcc.target/aarch64/return_address_sign_3.c (__attribute__)): Change
>   option to -mbranch-protection.
>   * gcc.target/aarch64/(branch-protection-option.c,
>   branch-protection-option-2.c, branch-protection-attr.c,
>   branch-protection-attr-2.c): New file.
> 



Re: [PATCH, GCC, AARCH64, 5/6] Enable BTI : Add new pass for BTI.

2018-12-19 Thread James Greenhalgh
On Fri, Dec 14, 2018 at 10:09:03AM -0600, Sudakshina Das wrote:



> I have updated the patch according to our discussions offline.
> The md pattern is now split into 4 patterns and I have added a new
> test for the setjmp case along with some comments where missing.

This is OK for trunk.

Thanks,
James

> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
>   Ramana Radhakrishnan  
> 
>   * config.gcc (aarch64*-*-*): Add aarch64-bti-insert.o.
>   * gcc/config/aarch64/aarch64.h: Update comment for
>   TRAMPOLINE_SIZE.
>   * config/aarch64/aarch64.c (aarch64_asm_trampoline_template):
>   Update if bti is enabled.
>   * config/aarch64/aarch64-bti-insert.c: New file.
>   * config/aarch64/aarch64-passes.def (INSERT_PASS_BEFORE): Insert
>   bti pass.
>   * config/aarch64/aarch64-protos.h (make_pass_insert_bti):
>   Declare the new bti pass.
>   * config/aarch64/aarch64.md (unspecv): Add UNSPECV_BTI_NOARG,
>   UNSPECV_BTI_C, UNSPECV_BTI_J and UNSPECV_BTI_JC.
>   (bti_noarg, bti_j, bti_c, bti_jc): New define_insns.
>   * config/aarch64/t-aarch64: Add rule for aarch64-bti-insert.o.
> 
> *** gcc/testsuite/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * gcc.target/aarch64/bti-1.c: New test.
>   * gcc.target/aarch64/bti-2.c: New test.
>   * gcc.target/aarch64/bti-3.c: New test.
>   * lib/target-supports.exp
>   (check_effective_target_aarch64_bti_hw): Add new check for
>   BTI hw.
> 
> Thanks
> Sudi


Re: [ping] allow target (OS) SUBTARGET_OVERRIDE_OPTIONS on aarch64

2018-12-12 Thread James Greenhalgh
On Wed, Dec 12, 2018 at 09:42:05AM -0600, Olivier Hainque wrote:
> Ping for one of the changes last proposed here:
> 
>  https://gcc.gnu.org/ml/gcc-patches/2018-11/msg00761.html
> 
> Submitted separately as an attempt to facilitate the review
> process.
> 
> This one proposes the possibility for target (OS) configurations
> to provide a SUBTARGET_OVERRIDE_OPTIONS macro as other CPU ports
> do, needed by our aarch64-vxworks7 port to come.
> 
> Got access to a linux box, so in addition to the Ada nighty
> testing we do on the cross port, bootstrapped and regression
> tested on aarch64-linux.
> 
> OK to commit ?

OK.

Thanks,
James

> 2018-12-12  Olivier Hainque  
> 
>   * config/aarch64/aarch64.c (aarch64_override_options): Once arch,
>   cpu and tune were validated, insert SUBTARGET_OVERRIDE_OPTIONS if
>   defined.



Re: [PATCH, GCC, AARCH64, 3/6] Restrict indirect tail calls to x16 and x17

2018-12-07 Thread James Greenhalgh
On Thu, Nov 29, 2018 at 10:56:46AM -0600, Sudakshina Das wrote:
> Hi
> 
> On 02/11/18 18:37, Sudakshina Das wrote:
> > Hi
> > 
> > This patch is part of a series that enables ARMv8.5-A in GCC and
> > adds Branch Target Identification Mechanism.
> > (https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)
> > 
> > This patch changes the registers that are allowed for indirect tail
> > calls. We are choosing to restrict these to only x16 or x17.
> > 
> > Indirect tail calls are special in a way that they convert a call
> > statement (BLR instruction) to a jump statement (BR instruction). For
> > the best possible use of Branch Target Identification Mechanism, we
> > would like to place a "BTI C" (call) at the beginning of the function
> > which is only compatible with BLRs and BR X16/X17. In order to make
> > indirect tail calls compatible with this scenario, we are restricting
> > the TAILCALL_ADDR_REGS.
> > 
> > In order to use x16/x17 for this purpose, we also had to change the use
> > of these registers in the epilogue/prologue handling. For this purpose
> > we are now using x12 and x13 named as EP0_REGNUM and EP1_REGNUM as
> > scratch registers for epilogue and prologue.
> > 
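
As a concrete example (hypothetical code, not taken from the patch):

/* At -O2 the call below becomes an indirect tail call, i.e. a BR rather
   than a BLR.  With this change the branch target is constrained to x16
   or x17, so a callee that begins with "BTI C" (which accepts BLR, and
   BR only from x16/x17) still tolerates the tail call.  */
typedef int (*handler_t) (int);

int
dispatch (handler_t fn, int arg)
{
  return fn (arg);  /* expected: br x16 (or br x17) */
}
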
> > Bootstrapped and regression tested with aarch64-none-linux-gnu. Updated
> > test. Ran Spec2017 and no performance hit.
> > 
> > Is this ok for trunk?

While this isn't strictly needed outside of compilation for targets with BTI
protection enabled, I can well appreciate the simplification in our backend
code to avoid special cases in these areas.

I don't foresee a high likelihood of performance issues from this patch,
but please do keep an eye out for any reports as we move through stage 3.

This is OK for trunk.

Thanks,
James


> > 
> > Thanks
> > Sudi
> > 
> > 
> > *** gcc/ChangeLog***
> > 
> > 2018-xx-xx  Sudakshina Das  
> > 
> >* config/aarch64/aarch64.c (aarch64_expand_prologue): Use new
> >epilogue/prologue scratch registers EP0_REGNUM and EP1_REGNUM.
> >(aarch64_expand_epilogue): Likewise.
> >(aarch64_output_mi_thunk): Likewise
> >* config/aarch64/aarch64.h (REG_CLASS_CONTENTS): Change
> > TAILCALL_ADDR_REGS
> >to x16 and x17.
> >* config/aarch64/aarch64.md: Define EP0_REGNUM and EP1_REGNUM.
> > 
> > *** gcc/testsuite/ChangeLog ***
> > 
> > 2018-xx-xx  Sudakshina Das  
> > 
> >* gcc.target/aarch64/test_frame_17.c: Update to check for
> > EP0_REGNUM instead of IP0_REGNUM and add test case.
> > 
> I have edited the patch to take out a change that was not needed as part
> of this patch in aarch64_expand_epilogue. The only change now happening
> there is as mentioned in the ChangeLog to replace the uses of IP0/IP1.
> ChangeLog still applies.


Re: [PATCH 5/9][GCC][AArch64/Arm] Add auto-vectorization tests.

2018-11-28 Thread James Greenhalgh
On Sun, Nov 11, 2018 at 04:27:33AM -0600, Tamar Christina wrote:
> Hi All,
> 
> This patch adds tests for AArch64 and Arm to test the autovectorization
> of complex numbers using the Armv8.3-a instructions.
> 
> This patch enables them only for AArch64 at this point.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> The instructions have also been tested on aarch64-none-elf on a Armv8.3-a 
> model
> and -march=Armv8.3-a+fp16 and all tests pass.
> 
> Ok for trunk?

The style seems a bit weird, and there's a whole lot of redundancy I'm
not keen on. Why have .C files which are always skipped and whose only
purpose is to be included by another file? Don't we normally make them .h
files to miss the glob in the testsuite?

Can we clean up at all?

Some of these have scan-assembler for the Arm backend, which I'm guessing
is just a rebase issue.

Thanks,
James

> 
> Thanks,
> Tamar
> 
> gcc/testsuite/ChangeLog:
> 
> 2018-11-11  Tamar Christina  
> 
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-270.c: New 
> test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-90.c: New 
> test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_3.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_4.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_5.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_6.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex-autovec.c: New 
> test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_3.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_4.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_5.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcadd-complex_6.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex-autovec.c: New 
> test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_180_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_180_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_180_3.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_270_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_270_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_270_3.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_3.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_90_1.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_90_2.c: New test.
>   * gcc.target/aarch64/advsimd-intrinsics/vcmla-complex_90_3.c: New test.
> 
> -- 

> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-270.c
>  
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-270.c
> new file mode 100644
> index 
> ..8f660f392153c3a6a83b31486e275be316c6ad2b
> --- /dev/null
> +++ 
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-270.c
> @@ -0,0 +1,13 @@
> +/* { dg-skip-if "" { *-*-* } } */
> +
> +#define N 200
> +
> +__attribute__ ((noinline))
> +void calc (TYPE a[N], TYPE b[N], TYPE *c)
> +{
> +  for (int i=0; i < N; i+=2)
> +{
> +  c[i] = a[i] + b[i+1];
> +  c[i+1] = a[i+1] - b[i];
> +}
> +}
> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-90.c
>  
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-90.c
> new file mode 100644
> index 
> ..14014b9d4f2c41e75be3e253d2e47e639e4224c0
> --- /dev/null
> +++ 
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays-autovec-90.c
> @@ -0,0 +1,12 @@
> +/* { dg-skip-if "" { *-*-* } } */
> +#define N 200
> +
> +__attribute__ ((noinline))
> +void calc (TYPE a[N], TYPE b[N], TYPE *c)
> +{
> +  for (int i=0; i < N; i+=2)
> +{
> +  c[i] = a[i] - b[i+1];
> +  c[i+1] = a[i+1] + b[i];
> +}
> +}
> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_1.c 
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_1.c
> new file mode 100644
> index 
> ..627f2e78daee9c4a4f86c2071080b4114820c209
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcadd-arrays_1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do run } */
> +/* 

Re: [PATCH 4/9][GCC][AArch64/Arm] Add new testsuite directives to check complex instructions.

2018-11-28 Thread James Greenhalgh
On Sun, Nov 11, 2018 at 04:27:04AM -0600, Tamar Christina wrote:
> Hi All,
> 
> This patch adds new testsuite directives for both Arm and AArch64 to support
> testing of the complex arithmetic operations from Armv8.3-a.
> 
> Bootstrap and Regtest on aarch64-none-linux-gnu, arm-none-gnueabihf and 
> x86_64-pc-linux-gnu
> and no regressions.
> 
> The instructions have also been tested on aarch64-none-elf and arm-none-eabi
> on an Armv8.3-a model with -march=Armv8.3-a+fp16, and all tests pass.
> 
> Ok for trunk?

OK by me on principle, but I don't speak TCL and can't approve the Arm part.

Please ask a testsuite maintainer.

Thanks,
James

> 
> Thanks,
> Tamar
> 
> gcc/testsuite/ChangeLog:
> 
> 2018-11-11  Tamar Christina  
> 
>   * lib/target-supports.exp
>   (check_effective_target_arm_v8_3a_complex_neon_ok_nocache,
>   check_effective_target_arm_v8_3a_complex_neon_ok,
>   add_options_for_arm_v8_3a_complex_neon,
>   check_effective_target_arm_v8_3a_complex_neon_hw,
>   check_effective_target_vect_complex_rot_N): New.
> 
> -- 
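
For reference, a test would consume the new directives via the usual
DejaGnu idiom; a sketch, where only the directive names come from the
ChangeLog above and the body and scan pattern are illustrative:

  /* { dg-do compile } */
  /* { dg-require-effective-target arm_v8_3a_complex_neon_ok } */
  /* { dg-add-options arm_v8_3a_complex_neon } */

  /* ... a test body that should vectorize to a complex instruction ...  */

  /* { dg-final { scan-assembler "fcadd" } } */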


Re: [PATCH 3/9][GCC][AArch64] Add autovectorization support for Complex instructions

2018-11-28 Thread James Greenhalgh
On Mon, Nov 12, 2018 at 06:31:45AM -0600, Tamar Christina wrote:
> Hi Kyrill,
> 
> > Hi Tamar,
> > 
> > On 11/11/18 10:26, Tamar Christina wrote:
> > > Hi All,
> > >
> > > This patch adds the expander support for autovectorization of complex
> > > number operations, such as complex addition with a rotation along the
> > > Argand plane.  It also adds support for complex FMA.
> > >
> > > The instructions are described in the ArmARM [1] and are available from 
> > > Armv8.3-a onwards.
> > >
> > > Concretely, this generates
> > >
> > > f90:
> > > mov x3, 0
> > > .p2align 3,,7
> > > .L2:
> > > ldr q0, [x0, x3]
> > > ldr q1, [x1, x3]
> > > fcadd   v0.2d, v0.2d, v1.2d, #90
> > > str q0, [x2, x3]
> > > add x3, x3, 16
> > > cmp x3, 3200
> > > bne .L2
> > > ret
> > >
> > > now instead of
> > >
> > > f90:
> > > mov x4, x1
> > > mov x1, x2
> > > add x3, x4, 31
> > > add x2, x0, 31
> > > sub x3, x3, x1
> > > sub x2, x2, x1
> > > cmp x3, 62
> > > mov x3, 62
> > > ccmp    x2, x3, 0, hi
> > > bls .L5
> > > mov x2, x4
> > > add x3, x0, 3200
> > > .p2align 3,,7
> > > .L3:
> > > ld2 {v2.2d - v3.2d}, [x0], 32
> > > ld2 {v4.2d - v5.2d}, [x2], 32
> > > cmp x0, x3
> > > fsub    v0.2d, v2.2d, v5.2d
> > > fadd    v1.2d, v4.2d, v3.2d
> > > st2 {v0.2d - v1.2d}, [x1], 32
> > > bne .L3
> > > ret
> > > .L5:
> > > add x6, x0, 8
> > > add x5, x4, 8
> > > add x2, x1, 8
> > > mov x3, 0
> > > .p2align 3,,7
> > > .L2:
> > > ldr d1, [x0, x3]
> > > ldr d3, [x5, x3]
> > > ldr d0, [x6, x3]
> > > ldr d2, [x4, x3]
> > > fsub    d1, d1, d3
> > > fadd    d0, d0, d2
> > > str d1, [x1, x3]
> > > str d0, [x2, x3]
> > > add x3, x3, 16
> > > cmp x3, 3200
> > > bne .L2
> > > ret
> > >
> > > For complex additions with a 90° rotation along the Argand plane.
> > >
> > > [1] 
> > > https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> > >
> > > Bootstrap and Regtest on aarch64-none-linux-gnu, arm-none-gnueabihf and 
> > > x86_64-pc-linux-gnu
> > > are still on going but previous patch showed no regressions.
> > >
> > > The instructions have also been tested on aarch64-none-elf and 
> > > arm-none-eabi on a Armv8.3-a model
> > > and -march=Armv8.3-a+fp16 and all tests pass.
> > >
> > > Ok for trunk?

OK with the comment typos fixed.

> > > gcc/ChangeLog:
> > >
> > > 2018-11-11  Tamar Christina  
> > >
> > > * config/aarch64/aarch64-simd.md (aarch64_fcadd,
> > > fcadd3, aarch64_fcmla,
> > > fcmla4): New.
> > > * config/aarch64/aarch64.h (TARGET_COMPLEX): New.
> > > * config/aarch64/iterators.md (UNSPEC_FCADD90, UNSPEC_FCADD270,
> > > UNSPEC_FCMLA, UNSPEC_FCMLA90, UNSPEC_FCMLA180, UNSPEC_FCMLA270): 
> > > New.
> > > (FCADD, FCMLA): New.
> > > (rot, rotsplit1, rotsplit2): New.
> > > * config/arm/types.md (neon_fcadd, neon_fcmla): New.

Should we push these to an existing class for now, and split them later when
someone provides a scheduling model which makes use of them?
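
For readers following the series, a minimal sketch of the style of loop the
new expanders target, mirroring the autovec tests in patch 5/9 (the options
and the even/odd real/imaginary layout are assumptions, not from this patch):

  #define N 200
  /* Even elements hold real parts, odd elements imaginary parts; this
     is complex addition with the second operand rotated by 90 degrees,
     i.e. one fcadd #90 per vector at -O3 -march=armv8.3-a.  */
  void
  f90 (double a[N], double b[N], double c[N])
  {
    for (int i = 0; i < N; i += 2)
      {
        c[i]     = a[i]     - b[i + 1];
        c[i + 1] = a[i + 1] + b[i];
      }
  }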

> > diff --git a/gcc/config/aarch64/aarch64-simd.md 
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 
> > c4be3101fdec930707918106cd7c53cf7584553e..12a91183a98ea23015860c77a97955cb1b30bfbb
> >  100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -419,6 +419,63 @@
> >   }
> >   )
> >   
> > +;; The fcadd and fcmla patterns are made UNSPEC for the explicitly due to 
> > the

s/for the explicitly/explicitly

> > +;; fact that their usage need to guarantee that the source vectors are

s/need/needs

> > +;; contiguous.  It would be wrong to describe the operation without being 
> > able
> > +;; to describe the permute that is also required, but even if that is done
> > +;; the permute would have been created as a LOAD_LANES which means the 
> > values
> > +;; in the registers are in the wrong order.
> > +(define_insn "aarch64_fcadd"
> > +  [(set (match_operand:VHSDF 0 "register_operand" "=w")
> > +   (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
> > +  (match_operand:VHSDF 2 "register_operand" "w")]
> > +  FCADD))]
> > +  "TARGET_COMPLEX"
> > +  "fcadd\t%0., %1., %2., #"
> > +  [(set_attr "type" "neon_fcadd")]
> > +)



Re: [PATCH][GCC][AARCH64] Replace calls to strtok with strtok_r in aarch64 attribute handling code

2018-11-28 Thread James Greenhalgh
On Fri, Nov 23, 2018 at 08:22:49AM -0600, Sam Tebbs wrote:
> Hi all,
> 
> The AArch64 general attribute handling code uses the strtok function to
> separate comma-delimited attributes in a string. This causes problems for,
> and interferes with, attribute-specific handling code that also uses strtok
> to separate attribute arguments, since strtok isn't re-entrant. This patch
> replaces calls to strtok with strtok_r to avoid these problems when
> adding/modifying attribute behaviour in the future.
> 
> Bootstrapped and regression tested on aarch64-none-elf with no regressions.
> 
> OK for trunk?

OK.

James
> 
> gcc/ChangeLog:
> 
> 2018-11-23  Sam Tebbs
> 
>   * config/aarch64/aarch64.c (aarch64_process_target_attr): Replace
>   calls to strtok with strtok_r.
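
For reference, a minimal sketch of the re-entrancy hazard being fixed here;
the attribute string and the nested tokenization are illustrative, not
taken from the patch:

  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    char attrs[] = "arch=armv8.2-a,tune=cortex-a75";
    char *outer, *inner;

    /* strtok_r keeps its position in caller-provided state, so the
       outer walk over comma-delimited attributes survives the inner
       tokenization of each attribute's arguments.  Plain strtok would
       clobber its single hidden position on the inner call.  */
    for (char *attr = strtok_r (attrs, ",", &outer); attr;
         attr = strtok_r (NULL, ",", &outer))
      {
        char *name = strtok_r (attr, "=", &inner);
        char *value = strtok_r (NULL, "=", &inner);
        printf ("%s -> %s\n", name, value ? value : "(none)");
      }
    return 0;
  }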


Re: [PATCH][AArch64][2/3] Correct type attribute for mul and mneg instructions

2018-11-28 Thread James Greenhalgh
On Mon, Nov 26, 2018 at 11:36:43AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> In the AArch64 ISA the MUL and MNEG instructions are actually aliases of
> MADD and MSUB.  Therefore they should have the type attribute mla, rather
> than mul, which should only be used for AArch32 32-bit multiplication
> instructions.
> 
> This will ensure more consistent scheduling decisions.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?

OK in principle. Did you audit the pipeline models to check this doesn't
change the scheduling class in an undesirable way for any of our supported
targets? OK if so; if not, please run that audit and figure out the right
thing to do to resolve anything it turns up.

Thanks,
James



Re: [PATCH][AArch64][3/3] Introduce mla64 type

2018-11-28 Thread James Greenhalgh
On Mon, Nov 26, 2018 at 11:36:47AM -0600, Kyrill Tkachov wrote:
> Hi all,
> 
> On some cores the X-register MADD/MSUB (and hence MUL and MNEG) instructions
> may behave differently from the W-register forms, and the scheduling models
> may want to reflect that.
> That is currently not possible because both sizes use the mla scheduling type.
> 
> This patch introduces the mla64 type that is used to represent the 64-bit 
> MADD,MSUB,MUL,MNEG instructions.
> It annotates the necessary aarch64.md patterns and updates the existing 
> aarch64 scheduling models to handle it.
> It is currently handled the same way as the 32-bit forms so as not to 
> introduce a difference in behaviour.
> However, now we have the freedom to model it differently, if we so choose.
> Sameera, Steve, Philipp, this is an FYI that you can update the relevant .md
> files to model these instructions in a more precise way, if warranted (and
> if this patch is approved).
> 
> Bootstrapped and tested on aarch64-none-linux (and tested on arm-none-eabi).
> 
> Ok for trunk?

OK for AArch64. You can self-approve the Arm part.

James

> 2018-11-26  Kyrylo Tkachov  
> 
>  * config/arm/types.md (mla64): New type.
>  * config/arm/xgene1.md: Handle mla64.
>  * config/arm/cortex-a57.md: Likewise.
>  * config/arm/cortex-a53.md: Likewise.
>  * config/aarch64/thunderx2t99.md: Likewise.
>  * config/aarch64/thunderx.md: Likewise.
>  * config/aarch64/falkor.md: Likewise.
>  * config/aarch64/saphira.md: Likewise.
>  * config/aarch64/iterators.md (is_64_suf): New mode attribute.
>  * config/aarch64/aarch64.md (mul3): Use is_64_suf in type.
>  (madd): Likewise.
>  (*msub): Likewise.
>  (*mul_neg): Likewise.
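
As a concrete example of the split the new type introduces, a sketch with
illustrative function names (the comments show the instructions GCC emits
for these expressions):

  /* 64-bit multiply-add: madd x0, x1, x2, x0 -> new type mla64.  */
  long long
  madd64 (long long a, long long b, long long c)
  {
    return a + b * c;
  }

  /* 32-bit multiply-add: madd w0, w1, w2, w0 -> keeps type mla.  */
  int
  madd32 (int a, int b, int c)
  {
    return a + b * c;
  }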



Re: [PATCH v3] [aarch64] Correct the maximum shift amount for shifted operands.

2018-11-28 Thread James Greenhalgh
On Wed, Nov 28, 2018 at 07:08:02AM -0600, Philipp Tomsich wrote:
> 
> 
> On 28.11.2018, at 13:10, Richard Earnshaw (lists) <richard.earns...@arm.com> wrote:
> 
> On 26/11/2018 19:50, Christoph Muellner wrote:
> The aarch64 ISA specification allows a left shift amount to be applied
> after extension in the range of 0 to 4 (encoded in the imm3 field).
> 
> This is true for at least the following instructions:
> 
> * ADD (extend register)
> * ADDS (extended register)
> * SUB (extended register)
> 
> The result of this patch can be seen when compiling the following code:
> 
> uint64_t myadd(uint64_t a, uint64_t b)
> {
>  return a+(((uint8_t)b)<<4);
> }
> 
> Without the patch the following sequence will be generated:
> 
>  <myadd>:
>   0: d37c1c21  ubfiz x1, x1, #4, #8
>   4: 8b20  add x0, x1, x0
>   8: d65f03c0  ret
> 
> With the patch the ubfiz will be merged into the add instruction:
> 
>  <myadd>:
>   0: 8b211000  add x0, x0, w1, uxtb #4
>   4: d65f03c0  ret
> 
> Tested with "make check" and no regressions found.
> 
> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Christoph Muellner  <christoph.muell...@theobroma-systems.com>
>	    Philipp Tomsich  <philipp.toms...@theobroma-systems.com>
> 
> * config/aarch64/aarch64.c (aarch64_uxt_size): Correct the maximum
> shift amount for shifted operands.
> 
> *** gcc/testsuite/ChangeLog ***
> 
> 2018-xx-xx  Christoph Muellner  <christoph.muell...@theobroma-systems.com>
>	    Philipp Tomsich  <philipp.toms...@theobroma-systems.com>
> 
> * gcc.target/aarch64/extend.c: Adjust the testcases to cover
> the changed shift amount.
> 
> 
> This is OK.  Thanks.
> 
> R.
> 
> PS, I was sufficiently surprised by this that I went and checked the
> original commit (it's not an obvious off-by-one error).  But it does
> appear that it's been this way since the code was originally added
> (prior to the initial publication of the port) and there's no obvious
> reason why.
> 
> Since we don’t have any Reported-by: tags in GCC, the credit for initially
> finding and reporting this goes to AppliedMicro's original chip verification
> team for the XGene1.

Good spot that team! This also took me down a rabbit hole of Architecture
Reference Manuals and old source trees wondering how we got off by one. My
best guess is that 0-3 just feels like a more natural range than 0-4...

Thanks for the patch and the distraction!

James



Re: Patch ping (was Re: [PATCH] Fix aarch64_compare_and_swap* constraints (PR target/87839))

2018-11-21 Thread James Greenhalgh
On Tue, Nov 20, 2018 at 11:04:46AM -0600, Jakub Jelinek wrote:
> Hi!
> 
> On Tue, Nov 13, 2018 at 10:28:16AM +0100, Jakub Jelinek wrote:
> > 2018-11-13  Jakub Jelinek  
> > 
> > PR target/87839
> > * config/aarch64/atomics.md (@aarch64_compare_and_swap): Use
> > rIJ constraint for aarch64_plus_operand rather than rn.
> > 
> > * gcc.target/aarch64/pr87839.c: New test.
> 
> I'd like to ping this patch; Kyrill has kindly tested it. OK for trunk?

OK.

Thanks,
James


Re: [PATCH][AArch64] PR79262: Adjust vector cost

2018-11-09 Thread James Greenhalgh
On Mon, Jan 22, 2018 at 09:22:27AM -0600, Richard Biener wrote:
> On Mon, Jan 22, 2018 at 4:01 PM, Wilco Dijkstra  
> wrote:
> > PR79262 has been fixed for almost all AArch64 CPUs; however, the example
> > is still vectorized in a few cases, resulting in lower performance.
> > Increase the cost of vector-to-scalar moves so it is more similar to the
> > other vector costs. As a result -mcpu=cortex-a53 no longer vectorizes the
> > testcase - libquantum and SPECv6 performance improves.
> >
> > OK for commit?
> 
> It would be better to dissect this cost into vec_to_scalar and vec_extract 
> where
> vec_to_scalar really means getting at the scalar value of a vector of
> uniform values
> which most targets can do without any instruction (just use a subreg).
> 
> I suppose we could also make vec_to_scalar equal to vector extraction and 
> remove
> the uses for the other case (reduction vector result to scalar reg).

I have dug up Richard's comments from last year, which you appear to have
ignored and made no reference to when resubmitting the patch.

Please don't do that. Carefully consider Richard's review feedback before
resubmitting this patch.

To reiterate, it is not OK for trunk.

Thanks,
James

> 
> Richard.
> 
> > ChangeLog:
> > 2018-01-22  Wilco Dijkstra  
> >
> > PR target/79262
> > * config/aarch64/aarch64.c (generic_vector_cost): Adjust 
> > vec_to_scalar_cost.


Re: [PATCH][AArch64] PR79262: Adjust vector cost

2018-11-09 Thread James Greenhalgh
On Fri, Nov 09, 2018 at 08:14:27AM -0600, Wilco Dijkstra wrote:
> PR79262 has been fixed for almost all AArch64 CPUs; however, the example is
> still vectorized in a few cases, resulting in lower performance.  Increase
> the cost of vector-to-scalar moves so it is more similar to the other vector
> costs. As a result -mcpu=cortex-a53 no longer vectorizes the testcase -
> libquantum and SPECv6 performance improves.
> 
> OK for commit?

No.

We have 7 unique target tuning structures in the AArch64 backend, of which
only one has a 2x ratio between scalar_int_cost and vec_to_scalar_cost. Other
ratios are 1, 3, 8, 3, 4, 6.

What makes this choice correct? What makes it more correct than what we
have now? On which of the 28 entries in config/aarch64/aarch64-cores.def does
performance improve? Are the Spec benchmarks sufficiently representative to
change the generic vectorisation costs?

Please validate the performance effect of this patch, which changes default
code generation for everyone, on more than one testcase in a bug report.

Thanks,
James

> ChangeLog:
> 2018-01-22  Wilco Dijkstra  
> 
>     PR target/79262
>     * config/aarch64/aarch64.c (generic_vector_cost): Adjust 
> vec_to_scalar_cost.
> --


Re: [PATCH] Remove extra memory allocation of strings.

2018-11-08 Thread James Greenhalgh
On Tue, Oct 23, 2018 at 08:17:43AM -0500, Martin Liška wrote:
> Hello.
> 
> As a follow-up patch I would like to remove a redundant string allocation
> which is not needed in my opinion.
> 
> That bootstraps on aarch64-linux.


OK,

Thanks,
James

> From a21a626055442635057985323bb42ef29526e182 Mon Sep 17 00:00:00 2001
> From: marxin 
> Date: Mon, 22 Oct 2018 15:18:23 +0200
> Subject: [PATCH] Remove extra memory allocation of strings.
> 
> gcc/ChangeLog:
> 
> 2018-10-22  Martin Liska  
> 
>   * config/aarch64/aarch64.c (aarch64_parse_arch): Do not copy
>   string to a stack buffer.
>   (aarch64_parse_cpu): Likewise.
>   (aarch64_parse_tune): Likewise.


Re: [PATCH, GCC, AARCH64, 6/6] Enable BTI: Add configure option for BTI and PAC-RET

2018-11-07 Thread James Greenhalgh
On Fri, Nov 02, 2018 at 01:38:46PM -0500, Sudakshina Das wrote:
> Hi
> 
> This patch is part of a series that enables ARMv8.5-A in GCC and
> adds Branch Target Identification Mechanism.
> (https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)
> 
> This patch is adding a new configure option for enabling BTI and return
> address signing by default: --enable-standard-branch-protection.
> This is equivalent to -mbranch-protection=standard, which would
> imply -mbranch-protection=pac-ret+bti.
> 
> Bootstrapped and regression tested with aarch64-none-linux-gnu with
> and without the configure option turned on.
> Also tested on aarch64-none-elf with and without the configure option with a
> BTI-enabled AEM. Only 2 regressions, and these were because newlib
> requires patches to protect hand-coded libraries with BTI.
> 
> Is this ok for trunk?

With a tweak to the comment above your changes in aarch64.c, yes this is OK.

> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64.c (aarch64_override_options): Add case to check
>   configure option to set BTI and Return Address Signing.
>   * configure.ac: Add --enable-standard-branch-protection and
>   --disable-standard-branch-protection.
>   * configure: Regenerated.
>   * doc/install.texi: Document the same.
> 
> *** gcc/testsuite/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * gcc.target/aarch64/bti-1.c: Update test to not add command
>   line option when configure with bti.
>   * gcc.target/aarch64/bti-2.c: Likewise.
>   * lib/target-supports.exp
>   (check_effective_target_default_branch_protection):
>   Add configure check for --enable-standard-branch-protection.
> 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 12a55a640de4fdc5df21d313c7ea6841f1daf3f2..a1a5b7b464eaa2ce67ac66d9aea837159590aa07
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -11558,6 +11558,26 @@ aarch64_override_options (void)
>if (!selected_tune)
>  selected_tune = selected_cpu;
>  
> +  if (aarch64_enable_bti == 2)
> +{
> +#ifdef TARGET_ENABLE_BTI
> +  aarch64_enable_bti = 1;
> +#else
> +  aarch64_enable_bti = 0;
> +#endif
> +}
> +
> +  /* No command-line option yet.  */

This is too broad. Can you narrow this down to which command line option this
relates to, and what the expected default behaviours are (for both LP64 and
ILP32).

Thanks,
James

> +  if (accepted_branch_protection_string == NULL && !TARGET_ILP32)
> +{
> +#ifdef TARGET_ENABLE_PAC_RET
> +  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
> +  aarch64_ra_sign_key = AARCH64_KEY_A;
> +#else
> +  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
> +#endif
> +}
> +
>  #ifndef HAVE_AS_MABI_OPTION
>/* The compiler may have been configured with 2.23.* binutils, which does
>   not have support for ILP32.  */



Re: [PATCH, GCC, AARCH64, 4/6] Enable BTI: Add new to -mbranch-protection.

2018-11-07 Thread James Greenhalgh
On Fri, Nov 02, 2018 at 01:38:25PM -0500, Sudakshina Das wrote:
> Hi
> 
> This patch is part of a series that enables ARMv8.5-A in GCC and
> adds Branch Target Identification Mechanism.
> (https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)
> 
> NOTE: This patch is dependent on Sam Tebbs patch to deprecate
> -msign-return-address and add new -mbranch-protection option
> https://gcc.gnu.org/ml/gcc-patches/2018-11/msg00104.html
> 
> This patch updates the CLI of -mbranch-protection to add "bti" as a new
> type of branch protection and also adds it to the definitions of "none" and
> "standard". Since the BTI instructions, just like the return address
> signing instructions, are in the HINT space, this option is not limited
> to the ARMv8.5-A architecture version.
> 
> The option does not really do anything functional.
> The functional changes are in the next patch. I am initializing the
> target variable aarch64_enable_bti to 2 since I am also adding a
> configure option in a later patch, and a value different from 0 and 1
> helps identify whether it has already been updated.
> 
> Bootstrapped and regression tested with aarch64-none-linux-gnu.
> Is this ok for trunk?

OK.

Thanks,
James

> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64-protos.h (aarch64_bti_enabled):
>   Declare.
>   * config/aarch64/aarch64.c
>   (aarch64_handle_no_branch_protection): Disable bti for
>   -mbranch-protection=none.
>   (aarch64_handle_standard_branch_protection): Enable bti for
>   -mbranch-protection=standard.
>   (aarch64_handle_bti_protection): Enable bti for "bti" in the
>   string to -mbranch-protection.
>   (aarch64_bti_enabled): Check if bti is enabled.
>   * config/aarch64/aarch64.opt: Declare target variable.
>   * doc/invoke.texi: Add bti to the -mbranch-protection
>   documentation.



Re: [PATCH, GCC, AARCH64, 2/6] Add new arch command line feaures from ARMv8.5-A

2018-11-07 Thread James Greenhalgh
On Fri, Nov 02, 2018 at 01:37:41PM -0500, Sudakshina Das wrote:
> Hi
> 
> This patch is part of a series that enables ARMv8.5-A in GCC and
> adds Branch Target Identification Mechanism.
> (https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)
> 
> This patch adds all the command line features that are added by ARMv8.5-A.
> Optional extensions to armv8.5-a:
> +rng : Random number Generation Instructions.
> +memtag : Memory Tagging Extension.
> 
> ARMv8.5-A features that are optional for older architectures:
> +sb : Speculation barrier instruction.
> +ssbs: Speculative Store Bypass Safe instruction.
> +predres: Execution and Data Prediction Restriction instructions.
> 
> All of the above only affect the assembler and have already (or almost,
> for a couple of cases) gone into the trunk of binutils.
> 
> Bootstrapped and regression tested with aarch64-none-linux-gnu.
> 
> Is this ok for trunk?

OK, but it will need rebasing to keep the AARCH64_FL_* in order.

Thanks,
James

> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64-option-extensions.def: Define
>   AARCH64_OPT_EXTENSION for memtag, rng, sb, ssbs and predres.
>   * gcc/config/aarch64/aarch64.h (AARCH64_FL_RNG): New.
>   (AARCH64_FL_MEMTAG, ARCH64_FL_SB, AARCH64_FL_SSBS): New.
>   (AARCH64_FL_PREDRES): New.
>   (AARCH64_FL_FOR_ARCH8_5): Add AARCH64_FL_SB, AARCH64_FL_SSBS and
>   AARCH64_FL_PREDRES by default.
>   * gcc/doc/invoke.texi: Document rng, memtag, sb, ssbs and
>   predres.
> 


Re: [PATCH, GCC, AARCH64, 1/6] Enable ARMv8.5-A in gcc

2018-11-07 Thread James Greenhalgh
On Fri, Nov 02, 2018 at 01:37:33PM -0500, Sudakshina Das wrote:
> Hi
> 
> This patch is part of a series that enables ARMv8.5-A in GCC and
> adds Branch Target Identification Mechanism.
> (https://developer.arm.com/products/architecture/cpu-architecture/a-profile/exploration-tools)
> 
> This patch adds the -march option for armv8.5-a.
> 
> Bootstrapped and regression tested with aarch64-none-linux-gnu.
> Is this ok for trunk?

One minor tweak, otherwise OK.

> *** gcc/ChangeLog ***
> 
> 2018-xx-xx  Sudakshina Das  
> 
>   * config/aarch64/aarch64-arches.def: Define AARCH64_ARCH for
>   ARMv8.5-A.
>   * gcc/config/aarch64/aarch64.h (AARCH64_FL_V8_5): New.
>   (AARCH64_FL_FOR_ARCH8_5, AARCH64_ISA_V8_5): New.
>   * gcc/doc/invoke.texi: Document ARMv8.5-A.

> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> fa9af26fd40fd23b1c9cd6da9b6300fd77089103..b324cdd2fede33af13c03362750401f9eb1c9a90
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -170,6 +170,8 @@ extern unsigned aarch64_architecture_version;
>  #define AARCH64_FL_SHA3(1 << 18)  /* Has ARMv8.4-a SHA3 and 
> SHA512.  */
>  #define AARCH64_FL_F16FML (1 << 19)  /* Has ARMv8.4-a FP16 extensions.  
> */
>  #define AARCH64_FL_RCPC8_4(1 << 20)  /* Has ARMv8.4-a RCPC extensions.  
> */
> +/* ARMv8.5-A architecture extensions.  */
> +#define AARCH64_FL_V8_5(1 << 22)  /* Has ARMv8.5-A features.  */
>  
>  /* Statistical Profiling extensions.  */
>  #define AARCH64_FL_PROFILE(1 << 21)

Let's keep this in order. 20, 21, 22.

Thanks,
James




Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions

2018-10-31 Thread James Greenhalgh
On Wed, Oct 31, 2018 at 04:55:26PM -0500, Richard Henderson wrote:
> On 10/31/18 5:51 PM, Will Deacon wrote:
> > Aha, maybe this is the problem. An acquire fence on AArch64 is implemented
> > using a DMB LD instruction, which orders prior reads against subsequent
> > reads and writes. However, the architecture says:
> > 
> >   | The ST<op> instructions, and LD<op> instructions where the destination
> >   | register is WZR or XZR, are not regarded as doing a read for the purpose
> >   | of a DMB LD barrier.
> > 
> > and so therefore an ST atomic is not affected by a subsequent acquire fence,
> > whereas an LD atomic is.
> > 
> > Does that help at all?
> 
> Yes it does, thanks.  Lest this come up again, let's document this.

Good idea. OK.

James

> >From 804f690fc8ebaa436b97ea4c9fef830f9cd2b873 Mon Sep 17 00:00:00 2001
> From: Richard Henderson 
> Date: Wed, 19 Sep 2018 22:18:09 +
> Subject: [PATCH] aarch64: Remove early clobber from ATOMIC_LDOP scratch
> 
>   * config/aarch64/atomics.md (aarch64_atomic__lse):
>   The scratch register need not be early-clobber.  Document the reason
>   why we cannot use ST.
> ---
>  gcc/config/aarch64/atomics.md | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index 2198649b1be..8944b5690b5 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -263,6 +263,18 @@
>}
>  )
>  
> +;; It is tempting to want to use ST<op> here for relaxed and release
> +;; memory models here.  However, that is incompatible with the C++
> +;; memory model for the following case:
> +;;
> +;;   atomic_fetch_add(ptr, 1, memory_order_relaxed);
> +;;   atomic_thread_fence(memory_order_acquire);
> +;;
> +;; The problem is that the architecture says that ST<op> (and LD<op>
> +;; insns where the destination is XZR) are not regarded as a read.
> +;; However we also implement the acquire memory barrier with DMB LD,
> +;; and so the ST<op> is not blocked by the barrier.
> +
>  (define_insn "aarch64_atomic__lse"
>[(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
>   (unspec_volatile:ALLI
> @@ -270,7 +282,7 @@
>  (match_operand:ALLI 1 "register_operand" "r")
>  (match_operand:SI 2 "const_int_operand")]
>ATOMIC_LDOP))
> -   (clobber (match_scratch:ALLI 3 "="))]
> +   (clobber (match_scratch:ALLI 3 "=r"))]
>"TARGET_LSE"
>{
> enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
> -- 
> 2.17.2
> 



Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors

2018-10-30 Thread James Greenhalgh
This one needs some other reviewers copied in, who may have missed that
it is not an AArch64-only patch (it looks fine to me).

James

On Tue, Oct 02, 2018 at 11:19:10AM -0500, Richard Henderson wrote:
>   * optabs-libfuncs.c (build_libfunc_function_visibility):
>   New, split out from...
>   (build_libfunc_function): ... here.
>   (init_one_libfunc_visibility): New, split out from ...
>   (init_one_libfunc): ... here.
> ---
>  gcc/optabs-libfuncs.h |  2 ++
>  gcc/optabs-libfuncs.c | 26 --
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
> index 0669ea1fdd7..cf39da36887 100644
> --- a/gcc/optabs-libfuncs.h
> +++ b/gcc/optabs-libfuncs.h
> @@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
>  void gen_satfractuns_conv_libfunc (convert_optab, const char *,
>  machine_mode, machine_mode);
>  
> +tree build_libfunc_function_visibility (const char *, symbol_visibility);
>  tree build_libfunc_function (const char *);
> +rtx init_one_libfunc_visibility (const char *, symbol_visibility);
>  rtx init_one_libfunc (const char *);
>  rtx set_user_assembler_libfunc (const char *, const char *);
>  
> diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
> index bd0df8baa37..73a28e9ca7a 100644
> --- a/gcc/optabs-libfuncs.c
> +++ b/gcc/optabs-libfuncs.c
> @@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash
>  /* A table of previously-created libfuncs, hashed by name.  */
>  static GTY (()) hash_table *libfunc_decls;
>  
> -/* Build a decl for a libfunc named NAME.  */
> +/* Build a decl for a libfunc named NAME with visibility VIS.  */
>  
>  tree
> -build_libfunc_function (const char *name)
> +build_libfunc_function_visibility (const char *name, symbol_visibility vis)
>  {
>/* ??? We don't have any type information; pretend this is "int foo ()".  
> */
>tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
> @@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
>DECL_EXTERNAL (decl) = 1;
>TREE_PUBLIC (decl) = 1;
>DECL_ARTIFICIAL (decl) = 1;
> -  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
> +  DECL_VISIBILITY (decl) = vis;
>DECL_VISIBILITY_SPECIFIED (decl) = 1;
>gcc_assert (DECL_ASSEMBLER_NAME (decl));
>  
> @@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
>return decl;
>  }
>  
> +/* Build a decl for a libfunc named NAME.  */
> +
> +tree
> +build_libfunc_function (const char *name)
> +{
> +  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Return a libfunc for NAME, creating one if we don't already have one.
> -   The returned rtx is a SYMBOL_REF.  */
> +   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
>  
>  rtx
> -init_one_libfunc (const char *name)
> +init_one_libfunc_visibility (const char *name, symbol_visibility vis)
>  {
>tree id, decl;
>hashval_t hash;
> @@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
>  {
>/* Create a new decl, so that it can be passed to
>targetm.encode_section_info.  */
> -  decl = build_libfunc_function (name);
> +  decl = build_libfunc_function_visibility (name, vis);
>*slot = decl;
>  }
>return XEXP (DECL_RTL (decl), 0);
>  }
>  
> +rtx
> +init_one_libfunc (const char *name)
> +{
> +  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
>  
>  rtx
> -- 
> 2.17.1
> 


Re: [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:13AM -0500, Richard Henderson wrote:
> The LSE CASP instruction requires values to be placed in even
> register pairs.  A solution involving two additional register
> classes was rejected in favor of the much simpler solution of
> simply requiring all TImode values to be aligned.

OK.

Thanks,
James

> 
>   * config/aarch64/aarch64.c (aarch64_hard_regno_mode_ok): Force
>   16-byte modes held in GP registers to use an even regno.
> ---
>  gcc/config/aarch64/aarch64.c | 12 
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 49b47382b5d..ce4d7e51d00 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1451,10 +1451,14 @@ aarch64_hard_regno_mode_ok (unsigned regno, 
> machine_mode mode)
>if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
>  return mode == Pmode;
>  
> -  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
> -return true;
> -
> -  if (FP_REGNUM_P (regno))
> +  if (GP_REGNUM_P (regno))
> +{
> +  if (known_le (GET_MODE_SIZE (mode), 8))
> + return true;
> +  else if (known_le (GET_MODE_SIZE (mode), 16))
> + return (regno & 1) == 0;
> +}
> +  else if (FP_REGNUM_P (regno))
>  {
>if (vec_flags & VEC_STRUCT)
>   return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
> -- 
> 2.17.1
> 
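
For context, a sketch of the kind of 16-byte atomic operation this change
serves; the function name is illustrative, and with LSE enabled the
compare-and-swap can expand to CASP, whose operand pairs must start at an
even register number - which aligning all TImode values guarantees:

  #include <stdatomic.h>

  _Bool
  cas16 (_Atomic unsigned __int128 *p,
         unsigned __int128 expected, unsigned __int128 desired)
  {
    return atomic_compare_exchange_strong (p, &expected, desired);
  }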


Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
> When the result of an operation is not used, we can ignore the
> result by storing to XZR.  For two of the memory models, using
> XZR with LD<op> has a preferred assembler alias, ST<op>.

ST<op> has different semantics to LD<op>; in particular, ST<op> is not
ordered by a DMB LD, so this could weaken the LDADD and break C11 semantics.

The relevant Arm ARM text is:

  If the destination register is not one of WZR or XZR, LDADDA and
  LDADDAL load from memory with acquire semantics

  LDADDL and LDADDAL store to memory with release semantics.

  LDADD has no memory ordering requirements.

I'm taking this to mean that even if the result is unused, using XZR is not
a valid transformation; it weakens the expected acquire semantics to
unordered.

The example I have from Will Deacon on an internal bug database is:

  P0 (atomic_int* y,atomic_int* x) {
atomic_store_explicit(x,1,memory_order_relaxed);
atomic_thread_fence(memory_order_release);
atomic_store_explicit(y,1,memory_order_relaxed);
  }

  P1 (atomic_int* y,atomic_int* x) {
int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed);
atomic_thread_fence(memory_order_acquire);
int r1 = atomic_load_explicit(x,memory_order_relaxed);
  }

  The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal.

This example comes from a while back in my memory, so I'm copying Will in
for any more detailed questions.

My impression is that this transformation is not safe, and so the patch is
not OK.

Thanks,
James

> 
>   * config/aarch64/atomics.md (aarch64_atomic__lse):
>   Use ST<op> for relaxed and release models; load to XZR otherwise;
>   remove the now unnecessary scratch register.
> 
>   * gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
>   * gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
> ---
>  .../gcc.target/aarch64/atomic-inst-ldadd.c| 18 ---
>  .../gcc.target/aarch64/atomic-inst-ldlogic.c  | 54 ---
>  gcc/config/aarch64/atomics.md | 15 +++---
>  3 files changed, 57 insertions(+), 30 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c 
> b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> index 4b2282c6861..db2206186b4 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> @@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
>  TEST (sub_load, SUB_LOAD)
>  TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
>  
> -/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
> +/* { dg-final { scan-assembler-times "staddb\t" 8} } */
> +/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
>  
> -/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
> +/* { dg-final { scan-assembler-times "staddh\t" 8} } */
> +/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
>  
> -/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
> +/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
>  /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
> -/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
> +/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
>  /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
> +/* { dg-final { scan-assembler-times "stadd\t" 16} } */
> +/* { dg-final { scan-assembler-times "staddl\t" 16} } */
>  
>  /* { dg-final { scan-assembler-not "ldaxr\t" } } */
>  /* { dg-final { scan-assembler-not "stlxr\t" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c 
> b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> index 4879d52b9b4..b8a53e0a676 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> @@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
>  
>  /* Load-OR.  */
>  
> -/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
> +/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
> +/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */

Re: [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:08AM -0500, Richard Henderson wrote:
> Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
> iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
> logical for ldclr aka bic.

OK.

Thanks,
James

> 
>   * config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
>   (aarch64_atomic_ldop_supported_p): Remove.
>   (aarch64_gen_atomic_ldop): Remove.
>   * config/aarch64/atomic.md (atomic_):
>   Fully expand LSE operations here.
>   (atomic_fetch_): Likewise.
>   (atomic__fetch): Likewise.
>   (aarch64_atomic__lse): Drop atomic_op iterator
>   and use ATOMIC_LDOP instead; use register_operand for the input;
>   drop the split and emit insns directly.
>   (aarch64_atomic_fetch__lse): Likewise.
>   (aarch64_atomic__fetch_lse): Remove.
>   (@aarch64_atomic_load): Remove.
> ---
>  gcc/config/aarch64/aarch64-protos.h |   2 -
>  gcc/config/aarch64/aarch64.c| 176 -
>  gcc/config/aarch64/atomics.md   | 197 +++-
>  gcc/config/aarch64/iterators.md |   5 +-
>  4 files changed, 108 insertions(+), 272 deletions(-)
> 



Re: [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:07AM -0500, Richard Henderson wrote:
> Allow zero as an input; fix constraints; avoid unnecessary split.

OK.

James

> 
>   * config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
>   (aarch64_gen_atomic_ldop): Don't call it.
>   * config/aarch64/atomics.md (atomic_exchange):
>   Use aarch64_reg_or_zero.
>   (aarch64_atomic_exchange): Likewise.
>   (aarch64_atomic_exchange_lse): Remove split; remove & from
>   operand 0; use aarch64_reg_or_zero for input; merge ...
>   (@aarch64_atomic_swp): ... this and remove.
> ---


Re: [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:06AM -0500, Richard Henderson wrote:
> Do not zero-extend the input to the cas for subword operations;
> instead, use the appropriate zero-extending compare insns.
> Correct the predicates and constraints for the immediate expected operand.

OK, modulo two very dull style comments.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index fbec54fe5da..0e2b85de1e3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1613,6 +1613,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
>return cc_reg;
>  }
>  
> +/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
> +
> +static rtx
> +aarch64_gen_compare_reg_maybe_ze(RTX_CODE code, rtx x, rtx y,
> + machine_mode y_mode)

Space before the bracket: aarch64_gen_compare_reg_maybe_ze (RTX_CODE

> @@ -14187,26 +14197,32 @@ aarch64_expand_compare_and_swap (rtx operands[])
>/* The CAS insn requires oldval and rval overlap, but we need to
>have a copy of oldval saved across the operation to tell if
>the operation is successful.  */
> -  if (mode == QImode || mode == HImode)
> - rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
> -  else if (reg_overlap_mentioned_p (rval, oldval))
> -rval = copy_to_mode_reg (mode, oldval);
> -  else
> - emit_move_insn (rval, oldval);
> +  if (reg_overlap_mentioned_p (rval, oldval))
> +rval = copy_to_mode_reg (r_mode, oldval);
> +  else 

Trailing space on else.

> + emit_move_insn (rval, gen_lowpart (r_mode, oldval));
> +
>emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
>  newval, mod_s));
> -  aarch64_gen_compare_reg (EQ, rval, oldval);
> +  cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
>  }

Thanks,
James



Re: [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation

2018-10-30 Thread James Greenhalgh
On Tue, Oct 02, 2018 at 11:19:05AM -0500, Richard Henderson wrote:
> The cas insn is a single insn, and if expanded properly need not
> be split after reload.  Use the proper inputs for the insn.

OK.

Thanks,
James

> 
>   * config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
>   Force oldval into the rval register for TARGET_LSE; emit the compare
>   during initial expansion so that it may be deleted if unused.
>   (aarch64_gen_atomic_cas): Remove.
>   * config/aarch64/atomics.md (@aarch64_compare_and_swap_lse):
>   Change = to +r for operand 0; use match_dup for operand 2;
>   remove is_weak and mod_f operands as unused.  Drop the split
>   and merge with...
>   (@aarch64_atomic_cas): ... this pattern's output; remove.
>   (@aarch64_compare_and_swap_lse): Similarly.
>   (@aarch64_atomic_cas): Similarly.


Re: [PATCH] Provide extension hint for aarch64 target (PR driver/83193).

2018-10-30 Thread James Greenhalgh
On Thu, Oct 25, 2018 at 05:53:22AM -0500, Martin Liška wrote:
> On 10/24/18 7:48 PM, Martin Sebor wrote:
> > On 10/24/2018 03:52 AM, Martin Liška wrote:
> >> On 10/23/18 6:31 PM, Martin Sebor wrote:
> >>> On 10/22/2018 07:05 AM, Martin Liška wrote:
> >>>> On 10/16/18 6:57 PM, James Greenhalgh wrote:
> >>>>> On Mon, Oct 08, 2018 at 05:34:52AM -0500, Martin Liška wrote:
> >>>>>> Hi.
> >>>>>>
> >>>>>> I'm attaching updated version of the patch.
> >>>>>
> >>>>> Can't say I'm thrilled by the allocation/free (aarch64_parse_extension
> >>>>> allocates, everyone else has to free) responsibilities here.
> >>>>
> >>>> Agreed.
> >>>>
> >>>>>
> >>>>> If you can clean that up I'd be much happier. The overall patch is OK.
> >>>>
> >>>> I rewrote that to use std::string, hope it's improvement?
> >>>
> >>
> >> Hi Martin
> >>
> >>> If STR below is not nul-terminated the std::string ctor is not
> >>> safe.
> >>
> >> Appreciate the help. The string should be null-terminated; it either comes
> >> from the GCC command line or it's the value of an attribute in source code.
> >>
> >>  If it is nul-terminated but LEN is equal to its length
> >>> then the nul assignment should be unnecessary.  If LEN is less
> >>> than its length and the goal is to truncate the string then
> >>> calling resize() would be the right way to do it.  Otherwise,
> >>> assigning a nul to an element into the middle won't truncate
> >>> (it will leave the remaining elements there).  (This may not
> >>> matter if the string isn't appended to after that.)
> >>
> >> That's new for me; I reworked the patch to use resize. Btw, it sounds like
> >> a candidate for a new warning ;)? It must be quite a common mistake.
> > 
> > I should have also mentioned that there is a constructor that
> > takes a pointer and a count:
> > 
> >   *invalid_extension = std::string (str, len);
> > 
> > That would be even better than calling resize (sorry about that).
> 
> That's fine, I'm sending the updated patch. Tested just locally as a cross
> compiler in valgrind.
> 
> > 
> > There are lots of opportunities for warnings about misuses of
> > the standard library.  I think we need to first solve
> > the -Wno-system-headers problem (which disables most warnings
> > for standard library headers).
> 
> I see!

OK.

Thanks,
James




Re: [AArch64] Add Saphira pipeline description.

2018-10-30 Thread James Greenhalgh
On Tue, Oct 30, 2018 at 05:12:58AM -0500, Sameera Deshpande wrote:
> On Fri, 26 Oct 2018 at 13:33, Sameera Deshpande
>  wrote:
> >
> > Hi!
> >
> > Please find attached the patch to add a pipeline description for the
> > Qualcomm Saphira core.  It is tested with a bootstrap and make check,
> > with no regressions.
> >
> > Ok for trunk?

OK.

I wonder if there's anything we can do to improve maintainability in these
cases where two pipeline models have considerable overlap.

Thanks,
James

> >
> > gcc/
> > Changelog:
> >
> > 2018-10-26 Sameera Deshpande 
> >
> > * config/aarch64/aarch64-cores.def (saphira): Use saphira pipeline.
> > * config/aarch64/aarch64.md: Include saphira.md
> > * config/aarch64/saphira.md: New file for pipeline description.
> >
> > --
> > - Thanks and regards,
> >   Sameera D.
> 
> Hi!
> 
> Please find attached updated patch.
> Bootstrap and make check passed without regression. Ok for trunk?
> 
> -- 
> - Thanks and regards,
>   Sameera D.

> diff --git a/gcc/config/aarch64/aarch64-cores.def 
> b/gcc/config/aarch64/aarch64-cores.def
> index 3d876b8..8e4c646 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -90,7 +90,7 @@ AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, 8_2A,  
> AARCH64_FL_FOR_ARCH8_2
>  /* ARMv8.4-A Architecture Processors.  */
>  
>  /* Qualcomm ('Q') cores. */
> -AARCH64_CORE("saphira", saphira,falkor,8_4A,  
> AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   
> 0x51, 0xC01, -1)
> +AARCH64_CORE("saphira", saphira,saphira,8_4A,  
> AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   
> 0x51, 0xC01, -1)
>  
>  /* ARMv8-A big.LITTLE implementations.  */
>  
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index a014a01..f951354 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -298,6 +298,7 @@
>  (include "../arm/cortex-a57.md")
>  (include "../arm/exynos-m1.md")
>  (include "falkor.md")
> +(include "saphira.md")
>  (include "thunderx.md")
>  (include "../arm/xgene1.md")
>  (include "thunderx2t99.md")
> diff --git a/gcc/config/aarch64/saphira.md b/gcc/config/aarch64/saphira.md
> new file mode 100644
> index 000..bbf1c5c
> --- /dev/null
> +++ b/gcc/config/aarch64/saphira.md
> @@ -0,0 +1,583 @@
> +;; Saphira pipeline description
> +;; Copyright (C) 2017-2018 Free Software Foundation, Inc.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; .
> +
> +(define_automaton "saphira")
> +
> +;; Complex int instructions (e.g. multiply and divide) execute in the X
> +;; pipeline.  Simple int instructions execute in the X, Y, Z and B pipelines.
> +
> +(define_cpu_unit "saphira_x" "saphira")
> +(define_cpu_unit "saphira_y" "saphira")
> +
> +;; Branches execute in the Z or B pipeline or in one of the int pipelines 
> depending
> +;; on how complex it is.  Simple int insns (like movz) can also execute here.
> +
> +(define_cpu_unit "saphira_z" "saphira")
> +(define_cpu_unit "saphira_b" "saphira")
> +
> +;; Vector and FP insns execute in the VX and VY pipelines.
> +
> +(define_automaton "saphira_vfp")
> +
> +(define_cpu_unit "saphira_vx" "saphira_vfp")
> +(define_cpu_unit "saphira_vy" "saphira_vfp")
> +
> +;; Loads execute in the LD pipeline.
> +;; Stores execute in the ST pipeline, for address, data, and
> +;; vector data.
> +
> +(define_automaton "saphira_mem")
> +
> +(define_cpu_unit "saphira_ld" "saphira_mem")
> +(define_cpu_unit "saphira_st" "saphira_mem")
> +
> +;; The GTOV and VTOG pipelines are for general to vector reg moves, and vice
> +;; versa.
> +
> +(define_cpu_unit "saphira_gtov" "saphira")
> +(define_cpu_unit "saphira_vtog" "saphira")
> +
> +;; Common reservation combinations.
> +
> +(define_reservation "saphira_vxvy" "saphira_vx|saphira_vy")
> +(define_reservation "saphira_zb"   "saphira_z|saphira_b")
> +(define_reservation "saphira_xyzb" "saphira_x|saphira_y|saphira_z|saphira_b")
> +
> +;; SIMD Floating-Point Instructions
> +
> +(define_insn_reservation "saphira_afp_1_vxvy" 1
> +  (and (eq_attr "tune" "saphira")
> +   (eq_attr "type" 
> "neon_fp_neg_s,neon_fp_neg_d,neon_fp_abs_s,neon_fp_abs_d,neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q"))
> +  "saphira_vxvy")
> +
> +(define_insn_reservation 

Re: [PATCH] Provide extension hint for aarch64 target (PR driver/83193).

2018-10-16 Thread James Greenhalgh
On Mon, Oct 08, 2018 at 05:34:52AM -0500, Martin Liška wrote:
> Hi.
> 
> I'm attaching updated version of the patch.

Can't say I'm thrilled by the allocation/free (aarch64_parse_extension
allocates, everyone else has to free) responsibilities here.

If you can clean that up I'd be much happier. The overall patch is OK.

Thanks,
James

> From d36974540cda9fb0e159103fdcf92d26ef2f1b94 Mon Sep 17 00:00:00 2001
> From: marxin 
> Date: Thu, 4 Oct 2018 16:31:49 +0200
> Subject: [PATCH] Provide extension hint for aarch64 target (PR driver/83193).
> 
> gcc/ChangeLog:
> 
> 2018-10-05  Martin Liska  
> 
>   PR driver/83193
>   * common/config/aarch64/aarch64-common.c (aarch64_parse_extension):
>   Add new argument invalid_extension.
>   (aarch64_get_all_extension_candidates): New function.
>   (aarch64_rewrite_selected_cpu): Add NULL to function call.
>   * config/aarch64/aarch64-protos.h (aarch64_parse_extension): Add
>   new argument.
>   (aarch64_get_all_extension_candidates): New function.
>   * config/aarch64/aarch64.c (aarch64_parse_arch): Add new
>   argument invalid_extension.
>   (aarch64_parse_cpu): Likewise.
>   (aarch64_print_hint_for_extensions): New function.
>   (aarch64_validate_mcpu): Provide hint about invalid extension.
>   (aarch64_validate_march): Likewise.
>   (aarch64_handle_attr_arch): Pass new argument.
>   (aarch64_handle_attr_cpu): Provide hint about invalid extension.
>   (aarch64_handle_attr_isa_flags): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 2018-10-05  Martin Liska  
> 
>   PR driver/83193
>   * gcc.target/aarch64/spellcheck_7.c: New test.
>   * gcc.target/aarch64/spellcheck_8.c: New test.
>   * gcc.target/aarch64/spellcheck_9.c: New test.
> ---
>  gcc/common/config/aarch64/aarch64-common.c| 24 +-
>  gcc/config/aarch64/aarch64-protos.h   |  4 +-
>  gcc/config/aarch64/aarch64.c  | 75 +++
>  .../gcc.target/aarch64/spellcheck_7.c | 12 +++
>  .../gcc.target/aarch64/spellcheck_8.c | 13 
>  .../gcc.target/aarch64/spellcheck_9.c | 13 
>  6 files changed, 121 insertions(+), 20 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/spellcheck_7.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/spellcheck_8.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/spellcheck_9.c
> 


Re: [PATCH v4] [aarch64] Add HiSilicon tsv110 CPU support

2018-09-20 Thread James Greenhalgh
On Wed, Sep 19, 2018 at 04:53:52AM -0500, Shaokun Zhang wrote:
> This patch adds HiSilicon's tsv110 as an -mcpu value; it supports ARMv8.4-A.
> It has been tested on aarch64 with no regressions from this patch.

This patch is OK for Trunk.

Do you need someone to commit it on your behalf?

Thanks,
James

> 
> ---
>  gcc/ChangeLog|   9 +++
>  gcc/config/aarch64/aarch64-cores.def |   3 +
>  gcc/config/aarch64/aarch64-cost-tables.h | 104 
> +++
>  gcc/config/aarch64/aarch64-tune.md   |   2 +-
>  gcc/config/aarch64/aarch64.c |  82 
>  gcc/doc/invoke.texi  |   2 +-
>  6 files changed, 200 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 69e2e14..a040daa 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,12 @@
> +2018-09-19  Shaokun Zhang  
> +Bo Zhou  
> +
> + * config/aarch64/aarch64-cores.def (tsv110): New CPU.
> + * config/aarch64/aarch64-tune.md: Regenerated.
> + * doc/invoke.texi (AArch64 Options/-mtune): Add "tsv110".
> + * config/aarch64/aarch64.c (tsv110_tunings): New tuning table.
> + * config/aarch64/aarch64-cost-tables.h: Add "tsv110" extra costs.
> +
>  2018-09-18  Marek Polacek  
>  
>   P1064R0 - Allowing Virtual Function Calls in Constant Expressions
 


Re: [PATCH][AAarch64][v3] Add support for TARGET_COMPUTE_FRAME_LAYOUT

2018-09-12 Thread James Greenhalgh
On Wed, Sep 12, 2018 at 08:07:41AM -0500, Vlad Lazar wrote:
> On 11/09/18 17:53, James Greenhalgh wrote:
> > On Mon, Aug 06, 2018 at 11:14:17AM -0500, Vlad Lazar wrote:
> >> Hi,
> >>
> >> The patch adds support for the TARGET_COMPUTE_FRAME_LAYOUT hook on AArch64
> >> and removes unneeded frame layout recalculation.
> >>
> >> The removed aarch64_layout_frame calls are unnecessary because the 
> >> functions in which
> >> they appear will be called during or after the reload pass in which the 
> >> TARGET_COMPUTE_FRAME_LAYOUT
> >> hook is called. The if statement in aarch64_layout_frame had the purpose 
> >> of avoiding
> >> the extra work from the calls which have been removed and is now redundant.
> >
> > I'm not sure I understand, I may be missing something as the frame layout
> > is complex, but I can't get where I need to be to accept your patch from 
> > this
> > comment.
> >
> > The check you removed ensures that if we're after reload, and the frame is
> > laid out, we do no additional work. That part I understand, and that would
> > mean that any post-reload calls were no-ops. Is the argument that all
> > users of this code that you eliminate are after reload, and consequently
> > would have hit this no-op path? Can you talk me through why each case is
> > safe?
> >
> Thanks for taking a look at the patch.
> 
> Indeed, all the removed calls happen during or after reload. I'll go through
> all of them and try to explain the rationale behind each.
> 
> aarch64_expand_prologue and aarch64_expand_epilogue are called after the 
> pro_and_epilogue pass,
> which runs after reload where TARGET_COMPUTE_FRAME_LAYOUT is called.
> 
> aarch64_use_return_insn_p checks explicitly for reload_completed at the 
> beginning of the function
> and returns false if reload has not run. So it's safe to remove the call as 
> the frame layout is
> computed by the time it reaches that point.
> 
> aarch64_get_separate_components implements the 
> TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS hook.
> This hook only seems to be used in 
> shrink_wrap.c:try_shrink_wrapping_separate. The actual origin
> of this hook call can be traced back to the pro_and_epilogue pass:
> shrink_wrap.c:try_shrink_wrapping_separate <-
> function.c:thread_prologue_and_epilogue_insns <-
> function.c:rest_of_handle_thread_prologue_and_epilogue (pro_and_epilogue pass 
> entry point).
> Therefore, aarch64_get_separate_components only gets called post reload.
> 
> aarch64_initial_elimination_offset implements the INITIAL_ELIMINATION_OFFSET 
> hook, which is used in:
>   1. rtlanal.c:get_initial_register_offset: Before using the hook it 
> checks that reload has
>   been completed.
>   2. reload1.c:get_initial_register_offset and 
> reload1.c:set_initial_elim_offsets: These functions
>   explicitly call TARGET_COMPUTE_FRAME_LAYOUT before using the hook.
>   3. lra-eliminitations.c:update_reg_eliminate: The 
> TARGET_COMPUTE_FRAME_LAYOUT is, again, called
>   before the INITIAL_ELIMINATION_OFFSET hook is used.
> 
> I hope this helps make things a bit clearer.

Thanks for this, it is very helpful. The patch is OK for trunk.

James

> >> gcc/
> >> 2018-08-06  Vlad Lazar  
> >>
> >>* config/aarch64/aarch64.h (TARGET_COMPUTE_FRAME_LAYOUT): Define.
> >>* config/aarch64/aarch64.c (aarch64_expand_prologue): Remove 
> >> aarch64_layout_frame call.
> >>(aarch64_expand_epilogue): Likewise.
> >>(aarch64_initial_elimination_offset): Likewise.
> >>(aarch64_get_separate_components): Likewise.
> >>(aarch64_use_return_insn_p): Likewise.
> >>(aarch64_layout_frame): Remove unneeded check.
> 


Re: [PATCH][AAarch64][v3] Add support for TARGET_COMPUTE_FRAME_LAYOUT

2018-09-11 Thread James Greenhalgh
On Mon, Aug 06, 2018 at 11:14:17AM -0500, Vlad Lazar wrote:
> Hi,
> 
> The patch adds support for the TARGET_COMPUTE_FRAME_LAYOUT hook on AArch64
> and removes unneeded frame layout recalculation.
> 
> The removed aarch64_layout_frame calls are unnecessary because the functions 
> in which
> they appear will be called during or after the reload pass in which the 
> TARGET_COMPUTE_FRAME_LAYOUT
> hook is called. The if statement in aarch64_layout_frame had the purpose of 
> avoiding
> the extra work from the calls which have been removed and is now redundant.

I'm not sure I understand, I may be missing something as the frame layout
is complex, but I can't get where I need to be to accept your patch from this
comment.

The check you removed ensures that if we're after reload, and the frame is
laid out, we do no additional work. That part I understand, and that would
mean that any post-reload calls were no-ops. Is the argument that all
users of this code that you eliminate are after reload, and consequently
would have hit this no-op path? Can you talk me through why each case is
safe?

Thanks,
James

> gcc/
> 2018-08-06  Vlad Lazar  
> 
>   * config/aarch64/aarch64.h (TARGET_COMPUTE_FRAME_LAYOUT): Define.
>   * config/aarch64/aarch64.c (aarch64_expand_prologue): Remove 
> aarch64_layout_frame call.
>   (aarch64_expand_epilogue): Likewise.
>   (aarch64_initial_elimination_offset): Likewise.
>   (aarch64_get_separate_components): Likewise.
>   (aarch64_use_return_insn_p): Likewise.
>   (aarch64_layout_frame): Remove unneeded check.


  1   2   3   4   5   6   7   8   9   10   >